/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/intpow.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -28,

    fpcr_off           = sp_after_call_off,
    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubGenStubId stub_id = StubGenStubId::call_stub_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    const Address sp_after_call (rfp, sp_after_call_off * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19,  r20_save);
    __ stp(r22, r21,  r22_save);
    __ stp(r24, r23,  r24_save);
    __ stp(r26, r25,  r26_save);
    __ stp(r28, r27,  r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    // All of j_rargN may be used to return inline type fields so be careful
    // not to clobber those.
    // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
    // assignment of Rresult below.
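    // r14 and r15 sit outside the j_rarg/j_farg return register sets, so
    // using them here cannot clobber a scalarized inline-type field return.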
    Register Rresult = r14, Rresult_type = r15;
    __ ldr(Rresult, result);
    Label is_long, is_float, is_double, check_prim, exit;
    __ ldr(Rresult_type, result_type);
    __ cmp(Rresult_type, (u1)T_OBJECT);
    __ br(Assembler::EQ, check_prim);
    __ cmp(Rresult_type, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(Rresult_type, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(Rresult_type, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(Rresult));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9,  v8,  d9_save);

    __ ldp(r28, r27,  r28_save);
    __ ldp(r26, r25,  r26_save);
    __ ldp(r24, r23,  r24_save);
    __ ldp(r22, r21,  r22_save);
    __ ldp(r20, r19,  r20_save);

    // restore fpcr
    __ ldr(rscratch1, fpcr_save);
    __ set_fpcr(rscratch1);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT
    __ BIND(check_prim);
    if (InlineTypeReturnedAsFields) {
      // Check for scalarized return value
      __ tbz(r0, 0, is_long);
      // Load pack handler address
      __ andr(rscratch1, r0, -2);
      __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
      __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
      __ blr(rscratch1);
      __ b(exit);
    }

    __ BIND(is_long);
    __ str(r0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubGenStubId stub_id = StubGenStubId::catch_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubGenStubId stub_id = StubGenStubId::forward_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {
    StubGenStubId stub_id = StubGenStubId::verify_oop_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  // Generate indices for iota vector.
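  // One 128-bit block of ascending lane indices is emitted per element size
  // (B, H, S, D), followed by float and double variants; users of this table
  // load the block that matches their vector element type.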
  address generate_iota_indices(StubGenStubId stub_id) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  // for arraycopy stubs.
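  // It latches the decorator set, element type and GC temp registers once, so
  // the copy loops below can issue barrier-aware 8-, 16- and 32-byte loads and
  // stores without repeating that plumbing at every call site.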
  class ArrayCopyBarrierSetHelper : StackObj {
    BarrierSetAssembler* _bs_asm;
    MacroAssembler* _masm;
    DecoratorSet _decorators;
    BasicType _type;
    Register _gct1;
    Register _gct2;
    Register _gct3;
    FloatRegister _gcvt1;
    FloatRegister _gcvt2;
    FloatRegister _gcvt3;

  public:
    ArrayCopyBarrierSetHelper(MacroAssembler* masm,
                              DecoratorSet decorators,
                              BasicType type,
                              Register gct1,
                              Register gct2,
                              Register gct3,
                              FloatRegister gcvt1,
                              FloatRegister gcvt2,
                              FloatRegister gcvt3)
      : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
        _masm(masm),
        _decorators(decorators),
        _type(type),
        _gct1(gct1),
        _gct2(gct2),
        _gct3(gct3),
        _gcvt1(gcvt1),
        _gcvt2(gcvt2),
        _gcvt3(gcvt3) {
    }

    void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
                            dst1, dst2, src,
                            _gct1, _gct2, _gcvt1);
    }

    void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
    }

    void copy_load_at_16(Register dst1, Register dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
                            dst1, dst2, src,
                            _gct1);
    }

    void copy_store_at_16(Address dst, Register src1, Register src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3);
    }

    void copy_load_at_8(Register dst, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
                            dst, noreg, src,
                            _gct1);
    }

    void copy_store_at_8(Address dst, Register src) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
                             dst, src, noreg,
                             _gct1, _gct2, _gct3);
    }
  };

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
    BasicType type;
    copy_direction direction;

    switch (stub_id) {
    case copy_byte_f_id:
      direction = copy_forwards;
      type = T_BYTE;
      break;
    case copy_byte_b_id:
      direction = copy_backwards;
      type = T_BYTE;
      break;
    case copy_oop_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case copy_oop_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    case copy_oop_uninit_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case copy_oop_uninit_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    default:
      ShouldNotReachHere();
    }

    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
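      //
      // Note that in the code below `unit` is wordSize * direction, i.e. the
      // offsets above are word counts which the Address expressions scale to
      // bytes.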

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 byte block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 byte block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
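    //
    // Each tbz below tests one bit of the residual element count: bit
    // (3 - log2(granularity)) selects a whole 8-byte word, and successively
    // lower bits select an int, a short and a byte tail, so at most one
    // transfer is issued per size class.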
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
    bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;
  Label copy_obj_f, copy_obj_b;
  Label copy_obj_uninit_f, copy_obj_uninit_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
    const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
    const Register send = r17, dend = r16;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is less
      // than 4 bytes (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The biggest performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
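      // For element sizes below 4 bytes, only counts of 81..96 bytes (copy96
      // below) use the third ldpq/stpq pair; 65..80 byte copies finish with a
      // 16-byte ldp/stp tail instead.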
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This means in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way, we can align the
    // source address on a 2-word boundary.

    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // cannot be used as a temp register, as it contains the count.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
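      // When is_aligned is set the addresses are already word aligned, so at
      // most one word of adjustment is needed to reach 2-word alignment.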
      __ tbz(s, exact_log2(wordSize), aligned);
      bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
      bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(r15, s, 2 * wordSize - 1);
      } else {
        __ neg(r15, s);
        __ andr(r15, r15, 2 * wordSize - 1);
      }
      // r15 is the byte adjustment needed to align s.
      __ cbz(r15, aligned);
      int shift = exact_log2(granularity);
      if (shift > 0) {
        __ lsr(r15, r15, shift);
      }
      __ sub(count, count, r15);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, r15);
        __ sub(d, d, r15);
      } else {
        __ add(s, s, r15);
        __ add(d, d, r15);
      }
#else
      copy_memory_small(decorators, type, s, d, r15, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words. If the shift is zero
    // perform a move instead to benefit from zero latency moves.
    int shift = exact_log2(wordSize/granularity);
    if (shift > 0) {
      __ lsr(r15, count, shift);
    } else {
      __ mov(r15, count);
    }
    if (direction == copy_forwards) {
      if (type != T_OBJECT) {
        __ bl(copy_f);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_f);
      } else {
        __ bl(copy_obj_f);
      }
    } else {
      if (type != T_OBJECT) {
        __ bl(copy_b);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_b);
      } else {
        __ bl(copy_obj_b);
      }
    }

    // And the tail.
    copy_memory_small(decorators, type, s, d, count, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    RegSet clobbered
      = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
      __ mov(*it, rscratch1);
    }
#endif

  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
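  // Used by the arraycopy generators below, guarded by VerifyOops.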
  void verify_oop_array (int size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, 1);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   stub_id - is used to name the stub and identify all details of
  //             how to perform the copy.
  //
  //   entry - is assigned to the stub's post push entry point unless
  //           it is null
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects: entry is set to the (post push) entry point so it
  //               can be used by the corresponding conjoint copy
  //               method
  //
  address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    int size;
    bool aligned;
    bool is_oop;
    bool dest_uninitialized;
    switch (stub_id) {
    case jbyte_disjoint_arraycopy_id:
      size = sizeof(jbyte);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jbyte_disjoint_arraycopy_id:
      size = sizeof(jbyte);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jshort_disjoint_arraycopy_id:
      size = sizeof(jshort);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jshort_disjoint_arraycopy_id:
      size = sizeof(jshort);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jint_disjoint_arraycopy_id:
      size = sizeof(jint);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jint_disjoint_arraycopy_id:
      size = sizeof(jint);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jlong_disjoint_arraycopy_id:
      // since this is always aligned we can (should!) use the same
      // stub as for case arrayof_jlong_disjoint_arraycopy
      ShouldNotReachHere();
      break;
    case arrayof_jlong_disjoint_arraycopy_id:
      size = sizeof(jlong);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case oop_disjoint_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case arrayof_oop_disjoint_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    case arrayof_oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
      break;
    }

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   stub_id - is used to name the stub and identify all details of
  //             how to perform the copy.
  //
  //   nooverlap_target - identifies the (post push) entry for the
  //                      corresponding disjoint copy routine which can be
  //                      jumped to if the ranges do not actually overlap
  //
  //   entry - is assigned to the stub's post push entry point unless
  //           it is null
  //
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1696 // 1697 // Side Effects: 1698 // entry is set to the no-overlap entry point so it can be used by 1699 // some other conjoint copy method 1700 // 1701 address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 1702 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1703 RegSet saved_regs = RegSet::of(s, d, count); 1704 int size; 1705 bool aligned; 1706 bool is_oop; 1707 bool dest_uninitialized; 1708 switch (stub_id) { 1709 case jbyte_arraycopy_id: 1710 size = sizeof(jbyte); 1711 aligned = false; 1712 is_oop = false; 1713 dest_uninitialized = false; 1714 break; 1715 case arrayof_jbyte_arraycopy_id: 1716 size = sizeof(jbyte); 1717 aligned = true; 1718 is_oop = false; 1719 dest_uninitialized = false; 1720 break; 1721 case jshort_arraycopy_id: 1722 size = sizeof(jshort); 1723 aligned = false; 1724 is_oop = false; 1725 dest_uninitialized = false; 1726 break; 1727 case arrayof_jshort_arraycopy_id: 1728 size = sizeof(jshort); 1729 aligned = true; 1730 is_oop = false; 1731 dest_uninitialized = false; 1732 break; 1733 case jint_arraycopy_id: 1734 size = sizeof(jint); 1735 aligned = false; 1736 is_oop = false; 1737 dest_uninitialized = false; 1738 break; 1739 case arrayof_jint_arraycopy_id: 1740 size = sizeof(jint); 1741 aligned = true; 1742 is_oop = false; 1743 dest_uninitialized = false; 1744 break; 1745 case jlong_arraycopy_id: 1746 // since this is always aligned we can (should!) use the same 1747 // stub as for case arrayof_jlong_disjoint_arraycopy 1748 ShouldNotReachHere(); 1749 break; 1750 case arrayof_jlong_arraycopy_id: 1751 size = sizeof(jlong); 1752 aligned = true; 1753 is_oop = false; 1754 dest_uninitialized = false; 1755 break; 1756 case oop_arraycopy_id: 1757 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1758 aligned = !UseCompressedOops; 1759 is_oop = true; 1760 dest_uninitialized = false; 1761 break; 1762 case arrayof_oop_arraycopy_id: 1763 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1764 aligned = !UseCompressedOops; 1765 is_oop = true; 1766 dest_uninitialized = false; 1767 break; 1768 case oop_arraycopy_uninit_id: 1769 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1770 aligned = !UseCompressedOops; 1771 is_oop = true; 1772 dest_uninitialized = true; 1773 break; 1774 case arrayof_oop_arraycopy_uninit_id: 1775 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1776 aligned = !UseCompressedOops; 1777 is_oop = true; 1778 dest_uninitialized = true; 1779 break; 1780 default: 1781 ShouldNotReachHere(); 1782 } 1783 1784 StubCodeMark mark(this, stub_id); 1785 address start = __ pc(); 1786 __ enter(); 1787 1788 if (entry != nullptr) { 1789 *entry = __ pc(); 1790 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1791 BLOCK_COMMENT("Entry:"); 1792 } 1793 1794 // use fwd copy when (d-s) above_equal (count*size) 1795 __ sub(rscratch1, d, s); 1796 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1797 __ br(Assembler::HS, nooverlap_target); 1798 1799 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1800 if (dest_uninitialized) { 1801 decorators |= IS_DEST_UNINITIALIZED; 1802 } 1803 if (aligned) { 1804 decorators |= ARRAYCOPY_ALIGNED; 1805 } 1806 1807 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1808 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1809 1810 if (is_oop) { 1811 // save regs before copy_memory 1812 __ push(RegSet::of(d, count), sp); 1813 } 1814 { 1815 // UnsafeMemoryAccess page error: continue after unsafe access 1816 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1817 UnsafeMemoryAccessMark umam(this, add_entry, true); 1818 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1819 } 1820 if (is_oop) { 1821 __ pop(RegSet::of(d, count), sp); 1822 if (VerifyOops) 1823 verify_oop_array(size, d, count, r16); 1824 } 1825 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1826 __ leave(); 1827 __ mov(r0, zr); // return 0 1828 __ ret(lr); 1829 return start; 1830 } 1831 1832 // Helper for generating a dynamic type check. 1833 // Smashes rscratch1, rscratch2. 1834 void generate_type_check(Register sub_klass, 1835 Register super_check_offset, 1836 Register super_klass, 1837 Register temp1, 1838 Register temp2, 1839 Register result, 1840 Label& L_success) { 1841 assert_different_registers(sub_klass, super_check_offset, super_klass); 1842 1843 BLOCK_COMMENT("type_check:"); 1844 1845 Label L_miss; 1846 1847 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1848 super_check_offset); 1849 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1850 1851 // Fall through on failure! 
1852 __ BIND(L_miss); 1853 } 1854 1855 // 1856 // Generate checkcasting array copy stub 1857 // 1858 // Input: 1859 // c_rarg0 - source array address 1860 // c_rarg1 - destination array address 1861 // c_rarg2 - element count, treated as ssize_t, can be zero 1862 // c_rarg3 - size_t ckoff (super_check_offset) 1863 // c_rarg4 - oop ckval (super_klass) 1864 // 1865 // Output: 1866 // r0 == 0 - success 1867 // r0 == -1^K - failure, where K is partial transfer count 1868 // 1869 address generate_checkcast_copy(StubGenStubId stub_id, address *entry) { 1870 bool dest_uninitialized; 1871 switch (stub_id) { 1872 case checkcast_arraycopy_id: 1873 dest_uninitialized = false; 1874 break; 1875 case checkcast_arraycopy_uninit_id: 1876 dest_uninitialized = true; 1877 break; 1878 default: 1879 ShouldNotReachHere(); 1880 } 1881 1882 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1883 1884 // Input registers (after setup_arg_regs) 1885 const Register from = c_rarg0; // source array address 1886 const Register to = c_rarg1; // destination array address 1887 const Register count = c_rarg2; // elementscount 1888 const Register ckoff = c_rarg3; // super_check_offset 1889 const Register ckval = c_rarg4; // super_klass 1890 1891 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1892 RegSet wb_post_saved_regs = RegSet::of(count); 1893 1894 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1895 const Register copied_oop = r22; // actual oop copied 1896 const Register count_save = r21; // orig elementscount 1897 const Register start_to = r20; // destination array start address 1898 const Register r19_klass = r19; // oop._klass 1899 1900 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1901 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1902 1903 //--------------------------------------------------------------- 1904 // Assembler stub will be used for this call to arraycopy 1905 // if the two arrays are subtypes of Object[] but the 1906 // destination array type is not equal to or a supertype 1907 // of the source type. Each element must be separately 1908 // checked. 1909 1910 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1911 copied_oop, r19_klass, count_save); 1912 1913 __ align(CodeEntryAlignment); 1914 StubCodeMark mark(this, stub_id); 1915 address start = __ pc(); 1916 1917 __ enter(); // required for proper stackwalking of RuntimeStub frame 1918 1919 #ifdef ASSERT 1920 // caller guarantees that the arrays really are different 1921 // otherwise, we would have to make conjoint checks 1922 { Label L; 1923 __ b(L); // conjoint check not yet implemented 1924 __ stop("checkcast_copy within a single array"); 1925 __ bind(L); 1926 } 1927 #endif //ASSERT 1928 1929 // Caller of this entry point must set up the argument registers. 1930 if (entry != nullptr) { 1931 *entry = __ pc(); 1932 BLOCK_COMMENT("Entry:"); 1933 } 1934 1935 // Empty array: Nothing to do. 1936 __ cbz(count, L_done); 1937 __ push(RegSet::of(r19, r20, r21, r22), sp); 1938 1939 #ifdef ASSERT 1940 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1941 // The ckoff and ckval must be mutually consistent, 1942 // even though caller generates both. 
1943 { Label L; 1944 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1945 __ ldrw(start_to, Address(ckval, sco_offset)); 1946 __ cmpw(ckoff, start_to); 1947 __ br(Assembler::EQ, L); 1948 __ stop("super_check_offset inconsistent"); 1949 __ bind(L); 1950 } 1951 #endif //ASSERT 1952 1953 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1954 bool is_oop = true; 1955 int element_size = UseCompressedOops ? 4 : 8; 1956 if (dest_uninitialized) { 1957 decorators |= IS_DEST_UNINITIALIZED; 1958 } 1959 1960 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1961 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1962 1963 // save the original count 1964 __ mov(count_save, count); 1965 1966 // Copy from low to high addresses 1967 __ mov(start_to, to); // Save destination array start address 1968 __ b(L_load_element); 1969 1970 // ======== begin loop ======== 1971 // (Loop is rotated; its entry is L_load_element.) 1972 // Loop control: 1973 // for (; count != 0; count--) { 1974 // copied_oop = load_heap_oop(from++); 1975 // ... generate_type_check ...; 1976 // store_heap_oop(to++, copied_oop); 1977 // } 1978 __ align(OptoLoopAlignment); 1979 1980 __ BIND(L_store_element); 1981 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1982 __ post(to, element_size), copied_oop, noreg, 1983 gct1, gct2, gct3); 1984 __ sub(count, count, 1); 1985 __ cbz(count, L_do_card_marks); 1986 1987 // ======== loop entry is here ======== 1988 __ BIND(L_load_element); 1989 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1990 copied_oop, noreg, __ post(from, element_size), 1991 gct1); 1992 __ cbz(copied_oop, L_store_element); 1993 1994 __ load_klass(r19_klass, copied_oop);// query the object klass 1995 1996 BLOCK_COMMENT("type_check:"); 1997 generate_type_check(/*sub_klass*/r19_klass, 1998 /*super_check_offset*/ckoff, 1999 /*super_klass*/ckval, 2000 /*r_array_base*/gct1, 2001 /*temp2*/gct2, 2002 /*result*/r10, L_store_element); 2003 2004 // Fall through on failure! 2005 2006 // ======== end loop ======== 2007 2008 // It was a real error; we must depend on the caller to finish the job. 2009 // Register count = remaining oops, count_orig = total oops. 2010 // Emit GC store barriers for the oops we have copied and report 2011 // their number to the caller. 2012 2013 __ subs(count, count_save, count); // K = partially copied oop count 2014 __ eon(count, count, zr); // report (-1^K) to caller 2015 __ br(Assembler::EQ, L_done_pop); 2016 2017 __ BIND(L_do_card_marks); 2018 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2019 2020 __ bind(L_done_pop); 2021 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2022 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2023 2024 __ bind(L_done); 2025 __ mov(r0, count); 2026 __ leave(); 2027 __ ret(lr); 2028 2029 return start; 2030 } 2031 2032 // Perform range checks on the proposed arraycopy. 2033 // Kills temp, but nothing else. 2034 // Also, clean the sign bits of src_pos and dst_pos. 
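  //
  // In rough C terms the generated checks are (an illustrative sketch, not
  // stub code; 'src_len' and 'dst_len' stand for the arrayOop length fields):
  //
  //   if ((juint)(src_pos + length) > (juint)src_len) goto L_failed;
  //   if ((juint)(dst_pos + length) > (juint)dst_len) goto L_failed;
  //   src_pos &= 0xFFFFFFFFu;   // "movw reg, reg" zero-extends, clearing
  //   dst_pos &= 0xFFFFFFFFu;   // any stale high 32 bits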
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length()) FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }

  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
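  //
  // The dispatch can be read as the following C sketch (illustrative only;
  // 's', 'd' and 'count' are the raw byte arguments):
  //
  //   uintptr_t bits = s | d | count;
  //   if      ((bits & (BytesPerLong - 1)) == 0) goto long_copy;   // count >>= 3
  //   else if ((bits & (BytesPerInt  - 1)) == 0) goto int_copy;    // count >>= 2
  //   else if ((bits & 1) == 0)                  goto short_copy;  // count >>= 1
  //   else                                       goto byte_copy;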
2085 // 2086 address generate_unsafe_copy(address byte_copy_entry, 2087 address short_copy_entry, 2088 address int_copy_entry, 2089 address long_copy_entry) { 2090 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2091 2092 Label L_long_aligned, L_int_aligned, L_short_aligned; 2093 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2094 2095 __ align(CodeEntryAlignment); 2096 StubCodeMark mark(this, stub_id); 2097 address start = __ pc(); 2098 __ enter(); // required for proper stackwalking of RuntimeStub frame 2099 2100 // bump this on entry, not on exit: 2101 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2102 2103 __ orr(rscratch1, s, d); 2104 __ orr(rscratch1, rscratch1, count); 2105 2106 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2107 __ cbz(rscratch1, L_long_aligned); 2108 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2109 __ cbz(rscratch1, L_int_aligned); 2110 __ tbz(rscratch1, 0, L_short_aligned); 2111 __ b(RuntimeAddress(byte_copy_entry)); 2112 2113 __ BIND(L_short_aligned); 2114 __ lsr(count, count, LogBytesPerShort); // size => short_count 2115 __ b(RuntimeAddress(short_copy_entry)); 2116 __ BIND(L_int_aligned); 2117 __ lsr(count, count, LogBytesPerInt); // size => int_count 2118 __ b(RuntimeAddress(int_copy_entry)); 2119 __ BIND(L_long_aligned); 2120 __ lsr(count, count, LogBytesPerLong); // size => long_count 2121 __ b(RuntimeAddress(long_copy_entry)); 2122 2123 return start; 2124 } 2125 2126 // 2127 // Generate generic array copy stubs 2128 // 2129 // Input: 2130 // c_rarg0 - src oop 2131 // c_rarg1 - src_pos (32-bits) 2132 // c_rarg2 - dst oop 2133 // c_rarg3 - dst_pos (32-bits) 2134 // c_rarg4 - element count (32-bits) 2135 // 2136 // Output: 2137 // r0 == 0 - success 2138 // r0 == -1^K - failure, where K is partial transfer count 2139 // 2140 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2141 address int_copy_entry, address oop_copy_entry, 2142 address long_copy_entry, address checkcast_copy_entry) { 2143 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2144 2145 Label L_failed, L_objArray; 2146 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2147 2148 // Input registers 2149 const Register src = c_rarg0; // source array oop 2150 const Register src_pos = c_rarg1; // source position 2151 const Register dst = c_rarg2; // destination array oop 2152 const Register dst_pos = c_rarg3; // destination position 2153 const Register length = c_rarg4; 2154 2155 2156 // Registers used as temps 2157 const Register dst_klass = c_rarg5; 2158 2159 __ align(CodeEntryAlignment); 2160 2161 StubCodeMark mark(this, stub_id); 2162 2163 address start = __ pc(); 2164 2165 __ enter(); // required for proper stackwalking of RuntimeStub frame 2166 2167 // bump this on entry, not on exit: 2168 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2169 2170 //----------------------------------------------------------------------- 2171 // Assembler stub will be used for this call to arraycopy 2172 // if the following conditions are met: 2173 // 2174 // (1) src and dst must not be null. 2175 // (2) src_pos must not be negative. 2176 // (3) dst_pos must not be negative. 2177 // (4) length must not be negative. 2178 // (5) src klass and dst klass should be the same and not null. 2179 // (6) src and dst should be arrays. 2180 // (7) src_pos + length must not exceed length of src. 2181 // (8) dst_pos + length must not exceed length of dst. 
2182 // 2183 2184 // if (src == nullptr) return -1; 2185 __ cbz(src, L_failed); 2186 2187 // if (src_pos < 0) return -1; 2188 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2189 2190 // if (dst == nullptr) return -1; 2191 __ cbz(dst, L_failed); 2192 2193 // if (dst_pos < 0) return -1; 2194 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2195 2196 // registers used as temp 2197 const Register scratch_length = r16; // elements count to copy 2198 const Register scratch_src_klass = r17; // array klass 2199 const Register lh = r15; // layout helper 2200 2201 // if (length < 0) return -1; 2202 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2203 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2204 2205 __ load_klass(scratch_src_klass, src); 2206 #ifdef ASSERT 2207 // assert(src->klass() != nullptr); 2208 { 2209 BLOCK_COMMENT("assert klasses not null {"); 2210 Label L1, L2; 2211 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2212 __ bind(L1); 2213 __ stop("broken null klass"); 2214 __ bind(L2); 2215 __ load_klass(rscratch1, dst); 2216 __ cbz(rscratch1, L1); // this would be broken also 2217 BLOCK_COMMENT("} assert klasses not null done"); 2218 } 2219 #endif 2220 2221 // Load layout helper (32-bits) 2222 // 2223 // |array_tag| | header_size | element_type | |log2_element_size| 2224 // 32 30 24 16 8 2 0 2225 // 2226 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2227 // 2228 2229 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2230 2231 // Handle objArrays completely differently... 2232 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2233 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2234 __ movw(rscratch1, objArray_lh); 2235 __ eorw(rscratch2, lh, rscratch1); 2236 __ cbzw(rscratch2, L_objArray); 2237 2238 // if (src->klass() != dst->klass()) return -1; 2239 __ load_klass(rscratch2, dst); 2240 __ eor(rscratch2, rscratch2, scratch_src_klass); 2241 __ cbnz(rscratch2, L_failed); 2242 2243 // Check for flat inline type array -> return -1 2244 __ test_flat_array_oop(src, rscratch2, L_failed); 2245 2246 // Check for null-free (non-flat) inline type array -> handle as object array 2247 __ test_null_free_array_oop(src, rscratch2, L_objArray); 2248 2249 // if (!src->is_Array()) return -1; 2250 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2251 2252 // At this point, it is known to be a typeArray (array_tag 0x3). 
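    // For reference, the layout-helper fields used below decode as in this C
    // sketch (illustrative only; it follows the bit layout documented above):
    //
    //   int array_offset = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   int log2_elsize  =  lh & Klass::_lh_log2_element_size_mask;   // 0..3 == exact_log2(element size)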
2253 #ifdef ASSERT 2254 { 2255 BLOCK_COMMENT("assert primitive array {"); 2256 Label L; 2257 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2258 __ cmpw(lh, rscratch2); 2259 __ br(Assembler::GE, L); 2260 __ stop("must be a primitive array"); 2261 __ bind(L); 2262 BLOCK_COMMENT("} assert primitive array done"); 2263 } 2264 #endif 2265 2266 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2267 rscratch2, L_failed); 2268 2269 // TypeArrayKlass 2270 // 2271 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2272 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2273 // 2274 2275 const Register rscratch1_offset = rscratch1; // array offset 2276 const Register r15_elsize = lh; // element size 2277 2278 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2279 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2280 __ add(src, src, rscratch1_offset); // src array offset 2281 __ add(dst, dst, rscratch1_offset); // dst array offset 2282 BLOCK_COMMENT("choose copy loop based on element size"); 2283 2284 // next registers should be set before the jump to corresponding stub 2285 const Register from = c_rarg0; // source array address 2286 const Register to = c_rarg1; // destination array address 2287 const Register count = c_rarg2; // elements count 2288 2289 // 'from', 'to', 'count' registers should be set in such order 2290 // since they are the same as 'src', 'src_pos', 'dst'. 2291 2292 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2293 2294 // The possible values of elsize are 0-3, i.e. exact_log2(element 2295 // size in bytes). We do a simple bitwise binary search. 2296 __ BIND(L_copy_bytes); 2297 __ tbnz(r15_elsize, 1, L_copy_ints); 2298 __ tbnz(r15_elsize, 0, L_copy_shorts); 2299 __ lea(from, Address(src, src_pos));// src_addr 2300 __ lea(to, Address(dst, dst_pos));// dst_addr 2301 __ movw(count, scratch_length); // length 2302 __ b(RuntimeAddress(byte_copy_entry)); 2303 2304 __ BIND(L_copy_shorts); 2305 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2306 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2307 __ movw(count, scratch_length); // length 2308 __ b(RuntimeAddress(short_copy_entry)); 2309 2310 __ BIND(L_copy_ints); 2311 __ tbnz(r15_elsize, 0, L_copy_longs); 2312 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2313 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2314 __ movw(count, scratch_length); // length 2315 __ b(RuntimeAddress(int_copy_entry)); 2316 2317 __ BIND(L_copy_longs); 2318 #ifdef ASSERT 2319 { 2320 BLOCK_COMMENT("assert long copy {"); 2321 Label L; 2322 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2323 __ cmpw(r15_elsize, LogBytesPerLong); 2324 __ br(Assembler::EQ, L); 2325 __ stop("must be long copy, but elsize is wrong"); 2326 __ bind(L); 2327 BLOCK_COMMENT("} assert long copy done"); 2328 } 2329 #endif 2330 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2331 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2332 __ movw(count, scratch_length); // length 2333 __ b(RuntimeAddress(long_copy_entry)); 2334 2335 // ObjArrayKlass 2336 __ BIND(L_objArray); 2337 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2338 2339 Label L_plain_copy, L_checkcast_copy; 2340 // test array classes for subtyping 2341 __ load_klass(r15, dst); 2342 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2343 __ br(Assembler::NE, L_checkcast_copy); 2344 2345 // Identically typed arrays can be copied without element-wise checks. 2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2347 rscratch2, L_failed); 2348 2349 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2350 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2351 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2352 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2353 __ movw(count, scratch_length); // length 2354 __ BIND(L_plain_copy); 2355 __ b(RuntimeAddress(oop_copy_entry)); 2356 2357 __ BIND(L_checkcast_copy); 2358 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2359 { 2360 // Before looking at dst.length, make sure dst is also an objArray. 2361 __ ldrw(rscratch1, Address(r15, lh_offset)); 2362 __ movw(rscratch2, objArray_lh); 2363 __ eorw(rscratch1, rscratch1, rscratch2); 2364 __ cbnzw(rscratch1, L_failed); 2365 2366 // It is safe to examine both src.length and dst.length. 2367 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2368 r15, L_failed); 2369 2370 __ load_klass(dst_klass, dst); // reload 2371 2372 // Marshal the base address arguments now, freeing registers. 2373 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2374 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2375 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2376 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2377 __ movw(count, length); // length (reloaded) 2378 Register sco_temp = c_rarg3; // this register is free now 2379 assert_different_registers(from, to, count, sco_temp, 2380 dst_klass, scratch_src_klass); 2381 // assert_clean_int(count, sco_temp); 2382 2383 // Generate the type check. 2384 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2385 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2386 2387 // Smashes rscratch1, rscratch2 2388 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2389 L_plain_copy); 2390 2391 // Fetch destination element klass from the ObjArrayKlass header. 2392 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2393 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2394 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2395 2396 // the checkcast_copy loop needs two extra arguments: 2397 assert(c_rarg3 == sco_temp, "#3 already in place"); 2398 // Set up arguments for checkcast_copy_entry. 2399 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2400 __ b(RuntimeAddress(checkcast_copy_entry)); 2401 } 2402 2403 __ BIND(L_failed); 2404 __ mov(r0, -1); 2405 __ leave(); // required for proper stackwalking of RuntimeStub frame 2406 __ ret(lr); 2407 2408 return start; 2409 } 2410 2411 // 2412 // Generate stub for array fill. If "aligned" is true, the 2413 // "to" address is assumed to be heapword aligned. 
2414 // 2415 // Arguments for generated stub: 2416 // to: c_rarg0 2417 // value: c_rarg1 2418 // count: c_rarg2 treated as signed 2419 // 2420 address generate_fill(StubGenStubId stub_id) { 2421 BasicType t; 2422 bool aligned; 2423 2424 switch (stub_id) { 2425 case jbyte_fill_id: 2426 t = T_BYTE; 2427 aligned = false; 2428 break; 2429 case jshort_fill_id: 2430 t = T_SHORT; 2431 aligned = false; 2432 break; 2433 case jint_fill_id: 2434 t = T_INT; 2435 aligned = false; 2436 break; 2437 case arrayof_jbyte_fill_id: 2438 t = T_BYTE; 2439 aligned = true; 2440 break; 2441 case arrayof_jshort_fill_id: 2442 t = T_SHORT; 2443 aligned = true; 2444 break; 2445 case arrayof_jint_fill_id: 2446 t = T_INT; 2447 aligned = true; 2448 break; 2449 default: 2450 ShouldNotReachHere(); 2451 }; 2452 2453 __ align(CodeEntryAlignment); 2454 StubCodeMark mark(this, stub_id); 2455 address start = __ pc(); 2456 2457 BLOCK_COMMENT("Entry:"); 2458 2459 const Register to = c_rarg0; // source array address 2460 const Register value = c_rarg1; // value 2461 const Register count = c_rarg2; // elements count 2462 2463 const Register bz_base = r10; // base for block_zero routine 2464 const Register cnt_words = r11; // temp register 2465 2466 __ enter(); 2467 2468 Label L_fill_elements, L_exit1; 2469 2470 int shift = -1; 2471 switch (t) { 2472 case T_BYTE: 2473 shift = 0; 2474 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2475 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2476 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2477 __ br(Assembler::LO, L_fill_elements); 2478 break; 2479 case T_SHORT: 2480 shift = 1; 2481 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2482 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2483 __ br(Assembler::LO, L_fill_elements); 2484 break; 2485 case T_INT: 2486 shift = 2; 2487 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2488 __ br(Assembler::LO, L_fill_elements); 2489 break; 2490 default: ShouldNotReachHere(); 2491 } 2492 2493 // Align source address at 8 bytes address boundary. 2494 Label L_skip_align1, L_skip_align2, L_skip_align4; 2495 if (!aligned) { 2496 switch (t) { 2497 case T_BYTE: 2498 // One byte misalignment happens only for byte arrays. 2499 __ tbz(to, 0, L_skip_align1); 2500 __ strb(value, Address(__ post(to, 1))); 2501 __ subw(count, count, 1); 2502 __ bind(L_skip_align1); 2503 // Fallthrough 2504 case T_SHORT: 2505 // Two bytes misalignment happens only for byte and short (char) arrays. 2506 __ tbz(to, 1, L_skip_align2); 2507 __ strh(value, Address(__ post(to, 2))); 2508 __ subw(count, count, 2 >> shift); 2509 __ bind(L_skip_align2); 2510 // Fallthrough 2511 case T_INT: 2512 // Align to 8 bytes, we know we are 4 byte aligned to start. 2513 __ tbz(to, 2, L_skip_align4); 2514 __ strw(value, Address(__ post(to, 4))); 2515 __ subw(count, count, 4 >> shift); 2516 __ bind(L_skip_align4); 2517 break; 2518 default: ShouldNotReachHere(); 2519 } 2520 } 2521 2522 // 2523 // Fill large chunks 2524 // 2525 __ lsrw(cnt_words, count, 3 - shift); // number of words 2526 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2527 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2528 if (UseBlockZeroing) { 2529 Label non_block_zeroing, rest; 2530 // If the fill value is zero we can use the fast zero_words(). 
2531 __ cbnz(value, non_block_zeroing); 2532 __ mov(bz_base, to); 2533 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2534 address tpc = __ zero_words(bz_base, cnt_words); 2535 if (tpc == nullptr) { 2536 fatal("CodeCache is full at generate_fill"); 2537 } 2538 __ b(rest); 2539 __ bind(non_block_zeroing); 2540 __ fill_words(to, cnt_words, value); 2541 __ bind(rest); 2542 } else { 2543 __ fill_words(to, cnt_words, value); 2544 } 2545 2546 // Remaining count is less than 8 bytes. Fill it by a single store. 2547 // Note that the total length is no less than 8 bytes. 2548 if (t == T_BYTE || t == T_SHORT) { 2549 Label L_exit1; 2550 __ cbzw(count, L_exit1); 2551 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2552 __ str(value, Address(to, -8)); // overwrite some elements 2553 __ bind(L_exit1); 2554 __ leave(); 2555 __ ret(lr); 2556 } 2557 2558 // Handle copies less than 8 bytes. 2559 Label L_fill_2, L_fill_4, L_exit2; 2560 __ bind(L_fill_elements); 2561 switch (t) { 2562 case T_BYTE: 2563 __ tbz(count, 0, L_fill_2); 2564 __ strb(value, Address(__ post(to, 1))); 2565 __ bind(L_fill_2); 2566 __ tbz(count, 1, L_fill_4); 2567 __ strh(value, Address(__ post(to, 2))); 2568 __ bind(L_fill_4); 2569 __ tbz(count, 2, L_exit2); 2570 __ strw(value, Address(to)); 2571 break; 2572 case T_SHORT: 2573 __ tbz(count, 0, L_fill_4); 2574 __ strh(value, Address(__ post(to, 2))); 2575 __ bind(L_fill_4); 2576 __ tbz(count, 1, L_exit2); 2577 __ strw(value, Address(to)); 2578 break; 2579 case T_INT: 2580 __ cbzw(count, L_exit2); 2581 __ strw(value, Address(to)); 2582 break; 2583 default: ShouldNotReachHere(); 2584 } 2585 __ bind(L_exit2); 2586 __ leave(); 2587 __ ret(lr); 2588 return start; 2589 } 2590 2591 address generate_data_cache_writeback() { 2592 const Register line = c_rarg0; // address of line to write back 2593 2594 __ align(CodeEntryAlignment); 2595 2596 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id; 2597 StubCodeMark mark(this, stub_id); 2598 2599 address start = __ pc(); 2600 __ enter(); 2601 __ cache_wb(Address(line, 0)); 2602 __ leave(); 2603 __ ret(lr); 2604 2605 return start; 2606 } 2607 2608 address generate_data_cache_writeback_sync() { 2609 const Register is_pre = c_rarg0; // pre or post sync 2610 2611 __ align(CodeEntryAlignment); 2612 2613 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id; 2614 StubCodeMark mark(this, stub_id); 2615 2616 // pre wbsync is a no-op 2617 // post wbsync translates to an sfence 2618 2619 Label skip; 2620 address start = __ pc(); 2621 __ enter(); 2622 __ cbnz(is_pre, skip); 2623 __ cache_wbsync(false); 2624 __ bind(skip); 2625 __ leave(); 2626 __ ret(lr); 2627 2628 return start; 2629 } 2630 2631 void generate_arraycopy_stubs() { 2632 address entry; 2633 address entry_jbyte_arraycopy; 2634 address entry_jshort_arraycopy; 2635 address entry_jint_arraycopy; 2636 address entry_oop_arraycopy; 2637 address entry_jlong_arraycopy; 2638 address entry_checkcast_arraycopy; 2639 2640 generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2641 generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2642 2643 generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2644 generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2645 2646 generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 
2647 generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2648 2649 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2650 2651 //*** jbyte 2652 // Always need aligned and unaligned versions 2653 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry); 2654 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2655 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry); 2656 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr); 2657 2658 //*** jshort 2659 // Always need aligned and unaligned versions 2660 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry); 2661 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2662 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry); 2663 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr); 2664 2665 //*** jint 2666 // Aligned versions 2667 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry); 2668 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2669 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2670 // entry_jint_arraycopy always points to the unaligned version 2671 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry); 2672 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy); 2673 2674 //*** jlong 2675 // It is always aligned 2676 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry); 2677 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2678 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2679 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2680 2681 //*** oops 2682 { 2683 // With compressed oops we need unaligned versions; notice that 2684 // we overwrite entry_oop_arraycopy. 
2685 bool aligned = !UseCompressedOops; 2686 2687 StubRoutines::_arrayof_oop_disjoint_arraycopy 2688 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry); 2689 StubRoutines::_arrayof_oop_arraycopy 2690 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2691 // Aligned versions without pre-barriers 2692 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2693 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2694 StubRoutines::_arrayof_oop_arraycopy_uninit 2695 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2696 } 2697 2698 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2699 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2700 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2701 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2702 2703 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2704 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr); 2705 2706 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2707 entry_jshort_arraycopy, 2708 entry_jint_arraycopy, 2709 entry_jlong_arraycopy); 2710 2711 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2712 entry_jshort_arraycopy, 2713 entry_jint_arraycopy, 2714 entry_oop_arraycopy, 2715 entry_jlong_arraycopy, 2716 entry_checkcast_arraycopy); 2717 2718 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 2719 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 2720 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 2721 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 2722 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 2723 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 2724 } 2725 2726 void generate_math_stubs() { Unimplemented(); } 2727 2728 // Arguments: 2729 // 2730 // Inputs: 2731 // c_rarg0 - source byte array address 2732 // c_rarg1 - destination byte array address 2733 // c_rarg2 - K (key) in little endian int array 2734 // 2735 address generate_aescrypt_encryptBlock() { 2736 __ align(CodeEntryAlignment); 2737 StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2738 StubCodeMark mark(this, stub_id); 2739 2740 const Register from = c_rarg0; // source array address 2741 const Register to = c_rarg1; // destination array address 2742 const Register key = c_rarg2; // key array address 2743 const Register keylen = rscratch1; 2744 2745 address start = __ pc(); 2746 __ enter(); 2747 2748 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2749 2750 __ aesenc_loadkeys(key, keylen); 2751 __ aesecb_encrypt(from, to, keylen); 2752 2753 __ mov(r0, 0); 2754 2755 __ leave(); 2756 __ ret(lr); 2757 2758 return start; 2759 } 2760 2761 // Arguments: 2762 // 2763 // Inputs: 2764 // c_rarg0 - source byte array address 2765 // c_rarg1 - destination byte array address 2766 // c_rarg2 - K (key) in little endian int array 2767 // 2768 address generate_aescrypt_decryptBlock() { 2769 assert(UseAES, "need 
AES cryptographic extension support"); 2770 __ align(CodeEntryAlignment); 2771 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2772 StubCodeMark mark(this, stub_id); 2773 Label L_doLast; 2774 2775 const Register from = c_rarg0; // source array address 2776 const Register to = c_rarg1; // destination array address 2777 const Register key = c_rarg2; // key array address 2778 const Register keylen = rscratch1; 2779 2780 address start = __ pc(); 2781 __ enter(); // required for proper stackwalking of RuntimeStub frame 2782 2783 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2784 2785 __ aesecb_decrypt(from, to, key, keylen); 2786 2787 __ mov(r0, 0); 2788 2789 __ leave(); 2790 __ ret(lr); 2791 2792 return start; 2793 } 2794 2795 // Arguments: 2796 // 2797 // Inputs: 2798 // c_rarg0 - source byte array address 2799 // c_rarg1 - destination byte array address 2800 // c_rarg2 - K (key) in little endian int array 2801 // c_rarg3 - r vector byte array address 2802 // c_rarg4 - input length 2803 // 2804 // Output: 2805 // x0 - input length 2806 // 2807 address generate_cipherBlockChaining_encryptAESCrypt() { 2808 assert(UseAES, "need AES cryptographic extension support"); 2809 __ align(CodeEntryAlignment); 2810 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id; 2811 StubCodeMark mark(this, stub_id); 2812 2813 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2814 2815 const Register from = c_rarg0; // source array address 2816 const Register to = c_rarg1; // destination array address 2817 const Register key = c_rarg2; // key array address 2818 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2819 // and left with the results of the last encryption block 2820 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2821 const Register keylen = rscratch1; 2822 2823 address start = __ pc(); 2824 2825 __ enter(); 2826 2827 __ movw(rscratch2, len_reg); 2828 2829 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2830 2831 __ ld1(v0, __ T16B, rvec); 2832 2833 __ cmpw(keylen, 52); 2834 __ br(Assembler::CC, L_loadkeys_44); 2835 __ br(Assembler::EQ, L_loadkeys_52); 2836 2837 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2838 __ rev32(v17, __ T16B, v17); 2839 __ rev32(v18, __ T16B, v18); 2840 __ BIND(L_loadkeys_52); 2841 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2842 __ rev32(v19, __ T16B, v19); 2843 __ rev32(v20, __ T16B, v20); 2844 __ BIND(L_loadkeys_44); 2845 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2846 __ rev32(v21, __ T16B, v21); 2847 __ rev32(v22, __ T16B, v22); 2848 __ rev32(v23, __ T16B, v23); 2849 __ rev32(v24, __ T16B, v24); 2850 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2851 __ rev32(v25, __ T16B, v25); 2852 __ rev32(v26, __ T16B, v26); 2853 __ rev32(v27, __ T16B, v27); 2854 __ rev32(v28, __ T16B, v28); 2855 __ ld1(v29, v30, v31, __ T16B, key); 2856 __ rev32(v29, __ T16B, v29); 2857 __ rev32(v30, __ T16B, v30); 2858 __ rev32(v31, __ T16B, v31); 2859 2860 __ BIND(L_aes_loop); 2861 __ ld1(v1, __ T16B, __ post(from, 16)); 2862 __ eor(v0, __ T16B, v0, v1); 2863 2864 __ br(Assembler::CC, L_rounds_44); 2865 __ br(Assembler::EQ, L_rounds_52); 2866 2867 __ aese(v0, v17); __ aesmc(v0, v0); 2868 __ aese(v0, v18); __ aesmc(v0, v0); 2869 __ BIND(L_rounds_52); 2870 __ aese(v0, v19); __ aesmc(v0, v0); 2871 __ aese(v0, v20); 
__ aesmc(v0, v0); 2872 __ BIND(L_rounds_44); 2873 __ aese(v0, v21); __ aesmc(v0, v0); 2874 __ aese(v0, v22); __ aesmc(v0, v0); 2875 __ aese(v0, v23); __ aesmc(v0, v0); 2876 __ aese(v0, v24); __ aesmc(v0, v0); 2877 __ aese(v0, v25); __ aesmc(v0, v0); 2878 __ aese(v0, v26); __ aesmc(v0, v0); 2879 __ aese(v0, v27); __ aesmc(v0, v0); 2880 __ aese(v0, v28); __ aesmc(v0, v0); 2881 __ aese(v0, v29); __ aesmc(v0, v0); 2882 __ aese(v0, v30); 2883 __ eor(v0, __ T16B, v0, v31); 2884 2885 __ st1(v0, __ T16B, __ post(to, 16)); 2886 2887 __ subw(len_reg, len_reg, 16); 2888 __ cbnzw(len_reg, L_aes_loop); 2889 2890 __ st1(v0, __ T16B, rvec); 2891 2892 __ mov(r0, rscratch2); 2893 2894 __ leave(); 2895 __ ret(lr); 2896 2897 return start; 2898 } 2899 2900 // Arguments: 2901 // 2902 // Inputs: 2903 // c_rarg0 - source byte array address 2904 // c_rarg1 - destination byte array address 2905 // c_rarg2 - K (key) in little endian int array 2906 // c_rarg3 - r vector byte array address 2907 // c_rarg4 - input length 2908 // 2909 // Output: 2910 // r0 - input length 2911 // 2912 address generate_cipherBlockChaining_decryptAESCrypt() { 2913 assert(UseAES, "need AES cryptographic extension support"); 2914 __ align(CodeEntryAlignment); 2915 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id; 2916 StubCodeMark mark(this, stub_id); 2917 2918 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2919 2920 const Register from = c_rarg0; // source array address 2921 const Register to = c_rarg1; // destination array address 2922 const Register key = c_rarg2; // key array address 2923 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2924 // and left with the results of the last encryption block 2925 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2926 const Register keylen = rscratch1; 2927 2928 address start = __ pc(); 2929 2930 __ enter(); 2931 2932 __ movw(rscratch2, len_reg); 2933 2934 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2935 2936 __ ld1(v2, __ T16B, rvec); 2937 2938 __ ld1(v31, __ T16B, __ post(key, 16)); 2939 __ rev32(v31, __ T16B, v31); 2940 2941 __ cmpw(keylen, 52); 2942 __ br(Assembler::CC, L_loadkeys_44); 2943 __ br(Assembler::EQ, L_loadkeys_52); 2944 2945 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2946 __ rev32(v17, __ T16B, v17); 2947 __ rev32(v18, __ T16B, v18); 2948 __ BIND(L_loadkeys_52); 2949 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2950 __ rev32(v19, __ T16B, v19); 2951 __ rev32(v20, __ T16B, v20); 2952 __ BIND(L_loadkeys_44); 2953 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2954 __ rev32(v21, __ T16B, v21); 2955 __ rev32(v22, __ T16B, v22); 2956 __ rev32(v23, __ T16B, v23); 2957 __ rev32(v24, __ T16B, v24); 2958 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2959 __ rev32(v25, __ T16B, v25); 2960 __ rev32(v26, __ T16B, v26); 2961 __ rev32(v27, __ T16B, v27); 2962 __ rev32(v28, __ T16B, v28); 2963 __ ld1(v29, v30, __ T16B, key); 2964 __ rev32(v29, __ T16B, v29); 2965 __ rev32(v30, __ T16B, v30); 2966 2967 __ BIND(L_aes_loop); 2968 __ ld1(v0, __ T16B, __ post(from, 16)); 2969 __ orr(v1, __ T16B, v0, v0); 2970 2971 __ br(Assembler::CC, L_rounds_44); 2972 __ br(Assembler::EQ, L_rounds_52); 2973 2974 __ aesd(v0, v17); __ aesimc(v0, v0); 2975 __ aesd(v0, v18); __ aesimc(v0, v0); 2976 __ BIND(L_rounds_52); 2977 __ aesd(v0, v19); __ aesimc(v0, v0); 2978 __ aesd(v0, v20); __ aesimc(v0, v0); 2979 
__ BIND(L_rounds_44); 2980 __ aesd(v0, v21); __ aesimc(v0, v0); 2981 __ aesd(v0, v22); __ aesimc(v0, v0); 2982 __ aesd(v0, v23); __ aesimc(v0, v0); 2983 __ aesd(v0, v24); __ aesimc(v0, v0); 2984 __ aesd(v0, v25); __ aesimc(v0, v0); 2985 __ aesd(v0, v26); __ aesimc(v0, v0); 2986 __ aesd(v0, v27); __ aesimc(v0, v0); 2987 __ aesd(v0, v28); __ aesimc(v0, v0); 2988 __ aesd(v0, v29); __ aesimc(v0, v0); 2989 __ aesd(v0, v30); 2990 __ eor(v0, __ T16B, v0, v31); 2991 __ eor(v0, __ T16B, v0, v2); 2992 2993 __ st1(v0, __ T16B, __ post(to, 16)); 2994 __ orr(v2, __ T16B, v1, v1); 2995 2996 __ subw(len_reg, len_reg, 16); 2997 __ cbnzw(len_reg, L_aes_loop); 2998 2999 __ st1(v2, __ T16B, rvec); 3000 3001 __ mov(r0, rscratch2); 3002 3003 __ leave(); 3004 __ ret(lr); 3005 3006 return start; 3007 } 3008 3009 // Big-endian 128-bit + 64-bit -> 128-bit addition. 3010 // Inputs: 128-bits. in is preserved. 3011 // The least-significant 64-bit word is in the upper dword of each vector. 3012 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 3013 // Output: result 3014 void be_add_128_64(FloatRegister result, FloatRegister in, 3015 FloatRegister inc, FloatRegister tmp) { 3016 assert_different_registers(result, tmp, inc); 3017 3018 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 3019 // input 3020 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3021 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3022 // MSD == 0 (must be!) to LSD 3023 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3024 } 3025 3026 // CTR AES crypt. 3027 // Arguments: 3028 // 3029 // Inputs: 3030 // c_rarg0 - source byte array address 3031 // c_rarg1 - destination byte array address 3032 // c_rarg2 - K (key) in little endian int array 3033 // c_rarg3 - counter vector byte array address 3034 // c_rarg4 - input length 3035 // c_rarg5 - saved encryptedCounter start 3036 // c_rarg6 - saved used length 3037 // 3038 // Output: 3039 // r0 - input length 3040 // 3041 address generate_counterMode_AESCrypt() { 3042 const Register in = c_rarg0; 3043 const Register out = c_rarg1; 3044 const Register key = c_rarg2; 3045 const Register counter = c_rarg3; 3046 const Register saved_len = c_rarg4, len = r10; 3047 const Register saved_encrypted_ctr = c_rarg5; 3048 const Register used_ptr = c_rarg6, used = r12; 3049 3050 const Register offset = r7; 3051 const Register keylen = r11; 3052 3053 const unsigned char block_size = 16; 3054 const int bulk_width = 4; 3055 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3056 // performance with larger data sizes, but it also means that the 3057 // fast path isn't used until you have at least 8 blocks, and up 3058 // to 127 bytes of data will be executed on the slow path. For 3059 // that reason, and also so as not to blow away too much icache, 4 3060 // blocks seems like a sensible compromise. 
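    // For reference, be_add_128_64 (defined above) performs this 128-bit
    // big-endian increment in C terms (an illustrative sketch, not stub code):
    //
    //   uint64_t lo = in_lo + inc_lo;          // least-significant dword
    //   uint64_t hi = in_hi + (lo < inc_lo);   // carry into the most-significant dword
    //   result = { hi, lo };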
3061 3062 // Algorithm: 3063 // 3064 // if (len == 0) { 3065 // goto DONE; 3066 // } 3067 // int result = len; 3068 // do { 3069 // if (used >= blockSize) { 3070 // if (len >= bulk_width * blockSize) { 3071 // CTR_large_block(); 3072 // if (len == 0) 3073 // goto DONE; 3074 // } 3075 // for (;;) { 3076 // 16ByteVector v0 = counter; 3077 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3078 // used = 0; 3079 // if (len < blockSize) 3080 // break; /* goto NEXT */ 3081 // 16ByteVector v1 = load16Bytes(in, offset); 3082 // v1 = v1 ^ encryptedCounter; 3083 // store16Bytes(out, offset); 3084 // used = blockSize; 3085 // offset += blockSize; 3086 // len -= blockSize; 3087 // if (len == 0) 3088 // goto DONE; 3089 // } 3090 // } 3091 // NEXT: 3092 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3093 // len--; 3094 // } while (len != 0); 3095 // DONE: 3096 // return result; 3097 // 3098 // CTR_large_block() 3099 // Wide bulk encryption of whole blocks. 3100 3101 __ align(CodeEntryAlignment); 3102 StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id; 3103 StubCodeMark mark(this, stub_id); 3104 const address start = __ pc(); 3105 __ enter(); 3106 3107 Label DONE, CTR_large_block, large_block_return; 3108 __ ldrw(used, Address(used_ptr)); 3109 __ cbzw(saved_len, DONE); 3110 3111 __ mov(len, saved_len); 3112 __ mov(offset, 0); 3113 3114 // Compute #rounds for AES based on the length of the key array 3115 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3116 3117 __ aesenc_loadkeys(key, keylen); 3118 3119 { 3120 Label L_CTR_loop, NEXT; 3121 3122 __ bind(L_CTR_loop); 3123 3124 __ cmp(used, block_size); 3125 __ br(__ LO, NEXT); 3126 3127 // Maybe we have a lot of data 3128 __ subsw(rscratch1, len, bulk_width * block_size); 3129 __ br(__ HS, CTR_large_block); 3130 __ BIND(large_block_return); 3131 __ cbzw(len, DONE); 3132 3133 // Setup the counter 3134 __ movi(v4, __ T4S, 0); 3135 __ movi(v5, __ T4S, 1); 3136 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3137 3138 // 128-bit big-endian increment 3139 __ ld1(v0, __ T16B, counter); 3140 __ rev64(v16, __ T16B, v0); 3141 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3142 __ rev64(v16, __ T16B, v16); 3143 __ st1(v16, __ T16B, counter); 3144 // Previous counter value is in v0 3145 // v4 contains { 0, 1 } 3146 3147 { 3148 // We have fewer than bulk_width blocks of data left. Encrypt 3149 // them one by one until there is less than a full block 3150 // remaining, being careful to save both the encrypted counter 3151 // and the counter. 3152 3153 Label inner_loop; 3154 __ bind(inner_loop); 3155 // Counter to encrypt is in v0 3156 __ aesecb_encrypt(noreg, noreg, keylen); 3157 __ st1(v0, __ T16B, saved_encrypted_ctr); 3158 3159 // Do we have a remaining full block? 
3160 3161 __ mov(used, 0); 3162 __ cmp(len, block_size); 3163 __ br(__ LO, NEXT); 3164 3165 // Yes, we have a full block 3166 __ ldrq(v1, Address(in, offset)); 3167 __ eor(v1, __ T16B, v1, v0); 3168 __ strq(v1, Address(out, offset)); 3169 __ mov(used, block_size); 3170 __ add(offset, offset, block_size); 3171 3172 __ subw(len, len, block_size); 3173 __ cbzw(len, DONE); 3174 3175 // Increment the counter, store it back 3176 __ orr(v0, __ T16B, v16, v16); 3177 __ rev64(v16, __ T16B, v16); 3178 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3179 __ rev64(v16, __ T16B, v16); 3180 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3181 3182 __ b(inner_loop); 3183 } 3184 3185 __ BIND(NEXT); 3186 3187 // Encrypt a single byte, and loop. 3188 // We expect this to be a rare event. 3189 __ ldrb(rscratch1, Address(in, offset)); 3190 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3191 __ eor(rscratch1, rscratch1, rscratch2); 3192 __ strb(rscratch1, Address(out, offset)); 3193 __ add(offset, offset, 1); 3194 __ add(used, used, 1); 3195 __ subw(len, len,1); 3196 __ cbnzw(len, L_CTR_loop); 3197 } 3198 3199 __ bind(DONE); 3200 __ strw(used, Address(used_ptr)); 3201 __ mov(r0, saved_len); 3202 3203 __ leave(); // required for proper stackwalking of RuntimeStub frame 3204 __ ret(lr); 3205 3206 // Bulk encryption 3207 3208 __ BIND (CTR_large_block); 3209 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3210 3211 if (bulk_width == 8) { 3212 __ sub(sp, sp, 4 * 16); 3213 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3214 } 3215 __ sub(sp, sp, 4 * 16); 3216 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3217 RegSet saved_regs = (RegSet::of(in, out, offset) 3218 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3219 __ push(saved_regs, sp); 3220 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3221 __ add(in, in, offset); 3222 __ add(out, out, offset); 3223 3224 // Keys should already be loaded into the correct registers 3225 3226 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3227 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3228 3229 // AES/CTR loop 3230 { 3231 Label L_CTR_loop; 3232 __ BIND(L_CTR_loop); 3233 3234 // Setup the counters 3235 __ movi(v8, __ T4S, 0); 3236 __ movi(v9, __ T4S, 1); 3237 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3238 3239 for (int i = 0; i < bulk_width; i++) { 3240 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3241 __ rev64(v0_ofs, __ T16B, v16); 3242 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3243 } 3244 3245 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3246 3247 // Encrypt the counters 3248 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3249 3250 if (bulk_width == 8) { 3251 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3252 } 3253 3254 // XOR the encrypted counters with the inputs 3255 for (int i = 0; i < bulk_width; i++) { 3256 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3257 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3258 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3259 } 3260 3261 // Write the encrypted data 3262 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3263 if (bulk_width == 8) { 3264 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3265 } 3266 3267 __ subw(len, len, 16 * bulk_width); 3268 __ cbnzw(len, L_CTR_loop); 3269 } 3270 3271 // Save the counter back where it goes 3272 __ rev64(v16, __ T16B, v16); 3273 __ st1(v16, __ T16B, counter); 3274 3275 __ pop(saved_regs, sp); 
3276 3277 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3278 if (bulk_width == 8) { 3279 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3280 } 3281 3282 __ andr(rscratch1, len, -16 * bulk_width); 3283 __ sub(len, len, rscratch1); 3284 __ add(offset, offset, rscratch1); 3285 __ mov(used, 16); 3286 __ strw(used, Address(used_ptr)); 3287 __ b(large_block_return); 3288 3289 return start; 3290 } 3291 3292 // Vector AES Galois Counter Mode implementation. Parameters: 3293 // 3294 // in = c_rarg0 3295 // len = c_rarg1 3296 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3297 // out = c_rarg3 3298 // key = c_rarg4 3299 // state = c_rarg5 - GHASH.state 3300 // subkeyHtbl = c_rarg6 - powers of H 3301 // counter = c_rarg7 - 16 bytes of CTR 3302 // return - number of processed bytes 3303 address generate_galoisCounterMode_AESCrypt() { 3304 address ghash_polynomial = __ pc(); 3305 __ emit_int64(0x87); // The low-order bits of the field 3306 // polynomial (i.e. p = z^7+z^2+z+1) 3307 // repeated in the low and high parts of a 3308 // 128-bit vector 3309 __ emit_int64(0x87); 3310 3311 __ align(CodeEntryAlignment); 3312 StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id; 3313 StubCodeMark mark(this, stub_id); 3314 address start = __ pc(); 3315 __ enter(); 3316 3317 const Register in = c_rarg0; 3318 const Register len = c_rarg1; 3319 const Register ct = c_rarg2; 3320 const Register out = c_rarg3; 3321 // and updated with the incremented counter in the end 3322 3323 const Register key = c_rarg4; 3324 const Register state = c_rarg5; 3325 3326 const Register subkeyHtbl = c_rarg6; 3327 3328 const Register counter = c_rarg7; 3329 3330 const Register keylen = r10; 3331 // Save state before entering routine 3332 __ sub(sp, sp, 4 * 16); 3333 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3334 __ sub(sp, sp, 4 * 16); 3335 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3336 3337 // __ andr(len, len, -512); 3338 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3339 __ str(len, __ pre(sp, -2 * wordSize)); 3340 3341 Label DONE; 3342 __ cbz(len, DONE); 3343 3344 // Compute #rounds for AES based on the length of the key array 3345 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3346 3347 __ aesenc_loadkeys(key, keylen); 3348 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3349 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3350 3351 // AES/CTR loop 3352 { 3353 Label L_CTR_loop; 3354 __ BIND(L_CTR_loop); 3355 3356 // Setup the counters 3357 __ movi(v8, __ T4S, 0); 3358 __ movi(v9, __ T4S, 1); 3359 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3360 3361 assert(v0->encoding() < v8->encoding(), ""); 3362 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3363 FloatRegister f = as_FloatRegister(i); 3364 __ rev32(f, __ T16B, v16); 3365 __ addv(v16, __ T4S, v16, v8); 3366 } 3367 3368 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3369 3370 // Encrypt the counters 3371 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3372 3373 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3374 3375 // XOR the encrypted counters with the inputs 3376 for (int i = 0; i < 8; i++) { 3377 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3378 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3379 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3380 } 3381 __ st1(v0, v1, v2, v3, __ T16B, __ 
post(out, 4 * 16)); 3382 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3383 3384 __ subw(len, len, 16 * 8); 3385 __ cbnzw(len, L_CTR_loop); 3386 } 3387 3388 __ rev32(v16, __ T16B, v16); 3389 __ st1(v16, __ T16B, counter); 3390 3391 __ ldr(len, Address(sp)); 3392 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3393 3394 // GHASH/CTR loop 3395 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3396 len, /*unrolls*/4); 3397 3398 #ifdef ASSERT 3399 { Label L; 3400 __ cmp(len, (unsigned char)0); 3401 __ br(Assembler::EQ, L); 3402 __ stop("stubGenerator: abort"); 3403 __ bind(L); 3404 } 3405 #endif 3406 3407 __ bind(DONE); 3408 // Return the number of bytes processed 3409 __ ldr(r0, __ post(sp, 2 * wordSize)); 3410 3411 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3412 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3413 3414 __ leave(); // required for proper stackwalking of RuntimeStub frame 3415 __ ret(lr); 3416 return start; 3417 } 3418 3419 class Cached64Bytes { 3420 private: 3421 MacroAssembler *_masm; 3422 Register _regs[8]; 3423 3424 public: 3425 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3426 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3427 auto it = rs.begin(); 3428 for (auto &r: _regs) { 3429 r = *it; 3430 ++it; 3431 } 3432 } 3433 3434 void gen_loads(Register base) { 3435 for (int i = 0; i < 8; i += 2) { 3436 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3437 } 3438 } 3439 3440 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3441 void extract_u32(Register dest, int i) { 3442 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3443 } 3444 }; 3445 3446 // Utility routines for md5. 3447 // Clobbers r10 and r11. 
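  // As an illustrative scalar sketch (comments only, not compiled), each
  // md5_FF step below computes, with (r1, r2, r3, r4) playing the roles of
  // (a, b, c, d) and X[k] the k-th 32-bit word of the cached 64-byte block:
  //
  //   F(b, c, d) = (b & c) | (~b & d)        // evaluated as ((c ^ d) & b) ^ d
  //   a = b + rotl32(a + F(b, c, d) + X[k] + t, s)
  //
  // md5_GG, md5_HH and md5_II follow the same pattern with
  //   G(b, c, d) = (b & d) | (c & ~d)
  //   H(b, c, d) = b ^ c ^ d
  //   I(b, c, d) = c ^ (b | ~d)
  // The left-rotate by s is emitted as rorw by (32 - s).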
3448 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3449 int k, int s, int t) { 3450 Register rscratch3 = r10; 3451 Register rscratch4 = r11; 3452 3453 __ eorw(rscratch3, r3, r4); 3454 __ movw(rscratch2, t); 3455 __ andw(rscratch3, rscratch3, r2); 3456 __ addw(rscratch4, r1, rscratch2); 3457 reg_cache.extract_u32(rscratch1, k); 3458 __ eorw(rscratch3, rscratch3, r4); 3459 __ addw(rscratch4, rscratch4, rscratch1); 3460 __ addw(rscratch3, rscratch3, rscratch4); 3461 __ rorw(rscratch2, rscratch3, 32 - s); 3462 __ addw(r1, rscratch2, r2); 3463 } 3464 3465 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3466 int k, int s, int t) { 3467 Register rscratch3 = r10; 3468 Register rscratch4 = r11; 3469 3470 reg_cache.extract_u32(rscratch1, k); 3471 __ movw(rscratch2, t); 3472 __ addw(rscratch4, r1, rscratch2); 3473 __ addw(rscratch4, rscratch4, rscratch1); 3474 __ bicw(rscratch2, r3, r4); 3475 __ andw(rscratch3, r2, r4); 3476 __ addw(rscratch2, rscratch2, rscratch4); 3477 __ addw(rscratch2, rscratch2, rscratch3); 3478 __ rorw(rscratch2, rscratch2, 32 - s); 3479 __ addw(r1, rscratch2, r2); 3480 } 3481 3482 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3483 int k, int s, int t) { 3484 Register rscratch3 = r10; 3485 Register rscratch4 = r11; 3486 3487 __ eorw(rscratch3, r3, r4); 3488 __ movw(rscratch2, t); 3489 __ addw(rscratch4, r1, rscratch2); 3490 reg_cache.extract_u32(rscratch1, k); 3491 __ eorw(rscratch3, rscratch3, r2); 3492 __ addw(rscratch4, rscratch4, rscratch1); 3493 __ addw(rscratch3, rscratch3, rscratch4); 3494 __ rorw(rscratch2, rscratch3, 32 - s); 3495 __ addw(r1, rscratch2, r2); 3496 } 3497 3498 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3499 int k, int s, int t) { 3500 Register rscratch3 = r10; 3501 Register rscratch4 = r11; 3502 3503 __ movw(rscratch3, t); 3504 __ ornw(rscratch2, r2, r4); 3505 __ addw(rscratch4, r1, rscratch3); 3506 reg_cache.extract_u32(rscratch1, k); 3507 __ eorw(rscratch3, rscratch2, r3); 3508 __ addw(rscratch4, rscratch4, rscratch1); 3509 __ addw(rscratch3, rscratch3, rscratch4); 3510 __ rorw(rscratch2, rscratch3, 32 - s); 3511 __ addw(r1, rscratch2, r2); 3512 } 3513 3514 // Arguments: 3515 // 3516 // Inputs: 3517 // c_rarg0 - byte[] source+offset 3518 // c_rarg1 - int[] SHA.state 3519 // c_rarg2 - int offset 3520 // c_rarg3 - int limit 3521 // 3522 address generate_md5_implCompress(StubGenStubId stub_id) { 3523 bool multi_block; 3524 switch (stub_id) { 3525 case md5_implCompress_id: 3526 multi_block = false; 3527 break; 3528 case md5_implCompressMB_id: 3529 multi_block = true; 3530 break; 3531 default: 3532 ShouldNotReachHere(); 3533 } 3534 __ align(CodeEntryAlignment); 3535 3536 StubCodeMark mark(this, stub_id); 3537 address start = __ pc(); 3538 3539 Register buf = c_rarg0; 3540 Register state = c_rarg1; 3541 Register ofs = c_rarg2; 3542 Register limit = c_rarg3; 3543 Register a = r4; 3544 Register b = r5; 3545 Register c = r6; 3546 Register d = r7; 3547 Register rscratch3 = r10; 3548 Register rscratch4 = r11; 3549 3550 Register state_regs[2] = { r12, r13 }; 3551 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3552 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3553 3554 __ push(saved_regs, sp); 3555 3556 __ ldp(state_regs[0], state_regs[1], Address(state)); 3557 __ ubfx(a, state_regs[0], 0, 32); 3558 __ ubfx(b, state_regs[0], 32, 32); 3559 __ 
ubfx(c, state_regs[1], 0, 32); 3560 __ ubfx(d, state_regs[1], 32, 32); 3561 3562 Label md5_loop; 3563 __ BIND(md5_loop); 3564 3565 reg_cache.gen_loads(buf); 3566 3567 // Round 1 3568 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3569 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3570 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3571 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3572 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3573 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3574 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3575 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3576 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3577 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3578 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3579 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3580 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3581 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3582 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3583 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3584 3585 // Round 2 3586 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3587 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3588 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3589 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3590 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3591 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3592 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3593 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3594 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3595 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3596 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3597 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3598 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3599 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3600 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3601 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3602 3603 // Round 3 3604 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3605 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3606 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3607 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3608 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3609 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3610 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3611 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3612 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3613 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3614 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3615 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3616 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3617 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3618 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3619 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3620 3621 // Round 4 3622 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3623 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3624 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3625 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3626 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3627 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3628 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3629 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3630 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3631 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3632 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3633 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3634 
md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3635 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3636 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3637 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3638 3639 __ addw(a, state_regs[0], a); 3640 __ ubfx(rscratch2, state_regs[0], 32, 32); 3641 __ addw(b, rscratch2, b); 3642 __ addw(c, state_regs[1], c); 3643 __ ubfx(rscratch4, state_regs[1], 32, 32); 3644 __ addw(d, rscratch4, d); 3645 3646 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3647 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3648 3649 if (multi_block) { 3650 __ add(buf, buf, 64); 3651 __ add(ofs, ofs, 64); 3652 __ cmp(ofs, limit); 3653 __ br(Assembler::LE, md5_loop); 3654 __ mov(c_rarg0, ofs); // return ofs 3655 } 3656 3657 // write hash values back in the correct order 3658 __ stp(state_regs[0], state_regs[1], Address(state)); 3659 3660 __ pop(saved_regs, sp); 3661 3662 __ ret(lr); 3663 3664 return start; 3665 } 3666 3667 // Arguments: 3668 // 3669 // Inputs: 3670 // c_rarg0 - byte[] source+offset 3671 // c_rarg1 - int[] SHA.state 3672 // c_rarg2 - int offset 3673 // c_rarg3 - int limit 3674 // 3675 address generate_sha1_implCompress(StubGenStubId stub_id) { 3676 bool multi_block; 3677 switch (stub_id) { 3678 case sha1_implCompress_id: 3679 multi_block = false; 3680 break; 3681 case sha1_implCompressMB_id: 3682 multi_block = true; 3683 break; 3684 default: 3685 ShouldNotReachHere(); 3686 } 3687 3688 __ align(CodeEntryAlignment); 3689 3690 StubCodeMark mark(this, stub_id); 3691 address start = __ pc(); 3692 3693 Register buf = c_rarg0; 3694 Register state = c_rarg1; 3695 Register ofs = c_rarg2; 3696 Register limit = c_rarg3; 3697 3698 Label keys; 3699 Label sha1_loop; 3700 3701 // load the keys into v0..v3 3702 __ adr(rscratch1, keys); 3703 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3704 // load 5 words state into v6, v7 3705 __ ldrq(v6, Address(state, 0)); 3706 __ ldrs(v7, Address(state, 16)); 3707 3708 3709 __ BIND(sha1_loop); 3710 // load 64 bytes of data into v16..v19 3711 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3712 __ rev32(v16, __ T16B, v16); 3713 __ rev32(v17, __ T16B, v17); 3714 __ rev32(v18, __ T16B, v18); 3715 __ rev32(v19, __ T16B, v19); 3716 3717 // do the sha1 3718 __ addv(v4, __ T4S, v16, v0); 3719 __ orr(v20, __ T16B, v6, v6); 3720 3721 FloatRegister d0 = v16; 3722 FloatRegister d1 = v17; 3723 FloatRegister d2 = v18; 3724 FloatRegister d3 = v19; 3725 3726 for (int round = 0; round < 20; round++) { 3727 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3728 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3729 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3730 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3731 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3732 3733 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3734 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3735 __ sha1h(tmp2, __ T4S, v20); 3736 if (round < 5) 3737 __ sha1c(v20, __ T4S, tmp3, tmp4); 3738 else if (round < 10 || round >= 15) 3739 __ sha1p(v20, __ T4S, tmp3, tmp4); 3740 else 3741 __ sha1m(v20, __ T4S, tmp3, tmp4); 3742 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3743 3744 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3745 } 3746 3747 __ addv(v7, __ T2S, v7, v21); 3748 __ addv(v6, __ T4S, v6, v20); 3749 3750 if (multi_block) { 3751 __ add(ofs, ofs, 64); 3752 __ cmp(ofs, limit); 3753 __ br(Assembler::LE, sha1_loop); 3754 __ mov(c_rarg0, ofs); // return ofs 3755 } 3756 3757 __ strq(v6, Address(state, 0)); 3758 __ strs(v7, Address(state, 16)); 3759 3760 __ ret(lr); 3761 3762 __ bind(keys); 3763 __ emit_int32(0x5a827999); 3764 __ emit_int32(0x6ed9eba1); 3765 __ emit_int32(0x8f1bbcdc); 3766 __ emit_int32(0xca62c1d6); 3767 3768 return start; 3769 } 3770 3771 3772 // Arguments: 3773 // 3774 // Inputs: 3775 // c_rarg0 - byte[] source+offset 3776 // c_rarg1 - int[] SHA.state 3777 // c_rarg2 - int offset 3778 // c_rarg3 - int limit 3779 // 3780 address generate_sha256_implCompress(StubGenStubId stub_id) { 3781 bool multi_block; 3782 switch (stub_id) { 3783 case sha256_implCompress_id: 3784 multi_block = false; 3785 break; 3786 case sha256_implCompressMB_id: 3787 multi_block = true; 3788 break; 3789 default: 3790 ShouldNotReachHere(); 3791 } 3792 3793 static const uint32_t round_consts[64] = { 3794 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3795 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3796 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3797 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3798 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3799 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3800 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3801 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3802 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3803 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3804 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3805 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3806 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3807 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3808 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3809 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3810 }; 3811 3812 __ align(CodeEntryAlignment); 3813 3814 StubCodeMark mark(this, stub_id); 3815 address start = __ pc(); 3816 3817 Register buf = c_rarg0; 3818 Register state = c_rarg1; 3819 Register ofs = c_rarg2; 3820 Register limit = c_rarg3; 3821 3822 Label sha1_loop; 3823 3824 __ stpd(v8, v9, __ pre(sp, -32)); 3825 __ stpd(v10, v11, Address(sp, 16)); 3826 3827 // dga == v0 3828 // dgb == v1 3829 // dg0 == v2 3830 // dg1 == v3 3831 // dg2 == v4 3832 // t0 == v6 3833 // t1 == v7 3834 3835 // load 16 keys to v16..v31 3836 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3837 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3838 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3839 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3840 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3841 3842 // load 8 words (256 bits) state 3843 __ ldpq(v0, v1, state); 3844 3845 __ BIND(sha1_loop); 3846 // load 64 bytes of data into v8..v11 3847 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3848 __ rev32(v8, __ T16B, v8); 3849 __ rev32(v9, __ T16B, v9); 3850 __ rev32(v10, __ T16B, v10); 3851 __ rev32(v11, __ T16B, v11); 3852 3853 __ addv(v6, __ T4S, v8, v16); 3854 __ orr(v2, __ T16B, v0, v0); 3855 __ orr(v3, __ T16B, v1, v1); 3856 3857 FloatRegister d0 = v8; 3858 FloatRegister d1 = v9; 3859 FloatRegister d2 = v10; 3860 FloatRegister d3 = v11; 3861 3862 3863 for (int round = 0; round < 16; round++) { 3864 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3865 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3866 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3867 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3868 3869 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3870 __ orr(v4, __ T16B, v2, v2); 3871 if (round < 15) 3872 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3873 __ sha256h(v2, __ T4S, v3, tmp2); 3874 __ sha256h2(v3, __ T4S, v4, tmp2); 3875 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3876 3877 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3878 } 3879 3880 __ addv(v0, __ T4S, v0, v2); 3881 __ addv(v1, __ T4S, v1, v3); 3882 3883 if (multi_block) { 3884 __ add(ofs, ofs, 64); 3885 __ cmp(ofs, limit); 3886 __ br(Assembler::LE, sha1_loop); 3887 __ mov(c_rarg0, ofs); // return ofs 3888 } 3889 3890 __ ldpd(v10, v11, Address(sp, 16)); 3891 __ ldpd(v8, v9, __ post(sp, 32)); 3892 3893 __ stpq(v0, v1, state); 3894 3895 __ ret(lr); 3896 3897 return start; 3898 } 3899 3900 // Double rounds for sha512. 3901 void sha512_dround(int dr, 3902 FloatRegister vi0, FloatRegister vi1, 3903 FloatRegister vi2, FloatRegister vi3, 3904 FloatRegister vi4, FloatRegister vrc0, 3905 FloatRegister vrc1, FloatRegister vin0, 3906 FloatRegister vin1, FloatRegister vin2, 3907 FloatRegister vin3, FloatRegister vin4) { 3908 if (dr < 36) { 3909 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3910 } 3911 __ addv(v5, __ T2D, vrc0, vin0); 3912 __ ext(v6, __ T16B, vi2, vi3, 8); 3913 __ ext(v5, __ T16B, v5, v5, 8); 3914 __ ext(v7, __ T16B, vi1, vi2, 8); 3915 __ addv(vi3, __ T2D, vi3, v5); 3916 if (dr < 32) { 3917 __ ext(v5, __ T16B, vin3, vin4, 8); 3918 __ sha512su0(vin0, __ T2D, vin1); 3919 } 3920 __ sha512h(vi3, __ T2D, v6, v7); 3921 if (dr < 32) { 3922 __ sha512su1(vin0, __ T2D, vin2, v5); 3923 } 3924 __ addv(vi4, __ T2D, vi1, vi3); 3925 __ sha512h2(vi3, __ T2D, vi1, vi0); 3926 } 3927 3928 // Arguments: 3929 // 3930 // Inputs: 3931 // c_rarg0 - byte[] source+offset 3932 // c_rarg1 - int[] SHA.state 3933 // c_rarg2 - int offset 3934 // c_rarg3 - int limit 3935 // 3936 address generate_sha512_implCompress(StubGenStubId stub_id) { 3937 bool multi_block; 3938 switch (stub_id) { 3939 case sha512_implCompress_id: 3940 multi_block = false; 3941 break; 3942 case sha512_implCompressMB_id: 3943 multi_block = true; 3944 break; 3945 default: 3946 ShouldNotReachHere(); 3947 } 3948 3949 static const uint64_t round_consts[80] = { 3950 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3951 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3952 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3953 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3954 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3955 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3956 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3957 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3958 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3959 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 
3960 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3961 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3962 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3963 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3964 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3965 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3966 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3967 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3968 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3969 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3970 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3971 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3972 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3973 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3974 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3975 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3976 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3977 }; 3978 3979 __ align(CodeEntryAlignment); 3980 3981 StubCodeMark mark(this, stub_id); 3982 address start = __ pc(); 3983 3984 Register buf = c_rarg0; 3985 Register state = c_rarg1; 3986 Register ofs = c_rarg2; 3987 Register limit = c_rarg3; 3988 3989 __ stpd(v8, v9, __ pre(sp, -64)); 3990 __ stpd(v10, v11, Address(sp, 16)); 3991 __ stpd(v12, v13, Address(sp, 32)); 3992 __ stpd(v14, v15, Address(sp, 48)); 3993 3994 Label sha512_loop; 3995 3996 // load state 3997 __ ld1(v8, v9, v10, v11, __ T2D, state); 3998 3999 // load first 4 round constants 4000 __ lea(rscratch1, ExternalAddress((address)round_consts)); 4001 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 4002 4003 __ BIND(sha512_loop); 4004 // load 128B of data into v12..v19 4005 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 4006 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 4007 __ rev64(v12, __ T16B, v12); 4008 __ rev64(v13, __ T16B, v13); 4009 __ rev64(v14, __ T16B, v14); 4010 __ rev64(v15, __ T16B, v15); 4011 __ rev64(v16, __ T16B, v16); 4012 __ rev64(v17, __ T16B, v17); 4013 __ rev64(v18, __ T16B, v18); 4014 __ rev64(v19, __ T16B, v19); 4015 4016 __ mov(rscratch2, rscratch1); 4017 4018 __ mov(v0, __ T16B, v8); 4019 __ mov(v1, __ T16B, v9); 4020 __ mov(v2, __ T16B, v10); 4021 __ mov(v3, __ T16B, v11); 4022 4023 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4024 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4025 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4026 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4027 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4028 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4029 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4030 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4031 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4032 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4033 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4034 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4035 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4036 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, 
v14); 4037 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4038 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4039 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4040 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4041 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4042 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4043 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4044 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4045 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4046 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4047 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4048 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4049 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4050 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4051 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4052 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4053 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4054 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4055 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 4056 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4057 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4058 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4059 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4060 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4061 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4062 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4063 4064 __ addv(v8, __ T2D, v8, v0); 4065 __ addv(v9, __ T2D, v9, v1); 4066 __ addv(v10, __ T2D, v10, v2); 4067 __ addv(v11, __ T2D, v11, v3); 4068 4069 if (multi_block) { 4070 __ add(ofs, ofs, 128); 4071 __ cmp(ofs, limit); 4072 __ br(Assembler::LE, sha512_loop); 4073 __ mov(c_rarg0, ofs); // return ofs 4074 } 4075 4076 __ st1(v8, v9, v10, v11, __ T2D, state); 4077 4078 __ ldpd(v14, v15, Address(sp, 48)); 4079 __ ldpd(v12, v13, Address(sp, 32)); 4080 __ ldpd(v10, v11, Address(sp, 16)); 4081 __ ldpd(v8, v9, __ post(sp, 64)); 4082 4083 __ ret(lr); 4084 4085 return start; 4086 } 4087 4088 // Execute one round of keccak of two computations in parallel. 4089 // One of the states should be loaded into the lower halves of 4090 // the vector registers v0-v24, the other should be loaded into 4091 // the upper halves of those registers. The ld1r instruction loads 4092 // the round constant into both halves of register v31. 4093 // Intermediate results c0...c5 and d0...d5 are computed 4094 // in registers v25...v30. 4095 // All vector instructions that are used operate on both register 4096 // halves in parallel. 4097 // If only a single computation is needed, one can only load the lower halves. 
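  // For reference, one round of Keccak-f[1600] over the 25 64-bit lanes
  // a0..a24 (textbook form; the code below fuses these steps, and lanes that
  // end up in renamed registers are marked with ' in the comments):
  //
  //   theta: c_x = a_x ^ a_(x+5) ^ a_(x+10) ^ a_(x+15) ^ a_(x+20), x = 0..4
  //          d_x = c_((x+4)%5) ^ rol64(c_((x+1)%5), 1)
  //          each lane in column x is xored with d_x
  //   rho/pi: each lane is rotated by a per-lane constant and moved to its
  //          new position (fused into the xar xor-and-rotate instructions)
  //   chi:   a_i = a_i ^ (~a_j & a_k), where a_j and a_k are the next two
  //          lanes of the same row (the bcax bit-clear-and-xor instructions)
  //   iota:  a_0 ^= round_constants[i]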
4098 void keccak_round(Register rscratch1) { 4099 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14 4100 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a16 ^ a11 4101 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^a13 4102 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10 4103 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12 4104 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24 4105 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21 4106 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23 4107 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20 4108 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22 4109 4110 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1) 4111 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1) 4112 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1) 4113 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1) 4114 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1) 4115 4116 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0 4117 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1) 4118 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol(a6^d1), 44) 4119 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20) 4120 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61) 4121 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39) 4122 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18) 4123 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62) 4124 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43) 4125 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25) 4126 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8) 4127 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56) 4128 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41) 4129 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27) 4130 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14) 4131 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2) 4132 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55) 4133 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45) 4134 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36) 4135 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28) 4136 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21) 4137 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15) 4138 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10) 4139 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6) 4140 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3) 4141 4142 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22') 4143 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23) 4144 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24) 4145 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20') 4146 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21') 4147 4148 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i] 4149 4150 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19) 4151 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15') 4152 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16) 4153 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17') 4154 __ bcax(v16, __ T16B, v16, v3, v25); 
// a16 = a16 ^ (~a17' & a18') 4155 4156 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4157 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4158 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4159 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4160 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4161 4162 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4163 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4164 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4165 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4166 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4167 4168 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4169 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4170 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4171 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4172 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4173 4174 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4175 } 4176 4177 // Arguments: 4178 // 4179 // Inputs: 4180 // c_rarg0 - byte[] source+offset 4181 // c_rarg1 - byte[] SHA.state 4182 // c_rarg2 - int block_size 4183 // c_rarg3 - int offset 4184 // c_rarg4 - int limit 4185 // 4186 address generate_sha3_implCompress(StubGenStubId stub_id) { 4187 bool multi_block; 4188 switch (stub_id) { 4189 case sha3_implCompress_id: 4190 multi_block = false; 4191 break; 4192 case sha3_implCompressMB_id: 4193 multi_block = true; 4194 break; 4195 default: 4196 ShouldNotReachHere(); 4197 } 4198 4199 static const uint64_t round_consts[24] = { 4200 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4201 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4202 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4203 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4204 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4205 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4206 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4207 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4208 }; 4209 4210 __ align(CodeEntryAlignment); 4211 4212 StubCodeMark mark(this, stub_id); 4213 address start = __ pc(); 4214 4215 Register buf = c_rarg0; 4216 Register state = c_rarg1; 4217 Register block_size = c_rarg2; 4218 Register ofs = c_rarg3; 4219 Register limit = c_rarg4; 4220 4221 Label sha3_loop, rounds24_loop; 4222 Label sha3_512_or_sha3_384, shake128; 4223 4224 __ stpd(v8, v9, __ pre(sp, -64)); 4225 __ stpd(v10, v11, Address(sp, 16)); 4226 __ stpd(v12, v13, Address(sp, 32)); 4227 __ stpd(v14, v15, Address(sp, 48)); 4228 4229 // load state 4230 __ add(rscratch1, state, 32); 4231 __ ld1(v0, v1, v2, v3, __ T1D, state); 4232 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4233 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4234 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4235 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4236 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4237 __ ld1(v24, __ T1D, rscratch1); 4238 4239 __ BIND(sha3_loop); 4240 4241 // 24 keccak rounds 4242 __ movw(rscratch2, 24); 4243 4244 // load round_constants base 4245 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4246 4247 // load input 4248 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4249 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 
24)); 4250 __ eor(v0, __ T8B, v0, v25); 4251 __ eor(v1, __ T8B, v1, v26); 4252 __ eor(v2, __ T8B, v2, v27); 4253 __ eor(v3, __ T8B, v3, v28); 4254 __ eor(v4, __ T8B, v4, v29); 4255 __ eor(v5, __ T8B, v5, v30); 4256 __ eor(v6, __ T8B, v6, v31); 4257 4258 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4259 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4260 4261 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4262 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4263 __ eor(v7, __ T8B, v7, v25); 4264 __ eor(v8, __ T8B, v8, v26); 4265 __ eor(v9, __ T8B, v9, v27); 4266 __ eor(v10, __ T8B, v10, v28); 4267 __ eor(v11, __ T8B, v11, v29); 4268 __ eor(v12, __ T8B, v12, v30); 4269 __ eor(v13, __ T8B, v13, v31); 4270 4271 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4272 __ eor(v14, __ T8B, v14, v25); 4273 __ eor(v15, __ T8B, v15, v26); 4274 __ eor(v16, __ T8B, v16, v27); 4275 4276 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4277 __ andw(c_rarg5, block_size, 48); 4278 __ cbzw(c_rarg5, rounds24_loop); 4279 4280 __ tbnz(block_size, 5, shake128); 4281 // block_size == 144, bit5 == 0, SHA3-224 4282 __ ldrd(v28, __ post(buf, 8)); 4283 __ eor(v17, __ T8B, v17, v28); 4284 __ b(rounds24_loop); 4285 4286 __ BIND(shake128); 4287 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4288 __ eor(v17, __ T8B, v17, v28); 4289 __ eor(v18, __ T8B, v18, v29); 4290 __ eor(v19, __ T8B, v19, v30); 4291 __ eor(v20, __ T8B, v20, v31); 4292 __ b(rounds24_loop); // block_size == 168, SHAKE128 4293 4294 __ BIND(sha3_512_or_sha3_384); 4295 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4296 __ eor(v7, __ T8B, v7, v25); 4297 __ eor(v8, __ T8B, v8, v26); 4298 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4299 4300 // SHA3-384 4301 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4302 __ eor(v9, __ T8B, v9, v27); 4303 __ eor(v10, __ T8B, v10, v28); 4304 __ eor(v11, __ T8B, v11, v29); 4305 __ eor(v12, __ T8B, v12, v30); 4306 4307 __ BIND(rounds24_loop); 4308 __ subw(rscratch2, rscratch2, 1); 4309 4310 keccak_round(rscratch1); 4311 4312 __ cbnzw(rscratch2, rounds24_loop); 4313 4314 if (multi_block) { 4315 __ add(ofs, ofs, block_size); 4316 __ cmp(ofs, limit); 4317 __ br(Assembler::LE, sha3_loop); 4318 __ mov(c_rarg0, ofs); // return ofs 4319 } 4320 4321 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4322 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4323 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4324 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4325 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4326 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4327 __ st1(v24, __ T1D, state); 4328 4329 // restore callee-saved registers 4330 __ ldpd(v14, v15, Address(sp, 48)); 4331 __ ldpd(v12, v13, Address(sp, 32)); 4332 __ ldpd(v10, v11, Address(sp, 16)); 4333 __ ldpd(v8, v9, __ post(sp, 64)); 4334 4335 __ ret(lr); 4336 4337 return start; 4338 } 4339 4340 // Inputs: 4341 // c_rarg0 - long[] state0 4342 // c_rarg1 - long[] state1 4343 address generate_double_keccak() { 4344 static const uint64_t round_consts[24] = { 4345 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4346 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4347 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4348 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4349 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4350 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4351 0x000000000000800AL, 
0x800000008000000AL, 0x8000000080008081L, 4352 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4353 }; 4354 4355 // Implements the double_keccak() method of the 4356 // sun.secyrity.provider.SHA3Parallel class 4357 __ align(CodeEntryAlignment); 4358 StubCodeMark mark(this, "StubRoutines", "double_keccak"); 4359 address start = __ pc(); 4360 __ enter(); 4361 4362 Register state0 = c_rarg0; 4363 Register state1 = c_rarg1; 4364 4365 Label rounds24_loop; 4366 4367 // save callee-saved registers 4368 __ stpd(v8, v9, __ pre(sp, -64)); 4369 __ stpd(v10, v11, Address(sp, 16)); 4370 __ stpd(v12, v13, Address(sp, 32)); 4371 __ stpd(v14, v15, Address(sp, 48)); 4372 4373 // load states 4374 __ add(rscratch1, state0, 32); 4375 __ ld4(v0, v1, v2, v3, __ D, 0, state0); 4376 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32)); 4377 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32)); 4378 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32)); 4379 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32)); 4380 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32)); 4381 __ ld1(v24, __ D, 0, rscratch1); 4382 __ add(rscratch1, state1, 32); 4383 __ ld4(v0, v1, v2, v3, __ D, 1, state1); 4384 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32)); 4385 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32)); 4386 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32)); 4387 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32)); 4388 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32)); 4389 __ ld1(v24, __ D, 1, rscratch1); 4390 4391 // 24 keccak rounds 4392 __ movw(rscratch2, 24); 4393 4394 // load round_constants base 4395 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4396 4397 __ BIND(rounds24_loop); 4398 __ subw(rscratch2, rscratch2, 1); 4399 keccak_round(rscratch1); 4400 __ cbnzw(rscratch2, rounds24_loop); 4401 4402 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32)); 4403 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32)); 4404 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32)); 4405 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32)); 4406 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32)); 4407 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32)); 4408 __ st1(v24, __ D, 0, state0); 4409 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32)); 4410 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32)); 4411 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32)); 4412 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32)); 4413 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32)); 4414 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32)); 4415 __ st1(v24, __ D, 1, state1); 4416 4417 // restore callee-saved vector registers 4418 __ ldpd(v14, v15, Address(sp, 48)); 4419 __ ldpd(v12, v13, Address(sp, 32)); 4420 __ ldpd(v10, v11, Address(sp, 16)); 4421 __ ldpd(v8, v9, __ post(sp, 64)); 4422 4423 __ leave(); // required for proper stackwalking of RuntimeStub frame 4424 __ mov(r0, zr); // return 0 4425 __ ret(lr); 4426 4427 return start; 4428 } 4429 4430 // ChaCha20 block function. This version parallelizes the 32-bit 4431 // state elements on each of 16 vectors, producing 4 blocks of 4432 // keystream at a time. 
4432 // keystream at a time.
4433 // 4434 // state (int[16]) = c_rarg0 4435 // keystream (byte[256]) = c_rarg1 4436 // return - number of bytes of produced keystream (always 256) 4437 // 4438 // This implementation takes each 32-bit integer from the state 4439 // array and broadcasts it across all 4 32-bit lanes of a vector register 4440 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes 4441 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors, 4442 // the quarter round schedule is implemented as outlined in RFC 7539 section 4443 // 2.3. However, instead of sequentially processing the 3 quarter round 4444 // operations represented by one QUARTERROUND function, we instead stack all 4445 // the adds, xors and left-rotations from the first 4 quarter rounds together 4446 // and then do the same for the second set of 4 quarter rounds. This removes 4447 // some latency that would otherwise be incurred by waiting for an add to 4448 // complete before performing an xor (which depends on the result of the 4449 // add), etc. An adjustment happens between the first and second groups of 4 4450 // quarter rounds, but this is done only in the inputs to the macro functions 4451 // that generate the assembly instructions - these adjustments themselves are 4452 // not part of the resulting assembly. 4453 // The 4 registers v0-v3 are used during the quarter round operations as 4454 // scratch registers. Once the 20 rounds are complete, these 4 scratch 4455 // registers become the vectors involved in adding the start state back onto 4456 // the post-QR working state. After the adds are complete, each of the 16 4457 // vectors write their first lane back to the keystream buffer, followed 4458 // by the second lane from all vectors and so on. 4459 address generate_chacha20Block_blockpar() { 4460 Label L_twoRounds, L_cc20_const; 4461 // The constant data is broken into two 128-bit segments to be loaded 4462 // onto FloatRegisters. The first 128 bits are a counter add overlay 4463 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4464 // The second 128-bits is a table constant used for 8-bit left rotations. 4465 __ BIND(L_cc20_const); 4466 __ emit_int64(0x0000000100000000UL); 4467 __ emit_int64(0x0000000300000002UL); 4468 __ emit_int64(0x0605040702010003UL); 4469 __ emit_int64(0x0E0D0C0F0A09080BUL); 4470 4471 __ align(CodeEntryAlignment); 4472 StubGenStubId stub_id = StubGenStubId::chacha20Block_id; 4473 StubCodeMark mark(this, stub_id); 4474 address start = __ pc(); 4475 __ enter(); 4476 4477 int i, j; 4478 const Register state = c_rarg0; 4479 const Register keystream = c_rarg1; 4480 const Register loopCtr = r10; 4481 const Register tmpAddr = r11; 4482 const FloatRegister ctrAddOverlay = v28; 4483 const FloatRegister lrot8Tbl = v29; 4484 4485 // Organize SIMD registers in an array that facilitates 4486 // putting repetitive opcodes into loop structures. It is 4487 // important that each grouping of 4 registers is monotonically 4488 // increasing to support the requirements of multi-register 4489 // instructions (e.g. ld4r, st4, etc.) 4490 const FloatRegister workSt[16] = { 4491 v4, v5, v6, v7, v16, v17, v18, v19, 4492 v20, v21, v22, v23, v24, v25, v26, v27 4493 }; 4494 4495 // Pull in constant data. The first 16 bytes are the add overlay 4496 // which is applied to the vector holding the counter (state[12]). 4497 // The second 16 bytes is the index register for the 8-bit left 4498 // rotation tbl instruction. 
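    // For reference, each QUARTERROUND(a, b, c, d) scheduled below is the one
    // from RFC 7539 section 2.3, applied lane-wise across the four parallel
    // blocks:
    //   a += b; d ^= a; d <<<= 16;
    //   c += d; b ^= c; b <<<= 12;
    //   a += b; d ^= a; d <<<= 8;
    //   c += d; b ^= c; b <<<= 7;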
4499 __ adr(tmpAddr, L_cc20_const); 4500 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr)); 4501 4502 // Load from memory and interlace across 16 SIMD registers, 4503 // With each word from memory being broadcast to all lanes of 4504 // each successive SIMD register. 4505 // Addr(0) -> All lanes in workSt[i] 4506 // Addr(4) -> All lanes workSt[i + 1], etc. 4507 __ mov(tmpAddr, state); 4508 for (i = 0; i < 16; i += 4) { 4509 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4510 __ post(tmpAddr, 16)); 4511 } 4512 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4513 4514 // Before entering the loop, create 5 4-register arrays. These 4515 // will hold the 4 registers that represent the a/b/c/d fields 4516 // in the quarter round operation. For instance the "b" field 4517 // for the first 4 quarter round operations is the set of v16/v17/v18/v19, 4518 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16 4519 // since it is part of a diagonal organization. The aSet and scratch 4520 // register sets are defined at declaration time because they do not change 4521 // organization at any point during the 20-round processing. 4522 FloatRegister aSet[4] = { v4, v5, v6, v7 }; 4523 FloatRegister bSet[4]; 4524 FloatRegister cSet[4]; 4525 FloatRegister dSet[4]; 4526 FloatRegister scratch[4] = { v0, v1, v2, v3 }; 4527 4528 // Set up the 10 iteration loop and perform all 8 quarter round ops 4529 __ mov(loopCtr, 10); 4530 __ BIND(L_twoRounds); 4531 4532 // Set to columnar organization and do the following 4 quarter-rounds: 4533 // QUARTERROUND(0, 4, 8, 12) 4534 // QUARTERROUND(1, 5, 9, 13) 4535 // QUARTERROUND(2, 6, 10, 14) 4536 // QUARTERROUND(3, 7, 11, 15) 4537 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7); 4538 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11); 4539 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15); 4540 4541 __ cc20_qr_add4(aSet, bSet); // a += b 4542 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4543 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16 4544 4545 __ cc20_qr_add4(cSet, dSet); // c += d 4546 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4547 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12 4548 4549 __ cc20_qr_add4(aSet, bSet); // a += b 4550 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4551 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8 4552 4553 __ cc20_qr_add4(cSet, dSet); // c += d 4554 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4555 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12 4556 4557 // Set to diagonal organization and do the next 4 quarter-rounds: 4558 // QUARTERROUND(0, 5, 10, 15) 4559 // QUARTERROUND(1, 6, 11, 12) 4560 // QUARTERROUND(2, 7, 8, 13) 4561 // QUARTERROUND(3, 4, 9, 14) 4562 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4); 4563 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9); 4564 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14); 4565 4566 __ cc20_qr_add4(aSet, bSet); // a += b 4567 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4568 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16 4569 4570 __ cc20_qr_add4(cSet, dSet); // c += d 4571 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch) 4572 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12 4573 4574 __ cc20_qr_add4(aSet, bSet); // a += b 4575 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a 4576 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8 4577 4578 __ cc20_qr_add4(cSet, dSet); // c += d 4579 __ cc20_qr_xor4(bSet, 
cSet, scratch); // b ^= c (scratch) 4580 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12 4581 4582 // Decrement and iterate 4583 __ sub(loopCtr, loopCtr, 1); 4584 __ cbnz(loopCtr, L_twoRounds); 4585 4586 __ mov(tmpAddr, state); 4587 4588 // Add the starting state back to the post-loop keystream 4589 // state. We read/interlace the state array from memory into 4590 // 4 registers similar to what we did in the beginning. Then 4591 // add the counter overlay onto workSt[12] at the end. 4592 for (i = 0; i < 16; i += 4) { 4593 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16)); 4594 __ addv(workSt[i], __ T4S, workSt[i], v0); 4595 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1); 4596 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2); 4597 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3); 4598 } 4599 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4600 4601 // Write working state into the keystream buffer. This is accomplished 4602 // by taking the lane "i" from each of the four vectors and writing 4603 // it to consecutive 4-byte offsets, then post-incrementing by 16 and 4604 // repeating with the next 4 vectors until all 16 vectors have been used. 4605 // Then move to the next lane and repeat the process until all lanes have 4606 // been written. 4607 for (i = 0; i < 4; i++) { 4608 for (j = 0; j < 16; j += 4) { 4609 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4610 __ post(keystream, 16)); 4611 } 4612 } 4613 4614 __ mov(r0, 256); // Return length of output keystream 4615 __ leave(); 4616 __ ret(lr); 4617 4618 return start; 4619 } 4620 4621 // Helpers to schedule parallel operation bundles across vector 4622 // register sequences of size 2, 4 or 8. 4623 4624 // Implement various primitive computations across vector sequences 4625 4626 template<int N> 4627 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4628 const VSeq<N>& v1, const VSeq<N>& v2) { 4629 // output must not be constant 4630 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4631 // output cannot overwrite pending inputs 4632 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4633 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4634 for (int i = 0; i < N; i++) { 4635 __ addv(v[i], T, v1[i], v2[i]); 4636 } 4637 } 4638 4639 template<int N> 4640 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4641 const VSeq<N>& v1, const VSeq<N>& v2) { 4642 // output must not be constant 4643 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4644 // output cannot overwrite pending inputs 4645 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4646 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4647 for (int i = 0; i < N; i++) { 4648 __ subv(v[i], T, v1[i], v2[i]); 4649 } 4650 } 4651 4652 template<int N> 4653 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4654 const VSeq<N>& v1, const VSeq<N>& v2) { 4655 // output must not be constant 4656 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4657 // output cannot overwrite pending inputs 4658 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4659 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4660 for (int i = 0; i < N; i++) { 4661 __ mulv(v[i], T, v1[i], v2[i]); 4662 } 4663 } 4664 4665 template<int N> 4666 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& 
v1) { 4667 // output must not be constant 4668 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4669 // output cannot overwrite pending inputs 4670 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4671 for (int i = 0; i < N; i++) { 4672 __ negr(v[i], T, v1[i]); 4673 } 4674 } 4675 4676 template<int N> 4677 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4678 const VSeq<N>& v1, int shift) { 4679 // output must not be constant 4680 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4681 // output cannot overwrite pending inputs 4682 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4683 for (int i = 0; i < N; i++) { 4684 __ sshr(v[i], T, v1[i], shift); 4685 } 4686 } 4687 4688 template<int N> 4689 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4690 // output must not be constant 4691 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4692 // output cannot overwrite pending inputs 4693 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4694 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4695 for (int i = 0; i < N; i++) { 4696 __ andr(v[i], __ T16B, v1[i], v2[i]); 4697 } 4698 } 4699 4700 template<int N> 4701 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4702 // output must not be constant 4703 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4704 // output cannot overwrite pending inputs 4705 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4706 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4707 for (int i = 0; i < N; i++) { 4708 __ orr(v[i], __ T16B, v1[i], v2[i]); 4709 } 4710 } 4711 4712 template<int N> 4713 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) { 4714 // output must not be constant 4715 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4716 // output cannot overwrite pending inputs 4717 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4718 for (int i = 0; i < N; i++) { 4719 __ notr(v[i], __ T16B, v1[i]); 4720 } 4721 } 4722 4723 template<int N> 4724 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) { 4725 // output must not be constant 4726 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4727 // output cannot overwrite pending inputs 4728 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4729 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4730 for (int i = 0; i < N; i++) { 4731 __ sqdmulh(v[i], T, v1[i], v2[i]); 4732 } 4733 } 4734 4735 template<int N> 4736 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) { 4737 // output must not be constant 4738 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4739 // output cannot overwrite pending inputs 4740 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4741 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4742 for (int i = 0; i < N; i++) { 4743 __ mlsv(v[i], T, v1[i], v2[i]); 4744 } 4745 } 4746 4747 // load N/2 successive pairs of quadword values from memory in order 4748 // into N successive vector registers of the sequence via the 4749 // address supplied in base. 
4750 template<int N> 4751 void vs_ldpq(const VSeq<N>& v, Register base) { 4752 for (int i = 0; i < N; i += 2) { 4753 __ ldpq(v[i], v[i+1], Address(base, 32 * i)); 4754 } 4755 } 4756 4757 // load N/2 successive pairs of quadword values from memory in order 4758 // into N vector registers of the sequence via the address supplied 4759 // in base using post-increment addressing 4760 template<int N> 4761 void vs_ldpq_post(const VSeq<N>& v, Register base) { 4762 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4763 for (int i = 0; i < N; i += 2) { 4764 __ ldpq(v[i], v[i+1], __ post(base, 32)); 4765 } 4766 } 4767 4768 // store N successive vector registers of the sequence into N/2 4769 // successive pairs of quadword memory locations via the address 4770 // supplied in base using post-increment addressing 4771 template<int N> 4772 void vs_stpq_post(const VSeq<N>& v, Register base) { 4773 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4774 for (int i = 0; i < N; i += 2) { 4775 __ stpq(v[i], v[i+1], __ post(base, 32)); 4776 } 4777 } 4778 4779 // load N/2 pairs of quadword values from memory de-interleaved into 4780 // N vector registers 2 at a time via the address supplied in base 4781 // using post-increment addressing. 4782 template<int N> 4783 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4784 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4785 for (int i = 0; i < N; i += 2) { 4786 __ ld2(v[i], v[i+1], T, __ post(base, 32)); 4787 } 4788 } 4789 4790 // store N vector registers interleaved into N/2 pairs of quadword 4791 // memory locations via the address supplied in base using 4792 // post-increment addressing. 4793 template<int N> 4794 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4795 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4796 for (int i = 0; i < N; i += 2) { 4797 __ st2(v[i], v[i+1], T, __ post(base, 32)); 4798 } 4799 } 4800 4801 // load N quadword values from memory de-interleaved into N vector 4802 // registers 3 elements at a time via the address supplied in base. 4803 template<int N> 4804 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4805 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4806 for (int i = 0; i < N; i += 3) { 4807 __ ld3(v[i], v[i+1], v[i+2], T, base); 4808 } 4809 } 4810 4811 // load N quadword values from memory de-interleaved into N vector 4812 // registers 3 elements at a time via the address supplied in base 4813 // using post-increment addressing. 
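  // For example (illustrative only), with a VSeq<6> covering v0..v5 this
  // emits two de-interleaving loads, each advancing base by 48 bytes:
  //   ld3 {v0, v1, v2}, [base], #48
  //   ld3 {v3, v4, v5}, [base], #48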
4814 template<int N>
4815 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4816 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4817 for (int i = 0; i < N; i += 3) {
4818 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
4819 }
4820 }
4821
4822 // load N/2 pairs of quadword values from memory into N vector
4823 // registers via the address supplied in base with each pair indexed
4824 // using the start offset plus the corresponding entry in the
4825 // offsets array
4826 template<int N>
4827 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
4828 for (int i = 0; i < N/2; i++) {
4829 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4830 }
4831 }
4832
4833 // store N vector registers into N/2 pairs of quadword memory
4834 // locations via the address supplied in base with each pair indexed
4835 // using the start offset plus the corresponding entry in the
4836 // offsets array
4837 template<int N>
4838 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
4839 for (int i = 0; i < N/2; i++) {
4840 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4841 }
4842 }
4843
4844 // load N single quadword values from memory into N vector registers
4845 // via the address supplied in base with each value indexed using
4846 // the start offset plus the corresponding entry in the offsets
4847 // array
4848 template<int N>
4849 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
4850 int start, int (&offsets)[N]) {
4851 for (int i = 0; i < N; i++) {
4852 __ ldr(v[i], T, Address(base, start + offsets[i]));
4853 }
4854 }
4855
4856 // store N vector registers into N single quadword memory locations
4857 // via the address supplied in base with each value indexed using
4858 // the start offset plus the corresponding entry in the offsets
4859 // array
4860 template<int N>
4861 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
4862 int start, int (&offsets)[N]) {
4863 for (int i = 0; i < N; i++) {
4864 __ str(v[i], T, Address(base, start + offsets[i]));
4865 }
4866 }
4867
4868 // load N/2 pairs of quadword values from memory de-interleaved into
4869 // N vector registers 2 at a time via the address supplied in base
4870 // with each pair indexed using the start offset plus the
4871 // corresponding entry in the offsets array
4872 template<int N>
4873 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
4874 Register tmp, int start, int (&offsets)[N/2]) {
4875 for (int i = 0; i < N/2; i++) {
4876 __ add(tmp, base, start + offsets[i]);
4877 __ ld2(v[2*i], v[2*i+1], T, tmp);
4878 }
4879 }
4880
4881 // store N vector registers 2 at a time interleaved into N/2 pairs
4882 // of quadword memory locations via the address supplied in base
4883 // with each pair indexed using the start offset plus the
4884 // corresponding entry in the offsets array
4885 template<int N>
4886 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
4887 Register tmp, int start, int (&offsets)[N/2]) {
4888 for (int i = 0; i < N/2; i++) {
4889 __ add(tmp, base, start + offsets[i]);
4890 __ st2(v[2*i], v[2*i+1], T, tmp);
4891 }
4892 }
4893
4894 // Helper routines for various flavours of Montgomery multiply
4895
4896 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
4897 // multiplications in parallel
4898 //
4899
4900 // See the montMul() method of the sun.security.provider.ML_DSA
4901 // class.
4902 //
4903 // Computes 4x4S results or 4x8H results
4904 // a = b * c * 2^MONT_R_BITS mod MONT_Q
4905 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
4906 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
4907 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
4908 // Outputs: va - 4x4S or 4x8H vector register sequences
4909 // vb, vc, vtmp and vq must all be disjoint
4910 // va must be disjoint from all other inputs/temps or must equal vc
4911 // va must have a non-zero delta i.e. it must not be a constant vseq.
4912 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
4913 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
4914 Assembler::SIMD_Arrangement T,
4915 const VSeq<4>& vtmp, const VSeq<2>& vq) {
4916 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
4917 assert(vs_disjoint(vb, vc), "vb and vc overlap");
4918 assert(vs_disjoint(vb, vq), "vb and vq overlap");
4919 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
4920
4921 assert(vs_disjoint(vc, vq), "vc and vq overlap");
4922 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
4923
4924 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
4925
4926 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
4927 assert(vs_disjoint(va, vb), "va and vb overlap");
4928 assert(vs_disjoint(va, vq), "va and vq overlap");
4929 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
4930 assert(!va.is_constant(), "output vector must identify 4 different registers");
4931
4932 // schedule 4 streams of instructions across the vector sequences
4933 for (int i = 0; i < 4; i++) {
4934 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
4935 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
4936 }
4937
4938 for (int i = 0; i < 4; i++) {
4939 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
4940 }
4941
4942 for (int i = 0; i < 4; i++) {
4943 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
4944 }
4945
4946 for (int i = 0; i < 4; i++) {
4947 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
4948 }
4949 }
4950
4951 // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
4952 // multiplications in parallel
4953 //
4954
4955 // See the montMul() method of the sun.security.provider.ML_DSA
4956 // class.
4957 //
4958 // Computes 2x4S results or 2x8H results
4959 // a = b * c * 2^MONT_R_BITS mod MONT_Q
4960 // Inputs: vb, vc - 2x4S or 2x8H vector register sequences
4961 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
4962 // Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
4963 // Outputs: va - 2x4S or 2x8H vector register sequences
4964 // vb, vc, vtmp and vq must all be disjoint
4965 // va must be disjoint from all other inputs/temps or must equal vc
4966 // va must have a non-zero delta i.e. it must not be a constant vseq.
4967 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
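// As an illustrative scalar sketch (not generated code), the per-lane
// computation these montmul helpers schedule is, for 16-bit lanes with
// MONT_R_BITS == 16:
//
//   int16_t montmul_lane(int16_t b, int16_t c, int16_t q, int16_t qinv) {
//     int16_t aHigh = (int16_t)(((int64_t)2 * b * c) >> 16);  // sqdmulh (saturating in the vector code)
//     int16_t aLow  = (int16_t)(b * c);                       // mulv
//     int16_t m     = (int16_t)(aLow * qinv);                 // mulv by qinv
//     int16_t n     = (int16_t)(((int64_t)2 * m * q) >> 16);  // sqdmulh
//     return (int16_t)((aHigh - n) >> 1);                     // shsubv
//   }
//
// The name montmul_lane and the scalar types are illustrative only; the
// 32-bit (4S) case is analogous with 32/64-bit types and a shift by 32.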
4968 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 4969 Assembler::SIMD_Arrangement T, 4970 const VSeq<2>& vtmp, const VSeq<2>& vq) { 4971 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul"); 4972 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 4973 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 4974 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 4975 4976 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 4977 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 4978 4979 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 4980 4981 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 4982 assert(vs_disjoint(va, vb), "va and vb overlap"); 4983 assert(vs_disjoint(va, vq), "va and vq overlap"); 4984 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 4985 assert(!va.is_constant(), "output vector must identify 2 different registers"); 4986 4987 // schedule 2 streams of instructions across the vector sequences 4988 for (int i = 0; i < 2; i++) { 4989 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c) 4990 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c) 4991 } 4992 4993 for (int i = 0; i < 2; i++) { 4994 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv 4995 } 4996 4997 for (int i = 0; i < 2; i++) { 4998 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q) 4999 } 5000 5001 for (int i = 0; i < 2; i++) { 5002 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2 5003 } 5004 } 5005 5006 // Perform 16 16-bit Montgomery multiplications in parallel. 5007 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 5008 const VSeq<2>& vtmp, const VSeq<2>& vq) { 5009 // Use the helper routine to schedule a 2x8H Montgomery multiply. 5010 // It will assert that the register use is valid 5011 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq); 5012 } 5013 5014 // Perform 32 16-bit Montgomery multiplications in parallel. 5015 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 5016 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5017 // Use the helper routine to schedule a 4x8H Montgomery multiply. 5018 // It will assert that the register use is valid 5019 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq); 5020 } 5021 5022 // Perform 64 16-bit Montgomery multiplications in parallel. 5023 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 5024 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5025 // Schedule two successive 4x8H multiplies via the montmul helper 5026 // on the front and back halves of va, vb and vc. The helper will 5027 // assert that the register use has no overlap conflicts on each 5028 // individual call but we also need to ensure that the necessary 5029 // disjoint/equality constraints are met across both calls. 5030 5031 // vb, vc, vtmp and vq must be disjoint. 
va must either be 5032 // disjoint from all other registers or equal vc 5033 5034 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5035 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5036 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5037 5038 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5039 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5040 5041 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5042 5043 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5044 assert(vs_disjoint(va, vb), "va and vb overlap"); 5045 assert(vs_disjoint(va, vq), "va and vq overlap"); 5046 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5047 5048 // we multiply the front and back halves of each sequence 4 at a 5049 // time because 5050 // 5051 // 1) we are currently only able to get 4-way instruction 5052 // parallelism at best 5053 // 5054 // 2) we need registers for the constants in vq and temporary 5055 // scratch registers to hold intermediate results so vtmp can only 5056 // be a VSeq<4> which means we only have 4 scratch slots 5057 5058 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq); 5059 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq); 5060 } 5061 5062 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1, 5063 const VSeq<4>& vc, 5064 const VSeq<4>& vtmp, 5065 const VSeq<2>& vq) { 5066 // compute a = montmul(a1, c) 5067 kyber_montmul32(vc, va1, vc, vtmp, vq); 5068 // ouptut a1 = a0 - a 5069 vs_subv(va1, __ T8H, va0, vc); 5070 // and a0 = a0 + a 5071 vs_addv(va0, __ T8H, va0, vc); 5072 } 5073 5074 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1, 5075 const VSeq<4>& vb, 5076 const VSeq<4>& vtmp1, 5077 const VSeq<4>& vtmp2, 5078 const VSeq<2>& vq) { 5079 // compute c = a0 - a1 5080 vs_subv(vtmp1, __ T8H, va0, va1); 5081 // output a0 = a0 + a1 5082 vs_addv(va0, __ T8H, va0, va1); 5083 // output a1 = b montmul c 5084 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq); 5085 } 5086 5087 void load64shorts(const VSeq<8>& v, Register shorts) { 5088 vs_ldpq_post(v, shorts); 5089 } 5090 5091 void load32shorts(const VSeq<4>& v, Register shorts) { 5092 vs_ldpq_post(v, shorts); 5093 } 5094 5095 void store64shorts(VSeq<8> v, Register tmpAddr) { 5096 vs_stpq_post(v, tmpAddr); 5097 } 5098 5099 // Kyber NTT function. 5100 // Implements 5101 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {} 5102 // 5103 // coeffs (short[256]) = c_rarg0 5104 // ntt_zetas (short[256]) = c_rarg1 5105 address generate_kyberNtt() { 5106 5107 __ align(CodeEntryAlignment); 5108 StubGenStubId stub_id = StubGenStubId::kyberNtt_id; 5109 StubCodeMark mark(this, stub_id); 5110 address start = __ pc(); 5111 __ enter(); 5112 5113 const Register coeffs = c_rarg0; 5114 const Register zetas = c_rarg1; 5115 5116 const Register kyberConsts = r10; 5117 const Register tmpAddr = r11; 5118 5119 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5120 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5121 VSeq<2> vq(30); // n.b. constants overlap vs3 5122 5123 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5124 // load the montmul constants 5125 vs_ldpq(vq, kyberConsts); 5126 5127 // Each level corresponds to an iteration of the outermost loop of the 5128 // Java method seilerNTT(int[] coeffs). There are some differences 5129 // from what is done in the seilerNTT() method, though: 5130 // 1. 
The computation uses 16-bit signed values; we do not convert them
5131 // to ints here.
5132 // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
5133 // this array for each level, which makes it easier to fill up the vector
5134 // registers.
5135 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
5136 // multiplications (this is because that way there should not be any
5137 // overflow during the inverse NTT computation), here we use R = 2^16 so
5138 // that we can use the 16-bit arithmetic in the vector unit.
5139 //
5140 // On each level, we fill up the vector registers in such a way that the
5141 // array elements that need to be multiplied by the zetas go into one
5142 // set of vector registers while the corresponding ones that don't need to
5143 // be multiplied go into another set.
5144 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5145 // registers interleaving the steps of 4 identical computations,
5146 // each done on 8 16-bit values per register.
5147
5148 // At levels 0-3 the coefficients multiplied by or added/subtracted
5149 // to the zetas occur in discrete blocks whose size is some multiple
5150 // of 32.
5151
5152 // level 0
5153 __ add(tmpAddr, coeffs, 256);
5154 load64shorts(vs1, tmpAddr);
5155 load64shorts(vs2, zetas);
5156 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5157 __ add(tmpAddr, coeffs, 0);
5158 load64shorts(vs1, tmpAddr);
5159 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5160 vs_addv(vs1, __ T8H, vs1, vs2);
5161 __ add(tmpAddr, coeffs, 0);
5162 vs_stpq_post(vs1, tmpAddr);
5163 __ add(tmpAddr, coeffs, 256);
5164 vs_stpq_post(vs3, tmpAddr);
5165 // restore montmul constants
5166 vs_ldpq(vq, kyberConsts);
5167 load64shorts(vs1, tmpAddr);
5168 load64shorts(vs2, zetas);
5169 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5170 __ add(tmpAddr, coeffs, 128);
5171 load64shorts(vs1, tmpAddr);
5172 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5173 vs_addv(vs1, __ T8H, vs1, vs2);
5174 __ add(tmpAddr, coeffs, 128);
5175 store64shorts(vs1, tmpAddr);
5176 __ add(tmpAddr, coeffs, 384);
5177 store64shorts(vs3, tmpAddr);
5178
5179 // level 1
5180 // restore montmul constants
5181 vs_ldpq(vq, kyberConsts);
5182 __ add(tmpAddr, coeffs, 128);
5183 load64shorts(vs1, tmpAddr);
5184 load64shorts(vs2, zetas);
5185 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5186 __ add(tmpAddr, coeffs, 0);
5187 load64shorts(vs1, tmpAddr);
5188 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5189 vs_addv(vs1, __ T8H, vs1, vs2);
5190 __ add(tmpAddr, coeffs, 0);
5191 store64shorts(vs1, tmpAddr);
5192 store64shorts(vs3, tmpAddr);
5193 vs_ldpq(vq, kyberConsts);
5194 __ add(tmpAddr, coeffs, 384);
5195 load64shorts(vs1, tmpAddr);
5196 load64shorts(vs2, zetas);
5197 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5198 __ add(tmpAddr, coeffs, 256);
5199 load64shorts(vs1, tmpAddr);
5200 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5201 vs_addv(vs1, __ T8H, vs1, vs2);
5202 __ add(tmpAddr, coeffs, 256);
5203 store64shorts(vs1, tmpAddr);
5204 store64shorts(vs3, tmpAddr);
5205
5206 // level 2
5207 vs_ldpq(vq, kyberConsts);
5208 int offsets1[4] = { 0, 32, 128, 160 };
5209 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5210 load64shorts(vs2, zetas);
5211 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5212 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5213 // kyber_subv_addv64();
5214 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5215 vs_addv(vs1, __ T8H, vs1, vs2);
5216 __ add(tmpAddr, coeffs, 0);
5217 vs_stpq_post(vs_front(vs1), tmpAddr);
5218 vs_stpq_post(vs_front(vs3), tmpAddr);
5219 vs_stpq_post(vs_back(vs1), tmpAddr);
5220 vs_stpq_post(vs_back(vs3), tmpAddr);
5221 vs_ldpq(vq, kyberConsts);
5222 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5223 load64shorts(vs2, zetas);
5224 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5225 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5226 // kyber_subv_addv64();
5227 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5228 vs_addv(vs1, __ T8H, vs1, vs2);
5229 __ add(tmpAddr, coeffs, 256);
5230 vs_stpq_post(vs_front(vs1), tmpAddr);
5231 vs_stpq_post(vs_front(vs3), tmpAddr);
5232 vs_stpq_post(vs_back(vs1), tmpAddr);
5233 vs_stpq_post(vs_back(vs3), tmpAddr);
5234
5235 // level 3
5236 vs_ldpq(vq, kyberConsts);
5237 int offsets2[4] = { 0, 64, 128, 192 };
5238 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5239 load64shorts(vs2, zetas);
5240 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5241 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5242 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5243 vs_addv(vs1, __ T8H, vs1, vs2);
5244 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5245 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5246
5247 vs_ldpq(vq, kyberConsts);
5248 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5249 load64shorts(vs2, zetas);
5250 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5251 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5252 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5253 vs_addv(vs1, __ T8H, vs1, vs2);
5254 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5255 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5256
5257 // level 4
5258 // At level 4 coefficients occur in 8 discrete blocks of size 16
5259 // so they are loaded using an ldr at 8 distinct offsets.
5260
5261 vs_ldpq(vq, kyberConsts);
5262 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5263 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
5264 load64shorts(vs2, zetas);
5265 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5266 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5267 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5268 vs_addv(vs1, __ T8H, vs1, vs2);
5269 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5270 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
5271
5272 vs_ldpq(vq, kyberConsts);
5273 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
5274 load64shorts(vs2, zetas);
5275 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5276 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5277 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5278 vs_addv(vs1, __ T8H, vs1, vs2);
5279 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5280 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
5281
5282 // level 5
5283 // At level 5 related coefficients occur in discrete blocks of size 8, so
5284 // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
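// The montmul+sub+add helpers used from level 5 onwards implement, per
// coefficient, the butterfly below (an illustrative scalar sketch using
// the j/j+l naming of the Java loop):
//
//   t             = montmul(coeffs[j + l], zeta);
//   coeffs[j + l] = (short)(coeffs[j] - t);
//   coeffs[j]     = (short)(coeffs[j] + t);
//
// At the call sites below the even registers of vs1 receive the sums and
// the odd registers receive the differences.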
5285 5286 vs_ldpq(vq, kyberConsts); 5287 int offsets4[4] = { 0, 32, 64, 96 }; 5288 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5289 load32shorts(vs_front(vs2), zetas); 5290 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5291 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5292 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5293 load32shorts(vs_front(vs2), zetas); 5294 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5295 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5296 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5297 load32shorts(vs_front(vs2), zetas); 5298 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5299 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5300 5301 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5302 load32shorts(vs_front(vs2), zetas); 5303 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5304 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5305 5306 // level 6 5307 // At level 6 related coefficients occur in discrete blocks of size 4 so 5308 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5309 5310 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5311 load32shorts(vs_front(vs2), zetas); 5312 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5313 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5314 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5315 // __ ldpq(v18, v19, __ post(zetas, 32)); 5316 load32shorts(vs_front(vs2), zetas); 5317 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5318 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5319 5320 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5321 load32shorts(vs_front(vs2), zetas); 5322 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5323 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5324 5325 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5326 load32shorts(vs_front(vs2), zetas); 5327 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5328 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5329 5330 __ leave(); // required for proper stackwalking of RuntimeStub frame 5331 __ mov(r0, zr); // return 0 5332 __ ret(lr); 5333 5334 return start; 5335 } 5336 5337 // Kyber Inverse NTT function 5338 // Implements 5339 // static int implKyberInverseNtt(short[] poly, short[] zetas) {} 5340 // 5341 // coeffs (short[256]) = c_rarg0 5342 // ntt_zetas (short[256]) = c_rarg1 5343 address generate_kyberInverseNtt() { 5344 5345 __ align(CodeEntryAlignment); 5346 StubGenStubId stub_id = StubGenStubId::kyberInverseNtt_id; 5347 StubCodeMark mark(this, stub_id); 5348 address start = __ pc(); 5349 __ enter(); 5350 5351 const Register coeffs = c_rarg0; 5352 const Register zetas = c_rarg1; 5353 5354 const Register kyberConsts = r10; 5355 const Register tmpAddr = r11; 5356 const Register tmpAddr2 = c_rarg2; 5357 5358 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5359 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5360 VSeq<2> vq(30); // n.b. 
constants overlap vs3 5361 5362 __ lea(kyberConsts, 5363 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5364 5365 // level 0 5366 // At level 0 related coefficients occur in discrete blocks of size 4 so 5367 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5368 5369 vs_ldpq(vq, kyberConsts); 5370 int offsets4[4] = { 0, 32, 64, 96 }; 5371 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5372 load32shorts(vs_front(vs2), zetas); 5373 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5374 vs_front(vs2), vs_back(vs2), vtmp, vq); 5375 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5376 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5377 load32shorts(vs_front(vs2), zetas); 5378 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5379 vs_front(vs2), vs_back(vs2), vtmp, vq); 5380 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5381 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5382 load32shorts(vs_front(vs2), zetas); 5383 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5384 vs_front(vs2), vs_back(vs2), vtmp, vq); 5385 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5386 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5387 load32shorts(vs_front(vs2), zetas); 5388 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5389 vs_front(vs2), vs_back(vs2), vtmp, vq); 5390 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5391 5392 // level 1 5393 // At level 1 related coefficients occur in discrete blocks of size 8 so 5394 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 5395 5396 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5397 load32shorts(vs_front(vs2), zetas); 5398 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5399 vs_front(vs2), vs_back(vs2), vtmp, vq); 5400 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5401 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5402 load32shorts(vs_front(vs2), zetas); 5403 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5404 vs_front(vs2), vs_back(vs2), vtmp, vq); 5405 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5406 5407 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5408 load32shorts(vs_front(vs2), zetas); 5409 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5410 vs_front(vs2), vs_back(vs2), vtmp, vq); 5411 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5412 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5413 load32shorts(vs_front(vs2), zetas); 5414 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5415 vs_front(vs2), vs_back(vs2), vtmp, vq); 5416 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5417 5418 // level 2 5419 // At level 2 coefficients occur in 8 discrete blocks of size 16 5420 // so they are loaded using employing an ldr at 8 distinct offsets. 5421 5422 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5423 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5424 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3); 5425 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5426 vs_subv(vs1, __ T8H, vs1, vs2); 5427 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3); 5428 load64shorts(vs2, zetas); 5429 vs_ldpq(vq, kyberConsts); 5430 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5431 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3); 5432 5433 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5434 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5435 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5436 vs_subv(vs1, __ T8H, vs1, vs2); 5437 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3); 5438 load64shorts(vs2, zetas); 5439 vs_ldpq(vq, kyberConsts); 5440 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5441 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5442 5443 // Barrett reduction at indexes where overflow may happen 5444 5445 // load q and the multiplier for the Barrett reduction 5446 __ add(tmpAddr, kyberConsts, 16); 5447 vs_ldpq(vq, tmpAddr); 5448 5449 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences 5450 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants 5451 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul 5452 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5453 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5454 vs_sshr(vs2, __ T8H, vs2, 11); 5455 vs_mlsv(vs1, __ T8H, vs2, vq1); 5456 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5457 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5458 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5459 vs_sshr(vs2, __ T8H, vs2, 11); 5460 vs_mlsv(vs1, __ T8H, vs2, vq1); 5461 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5462 5463 // level 3 5464 // From level 3 upwards coefficients occur in discrete blocks whose size is 5465 // some multiple of 32 so can be loaded using ldpq and suitable indexes. 5466 5467 int offsets2[4] = { 0, 64, 128, 192 }; 5468 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5469 vs_ldpq_indexed(vs2, coeffs, 32, offsets2); 5470 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5471 vs_subv(vs1, __ T8H, vs1, vs2); 5472 vs_stpq_indexed(vs3, coeffs, 0, offsets2); 5473 load64shorts(vs2, zetas); 5474 vs_ldpq(vq, kyberConsts); 5475 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5476 vs_stpq_indexed(vs2, coeffs, 32, offsets2); 5477 5478 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5479 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5480 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5481 vs_subv(vs1, __ T8H, vs1, vs2); 5482 vs_stpq_indexed(vs3, coeffs, 256, offsets2); 5483 load64shorts(vs2, zetas); 5484 vs_ldpq(vq, kyberConsts); 5485 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5486 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5487 5488 // level 4 5489 5490 int offsets1[4] = { 0, 32, 128, 160 }; 5491 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5492 vs_ldpq_indexed(vs2, coeffs, 64, offsets1); 5493 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5494 vs_subv(vs1, __ T8H, vs1, vs2); 5495 vs_stpq_indexed(vs3, coeffs, 0, offsets1); 5496 load64shorts(vs2, zetas); 5497 vs_ldpq(vq, kyberConsts); 5498 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5499 vs_stpq_indexed(vs2, coeffs, 64, offsets1); 5500 5501 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5502 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5503 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5504 vs_subv(vs1, __ T8H, vs1, vs2); 5505 vs_stpq_indexed(vs3, coeffs, 256, offsets1); 5506 load64shorts(vs2, zetas); 5507 vs_ldpq(vq, kyberConsts); 5508 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5509 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5510 5511 // level 5 5512 5513 __ add(tmpAddr, coeffs, 0); 5514 load64shorts(vs1, tmpAddr); 5515 __ add(tmpAddr, coeffs, 128); 5516 load64shorts(vs2, tmpAddr); 5517 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5518 vs_subv(vs1, __ T8H, vs1, vs2); 5519 __ add(tmpAddr, coeffs, 0); 5520 store64shorts(vs3, tmpAddr); 5521 load64shorts(vs2, zetas); 5522 vs_ldpq(vq, kyberConsts); 5523 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5524 __ add(tmpAddr, coeffs, 128); 5525 store64shorts(vs2, tmpAddr); 5526 5527 load64shorts(vs1, tmpAddr); 5528 __ add(tmpAddr, coeffs, 384); 5529 load64shorts(vs2, tmpAddr); 5530 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5531 vs_subv(vs1, __ T8H, vs1, vs2); 5532 __ add(tmpAddr, coeffs, 256); 5533 store64shorts(vs3, tmpAddr); 5534 load64shorts(vs2, zetas); 5535 vs_ldpq(vq, kyberConsts); 5536 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5537 __ add(tmpAddr, coeffs, 384); 5538 store64shorts(vs2, tmpAddr); 5539 5540 // Barrett reduction at indexes where overflow may happen 5541 5542 // load q and the multiplier for the Barrett reduction 5543 __ add(tmpAddr, kyberConsts, 16); 5544 vs_ldpq(vq, tmpAddr); 5545 5546 int offsets0[2] = { 0, 256 }; 5547 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5548 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5549 vs_sshr(vs2, __ T8H, vs2, 11); 5550 vs_mlsv(vs1, __ T8H, vs2, vq1); 5551 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5552 5553 // level 6 5554 5555 __ add(tmpAddr, coeffs, 0); 5556 load64shorts(vs1, tmpAddr); 5557 __ add(tmpAddr, coeffs, 256); 5558 load64shorts(vs2, tmpAddr); 5559 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5560 vs_subv(vs1, __ T8H, vs1, vs2); 5561 __ add(tmpAddr, coeffs, 0); 5562 store64shorts(vs3, tmpAddr); 5563 load64shorts(vs2, zetas); 5564 vs_ldpq(vq, kyberConsts); 5565 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5566 __ add(tmpAddr, coeffs, 256); 5567 store64shorts(vs2, tmpAddr); 5568 5569 __ add(tmpAddr, coeffs, 128); 5570 load64shorts(vs1, tmpAddr); 5571 __ add(tmpAddr, coeffs, 384); 5572 load64shorts(vs2, tmpAddr); 5573 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5574 vs_subv(vs1, __ T8H, vs1, vs2); 5575 __ add(tmpAddr, coeffs, 128); 5576 store64shorts(vs3, tmpAddr); 5577 load64shorts(vs2, zetas); 5578 vs_ldpq(vq, kyberConsts); 5579 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5580 __ add(tmpAddr, coeffs, 384); 5581 store64shorts(vs2, tmpAddr); 5582 5583 // multiply by 2^-n 5584 5585 // load toMont(2^-n mod q) 5586 __ add(tmpAddr, kyberConsts, 48); 5587 __ ldr(v29, __ Q, tmpAddr); 5588 5589 vs_ldpq(vq, kyberConsts); 5590 __ add(tmpAddr, coeffs, 0); 5591 load64shorts(vs1, tmpAddr); 5592 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5593 __ add(tmpAddr, coeffs, 0); 5594 store64shorts(vs2, tmpAddr); 5595 5596 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so 5597 load64shorts(vs1, tmpAddr); 5598 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5599 __ add(tmpAddr, coeffs, 128); 5600 store64shorts(vs2, tmpAddr); 5601 5602 // now tmpAddr contains coeffs + 256 5603 load64shorts(vs1, tmpAddr); 5604 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5605 __ add(tmpAddr, coeffs, 256); 5606 store64shorts(vs2, tmpAddr); 5607 5608 // now tmpAddr contains coeffs + 384 5609 load64shorts(vs1, tmpAddr); 5610 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5611 __ add(tmpAddr, coeffs, 384); 5612 store64shorts(vs2, tmpAddr); 5613 5614 __ leave(); // required for proper stackwalking of RuntimeStub frame 5615 __ mov(r0, zr); // return 0 5616 __ ret(lr); 5617 5618 return start; 5619 } 5620 5621 // Kyber multiply polynomials in the NTT domain. 5622 // Implements 5623 // static int implKyberNttMult( 5624 // short[] result, short[] ntta, short[] nttb, short[] zetas) {} 5625 // 5626 // result (short[256]) = c_rarg0 5627 // ntta (short[256]) = c_rarg1 5628 // nttb (short[256]) = c_rarg2 5629 // zetas (short[128]) = c_rarg3 5630 address generate_kyberNttMult() { 5631 5632 __ align(CodeEntryAlignment); 5633 StubGenStubId stub_id = StubGenStubId::kyberNttMult_id; 5634 StubCodeMark mark(this, stub_id); 5635 address start = __ pc(); 5636 __ enter(); 5637 5638 const Register result = c_rarg0; 5639 const Register ntta = c_rarg1; 5640 const Register nttb = c_rarg2; 5641 const Register zetas = c_rarg3; 5642 5643 const Register kyberConsts = r10; 5644 const Register limit = r11; 5645 5646 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps 5647 VSeq<4> vs3(16), vs4(20); 5648 VSeq<2> vq(30); // pair of constants for montmul: q, qinv 5649 VSeq<2> vz(28); // pair of zetas 5650 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ 5651 5652 __ lea(kyberConsts, 5653 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5654 5655 Label kyberNttMult_loop; 5656 5657 __ add(limit, result, 512); 5658 5659 // load q and qinv 5660 vs_ldpq(vq, kyberConsts); 5661 5662 // load R^2 mod q (to convert back from Montgomery representation) 5663 __ add(kyberConsts, kyberConsts, 64); 5664 __ ldr(v27, __ Q, kyberConsts); 5665 5666 __ BIND(kyberNttMult_loop); 5667 5668 // load 16 zetas 5669 vs_ldpq_post(vz, zetas); 5670 5671 // load 2 sets of 32 coefficients from the two input arrays 5672 // interleaved as shorts. i.e. pairs of shorts adjacent in memory 5673 // are striped across pairs of vector registers 5674 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H 5675 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H 5676 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H 5677 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H 5678 5679 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1) 5680 // i.e. 
montmul the first and second halves of vs1 in order and 5681 // then with one sequence reversed storing the two results in vs3 5682 // 5683 // vs3[0] <- montmul(a0, b0) 5684 // vs3[1] <- montmul(a1, b1) 5685 // vs3[2] <- montmul(a0, b1) 5686 // vs3[3] <- montmul(a1, b0) 5687 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq); 5688 kyber_montmul16(vs_back(vs3), 5689 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq); 5690 5691 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3) 5692 // i.e. montmul the first and second halves of vs4 in order and 5693 // then with one sequence reversed storing the two results in vs1 5694 // 5695 // vs1[0] <- montmul(a2, b2) 5696 // vs1[1] <- montmul(a3, b3) 5697 // vs1[2] <- montmul(a2, b3) 5698 // vs1[3] <- montmul(a3, b2) 5699 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq); 5700 kyber_montmul16(vs_back(vs1), 5701 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq); 5702 5703 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta. 5704 // We can schedule two montmuls at a time if we use a suitable vector 5705 // sequence <vs3[1], vs1[1]>. 5706 int delta = vs1[1]->encoding() - vs3[1]->encoding(); 5707 VSeq<2> vs5(vs3[1], delta); 5708 5709 // vs3[1] <- montmul(montmul(a1, b1), z0) 5710 // vs1[1] <- montmul(montmul(a3, b3), z1) 5711 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq); 5712 5713 // add results in pairs storing in vs3 5714 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0); 5715 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0); 5716 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3)); 5717 5718 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1); 5719 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2); 5720 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1)); 5721 5722 // vs1 <- montmul(vs3, montRSquareModQ) 5723 kyber_montmul32(vs1, vs3, vc, vs2, vq); 5724 5725 // store back the two pairs of result vectors de-interleaved as 8H elements 5726 // i.e. storing each pairs of shorts striped across a register pair adjacent 5727 // in memory 5728 vs_st2_post(vs1, __ T8H, result); 5729 5730 __ cmp(result, limit); 5731 __ br(Assembler::NE, kyberNttMult_loop); 5732 5733 __ leave(); // required for proper stackwalking of RuntimeStub frame 5734 __ mov(r0, zr); // return 0 5735 __ ret(lr); 5736 5737 return start; 5738 } 5739 5740 // Kyber add 2 polynomials. 5741 // Implements 5742 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {} 5743 // 5744 // result (short[256]) = c_rarg0 5745 // a (short[256]) = c_rarg1 5746 // b (short[256]) = c_rarg2 5747 address generate_kyberAddPoly_2() { 5748 5749 __ align(CodeEntryAlignment); 5750 StubGenStubId stub_id = StubGenStubId::kyberAddPoly_2_id; 5751 StubCodeMark mark(this, stub_id); 5752 address start = __ pc(); 5753 __ enter(); 5754 5755 const Register result = c_rarg0; 5756 const Register a = c_rarg1; 5757 const Register b = c_rarg2; 5758 5759 const Register kyberConsts = r11; 5760 5761 // We sum 256 sets of values in total i.e. 32 x 8H quadwords. 5762 // So, we can load, add and store the data in 3 groups of 11, 5763 // 11 and 10 at a time i.e. we need to map sets of 10 or 11 5764 // registers. A further constraint is that the mapping needs 5765 // to skip callee saves. So, we allocate the register 5766 // sequences using two 8 sequences, two 2 sequences and two 5767 // single registers. 
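// As a scalar sketch (illustrative only), each 16-bit lane handled by the
// loop below computes
//
//   result[i] = (short)(a[i] + b[i] + q);
//
// where q is the constant loaded into v31 from the kyberConsts table; the
// loop processes 88, 88 and then 80 lanes per iteration.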
5768 VSeq<8> vs1_1(0); 5769 VSeq<2> vs1_2(16); 5770 FloatRegister vs1_3 = v28; 5771 VSeq<8> vs2_1(18); 5772 VSeq<2> vs2_2(26); 5773 FloatRegister vs2_3 = v29; 5774 5775 // two constant vector sequences 5776 VSeq<8> vc_1(31, 0); 5777 VSeq<2> vc_2(31, 0); 5778 5779 FloatRegister vc_3 = v31; 5780 __ lea(kyberConsts, 5781 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5782 5783 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5784 for (int i = 0; i < 3; i++) { 5785 // load 80 or 88 values from a into vs1_1/2/3 5786 vs_ldpq_post(vs1_1, a); 5787 vs_ldpq_post(vs1_2, a); 5788 if (i < 2) { 5789 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5790 } 5791 // load 80 or 88 values from b into vs2_1/2/3 5792 vs_ldpq_post(vs2_1, b); 5793 vs_ldpq_post(vs2_2, b); 5794 if (i < 2) { 5795 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5796 } 5797 // sum 80 or 88 values across vs1 and vs2 into vs1 5798 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5799 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5800 if (i < 2) { 5801 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5802 } 5803 // add constant to all 80 or 88 results 5804 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 5805 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 5806 if (i < 2) { 5807 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 5808 } 5809 // store 80 or 88 values 5810 vs_stpq_post(vs1_1, result); 5811 vs_stpq_post(vs1_2, result); 5812 if (i < 2) { 5813 __ str(vs1_3, __ Q, __ post(result, 16)); 5814 } 5815 } 5816 5817 __ leave(); // required for proper stackwalking of RuntimeStub frame 5818 __ mov(r0, zr); // return 0 5819 __ ret(lr); 5820 5821 return start; 5822 } 5823 5824 // Kyber add 3 polynomials. 5825 // Implements 5826 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {} 5827 // 5828 // result (short[256]) = c_rarg0 5829 // a (short[256]) = c_rarg1 5830 // b (short[256]) = c_rarg2 5831 // c (short[256]) = c_rarg3 5832 address generate_kyberAddPoly_3() { 5833 5834 __ align(CodeEntryAlignment); 5835 StubGenStubId stub_id = StubGenStubId::kyberAddPoly_3_id; 5836 StubCodeMark mark(this, stub_id); 5837 address start = __ pc(); 5838 __ enter(); 5839 5840 const Register result = c_rarg0; 5841 const Register a = c_rarg1; 5842 const Register b = c_rarg2; 5843 const Register c = c_rarg3; 5844 5845 const Register kyberConsts = r11; 5846 5847 // As above we sum 256 sets of values in total i.e. 32 x 8H 5848 // quadwords. So, we can load, add and store the data in 3 5849 // groups of 11, 11 and 10 at a time i.e. we need to map sets 5850 // of 10 or 11 registers. A further constraint is that the 5851 // mapping needs to skip callee saves. So, we allocate the 5852 // register sequences using two 8 sequences, two 2 sequences 5853 // and two single registers. 
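// As with kyberAddPoly_2 above, a scalar sketch of the loop below is, per
// 16-bit lane,
//
//   result[i] = (short)(a[i] + b[i] + c[i] + q);
//
// with q the constant replicated in v31 (the sketch is illustrative only).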
5854 VSeq<8> vs1_1(0); 5855 VSeq<2> vs1_2(16); 5856 FloatRegister vs1_3 = v28; 5857 VSeq<8> vs2_1(18); 5858 VSeq<2> vs2_2(26); 5859 FloatRegister vs2_3 = v29; 5860 5861 // two constant vector sequences 5862 VSeq<8> vc_1(31, 0); 5863 VSeq<2> vc_2(31, 0); 5864 5865 FloatRegister vc_3 = v31; 5866 5867 __ lea(kyberConsts, 5868 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5869 5870 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5871 for (int i = 0; i < 3; i++) { 5872 // load 80 or 88 values from a into vs1_1/2/3 5873 vs_ldpq_post(vs1_1, a); 5874 vs_ldpq_post(vs1_2, a); 5875 if (i < 2) { 5876 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5877 } 5878 // load 80 or 88 values from b into vs2_1/2/3 5879 vs_ldpq_post(vs2_1, b); 5880 vs_ldpq_post(vs2_2, b); 5881 if (i < 2) { 5882 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5883 } 5884 // sum 80 or 88 values across vs1 and vs2 into vs1 5885 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5886 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5887 if (i < 2) { 5888 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5889 } 5890 // load 80 or 88 values from c into vs2_1/2/3 5891 vs_ldpq_post(vs2_1, c); 5892 vs_ldpq_post(vs2_2, c); 5893 if (i < 2) { 5894 __ ldr(vs2_3, __ Q, __ post(c, 16)); 5895 } 5896 // sum 80 or 88 values across vs1 and vs2 into vs1 5897 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5898 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5899 if (i < 2) { 5900 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5901 } 5902 // add constant to all 80 or 88 results 5903 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 5904 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 5905 if (i < 2) { 5906 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 5907 } 5908 // store 80 or 88 values 5909 vs_stpq_post(vs1_1, result); 5910 vs_stpq_post(vs1_2, result); 5911 if (i < 2) { 5912 __ str(vs1_3, __ Q, __ post(result, 16)); 5913 } 5914 } 5915 5916 __ leave(); // required for proper stackwalking of RuntimeStub frame 5917 __ mov(r0, zr); // return 0 5918 __ ret(lr); 5919 5920 return start; 5921 } 5922 5923 // Kyber parse XOF output to polynomial coefficient candidates 5924 // or decodePoly(12, ...). 5925 // Implements 5926 // static int implKyber12To16( 5927 // byte[] condensed, int index, short[] parsed, int parsedLength) {} 5928 // 5929 // (parsedLength or (parsedLength - 48) must be divisible by 64.) 5930 // 5931 // condensed (byte[]) = c_rarg0 5932 // condensedIndex = c_rarg1 5933 // parsed (short[112 or 256]) = c_rarg2 5934 // parsedLength (112 or 256) = c_rarg3 5935 address generate_kyber12To16() { 5936 Label L_F00, L_loop, L_end; 5937 5938 __ BIND(L_F00); 5939 __ emit_int64(0x0f000f000f000f00); 5940 __ emit_int64(0x0f000f000f000f00); 5941 5942 __ align(CodeEntryAlignment); 5943 StubGenStubId stub_id = StubGenStubId::kyber12To16_id; 5944 StubCodeMark mark(this, stub_id); 5945 address start = __ pc(); 5946 __ enter(); 5947 5948 const Register condensed = c_rarg0; 5949 const Register condensedOffs = c_rarg1; 5950 const Register parsed = c_rarg2; 5951 const Register parsedLength = c_rarg3; 5952 5953 const Register tmpAddr = r11; 5954 5955 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B 5956 // quadwords so we need a 6 vector sequence for the inputs. 5957 // Parsing produces 64 shorts, employing two 8 vector 5958 // sequences to store and combine the intermediate data. 
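// As a scalar sketch, each group of 3 input bytes (b0, b1, b2) encodes two
// 12-bit values that the vector code below reassembles into 16-bit shorts:
//
//   s0 = (short)( b0       | ((b1 & 0x0f) << 8));
//   s1 = (short)((b1 >> 4) |  (b2 << 4));
//
// (names illustrative, with b0/b1/b2 treated as unsigned bytes); the main
// loop performs 64 such conversions per iteration and the tail handles 48.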
5959 VSeq<6> vin(24); 5960 VSeq<8> va(0), vb(16); 5961 5962 __ adr(tmpAddr, L_F00); 5963 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00 5964 __ add(condensed, condensed, condensedOffs); 5965 5966 __ BIND(L_loop); 5967 // load 96 (6 x 16B) byte values 5968 vs_ld3_post(vin, __ T16B, condensed); 5969 5970 // The front half of sequence vin (vin[0], vin[1] and vin[2]) 5971 // holds 48 (16x3) contiguous bytes from memory striped 5972 // horizontally across each of the 16 byte lanes. Equivalently, 5973 // that is 16 pairs of 12-bit integers. Likewise the back half 5974 // holds the next 48 bytes in the same arrangement. 5975 5976 // Each vector in the front half can also be viewed as a vertical 5977 // strip across the 16 pairs of 12 bit integers. Each byte in 5978 // vin[0] stores the low 8 bits of the first int in a pair. Each 5979 // byte in vin[1] stores the high 4 bits of the first int and the 5980 // low 4 bits of the second int. Each byte in vin[2] stores the 5981 // high 8 bits of the second int. Likewise the vectors in second 5982 // half. 5983 5984 // Converting the data to 16-bit shorts requires first of all 5985 // expanding each of the 6 x 16B vectors into 6 corresponding 5986 // pairs of 8H vectors. Mask, shift and add operations on the 5987 // resulting vector pairs can be used to combine 4 and 8 bit 5988 // parts of related 8H vector elements. 5989 // 5990 // The middle vectors (vin[2] and vin[5]) are actually expanded 5991 // twice, one copy manipulated to provide the lower 4 bits 5992 // belonging to the first short in a pair and another copy 5993 // manipulated to provide the higher 4 bits belonging to the 5994 // second short in a pair. This is why the the vector sequences va 5995 // and vb used to hold the expanded 8H elements are of length 8. 5996 5997 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 5998 // n.b. target elements 2 and 3 duplicate elements 4 and 5 5999 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6000 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6001 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6002 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6003 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6004 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6005 6006 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3] 6007 // and vb[4:5] 6008 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6009 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0); 6010 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6011 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0); 6012 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6013 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0); 6014 6015 // shift lo byte of copy 1 of the middle stripe into the high byte 6016 __ shl(va[2], __ T8H, va[2], 8); 6017 __ shl(va[3], __ T8H, va[3], 8); 6018 __ shl(vb[2], __ T8H, vb[2], 8); 6019 __ shl(vb[3], __ T8H, vb[3], 8); 6020 6021 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this 6022 // time pre-shifted by 4 to ensure top bits of input 12-bit int 6023 // are in bit positions [4..11]. 
6024 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6025 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6026 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6027 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4); 6028 6029 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and 6030 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of 6031 // copy2 6032 __ andr(va[2], __ T16B, va[2], v31); 6033 __ andr(va[3], __ T16B, va[3], v31); 6034 __ ushr(va[4], __ T8H, va[4], 4); 6035 __ ushr(va[5], __ T8H, va[5], 4); 6036 __ andr(vb[2], __ T16B, vb[2], v31); 6037 __ andr(vb[3], __ T16B, vb[3], v31); 6038 __ ushr(vb[4], __ T8H, vb[4], 4); 6039 __ ushr(vb[5], __ T8H, vb[5], 4); 6040 6041 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and 6042 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair 6043 // n.b. the ordering ensures: i) inputs are consumed before they 6044 // are overwritten ii) the order of 16-bit results across successive 6045 // pairs of vectors in va and then vb reflects the order of the 6046 // corresponding 12-bit inputs 6047 __ addv(va[0], __ T8H, va[0], va[2]); 6048 __ addv(va[2], __ T8H, va[1], va[3]); 6049 __ addv(va[1], __ T8H, va[4], va[6]); 6050 __ addv(va[3], __ T8H, va[5], va[7]); 6051 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6052 __ addv(vb[2], __ T8H, vb[1], vb[3]); 6053 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6054 __ addv(vb[3], __ T8H, vb[5], vb[7]); 6055 6056 // store 64 results interleaved as shorts 6057 vs_st2_post(vs_front(va), __ T8H, parsed); 6058 vs_st2_post(vs_front(vb), __ T8H, parsed); 6059 6060 __ sub(parsedLength, parsedLength, 64); 6061 __ cmp(parsedLength, (u1)64); 6062 __ br(Assembler::GE, L_loop); 6063 __ cbz(parsedLength, L_end); 6064 6065 // if anything is left it should be a final 72 bytes of input 6066 // i.e. a final 48 12-bit values. so we handle this by loading 6067 // 48 bytes into all 16B lanes of front(vin) and only 24 6068 // bytes into the lower 8B lane of back(vin) 6069 vs_ld3_post(vs_front(vin), __ T16B, condensed); 6070 vs_ld3(vs_back(vin), __ T8B, condensed); 6071 6072 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 6073 // n.b. target elements 2 and 3 of va duplicate elements 4 and 6074 // 5 and target element 2 of vb duplicates element 4. 6075 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6076 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6077 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6078 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6079 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6080 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6081 6082 // This time expand just the lower 8 lanes 6083 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6084 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6085 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6086 6087 // shift lo byte of copy 1 of the middle stripe into the high byte 6088 __ shl(va[2], __ T8H, va[2], 8); 6089 __ shl(va[3], __ T8H, va[3], 8); 6090 __ shl(vb[2], __ T8H, vb[2], 8); 6091 6092 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into 6093 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit 6094 // int are in bit positions [4..11]. 
6095 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6096 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6097 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6098 6099 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and 6100 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of 6101 // copy2 6102 __ andr(va[2], __ T16B, va[2], v31); 6103 __ andr(va[3], __ T16B, va[3], v31); 6104 __ ushr(va[4], __ T8H, va[4], 4); 6105 __ ushr(va[5], __ T8H, va[5], 4); 6106 __ andr(vb[2], __ T16B, vb[2], v31); 6107 __ ushr(vb[4], __ T8H, vb[4], 4); 6108 6109 6110 6111 // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and 6112 // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair 6113 6114 // n.b. ordering ensures: i) inputs are consumed before they are 6115 // overwritten ii) order of 16-bit results across succsessive 6116 // pairs of vectors in va and then lower half of vb reflects order 6117 // of corresponding 12-bit inputs 6118 __ addv(va[0], __ T8H, va[0], va[2]); 6119 __ addv(va[2], __ T8H, va[1], va[3]); 6120 __ addv(va[1], __ T8H, va[4], va[6]); 6121 __ addv(va[3], __ T8H, va[5], va[7]); 6122 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6123 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6124 6125 // store 48 results interleaved as shorts 6126 vs_st2_post(vs_front(va), __ T8H, parsed); 6127 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed); 6128 6129 __ BIND(L_end); 6130 6131 __ leave(); // required for proper stackwalking of RuntimeStub frame 6132 __ mov(r0, zr); // return 0 6133 __ ret(lr); 6134 6135 return start; 6136 } 6137 6138 // Kyber Barrett reduce function. 6139 // Implements 6140 // static int implKyberBarrettReduce(short[] coeffs) {} 6141 // 6142 // coeffs (short[256]) = c_rarg0 6143 address generate_kyberBarrettReduce() { 6144 6145 __ align(CodeEntryAlignment); 6146 StubGenStubId stub_id = StubGenStubId::kyberBarrettReduce_id; 6147 StubCodeMark mark(this, stub_id); 6148 address start = __ pc(); 6149 __ enter(); 6150 6151 const Register coeffs = c_rarg0; 6152 6153 const Register kyberConsts = r10; 6154 const Register result = r11; 6155 6156 // As above we process 256 sets of values in total i.e. 32 x 6157 // 8H quadwords. So, we can load, add and store the data in 3 6158 // groups of 11, 11 and 10 at a time i.e. we need to map sets 6159 // of 10 or 11 registers. A further constraint is that the 6160 // mapping needs to skip callee saves. So, we allocate the 6161 // register sequences using two 8 sequences, two 2 sequences 6162 // and two single registers. 
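// As a scalar sketch, each 16-bit lane c handled by the loop below is
// reduced as
//
//   int16_t t = (int16_t)((((int64_t)2 * c * barrettMultiplier) >> 16) >> 11);
//   c         = (int16_t)(c - t * kyber_q);
//
// mirroring the sqdmulh/sshr/mlsv steps annotated below (names are
// illustrative; both constants come from the kyberConsts table).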
6163 VSeq<8> vs1_1(0); 6164 VSeq<2> vs1_2(16); 6165 FloatRegister vs1_3 = v28; 6166 VSeq<8> vs2_1(18); 6167 VSeq<2> vs2_2(26); 6168 FloatRegister vs2_3 = v29; 6169 6170 // we also need a pair of corresponding constant sequences 6171 6172 VSeq<8> vc1_1(30, 0); 6173 VSeq<2> vc1_2(30, 0); 6174 FloatRegister vc1_3 = v30; // for kyber_q 6175 6176 VSeq<8> vc2_1(31, 0); 6177 VSeq<2> vc2_2(31, 0); 6178 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier 6179 6180 __ add(result, coeffs, 0); 6181 __ lea(kyberConsts, 6182 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 6183 6184 // load q and the multiplier for the Barrett reduction 6185 __ add(kyberConsts, kyberConsts, 16); 6186 __ ldpq(vc1_3, vc2_3, kyberConsts); 6187 6188 for (int i = 0; i < 3; i++) { 6189 // load 80 or 88 coefficients 6190 vs_ldpq_post(vs1_1, coeffs); 6191 vs_ldpq_post(vs1_2, coeffs); 6192 if (i < 2) { 6193 __ ldr(vs1_3, __ Q, __ post(coeffs, 16)); 6194 } 6195 6196 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16 6197 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1); 6198 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2); 6199 if (i < 2) { 6200 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3); 6201 } 6202 6203 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26 6204 vs_sshr(vs2_1, __ T8H, vs2_1, 11); 6205 vs_sshr(vs2_2, __ T8H, vs2_2, 11); 6206 if (i < 2) { 6207 __ sshr(vs2_3, __ T8H, vs2_3, 11); 6208 } 6209 6210 // vs1 <- vs1 - vs2 * kyber_q 6211 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1); 6212 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2); 6213 if (i < 2) { 6214 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3); 6215 } 6216 6217 vs_stpq_post(vs1_1, result); 6218 vs_stpq_post(vs1_2, result); 6219 if (i < 2) { 6220 __ str(vs1_3, __ Q, __ post(result, 16)); 6221 } 6222 } 6223 6224 __ leave(); // required for proper stackwalking of RuntimeStub frame 6225 __ mov(r0, zr); // return 0 6226 __ ret(lr); 6227 6228 return start; 6229 } 6230 6231 6232 // Dilithium-specific montmul helper routines that generate parallel 6233 // code for, respectively, a single 4x4s vector sequence montmul or 6234 // two such multiplies in a row. 6235 6236 // Perform 16 32-bit Montgomery multiplications in parallel 6237 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 6238 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6239 // Use the helper routine to schedule a 4x4S Montgomery multiply. 6240 // It will assert that the register use is valid 6241 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq); 6242 } 6243 6244 // Perform 2x16 32-bit Montgomery multiplications in parallel 6245 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 6246 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6247 // Schedule two successive 4x4S multiplies via the montmul helper 6248 // on the front and back halves of va, vb and vc. The helper will 6249 // assert that the register use has no overlap conflicts on each 6250 // individual call but we also need to ensure that the necessary 6251 // disjoint/equality constraints are met across both calls. 6252 6253 // vb, vc, vtmp and vq must be disjoint. 
va must either be
6254 // disjoint from all other registers or equal vc
6255
6256 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6257 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6258 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6259
6260 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6261 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6262
6263 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6264
6265 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6266 assert(vs_disjoint(va, vb), "va and vb overlap");
6267 assert(vs_disjoint(va, vq), "va and vq overlap");
6268 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6269
6270 // We multiply the front and back halves of each sequence 4 at a
6271 // time because
6272 //
6273 // 1) we are currently only able to get 4-way instruction
6274 // parallelism at best
6275 //
6276 // 2) we need registers for the constants in vq and temporary
6277 // scratch registers to hold intermediate results so vtmp can only
6278 // be a VSeq<4> which means we only have 4 scratch slots.
6279
6280 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6281 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6282 }
6283
6284 // Perform combined montmul then add/sub on 4x4S vectors.
6285 void dilithium_montmul16_sub_add(
6286 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6287 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6288 // compute a = montmul(a1, c)
6289 dilithium_montmul16(vc, va1, vc, vtmp, vq);
6290 // output a1 = a0 - a
6291 vs_subv(va1, __ T4S, va0, vc);
6292 // and a0 = a0 + a
6293 vs_addv(va0, __ T4S, va0, vc);
6294 }
6295
6296 // Perform combined add/sub then montmul on 4x4S vectors.
6297 void dilithium_sub_add_montmul16(
6298 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6299 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6300 // compute c = a0 - a1
6301 vs_subv(vtmp1, __ T4S, va0, va1);
6302 // output a0 = a0 + a1
6303 vs_addv(va0, __ T4S, va0, va1);
6304 // output a1 = b montmul c
6305 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6306 }
6307
6308 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6309 // in the Java implementation come in sequences of at least 8, so we
6310 // can use ldpq to collect the corresponding data into pairs of vector
6311 // registers.
6312 // We collect the coefficients corresponding to the 'j+l' indexes into
6313 // the vector registers v0-v7 and the zetas into the vector registers v16-v23,
6314 // then we do the (Montgomery) multiplications by the zetas in parallel
6315 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6316 // v0-v7, then do the additions into v24-v31 and the subtractions into
6317 // v0-v7 and finally save the results back to the coeffs array.
6318 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6319 const Register coeffs, const Register zetas) {
6320 int c1 = 0;
6321 int c2 = 512;
6322 int startIncr;
6323 // don't use callee save registers v8 - v15
6324 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6325 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6326 VSeq<2> vq(30); // n.b.
constants overlap vs3 6327 int offsets[4] = { 0, 32, 64, 96 }; 6328 6329 for (int level = 0; level < 5; level++) { 6330 int c1Start = c1; 6331 int c2Start = c2; 6332 if (level == 3) { 6333 offsets[1] = 32; 6334 offsets[2] = 128; 6335 offsets[3] = 160; 6336 } else if (level == 4) { 6337 offsets[1] = 64; 6338 offsets[2] = 128; 6339 offsets[3] = 192; 6340 } 6341 6342 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a 6343 // time at 4 different offsets and multiply them in order by the 6344 // next set of input values. So we employ indexed load and store 6345 // pair instructions with arrangement 4S. 6346 for (int i = 0; i < 4; i++) { 6347 // reload q and qinv 6348 vs_ldpq(vq, dilithiumConsts); // qInv, q 6349 // load 8x4S coefficients via second start pos == c2 6350 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets); 6351 // load next 8x4S inputs == b 6352 vs_ldpq_post(vs2, zetas); 6353 // compute a == c2 * b mod MONT_Q 6354 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6355 // load 8x4s coefficients via first start pos == c1 6356 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6357 // compute a1 = c1 + a 6358 vs_addv(vs3, __ T4S, vs1, vs2); 6359 // compute a2 = c1 - a 6360 vs_subv(vs1, __ T4S, vs1, vs2); 6361 // output a1 and a2 6362 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6363 vs_stpq_indexed(vs1, coeffs, c2Start, offsets); 6364 6365 int k = 4 * level + i; 6366 6367 if (k > 7) { 6368 startIncr = 256; 6369 } else if (k == 5) { 6370 startIncr = 384; 6371 } else { 6372 startIncr = 128; 6373 } 6374 6375 c1Start += startIncr; 6376 c2Start += startIncr; 6377 } 6378 6379 c2 /= 2; 6380 } 6381 } 6382 6383 // Dilithium NTT function except for the final "normalization" to |coeff| < Q. 6384 // Implements the method 6385 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {} 6386 // of the Java class sun.security.provider 6387 // 6388 // coeffs (int[256]) = c_rarg0 6389 // zetas (int[256]) = c_rarg1 6390 address generate_dilithiumAlmostNtt() { 6391 6392 __ align(CodeEntryAlignment); 6393 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id; 6394 StubCodeMark mark(this, stub_id); 6395 address start = __ pc(); 6396 __ enter(); 6397 6398 const Register coeffs = c_rarg0; 6399 const Register zetas = c_rarg1; 6400 6401 const Register tmpAddr = r9; 6402 const Register dilithiumConsts = r10; 6403 const Register result = r11; 6404 // don't use callee save registers v8 - v15 6405 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6406 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6407 VSeq<2> vq(30); // n.b. constants overlap vs3 6408 int offsets[4] = { 0, 32, 64, 96}; 6409 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6410 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6411 __ add(result, coeffs, 0); 6412 __ lea(dilithiumConsts, 6413 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6414 6415 // Each level represents one iteration of the outer for loop of the Java version. 6416 6417 // level 0-4 6418 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas); 6419 6420 // level 5 6421 6422 // At level 5 the coefficients we need to combine with the zetas 6423 // are grouped in memory in blocks of size 4. So, for both sets of 6424 // coefficients we load 4 adjacent values at 8 different offsets 6425 // using an indexed ldr with register variant Q and multiply them 6426 // in sequence order by the next set of inputs. Likewise we store 6427 // the resuls using an indexed str with register variant Q. 
6428 for (int i = 0; i < 1024; i += 256) { 6429 // reload constants q, qinv each iteration as they get clobbered later 6430 vs_ldpq(vq, dilithiumConsts); // qInv, q 6431 // load 32 (8x4S) coefficients via first offsets = c1 6432 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6433 // load next 32 (8x4S) inputs = b 6434 vs_ldpq_post(vs2, zetas); 6435 // a = b montmul c1 6436 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6437 // load 32 (8x4S) coefficients via second offsets = c2 6438 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2); 6439 // add/sub with result of multiply 6440 vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a 6441 vs_subv(vs1, __ T4S, vs1, vs2); // a2 = c2 - a 6442 // write back new coefficients using same offsets 6443 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2); 6444 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1); 6445 } 6446 6447 // level 6 6448 // At level 6 the coefficients we need to combine with the zetas 6449 // are grouped in memory in pairs, the first two being add/sub 6450 // inputs and the second two montmul inputs. We can still implement 6451 // the montmul+sub+add using 4-way parallelism but only if we 6452 // combine the coefficients with the zetas 16 at a time. We load 8 6453 // adjacent values at 4 different offsets using an ld2 load with 6454 // arrangement 2D. That interleaves the lower and upper halves of 6455 // each pair of quadwords into successive vector registers. We 6456 // then need to montmul the 4 odd elements of the coefficients 6457 // register sequence by the zetas in order and then add/sub the 4 6458 // even elements of the coefficients register sequence. We use an 6459 // equivalent st2 operation to store the results back into memory 6460 // de-interleaved. 6461 for (int i = 0; i < 1024; i += 128) { 6462 // reload constants q, qinv each iteration as they get clobbered later 6463 vs_ldpq(vq, dilithiumConsts); // qInv, q 6464 // load interleaved 16 (4x2D) coefficients via offsets 6465 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6466 // load next 16 (4x4S) inputs 6467 vs_ldpq_post(vs_front(vs2), zetas); 6468 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6469 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6470 vs_front(vs2), vtmp, vq); 6471 // store interleaved 16 (4x2D) coefficients via offsets 6472 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6473 } 6474 6475 // level 7 6476 // At level 7 the coefficients we need to combine with the zetas 6477 // occur singly with add/sub inputs alternating with montmul 6478 // inputs. Once again we can use 4-way parallelism to combine 16 6479 // zetas at a time. However, we have to load 8 adjacent values at 6480 // 4 different offsets using an ld2 load with arrangement 4S. That 6481 // interleaves the even words of each pair into one 6482 // coefficients vector register and the odd words of the pair 6483 // into the next register. We then need to montmul the 4 odd 6484 // elements of the coefficients register sequence by the zetas in 6485 // order and then add/sub the 4 even elements of the coefficients 6486 // register sequence. We use an equivalent st2 operation to store 6487 // the results back into memory de-interleaved.
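    // n.b. an ld2 of a register pair with arrangement 4S loads 8
    // consecutive words and de-interleaves them, one register of the
    // pair collecting the words at even positions and the other the
    // words at odd positions; the matching st2 re-interleaves them on
    // the way back to memory. (Illustrative summary of the LD2/ST2
    // behaviour that vs_ld2_indexed/vs_st2_indexed are assumed to wrap.)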
6488 6489 for (int i = 0; i < 1024; i += 128) { 6490 // reload constants q, qinv each iteration as they get clobbered later 6491 vs_ldpq(vq, dilithiumConsts); // qInv, q 6492 // load interleaved 16 (4x4S) coefficients via offsets 6493 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6494 // load next 16 (4x4S) inputs 6495 vs_ldpq_post(vs_front(vs2), zetas); 6496 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6497 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6498 vs_front(vs2), vtmp, vq); 6499 // store interleaved 16 (4x4S) coefficients via offsets 6500 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6501 } 6502 __ leave(); // required for proper stackwalking of RuntimeStub frame 6503 __ mov(r0, zr); // return 0 6504 __ ret(lr); 6505 6506 return start; 6507 } 6508 6509 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6510 // in the Java implementation come in sequences of at least 8, so we 6511 // can use ldpq to collect the corresponding data into pairs of vector 6512 // registers 6513 // We collect the coefficients that correspond to the 'j's into vs1 6514 // the coefficiets that correspond to the 'j+l's into vs2 then 6515 // do the additions into vs3 and the subtractions into vs1 then 6516 // save the result of the additions, load the zetas into vs2 6517 // do the (Montgomery) multiplications by zeta in parallel into vs2 6518 // finally save the results back to the coeffs array 6519 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts, 6520 const Register coeffs, const Register zetas) { 6521 int c1 = 0; 6522 int c2 = 32; 6523 int startIncr; 6524 int offsets[4]; 6525 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6526 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6527 VSeq<2> vq(30); // n.b. constants overlap vs3 6528 6529 offsets[0] = 0; 6530 6531 for (int level = 3; level < 8; level++) { 6532 int c1Start = c1; 6533 int c2Start = c2; 6534 if (level == 3) { 6535 offsets[1] = 64; 6536 offsets[2] = 128; 6537 offsets[3] = 192; 6538 } else if (level == 4) { 6539 offsets[1] = 32; 6540 offsets[2] = 128; 6541 offsets[3] = 160; 6542 } else { 6543 offsets[1] = 32; 6544 offsets[2] = 64; 6545 offsets[3] = 96; 6546 } 6547 6548 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a 6549 // time at 4 different offsets and multiply them in order by the 6550 // next set of input values. So we employ indexed load and store 6551 // pair instructions with arrangement 4S. 6552 for (int i = 0; i < 4; i++) { 6553 // load v1 32 (8x4S) coefficients relative to first start index 6554 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6555 // load v2 32 (8x4S) coefficients relative to second start index 6556 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets); 6557 // a0 = v1 + v2 -- n.b. 
clobbers vqs 6558 vs_addv(vs3, __ T4S, vs1, vs2); 6559 // a1 = v1 - v2 6560 vs_subv(vs1, __ T4S, vs1, vs2); 6561 // save a0 relative to first start index 6562 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6563 // load constants q, qinv each iteration as they get clobbered above 6564 vs_ldpq(vq, dilithiumConsts); // qInv, q 6565 // load b next 32 (8x4S) inputs 6566 vs_ldpq_post(vs2, zetas); 6567 // a = a1 montmul b 6568 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6569 // save a relative to second start index 6570 vs_stpq_indexed(vs2, coeffs, c2Start, offsets); 6571 6572 int k = 4 * level + i; 6573 6574 if (k < 24) { 6575 startIncr = 256; 6576 } else if (k == 25) { 6577 startIncr = 384; 6578 } else { 6579 startIncr = 128; 6580 } 6581 6582 c1Start += startIncr; 6583 c2Start += startIncr; 6584 } 6585 6586 c2 *= 2; 6587 } 6588 } 6589 6590 // Dilithium Inverse NTT function except the final mod Q division by 2^256. 6591 // Implements the method 6592 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of 6593 // the sun.security.provider.ML_DSA class. 6594 // 6595 // coeffs (int[256]) = c_rarg0 6596 // zetas (int[256]) = c_rarg1 6597 address generate_dilithiumAlmostInverseNtt() { 6598 6599 __ align(CodeEntryAlignment); 6600 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id; 6601 StubCodeMark mark(this, stub_id); 6602 address start = __ pc(); 6603 __ enter(); 6604 6605 const Register coeffs = c_rarg0; 6606 const Register zetas = c_rarg1; 6607 6608 const Register tmpAddr = r9; 6609 const Register dilithiumConsts = r10; 6610 const Register result = r11; 6611 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6612 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6613 VSeq<2> vq(30); // n.b. constants overlap vs3 6614 int offsets[4] = { 0, 32, 64, 96 }; 6615 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6616 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6617 6618 __ add(result, coeffs, 0); 6619 __ lea(dilithiumConsts, 6620 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6621 6622 // Each level represents one iteration of the outer for loop of the Java version 6623 6624 // level 0 6625 // At level 0 we need to interleave adjacent quartets of 6626 // coefficients before we multiply and add/sub by the next 16 6627 // zetas just as we did for level 7 in the multiply code. So we 6628 // load and store the values using an ld2/st2 with arrangement 4S. 6629 for (int i = 0; i < 1024; i += 128) { 6630 // load constants q, qinv 6631 // n.b. this can be moved out of the loop as they do not get 6632 // clobbered by first two loops 6633 vs_ldpq(vq, dilithiumConsts); // qInv, q 6634 // a0/a1 load interleaved 32 (8x4S) coefficients 6635 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6636 // b load next 16 (4x4S) inputs 6637 vs_ldpq_post(vs_front(vs2), zetas); 6638 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6639 // n.b. second half of vs2 provides temporary register storage 6640 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6641 vs_front(vs2), vs_back(vs2), vtmp, vq); 6642 // a0/a1 store interleaved 32 (8x4S) coefficients 6643 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6644 } 6645 6646 // level 1 6647 // At level 1 we need to interleave pairs of adjacent pairs of 6648 // coefficients before we multiply by the next 16 zetas just as we 6649 // did for level 6 in the multiply code.
So we load and store the 6650 // values using an ld2/st2 with arrangement 2D. 6651 for (int i = 0; i < 1024; i += 128) { 6652 // a0/a1 load interleaved 32 (8x2D) coefficients 6653 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6654 // b load next 16 (4x4S) inputs 6655 vs_ldpq_post(vs_front(vs2), zetas); 6656 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6657 // n.b. second half of vs2 provides temporary register storage 6658 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6659 vs_front(vs2), vs_back(vs2), vtmp, vq); 6660 // a0/a1 store interleaved 32 (8x2D) coefficients 6661 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6662 } 6663 6664 // level 2 6665 // At level 2 coefficients come in blocks of 4. So, we load 4 6666 // adjacent coefficients at 8 distinct offsets for both the first 6667 // and second coefficient sequences, using an ldr with register 6668 // variant Q then combine them with next set of 32 zetas. Likewise 6669 // we store the results using an str with register variant Q. 6670 for (int i = 0; i < 1024; i += 256) { 6671 // c0 load 32 (8x4S) coefficients via first offsets 6672 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6673 // c1 load 32 (8x4S) coefficients via second offsets 6674 vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2); 6675 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3 6676 vs_addv(vs3, __ T4S, vs1, vs2); 6677 // c = c0 - c1 6678 vs_subv(vs1, __ T4S, vs1, vs2); 6679 // store a0 32 (8x4S) coefficients via first offsets 6680 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1); 6681 // b load 32 (8x4S) next inputs 6682 vs_ldpq_post(vs2, zetas); 6683 // reload constants q, qinv -- they were clobbered earlier 6684 vs_ldpq(vq, dilithiumConsts); // qInv, q 6685 // compute a1 = b montmul c 6686 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6687 // store a1 32 (8x4S) coefficients via second offsets 6688 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2); 6689 } 6690 6691 // level 3-7 6692 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas); 6693 6694 __ leave(); // required for proper stackwalking of RuntimeStub frame 6695 __ mov(r0, zr); // return 0 6696 __ ret(lr); 6697 6698 return start; 6699 } 6700 6701 // Dilithium multiply polynomials in the NTT domain. 6702 // Straightforward implementation of the method 6703 // static int implDilithiumNttMult( 6704 // int[] result, int[] ntta, int[] nttb) {} of 6705 // the sun.security.provider.ML_DSA class. 6706 // 6707 // result (int[256]) = c_rarg0 6708 // poly1 (int[256]) = c_rarg1 6709 // poly2 (int[256]) = c_rarg2 6710 address generate_dilithiumNttMult() { 6711 6712 __ align(CodeEntryAlignment); 6713 StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id; 6714 StubCodeMark mark(this, stub_id); 6715 address start = __ pc(); 6716 __ enter(); 6717 6718 Label L_loop; 6719 6720 const Register result = c_rarg0; 6721 const Register poly1 = c_rarg1; 6722 const Register poly2 = c_rarg2; 6723 6724 const Register dilithiumConsts = r10; 6725 const Register len = r11; 6726 6727 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6728 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6729 VSeq<2> vq(30); // n.b.
constants overlap vs3 6730 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE 6731 6732 __ lea(dilithiumConsts, 6733 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6734 6735 // load constants q, qinv 6736 vs_ldpq(vq, dilithiumConsts); // qInv, q 6737 // load constant rSquare into v29 6738 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare 6739 6740 __ mov(len, zr); 6741 __ add(len, len, 1024); 6742 6743 __ BIND(L_loop); 6744 6745 // b load 32 (8x4S) next inputs from poly1 6746 vs_ldpq_post(vs1, poly1); 6747 // c load 32 (8x4S) next inputs from poly2 6748 vs_ldpq_post(vs2, poly2); 6749 // compute a = b montmul c 6750 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6751 // compute a = rsquare montmul a 6752 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq); 6753 // save a 32 (8x4S) results 6754 vs_stpq_post(vs2, result); 6755 6756 __ sub(len, len, 128); 6757 __ cmp(len, (u1)128); 6758 __ br(Assembler::GE, L_loop); 6759 6760 __ leave(); // required for proper stackwalking of RuntimeStub frame 6761 __ mov(r0, zr); // return 0 6762 __ ret(lr); 6763 6764 return start; 6765 } 6766 6767 // Dilithium Montgomery multiply an array by a constant. 6768 // A straightforward implementation of the method 6769 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {} 6770 // of the sun.security.provider.ML_DSA class 6771 // 6772 // coeffs (int[256]) = c_rarg0 6773 // constant (int) = c_rarg1 6774 address generate_dilithiumMontMulByConstant() { 6775 6776 __ align(CodeEntryAlignment); 6777 StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id; 6778 StubCodeMark mark(this, stub_id); 6779 address start = __ pc(); 6780 __ enter(); 6781 6782 Label L_loop; 6783 6784 const Register coeffs = c_rarg0; 6785 const Register constant = c_rarg1; 6786 6787 const Register dilithiumConsts = r10; 6788 const Register result = r11; 6789 const Register len = r12; 6790 6791 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6792 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6793 VSeq<2> vq(30); // n.b. constants overlap vs3 6794 VSeq<8> vconst(29, 0); // for montmul by constant 6795 6796 // results track inputs 6797 __ add(result, coeffs, 0); 6798 __ lea(dilithiumConsts, 6799 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6800 6801 // load constants q, qinv -- they do not get clobbered by first two loops 6802 vs_ldpq(vq, dilithiumConsts); // qInv, q 6803 // copy caller supplied constant across vconst 6804 __ dup(vconst[0], __ T4S, constant); 6805 __ mov(len, zr); 6806 __ add(len, len, 1024); 6807 6808 __ BIND(L_loop); 6809 6810 // load next 32 inputs 6811 vs_ldpq_post(vs2, coeffs); 6812 // mont mul by constant 6813 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq); 6814 // write next 32 results 6815 vs_stpq_post(vs2, result); 6816 6817 __ sub(len, len, 128); 6818 __ cmp(len, (u1)128); 6819 __ br(Assembler::GE, L_loop); 6820 6821 __ leave(); // required for proper stackwalking of RuntimeStub frame 6822 __ mov(r0, zr); // return 0 6823 __ ret(lr); 6824 6825 return start; 6826 } 6827 6828 // Dilithium decompose poly.
6829 // Implements the method 6830 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {} 6831 // of the sun.security.provider.ML_DSA class 6832 // 6833 // input (int[256]) = c_rarg0 6834 // lowPart (int[256]) = c_rarg1 6835 // highPart (int[256]) = c_rarg2 6836 // twoGamma2 (int) = c_rarg3 6837 // multiplier (int) = c_rarg4 6838 address generate_dilithiumDecomposePoly() { 6839 6840 __ align(CodeEntryAlignment); 6841 StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id; 6842 StubCodeMark mark(this, stub_id); 6843 address start = __ pc(); 6844 Label L_loop; 6845 6846 const Register input = c_rarg0; 6847 const Register lowPart = c_rarg1; 6848 const Register highPart = c_rarg2; 6849 const Register twoGamma2 = c_rarg3; 6850 const Register multiplier = c_rarg4; 6851 6852 const Register len = r9; 6853 const Register dilithiumConsts = r10; 6854 const Register tmp = r11; 6855 6856 // 6 independent sets of 4x4s values 6857 VSeq<4> vs1(0), vs2(4), vs3(8); 6858 VSeq<4> vs4(12), vs5(16), vtmp(20); 6859 6860 // 7 constants for cross-multiplying 6861 VSeq<4> one(25, 0); 6862 VSeq<4> qminus1(26, 0); 6863 VSeq<4> g2(27, 0); 6864 VSeq<4> twog2(28, 0); 6865 VSeq<4> mult(29, 0); 6866 VSeq<4> q(30, 0); 6867 VSeq<4> qadd(31, 0); 6868 6869 __ enter(); 6870 6871 __ lea(dilithiumConsts, 6872 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6873 6874 // save callee-saved registers 6875 __ stpd(v8, v9, __ pre(sp, -64)); 6876 __ stpd(v10, v11, Address(sp, 16)); 6877 __ stpd(v12, v13, Address(sp, 32)); 6878 __ stpd(v14, v15, Address(sp, 48)); 6879 6880 // populate constant registers 6881 __ mov(tmp, zr); 6882 __ add(tmp, tmp, 1); 6883 __ dup(one[0], __ T4S, tmp); // 1 6884 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q 6885 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce 6886 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2 6887 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce 6888 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1 6889 __ sshr(g2[0], __ T4S, v28, 1); // gamma2 6890 6891 __ mov(len, zr); 6892 __ add(len, len, 1024); 6893 6894 __ BIND(L_loop); 6895 6896 // load next 4x4S inputs interleaved: rplus --> vs1 6897 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64)); 6898 6899 // rplus = rplus - ((rplus + qadd) >> 23) * q 6900 vs_addv(vtmp, __ T4S, vs1, qadd); 6901 vs_sshr(vtmp, __ T4S, vtmp, 23); 6902 vs_mulv(vtmp, __ T4S, vtmp, q); 6903 vs_subv(vs1, __ T4S, vs1, vtmp); 6904 6905 // rplus = rplus + ((rplus >> 31) & dilithium_q); 6906 vs_sshr(vtmp, __ T4S, vs1, 31); 6907 vs_andr(vtmp, vtmp, q); 6908 vs_addv(vs1, __ T4S, vs1, vtmp); 6909 6910 // quotient --> vs2 6911 // int quotient = (rplus * multiplier) >> 22; 6912 vs_mulv(vtmp, __ T4S, vs1, mult); 6913 vs_sshr(vs2, __ T4S, vtmp, 22); 6914 6915 // r0 --> vs3 6916 // int r0 = rplus - quotient * twoGamma2; 6917 vs_mulv(vtmp, __ T4S, vs2, twog2); 6918 vs_subv(vs3, __ T4S, vs1, vtmp); 6919 6920 // mask --> vs4 6921 // int mask = (twoGamma2 - r0) >> 22; 6922 vs_subv(vtmp, __ T4S, twog2, vs3); 6923 vs_sshr(vs4, __ T4S, vtmp, 22); 6924 6925 // r0 -= (mask & twoGamma2); 6926 vs_andr(vtmp, vs4, twog2); 6927 vs_subv(vs3, __ T4S, vs3, vtmp); 6928 6929 // quotient += (mask & 1); 6930 vs_andr(vtmp, vs4, one); 6931 vs_addv(vs2, __ T4S, vs2, vtmp); 6932 6933 // mask = (twoGamma2 / 2 - r0) >> 31; 6934 vs_subv(vtmp, __ T4S, g2, vs3); 6935 vs_sshr(vs4, __ T4S, vtmp, 31); 6936 6937 // r0 -= (mask & twoGamma2); 6938 vs_andr(vtmp, vs4, twog2); 6939 
vs_subv(vs3, __ T4S, vs3, vtmp); 6940 6941 // quotient += (mask & 1); 6942 vs_andr(vtmp, vs4, one); 6943 vs_addv(vs2, __ T4S, vs2, vtmp); 6944 6945 // r1 --> vs5 6946 // int r1 = rplus - r0 - (dilithium_q - 1); 6947 vs_subv(vtmp, __ T4S, vs1, vs3); 6948 vs_subv(vs5, __ T4S, vtmp, qminus1); 6949 6950 // r1 --> vs1 (overwriting rplus) 6951 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 6952 vs_negr(vtmp, __ T4S, vs5); 6953 vs_orr(vtmp, vs5, vtmp); 6954 vs_sshr(vs1, __ T4S, vtmp, 31); 6955 6956 // r0 += ~r1; 6957 vs_notr(vtmp, vs1); 6958 vs_addv(vs3, __ T4S, vs3, vtmp); 6959 6960 // r1 = r1 & quotient; 6961 vs_andr(vs1, vs2, vs1); 6962 6963 // store results interleaved 6964 // lowPart[m] = r0; 6965 // highPart[m] = r1; 6966 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64)); 6967 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64)); 6968 6969 __ sub(len, len, 64); 6970 __ cmp(len, (u1)64); 6971 __ br(Assembler::GE, L_loop); 6972 6973 // restore callee-saved vector registers 6974 __ ldpd(v14, v15, Address(sp, 48)); 6975 __ ldpd(v12, v13, Address(sp, 32)); 6976 __ ldpd(v10, v11, Address(sp, 16)); 6977 __ ldpd(v8, v9, __ post(sp, 64)); 6978 6979 __ leave(); // required for proper stackwalking of RuntimeStub frame 6980 __ mov(r0, zr); // return 0 6981 __ ret(lr); 6982 6983 return start; 6984 } 6985 6986 /** 6987 * Arguments: 6988 * 6989 * Inputs: 6990 * c_rarg0 - int crc 6991 * c_rarg1 - byte* buf 6992 * c_rarg2 - int length 6993 * 6994 * Output: 6995 * r0 - int crc result 6996 */ 6997 address generate_updateBytesCRC32() { 6998 assert(UseCRC32Intrinsics, "what are we doing here?"); 6999 7000 __ align(CodeEntryAlignment); 7001 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id; 7002 StubCodeMark mark(this, stub_id); 7003 7004 address start = __ pc(); 7005 7006 const Register crc = c_rarg0; // crc 7007 const Register buf = c_rarg1; // source java byte array address 7008 const Register len = c_rarg2; // length 7009 const Register table0 = c_rarg3; // crc_table address 7010 const Register table1 = c_rarg4; 7011 const Register table2 = c_rarg5; 7012 const Register table3 = c_rarg6; 7013 const Register tmp3 = c_rarg7; 7014 7015 BLOCK_COMMENT("Entry:"); 7016 __ enter(); // required for proper stackwalking of RuntimeStub frame 7017 7018 __ kernel_crc32(crc, buf, len, 7019 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7020 7021 __ leave(); // required for proper stackwalking of RuntimeStub frame 7022 __ ret(lr); 7023 7024 return start; 7025 } 7026 7027 /** 7028 * Arguments: 7029 * 7030 * Inputs: 7031 * c_rarg0 - int crc 7032 * c_rarg1 - byte* buf 7033 * c_rarg2 - int length 7034 * c_rarg3 - int* table 7035 * 7036 * Output: 7037 * r0 - int crc result 7038 */ 7039 address generate_updateBytesCRC32C() { 7040 assert(UseCRC32CIntrinsics, "what are we doing here?"); 7041 7042 __ align(CodeEntryAlignment); 7043 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id; 7044 StubCodeMark mark(this, stub_id); 7045 7046 address start = __ pc(); 7047 7048 const Register crc = c_rarg0; // crc 7049 const Register buf = c_rarg1; // source java byte array address 7050 const Register len = c_rarg2; // length 7051 const Register table0 = c_rarg3; // crc_table address 7052 const Register table1 = c_rarg4; 7053 const Register table2 = c_rarg5; 7054 const Register table3 = c_rarg6; 7055 const Register tmp3 = c_rarg7; 7056 7057 BLOCK_COMMENT("Entry:"); 7058 __ enter(); // required for proper stackwalking of RuntimeStub frame 7059
7060 __ kernel_crc32c(crc, buf, len, 7061 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7062 7063 __ leave(); // required for proper stackwalking of RuntimeStub frame 7064 __ ret(lr); 7065 7066 return start; 7067 } 7068 7069 /*** 7070 * Arguments: 7071 * 7072 * Inputs: 7073 * c_rarg0 - int adler 7074 * c_rarg1 - byte* buff 7075 * c_rarg2 - int len 7076 * 7077 * Output: 7078 * c_rarg0 - int adler result 7079 */ 7080 address generate_updateBytesAdler32() { 7081 __ align(CodeEntryAlignment); 7082 StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id; 7083 StubCodeMark mark(this, stub_id); 7084 address start = __ pc(); 7085 7086 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 7087 7088 // Aliases 7089 Register adler = c_rarg0; 7090 Register s1 = c_rarg0; 7091 Register s2 = c_rarg3; 7092 Register buff = c_rarg1; 7093 Register len = c_rarg2; 7094 Register nmax = r4; 7095 Register base = r5; 7096 Register count = r6; 7097 Register temp0 = rscratch1; 7098 Register temp1 = rscratch2; 7099 FloatRegister vbytes = v0; 7100 FloatRegister vs1acc = v1; 7101 FloatRegister vs2acc = v2; 7102 FloatRegister vtable = v3; 7103 7104 // Max number of bytes we can process before having to take the mod 7105 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 7106 uint64_t BASE = 0xfff1; 7107 uint64_t NMAX = 0x15B0; 7108 7109 __ mov(base, BASE); 7110 __ mov(nmax, NMAX); 7111 7112 // Load accumulation coefficients for the upper 16 bits 7113 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 7114 __ ld1(vtable, __ T16B, Address(temp0)); 7115 7116 // s1 is initialized to the lower 16 bits of adler 7117 // s2 is initialized to the upper 16 bits of adler 7118 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 7119 __ uxth(s1, adler); // s1 = (adler & 0xffff) 7120 7121 // The pipelined loop needs at least 16 elements for 1 iteration 7122 // It does check this, but it is more effective to skip to the cleanup loop 7123 __ cmp(len, (u1)16); 7124 __ br(Assembler::HS, L_nmax); 7125 __ cbz(len, L_combine); 7126 7127 __ bind(L_simple_by1_loop); 7128 __ ldrb(temp0, Address(__ post(buff, 1))); 7129 __ add(s1, s1, temp0); 7130 __ add(s2, s2, s1); 7131 __ subs(len, len, 1); 7132 __ br(Assembler::HI, L_simple_by1_loop); 7133 7134 // s1 = s1 % BASE 7135 __ subs(temp0, s1, base); 7136 __ csel(s1, temp0, s1, Assembler::HS); 7137 7138 // s2 = s2 % BASE 7139 __ lsr(temp0, s2, 16); 7140 __ lsl(temp1, temp0, 4); 7141 __ sub(temp1, temp1, temp0); 7142 __ add(s2, temp1, s2, ext::uxth); 7143 7144 __ subs(temp0, s2, base); 7145 __ csel(s2, temp0, s2, Assembler::HS); 7146 7147 __ b(L_combine); 7148 7149 __ bind(L_nmax); 7150 __ subs(len, len, nmax); 7151 __ sub(count, nmax, 16); 7152 __ br(Assembler::LO, L_by16); 7153 7154 __ bind(L_nmax_loop); 7155 7156 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7157 vbytes, vs1acc, vs2acc, vtable); 7158 7159 __ subs(count, count, 16); 7160 __ br(Assembler::HS, L_nmax_loop); 7161 7162 // s1 = s1 % BASE 7163 __ lsr(temp0, s1, 16); 7164 __ lsl(temp1, temp0, 4); 7165 __ sub(temp1, temp1, temp0); 7166 __ add(temp1, temp1, s1, ext::uxth); 7167 7168 __ lsr(temp0, temp1, 16); 7169 __ lsl(s1, temp0, 4); 7170 __ sub(s1, s1, temp0); 7171 __ add(s1, s1, temp1, ext:: uxth); 7172 7173 __ subs(temp0, s1, base); 7174 __ csel(s1, temp0, s1, Assembler::HS); 7175 7176 // s2 = s2 % BASE 7177 __ lsr(temp0, s2, 16); 7178 __ lsl(temp1, temp0, 4); 7179 __ 
sub(temp1, temp1, temp0); 7180 __ add(temp1, temp1, s2, ext::uxth); 7181 7182 __ lsr(temp0, temp1, 16); 7183 __ lsl(s2, temp0, 4); 7184 __ sub(s2, s2, temp0); 7185 __ add(s2, s2, temp1, ext:: uxth); 7186 7187 __ subs(temp0, s2, base); 7188 __ csel(s2, temp0, s2, Assembler::HS); 7189 7190 __ subs(len, len, nmax); 7191 __ sub(count, nmax, 16); 7192 __ br(Assembler::HS, L_nmax_loop); 7193 7194 __ bind(L_by16); 7195 __ adds(len, len, count); 7196 __ br(Assembler::LO, L_by1); 7197 7198 __ bind(L_by16_loop); 7199 7200 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7201 vbytes, vs1acc, vs2acc, vtable); 7202 7203 __ subs(len, len, 16); 7204 __ br(Assembler::HS, L_by16_loop); 7205 7206 __ bind(L_by1); 7207 __ adds(len, len, 15); 7208 __ br(Assembler::LO, L_do_mod); 7209 7210 __ bind(L_by1_loop); 7211 __ ldrb(temp0, Address(__ post(buff, 1))); 7212 __ add(s1, temp0, s1); 7213 __ add(s2, s2, s1); 7214 __ subs(len, len, 1); 7215 __ br(Assembler::HS, L_by1_loop); 7216 7217 __ bind(L_do_mod); 7218 // s1 = s1 % BASE 7219 __ lsr(temp0, s1, 16); 7220 __ lsl(temp1, temp0, 4); 7221 __ sub(temp1, temp1, temp0); 7222 __ add(temp1, temp1, s1, ext::uxth); 7223 7224 __ lsr(temp0, temp1, 16); 7225 __ lsl(s1, temp0, 4); 7226 __ sub(s1, s1, temp0); 7227 __ add(s1, s1, temp1, ext:: uxth); 7228 7229 __ subs(temp0, s1, base); 7230 __ csel(s1, temp0, s1, Assembler::HS); 7231 7232 // s2 = s2 % BASE 7233 __ lsr(temp0, s2, 16); 7234 __ lsl(temp1, temp0, 4); 7235 __ sub(temp1, temp1, temp0); 7236 __ add(temp1, temp1, s2, ext::uxth); 7237 7238 __ lsr(temp0, temp1, 16); 7239 __ lsl(s2, temp0, 4); 7240 __ sub(s2, s2, temp0); 7241 __ add(s2, s2, temp1, ext:: uxth); 7242 7243 __ subs(temp0, s2, base); 7244 __ csel(s2, temp0, s2, Assembler::HS); 7245 7246 // Combine lower bits and higher bits 7247 __ bind(L_combine); 7248 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 7249 7250 __ ret(lr); 7251 7252 return start; 7253 } 7254 7255 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 7256 Register temp0, Register temp1, FloatRegister vbytes, 7257 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 7258 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 7259 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 7260 // In non-vectorized code, we update s1 and s2 as: 7261 // s1 <- s1 + b1 7262 // s2 <- s2 + s1 7263 // s1 <- s1 + b2 7264 // s2 <- s2 + b1 7265 // ... 7266 // s1 <- s1 + b16 7267 // s2 <- s2 + s1 7268 // Putting above assignments together, we have: 7269 // s1_new = s1 + b1 + b2 + ... + b16 7270 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 7271 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 7272 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 7273 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 7274 7275 // s2 = s2 + s1 * 16 7276 __ add(s2, s2, s1, Assembler::LSL, 4); 7277 7278 // vs1acc = b1 + b2 + b3 + ... + b16 7279 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 7280 __ umullv(vs2acc, __ T8B, vtable, vbytes); 7281 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 7282 __ uaddlv(vs1acc, __ T16B, vbytes); 7283 __ uaddlv(vs2acc, __ T8H, vs2acc); 7284 7285 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 7286 __ fmovd(temp0, vs1acc); 7287 __ fmovd(temp1, vs2acc); 7288 __ add(s1, s1, temp0); 7289 __ add(s2, s2, temp1); 7290 } 7291 7292 /** 7293 * Arguments: 7294 * 7295 * Input: 7296 * c_rarg0 - x address 7297 * c_rarg1 - x length 7298 * c_rarg2 - y address 7299 * c_rarg3 - y length 7300 * c_rarg4 - z address 7301 */ 7302 address generate_multiplyToLen() { 7303 __ align(CodeEntryAlignment); 7304 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 7305 StubCodeMark mark(this, stub_id); 7306 7307 address start = __ pc(); 7308 const Register x = r0; 7309 const Register xlen = r1; 7310 const Register y = r2; 7311 const Register ylen = r3; 7312 const Register z = r4; 7313 7314 const Register tmp0 = r5; 7315 const Register tmp1 = r10; 7316 const Register tmp2 = r11; 7317 const Register tmp3 = r12; 7318 const Register tmp4 = r13; 7319 const Register tmp5 = r14; 7320 const Register tmp6 = r15; 7321 const Register tmp7 = r16; 7322 7323 BLOCK_COMMENT("Entry:"); 7324 __ enter(); // required for proper stackwalking of RuntimeStub frame 7325 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7326 __ leave(); // required for proper stackwalking of RuntimeStub frame 7327 __ ret(lr); 7328 7329 return start; 7330 } 7331 7332 address generate_squareToLen() { 7333 // squareToLen algorithm for sizes 1..127 described in java code works 7334 // faster than multiply_to_len on some CPUs and slower on others, but 7335 // multiply_to_len shows a bit better overall results 7336 __ align(CodeEntryAlignment); 7337 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 7338 StubCodeMark mark(this, stub_id); 7339 address start = __ pc(); 7340 7341 const Register x = r0; 7342 const Register xlen = r1; 7343 const Register z = r2; 7344 const Register y = r4; // == x 7345 const Register ylen = r5; // == xlen 7346 7347 const Register tmp0 = r3; 7348 const Register tmp1 = r10; 7349 const Register tmp2 = r11; 7350 const Register tmp3 = r12; 7351 const Register tmp4 = r13; 7352 const Register tmp5 = r14; 7353 const Register tmp6 = r15; 7354 const Register tmp7 = r16; 7355 7356 RegSet spilled_regs = RegSet::of(y, ylen); 7357 BLOCK_COMMENT("Entry:"); 7358 __ enter(); 7359 __ push(spilled_regs, sp); 7360 __ mov(y, x); 7361 __ mov(ylen, xlen); 7362 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7363 __ pop(spilled_regs, sp); 7364 __ leave(); 7365 __ ret(lr); 7366 return start; 7367 } 7368 7369 address generate_mulAdd() { 7370 __ align(CodeEntryAlignment); 7371 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 7372 StubCodeMark mark(this, stub_id); 7373 7374 address start = __ pc(); 7375 7376 const Register out = r0; 7377 const Register in = r1; 7378 const Register offset = r2; 7379 const Register len = r3; 7380 const Register k = r4; 7381 7382 BLOCK_COMMENT("Entry:"); 7383 __ enter(); 7384 __ mul_add(out, in, offset, len, k); 7385 __ leave(); 7386 __ ret(lr); 7387 7388 return start; 7389 } 7390 7391 // Arguments: 7392 // 7393 // Input: 7394 // c_rarg0 - newArr address 7395 // c_rarg1 - oldArr address 7396 // c_rarg2 - newIdx 7397 // c_rarg3 - shiftCount 7398 // c_rarg4 - numIter 7399 // 7400 address generate_bigIntegerRightShift() { 7401 __ align(CodeEntryAlignment); 7402 StubGenStubId stub_id = 
StubGenStubId::bigIntegerRightShiftWorker_id; 7403 StubCodeMark mark(this, stub_id); 7404 address start = __ pc(); 7405 7406 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7407 7408 Register newArr = c_rarg0; 7409 Register oldArr = c_rarg1; 7410 Register newIdx = c_rarg2; 7411 Register shiftCount = c_rarg3; 7412 Register numIter = c_rarg4; 7413 Register idx = numIter; 7414 7415 Register newArrCur = rscratch1; 7416 Register shiftRevCount = rscratch2; 7417 Register oldArrCur = r13; 7418 Register oldArrNext = r14; 7419 7420 FloatRegister oldElem0 = v0; 7421 FloatRegister oldElem1 = v1; 7422 FloatRegister newElem = v2; 7423 FloatRegister shiftVCount = v3; 7424 FloatRegister shiftVRevCount = v4; 7425 7426 __ cbz(idx, Exit); 7427 7428 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7429 7430 // left shift count 7431 __ movw(shiftRevCount, 32); 7432 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7433 7434 // numIter too small to allow a 4-words SIMD loop, rolling back 7435 __ cmp(numIter, (u1)4); 7436 __ br(Assembler::LT, ShiftThree); 7437 7438 __ dup(shiftVCount, __ T4S, shiftCount); 7439 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7440 __ negr(shiftVCount, __ T4S, shiftVCount); 7441 7442 __ BIND(ShiftSIMDLoop); 7443 7444 // Calculate the load addresses 7445 __ sub(idx, idx, 4); 7446 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7447 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7448 __ add(oldArrCur, oldArrNext, 4); 7449 7450 // Load 4 words and process 7451 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 7452 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 7453 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7454 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7455 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7456 __ st1(newElem, __ T4S, Address(newArrCur)); 7457 7458 __ cmp(idx, (u1)4); 7459 __ br(Assembler::LT, ShiftTwoLoop); 7460 __ b(ShiftSIMDLoop); 7461 7462 __ BIND(ShiftTwoLoop); 7463 __ cbz(idx, Exit); 7464 __ cmp(idx, (u1)1); 7465 __ br(Assembler::EQ, ShiftOne); 7466 7467 // Calculate the load addresses 7468 __ sub(idx, idx, 2); 7469 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7470 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7471 __ add(oldArrCur, oldArrNext, 4); 7472 7473 // Load 2 words and process 7474 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 7475 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 7476 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7477 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7478 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7479 __ st1(newElem, __ T2S, Address(newArrCur)); 7480 __ b(ShiftTwoLoop); 7481 7482 __ BIND(ShiftThree); 7483 __ tbz(idx, 1, ShiftOne); 7484 __ tbz(idx, 0, ShiftTwo); 7485 __ ldrw(r10, Address(oldArr, 12)); 7486 __ ldrw(r11, Address(oldArr, 8)); 7487 __ lsrvw(r10, r10, shiftCount); 7488 __ lslvw(r11, r11, shiftRevCount); 7489 __ orrw(r12, r10, r11); 7490 __ strw(r12, Address(newArr, 8)); 7491 7492 __ BIND(ShiftTwo); 7493 __ ldrw(r10, Address(oldArr, 8)); 7494 __ ldrw(r11, Address(oldArr, 4)); 7495 __ lsrvw(r10, r10, shiftCount); 7496 __ lslvw(r11, r11, shiftRevCount); 7497 __ orrw(r12, r10, r11); 7498 __ strw(r12, Address(newArr, 4)); 7499 7500 __ BIND(ShiftOne); 7501 __ ldrw(r10, Address(oldArr, 4)); 7502 __ ldrw(r11, Address(oldArr)); 7503 __ lsrvw(r10, r10, shiftCount); 7504 __ lslvw(r11, r11, shiftRevCount); 7505 __ orrw(r12, r10, r11); 7506 __ strw(r12, Address(newArr)); 7507 7508 __ BIND(Exit); 7509 __ ret(lr); 7510 7511 return start; 7512 } 7513 7514 // 
Arguments: 7515 // 7516 // Input: 7517 // c_rarg0 - newArr address 7518 // c_rarg1 - oldArr address 7519 // c_rarg2 - newIdx 7520 // c_rarg3 - shiftCount 7521 // c_rarg4 - numIter 7522 // 7523 address generate_bigIntegerLeftShift() { 7524 __ align(CodeEntryAlignment); 7525 StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id; 7526 StubCodeMark mark(this, stub_id); 7527 address start = __ pc(); 7528 7529 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7530 7531 Register newArr = c_rarg0; 7532 Register oldArr = c_rarg1; 7533 Register newIdx = c_rarg2; 7534 Register shiftCount = c_rarg3; 7535 Register numIter = c_rarg4; 7536 7537 Register shiftRevCount = rscratch1; 7538 Register oldArrNext = rscratch2; 7539 7540 FloatRegister oldElem0 = v0; 7541 FloatRegister oldElem1 = v1; 7542 FloatRegister newElem = v2; 7543 FloatRegister shiftVCount = v3; 7544 FloatRegister shiftVRevCount = v4; 7545 7546 __ cbz(numIter, Exit); 7547 7548 __ add(oldArrNext, oldArr, 4); 7549 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7550 7551 // right shift count 7552 __ movw(shiftRevCount, 32); 7553 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7554 7555 // numIter too small to allow a 4-words SIMD loop, rolling back 7556 __ cmp(numIter, (u1)4); 7557 __ br(Assembler::LT, ShiftThree); 7558 7559 __ dup(shiftVCount, __ T4S, shiftCount); 7560 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7561 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 7562 7563 __ BIND(ShiftSIMDLoop); 7564 7565 // load 4 words and process 7566 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 7567 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 7568 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7569 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7570 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7571 __ st1(newElem, __ T4S, __ post(newArr, 16)); 7572 __ sub(numIter, numIter, 4); 7573 7574 __ cmp(numIter, (u1)4); 7575 __ br(Assembler::LT, ShiftTwoLoop); 7576 __ b(ShiftSIMDLoop); 7577 7578 __ BIND(ShiftTwoLoop); 7579 __ cbz(numIter, Exit); 7580 __ cmp(numIter, (u1)1); 7581 __ br(Assembler::EQ, ShiftOne); 7582 7583 // load 2 words and process 7584 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 7585 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 7586 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7587 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7588 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7589 __ st1(newElem, __ T2S, __ post(newArr, 8)); 7590 __ sub(numIter, numIter, 2); 7591 __ b(ShiftTwoLoop); 7592 7593 __ BIND(ShiftThree); 7594 __ ldrw(r10, __ post(oldArr, 4)); 7595 __ ldrw(r11, __ post(oldArrNext, 4)); 7596 __ lslvw(r10, r10, shiftCount); 7597 __ lsrvw(r11, r11, shiftRevCount); 7598 __ orrw(r12, r10, r11); 7599 __ strw(r12, __ post(newArr, 4)); 7600 __ tbz(numIter, 1, Exit); 7601 __ tbz(numIter, 0, ShiftOne); 7602 7603 __ BIND(ShiftTwo); 7604 __ ldrw(r10, __ post(oldArr, 4)); 7605 __ ldrw(r11, __ post(oldArrNext, 4)); 7606 __ lslvw(r10, r10, shiftCount); 7607 __ lsrvw(r11, r11, shiftRevCount); 7608 __ orrw(r12, r10, r11); 7609 __ strw(r12, __ post(newArr, 4)); 7610 7611 __ BIND(ShiftOne); 7612 __ ldrw(r10, Address(oldArr)); 7613 __ ldrw(r11, Address(oldArrNext)); 7614 __ lslvw(r10, r10, shiftCount); 7615 __ lsrvw(r11, r11, shiftRevCount); 7616 __ orrw(r12, r10, r11); 7617 __ strw(r12, Address(newArr)); 7618 7619 __ BIND(Exit); 7620 __ ret(lr); 7621 7622 return start; 7623 } 7624 7625 address generate_count_positives(address &count_positives_long) { 7626 const u1 
large_loop_size = 64; 7627 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 7628 int dcache_line = VM_Version::dcache_line_size(); 7629 7630 Register ary1 = r1, len = r2, result = r0; 7631 7632 __ align(CodeEntryAlignment); 7633 7634 StubGenStubId stub_id = StubGenStubId::count_positives_id; 7635 StubCodeMark mark(this, stub_id); 7636 7637 address entry = __ pc(); 7638 7639 __ enter(); 7640 // precondition: a copy of len is already in result 7641 // __ mov(result, len); 7642 7643 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 7644 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 7645 7646 __ cmp(len, (u1)15); 7647 __ br(Assembler::GT, LEN_OVER_15); 7648 // The only case when execution falls into this code is when pointer is near 7649 // the end of memory page and we have to avoid reading next page 7650 __ add(ary1, ary1, len); 7651 __ subs(len, len, 8); 7652 __ br(Assembler::GT, LEN_OVER_8); 7653 __ ldr(rscratch2, Address(ary1, -8)); 7654 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 7655 __ lsrv(rscratch2, rscratch2, rscratch1); 7656 __ tst(rscratch2, UPPER_BIT_MASK); 7657 __ csel(result, zr, result, Assembler::NE); 7658 __ leave(); 7659 __ ret(lr); 7660 __ bind(LEN_OVER_8); 7661 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 7662 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 7663 __ tst(rscratch2, UPPER_BIT_MASK); 7664 __ br(Assembler::NE, RET_NO_POP); 7665 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 7666 __ lsrv(rscratch1, rscratch1, rscratch2); 7667 __ tst(rscratch1, UPPER_BIT_MASK); 7668 __ bind(RET_NO_POP); 7669 __ csel(result, zr, result, Assembler::NE); 7670 __ leave(); 7671 __ ret(lr); 7672 7673 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 7674 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 7675 7676 count_positives_long = __ pc(); // 2nd entry point 7677 7678 __ enter(); 7679 7680 __ bind(LEN_OVER_15); 7681 __ push(spilled_regs, sp); 7682 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 7683 __ cbz(rscratch2, ALIGNED); 7684 __ ldp(tmp6, tmp1, Address(ary1)); 7685 __ mov(tmp5, 16); 7686 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 7687 __ add(ary1, ary1, rscratch1); 7688 __ orr(tmp6, tmp6, tmp1); 7689 __ tst(tmp6, UPPER_BIT_MASK); 7690 __ br(Assembler::NE, RET_ADJUST); 7691 __ sub(len, len, rscratch1); 7692 7693 __ bind(ALIGNED); 7694 __ cmp(len, large_loop_size); 7695 __ br(Assembler::LT, CHECK_16); 7696 // Perform 16-byte load as early return in pre-loop to handle situation 7697 // when initially aligned large array has negative values at starting bytes, 7698 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 7699 // slower. Cases with negative bytes further ahead won't be affected that 7700 // much. In fact, it'll be faster due to early loads, less instructions and 7701 // less branches in LARGE_LOOP. 
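    // n.b. in this and the following loops, or-ing two 8-byte halves
    // together and tst-ing the result against UPPER_BIT_MASK
    // (0x8080808080808080) sets the flags precisely when at least one
    // of the 16 bytes has its sign bit (bit 7) set, i.e. when the block
    // contains a negative byte.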
7702 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 7703 __ sub(len, len, 16); 7704 __ orr(tmp6, tmp6, tmp1); 7705 __ tst(tmp6, UPPER_BIT_MASK); 7706 __ br(Assembler::NE, RET_ADJUST_16); 7707 __ cmp(len, large_loop_size); 7708 __ br(Assembler::LT, CHECK_16); 7709 7710 if (SoftwarePrefetchHintDistance >= 0 7711 && SoftwarePrefetchHintDistance >= dcache_line) { 7712 // initial prefetch 7713 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 7714 } 7715 __ bind(LARGE_LOOP); 7716 if (SoftwarePrefetchHintDistance >= 0) { 7717 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 7718 } 7719 // Issue load instructions first, since it can save few CPU/MEM cycles, also 7720 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 7721 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 7722 // instructions per cycle and have less branches, but this approach disables 7723 // early return, thus, all 64 bytes are loaded and checked every time. 7724 __ ldp(tmp2, tmp3, Address(ary1)); 7725 __ ldp(tmp4, tmp5, Address(ary1, 16)); 7726 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 7727 __ ldp(tmp6, tmp1, Address(ary1, 48)); 7728 __ add(ary1, ary1, large_loop_size); 7729 __ sub(len, len, large_loop_size); 7730 __ orr(tmp2, tmp2, tmp3); 7731 __ orr(tmp4, tmp4, tmp5); 7732 __ orr(rscratch1, rscratch1, rscratch2); 7733 __ orr(tmp6, tmp6, tmp1); 7734 __ orr(tmp2, tmp2, tmp4); 7735 __ orr(rscratch1, rscratch1, tmp6); 7736 __ orr(tmp2, tmp2, rscratch1); 7737 __ tst(tmp2, UPPER_BIT_MASK); 7738 __ br(Assembler::NE, RET_ADJUST_LONG); 7739 __ cmp(len, large_loop_size); 7740 __ br(Assembler::GE, LARGE_LOOP); 7741 7742 __ bind(CHECK_16); // small 16-byte load pre-loop 7743 __ cmp(len, (u1)16); 7744 __ br(Assembler::LT, POST_LOOP16); 7745 7746 __ bind(LOOP16); // small 16-byte load loop 7747 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 7748 __ sub(len, len, 16); 7749 __ orr(tmp2, tmp2, tmp3); 7750 __ tst(tmp2, UPPER_BIT_MASK); 7751 __ br(Assembler::NE, RET_ADJUST_16); 7752 __ cmp(len, (u1)16); 7753 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 7754 7755 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 7756 __ cmp(len, (u1)8); 7757 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 7758 __ ldr(tmp3, Address(__ post(ary1, 8))); 7759 __ tst(tmp3, UPPER_BIT_MASK); 7760 __ br(Assembler::NE, RET_ADJUST); 7761 __ sub(len, len, 8); 7762 7763 __ bind(POST_LOOP16_LOAD_TAIL); 7764 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 7765 __ ldr(tmp1, Address(ary1)); 7766 __ mov(tmp2, 64); 7767 __ sub(tmp4, tmp2, len, __ LSL, 3); 7768 __ lslv(tmp1, tmp1, tmp4); 7769 __ tst(tmp1, UPPER_BIT_MASK); 7770 __ br(Assembler::NE, RET_ADJUST); 7771 // Fallthrough 7772 7773 __ bind(RET_LEN); 7774 __ pop(spilled_regs, sp); 7775 __ leave(); 7776 __ ret(lr); 7777 7778 // difference result - len is the count of guaranteed to be 7779 // positive bytes 7780 7781 __ bind(RET_ADJUST_LONG); 7782 __ add(len, len, (u1)(large_loop_size - 16)); 7783 __ bind(RET_ADJUST_16); 7784 __ add(len, len, 16); 7785 __ bind(RET_ADJUST); 7786 __ pop(spilled_regs, sp); 7787 __ leave(); 7788 __ sub(result, result, len); 7789 __ ret(lr); 7790 7791 return entry; 7792 } 7793 7794 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 7795 bool usePrefetch, Label &NOT_EQUAL) { 7796 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 7797 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 7798 tmp7 = r12, tmp8 = r13; 7799 Label LOOP; 7800 
7801 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 7802 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 7803 __ bind(LOOP); 7804 if (usePrefetch) { 7805 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 7806 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 7807 } 7808 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 7809 __ eor(tmp1, tmp1, tmp2); 7810 __ eor(tmp3, tmp3, tmp4); 7811 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 7812 __ orr(tmp1, tmp1, tmp3); 7813 __ cbnz(tmp1, NOT_EQUAL); 7814 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 7815 __ eor(tmp5, tmp5, tmp6); 7816 __ eor(tmp7, tmp7, tmp8); 7817 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 7818 __ orr(tmp5, tmp5, tmp7); 7819 __ cbnz(tmp5, NOT_EQUAL); 7820 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 7821 __ eor(tmp1, tmp1, tmp2); 7822 __ eor(tmp3, tmp3, tmp4); 7823 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 7824 __ orr(tmp1, tmp1, tmp3); 7825 __ cbnz(tmp1, NOT_EQUAL); 7826 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 7827 __ eor(tmp5, tmp5, tmp6); 7828 __ sub(cnt1, cnt1, 8 * wordSize); 7829 __ eor(tmp7, tmp7, tmp8); 7830 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 7831 // tmp6 is not used. MacroAssembler::subs is used here (rather than 7832 // cmp) because subs allows an unlimited range of immediate operand. 7833 __ subs(tmp6, cnt1, loopThreshold); 7834 __ orr(tmp5, tmp5, tmp7); 7835 __ cbnz(tmp5, NOT_EQUAL); 7836 __ br(__ GE, LOOP); 7837 // post-loop 7838 __ eor(tmp1, tmp1, tmp2); 7839 __ eor(tmp3, tmp3, tmp4); 7840 __ orr(tmp1, tmp1, tmp3); 7841 __ sub(cnt1, cnt1, 2 * wordSize); 7842 __ cbnz(tmp1, NOT_EQUAL); 7843 } 7844 7845 void generate_large_array_equals_loop_simd(int loopThreshold, 7846 bool usePrefetch, Label &NOT_EQUAL) { 7847 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 7848 tmp2 = rscratch2; 7849 Label LOOP; 7850 7851 __ bind(LOOP); 7852 if (usePrefetch) { 7853 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 7854 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 7855 } 7856 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 7857 __ sub(cnt1, cnt1, 8 * wordSize); 7858 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 7859 __ subs(tmp1, cnt1, loopThreshold); 7860 __ eor(v0, __ T16B, v0, v4); 7861 __ eor(v1, __ T16B, v1, v5); 7862 __ eor(v2, __ T16B, v2, v6); 7863 __ eor(v3, __ T16B, v3, v7); 7864 __ orr(v0, __ T16B, v0, v1); 7865 __ orr(v1, __ T16B, v2, v3); 7866 __ orr(v0, __ T16B, v0, v1); 7867 __ umov(tmp1, v0, __ D, 0); 7868 __ umov(tmp2, v0, __ D, 1); 7869 __ orr(tmp1, tmp1, tmp2); 7870 __ cbnz(tmp1, NOT_EQUAL); 7871 __ br(__ GE, LOOP); 7872 } 7873 7874 // a1 = r1 - array1 address 7875 // a2 = r2 - array2 address 7876 // result = r0 - return value. Already contains "false" 7877 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 7878 // r3-r5 are reserved temporary registers 7879 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 7880 address generate_large_array_equals() { 7881 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 7882 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 7883 tmp7 = r12, tmp8 = r13; 7884 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 7885 SMALL_LOOP, POST_LOOP; 7886 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 7887 // calculate if at least 32 prefetched bytes are used 7888 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 7889 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 7890 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 7891 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 7892 tmp5, tmp6, tmp7, tmp8); 7893 7894 __ align(CodeEntryAlignment); 7895 7896 StubGenStubId stub_id = StubGenStubId::large_array_equals_id; 7897 StubCodeMark mark(this, stub_id); 7898 7899 address entry = __ pc(); 7900 __ enter(); 7901 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 7902 // also advance pointers to use post-increment instead of pre-increment 7903 __ add(a1, a1, wordSize); 7904 __ add(a2, a2, wordSize); 7905 if (AvoidUnalignedAccesses) { 7906 // both implementations (SIMD/nonSIMD) are using relatively large load 7907 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 7908 // on some CPUs in case of address is not at least 16-byte aligned. 7909 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 7910 // load if needed at least for 1st address and make if 16-byte aligned. 7911 Label ALIGNED16; 7912 __ tbz(a1, 3, ALIGNED16); 7913 __ ldr(tmp1, Address(__ post(a1, wordSize))); 7914 __ ldr(tmp2, Address(__ post(a2, wordSize))); 7915 __ sub(cnt1, cnt1, wordSize); 7916 __ eor(tmp1, tmp1, tmp2); 7917 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 7918 __ bind(ALIGNED16); 7919 } 7920 if (UseSIMDForArrayEquals) { 7921 if (SoftwarePrefetchHintDistance >= 0) { 7922 __ subs(tmp1, cnt1, prefetchLoopThreshold); 7923 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 7924 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 7925 /* prfm = */ true, NOT_EQUAL); 7926 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 7927 __ br(__ LT, TAIL); 7928 } 7929 __ bind(NO_PREFETCH_LARGE_LOOP); 7930 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 7931 /* prfm = */ false, NOT_EQUAL); 7932 } else { 7933 __ push(spilled_regs, sp); 7934 if (SoftwarePrefetchHintDistance >= 0) { 7935 __ subs(tmp1, cnt1, prefetchLoopThreshold); 7936 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 7937 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 7938 /* prfm = */ true, NOT_EQUAL); 7939 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 7940 __ br(__ LT, TAIL); 7941 } 7942 __ bind(NO_PREFETCH_LARGE_LOOP); 7943 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 7944 /* prfm = */ false, NOT_EQUAL); 7945 } 7946 __ bind(TAIL); 7947 __ cbz(cnt1, EQUAL); 7948 __ subs(cnt1, cnt1, wordSize); 7949 __ br(__ LE, POST_LOOP); 7950 __ bind(SMALL_LOOP); 7951 __ ldr(tmp1, Address(__ post(a1, wordSize))); 7952 __ ldr(tmp2, Address(__ post(a2, wordSize))); 7953 __ subs(cnt1, cnt1, wordSize); 7954 __ eor(tmp1, tmp1, tmp2); 7955 __ cbnz(tmp1, NOT_EQUAL); 7956 __ br(__ GT, SMALL_LOOP); 7957 __ bind(POST_LOOP); 7958 __ ldr(tmp1, Address(a1, cnt1)); 7959 __ ldr(tmp2, Address(a2, cnt1)); 7960 __ eor(tmp1, tmp1, tmp2); 7961 __ cbnz(tmp1, NOT_EQUAL); 7962 __ bind(EQUAL); 7963 __ mov(result, true); 7964 __ bind(NOT_EQUAL); 7965 if (!UseSIMDForArrayEquals) { 7966 __ pop(spilled_regs, sp); 7967 } 7968 __ bind(NOT_EQUAL_NO_POP); 7969 __ leave(); 7970 __ ret(lr); 7971 return entry; 7972 } 7973 7974 // result = r0 - return value. Contains initial hashcode value on entry. 
7975 // ary = r1 - array address 7976 // cnt = r2 - elements count 7977 // Clobbers: v0-v13, rscratch1, rscratch2 7978 address generate_large_arrays_hashcode(BasicType eltype) { 7979 const Register result = r0, ary = r1, cnt = r2; 7980 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 7981 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 7982 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 7983 const FloatRegister vpowm = v13; 7984 7985 ARRAYS_HASHCODE_REGISTERS; 7986 7987 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 7988 7989 unsigned int vf; // vectorization factor 7990 bool multiply_by_halves; 7991 Assembler::SIMD_Arrangement load_arrangement; 7992 switch (eltype) { 7993 case T_BOOLEAN: 7994 case T_BYTE: 7995 load_arrangement = Assembler::T8B; 7996 multiply_by_halves = true; 7997 vf = 8; 7998 break; 7999 case T_CHAR: 8000 case T_SHORT: 8001 load_arrangement = Assembler::T8H; 8002 multiply_by_halves = true; 8003 vf = 8; 8004 break; 8005 case T_INT: 8006 load_arrangement = Assembler::T4S; 8007 multiply_by_halves = false; 8008 vf = 4; 8009 break; 8010 default: 8011 ShouldNotReachHere(); 8012 } 8013 8014 // Unroll factor 8015 const unsigned uf = 4; 8016 8017 // Effective vectorization factor 8018 const unsigned evf = vf * uf; 8019 8020 __ align(CodeEntryAlignment); 8021 8022 StubGenStubId stub_id; 8023 switch (eltype) { 8024 case T_BOOLEAN: 8025 stub_id = StubGenStubId::large_arrays_hashcode_boolean_id; 8026 break; 8027 case T_BYTE: 8028 stub_id = StubGenStubId::large_arrays_hashcode_byte_id; 8029 break; 8030 case T_CHAR: 8031 stub_id = StubGenStubId::large_arrays_hashcode_char_id; 8032 break; 8033 case T_SHORT: 8034 stub_id = StubGenStubId::large_arrays_hashcode_short_id; 8035 break; 8036 case T_INT: 8037 stub_id = StubGenStubId::large_arrays_hashcode_int_id; 8038 break; 8039 default: 8040 stub_id = StubGenStubId::NO_STUBID; 8041 ShouldNotReachHere(); 8042 }; 8043 8044 StubCodeMark mark(this, stub_id); 8045 8046 address entry = __ pc(); 8047 __ enter(); 8048 8049 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 8050 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 8051 // value shouldn't change throughout both loops. 8052 __ movw(rscratch1, intpow(31U, 3)); 8053 __ mov(vpow, Assembler::S, 0, rscratch1); 8054 __ movw(rscratch1, intpow(31U, 2)); 8055 __ mov(vpow, Assembler::S, 1, rscratch1); 8056 __ movw(rscratch1, intpow(31U, 1)); 8057 __ mov(vpow, Assembler::S, 2, rscratch1); 8058 __ movw(rscratch1, intpow(31U, 0)); 8059 __ mov(vpow, Assembler::S, 3, rscratch1); 8060 8061 __ mov(vmul0, Assembler::T16B, 0); 8062 __ mov(vmul0, Assembler::S, 3, result); 8063 8064 __ andr(rscratch2, cnt, (uf - 1) * vf); 8065 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 8066 8067 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf)); 8068 __ mov(vpowm, Assembler::S, 0, rscratch1); 8069 8070 // SMALL LOOP 8071 __ bind(SMALL_LOOP); 8072 8073 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype)))); 8074 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8075 __ subsw(rscratch2, rscratch2, vf); 8076 8077 if (load_arrangement == Assembler::T8B) { 8078 // Extend 8B to 8H to be able to use vector multiply 8079 // instructions 8080 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8081 if (is_signed_subword_type(eltype)) { 8082 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8083 } else { 8084 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8085 } 8086 } 8087 8088 switch (load_arrangement) { 8089 case Assembler::T4S: 8090 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8091 break; 8092 case Assembler::T8B: 8093 case Assembler::T8H: 8094 assert(is_subword_type(eltype), "subword type expected"); 8095 if (is_signed_subword_type(eltype)) { 8096 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8097 } else { 8098 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8099 } 8100 break; 8101 default: 8102 __ should_not_reach_here(); 8103 } 8104 8105 // Process the upper half of a vector 8106 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8107 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8108 if (is_signed_subword_type(eltype)) { 8109 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8110 } else { 8111 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8112 } 8113 } 8114 8115 __ br(Assembler::HI, SMALL_LOOP); 8116 8117 // SMALL LOOP'S EPILOQUE 8118 __ lsr(rscratch2, cnt, exact_log2(evf)); 8119 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER); 8120 8121 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8122 __ addv(vmul0, Assembler::T4S, vmul0); 8123 __ umov(result, vmul0, Assembler::S, 0); 8124 8125 // TAIL 8126 __ bind(TAIL); 8127 8128 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs 8129 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs. 
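  // For example (illustrative numbers only): with vf == 8 and
  // cnt % vf == 3 the andr below leaves 3 in rscratch2, the sub
  // computes BR_BASE - 3 * 8 bytes, and the br lands on the last 3 of
  // the 7 unrolled load + maddw pairs, so exactly cnt % vf trailing
  // elements get folded into result; the movw of 0x1f simply
  // materializes the multiplier 31 used by maddw.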
    assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
    __ andr(rscratch2, cnt, vf - 1);
    __ bind(TAIL_SHORTCUT);
    __ adr(rscratch1, BR_BASE);
    __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
    __ movw(rscratch2, 0x1f);
    __ br(rscratch1);

    for (size_t i = 0; i < vf - 1; ++i) {
      __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
              eltype);
      __ maddw(result, result, rscratch2, rscratch1);
    }
    __ bind(BR_BASE);

    __ leave();
    __ ret(lr);

    // LARGE LOOP
    __ bind(LARGE_LOOP_PREHEADER);

    __ lsr(rscratch2, cnt, exact_log2(evf));

    if (multiply_by_halves) {
      // 31^4 - multiplier between lower and upper parts of a register
      __ movw(rscratch1, intpow(31U, vf / 2));
      __ mov(vpowm, Assembler::S, 1, rscratch1);
      // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
      __ movw(rscratch1, intpow(31U, evf - vf / 2));
      __ mov(vpowm, Assembler::S, 0, rscratch1);
    } else {
      // 31^16
      __ movw(rscratch1, intpow(31U, evf));
      __ mov(vpowm, Assembler::S, 0, rscratch1);
    }

    __ mov(vmul3, Assembler::T16B, 0);
    __ mov(vmul2, Assembler::T16B, 0);
    __ mov(vmul1, Assembler::T16B, 0);

    __ bind(LARGE_LOOP);

    __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
    __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
    __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
    __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);

    __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
           Address(__ post(ary, evf * type2aelembytes(eltype))));

    if (load_arrangement == Assembler::T8B) {
      // Extend 8B to 8H to be able to use vector multiply
      // instructions
      assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
      if (is_signed_subword_type(eltype)) {
        __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
        __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
        __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
        __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      } else {
        __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
        __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
        __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
        __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      }
    }

    switch (load_arrangement) {
    case Assembler::T4S:
      __ addv(vmul3, load_arrangement, vmul3, vdata3);
      __ addv(vmul2, load_arrangement, vmul2, vdata2);
      __ addv(vmul1, load_arrangement, vmul1, vdata1);
      __ addv(vmul0, load_arrangement, vmul0, vdata0);
      break;
    case Assembler::T8B:
    case Assembler::T8H:
      assert(is_subword_type(eltype), "subword type expected");
      if (is_signed_subword_type(eltype)) {
        __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
        __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
        __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
        __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      } else {
        __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
        __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
        __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
        __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      }
      break;
    default:
      __ should_not_reach_here();
8221 } 8222 8223 // Process the upper half of a vector 8224 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8225 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 8226 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 8227 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 8228 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 8229 if (is_signed_subword_type(eltype)) { 8230 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8231 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8232 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8233 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8234 } else { 8235 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8236 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8237 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8238 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8239 } 8240 } 8241 8242 __ subsw(rscratch2, rscratch2, 1); 8243 __ br(Assembler::HI, LARGE_LOOP); 8244 8245 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 8246 __ addv(vmul3, Assembler::T4S, vmul3); 8247 __ umov(result, vmul3, Assembler::S, 0); 8248 8249 __ mov(rscratch2, intpow(31U, vf)); 8250 8251 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 8252 __ addv(vmul2, Assembler::T4S, vmul2); 8253 __ umov(rscratch1, vmul2, Assembler::S, 0); 8254 __ maddw(result, result, rscratch2, rscratch1); 8255 8256 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 8257 __ addv(vmul1, Assembler::T4S, vmul1); 8258 __ umov(rscratch1, vmul1, Assembler::S, 0); 8259 __ maddw(result, result, rscratch2, rscratch1); 8260 8261 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8262 __ addv(vmul0, Assembler::T4S, vmul0); 8263 __ umov(rscratch1, vmul0, Assembler::S, 0); 8264 __ maddw(result, result, rscratch2, rscratch1); 8265 8266 __ andr(rscratch2, cnt, vf - 1); 8267 __ cbnz(rscratch2, TAIL_SHORTCUT); 8268 8269 __ leave(); 8270 __ ret(lr); 8271 8272 return entry; 8273 } 8274 8275 address generate_dsin_dcos(bool isCos) { 8276 __ align(CodeEntryAlignment); 8277 StubGenStubId stub_id = (isCos ? 
StubGenStubId::dcos_id : StubGenStubId::dsin_id); 8278 StubCodeMark mark(this, stub_id); 8279 address start = __ pc(); 8280 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 8281 (address)StubRoutines::aarch64::_two_over_pi, 8282 (address)StubRoutines::aarch64::_pio2, 8283 (address)StubRoutines::aarch64::_dsin_coef, 8284 (address)StubRoutines::aarch64::_dcos_coef); 8285 return start; 8286 } 8287 8288 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 8289 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 8290 Label &DIFF2) { 8291 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 8292 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 8293 8294 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 8295 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8296 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 8297 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 8298 8299 __ fmovd(tmpL, vtmp3); 8300 __ eor(rscratch2, tmp3, tmpL); 8301 __ cbnz(rscratch2, DIFF2); 8302 8303 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8304 __ umov(tmpL, vtmp3, __ D, 1); 8305 __ eor(rscratch2, tmpU, tmpL); 8306 __ cbnz(rscratch2, DIFF1); 8307 8308 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 8309 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8310 __ fmovd(tmpL, vtmp); 8311 __ eor(rscratch2, tmp3, tmpL); 8312 __ cbnz(rscratch2, DIFF2); 8313 8314 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8315 __ umov(tmpL, vtmp, __ D, 1); 8316 __ eor(rscratch2, tmpU, tmpL); 8317 __ cbnz(rscratch2, DIFF1); 8318 } 8319 8320 // r0 = result 8321 // r1 = str1 8322 // r2 = cnt1 8323 // r3 = str2 8324 // r4 = cnt2 8325 // r10 = tmp1 8326 // r11 = tmp2 8327 address generate_compare_long_string_different_encoding(bool isLU) { 8328 __ align(CodeEntryAlignment); 8329 StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id); 8330 StubCodeMark mark(this, stub_id); 8331 address entry = __ pc(); 8332 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 8333 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 8334 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 8335 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8336 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 8337 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 8338 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 8339 8340 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 8341 8342 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 8343 // cnt2 == amount of characters left to compare 8344 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 8345 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8346 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 8347 __ add(str2, str2, isLU ? wordSize : wordSize/2); 8348 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 8349 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 8350 __ eor(rscratch2, tmp1, tmp2); 8351 __ mov(rscratch1, tmp2); 8352 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 8353 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 8354 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 8355 __ push(spilled_regs, sp); 8356 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 8357 __ mov(cnt1, isLU ? 
str2 : str1); // init the pointer to U next load 8358 8359 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8360 8361 if (SoftwarePrefetchHintDistance >= 0) { 8362 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8363 __ br(__ LT, NO_PREFETCH); 8364 __ bind(LARGE_LOOP_PREFETCH); 8365 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 8366 __ mov(tmp4, 2); 8367 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8368 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 8369 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8370 __ subs(tmp4, tmp4, 1); 8371 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 8372 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8373 __ mov(tmp4, 2); 8374 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 8375 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8376 __ subs(tmp4, tmp4, 1); 8377 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 8378 __ sub(cnt2, cnt2, 64); 8379 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8380 __ br(__ GE, LARGE_LOOP_PREFETCH); 8381 } 8382 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 8383 __ bind(NO_PREFETCH); 8384 __ subs(cnt2, cnt2, 16); 8385 __ br(__ LT, TAIL); 8386 __ align(OptoLoopAlignment); 8387 __ bind(SMALL_LOOP); // smaller loop 8388 __ subs(cnt2, cnt2, 16); 8389 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8390 __ br(__ GE, SMALL_LOOP); 8391 __ cmn(cnt2, (u1)16); 8392 __ br(__ EQ, LOAD_LAST); 8393 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 8394 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 8395 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 8396 __ ldr(tmp3, Address(cnt1, -8)); 8397 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 8398 __ b(LOAD_LAST); 8399 __ bind(DIFF2); 8400 __ mov(tmpU, tmp3); 8401 __ bind(DIFF1); 8402 __ pop(spilled_regs, sp); 8403 __ b(CALCULATE_DIFFERENCE); 8404 __ bind(LOAD_LAST); 8405 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 8406 // No need to load it again 8407 __ mov(tmpU, tmp3); 8408 __ pop(spilled_regs, sp); 8409 8410 // tmp2 points to the address of the last 4 Latin1 characters right now 8411 __ ldrs(vtmp, Address(tmp2)); 8412 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8413 __ fmovd(tmpL, vtmp); 8414 8415 __ eor(rscratch2, tmpU, tmpL); 8416 __ cbz(rscratch2, DONE); 8417 8418 // Find the first different characters in the longwords and 8419 // compute their difference. 
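    // For illustration only, a scalar sketch of the CALCULATE_DIFFERENCE block
    // below (a hedged sketch, not emitted code). 'a' and 'b' stand for the two
    // 64-bit chunks held in tmp1 and rscratch1, each containing four UTF-16
    // characters, and rscratch2 holds a ^ b (known to be non-zero here):
    //
    //   uint64_t x  = a ^ b;
    //   int      sh = __builtin_clzll(__builtin_bswap64(x)) & -16;
    //   int      ca = (uint16_t)(a >> sh);   // first differing character
    //   int      cb = (uint16_t)(b >> sh);
    //   return ca - cb;
    //
    // rev + clz locate the first differing byte in string order; masking with
    // -16 rounds down to the containing 16-bit character before the two
    // characters are extracted and subtracted.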
8420 __ bind(CALCULATE_DIFFERENCE); 8421 __ rev(rscratch2, rscratch2); 8422 __ clz(rscratch2, rscratch2); 8423 __ andr(rscratch2, rscratch2, -16); 8424 __ lsrv(tmp1, tmp1, rscratch2); 8425 __ uxthw(tmp1, tmp1); 8426 __ lsrv(rscratch1, rscratch1, rscratch2); 8427 __ uxthw(rscratch1, rscratch1); 8428 __ subw(result, tmp1, rscratch1); 8429 __ bind(DONE); 8430 __ ret(lr); 8431 return entry; 8432 } 8433 8434 // r0 = input (float16) 8435 // v0 = result (float) 8436 // v1 = temporary float register 8437 address generate_float16ToFloat() { 8438 __ align(CodeEntryAlignment); 8439 StubGenStubId stub_id = StubGenStubId::hf2f_id; 8440 StubCodeMark mark(this, stub_id); 8441 address entry = __ pc(); 8442 BLOCK_COMMENT("Entry:"); 8443 __ flt16_to_flt(v0, r0, v1); 8444 __ ret(lr); 8445 return entry; 8446 } 8447 8448 // v0 = input (float) 8449 // r0 = result (float16) 8450 // v1 = temporary float register 8451 address generate_floatToFloat16() { 8452 __ align(CodeEntryAlignment); 8453 StubGenStubId stub_id = StubGenStubId::f2hf_id; 8454 StubCodeMark mark(this, stub_id); 8455 address entry = __ pc(); 8456 BLOCK_COMMENT("Entry:"); 8457 __ flt_to_flt16(r0, v0, v1); 8458 __ ret(lr); 8459 return entry; 8460 } 8461 8462 address generate_method_entry_barrier() { 8463 __ align(CodeEntryAlignment); 8464 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 8465 StubCodeMark mark(this, stub_id); 8466 8467 Label deoptimize_label; 8468 8469 address start = __ pc(); 8470 8471 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 8472 8473 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 8474 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8475 // We can get here despite the nmethod being good, if we have not 8476 // yet applied our cross modification fence (or data fence). 8477 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 8478 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 8479 __ ldrw(rscratch2, rscratch2); 8480 __ strw(rscratch2, thread_epoch_addr); 8481 __ isb(); 8482 __ membar(__ LoadLoad); 8483 } 8484 8485 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 8486 8487 __ enter(); 8488 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 8489 8490 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 8491 8492 __ push_call_clobbered_registers(); 8493 8494 __ mov(c_rarg0, rscratch2); 8495 __ call_VM_leaf 8496 (CAST_FROM_FN_PTR 8497 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 8498 8499 __ reset_last_Java_frame(true); 8500 8501 __ mov(rscratch1, r0); 8502 8503 __ pop_call_clobbered_registers(); 8504 8505 __ cbnz(rscratch1, deoptimize_label); 8506 8507 __ leave(); 8508 __ ret(lr); 8509 8510 __ BIND(deoptimize_label); 8511 8512 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 8513 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 8514 8515 __ mov(sp, rscratch1); 8516 __ br(rscratch2); 8517 8518 return start; 8519 } 8520 8521 // r0 = result 8522 // r1 = str1 8523 // r2 = cnt1 8524 // r3 = str2 8525 // r4 = cnt2 8526 // r10 = tmp1 8527 // r11 = tmp2 8528 address generate_compare_long_string_same_encoding(bool isLL) { 8529 __ align(CodeEntryAlignment); 8530 StubGenStubId stub_id = (isLL ? 
StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id); 8531 StubCodeMark mark(this, stub_id); 8532 address entry = __ pc(); 8533 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8534 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 8535 8536 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 8537 8538 // exit from large loop when less than 64 bytes left to read or we're about 8539 // to prefetch memory behind array border 8540 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 8541 8542 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 8543 __ eor(rscratch2, tmp1, tmp2); 8544 __ cbnz(rscratch2, CAL_DIFFERENCE); 8545 8546 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 8547 // update pointers, because of previous read 8548 __ add(str1, str1, wordSize); 8549 __ add(str2, str2, wordSize); 8550 if (SoftwarePrefetchHintDistance >= 0) { 8551 __ align(OptoLoopAlignment); 8552 __ bind(LARGE_LOOP_PREFETCH); 8553 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 8554 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 8555 8556 for (int i = 0; i < 4; i++) { 8557 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 8558 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 8559 __ cmp(tmp1, tmp2); 8560 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 8561 __ br(Assembler::NE, DIFF); 8562 } 8563 __ sub(cnt2, cnt2, isLL ? 64 : 32); 8564 __ add(str1, str1, 64); 8565 __ add(str2, str2, 64); 8566 __ subs(rscratch2, cnt2, largeLoopExitCondition); 8567 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 8568 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 8569 } 8570 8571 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 8572 __ br(Assembler::LE, LESS16); 8573 __ align(OptoLoopAlignment); 8574 __ bind(LOOP_COMPARE16); 8575 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 8576 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 8577 __ cmp(tmp1, tmp2); 8578 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 8579 __ br(Assembler::NE, DIFF); 8580 __ sub(cnt2, cnt2, isLL ? 16 : 8); 8581 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 8582 __ br(Assembler::LT, LESS16); 8583 8584 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 8585 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 8586 __ cmp(tmp1, tmp2); 8587 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 8588 __ br(Assembler::NE, DIFF); 8589 __ sub(cnt2, cnt2, isLL ? 16 : 8); 8590 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 8591 __ br(Assembler::GE, LOOP_COMPARE16); 8592 __ cbz(cnt2, LENGTH_DIFF); 8593 8594 __ bind(LESS16); 8595 // each 8 compare 8596 __ subs(cnt2, cnt2, isLL ? 8 : 4); 8597 __ br(Assembler::LE, LESS8); 8598 __ ldr(tmp1, Address(__ post(str1, 8))); 8599 __ ldr(tmp2, Address(__ post(str2, 8))); 8600 __ eor(rscratch2, tmp1, tmp2); 8601 __ cbnz(rscratch2, CAL_DIFFERENCE); 8602 __ sub(cnt2, cnt2, isLL ? 
8 : 4); 8603 8604 __ bind(LESS8); // directly load last 8 bytes 8605 if (!isLL) { 8606 __ add(cnt2, cnt2, cnt2); 8607 } 8608 __ ldr(tmp1, Address(str1, cnt2)); 8609 __ ldr(tmp2, Address(str2, cnt2)); 8610 __ eor(rscratch2, tmp1, tmp2); 8611 __ cbz(rscratch2, LENGTH_DIFF); 8612 __ b(CAL_DIFFERENCE); 8613 8614 __ bind(DIFF); 8615 __ cmp(tmp1, tmp2); 8616 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 8617 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 8618 // reuse rscratch2 register for the result of eor instruction 8619 __ eor(rscratch2, tmp1, tmp2); 8620 8621 __ bind(CAL_DIFFERENCE); 8622 __ rev(rscratch2, rscratch2); 8623 __ clz(rscratch2, rscratch2); 8624 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 8625 __ lsrv(tmp1, tmp1, rscratch2); 8626 __ lsrv(tmp2, tmp2, rscratch2); 8627 if (isLL) { 8628 __ uxtbw(tmp1, tmp1); 8629 __ uxtbw(tmp2, tmp2); 8630 } else { 8631 __ uxthw(tmp1, tmp1); 8632 __ uxthw(tmp2, tmp2); 8633 } 8634 __ subw(result, tmp1, tmp2); 8635 8636 __ bind(LENGTH_DIFF); 8637 __ ret(lr); 8638 return entry; 8639 } 8640 8641 enum string_compare_mode { 8642 LL, 8643 LU, 8644 UL, 8645 UU, 8646 }; 8647 8648 // The following registers are declared in aarch64.ad 8649 // r0 = result 8650 // r1 = str1 8651 // r2 = cnt1 8652 // r3 = str2 8653 // r4 = cnt2 8654 // r10 = tmp1 8655 // r11 = tmp2 8656 // z0 = ztmp1 8657 // z1 = ztmp2 8658 // p0 = pgtmp1 8659 // p1 = pgtmp2 8660 address generate_compare_long_string_sve(string_compare_mode mode) { 8661 StubGenStubId stub_id; 8662 switch (mode) { 8663 case LL: stub_id = StubGenStubId::compare_long_string_LL_id; break; 8664 case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break; 8665 case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break; 8666 case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break; 8667 default: ShouldNotReachHere(); 8668 } 8669 8670 __ align(CodeEntryAlignment); 8671 address entry = __ pc(); 8672 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8673 tmp1 = r10, tmp2 = r11; 8674 8675 Label LOOP, DONE, MISMATCH; 8676 Register vec_len = tmp1; 8677 Register idx = tmp2; 8678 // The minimum of the string lengths has been stored in cnt2. 8679 Register cnt = cnt2; 8680 FloatRegister ztmp1 = z0, ztmp2 = z1; 8681 PRegister pgtmp1 = p0, pgtmp2 = p1; 8682 8683 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 8684 switch (mode) { \ 8685 case LL: \ 8686 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 8687 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 8688 break; \ 8689 case LU: \ 8690 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 8691 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 8692 break; \ 8693 case UL: \ 8694 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 8695 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 8696 break; \ 8697 case UU: \ 8698 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 8699 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 8700 break; \ 8701 default: \ 8702 ShouldNotReachHere(); \ 8703 } 8704 8705 StubCodeMark mark(this, stub_id); 8706 8707 __ mov(idx, 0); 8708 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 8709 8710 if (mode == LL) { 8711 __ sve_cntb(vec_len); 8712 } else { 8713 __ sve_cnth(vec_len); 8714 } 8715 8716 __ sub(rscratch1, cnt, vec_len); 8717 8718 __ bind(LOOP); 8719 8720 // main loop 8721 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 8722 __ add(idx, idx, vec_len); 8723 // Compare strings. 
    __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
    __ br(__ NE, MISMATCH);
    __ cmp(idx, rscratch1);
    __ br(__ LT, LOOP);

    // post loop, last iteration
    __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);

    LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
    __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
    __ br(__ EQ, DONE);

    __ bind(MISMATCH);

    // Crop the vector to find its location.
    __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
    // Extract the first different characters of each string.
    __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
    __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);

    // Compute the difference of the first different characters.
    __ sub(result, rscratch1, rscratch2);

    __ bind(DONE);
    __ ret(lr);
#undef LOAD_PAIR
    return entry;
  }

  void generate_compare_long_strings() {
    if (UseSVE == 0) {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_same_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_same_encoding(false);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_different_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_different_encoding(false);
    } else {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_sve(LL);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_sve(UU);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_sve(LU);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_sve(UL);
    }
  }

  // R0 = result
  // R1 = str2
  // R2 = cnt1
  // R3 = str1
  // R4 = cnt2
  // Clobbers: rscratch1, rscratch2, v0, v1, rflags
  //
  // This generic linear code uses a few additional ideas which make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  //    length >= 8), in order to skip the initial load (helps on systems with
  //    a single load pipeline)
  // 2) we can use a "fast" algorithm to find a single character, searching for
  //    the first symbol with fewer branches (one branch per loaded register
  //    instead of one per symbol); this is where constants like
  //    0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it
  //    can be used to search for every occurrence of the 1st character, saving
  //    a few loads compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  //    re-uses/re-initializes/compresses register values, which makes the code
  //    larger and a bit less readable; however, most of the extra operations
  //    are issued during loads or branches, so the penalty is minimal
  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
    StubGenStubId stub_id;
    if (str1_isL) {
      if (str2_isL) {
        stub_id = StubGenStubId::string_indexof_linear_ll_id;
      } else {
        stub_id = StubGenStubId::string_indexof_linear_ul_id;
      }
    } else {
      if (str2_isL) {
        ShouldNotReachHere();
      } else {
        stub_id =
StubGenStubId::string_indexof_linear_uu_id; 8809 } 8810 } 8811 __ align(CodeEntryAlignment); 8812 StubCodeMark mark(this, stub_id); 8813 address entry = __ pc(); 8814 8815 int str1_chr_size = str1_isL ? 1 : 2; 8816 int str2_chr_size = str2_isL ? 1 : 2; 8817 int str1_chr_shift = str1_isL ? 0 : 1; 8818 int str2_chr_shift = str2_isL ? 0 : 1; 8819 bool isL = str1_isL && str2_isL; 8820 // parameters 8821 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 8822 // temporary registers 8823 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 8824 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 8825 // redefinitions 8826 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 8827 8828 __ push(spilled_regs, sp); 8829 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 8830 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 8831 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 8832 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 8833 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 8834 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 8835 // Read whole register from str1. It is safe, because length >=8 here 8836 __ ldr(ch1, Address(str1)); 8837 // Read whole register from str2. It is safe, because length >=8 here 8838 __ ldr(ch2, Address(str2)); 8839 __ sub(cnt2, cnt2, cnt1); 8840 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 8841 if (str1_isL != str2_isL) { 8842 __ eor(v0, __ T16B, v0, v0); 8843 } 8844 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 8845 __ mul(first, first, tmp1); 8846 // check if we have less than 1 register to check 8847 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 8848 if (str1_isL != str2_isL) { 8849 __ fmovd(v1, ch1); 8850 } 8851 __ br(__ LE, L_SMALL); 8852 __ eor(ch2, first, ch2); 8853 if (str1_isL != str2_isL) { 8854 __ zip1(v1, __ T16B, v1, v0); 8855 } 8856 __ sub(tmp2, ch2, tmp1); 8857 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8858 __ bics(tmp2, tmp2, ch2); 8859 if (str1_isL != str2_isL) { 8860 __ fmovd(ch1, v1); 8861 } 8862 __ br(__ NE, L_HAS_ZERO); 8863 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 8864 __ add(result, result, wordSize/str2_chr_size); 8865 __ add(str2, str2, wordSize); 8866 __ br(__ LT, L_POST_LOOP); 8867 __ BIND(L_LOOP); 8868 __ ldr(ch2, Address(str2)); 8869 __ eor(ch2, first, ch2); 8870 __ sub(tmp2, ch2, tmp1); 8871 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8872 __ bics(tmp2, tmp2, ch2); 8873 __ br(__ NE, L_HAS_ZERO); 8874 __ BIND(L_LOOP_PROCEED); 8875 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 8876 __ add(str2, str2, wordSize); 8877 __ add(result, result, wordSize/str2_chr_size); 8878 __ br(__ GE, L_LOOP); 8879 __ BIND(L_POST_LOOP); 8880 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 8881 __ br(__ LE, NOMATCH); 8882 __ ldr(ch2, Address(str2)); 8883 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 8884 __ eor(ch2, first, ch2); 8885 __ sub(tmp2, ch2, tmp1); 8886 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8887 __ mov(tmp4, -1); // all bits set 8888 __ b(L_SMALL_PROCEED); 8889 __ align(OptoLoopAlignment); 8890 __ BIND(L_SMALL); 8891 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 8892 __ eor(ch2, first, ch2); 8893 if (str1_isL != str2_isL) { 8894 __ zip1(v1, __ T16B, v1, v0); 8895 } 8896 __ sub(tmp2, ch2, tmp1); 8897 __ mov(tmp4, -1); // all bits set 8898 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8899 if (str1_isL != str2_isL) { 8900 __ fmovd(ch1, v1); // move converted 4 symbols 8901 } 8902 __ BIND(L_SMALL_PROCEED); 8903 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 8904 __ bic(tmp2, tmp2, ch2); 8905 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 8906 __ rbit(tmp2, tmp2); 8907 __ br(__ EQ, NOMATCH); 8908 __ BIND(L_SMALL_HAS_ZERO_LOOP); 8909 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 8910 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 8911 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 8912 if (str2_isL) { // LL 8913 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 8914 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 8915 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 8916 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 8917 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 8918 } else { 8919 __ mov(ch2, 0xE); // all bits in byte set except last one 8920 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 8921 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 8922 __ lslv(tmp2, tmp2, tmp4); 8923 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8924 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8925 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 8926 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8927 } 8928 __ cmp(ch1, ch2); 8929 __ mov(tmp4, wordSize/str2_chr_size); 8930 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 8931 __ BIND(L_SMALL_CMP_LOOP); 8932 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 8933 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 8934 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 8935 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 8936 __ add(tmp4, tmp4, 1); 8937 __ cmp(tmp4, cnt1); 8938 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 8939 __ cmp(first, ch2); 8940 __ br(__ EQ, L_SMALL_CMP_LOOP); 8941 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 8942 __ cbz(tmp2, NOMATCH); // no more matches. exit 8943 __ clz(tmp4, tmp2); 8944 __ add(result, result, 1); // advance index 8945 __ add(str2, str2, str2_chr_size); // advance pointer 8946 __ b(L_SMALL_HAS_ZERO_LOOP); 8947 __ align(OptoLoopAlignment); 8948 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 8949 __ cmp(first, ch2); 8950 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 8951 __ b(DONE); 8952 __ align(OptoLoopAlignment); 8953 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 8954 if (str2_isL) { // LL 8955 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 8956 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 8957 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 8958 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 8959 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 8960 } else { 8961 __ mov(ch2, 0xE); // all bits in byte set except last one 8962 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 8963 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
8964 __ lslv(tmp2, tmp2, tmp4); 8965 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8966 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8967 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 8968 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8969 } 8970 __ cmp(ch1, ch2); 8971 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 8972 __ b(DONE); 8973 __ align(OptoLoopAlignment); 8974 __ BIND(L_HAS_ZERO); 8975 __ rbit(tmp2, tmp2); 8976 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 8977 // Now, perform compression of counters(cnt2 and cnt1) into one register. 8978 // It's fine because both counters are 32bit and are not changed in this 8979 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 8980 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 8981 __ sub(result, result, 1); 8982 __ BIND(L_HAS_ZERO_LOOP); 8983 __ mov(cnt1, wordSize/str2_chr_size); 8984 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 8985 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 8986 if (str2_isL) { 8987 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 8988 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 8989 __ lslv(tmp2, tmp2, tmp4); 8990 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8991 __ add(tmp4, tmp4, 1); 8992 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 8993 __ lsl(tmp2, tmp2, 1); 8994 __ mov(tmp4, wordSize/str2_chr_size); 8995 } else { 8996 __ mov(ch2, 0xE); 8997 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 8998 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 8999 __ lslv(tmp2, tmp2, tmp4); 9000 __ add(tmp4, tmp4, 1); 9001 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9002 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9003 __ lsl(tmp2, tmp2, 1); 9004 __ mov(tmp4, wordSize/str2_chr_size); 9005 __ sub(str2, str2, str2_chr_size); 9006 } 9007 __ cmp(ch1, ch2); 9008 __ mov(tmp4, wordSize/str2_chr_size); 9009 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9010 __ BIND(L_CMP_LOOP); 9011 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9012 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9013 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9014 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9015 __ add(tmp4, tmp4, 1); 9016 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9017 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 9018 __ cmp(cnt1, ch2); 9019 __ br(__ EQ, L_CMP_LOOP); 9020 __ BIND(L_CMP_LOOP_NOMATCH); 9021 // here we're not matched 9022 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 9023 __ clz(tmp4, tmp2); 9024 __ add(str2, str2, str2_chr_size); // advance pointer 9025 __ b(L_HAS_ZERO_LOOP); 9026 __ align(OptoLoopAlignment); 9027 __ BIND(L_CMP_LOOP_LAST_CMP); 9028 __ cmp(cnt1, ch2); 9029 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9030 __ b(DONE); 9031 __ align(OptoLoopAlignment); 9032 __ BIND(L_CMP_LOOP_LAST_CMP2); 9033 if (str2_isL) { 9034 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9035 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
9036 __ lslv(tmp2, tmp2, tmp4); 9037 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9038 __ add(tmp4, tmp4, 1); 9039 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9040 __ lsl(tmp2, tmp2, 1); 9041 } else { 9042 __ mov(ch2, 0xE); 9043 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9044 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9045 __ lslv(tmp2, tmp2, tmp4); 9046 __ add(tmp4, tmp4, 1); 9047 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9048 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9049 __ lsl(tmp2, tmp2, 1); 9050 __ sub(str2, str2, str2_chr_size); 9051 } 9052 __ cmp(ch1, ch2); 9053 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9054 __ b(DONE); 9055 __ align(OptoLoopAlignment); 9056 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 9057 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 9058 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 9059 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 9060 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 9061 // result by analyzed characters value, so, we can just reset lower bits 9062 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 9063 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 9064 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 9065 // index of last analyzed substring inside current octet. So, str2 in at 9066 // respective start address. We need to advance it to next octet 9067 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 9068 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 9069 __ bfm(result, zr, 0, 2 - str2_chr_shift); 9070 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 9071 __ movw(cnt2, cnt2); 9072 __ b(L_LOOP_PROCEED); 9073 __ align(OptoLoopAlignment); 9074 __ BIND(NOMATCH); 9075 __ mov(result, -1); 9076 __ BIND(DONE); 9077 __ pop(spilled_regs, sp); 9078 __ ret(lr); 9079 return entry; 9080 } 9081 9082 void generate_string_indexof_stubs() { 9083 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 9084 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 9085 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 9086 } 9087 9088 void inflate_and_store_2_fp_registers(bool generatePrfm, 9089 FloatRegister src1, FloatRegister src2) { 9090 Register dst = r1; 9091 __ zip1(v1, __ T16B, src1, v0); 9092 __ zip2(v2, __ T16B, src1, v0); 9093 if (generatePrfm) { 9094 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 9095 } 9096 __ zip1(v3, __ T16B, src2, v0); 9097 __ zip2(v4, __ T16B, src2, v0); 9098 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 9099 } 9100 9101 // R0 = src 9102 // R1 = dst 9103 // R2 = len 9104 // R3 = len >> 3 9105 // V0 = 0 9106 // v1 = loaded 8 bytes 9107 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 9108 address generate_large_byte_array_inflate() { 9109 __ align(CodeEntryAlignment); 9110 StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id; 9111 StubCodeMark mark(this, stub_id); 9112 address entry = __ pc(); 9113 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 9114 Register src = r0, dst = r1, len = r2, octetCounter = r3; 9115 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 9116 9117 // do one more 8-byte read to have address 16-byte aligned in 
most cases 9118 // also use single store instruction 9119 __ ldrd(v2, __ post(src, 8)); 9120 __ sub(octetCounter, octetCounter, 2); 9121 __ zip1(v1, __ T16B, v1, v0); 9122 __ zip1(v2, __ T16B, v2, v0); 9123 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 9124 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9125 __ subs(rscratch1, octetCounter, large_loop_threshold); 9126 __ br(__ LE, LOOP_START); 9127 __ b(LOOP_PRFM_START); 9128 __ bind(LOOP_PRFM); 9129 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9130 __ bind(LOOP_PRFM_START); 9131 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 9132 __ sub(octetCounter, octetCounter, 8); 9133 __ subs(rscratch1, octetCounter, large_loop_threshold); 9134 inflate_and_store_2_fp_registers(true, v3, v4); 9135 inflate_and_store_2_fp_registers(true, v5, v6); 9136 __ br(__ GT, LOOP_PRFM); 9137 __ cmp(octetCounter, (u1)8); 9138 __ br(__ LT, DONE); 9139 __ bind(LOOP); 9140 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9141 __ bind(LOOP_START); 9142 __ sub(octetCounter, octetCounter, 8); 9143 __ cmp(octetCounter, (u1)8); 9144 inflate_and_store_2_fp_registers(false, v3, v4); 9145 inflate_and_store_2_fp_registers(false, v5, v6); 9146 __ br(__ GE, LOOP); 9147 __ bind(DONE); 9148 __ ret(lr); 9149 return entry; 9150 } 9151 9152 /** 9153 * Arguments: 9154 * 9155 * Input: 9156 * c_rarg0 - current state address 9157 * c_rarg1 - H key address 9158 * c_rarg2 - data address 9159 * c_rarg3 - number of blocks 9160 * 9161 * Output: 9162 * Updated state at c_rarg0 9163 */ 9164 address generate_ghash_processBlocks() { 9165 // Bafflingly, GCM uses little-endian for the byte order, but 9166 // big-endian for the bit order. For example, the polynomial 1 is 9167 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 9168 // 9169 // So, we must either reverse the bytes in each word and do 9170 // everything big-endian or reverse the bits in each byte and do 9171 // it little-endian. On AArch64 it's more idiomatic to reverse 9172 // the bits in each byte (we have an instruction, RBIT, to do 9173 // that) and keep the data in little-endian bit order through the 9174 // calculation, bit-reversing the inputs and outputs. 9175 9176 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id; 9177 StubCodeMark mark(this, stub_id); 9178 __ align(wordSize * 2); 9179 address p = __ pc(); 9180 __ emit_int64(0x87); // The low-order bits of the field 9181 // polynomial (i.e. 
p = z^7+z^2+z+1) 9182 // repeated in the low and high parts of a 9183 // 128-bit vector 9184 __ emit_int64(0x87); 9185 9186 __ align(CodeEntryAlignment); 9187 address start = __ pc(); 9188 9189 Register state = c_rarg0; 9190 Register subkeyH = c_rarg1; 9191 Register data = c_rarg2; 9192 Register blocks = c_rarg3; 9193 9194 FloatRegister vzr = v30; 9195 __ eor(vzr, __ T16B, vzr, vzr); // zero register 9196 9197 __ ldrq(v24, p); // The field polynomial 9198 9199 __ ldrq(v0, Address(state)); 9200 __ ldrq(v1, Address(subkeyH)); 9201 9202 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 9203 __ rbit(v0, __ T16B, v0); 9204 __ rev64(v1, __ T16B, v1); 9205 __ rbit(v1, __ T16B, v1); 9206 9207 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 9208 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 9209 9210 { 9211 Label L_ghash_loop; 9212 __ bind(L_ghash_loop); 9213 9214 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 9215 // reversing each byte 9216 __ rbit(v2, __ T16B, v2); 9217 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 9218 9219 // Multiply state in v2 by subkey in v1 9220 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 9221 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 9222 /*temps*/v6, v3, /*reuse/clobber b*/v2); 9223 // Reduce v7:v5 by the field polynomial 9224 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 9225 9226 __ sub(blocks, blocks, 1); 9227 __ cbnz(blocks, L_ghash_loop); 9228 } 9229 9230 // The bit-reversed result is at this point in v0 9231 __ rev64(v0, __ T16B, v0); 9232 __ rbit(v0, __ T16B, v0); 9233 9234 __ st1(v0, __ T16B, state); 9235 __ ret(lr); 9236 9237 return start; 9238 } 9239 9240 address generate_ghash_processBlocks_wide() { 9241 address small = generate_ghash_processBlocks(); 9242 9243 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id; 9244 StubCodeMark mark(this, stub_id); 9245 __ align(wordSize * 2); 9246 address p = __ pc(); 9247 __ emit_int64(0x87); // The low-order bits of the field 9248 // polynomial (i.e. p = z^7+z^2+z+1) 9249 // repeated in the low and high parts of a 9250 // 128-bit vector 9251 __ emit_int64(0x87); 9252 9253 __ align(CodeEntryAlignment); 9254 address start = __ pc(); 9255 9256 Register state = c_rarg0; 9257 Register subkeyH = c_rarg1; 9258 Register data = c_rarg2; 9259 Register blocks = c_rarg3; 9260 9261 const int unroll = 4; 9262 9263 __ cmp(blocks, (unsigned char)(unroll * 2)); 9264 __ br(__ LT, small); 9265 9266 if (unroll > 1) { 9267 // Save state before entering routine 9268 __ sub(sp, sp, 4 * 16); 9269 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 9270 __ sub(sp, sp, 4 * 16); 9271 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 9272 } 9273 9274 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 9275 9276 if (unroll > 1) { 9277 // And restore state 9278 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 9279 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 9280 } 9281 9282 __ cmp(blocks, (unsigned char)0); 9283 __ br(__ GT, small); 9284 9285 __ ret(lr); 9286 9287 return start; 9288 } 9289 9290 void generate_base64_encode_simdround(Register src, Register dst, 9291 FloatRegister codec, u8 size) { 9292 9293 FloatRegister in0 = v4, in1 = v5, in2 = v6; 9294 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 9295 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 9296 9297 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 9298 9299 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 9300 9301 __ ushr(ind0, arrangement, in0, 2); 9302 9303 __ ushr(ind1, arrangement, in1, 2); 9304 __ shl(in0, arrangement, in0, 6); 9305 __ orr(ind1, arrangement, ind1, in0); 9306 __ ushr(ind1, arrangement, ind1, 2); 9307 9308 __ ushr(ind2, arrangement, in2, 4); 9309 __ shl(in1, arrangement, in1, 4); 9310 __ orr(ind2, arrangement, in1, ind2); 9311 __ ushr(ind2, arrangement, ind2, 2); 9312 9313 __ shl(ind3, arrangement, in2, 2); 9314 __ ushr(ind3, arrangement, ind3, 2); 9315 9316 __ tbl(out0, arrangement, codec, 4, ind0); 9317 __ tbl(out1, arrangement, codec, 4, ind1); 9318 __ tbl(out2, arrangement, codec, 4, ind2); 9319 __ tbl(out3, arrangement, codec, 4, ind3); 9320 9321 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 9322 } 9323 9324 /** 9325 * Arguments: 9326 * 9327 * Input: 9328 * c_rarg0 - src_start 9329 * c_rarg1 - src_offset 9330 * c_rarg2 - src_length 9331 * c_rarg3 - dest_start 9332 * c_rarg4 - dest_offset 9333 * c_rarg5 - isURL 9334 * 9335 */ 9336 address generate_base64_encodeBlock() { 9337 9338 static const char toBase64[64] = { 9339 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9340 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9341 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9342 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9343 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 9344 }; 9345 9346 static const char toBase64URL[64] = { 9347 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9348 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9349 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9350 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9351 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 9352 }; 9353 9354 __ align(CodeEntryAlignment); 9355 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 9356 StubCodeMark mark(this, stub_id); 9357 address start = __ pc(); 9358 9359 Register src = c_rarg0; // source array 9360 Register soff = c_rarg1; // source start offset 9361 Register send = c_rarg2; // source end offset 9362 Register dst = c_rarg3; // dest array 9363 Register doff = c_rarg4; // position for writing to dest array 9364 Register isURL = c_rarg5; // Base64 or URL character set 9365 9366 // c_rarg6 and c_rarg7 are free to use as temps 9367 Register codec = c_rarg6; 9368 Register length = c_rarg7; 9369 9370 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 9371 9372 __ add(src, src, soff); 9373 __ add(dst, dst, doff); 9374 __ sub(length, send, soff); 9375 9376 // load the codec base address 9377 __ lea(codec, ExternalAddress((address) toBase64)); 9378 __ cbz(isURL, ProcessData); 9379 __ lea(codec, ExternalAddress((address) toBase64URL)); 9380 9381 __ BIND(ProcessData); 9382 9383 // too short to formup a SIMD loop, roll back 9384 __ cmp(length, (u1)24); 9385 __ br(Assembler::LT, Process3B); 9386 9387 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 9388 9389 __ BIND(Process48B); 9390 __ cmp(length, (u1)48); 9391 __ br(Assembler::LT, Process24B); 9392 generate_base64_encode_simdround(src, dst, v0, 16); 9393 __ sub(length, length, 48); 9394 __ b(Process48B); 9395 9396 __ BIND(Process24B); 9397 __ cmp(length, (u1)24); 9398 __ br(Assembler::LT, SIMDExit); 9399 generate_base64_encode_simdround(src, dst, v0, 8); 9400 __ sub(length, length, 24); 9401 9402 __ BIND(SIMDExit); 9403 
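    // For illustration only, a scalar sketch of the Process3B loop that
    // follows (a hedged sketch, not emitted code); 'codec' is the 64-entry
    // encode table selected above:
    //
    //   uint32_t bits = (src[0] << 16) | (src[1] << 8) | src[2];  // 24 bits
    //   dst[0] = codec[(bits >> 18) & 63];
    //   dst[1] = codec[(bits >> 12) & 63];
    //   dst[2] = codec[(bits >>  6) & 63];
    //   dst[3] = codec[ bits        & 63];
    //   src += 3; dst += 4;
    //
    // The loop subtracts 3 from length each round, so it assumes the
    // remaining length is a multiple of 3; no '=' padding is produced here.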
__ cbz(length, Exit); 9404 9405 __ BIND(Process3B); 9406 // 3 src bytes, 24 bits 9407 __ ldrb(r10, __ post(src, 1)); 9408 __ ldrb(r11, __ post(src, 1)); 9409 __ ldrb(r12, __ post(src, 1)); 9410 __ orrw(r11, r11, r10, Assembler::LSL, 8); 9411 __ orrw(r12, r12, r11, Assembler::LSL, 8); 9412 // codec index 9413 __ ubfmw(r15, r12, 18, 23); 9414 __ ubfmw(r14, r12, 12, 17); 9415 __ ubfmw(r13, r12, 6, 11); 9416 __ andw(r12, r12, 63); 9417 // get the code based on the codec 9418 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 9419 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 9420 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 9421 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 9422 __ strb(r15, __ post(dst, 1)); 9423 __ strb(r14, __ post(dst, 1)); 9424 __ strb(r13, __ post(dst, 1)); 9425 __ strb(r12, __ post(dst, 1)); 9426 __ sub(length, length, 3); 9427 __ cbnz(length, Process3B); 9428 9429 __ BIND(Exit); 9430 __ ret(lr); 9431 9432 return start; 9433 } 9434 9435 void generate_base64_decode_simdround(Register src, Register dst, 9436 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 9437 9438 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 9439 FloatRegister out0 = v20, out1 = v21, out2 = v22; 9440 9441 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 9442 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 9443 9444 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 9445 9446 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 9447 9448 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 9449 9450 // we need unsigned saturating subtract, to make sure all input values 9451 // in range [0, 63] will have 0U value in the higher half lookup 9452 __ uqsubv(decH0, __ T16B, in0, v27); 9453 __ uqsubv(decH1, __ T16B, in1, v27); 9454 __ uqsubv(decH2, __ T16B, in2, v27); 9455 __ uqsubv(decH3, __ T16B, in3, v27); 9456 9457 // lower half lookup 9458 __ tbl(decL0, arrangement, codecL, 4, in0); 9459 __ tbl(decL1, arrangement, codecL, 4, in1); 9460 __ tbl(decL2, arrangement, codecL, 4, in2); 9461 __ tbl(decL3, arrangement, codecL, 4, in3); 9462 9463 // higher half lookup 9464 __ tbx(decH0, arrangement, codecH, 4, decH0); 9465 __ tbx(decH1, arrangement, codecH, 4, decH1); 9466 __ tbx(decH2, arrangement, codecH, 4, decH2); 9467 __ tbx(decH3, arrangement, codecH, 4, decH3); 9468 9469 // combine lower and higher 9470 __ orr(decL0, arrangement, decL0, decH0); 9471 __ orr(decL1, arrangement, decL1, decH1); 9472 __ orr(decL2, arrangement, decL2, decH2); 9473 __ orr(decL3, arrangement, decL3, decH3); 9474 9475 // check illegal inputs, value larger than 63 (maximum of 6 bits) 9476 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 9477 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 9478 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 9479 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 9480 __ orr(in0, arrangement, decH0, decH1); 9481 __ orr(in1, arrangement, decH2, decH3); 9482 __ orr(in2, arrangement, in0, in1); 9483 __ umaxv(in3, arrangement, in2); 9484 __ umov(rscratch2, in3, __ B, 0); 9485 9486 // get the data to output 9487 __ shl(out0, arrangement, decL0, 2); 9488 __ ushr(out1, arrangement, decL1, 4); 9489 __ orr(out0, arrangement, out0, out1); 9490 __ shl(out1, arrangement, decL1, 4); 9491 __ ushr(out2, arrangement, decL2, 2); 9492 __ orr(out1, arrangement, out1, out2); 9493 __ shl(out2, arrangement, decL2, 6); 9494 __ orr(out2, arrangement, out2, decL3); 9495 9496 __ 
        cbz(rscratch2, NoIllegalData);

    // handle illegal input
    __ umov(r10, in2, __ D, 0);
    if (size == 16) {
      __ cbnz(r10, ErrorInLowerHalf);

      // illegal input is in higher half, store the lower half now.
      __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));

      __ umov(r10, in2, __ D, 1);
      __ umov(r11, out0, __ D, 1);
      __ umov(r12, out1, __ D, 1);
      __ umov(r13, out2, __ D, 1);
      __ b(StoreLegalData);

      __ BIND(ErrorInLowerHalf);
    }
    __ umov(r11, out0, __ D, 0);
    __ umov(r12, out1, __ D, 0);
    __ umov(r13, out2, __ D, 0);

    __ BIND(StoreLegalData);
    __ tbnz(r10, 5, Exit);                 // 0xff indicates illegal input
    __ strb(r11, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ lsr(r10, r10, 8);
    __ lsr(r11, r11, 8);
    __ lsr(r12, r12, 8);
    __ lsr(r13, r13, 8);
    __ b(StoreLegalData);

    __ BIND(NoIllegalData);
    __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
  }


  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - src_start
   *   c_rarg1   - src_offset
   *   c_rarg2   - src_length
   *   c_rarg3   - dest_start
   *   c_rarg4   - dest_offset
   *   c_rarg5   - isURL
   *   c_rarg6   - isMIME
   *
   */
  address generate_base64_decodeBlock() {

    // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
    // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the section
    // titled "Base64 decoding".

    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic. That is, java.util.Base64.fromBase64['='] = -2,
    // while fromBase(URL)64ForNoSIMD['='] = 255 here.
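    // For reference, the two 256-entry tables below are equivalent to building
    // the inverse of the corresponding encode alphabet, with every byte that is
    // not part of the alphabet (including '=') mapped to 255u. A hedged sketch
    // of how such a table could be derived (illustrative only; the tables below
    // are the precomputed result):
    //
    //   uint8_t inv[256];
    //   memset(inv, 255, sizeof(inv));
    //   for (int i = 0; i < 64; i++) {
    //     inv[(uint8_t)toBase64[i]] = (uint8_t)i;   // toBase64URL for the URL table
    //   }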
9556 static const uint8_t fromBase64ForNoSIMD[256] = { 9557 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9558 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9559 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 9560 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9561 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 9562 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 9563 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 9564 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 9565 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9566 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9567 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9568 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9569 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9570 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9571 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9572 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9573 }; 9574 9575 static const uint8_t fromBase64URLForNoSIMD[256] = { 9576 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9577 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9578 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 9579 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9580 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 9581 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 9582 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 9583 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 9584 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9585 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9586 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9587 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9588 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9589 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9590 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9591 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9592 }; 9593 9594 // A legal value of base64 code is in range [0, 127]. We need two lookups 9595 // with tbl/tbx and combine them to get the decode data. The 1st table vector 9596 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 9597 // table vector lookup use tbx, out of range indices are unchanged in 9598 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 9599 // The value of index 64 is set to 0, so that we know that we already get the 9600 // decoded data with the 1st lookup. 9601 static const uint8_t fromBase64ForSIMD[128] = { 9602 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9603 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9604 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 9605 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9606 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 9607 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 9608 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 9609 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 9610 }; 9611 9612 static const uint8_t fromBase64URLForSIMD[128] = { 9613 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9614 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9615 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 9616 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9617 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 9618 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 9619 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 9620 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 9621 }; 9622 9623 __ align(CodeEntryAlignment); 9624 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 9625 StubCodeMark mark(this, stub_id); 9626 address start = __ pc(); 9627 9628 Register src = c_rarg0; // source array 9629 Register soff = c_rarg1; // source start offset 9630 Register send = c_rarg2; // source end offset 9631 Register dst = c_rarg3; // dest array 9632 Register doff = c_rarg4; // position for writing to dest array 9633 Register isURL = c_rarg5; // Base64 or URL character set 9634 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 9635 9636 Register length = send; // reuse send as length of source data to process 9637 9638 Register simd_codec = c_rarg6; 9639 Register nosimd_codec = c_rarg7; 9640 9641 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 9642 9643 __ enter(); 9644 9645 __ add(src, src, soff); 9646 __ add(dst, dst, doff); 9647 9648 __ mov(doff, dst); 9649 9650 __ sub(length, send, soff); 9651 __ bfm(length, zr, 0, 1); 9652 9653 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 9654 __ cbz(isURL, ProcessData); 9655 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 9656 9657 __ BIND(ProcessData); 9658 __ mov(rscratch1, length); 9659 __ cmp(length, (u1)144); // 144 = 80 + 64 9660 __ br(Assembler::LT, Process4B); 9661 9662 // In the MIME case, the line length cannot be more than 76 9663 // bytes (see RFC 2045). This is too short a block for SIMD 9664 // to be worthwhile, so we use non-SIMD here. 
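  // Seeding the counter at 79 makes the Process4B loop below (which subtracts
  // 4 per iteration and continues while the result is positive) run exactly
  // 20 times: 79, 75, ..., 3, -1. That decodes 80 bytes and leaves
  // rscratch1 == -1 rather than 0, which is how the code after the loop
  // distinguishes this pre-pass from the final tail pass.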
9665 __ movw(rscratch1, 79); 9666 9667 __ BIND(Process4B); 9668 __ ldrw(r14, __ post(src, 4)); 9669 __ ubfxw(r10, r14, 0, 8); 9670 __ ubfxw(r11, r14, 8, 8); 9671 __ ubfxw(r12, r14, 16, 8); 9672 __ ubfxw(r13, r14, 24, 8); 9673 // get the de-code 9674 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 9675 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 9676 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 9677 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 9678 // error detection, 255u indicates an illegal input 9679 __ orrw(r14, r10, r11); 9680 __ orrw(r15, r12, r13); 9681 __ orrw(r14, r14, r15); 9682 __ tbnz(r14, 7, Exit); 9683 // recover the data 9684 __ lslw(r14, r10, 10); 9685 __ bfiw(r14, r11, 4, 6); 9686 __ bfmw(r14, r12, 2, 5); 9687 __ rev16w(r14, r14); 9688 __ bfiw(r13, r12, 6, 2); 9689 __ strh(r14, __ post(dst, 2)); 9690 __ strb(r13, __ post(dst, 1)); 9691 // non-simd loop 9692 __ subsw(rscratch1, rscratch1, 4); 9693 __ br(Assembler::GT, Process4B); 9694 9695 // if exiting from PreProcess80B, rscratch1 == -1; 9696 // otherwise, rscratch1 == 0. 9697 __ cbzw(rscratch1, Exit); 9698 __ sub(length, length, 80); 9699 9700 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 9701 __ cbz(isURL, SIMDEnter); 9702 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 9703 9704 __ BIND(SIMDEnter); 9705 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 9706 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 9707 __ mov(rscratch1, 63); 9708 __ dup(v27, __ T16B, rscratch1); 9709 9710 __ BIND(Process64B); 9711 __ cmp(length, (u1)64); 9712 __ br(Assembler::LT, Process32B); 9713 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 9714 __ sub(length, length, 64); 9715 __ b(Process64B); 9716 9717 __ BIND(Process32B); 9718 __ cmp(length, (u1)32); 9719 __ br(Assembler::LT, SIMDExit); 9720 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 9721 __ sub(length, length, 32); 9722 __ b(Process32B); 9723 9724 __ BIND(SIMDExit); 9725 __ cbz(length, Exit); 9726 __ movw(rscratch1, length); 9727 __ b(Process4B); 9728 9729 __ BIND(Exit); 9730 __ sub(c_rarg0, dst, doff); 9731 9732 __ leave(); 9733 __ ret(lr); 9734 9735 return start; 9736 } 9737 9738 // Support for spin waits. 9739 address generate_spin_wait() { 9740 __ align(CodeEntryAlignment); 9741 StubGenStubId stub_id = StubGenStubId::spin_wait_id; 9742 StubCodeMark mark(this, stub_id); 9743 address start = __ pc(); 9744 9745 __ spin_wait(); 9746 __ ret(lr); 9747 9748 return start; 9749 } 9750 9751 void generate_lookup_secondary_supers_table_stub() { 9752 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 9753 StubCodeMark mark(this, stub_id); 9754 9755 const Register 9756 r_super_klass = r0, 9757 r_array_base = r1, 9758 r_array_length = r2, 9759 r_array_index = r3, 9760 r_sub_klass = r4, 9761 r_bitmap = rscratch2, 9762 result = r5; 9763 const FloatRegister 9764 vtemp = v0; 9765 9766 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 9767 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 9768 Label L_success; 9769 __ enter(); 9770 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 9771 r_array_base, r_array_length, r_array_index, 9772 vtemp, result, slot, 9773 /*stub_is_near*/true); 9774 __ leave(); 9775 __ ret(lr); 9776 } 9777 } 9778 9779 // Slow path implementation for UseSecondarySupersTable. 
9780 address generate_lookup_secondary_supers_table_slow_path_stub() { 9781 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 9782 StubCodeMark mark(this, stub_id); 9783 9784 address start = __ pc(); 9785 const Register 9786 r_super_klass = r0, // argument 9787 r_array_base = r1, // argument 9788 temp1 = r2, // temp 9789 r_array_index = r3, // argument 9790 r_bitmap = rscratch2, // argument 9791 result = r5; // argument 9792 9793 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 9794 __ ret(lr); 9795 9796 return start; 9797 } 9798 9799 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 9800 9801 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 9802 // 9803 // If LSE is in use, generate LSE versions of all the stubs. The 9804 // non-LSE versions are in atomic_aarch64.S. 9805 9806 // class AtomicStubMark records the entry point of a stub and the 9807 // stub pointer which will point to it. The stub pointer is set to 9808 // the entry point when ~AtomicStubMark() is called, which must be 9809 // after ICache::invalidate_range. This ensures safe publication of 9810 // the generated code. 9811 class AtomicStubMark { 9812 address _entry_point; 9813 aarch64_atomic_stub_t *_stub; 9814 MacroAssembler *_masm; 9815 public: 9816 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 9817 _masm = masm; 9818 __ align(32); 9819 _entry_point = __ pc(); 9820 _stub = stub; 9821 } 9822 ~AtomicStubMark() { 9823 *_stub = (aarch64_atomic_stub_t)_entry_point; 9824 } 9825 }; 9826 9827 // NB: For memory_order_conservative we need a trailing membar after 9828 // LSE atomic operations but not a leading membar. 9829 // 9830 // We don't need a leading membar because a clause in the Arm ARM 9831 // says: 9832 // 9833 // Barrier-ordered-before 9834 // 9835 // Barrier instructions order prior Memory effects before subsequent 9836 // Memory effects generated by the same Observer. A read or a write 9837 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 9838 // Observer if and only if RW1 appears in program order before RW 2 9839 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 9840 // instruction with both Acquire and Release semantics. 9841 // 9842 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 9843 // and Release semantics, therefore we don't need a leading 9844 // barrier. However, there is no corresponding Barrier-ordered-after 9845 // relationship, therefore we need a trailing membar to prevent a 9846 // later store or load from being reordered with the store in an 9847 // atomic instruction. 9848 // 9849 // This was checked by using the herd7 consistency model simulator 9850 // (http://diy.inria.fr/) with this test case: 9851 // 9852 // AArch64 LseCas 9853 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 9854 // P0 | P1; 9855 // LDR W4, [X2] | MOV W3, #0; 9856 // DMB LD | MOV W4, #1; 9857 // LDR W3, [X1] | CASAL W3, W4, [X1]; 9858 // | DMB ISH; 9859 // | STR W4, [X2]; 9860 // exists 9861 // (0:X3=0 /\ 0:X4=1) 9862 // 9863 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 9864 // with the store to x in P1. Without the DMB in P1 this may happen. 9865 // 9866 // At the time of writing we don't know of any AArch64 hardware that 9867 // reorders stores in this way, but the Reference Manual permits it. 
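  // As a concrete example, gen_cas_entry(Assembler::word, memory_order_conservative)
  // below emits, roughly:
  //
  //     mov   x3, x1          // prev = compare_val
  //     casal w3, w2, [x0]    // CAS with both Acquire and Release semantics
  //     dmb   ish             // trailing barrier, conservative order only
  //     mov   w0, w3          // return the old value
  //     ret
  //
  // The relaxed variant uses a plain cas and omits the dmb.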
9868 9869 void gen_cas_entry(Assembler::operand_size size, 9870 atomic_memory_order order) { 9871 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 9872 exchange_val = c_rarg2; 9873 bool acquire, release; 9874 switch (order) { 9875 case memory_order_relaxed: 9876 acquire = false; 9877 release = false; 9878 break; 9879 case memory_order_release: 9880 acquire = false; 9881 release = true; 9882 break; 9883 default: 9884 acquire = true; 9885 release = true; 9886 break; 9887 } 9888 __ mov(prev, compare_val); 9889 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 9890 if (order == memory_order_conservative) { 9891 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 9892 } 9893 if (size == Assembler::xword) { 9894 __ mov(r0, prev); 9895 } else { 9896 __ movw(r0, prev); 9897 } 9898 __ ret(lr); 9899 } 9900 9901 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 9902 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 9903 // If not relaxed, then default to conservative. Relaxed is the only 9904 // case we use enough to be worth specializing. 9905 if (order == memory_order_relaxed) { 9906 __ ldadd(size, incr, prev, addr); 9907 } else { 9908 __ ldaddal(size, incr, prev, addr); 9909 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 9910 } 9911 if (size == Assembler::xword) { 9912 __ mov(r0, prev); 9913 } else { 9914 __ movw(r0, prev); 9915 } 9916 __ ret(lr); 9917 } 9918 9919 void gen_swpal_entry(Assembler::operand_size size) { 9920 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 9921 __ swpal(size, incr, prev, addr); 9922 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 9923 if (size == Assembler::xword) { 9924 __ mov(r0, prev); 9925 } else { 9926 __ movw(r0, prev); 9927 } 9928 __ ret(lr); 9929 } 9930 9931 void generate_atomic_entry_points() { 9932 if (! 
UseLSE) { 9933 return; 9934 } 9935 __ align(CodeEntryAlignment); 9936 StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id; 9937 StubCodeMark mark(this, stub_id); 9938 address first_entry = __ pc(); 9939 9940 // ADD, memory_order_conservative 9941 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 9942 gen_ldadd_entry(Assembler::word, memory_order_conservative); 9943 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 9944 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 9945 9946 // ADD, memory_order_relaxed 9947 AtomicStubMark mark_fetch_add_4_relaxed 9948 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 9949 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 9950 AtomicStubMark mark_fetch_add_8_relaxed 9951 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 9952 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 9953 9954 // XCHG, memory_order_conservative 9955 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 9956 gen_swpal_entry(Assembler::word); 9957 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 9958 gen_swpal_entry(Assembler::xword); 9959 9960 // CAS, memory_order_conservative 9961 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 9962 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 9963 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 9964 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 9965 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 9966 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 9967 9968 // CAS, memory_order_relaxed 9969 AtomicStubMark mark_cmpxchg_1_relaxed 9970 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 9971 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 9972 AtomicStubMark mark_cmpxchg_4_relaxed 9973 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 9974 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 9975 AtomicStubMark mark_cmpxchg_8_relaxed 9976 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 9977 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 9978 9979 AtomicStubMark mark_cmpxchg_4_release 9980 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 9981 gen_cas_entry(MacroAssembler::word, memory_order_release); 9982 AtomicStubMark mark_cmpxchg_8_release 9983 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 9984 gen_cas_entry(MacroAssembler::xword, memory_order_release); 9985 9986 AtomicStubMark mark_cmpxchg_4_seq_cst 9987 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 9988 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 9989 AtomicStubMark mark_cmpxchg_8_seq_cst 9990 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 9991 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 9992 9993 ICache::invalidate_range(first_entry, __ pc() - first_entry); 9994 } 9995 #endif // LINUX 9996 9997 address generate_cont_thaw(Continuation::thaw_kind kind) { 9998 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 9999 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 10000 10001 address start = __ pc(); 10002 10003 if (return_barrier) { 10004 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 10005 __ mov(sp, rscratch1); 10006 } 10007 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10008 10009 if (return_barrier) { 10010 // 
preserve possible return value from a method returning to the return barrier 10011 __ fmovd(rscratch1, v0); 10012 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10013 } 10014 10015 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 10016 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 10017 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 10018 10019 if (return_barrier) { 10020 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10021 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10022 __ fmovd(v0, rscratch1); 10023 } 10024 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10025 10026 10027 Label thaw_success; 10028 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 10029 __ cbnz(rscratch2, thaw_success); 10030 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 10031 __ br(rscratch1); 10032 __ bind(thaw_success); 10033 10034 // make room for the thawed frames 10035 __ sub(rscratch1, sp, rscratch2); 10036 __ andr(rscratch1, rscratch1, -16); // align 10037 __ mov(sp, rscratch1); 10038 10039 if (return_barrier) { 10040 // save original return value -- again 10041 __ fmovd(rscratch1, v0); 10042 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10043 } 10044 10045 // If we want, we can templatize thaw by kind, and have three different entries 10046 __ movw(c_rarg1, (uint32_t)kind); 10047 10048 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 10049 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 10050 10051 if (return_barrier) { 10052 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10053 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10054 __ fmovd(v0, rscratch1); 10055 } else { 10056 __ mov(r0, zr); // return 0 (success) from doYield 10057 } 10058 10059 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 10060 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 10061 __ mov(rfp, sp); 10062 10063 if (return_barrier_exception) { 10064 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 10065 __ authenticate_return_address(c_rarg1); 10066 __ verify_oop(r0); 10067 // save return value containing the exception oop in callee-saved R19 10068 __ mov(r19, r0); 10069 10070 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 10071 10072 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
10073 // __ reinitialize_ptrue(); 10074 10075 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 10076 10077 __ mov(r1, r0); // the exception handler 10078 __ mov(r0, r19); // restore return value containing the exception oop 10079 __ verify_oop(r0); 10080 10081 __ leave(); 10082 __ mov(r3, lr); 10083 __ br(r1); // the exception handler 10084 } else { 10085 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 10086 __ leave(); 10087 __ ret(lr); 10088 } 10089 10090 return start; 10091 } 10092 10093 address generate_cont_thaw() { 10094 if (!Continuations::enabled()) return nullptr; 10095 10096 StubGenStubId stub_id = StubGenStubId::cont_thaw_id; 10097 StubCodeMark mark(this, stub_id); 10098 address start = __ pc(); 10099 generate_cont_thaw(Continuation::thaw_top); 10100 return start; 10101 } 10102 10103 address generate_cont_returnBarrier() { 10104 if (!Continuations::enabled()) return nullptr; 10105 10106 // TODO: will probably need multiple return barriers depending on return type 10107 StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id; 10108 StubCodeMark mark(this, stub_id); 10109 address start = __ pc(); 10110 10111 generate_cont_thaw(Continuation::thaw_return_barrier); 10112 10113 return start; 10114 } 10115 10116 address generate_cont_returnBarrier_exception() { 10117 if (!Continuations::enabled()) return nullptr; 10118 10119 StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id; 10120 StubCodeMark mark(this, stub_id); 10121 address start = __ pc(); 10122 10123 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 10124 10125 return start; 10126 } 10127 10128 address generate_cont_preempt_stub() { 10129 if (!Continuations::enabled()) return nullptr; 10130 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 10131 StubCodeMark mark(this, stub_id); 10132 address start = __ pc(); 10133 10134 __ reset_last_Java_frame(true); 10135 10136 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 10137 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 10138 __ mov(sp, rscratch2); 10139 10140 Label preemption_cancelled; 10141 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 10142 __ cbnz(rscratch1, preemption_cancelled); 10143 10144 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 10145 SharedRuntime::continuation_enter_cleanup(_masm); 10146 __ leave(); 10147 __ ret(lr); 10148 10149 // We acquired the monitor after freezing the frames so call thaw to continue execution. 10150 __ bind(preemption_cancelled); 10151 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 10152 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 10153 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 10154 __ ldr(rscratch1, Address(rscratch1)); 10155 __ br(rscratch1); 10156 10157 return start; 10158 } 10159 10160 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 10161 // are represented as long[5], with BITS_PER_LIMB = 26. 10162 // Pack five 26-bit limbs into three 64-bit registers. 
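  // In C, approximately (dest2, when present, receives only the two bits that
  // do not fit into the low 128 bits):
  //
  //   dest0 =  src[0]        | (src[1] << 26) | (src[2] << 52);
  //   dest1 = (src[2] >> 12) | (src[3] << 14) | (src[4] << 40);
  //   dest2 =  src[4] >> 24;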
10163 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 10164 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 10165 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 10166 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 10167 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 10168 10169 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 10170 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 10171 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 10172 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 10173 10174 if (dest2->is_valid()) { 10175 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 10176 } else { 10177 #ifdef ASSERT 10178 Label OK; 10179 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 10180 __ br(__ EQ, OK); 10181 __ stop("high bits of Poly1305 integer should be zero"); 10182 __ should_not_reach_here(); 10183 __ bind(OK); 10184 #endif 10185 } 10186 } 10187 10188 // As above, but return only a 128-bit integer, packed into two 10189 // 64-bit registers. 10190 void pack_26(Register dest0, Register dest1, Register src) { 10191 pack_26(dest0, dest1, noreg, src); 10192 } 10193 10194 // Multiply and multiply-accumulate unsigned 64-bit registers. 10195 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 10196 __ mul(prod_lo, n, m); 10197 __ umulh(prod_hi, n, m); 10198 } 10199 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 10200 wide_mul(rscratch1, rscratch2, n, m); 10201 __ adds(sum_lo, sum_lo, rscratch1); 10202 __ adc(sum_hi, sum_hi, rscratch2); 10203 } 10204 10205 // Poly1305, RFC 7539 10206 10207 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 10208 // description of the tricks used to simplify and accelerate this 10209 // computation. 10210 10211 address generate_poly1305_processBlocks() { 10212 __ align(CodeEntryAlignment); 10213 StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id; 10214 StubCodeMark mark(this, stub_id); 10215 address start = __ pc(); 10216 Label here; 10217 __ enter(); 10218 RegSet callee_saved = RegSet::range(r19, r28); 10219 __ push(callee_saved, sp); 10220 10221 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 10222 10223 // Arguments 10224 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 10225 10226 // R_n is the 128-bit randomly-generated key, packed into two 10227 // registers. The caller passes this key to us as long[5], with 10228 // BITS_PER_LIMB = 26. 
10229 const Register R_0 = *++regs, R_1 = *++regs; 10230 pack_26(R_0, R_1, r_start); 10231 10232 // RR_n is (R_n >> 2) * 5 10233 const Register RR_0 = *++regs, RR_1 = *++regs; 10234 __ lsr(RR_0, R_0, 2); 10235 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 10236 __ lsr(RR_1, R_1, 2); 10237 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 10238 10239 // U_n is the current checksum 10240 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 10241 pack_26(U_0, U_1, U_2, acc_start); 10242 10243 static constexpr int BLOCK_LENGTH = 16; 10244 Label DONE, LOOP; 10245 10246 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10247 __ br(Assembler::LT, DONE); { 10248 __ bind(LOOP); 10249 10250 // S_n is to be the sum of U_n and the next block of data 10251 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 10252 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 10253 __ adds(S_0, U_0, S_0); 10254 __ adcs(S_1, U_1, S_1); 10255 __ adc(S_2, U_2, zr); 10256 __ add(S_2, S_2, 1); 10257 10258 const Register U_0HI = *++regs, U_1HI = *++regs; 10259 10260 // NB: this logic depends on some of the special properties of 10261 // Poly1305 keys. In particular, because we know that the top 10262 // four bits of R_0 and R_1 are zero, we can add together 10263 // partial products without any risk of needing to propagate a 10264 // carry out. 10265 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 10266 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 10267 __ andr(U_2, R_0, 3); 10268 __ mul(U_2, S_2, U_2); 10269 10270 // Recycle registers S_0, S_1, S_2 10271 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 10272 10273 // Partial reduction mod 2**130 - 5 10274 __ adds(U_1, U_0HI, U_1); 10275 __ adc(U_2, U_1HI, U_2); 10276 // Sum now in U_2:U_1:U_0. 10277 // Dead: U_0HI, U_1HI. 10278 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 10279 10280 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 10281 10282 // First, U_2:U_1:U_0 += (U_2 >> 2) 10283 __ lsr(rscratch1, U_2, 2); 10284 __ andr(U_2, U_2, (u8)3); 10285 __ adds(U_0, U_0, rscratch1); 10286 __ adcs(U_1, U_1, zr); 10287 __ adc(U_2, U_2, zr); 10288 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 10289 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 10290 __ adcs(U_1, U_1, zr); 10291 __ adc(U_2, U_2, zr); 10292 10293 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 10294 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10295 __ br(~ Assembler::LT, LOOP); 10296 } 10297 10298 // Further reduce modulo 2^130 - 5 10299 __ lsr(rscratch1, U_2, 2); 10300 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 10301 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 10302 __ adcs(U_1, U_1, zr); 10303 __ andr(U_2, U_2, (u1)3); 10304 __ adc(U_2, U_2, zr); 10305 10306 // Unpack the sum into five 26-bit limbs and write to memory. 
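    // In C, approximately (the inverse of pack_26 above):
    //
    //   acc[0] =  U_0        & ((1 << 26) - 1);
    //   acc[1] = (U_0 >> 26) & ((1 << 26) - 1);
    //   acc[2] = (U_0 >> 52) | ((U_1 & ((1 << 14) - 1)) << 12);
    //   acc[3] = (U_1 >> 14) & ((1 << 26) - 1);
    //   acc[4] = (U_1 >> 40) | ((U_2 & 7) << 24);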
10307 __ ubfiz(rscratch1, U_0, 0, 26); 10308 __ ubfx(rscratch2, U_0, 26, 26); 10309 __ stp(rscratch1, rscratch2, Address(acc_start)); 10310 __ ubfx(rscratch1, U_0, 52, 12); 10311 __ bfi(rscratch1, U_1, 12, 14); 10312 __ ubfx(rscratch2, U_1, 14, 26); 10313 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 10314 __ ubfx(rscratch1, U_1, 40, 24); 10315 __ bfi(rscratch1, U_2, 24, 3); 10316 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 10317 10318 __ bind(DONE); 10319 __ pop(callee_saved, sp); 10320 __ leave(); 10321 __ ret(lr); 10322 10323 return start; 10324 } 10325 10326 // exception handler for upcall stubs 10327 address generate_upcall_stub_exception_handler() { 10328 StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id; 10329 StubCodeMark mark(this, stub_id); 10330 address start = __ pc(); 10331 10332 // Native caller has no idea how to handle exceptions, 10333 // so we just crash here. Up to callee to catch exceptions. 10334 __ verify_oop(r0); 10335 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 10336 __ blr(rscratch1); 10337 __ should_not_reach_here(); 10338 10339 return start; 10340 } 10341 10342 // load Method* target of MethodHandle 10343 // j_rarg0 = jobject receiver 10344 // rmethod = result 10345 address generate_upcall_stub_load_target() { 10346 StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id; 10347 StubCodeMark mark(this, stub_id); 10348 address start = __ pc(); 10349 10350 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 10351 // Load target method from receiver 10352 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 10353 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 10354 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 10355 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 10356 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 10357 noreg, noreg); 10358 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 10359 10360 __ ret(lr); 10361 10362 return start; 10363 } 10364 10365 #undef __ 10366 #define __ masm-> 10367 10368 class MontgomeryMultiplyGenerator : public MacroAssembler { 10369 10370 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 10371 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 10372 10373 RegSet _toSave; 10374 bool _squaring; 10375 10376 public: 10377 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 10378 : MacroAssembler(as->code()), _squaring(squaring) { 10379 10380 // Register allocation 10381 10382 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 10383 Pa_base = *regs; // Argument registers 10384 if (squaring) 10385 Pb_base = Pa_base; 10386 else 10387 Pb_base = *++regs; 10388 Pn_base = *++regs; 10389 Rlen= *++regs; 10390 inv = *++regs; 10391 Pm_base = *++regs; 10392 10393 // Working registers: 10394 Ra = *++regs; // The current digit of a, b, n, and m. 10395 Rb = *++regs; 10396 Rm = *++regs; 10397 Rn = *++regs; 10398 10399 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 10400 Pb = *++regs; 10401 Pm = *++regs; 10402 Pn = *++regs; 10403 10404 t0 = *++regs; // Three registers which form a 10405 t1 = *++regs; // triple-precision accumuator. 
10406 t2 = *++regs; 10407 10408 Ri = *++regs; // Inner and outer loop indexes. 10409 Rj = *++regs; 10410 10411 Rhi_ab = *++regs; // Product registers: low and high parts 10412 Rlo_ab = *++regs; // of a*b and m*n. 10413 Rhi_mn = *++regs; 10414 Rlo_mn = *++regs; 10415 10416 // r19 and up are callee-saved. 10417 _toSave = RegSet::range(r19, *regs) + Pm_base; 10418 } 10419 10420 private: 10421 void save_regs() { 10422 push(_toSave, sp); 10423 } 10424 10425 void restore_regs() { 10426 pop(_toSave, sp); 10427 } 10428 10429 template <typename T> 10430 void unroll_2(Register count, T block) { 10431 Label loop, end, odd; 10432 tbnz(count, 0, odd); 10433 cbz(count, end); 10434 align(16); 10435 bind(loop); 10436 (this->*block)(); 10437 bind(odd); 10438 (this->*block)(); 10439 subs(count, count, 2); 10440 br(Assembler::GT, loop); 10441 bind(end); 10442 } 10443 10444 template <typename T> 10445 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 10446 Label loop, end, odd; 10447 tbnz(count, 0, odd); 10448 cbz(count, end); 10449 align(16); 10450 bind(loop); 10451 (this->*block)(d, s, tmp); 10452 bind(odd); 10453 (this->*block)(d, s, tmp); 10454 subs(count, count, 2); 10455 br(Assembler::GT, loop); 10456 bind(end); 10457 } 10458 10459 void pre1(RegisterOrConstant i) { 10460 block_comment("pre1"); 10461 // Pa = Pa_base; 10462 // Pb = Pb_base + i; 10463 // Pm = Pm_base; 10464 // Pn = Pn_base + i; 10465 // Ra = *Pa; 10466 // Rb = *Pb; 10467 // Rm = *Pm; 10468 // Rn = *Pn; 10469 ldr(Ra, Address(Pa_base)); 10470 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10471 ldr(Rm, Address(Pm_base)); 10472 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10473 lea(Pa, Address(Pa_base)); 10474 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10475 lea(Pm, Address(Pm_base)); 10476 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10477 10478 // Zero the m*n result. 10479 mov(Rhi_mn, zr); 10480 mov(Rlo_mn, zr); 10481 } 10482 10483 // The core multiply-accumulate step of a Montgomery 10484 // multiplication. The idea is to schedule operations as a 10485 // pipeline so that instructions with long latencies (loads and 10486 // multiplies) have time to complete before their results are 10487 // used. This most benefits in-order implementations of the 10488 // architecture but out-of-order ones also benefit. 10489 void step() { 10490 block_comment("step"); 10491 // MACC(Ra, Rb, t0, t1, t2); 10492 // Ra = *++Pa; 10493 // Rb = *--Pb; 10494 umulh(Rhi_ab, Ra, Rb); 10495 mul(Rlo_ab, Ra, Rb); 10496 ldr(Ra, pre(Pa, wordSize)); 10497 ldr(Rb, pre(Pb, -wordSize)); 10498 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 10499 // previous iteration. 
10500 // MACC(Rm, Rn, t0, t1, t2); 10501 // Rm = *++Pm; 10502 // Rn = *--Pn; 10503 umulh(Rhi_mn, Rm, Rn); 10504 mul(Rlo_mn, Rm, Rn); 10505 ldr(Rm, pre(Pm, wordSize)); 10506 ldr(Rn, pre(Pn, -wordSize)); 10507 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10508 } 10509 10510 void post1() { 10511 block_comment("post1"); 10512 10513 // MACC(Ra, Rb, t0, t1, t2); 10514 // Ra = *++Pa; 10515 // Rb = *--Pb; 10516 umulh(Rhi_ab, Ra, Rb); 10517 mul(Rlo_ab, Ra, Rb); 10518 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10519 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10520 10521 // *Pm = Rm = t0 * inv; 10522 mul(Rm, t0, inv); 10523 str(Rm, Address(Pm)); 10524 10525 // MACC(Rm, Rn, t0, t1, t2); 10526 // t0 = t1; t1 = t2; t2 = 0; 10527 umulh(Rhi_mn, Rm, Rn); 10528 10529 #ifndef PRODUCT 10530 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 10531 { 10532 mul(Rlo_mn, Rm, Rn); 10533 add(Rlo_mn, t0, Rlo_mn); 10534 Label ok; 10535 cbz(Rlo_mn, ok); { 10536 stop("broken Montgomery multiply"); 10537 } bind(ok); 10538 } 10539 #endif 10540 // We have very carefully set things up so that 10541 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 10542 // the lower half of Rm * Rn because we know the result already: 10543 // it must be -t0. t0 + (-t0) must generate a carry iff 10544 // t0 != 0. So, rather than do a mul and an adds we just set 10545 // the carry flag iff t0 is nonzero. 10546 // 10547 // mul(Rlo_mn, Rm, Rn); 10548 // adds(zr, t0, Rlo_mn); 10549 subs(zr, t0, 1); // Set carry iff t0 is nonzero 10550 adcs(t0, t1, Rhi_mn); 10551 adc(t1, t2, zr); 10552 mov(t2, zr); 10553 } 10554 10555 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 10556 block_comment("pre2"); 10557 // Pa = Pa_base + i-len; 10558 // Pb = Pb_base + len; 10559 // Pm = Pm_base + i-len; 10560 // Pn = Pn_base + len; 10561 10562 if (i.is_register()) { 10563 sub(Rj, i.as_register(), len); 10564 } else { 10565 mov(Rj, i.as_constant()); 10566 sub(Rj, Rj, len); 10567 } 10568 // Rj == i-len 10569 10570 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 10571 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 10572 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 10573 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 10574 10575 // Ra = *++Pa; 10576 // Rb = *--Pb; 10577 // Rm = *++Pm; 10578 // Rn = *--Pn; 10579 ldr(Ra, pre(Pa, wordSize)); 10580 ldr(Rb, pre(Pb, -wordSize)); 10581 ldr(Rm, pre(Pm, wordSize)); 10582 ldr(Rn, pre(Pn, -wordSize)); 10583 10584 mov(Rhi_mn, zr); 10585 mov(Rlo_mn, zr); 10586 } 10587 10588 void post2(RegisterOrConstant i, RegisterOrConstant len) { 10589 block_comment("post2"); 10590 if (i.is_constant()) { 10591 mov(Rj, i.as_constant()-len.as_constant()); 10592 } else { 10593 sub(Rj, i.as_register(), len); 10594 } 10595 10596 adds(t0, t0, Rlo_mn); // The pending m*n, low part 10597 10598 // As soon as we know the least significant digit of our result, 10599 // store it. 10600 // Pm_base[i-len] = t0; 10601 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 10602 10603 // t0 = t1; t1 = t2; t2 = 0; 10604 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 10605 adc(t1, t2, zr); 10606 mov(t2, zr); 10607 } 10608 10609 // A carry in t0 after Montgomery multiplication means that we 10610 // should subtract multiples of n from our result in m. We'll 10611 // keep doing that until there is no carry. 
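  // In C, the sub() referred to below is approximately (a sketch; 'borrow'
  // stands in for the inverted carry flag used by sbcs/sbc):
  //
  //   julong sub(julong Pm[], julong Pn[], julong t0, int len) {
  //     bool borrow = false;
  //     for (int i = 0; i < len; i++) {
  //       julong d = Pm[i] - Pn[i] - borrow;
  //       borrow = borrow ? (Pm[i] <= Pn[i]) : (Pm[i] < Pn[i]);
  //       Pm[i] = d;
  //     }
  //     return t0 - borrow;
  //   }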
10612 void normalize(RegisterOrConstant len) { 10613 block_comment("normalize"); 10614 // while (t0) 10615 // t0 = sub(Pm_base, Pn_base, t0, len); 10616 Label loop, post, again; 10617 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 10618 cbz(t0, post); { 10619 bind(again); { 10620 mov(i, zr); 10621 mov(cnt, len); 10622 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 10623 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10624 subs(zr, zr, zr); // set carry flag, i.e. no borrow 10625 align(16); 10626 bind(loop); { 10627 sbcs(Rm, Rm, Rn); 10628 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 10629 add(i, i, 1); 10630 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 10631 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10632 sub(cnt, cnt, 1); 10633 } cbnz(cnt, loop); 10634 sbc(t0, t0, zr); 10635 } cbnz(t0, again); 10636 } bind(post); 10637 } 10638 10639 // Move memory at s to d, reversing words. 10640 // Increments d to end of copied memory 10641 // Destroys tmp1, tmp2 10642 // Preserves len 10643 // Leaves s pointing to the address which was in d at start 10644 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 10645 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 10646 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 10647 10648 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 10649 mov(tmp1, len); 10650 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 10651 sub(s, d, len, ext::uxtw, LogBytesPerWord); 10652 } 10653 // where 10654 void reverse1(Register d, Register s, Register tmp) { 10655 ldr(tmp, pre(s, -wordSize)); 10656 ror(tmp, tmp, 32); 10657 str(tmp, post(d, wordSize)); 10658 } 10659 10660 void step_squaring() { 10661 // An extra ACC 10662 step(); 10663 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10664 } 10665 10666 void last_squaring(RegisterOrConstant i) { 10667 Label dont; 10668 // if ((i & 1) == 0) { 10669 tbnz(i.as_register(), 0, dont); { 10670 // MACC(Ra, Rb, t0, t1, t2); 10671 // Ra = *++Pa; 10672 // Rb = *--Pb; 10673 umulh(Rhi_ab, Ra, Rb); 10674 mul(Rlo_ab, Ra, Rb); 10675 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10676 } bind(dont); 10677 } 10678 10679 void extra_step_squaring() { 10680 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10681 10682 // MACC(Rm, Rn, t0, t1, t2); 10683 // Rm = *++Pm; 10684 // Rn = *--Pn; 10685 umulh(Rhi_mn, Rm, Rn); 10686 mul(Rlo_mn, Rm, Rn); 10687 ldr(Rm, pre(Pm, wordSize)); 10688 ldr(Rn, pre(Pn, -wordSize)); 10689 } 10690 10691 void post1_squaring() { 10692 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10693 10694 // *Pm = Rm = t0 * inv; 10695 mul(Rm, t0, inv); 10696 str(Rm, Address(Pm)); 10697 10698 // MACC(Rm, Rn, t0, t1, t2); 10699 // t0 = t1; t1 = t2; t2 = 0; 10700 umulh(Rhi_mn, Rm, Rn); 10701 10702 #ifndef PRODUCT 10703 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 10704 { 10705 mul(Rlo_mn, Rm, Rn); 10706 add(Rlo_mn, t0, Rlo_mn); 10707 Label ok; 10708 cbz(Rlo_mn, ok); { 10709 stop("broken Montgomery multiply"); 10710 } bind(ok); 10711 } 10712 #endif 10713 // We have very carefully set things up so that 10714 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 10715 // the lower half of Rm * Rn because we know the result already: 10716 // it must be -t0. t0 + (-t0) must generate a carry iff 10717 // t0 != 0. So, rather than do a mul and an adds we just set 10718 // the carry flag iff t0 is nonzero. 
10719 // 10720 // mul(Rlo_mn, Rm, Rn); 10721 // adds(zr, t0, Rlo_mn); 10722 subs(zr, t0, 1); // Set carry iff t0 is nonzero 10723 adcs(t0, t1, Rhi_mn); 10724 adc(t1, t2, zr); 10725 mov(t2, zr); 10726 } 10727 10728 void acc(Register Rhi, Register Rlo, 10729 Register t0, Register t1, Register t2) { 10730 adds(t0, t0, Rlo); 10731 adcs(t1, t1, Rhi); 10732 adc(t2, t2, zr); 10733 } 10734 10735 public: 10736 /** 10737 * Fast Montgomery multiplication. The derivation of the 10738 * algorithm is in A Cryptographic Library for the Motorola 10739 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 10740 * 10741 * Arguments: 10742 * 10743 * Inputs for multiplication: 10744 * c_rarg0 - int array elements a 10745 * c_rarg1 - int array elements b 10746 * c_rarg2 - int array elements n (the modulus) 10747 * c_rarg3 - int length 10748 * c_rarg4 - int inv 10749 * c_rarg5 - int array elements m (the result) 10750 * 10751 * Inputs for squaring: 10752 * c_rarg0 - int array elements a 10753 * c_rarg1 - int array elements n (the modulus) 10754 * c_rarg2 - int length 10755 * c_rarg3 - int inv 10756 * c_rarg4 - int array elements m (the result) 10757 * 10758 */ 10759 address generate_multiply() { 10760 Label argh, nothing; 10761 bind(argh); 10762 stop("MontgomeryMultiply total_allocation must be <= 8192"); 10763 10764 align(CodeEntryAlignment); 10765 address entry = pc(); 10766 10767 cbzw(Rlen, nothing); 10768 10769 enter(); 10770 10771 // Make room. 10772 cmpw(Rlen, 512); 10773 br(Assembler::HI, argh); 10774 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 10775 andr(sp, Ra, -2 * wordSize); 10776 10777 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 10778 10779 { 10780 // Copy input args, reversing as we go. We use Ra as a 10781 // temporary variable. 10782 reverse(Ra, Pa_base, Rlen, t0, t1); 10783 if (!_squaring) 10784 reverse(Ra, Pb_base, Rlen, t0, t1); 10785 reverse(Ra, Pn_base, Rlen, t0, t1); 10786 } 10787 10788 // Push all call-saved registers and also Pm_base which we'll need 10789 // at the end. 
10790 save_regs(); 10791 10792 #ifndef PRODUCT 10793 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 10794 { 10795 ldr(Rn, Address(Pn_base, 0)); 10796 mul(Rlo_mn, Rn, inv); 10797 subs(zr, Rlo_mn, -1); 10798 Label ok; 10799 br(EQ, ok); { 10800 stop("broken inverse in Montgomery multiply"); 10801 } bind(ok); 10802 } 10803 #endif 10804 10805 mov(Pm_base, Ra); 10806 10807 mov(t0, zr); 10808 mov(t1, zr); 10809 mov(t2, zr); 10810 10811 block_comment("for (int i = 0; i < len; i++) {"); 10812 mov(Ri, zr); { 10813 Label loop, end; 10814 cmpw(Ri, Rlen); 10815 br(Assembler::GE, end); 10816 10817 bind(loop); 10818 pre1(Ri); 10819 10820 block_comment(" for (j = i; j; j--) {"); { 10821 movw(Rj, Ri); 10822 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 10823 } block_comment(" } // j"); 10824 10825 post1(); 10826 addw(Ri, Ri, 1); 10827 cmpw(Ri, Rlen); 10828 br(Assembler::LT, loop); 10829 bind(end); 10830 block_comment("} // i"); 10831 } 10832 10833 block_comment("for (int i = len; i < 2*len; i++) {"); 10834 mov(Ri, Rlen); { 10835 Label loop, end; 10836 cmpw(Ri, Rlen, Assembler::LSL, 1); 10837 br(Assembler::GE, end); 10838 10839 bind(loop); 10840 pre2(Ri, Rlen); 10841 10842 block_comment(" for (j = len*2-i-1; j; j--) {"); { 10843 lslw(Rj, Rlen, 1); 10844 subw(Rj, Rj, Ri); 10845 subw(Rj, Rj, 1); 10846 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 10847 } block_comment(" } // j"); 10848 10849 post2(Ri, Rlen); 10850 addw(Ri, Ri, 1); 10851 cmpw(Ri, Rlen, Assembler::LSL, 1); 10852 br(Assembler::LT, loop); 10853 bind(end); 10854 } 10855 block_comment("} // i"); 10856 10857 normalize(Rlen); 10858 10859 mov(Ra, Pm_base); // Save Pm_base in Ra 10860 restore_regs(); // Restore caller's Pm_base 10861 10862 // Copy our result into caller's Pm_base 10863 reverse(Pm_base, Ra, Rlen, t0, t1); 10864 10865 leave(); 10866 bind(nothing); 10867 ret(lr); 10868 10869 return entry; 10870 } 10871 // In C, approximately: 10872 10873 // void 10874 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 10875 // julong Pn_base[], julong Pm_base[], 10876 // julong inv, int len) { 10877 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 10878 // julong *Pa, *Pb, *Pn, *Pm; 10879 // julong Ra, Rb, Rn, Rm; 10880 10881 // int i; 10882 10883 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 10884 10885 // for (i = 0; i < len; i++) { 10886 // int j; 10887 10888 // Pa = Pa_base; 10889 // Pb = Pb_base + i; 10890 // Pm = Pm_base; 10891 // Pn = Pn_base + i; 10892 10893 // Ra = *Pa; 10894 // Rb = *Pb; 10895 // Rm = *Pm; 10896 // Rn = *Pn; 10897 10898 // int iters = i; 10899 // for (j = 0; iters--; j++) { 10900 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 10901 // MACC(Ra, Rb, t0, t1, t2); 10902 // Ra = *++Pa; 10903 // Rb = *--Pb; 10904 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 10905 // MACC(Rm, Rn, t0, t1, t2); 10906 // Rm = *++Pm; 10907 // Rn = *--Pn; 10908 // } 10909 10910 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 10911 // MACC(Ra, Rb, t0, t1, t2); 10912 // *Pm = Rm = t0 * inv; 10913 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 10914 // MACC(Rm, Rn, t0, t1, t2); 10915 10916 // assert(t0 == 0, "broken Montgomery multiply"); 10917 10918 // t0 = t1; t1 = t2; t2 = 0; 10919 // } 10920 10921 // for (i = len; i < 2*len; i++) { 10922 // int j; 10923 10924 // Pa = Pa_base + i-len; 10925 // Pb = Pb_base + len; 10926 // Pm = Pm_base + i-len; 10927 // Pn = Pn_base + len; 10928 10929 // Ra = *++Pa; 10930 // Rb = 
*--Pb; 10931 // Rm = *++Pm; 10932 // Rn = *--Pn; 10933 10934 // int iters = len*2-i-1; 10935 // for (j = i-len+1; iters--; j++) { 10936 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 10937 // MACC(Ra, Rb, t0, t1, t2); 10938 // Ra = *++Pa; 10939 // Rb = *--Pb; 10940 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 10941 // MACC(Rm, Rn, t0, t1, t2); 10942 // Rm = *++Pm; 10943 // Rn = *--Pn; 10944 // } 10945 10946 // Pm_base[i-len] = t0; 10947 // t0 = t1; t1 = t2; t2 = 0; 10948 // } 10949 10950 // while (t0) 10951 // t0 = sub(Pm_base, Pn_base, t0, len); 10952 // } 10953 10954 /** 10955 * Fast Montgomery squaring. This uses asymptotically 25% fewer 10956 * multiplies than Montgomery multiplication so it should be up to 10957 * 25% faster. However, its loop control is more complex and it 10958 * may actually run slower on some machines. 10959 * 10960 * Arguments: 10961 * 10962 * Inputs: 10963 * c_rarg0 - int array elements a 10964 * c_rarg1 - int array elements n (the modulus) 10965 * c_rarg2 - int length 10966 * c_rarg3 - int inv 10967 * c_rarg4 - int array elements m (the result) 10968 * 10969 */ 10970 address generate_square() { 10971 Label argh; 10972 bind(argh); 10973 stop("MontgomeryMultiply total_allocation must be <= 8192"); 10974 10975 align(CodeEntryAlignment); 10976 address entry = pc(); 10977 10978 enter(); 10979 10980 // Make room. 10981 cmpw(Rlen, 512); 10982 br(Assembler::HI, argh); 10983 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 10984 andr(sp, Ra, -2 * wordSize); 10985 10986 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 10987 10988 { 10989 // Copy input args, reversing as we go. We use Ra as a 10990 // temporary variable. 10991 reverse(Ra, Pa_base, Rlen, t0, t1); 10992 reverse(Ra, Pn_base, Rlen, t0, t1); 10993 } 10994 10995 // Push all call-saved registers and also Pm_base which we'll need 10996 // at the end. 
10997 save_regs(); 10998 10999 mov(Pm_base, Ra); 11000 11001 mov(t0, zr); 11002 mov(t1, zr); 11003 mov(t2, zr); 11004 11005 block_comment("for (int i = 0; i < len; i++) {"); 11006 mov(Ri, zr); { 11007 Label loop, end; 11008 bind(loop); 11009 cmp(Ri, Rlen); 11010 br(Assembler::GE, end); 11011 11012 pre1(Ri); 11013 11014 block_comment("for (j = (i+1)/2; j; j--) {"); { 11015 add(Rj, Ri, 1); 11016 lsr(Rj, Rj, 1); 11017 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11018 } block_comment(" } // j"); 11019 11020 last_squaring(Ri); 11021 11022 block_comment(" for (j = i/2; j; j--) {"); { 11023 lsr(Rj, Ri, 1); 11024 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11025 } block_comment(" } // j"); 11026 11027 post1_squaring(); 11028 add(Ri, Ri, 1); 11029 cmp(Ri, Rlen); 11030 br(Assembler::LT, loop); 11031 11032 bind(end); 11033 block_comment("} // i"); 11034 } 11035 11036 block_comment("for (int i = len; i < 2*len; i++) {"); 11037 mov(Ri, Rlen); { 11038 Label loop, end; 11039 bind(loop); 11040 cmp(Ri, Rlen, Assembler::LSL, 1); 11041 br(Assembler::GE, end); 11042 11043 pre2(Ri, Rlen); 11044 11045 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 11046 lsl(Rj, Rlen, 1); 11047 sub(Rj, Rj, Ri); 11048 sub(Rj, Rj, 1); 11049 lsr(Rj, Rj, 1); 11050 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11051 } block_comment(" } // j"); 11052 11053 last_squaring(Ri); 11054 11055 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 11056 lsl(Rj, Rlen, 1); 11057 sub(Rj, Rj, Ri); 11058 lsr(Rj, Rj, 1); 11059 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11060 } block_comment(" } // j"); 11061 11062 post2(Ri, Rlen); 11063 add(Ri, Ri, 1); 11064 cmp(Ri, Rlen, Assembler::LSL, 1); 11065 11066 br(Assembler::LT, loop); 11067 bind(end); 11068 block_comment("} // i"); 11069 } 11070 11071 normalize(Rlen); 11072 11073 mov(Ra, Pm_base); // Save Pm_base in Ra 11074 restore_regs(); // Restore caller's Pm_base 11075 11076 // Copy our result into caller's Pm_base 11077 reverse(Pm_base, Ra, Rlen, t0, t1); 11078 11079 leave(); 11080 ret(lr); 11081 11082 return entry; 11083 } 11084 // In C, approximately: 11085 11086 // void 11087 // montgomery_square(julong Pa_base[], julong Pn_base[], 11088 // julong Pm_base[], julong inv, int len) { 11089 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11090 // julong *Pa, *Pb, *Pn, *Pm; 11091 // julong Ra, Rb, Rn, Rm; 11092 11093 // int i; 11094 11095 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11096 11097 // for (i = 0; i < len; i++) { 11098 // int j; 11099 11100 // Pa = Pa_base; 11101 // Pb = Pa_base + i; 11102 // Pm = Pm_base; 11103 // Pn = Pn_base + i; 11104 11105 // Ra = *Pa; 11106 // Rb = *Pb; 11107 // Rm = *Pm; 11108 // Rn = *Pn; 11109 11110 // int iters = (i+1)/2; 11111 // for (j = 0; iters--; j++) { 11112 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11113 // MACC2(Ra, Rb, t0, t1, t2); 11114 // Ra = *++Pa; 11115 // Rb = *--Pb; 11116 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11117 // MACC(Rm, Rn, t0, t1, t2); 11118 // Rm = *++Pm; 11119 // Rn = *--Pn; 11120 // } 11121 // if ((i & 1) == 0) { 11122 // assert(Ra == Pa_base[j], "must be"); 11123 // MACC(Ra, Ra, t0, t1, t2); 11124 // } 11125 // iters = i/2; 11126 // assert(iters == i-j, "must be"); 11127 // for (; iters--; j++) { 11128 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11129 // MACC(Rm, Rn, t0, t1, t2); 11130 // Rm = *++Pm; 11131 // Rn = *--Pn; 11132 // } 11133 11134 // 
*Pm = Rm = t0 * inv; 11135 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11136 // MACC(Rm, Rn, t0, t1, t2); 11137 11138 // assert(t0 == 0, "broken Montgomery multiply"); 11139 11140 // t0 = t1; t1 = t2; t2 = 0; 11141 // } 11142 11143 // for (i = len; i < 2*len; i++) { 11144 // int start = i-len+1; 11145 // int end = start + (len - start)/2; 11146 // int j; 11147 11148 // Pa = Pa_base + i-len; 11149 // Pb = Pa_base + len; 11150 // Pm = Pm_base + i-len; 11151 // Pn = Pn_base + len; 11152 11153 // Ra = *++Pa; 11154 // Rb = *--Pb; 11155 // Rm = *++Pm; 11156 // Rn = *--Pn; 11157 11158 // int iters = (2*len-i-1)/2; 11159 // assert(iters == end-start, "must be"); 11160 // for (j = start; iters--; j++) { 11161 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11162 // MACC2(Ra, Rb, t0, t1, t2); 11163 // Ra = *++Pa; 11164 // Rb = *--Pb; 11165 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11166 // MACC(Rm, Rn, t0, t1, t2); 11167 // Rm = *++Pm; 11168 // Rn = *--Pn; 11169 // } 11170 // if ((i & 1) == 0) { 11171 // assert(Ra == Pa_base[j], "must be"); 11172 // MACC(Ra, Ra, t0, t1, t2); 11173 // } 11174 // iters = (2*len-i)/2; 11175 // assert(iters == len-j, "must be"); 11176 // for (; iters--; j++) { 11177 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11178 // MACC(Rm, Rn, t0, t1, t2); 11179 // Rm = *++Pm; 11180 // Rn = *--Pn; 11181 // } 11182 // Pm_base[i-len] = t0; 11183 // t0 = t1; t1 = t2; t2 = 0; 11184 // } 11185 11186 // while (t0) 11187 // t0 = sub(Pm_base, Pn_base, t0, len); 11188 // } 11189 }; 11190 11191 void generate_vector_math_stubs() { 11192 // Get native vector math stub routine addresses 11193 void* libsleef = nullptr; 11194 char ebuf[1024]; 11195 char dll_name[JVM_MAXPATHLEN]; 11196 if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) { 11197 libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf); 11198 } 11199 if (libsleef == nullptr) { 11200 log_info(library)("Failed to load native vector math library, %s!", ebuf); 11201 return; 11202 } 11203 // Method naming convention 11204 // All the methods are named as <OP><T><N>_<U><suffix> 11205 // Where: 11206 // <OP> is the operation name, e.g. sin 11207 // <T> is optional to indicate float/double 11208 // "f/d" for vector float/double operation 11209 // <N> is the number of elements in the vector 11210 // "2/4" for neon, and "x" for sve 11211 // <U> is the precision level 11212 // "u10/u05" represents 1.0/0.5 ULP error bounds 11213 // We use "u10" for all operations by default 11214 // But for those functions do not have u10 support, we use "u05" instead 11215 // <suffix> indicates neon/sve 11216 // "sve/advsimd" for sve/neon implementations 11217 // e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions 11218 // cosd2_u10advsimd is the method for computing 2 elements vector double cos using NEON instructions 11219 // 11220 log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef)); 11221 11222 // Math vector stubs implemented with SVE for scalable vector size. 11223 if (UseSVE > 0) { 11224 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 11225 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 11226 // Skip "tanh" because there is performance regression 11227 if (vop == VectorSupport::VECTOR_OP_TANH) { 11228 continue; 11229 } 11230 11231 // The native library does not support u10 level of "hypot". 
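      // For example, for the "sin" entry with ulf == "u10", the two lookups
      // below build and resolve "sinfx_u10sve" and "sindx_u10sve".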
11232 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 11233 11234 snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf); 11235 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 11236 11237 snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf); 11238 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 11239 } 11240 } 11241 11242 // Math vector stubs implemented with NEON for 64/128 bits vector size. 11243 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 11244 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 11245 // Skip "tanh" because there is performance regression 11246 if (vop == VectorSupport::VECTOR_OP_TANH) { 11247 continue; 11248 } 11249 11250 // The native library does not support u10 level of "hypot". 11251 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 11252 11253 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 11254 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf); 11255 11256 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 11257 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 11258 11259 snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf); 11260 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 11261 } 11262 } 11263 11264 // Call here from the interpreter or compiled code to either load 11265 // multiple returned values from the inline type instance being 11266 // returned to registers or to store returned values to a newly 11267 // allocated inline type instance. 11268 address generate_return_value_stub(address destination, const char* name, bool has_res) { 11269 // We need to save all registers the calling convention may use so 11270 // the runtime calls read or update those registers. This needs to 11271 // be in sync with SharedRuntime::java_return_convention(). 11272 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0 11273 enum layout { 11274 j_rarg7_off = 0, j_rarg7_2, // j_rarg7 is r0 11275 j_rarg6_off, j_rarg6_2, 11276 j_rarg5_off, j_rarg5_2, 11277 j_rarg4_off, j_rarg4_2, 11278 j_rarg3_off, j_rarg3_2, 11279 j_rarg2_off, j_rarg2_2, 11280 j_rarg1_off, j_rarg1_2, 11281 j_rarg0_off, j_rarg0_2, 11282 11283 j_farg7_off, j_farg7_2, 11284 j_farg6_off, j_farg6_2, 11285 j_farg5_off, j_farg5_2, 11286 j_farg4_off, j_farg4_2, 11287 j_farg3_off, j_farg3_2, 11288 j_farg2_off, j_farg2_2, 11289 j_farg1_off, j_farg1_2, 11290 j_farg0_off, j_farg0_2, 11291 11292 rfp_off, rfp_off2, 11293 return_off, return_off2, 11294 11295 framesize // inclusive of return address 11296 }; 11297 11298 CodeBuffer code(name, 512, 64); 11299 MacroAssembler* masm = new MacroAssembler(&code); 11300 11301 int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16); 11302 assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned"); 11303 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt; 11304 int frame_size_in_words = frame_size_in_bytes / wordSize; 11305 11306 OopMapSet* oop_maps = new OopMapSet(); 11307 OopMap* map = new OopMap(frame_size_in_slots, 0); 11308 11309 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg()); 11310 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg()); 11311 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg()); 11312 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg()); 11313 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg()); 11314 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg()); 11315 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg()); 11316 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg()); 11317 11318 map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg()); 11319 map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg()); 11320 map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg()); 11321 map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg()); 11322 map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg()); 11323 map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg()); 11324 map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg()); 11325 map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg()); 11326 11327 address start = __ pc(); 11328 11329 __ enter(); // Save FP and LR before call 11330 11331 __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize))); 11332 __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize))); 11333 __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize))); 11334 __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize))); 11335 11336 __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize))); 11337 __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize))); 11338 __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize))); 11339 __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize))); 11340 11341 int frame_complete = __ offset(); 11342 11343 // Set up last_Java_sp and last_Java_fp 11344 address the_pc = __ pc(); 11345 __ set_last_Java_frame(sp, noreg, the_pc, rscratch1); 11346 11347 // Call runtime 11348 __ mov(c_rarg1, r0); 11349 __ mov(c_rarg0, rthread); 11350 11351 __ mov(rscratch1, destination); 11352 __ 
blr(rscratch1); 11353 11354 oop_maps->add_gc_map(the_pc - start, map); 11355 11356 __ reset_last_Java_frame(false); 11357 11358 __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize))); 11359 __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize))); 11360 __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize))); 11361 __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize))); 11362 11363 __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize))); 11364 __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize))); 11365 __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize))); 11366 __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize))); 11367 11368 __ leave(); 11369 11370 // check for pending exceptions 11371 Label pending; 11372 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 11373 __ cbnz(rscratch1, pending); 11374 11375 if (has_res) { 11376 __ get_vm_result_oop(r0, rthread); 11377 } 11378 11379 __ ret(lr); 11380 11381 __ bind(pending); 11382 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 11383 11384 // ------------- 11385 // make sure all code is generated 11386 masm->flush(); 11387 11388 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false); 11389 return stub->entry_point(); 11390 } 11391 11392 // Initialization 11393 void generate_initial_stubs() { 11394 // Generates initial stubs and initializes the entry points 11395 11396 // entry points that exist in all platforms. Note: This is code 11397 // that could be shared among different platforms - however the 11398 // benefit seems to be smaller than the disadvantage of having a 11399 // much more complicated generator structure. See also comment in 11400 // stubRoutines.hpp. 11401 11402 StubRoutines::_forward_exception_entry = generate_forward_exception(); 11403 11404 StubRoutines::_call_stub_entry = 11405 generate_call_stub(StubRoutines::_call_stub_return_address); 11406 11407 // is referenced by megamorphic call 11408 StubRoutines::_catch_exception_entry = generate_catch_exception(); 11409 11410 // Initialize table for copy memory (arraycopy) check.
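    // (The table records the code ranges of the unsafe copy/set stubs so that
    //  faults raised inside them can be handled instead of crashing the VM.)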
11411 if (UnsafeMemoryAccess::_table == nullptr) { 11412 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory 11413 } 11414 11415 if (UseCRC32Intrinsics) { 11416 // set table address before stub generation which uses it 11417 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 11418 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 11419 } 11420 11421 if (UseCRC32CIntrinsics) { 11422 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 11423 } 11424 11425 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 11426 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 11427 } 11428 11429 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 11430 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 11431 } 11432 11433 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) && 11434 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) { 11435 StubRoutines::_hf2f = generate_float16ToFloat(); 11436 StubRoutines::_f2hf = generate_floatToFloat16(); 11437 } 11438 11439 if (InlineTypeReturnedAsFields) { 11440 StubRoutines::_load_inline_type_fields_in_regs = 11441 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false); 11442 StubRoutines::_store_inline_type_fields_to_buf = 11443 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true); 11444 } 11445 11446 } 11447 11448 void generate_continuation_stubs() { 11449 // Continuation stubs: 11450 StubRoutines::_cont_thaw = generate_cont_thaw(); 11451 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 11452 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 11453 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub(); 11454 } 11455 11456 void generate_final_stubs() { 11457 // support for verify_oop (must happen after universe_init) 11458 if (VerifyOops) { 11459 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 11460 } 11461 11462 // arraycopy stubs used by compilers 11463 generate_arraycopy_stubs(); 11464 11465 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 11466 11467 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 11468 11469 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 11470 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 11471 11472 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 11473 11474 generate_atomic_entry_points(); 11475 11476 #endif // LINUX 11477 11478 #ifdef COMPILER2 11479 if (UseSecondarySupersTable) { 11480 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub(); 11481 if (!InlineSecondarySupersTest) { 11482 generate_lookup_secondary_supers_table_stub(); 11483 } 11484 } 11485 #endif 11486 11487 StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated 11488 } 11489 11490 void generate_compiler_stubs() { 11491 #if COMPILER2_OR_JVMCI 11492 11493 if (UseSVE == 0) { 11494 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id); 11495 } 11496 11497 // array equals stub for large arrays.
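    // (Only generated when UseSimpleArrayEquals is off; otherwise the comparison is
    //  presumably expanded inline and no separate stub is needed.)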
11498 if (!UseSimpleArrayEquals) { 11499 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 11500 } 11501 11502 // arrays_hashcode stub for large arrays. 11503 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN); 11504 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE); 11505 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR); 11506 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT); 11507 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT); 11508 11509 // byte_array_inflate stub for large arrays. 11510 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 11511 11512 // countPositives stub for large arrays. 11513 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long); 11514 11515 generate_compare_long_strings(); 11516 11517 generate_string_indexof_stubs(); 11518 11519 #ifdef COMPILER2 11520 if (UseMultiplyToLenIntrinsic) { 11521 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 11522 } 11523 11524 if (UseSquareToLenIntrinsic) { 11525 StubRoutines::_squareToLen = generate_squareToLen(); 11526 } 11527 11528 if (UseMulAddIntrinsic) { 11529 StubRoutines::_mulAdd = generate_mulAdd(); 11530 } 11531 11532 if (UseSIMDForBigIntegerShiftIntrinsics) { 11533 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 11534 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 11535 } 11536 11537 if (UseMontgomeryMultiplyIntrinsic) { 11538 StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id; 11539 StubCodeMark mark(this, stub_id); 11540 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 11541 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 11542 } 11543 11544 if (UseMontgomerySquareIntrinsic) { 11545 StubGenStubId stub_id = StubGenStubId::montgomerySquare_id; 11546 StubCodeMark mark(this, stub_id); 11547 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 11548 // We use generate_multiply() rather than generate_square() 11549 // because it's faster for the sizes of modulus we care about.
11550 StubRoutines::_montgomerySquare = g.generate_multiply(); 11551 } 11552 11553 generate_vector_math_stubs(); 11554 11555 #endif // COMPILER2 11556 11557 if (UseChaCha20Intrinsics) { 11558 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar(); 11559 } 11560 11561 if (UseKyberIntrinsics) { 11562 StubRoutines::_kyberNtt = generate_kyberNtt(); 11563 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt(); 11564 StubRoutines::_kyberNttMult = generate_kyberNttMult(); 11565 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2(); 11566 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3(); 11567 StubRoutines::_kyber12To16 = generate_kyber12To16(); 11568 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce(); 11569 } 11570 11571 if (UseDilithiumIntrinsics) { 11572 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt(); 11573 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt(); 11574 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult(); 11575 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant(); 11576 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly(); 11577 } 11578 11579 if (UseBASE64Intrinsics) { 11580 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 11581 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 11582 } 11583 11584 // data cache line writeback 11585 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 11586 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 11587 11588 if (UseAESIntrinsics) { 11589 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 11590 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 11591 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 11592 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 11593 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 11594 } 11595 if (UseGHASHIntrinsics) { 11596 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 11597 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 11598 } 11599 if (UseAESIntrinsics && UseGHASHIntrinsics) { 11600 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 11601 } 11602 11603 if (UseMD5Intrinsics) { 11604 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubGenStubId::md5_implCompress_id); 11605 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id); 11606 } 11607 if (UseSHA1Intrinsics) { 11608 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id); 11609 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id); 11610 } 11611 if (UseSHA256Intrinsics) { 11612 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id); 11613 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id); 11614 } 11615 if (UseSHA512Intrinsics) { 11616 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id); 11617 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id); 11618 } 11619 if (UseSHA3Intrinsics) { 11620 
StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id); 11621 StubRoutines::_double_keccak = generate_double_keccak(); 11622 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id); 11623 } 11624 11625 if (UsePoly1305Intrinsics) { 11626 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 11627 } 11628 11629 // generate Adler32 intrinsics code 11630 if (UseAdler32Intrinsics) { 11631 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 11632 } 11633 11634 #endif // COMPILER2_OR_JVMCI 11635 } 11636 11637 public: 11638 StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) { 11639 switch(blob_id) { 11640 case initial_id: 11641 generate_initial_stubs(); 11642 break; 11643 case continuation_id: 11644 generate_continuation_stubs(); 11645 break; 11646 case compiler_id: 11647 generate_compiler_stubs(); 11648 break; 11649 case final_id: 11650 generate_final_stubs(); 11651 break; 11652 default: 11653 fatal("unexpected blob id: %d", blob_id); 11654 break; 11655 }; 11656 } 11657 }; // end class declaration 11658 11659 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) { 11660 StubGenerator g(code, blob_id); 11661 } 11662 11663 11664 #if defined (LINUX) 11665 11666 // Define pointers to atomic stubs and initialize them to point to the 11667 // code in atomic_aarch64.S. 11668 11669 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 11670 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 11671 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 11672 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 11673 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 11674 11675 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 11676 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 11677 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 11678 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 11679 DEFAULT_ATOMIC_OP(xchg, 4, ) 11680 DEFAULT_ATOMIC_OP(xchg, 8, ) 11681 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 11682 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 11683 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 11684 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 11685 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 11686 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 11687 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 11688 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 11689 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 11690 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 11691 11692 #undef DEFAULT_ATOMIC_OP 11693 11694 #endif // LINUX
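// For reference, an illustrative expansion of one of the instantiations above,
// mechanically derived from the DEFAULT_ATOMIC_OP macro definition:
//
//   DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
//
// expands to
//
//   extern "C" uint64_t aarch64_atomic_cmpxchg_4_release_default_impl
//       (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_release_impl
//       = aarch64_atomic_cmpxchg_4_release_default_impl;
//
// i.e. each stub pointer initially targets the default implementation in
// atomic_aarch64.S and may later be repointed to a generated stub (see
// generate_atomic_entry_points() above).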