1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "asm/register.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "compiler/oopMap.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "gc/shared/gc_globals.hpp" 34 #include "gc/shared/tlab_globals.hpp" 35 #include "interpreter/interpreter.hpp" 36 #include "memory/universe.hpp" 37 #include "nativeInst_aarch64.hpp" 38 #include "oops/instanceOop.hpp" 39 #include "oops/method.hpp" 40 #include "oops/objArrayKlass.hpp" 41 #include "oops/oop.inline.hpp" 42 #include "prims/methodHandles.hpp" 43 #include "prims/upcallLinker.hpp" 44 #include "runtime/arguments.hpp" 45 #include "runtime/atomic.hpp" 46 #include "runtime/continuation.hpp" 47 #include "runtime/continuationEntry.inline.hpp" 48 #include "runtime/frame.inline.hpp" 49 #include "runtime/handles.inline.hpp" 50 #include "runtime/javaThread.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/stubCodeGenerator.hpp" 53 #include "runtime/stubRoutines.hpp" 54 #include "utilities/align.hpp" 55 #include "utilities/checkedCast.hpp" 56 #include "utilities/debug.hpp" 57 #include "utilities/globalDefinitions.hpp" 58 #include "utilities/intpow.hpp" 59 #include "utilities/powerOfTwo.hpp" 60 #ifdef COMPILER2 61 #include "opto/runtime.hpp" 62 #endif 63 #if INCLUDE_ZGC 64 #include "gc/z/zThreadLocalData.hpp" 65 #endif 66 67 // Declaration and definition of StubGenerator (no .hpp file). 
68 // For a more detailed description of the stub routine structure 69 // see the comment in stubRoutines.hpp 70 71 #undef __ 72 #define __ _masm-> 73 74 #ifdef PRODUCT 75 #define BLOCK_COMMENT(str) /* nothing */ 76 #else 77 #define BLOCK_COMMENT(str) __ block_comment(str) 78 #endif 79 80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 81 82 // Stub Code definitions 83 84 class StubGenerator: public StubCodeGenerator { 85 private: 86 87 #ifdef PRODUCT 88 #define inc_counter_np(counter) ((void)0) 89 #else 90 void inc_counter_np_(uint& counter) { 91 __ incrementw(ExternalAddress((address)&counter)); 92 } 93 #define inc_counter_np(counter) \ 94 BLOCK_COMMENT("inc_counter " #counter); \ 95 inc_counter_np_(counter); 96 #endif 97 98 // Call stubs are used to call Java from C 99 // 100 // Arguments: 101 // c_rarg0: call wrapper address address 102 // c_rarg1: result address 103 // c_rarg2: result type BasicType 104 // c_rarg3: method Method* 105 // c_rarg4: (interpreter) entry point address 106 // c_rarg5: parameters intptr_t* 107 // c_rarg6: parameter size (in words) int 108 // c_rarg7: thread Thread* 109 // 110 // There is no return from the stub itself as any Java result 111 // is written to result 112 // 113 // we save r30 (lr) as the return PC at the base of the frame and 114 // link r29 (fp) below it as the frame pointer installing sp (r31) 115 // into fp. 116 // 117 // we save r0-r7, which accounts for all the c arguments. 118 // 119 // TODO: strictly do we need to save them all? they are treated as 120 // volatile by C so could we omit saving the ones we are going to 121 // place in global registers (thread? method?) or those we only use 122 // during setup of the Java call? 123 // 124 // we don't need to save r8 which C uses as an indirect result location 125 // return register. 126 // 127 // we don't need to save r9-r15 which both C and Java treat as 128 // volatile 129 // 130 // we don't need to save r16-18 because Java does not use them 131 // 132 // we save r19-r28 which Java uses as scratch registers and C 133 // expects to be callee-save 134 // 135 // we save the bottom 64 bits of each value stored in v8-v15; it is 136 // the responsibility of the caller to preserve larger values. 137 // 138 // so the stub frame looks like this when we enter Java code 139 // 140 // [ return_from_Java ] <--- sp 141 // [ argument word n ] 142 // ... 
143 // -29 [ argument word 1 ] 144 // -28 [ saved Floating-point Control Register ] 145 // -26 [ saved v15 ] <--- sp_after_call 146 // -25 [ saved v14 ] 147 // -24 [ saved v13 ] 148 // -23 [ saved v12 ] 149 // -22 [ saved v11 ] 150 // -21 [ saved v10 ] 151 // -20 [ saved v9 ] 152 // -19 [ saved v8 ] 153 // -18 [ saved r28 ] 154 // -17 [ saved r27 ] 155 // -16 [ saved r26 ] 156 // -15 [ saved r25 ] 157 // -14 [ saved r24 ] 158 // -13 [ saved r23 ] 159 // -12 [ saved r22 ] 160 // -11 [ saved r21 ] 161 // -10 [ saved r20 ] 162 // -9 [ saved r19 ] 163 // -8 [ call wrapper (r0) ] 164 // -7 [ result (r1) ] 165 // -6 [ result type (r2) ] 166 // -5 [ method (r3) ] 167 // -4 [ entry point (r4) ] 168 // -3 [ parameters (r5) ] 169 // -2 [ parameter size (r6) ] 170 // -1 [ thread (r7) ] 171 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 172 // 1 [ saved lr (r30) ] 173 174 // Call stub stack layout word offsets from fp 175 enum call_stub_layout { 176 sp_after_call_off = -28, 177 178 fpcr_off = sp_after_call_off, 179 d15_off = -26, 180 d13_off = -24, 181 d11_off = -22, 182 d9_off = -20, 183 184 r28_off = -18, 185 r26_off = -16, 186 r24_off = -14, 187 r22_off = -12, 188 r20_off = -10, 189 call_wrapper_off = -8, 190 result_off = -7, 191 result_type_off = -6, 192 method_off = -5, 193 entry_point_off = -4, 194 parameter_size_off = -2, 195 thread_off = -1, 196 fp_f = 0, 197 retaddr_off = 1, 198 }; 199 200 address generate_call_stub(address& return_address) { 201 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 202 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 203 "adjust this code"); 204 205 StubGenStubId stub_id = StubGenStubId::call_stub_id; 206 StubCodeMark mark(this, stub_id); 207 address start = __ pc(); 208 209 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 210 211 const Address fpcr_save (rfp, fpcr_off * wordSize); 212 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 213 const Address result (rfp, result_off * wordSize); 214 const Address result_type (rfp, result_type_off * wordSize); 215 const Address method (rfp, method_off * wordSize); 216 const Address entry_point (rfp, entry_point_off * wordSize); 217 const Address parameter_size(rfp, parameter_size_off * wordSize); 218 219 const Address thread (rfp, thread_off * wordSize); 220 221 const Address d15_save (rfp, d15_off * wordSize); 222 const Address d13_save (rfp, d13_off * wordSize); 223 const Address d11_save (rfp, d11_off * wordSize); 224 const Address d9_save (rfp, d9_off * wordSize); 225 226 const Address r28_save (rfp, r28_off * wordSize); 227 const Address r26_save (rfp, r26_off * wordSize); 228 const Address r24_save (rfp, r24_off * wordSize); 229 const Address r22_save (rfp, r22_off * wordSize); 230 const Address r20_save (rfp, r20_off * wordSize); 231 232 // stub code 233 234 address aarch64_entry = __ pc(); 235 236 // set up frame and move sp to end of save area 237 __ enter(); 238 __ sub(sp, rfp, -sp_after_call_off * wordSize); 239 240 // save register parameters and Java scratch/global registers 241 // n.b. 
we save thread even though it gets installed in 242 // rthread because we want to sanity check rthread later 243 __ str(c_rarg7, thread); 244 __ strw(c_rarg6, parameter_size); 245 __ stp(c_rarg4, c_rarg5, entry_point); 246 __ stp(c_rarg2, c_rarg3, result_type); 247 __ stp(c_rarg0, c_rarg1, call_wrapper); 248 249 __ stp(r20, r19, r20_save); 250 __ stp(r22, r21, r22_save); 251 __ stp(r24, r23, r24_save); 252 __ stp(r26, r25, r26_save); 253 __ stp(r28, r27, r28_save); 254 255 __ stpd(v9, v8, d9_save); 256 __ stpd(v11, v10, d11_save); 257 __ stpd(v13, v12, d13_save); 258 __ stpd(v15, v14, d15_save); 259 260 __ get_fpcr(rscratch1); 261 __ str(rscratch1, fpcr_save); 262 // Set FPCR to the state we need. We do want Round to Nearest. We 263 // don't want non-IEEE rounding modes or floating-point traps. 264 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode 265 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12) 266 __ set_fpcr(rscratch1); 267 268 // install Java thread in global register now we have saved 269 // whatever value it held 270 __ mov(rthread, c_rarg7); 271 // And method 272 __ mov(rmethod, c_rarg3); 273 274 // set up the heapbase register 275 __ reinit_heapbase(); 276 277 #ifdef ASSERT 278 // make sure we have no pending exceptions 279 { 280 Label L; 281 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 282 __ cmp(rscratch1, (u1)NULL_WORD); 283 __ br(Assembler::EQ, L); 284 __ stop("StubRoutines::call_stub: entered with pending exception"); 285 __ BIND(L); 286 } 287 #endif 288 // pass parameters if any 289 __ mov(esp, sp); 290 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 291 __ andr(sp, rscratch1, -2 * wordSize); 292 293 BLOCK_COMMENT("pass parameters if any"); 294 Label parameters_done; 295 // parameter count is still in c_rarg6 296 // and parameter pointer identifying param 1 is in c_rarg5 297 __ cbzw(c_rarg6, parameters_done); 298 299 address loop = __ pc(); 300 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 301 __ subsw(c_rarg6, c_rarg6, 1); 302 __ push(rscratch1); 303 __ br(Assembler::GT, loop); 304 305 __ BIND(parameters_done); 306 307 // call Java entry -- passing methdoOop, and current sp 308 // rmethod: Method* 309 // r19_sender_sp: sender sp 310 BLOCK_COMMENT("call Java function"); 311 __ mov(r19_sender_sp, sp); 312 __ blr(c_rarg4); 313 314 // we do this here because the notify will already have been done 315 // if we get to the next instruction via an exception 316 // 317 // n.b. adding this instruction here affects the calculation of 318 // whether or not a routine returns to the call stub (used when 319 // doing stack walks) since the normal test is to check the return 320 // pc against the address saved below. so we may need to allow for 321 // this extra instruction in the check. 322 323 // save current address for use by exception handling code 324 325 return_address = __ pc(); 326 327 // store result depending on type (everything that is not 328 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 329 // n.b. 
this assumes Java returns an integral result in r0 330 // and a floating result in j_farg0 331 __ ldr(j_rarg2, result); 332 Label is_long, is_float, is_double, exit; 333 __ ldr(j_rarg1, result_type); 334 __ cmp(j_rarg1, (u1)T_OBJECT); 335 __ br(Assembler::EQ, is_long); 336 __ cmp(j_rarg1, (u1)T_LONG); 337 __ br(Assembler::EQ, is_long); 338 __ cmp(j_rarg1, (u1)T_FLOAT); 339 __ br(Assembler::EQ, is_float); 340 __ cmp(j_rarg1, (u1)T_DOUBLE); 341 __ br(Assembler::EQ, is_double); 342 343 // handle T_INT case 344 __ strw(r0, Address(j_rarg2)); 345 346 __ BIND(exit); 347 348 // pop parameters 349 __ sub(esp, rfp, -sp_after_call_off * wordSize); 350 351 #ifdef ASSERT 352 // verify that threads correspond 353 { 354 Label L, S; 355 __ ldr(rscratch1, thread); 356 __ cmp(rthread, rscratch1); 357 __ br(Assembler::NE, S); 358 __ get_thread(rscratch1); 359 __ cmp(rthread, rscratch1); 360 __ br(Assembler::EQ, L); 361 __ BIND(S); 362 __ stop("StubRoutines::call_stub: threads must correspond"); 363 __ BIND(L); 364 } 365 #endif 366 367 __ pop_cont_fastpath(rthread); 368 369 // restore callee-save registers 370 __ ldpd(v15, v14, d15_save); 371 __ ldpd(v13, v12, d13_save); 372 __ ldpd(v11, v10, d11_save); 373 __ ldpd(v9, v8, d9_save); 374 375 __ ldp(r28, r27, r28_save); 376 __ ldp(r26, r25, r26_save); 377 __ ldp(r24, r23, r24_save); 378 __ ldp(r22, r21, r22_save); 379 __ ldp(r20, r19, r20_save); 380 381 // restore fpcr 382 __ ldr(rscratch1, fpcr_save); 383 __ set_fpcr(rscratch1); 384 385 __ ldp(c_rarg0, c_rarg1, call_wrapper); 386 __ ldrw(c_rarg2, result_type); 387 __ ldr(c_rarg3, method); 388 __ ldp(c_rarg4, c_rarg5, entry_point); 389 __ ldp(c_rarg6, c_rarg7, parameter_size); 390 391 // leave frame and return to caller 392 __ leave(); 393 __ ret(lr); 394 395 // handle return types different from T_INT 396 397 __ BIND(is_long); 398 __ str(r0, Address(j_rarg2, 0)); 399 __ br(Assembler::AL, exit); 400 401 __ BIND(is_float); 402 __ strs(j_farg0, Address(j_rarg2, 0)); 403 __ br(Assembler::AL, exit); 404 405 __ BIND(is_double); 406 __ strd(j_farg0, Address(j_rarg2, 0)); 407 __ br(Assembler::AL, exit); 408 409 return start; 410 } 411 412 // Return point for a Java call if there's an exception thrown in 413 // Java code. The exception is caught and transformed into a 414 // pending exception stored in JavaThread that can be tested from 415 // within the VM. 416 // 417 // Note: Usually the parameters are removed by the callee. In case 418 // of an exception crossing an activation frame boundary, that is 419 // not the case if the callee is compiled code => need to setup the 420 // rsp. 
421 // 422 // r0: exception oop 423 424 address generate_catch_exception() { 425 StubGenStubId stub_id = StubGenStubId::catch_exception_id; 426 StubCodeMark mark(this, stub_id); 427 address start = __ pc(); 428 429 // same as in generate_call_stub(): 430 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 431 const Address thread (rfp, thread_off * wordSize); 432 433 #ifdef ASSERT 434 // verify that threads correspond 435 { 436 Label L, S; 437 __ ldr(rscratch1, thread); 438 __ cmp(rthread, rscratch1); 439 __ br(Assembler::NE, S); 440 __ get_thread(rscratch1); 441 __ cmp(rthread, rscratch1); 442 __ br(Assembler::EQ, L); 443 __ bind(S); 444 __ stop("StubRoutines::catch_exception: threads must correspond"); 445 __ bind(L); 446 } 447 #endif 448 449 // set pending exception 450 __ verify_oop(r0); 451 452 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 453 __ mov(rscratch1, (address)__FILE__); 454 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 455 __ movw(rscratch1, (int)__LINE__); 456 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 457 458 // complete return to VM 459 assert(StubRoutines::_call_stub_return_address != nullptr, 460 "_call_stub_return_address must have been generated before"); 461 __ b(StubRoutines::_call_stub_return_address); 462 463 return start; 464 } 465 466 // Continuation point for runtime calls returning with a pending 467 // exception. The pending exception check happened in the runtime 468 // or native call stub. The pending exception in Thread is 469 // converted into a Java-level exception. 470 // 471 // Contract with Java-level exception handlers: 472 // r0: exception 473 // r3: throwing pc 474 // 475 // NOTE: At entry of this stub, exception-pc must be in LR !! 476 477 // NOTE: this is always used as a jump target within generated code 478 // so it just needs to be generated code with no x86 prolog 479 480 address generate_forward_exception() { 481 StubGenStubId stub_id = StubGenStubId::forward_exception_id; 482 StubCodeMark mark(this, stub_id); 483 address start = __ pc(); 484 485 // Upon entry, LR points to the return address returning into 486 // Java (interpreted or compiled) code; i.e., the return address 487 // becomes the throwing pc. 488 // 489 // Arguments pushed before the runtime call are still on the stack 490 // but the exception handler will reset the stack pointer -> 491 // ignore them. A potential result in registers can be ignored as 492 // well. 493 494 #ifdef ASSERT 495 // make sure this code is only executed if there is a pending exception 496 { 497 Label L; 498 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 499 __ cbnz(rscratch1, L); 500 __ stop("StubRoutines::forward exception: no pending exception (1)"); 501 __ bind(L); 502 } 503 #endif 504 505 // compute exception handler into r19 506 507 // call the VM to find the handler address associated with the 508 // caller address. pass thread in r0 and caller pc (ret address) 509 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 510 // the stack. 511 __ mov(c_rarg1, lr); 512 // lr will be trashed by the VM call so we move it to R19 513 // (callee-saved) because we also need to pass it to the handler 514 // returned by this call. 
515 __ mov(r19, lr); 516 BLOCK_COMMENT("call exception_handler_for_return_address"); 517 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 518 SharedRuntime::exception_handler_for_return_address), 519 rthread, c_rarg1); 520 // Reinitialize the ptrue predicate register, in case the external runtime 521 // call clobbers ptrue reg, as we may return to SVE compiled code. 522 __ reinitialize_ptrue(); 523 524 // we should not really care that lr is no longer the callee 525 // address. we saved the value the handler needs in r19 so we can 526 // just copy it to r3. however, the C2 handler will push its own 527 // frame and then calls into the VM and the VM code asserts that 528 // the PC for the frame above the handler belongs to a compiled 529 // Java method. So, we restore lr here to satisfy that assert. 530 __ mov(lr, r19); 531 // setup r0 & r3 & clear pending exception 532 __ mov(r3, r19); 533 __ mov(r19, r0); 534 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 535 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 536 537 #ifdef ASSERT 538 // make sure exception is set 539 { 540 Label L; 541 __ cbnz(r0, L); 542 __ stop("StubRoutines::forward exception: no pending exception (2)"); 543 __ bind(L); 544 } 545 #endif 546 547 // continue at exception handler 548 // r0: exception 549 // r3: throwing pc 550 // r19: exception handler 551 __ verify_oop(r0); 552 __ br(r19); 553 554 return start; 555 } 556 557 // Non-destructive plausibility checks for oops 558 // 559 // Arguments: 560 // r0: oop to verify 561 // rscratch1: error message 562 // 563 // Stack after saving c_rarg3: 564 // [tos + 0]: saved c_rarg3 565 // [tos + 1]: saved c_rarg2 566 // [tos + 2]: saved lr 567 // [tos + 3]: saved rscratch2 568 // [tos + 4]: saved r0 569 // [tos + 5]: saved rscratch1 570 address generate_verify_oop() { 571 StubGenStubId stub_id = StubGenStubId::verify_oop_id; 572 StubCodeMark mark(this, stub_id); 573 address start = __ pc(); 574 575 Label exit, error; 576 577 // save c_rarg2 and c_rarg3 578 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 579 580 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 581 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 582 __ ldr(c_rarg3, Address(c_rarg2)); 583 __ add(c_rarg3, c_rarg3, 1); 584 __ str(c_rarg3, Address(c_rarg2)); 585 586 // object is in r0 587 // make sure object is 'reasonable' 588 __ cbz(r0, exit); // if obj is null it is OK 589 590 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 591 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 592 593 // return if everything seems ok 594 __ bind(exit); 595 596 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 597 __ ret(lr); 598 599 // handle errors 600 __ bind(error); 601 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 602 603 __ push(RegSet::range(r0, r29), sp); 604 // debug(char* msg, int64_t pc, int64_t regs[]) 605 __ mov(c_rarg0, rscratch1); // pass address of error message 606 __ mov(c_rarg1, lr); // pass return address 607 __ mov(c_rarg2, sp); // pass address of regs on stack 608 #ifndef PRODUCT 609 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 610 #endif 611 BLOCK_COMMENT("call MacroAssembler::debug"); 612 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 613 __ blr(rscratch1); 614 __ hlt(0); 615 616 return start; 617 } 618 619 // Generate indices for iota vector. 
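  // Each 16-byte row emitted below packs the lane indices {0, 1, 2, ...}
  // for one element size (B, H, S, D), followed by float {0.0f .. 3.0f}
  // and double {0.0, 1.0} rows; e.g. the H row 0x0003000200010000 /
  // 0x0007000600050004 holds halfword lanes 0..7, with lane 0 in the
  // least significant halfword.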
620 address generate_iota_indices(StubGenStubId stub_id) { 621 __ align(CodeEntryAlignment); 622 StubCodeMark mark(this, stub_id); 623 address start = __ pc(); 624 // B 625 __ emit_data64(0x0706050403020100, relocInfo::none); 626 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 627 // H 628 __ emit_data64(0x0003000200010000, relocInfo::none); 629 __ emit_data64(0x0007000600050004, relocInfo::none); 630 // S 631 __ emit_data64(0x0000000100000000, relocInfo::none); 632 __ emit_data64(0x0000000300000002, relocInfo::none); 633 // D 634 __ emit_data64(0x0000000000000000, relocInfo::none); 635 __ emit_data64(0x0000000000000001, relocInfo::none); 636 // S - FP 637 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 638 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 639 // D - FP 640 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 641 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 642 return start; 643 } 644 645 // The inner part of zero_words(). This is the bulk operation, 646 // zeroing words in blocks, possibly using DC ZVA to do it. The 647 // caller is responsible for zeroing the last few words. 648 // 649 // Inputs: 650 // r10: the HeapWord-aligned base address of an array to zero. 651 // r11: the count in HeapWords, r11 > 0. 652 // 653 // Returns r10 and r11, adjusted for the caller to clear. 654 // r10: the base address of the tail of words left to clear. 655 // r11: the number of words in the tail. 656 // r11 < MacroAssembler::zero_words_block_size. 657 658 address generate_zero_blocks() { 659 Label done; 660 Label base_aligned; 661 662 Register base = r10, cnt = r11; 663 664 __ align(CodeEntryAlignment); 665 StubGenStubId stub_id = StubGenStubId::zero_blocks_id; 666 StubCodeMark mark(this, stub_id); 667 address start = __ pc(); 668 669 if (UseBlockZeroing) { 670 int zva_length = VM_Version::zva_length(); 671 672 // Ensure ZVA length can be divided by 16. This is required by 673 // the subsequent operations. 674 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 675 676 __ tbz(base, 3, base_aligned); 677 __ str(zr, Address(__ post(base, 8))); 678 __ sub(cnt, cnt, 1); 679 __ bind(base_aligned); 680 681 // Ensure count >= zva_length * 2 so that it still deserves a zva after 682 // alignment. 683 Label small; 684 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 685 __ subs(rscratch1, cnt, low_limit >> 3); 686 __ br(Assembler::LT, small); 687 __ zero_dcache_blocks(base, cnt); 688 __ bind(small); 689 } 690 691 { 692 // Number of stp instructions we'll unroll 693 const int unroll = 694 MacroAssembler::zero_words_block_size / 2; 695 // Clear the remaining blocks. 696 Label loop; 697 __ subs(cnt, cnt, unroll * 2); 698 __ br(Assembler::LT, done); 699 __ bind(loop); 700 for (int i = 0; i < unroll; i++) 701 __ stp(zr, zr, __ post(base, 16)); 702 __ subs(cnt, cnt, unroll * 2); 703 __ br(Assembler::GE, loop); 704 __ bind(done); 705 __ add(cnt, cnt, unroll * 2); 706 } 707 708 __ ret(lr); 709 710 return start; 711 } 712 713 714 typedef enum { 715 copy_forwards = 1, 716 copy_backwards = -1 717 } copy_direction; 718 719 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 720 // for arraycopy stubs. 
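  // A typical use, as in the copy stubs below: construct one helper per stub
  // with the decorator set, element type and GC temp registers, then route
  // all paired loads and stores through it so any barrier logic is applied
  // uniformly, e.g.
  //
  //   ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  //   bs.copy_load_at_16(t0, t1, Address(s, 0));
  //   bs.copy_store_at_16(Address(d, 0), t0, t1);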
721 class ArrayCopyBarrierSetHelper : StackObj { 722 BarrierSetAssembler* _bs_asm; 723 MacroAssembler* _masm; 724 DecoratorSet _decorators; 725 BasicType _type; 726 Register _gct1; 727 Register _gct2; 728 Register _gct3; 729 FloatRegister _gcvt1; 730 FloatRegister _gcvt2; 731 FloatRegister _gcvt3; 732 733 public: 734 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 735 DecoratorSet decorators, 736 BasicType type, 737 Register gct1, 738 Register gct2, 739 Register gct3, 740 FloatRegister gcvt1, 741 FloatRegister gcvt2, 742 FloatRegister gcvt3) 743 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 744 _masm(masm), 745 _decorators(decorators), 746 _type(type), 747 _gct1(gct1), 748 _gct2(gct2), 749 _gct3(gct3), 750 _gcvt1(gcvt1), 751 _gcvt2(gcvt2), 752 _gcvt3(gcvt3) { 753 } 754 755 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 756 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 757 dst1, dst2, src, 758 _gct1, _gct2, _gcvt1); 759 } 760 761 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 762 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 763 dst, src1, src2, 764 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 765 } 766 767 void copy_load_at_16(Register dst1, Register dst2, Address src) { 768 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 769 dst1, dst2, src, 770 _gct1); 771 } 772 773 void copy_store_at_16(Address dst, Register src1, Register src2) { 774 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 775 dst, src1, src2, 776 _gct1, _gct2, _gct3); 777 } 778 779 void copy_load_at_8(Register dst, Address src) { 780 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 781 dst, noreg, src, 782 _gct1); 783 } 784 785 void copy_store_at_8(Address dst, Register src) { 786 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 787 dst, src, noreg, 788 _gct1, _gct2, _gct3); 789 } 790 }; 791 792 // Bulk copy of blocks of 8 words. 793 // 794 // count is a count of words. 795 // 796 // Precondition: count >= 8 797 // 798 // Postconditions: 799 // 800 // The least significant bit of count contains the remaining count 801 // of words to copy. The rest of count is trash. 802 // 803 // s and d are adjusted to point to the remaining words to copy 804 // 805 void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) { 806 BasicType type; 807 copy_direction direction; 808 809 switch (stub_id) { 810 case copy_byte_f_id: 811 direction = copy_forwards; 812 type = T_BYTE; 813 break; 814 case copy_byte_b_id: 815 direction = copy_backwards; 816 type = T_BYTE; 817 break; 818 case copy_oop_f_id: 819 direction = copy_forwards; 820 type = T_OBJECT; 821 break; 822 case copy_oop_b_id: 823 direction = copy_backwards; 824 type = T_OBJECT; 825 break; 826 case copy_oop_uninit_f_id: 827 direction = copy_forwards; 828 type = T_OBJECT; 829 break; 830 case copy_oop_uninit_b_id: 831 direction = copy_backwards; 832 type = T_OBJECT; 833 break; 834 default: 835 ShouldNotReachHere(); 836 } 837 838 int unit = wordSize * direction; 839 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize; 840 841 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 842 t4 = r7, t5 = r11, t6 = r12, t7 = r13; 843 const Register stride = r14; 844 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 845 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 846 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 847 848 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7); 849 assert_different_registers(s, d, count, rscratch1, rscratch2); 850 851 Label again, drain; 852 853 __ align(CodeEntryAlignment); 854 855 StubCodeMark mark(this, stub_id); 856 857 __ bind(start); 858 859 Label unaligned_copy_long; 860 if (AvoidUnalignedAccesses) { 861 __ tbnz(d, 3, unaligned_copy_long); 862 } 863 864 if (direction == copy_forwards) { 865 __ sub(s, s, bias); 866 __ sub(d, d, bias); 867 } 868 869 #ifdef ASSERT 870 // Make sure we are never given < 8 words 871 { 872 Label L; 873 __ cmp(count, (u1)8); 874 __ br(Assembler::GE, L); 875 __ stop("genrate_copy_longs called with < 8 words"); 876 __ bind(L); 877 } 878 #endif 879 880 // Fill 8 registers 881 if (UseSIMDForMemoryOps) { 882 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 883 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 884 } else { 885 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 886 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 887 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 888 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 889 } 890 891 __ subs(count, count, 16); 892 __ br(Assembler::LO, drain); 893 894 int prefetch = PrefetchCopyIntervalInBytes; 895 bool use_stride = false; 896 if (direction == copy_backwards) { 897 use_stride = prefetch > 256; 898 prefetch = -prefetch; 899 if (use_stride) __ mov(stride, prefetch); 900 } 901 902 __ bind(again); 903 904 if (PrefetchCopyIntervalInBytes > 0) 905 __ prfm(use_stride ? 
                 Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
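      //
      // Concretely, for the forwards case the code below first does
      // s -= 16; d -= 8, so with unit == wordSize == 8 each 64-byte block
      // is written as
      //
      //   str t0     at d + 1*unit        (word 0 of the block)
      //   stp t1, t2 at d + 2*unit        (words 1 and 2)
      //   stp t3, t4 at d + 4*unit        (words 3 and 4)
      //   stp t5, t6 at d + 6*unit        (words 5 and 6)
      //   str t7     at pre(d, 8*unit)    (word 7, and d advances by 64)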

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ?
                Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = g_uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do. It does a lot of jumping, resulting in several
    // mispredicted branches. It might make more sense to do this
    // with something like Duff's device with a single computed branch.
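    //
    // Roughly, in C terms the bit tests below behave like this sketch
    // (count is in elements, granularity in bytes, and each "copy N" is one
    // load/store of N bytes at the post/pre-adjusted addresses):
    //
    //   if (count & (8 / granularity))                       copy 8 bytes;
    //   if (granularity <= 4 && (count & (4 / granularity))) copy 4 bytes;
    //   if (granularity <= 2 && (count & (2 / granularity))) copy 2 bytes;
    //   if (granularity == 1 && (count & 1))                 copy 1 byte;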
1181 1182 __ tbz(count, 3 - exact_log2(granularity), Lword); 1183 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1184 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1185 __ bind(Lword); 1186 1187 if (granularity <= sizeof (jint)) { 1188 __ tbz(count, 2 - exact_log2(granularity), Lint); 1189 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1190 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1191 __ bind(Lint); 1192 } 1193 1194 if (granularity <= sizeof (jshort)) { 1195 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1196 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1197 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1198 __ bind(Lshort); 1199 } 1200 1201 if (granularity <= sizeof (jbyte)) { 1202 __ tbz(count, 0, Lbyte); 1203 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1204 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1205 __ bind(Lbyte); 1206 } 1207 } 1208 1209 Label copy_f, copy_b; 1210 Label copy_obj_f, copy_obj_b; 1211 Label copy_obj_uninit_f, copy_obj_uninit_b; 1212 1213 // All-singing all-dancing memory copy. 1214 // 1215 // Copy count units of memory from s to d. The size of a unit is 1216 // step, which can be positive or negative depending on the direction 1217 // of copy. If is_aligned is false, we align the source address. 1218 // 1219 1220 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1221 Register s, Register d, Register count, int step) { 1222 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1223 bool is_backwards = step < 0; 1224 unsigned int granularity = g_uabs(step); 1225 const Register t0 = r3, t1 = r4; 1226 1227 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1228 // load all the data before writing anything 1229 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1230 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1231 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1232 const Register send = r17, dend = r16; 1233 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1234 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1235 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1236 1237 if (PrefetchCopyIntervalInBytes > 0) 1238 __ prfm(Address(s, 0), PLDL1KEEP); 1239 __ cmp(count, u1((UseSIMDForMemoryOps ? 
                      96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The biggest performance drop has been seen for the range 65-80 bytes.
      // For such cases, using a pair of ldp/stp instead of a third pair of
      // ldpq/stpq fixes the performance issue.
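      //
      // Note that this split only applies to byte and short copies
      // (granularity < 4): for 65..80 bytes we emit two 32-byte SIMD copies
      // plus a 16-byte ldp/stp tail, while 81..96 bytes, and all jint/jlong
      // copies, fall through to the third ldpq/stpq pair below.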
1295 if (granularity < sizeof (jint)) { 1296 Label copy96; 1297 __ cmp(count, u1(80/granularity)); 1298 __ br(Assembler::HI, copy96); 1299 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1300 1301 bs.copy_store_at_32(Address(d, 0), v0, v1); 1302 bs.copy_store_at_32(Address(d, 32), v2, v3); 1303 1304 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1305 __ b(finish); 1306 1307 __ bind(copy96); 1308 } 1309 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1310 1311 bs.copy_store_at_32(Address(d, 0), v0, v1); 1312 bs.copy_store_at_32(Address(d, 32), v2, v3); 1313 1314 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1315 } else { 1316 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1317 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1318 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1319 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1320 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1321 1322 bs.copy_store_at_16(Address(d, 0), t0, t1); 1323 bs.copy_store_at_16(Address(d, 16), t2, t3); 1324 bs.copy_store_at_16(Address(d, 32), t4, t5); 1325 bs.copy_store_at_16(Address(d, 48), t6, t7); 1326 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1327 } 1328 __ b(finish); 1329 1330 // 0..16 bytes 1331 __ bind(copy16); 1332 __ cmp(count, u1(8/granularity)); 1333 __ br(Assembler::LO, copy8); 1334 1335 // 8..16 bytes 1336 bs.copy_load_at_8(t0, Address(s, 0)); 1337 bs.copy_load_at_8(t1, Address(send, -8)); 1338 bs.copy_store_at_8(Address(d, 0), t0); 1339 bs.copy_store_at_8(Address(dend, -8), t1); 1340 __ b(finish); 1341 1342 if (granularity < 8) { 1343 // 4..7 bytes 1344 __ bind(copy8); 1345 __ tbz(count, 2 - exact_log2(granularity), copy4); 1346 __ ldrw(t0, Address(s, 0)); 1347 __ ldrw(t1, Address(send, -4)); 1348 __ strw(t0, Address(d, 0)); 1349 __ strw(t1, Address(dend, -4)); 1350 __ b(finish); 1351 if (granularity < 4) { 1352 // 0..3 bytes 1353 __ bind(copy4); 1354 __ cbz(count, finish); // get rid of 0 case 1355 if (granularity == 2) { 1356 __ ldrh(t0, Address(s, 0)); 1357 __ strh(t0, Address(d, 0)); 1358 } else { // granularity == 1 1359 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1360 // the first and last byte. 1361 // Handle the 3 byte case by loading and storing base + count/2 1362 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1363 // This does means in the 1 byte case we load/store the same 1364 // byte 3 times. 1365 __ lsr(count, count, 1); 1366 __ ldrb(t0, Address(s, 0)); 1367 __ ldrb(t1, Address(send, -1)); 1368 __ ldrb(t2, Address(s, count)); 1369 __ strb(t0, Address(d, 0)); 1370 __ strb(t1, Address(dend, -1)); 1371 __ strb(t2, Address(d, count)); 1372 } 1373 __ b(finish); 1374 } 1375 } 1376 1377 __ bind(copy_big); 1378 if (is_backwards) { 1379 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1380 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1381 } 1382 1383 // Now we've got the small case out of the way we can align the 1384 // source address on a 2-word boundary. 1385 1386 // Here we will materialize a count in r15, which is used by copy_memory_small 1387 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1388 // Up until here, we have used t9, which aliases r15, but from here on, that register 1389 // can not be used as a temp register, as it contains the count. 1390 1391 Label aligned; 1392 1393 if (is_aligned) { 1394 // We may have to adjust by 1 word to get s 2-word-aligned. 
1395 __ tbz(s, exact_log2(wordSize), aligned); 1396 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1397 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1398 __ sub(count, count, wordSize/granularity); 1399 } else { 1400 if (is_backwards) { 1401 __ andr(r15, s, 2 * wordSize - 1); 1402 } else { 1403 __ neg(r15, s); 1404 __ andr(r15, r15, 2 * wordSize - 1); 1405 } 1406 // r15 is the byte adjustment needed to align s. 1407 __ cbz(r15, aligned); 1408 int shift = exact_log2(granularity); 1409 if (shift > 0) { 1410 __ lsr(r15, r15, shift); 1411 } 1412 __ sub(count, count, r15); 1413 1414 #if 0 1415 // ?? This code is only correct for a disjoint copy. It may or 1416 // may not make sense to use it in that case. 1417 1418 // Copy the first pair; s and d may not be aligned. 1419 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1420 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1421 1422 // Align s and d, adjust count 1423 if (is_backwards) { 1424 __ sub(s, s, r15); 1425 __ sub(d, d, r15); 1426 } else { 1427 __ add(s, s, r15); 1428 __ add(d, d, r15); 1429 } 1430 #else 1431 copy_memory_small(decorators, type, s, d, r15, step); 1432 #endif 1433 } 1434 1435 __ bind(aligned); 1436 1437 // s is now 2-word-aligned. 1438 1439 // We have a count of units and some trailing bytes. Adjust the 1440 // count and do a bulk copy of words. If the shift is zero 1441 // perform a move instead to benefit from zero latency moves. 1442 int shift = exact_log2(wordSize/granularity); 1443 if (shift > 0) { 1444 __ lsr(r15, count, shift); 1445 } else { 1446 __ mov(r15, count); 1447 } 1448 if (direction == copy_forwards) { 1449 if (type != T_OBJECT) { 1450 __ bl(copy_f); 1451 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1452 __ bl(copy_obj_uninit_f); 1453 } else { 1454 __ bl(copy_obj_f); 1455 } 1456 } else { 1457 if (type != T_OBJECT) { 1458 __ bl(copy_b); 1459 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1460 __ bl(copy_obj_uninit_b); 1461 } else { 1462 __ bl(copy_obj_b); 1463 } 1464 } 1465 1466 // And the tail. 1467 copy_memory_small(decorators, type, s, d, count, step); 1468 1469 if (granularity >= 8) __ bind(copy8); 1470 if (granularity >= 4) __ bind(copy4); 1471 __ bind(finish); 1472 } 1473 1474 1475 void clobber_registers() { 1476 #ifdef ASSERT 1477 RegSet clobbered 1478 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1479 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1480 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1481 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1482 __ mov(*it, rscratch1); 1483 } 1484 #endif 1485 1486 } 1487 1488 // Scan over array at a for count oops, verifying each one. 1489 // Preserves a and count, clobbers rscratch1 and rscratch2. 
1490 void verify_oop_array (int size, Register a, Register count, Register temp) { 1491 Label loop, end; 1492 __ mov(rscratch1, a); 1493 __ mov(rscratch2, zr); 1494 __ bind(loop); 1495 __ cmp(rscratch2, count); 1496 __ br(Assembler::HS, end); 1497 if (size == wordSize) { 1498 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1499 __ verify_oop(temp); 1500 } else { 1501 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1502 __ decode_heap_oop(temp); // calls verify_oop 1503 } 1504 __ add(rscratch2, rscratch2, 1); 1505 __ b(loop); 1506 __ bind(end); 1507 } 1508 1509 // Arguments: 1510 // stub_id - is used to name the stub and identify all details of 1511 // how to perform the copy. 1512 // 1513 // entry - is assigned to the stub's post push entry point unless 1514 // it is null 1515 // 1516 // Inputs: 1517 // c_rarg0 - source array address 1518 // c_rarg1 - destination array address 1519 // c_rarg2 - element count, treated as ssize_t, can be zero 1520 // 1521 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1522 // the hardware handle it. The two dwords within qwords that span 1523 // cache line boundaries will still be loaded and stored atomically. 1524 // 1525 // Side Effects: entry is set to the (post push) entry point so it 1526 // can be used by the corresponding conjoint copy 1527 // method 1528 // 1529 address generate_disjoint_copy(StubGenStubId stub_id, address *entry) { 1530 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1531 RegSet saved_reg = RegSet::of(s, d, count); 1532 int size; 1533 bool aligned; 1534 bool is_oop; 1535 bool dest_uninitialized; 1536 switch (stub_id) { 1537 case jbyte_disjoint_arraycopy_id: 1538 size = sizeof(jbyte); 1539 aligned = false; 1540 is_oop = false; 1541 dest_uninitialized = false; 1542 break; 1543 case arrayof_jbyte_disjoint_arraycopy_id: 1544 size = sizeof(jbyte); 1545 aligned = true; 1546 is_oop = false; 1547 dest_uninitialized = false; 1548 break; 1549 case jshort_disjoint_arraycopy_id: 1550 size = sizeof(jshort); 1551 aligned = false; 1552 is_oop = false; 1553 dest_uninitialized = false; 1554 break; 1555 case arrayof_jshort_disjoint_arraycopy_id: 1556 size = sizeof(jshort); 1557 aligned = true; 1558 is_oop = false; 1559 dest_uninitialized = false; 1560 break; 1561 case jint_disjoint_arraycopy_id: 1562 size = sizeof(jint); 1563 aligned = false; 1564 is_oop = false; 1565 dest_uninitialized = false; 1566 break; 1567 case arrayof_jint_disjoint_arraycopy_id: 1568 size = sizeof(jint); 1569 aligned = true; 1570 is_oop = false; 1571 dest_uninitialized = false; 1572 break; 1573 case jlong_disjoint_arraycopy_id: 1574 // since this is always aligned we can (should!) use the same 1575 // stub as for case arrayof_jlong_disjoint_arraycopy 1576 ShouldNotReachHere(); 1577 break; 1578 case arrayof_jlong_disjoint_arraycopy_id: 1579 size = sizeof(jlong); 1580 aligned = true; 1581 is_oop = false; 1582 dest_uninitialized = false; 1583 break; 1584 case oop_disjoint_arraycopy_id: 1585 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1586 aligned = !UseCompressedOops; 1587 is_oop = true; 1588 dest_uninitialized = false; 1589 break; 1590 case arrayof_oop_disjoint_arraycopy_id: 1591 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1592 aligned = !UseCompressedOops; 1593 is_oop = true; 1594 dest_uninitialized = false; 1595 break; 1596 case oop_disjoint_arraycopy_uninit_id: 1597 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1598 aligned = !UseCompressedOops; 1599 is_oop = true; 1600 dest_uninitialized = true; 1601 break; 1602 case arrayof_oop_disjoint_arraycopy_uninit_id: 1603 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1604 aligned = !UseCompressedOops; 1605 is_oop = true; 1606 dest_uninitialized = true; 1607 break; 1608 default: 1609 ShouldNotReachHere(); 1610 break; 1611 } 1612 1613 __ align(CodeEntryAlignment); 1614 StubCodeMark mark(this, stub_id); 1615 address start = __ pc(); 1616 __ enter(); 1617 1618 if (entry != nullptr) { 1619 *entry = __ pc(); 1620 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1621 BLOCK_COMMENT("Entry:"); 1622 } 1623 1624 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1625 if (dest_uninitialized) { 1626 decorators |= IS_DEST_UNINITIALIZED; 1627 } 1628 if (aligned) { 1629 decorators |= ARRAYCOPY_ALIGNED; 1630 } 1631 1632 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1633 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1634 1635 if (is_oop) { 1636 // save regs before copy_memory 1637 __ push(RegSet::of(d, count), sp); 1638 } 1639 { 1640 // UnsafeMemoryAccess page error: continue after unsafe access 1641 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1642 UnsafeMemoryAccessMark umam(this, add_entry, true); 1643 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1644 } 1645 1646 if (is_oop) { 1647 __ pop(RegSet::of(d, count), sp); 1648 if (VerifyOops) 1649 verify_oop_array(size, d, count, r16); 1650 } 1651 1652 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1653 1654 __ leave(); 1655 __ mov(r0, zr); // return 0 1656 __ ret(lr); 1657 return start; 1658 } 1659 1660 // Arguments: 1661 // stub_id - is used to name the stub and identify all details of 1662 // how to perform the copy. 1663 // 1664 // nooverlap_target - identifes the (post push) entry for the 1665 // corresponding disjoint copy routine which can be 1666 // jumped to if the ranges do not actually overlap 1667 // 1668 // entry - is assigned to the stub's post push entry point unless 1669 // it is null 1670 // 1671 // 1672 // Inputs: 1673 // c_rarg0 - source array address 1674 // c_rarg1 - destination array address 1675 // c_rarg2 - element count, treated as ssize_t, can be zero 1676 // 1677 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1678 // the hardware handle it. The two dwords within qwords that span 1679 // cache line boundaries will still be loaded and stored atomically. 
1680 // 1681 // Side Effects: 1682 // entry is set to the no-overlap entry point so it can be used by 1683 // some other conjoint copy method 1684 // 1685 address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 1686 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1687 RegSet saved_regs = RegSet::of(s, d, count); 1688 int size; 1689 bool aligned; 1690 bool is_oop; 1691 bool dest_uninitialized; 1692 switch (stub_id) { 1693 case jbyte_arraycopy_id: 1694 size = sizeof(jbyte); 1695 aligned = false; 1696 is_oop = false; 1697 dest_uninitialized = false; 1698 break; 1699 case arrayof_jbyte_arraycopy_id: 1700 size = sizeof(jbyte); 1701 aligned = true; 1702 is_oop = false; 1703 dest_uninitialized = false; 1704 break; 1705 case jshort_arraycopy_id: 1706 size = sizeof(jshort); 1707 aligned = false; 1708 is_oop = false; 1709 dest_uninitialized = false; 1710 break; 1711 case arrayof_jshort_arraycopy_id: 1712 size = sizeof(jshort); 1713 aligned = true; 1714 is_oop = false; 1715 dest_uninitialized = false; 1716 break; 1717 case jint_arraycopy_id: 1718 size = sizeof(jint); 1719 aligned = false; 1720 is_oop = false; 1721 dest_uninitialized = false; 1722 break; 1723 case arrayof_jint_arraycopy_id: 1724 size = sizeof(jint); 1725 aligned = true; 1726 is_oop = false; 1727 dest_uninitialized = false; 1728 break; 1729 case jlong_arraycopy_id: 1730 // since this is always aligned we can (should!) use the same 1731 // stub as for case arrayof_jlong_disjoint_arraycopy 1732 ShouldNotReachHere(); 1733 break; 1734 case arrayof_jlong_arraycopy_id: 1735 size = sizeof(jlong); 1736 aligned = true; 1737 is_oop = false; 1738 dest_uninitialized = false; 1739 break; 1740 case oop_arraycopy_id: 1741 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1742 aligned = !UseCompressedOops; 1743 is_oop = true; 1744 dest_uninitialized = false; 1745 break; 1746 case arrayof_oop_arraycopy_id: 1747 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1748 aligned = !UseCompressedOops; 1749 is_oop = true; 1750 dest_uninitialized = false; 1751 break; 1752 case oop_arraycopy_uninit_id: 1753 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1754 aligned = !UseCompressedOops; 1755 is_oop = true; 1756 dest_uninitialized = true; 1757 break; 1758 case arrayof_oop_arraycopy_uninit_id: 1759 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1760 aligned = !UseCompressedOops; 1761 is_oop = true; 1762 dest_uninitialized = true; 1763 break; 1764 default: 1765 ShouldNotReachHere(); 1766 } 1767 1768 StubCodeMark mark(this, stub_id); 1769 address start = __ pc(); 1770 __ enter(); 1771 1772 if (entry != nullptr) { 1773 *entry = __ pc(); 1774 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1775 BLOCK_COMMENT("Entry:"); 1776 } 1777 1778 // use fwd copy when (d-s) above_equal (count*size) 1779 __ sub(rscratch1, d, s); 1780 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1781 __ br(Assembler::HS, nooverlap_target); 1782 1783 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1784 if (dest_uninitialized) { 1785 decorators |= IS_DEST_UNINITIALIZED; 1786 } 1787 if (aligned) { 1788 decorators |= ARRAYCOPY_ALIGNED; 1789 } 1790 1791 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1792 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1793 1794 if (is_oop) { 1795 // save regs before copy_memory 1796 __ push(RegSet::of(d, count), sp); 1797 } 1798 { 1799 // UnsafeMemoryAccess page error: continue after unsafe access 1800 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1801 UnsafeMemoryAccessMark umam(this, add_entry, true); 1802 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1803 } 1804 if (is_oop) { 1805 __ pop(RegSet::of(d, count), sp); 1806 if (VerifyOops) 1807 verify_oop_array(size, d, count, r16); 1808 } 1809 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1810 __ leave(); 1811 __ mov(r0, zr); // return 0 1812 __ ret(lr); 1813 return start; 1814 } 1815 1816 // Helper for generating a dynamic type check. 1817 // Smashes rscratch1, rscratch2. 1818 void generate_type_check(Register sub_klass, 1819 Register super_check_offset, 1820 Register super_klass, 1821 Register temp1, 1822 Register temp2, 1823 Register result, 1824 Label& L_success) { 1825 assert_different_registers(sub_klass, super_check_offset, super_klass); 1826 1827 BLOCK_COMMENT("type_check:"); 1828 1829 Label L_miss; 1830 1831 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1832 super_check_offset); 1833 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1834 1835 // Fall through on failure! 
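  // (Control-flow note, illustrative: the fast path above either branches to
  //  L_success, branches to L_miss, or falls into the slow path; the slow path
  //  either branches to L_success or falls through to L_miss below. Callers
  //  therefore treat "no branch to L_success" as a type-check failure.)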
1836 __ BIND(L_miss); 1837 } 1838 1839 // 1840 // Generate checkcasting array copy stub 1841 // 1842 // Input: 1843 // c_rarg0 - source array address 1844 // c_rarg1 - destination array address 1845 // c_rarg2 - element count, treated as ssize_t, can be zero 1846 // c_rarg3 - size_t ckoff (super_check_offset) 1847 // c_rarg4 - oop ckval (super_klass) 1848 // 1849 // Output: 1850 // r0 == 0 - success 1851 // r0 == -1^K - failure, where K is partial transfer count 1852 // 1853 address generate_checkcast_copy(StubGenStubId stub_id, address *entry) { 1854 bool dest_uninitialized; 1855 switch (stub_id) { 1856 case checkcast_arraycopy_id: 1857 dest_uninitialized = false; 1858 break; 1859 case checkcast_arraycopy_uninit_id: 1860 dest_uninitialized = true; 1861 break; 1862 default: 1863 ShouldNotReachHere(); 1864 } 1865 1866 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1867 1868 // Input registers (after setup_arg_regs) 1869 const Register from = c_rarg0; // source array address 1870 const Register to = c_rarg1; // destination array address 1871 const Register count = c_rarg2; // elementscount 1872 const Register ckoff = c_rarg3; // super_check_offset 1873 const Register ckval = c_rarg4; // super_klass 1874 1875 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1876 RegSet wb_post_saved_regs = RegSet::of(count); 1877 1878 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1879 const Register copied_oop = r22; // actual oop copied 1880 const Register count_save = r21; // orig elementscount 1881 const Register start_to = r20; // destination array start address 1882 const Register r19_klass = r19; // oop._klass 1883 1884 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1885 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1886 1887 //--------------------------------------------------------------- 1888 // Assembler stub will be used for this call to arraycopy 1889 // if the two arrays are subtypes of Object[] but the 1890 // destination array type is not equal to or a supertype 1891 // of the source type. Each element must be separately 1892 // checked. 1893 1894 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1895 copied_oop, r19_klass, count_save); 1896 1897 __ align(CodeEntryAlignment); 1898 StubCodeMark mark(this, stub_id); 1899 address start = __ pc(); 1900 1901 __ enter(); // required for proper stackwalking of RuntimeStub frame 1902 1903 #ifdef ASSERT 1904 // caller guarantees that the arrays really are different 1905 // otherwise, we would have to make conjoint checks 1906 { Label L; 1907 __ b(L); // conjoint check not yet implemented 1908 __ stop("checkcast_copy within a single array"); 1909 __ bind(L); 1910 } 1911 #endif //ASSERT 1912 1913 // Caller of this entry point must set up the argument registers. 1914 if (entry != nullptr) { 1915 *entry = __ pc(); 1916 BLOCK_COMMENT("Entry:"); 1917 } 1918 1919 // Empty array: Nothing to do. 1920 __ cbz(count, L_done); 1921 __ push(RegSet::of(r19, r20, r21, r22), sp); 1922 1923 #ifdef ASSERT 1924 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1925 // The ckoff and ckval must be mutually consistent, 1926 // even though caller generates both. 
1927 { Label L; 1928 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1929 __ ldrw(start_to, Address(ckval, sco_offset)); 1930 __ cmpw(ckoff, start_to); 1931 __ br(Assembler::EQ, L); 1932 __ stop("super_check_offset inconsistent"); 1933 __ bind(L); 1934 } 1935 #endif //ASSERT 1936 1937 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1938 bool is_oop = true; 1939 int element_size = UseCompressedOops ? 4 : 8; 1940 if (dest_uninitialized) { 1941 decorators |= IS_DEST_UNINITIALIZED; 1942 } 1943 1944 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1945 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1946 1947 // save the original count 1948 __ mov(count_save, count); 1949 1950 // Copy from low to high addresses 1951 __ mov(start_to, to); // Save destination array start address 1952 __ b(L_load_element); 1953 1954 // ======== begin loop ======== 1955 // (Loop is rotated; its entry is L_load_element.) 1956 // Loop control: 1957 // for (; count != 0; count--) { 1958 // copied_oop = load_heap_oop(from++); 1959 // ... generate_type_check ...; 1960 // store_heap_oop(to++, copied_oop); 1961 // } 1962 __ align(OptoLoopAlignment); 1963 1964 __ BIND(L_store_element); 1965 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1966 __ post(to, element_size), copied_oop, noreg, 1967 gct1, gct2, gct3); 1968 __ sub(count, count, 1); 1969 __ cbz(count, L_do_card_marks); 1970 1971 // ======== loop entry is here ======== 1972 __ BIND(L_load_element); 1973 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1974 copied_oop, noreg, __ post(from, element_size), 1975 gct1); 1976 __ cbz(copied_oop, L_store_element); 1977 1978 __ load_klass(r19_klass, copied_oop);// query the object klass 1979 1980 BLOCK_COMMENT("type_check:"); 1981 generate_type_check(/*sub_klass*/r19_klass, 1982 /*super_check_offset*/ckoff, 1983 /*super_klass*/ckval, 1984 /*r_array_base*/gct1, 1985 /*temp2*/gct2, 1986 /*result*/r10, L_store_element); 1987 1988 // Fall through on failure! 1989 1990 // ======== end loop ======== 1991 1992 // It was a real error; we must depend on the caller to finish the job. 1993 // Register count = remaining oops, count_orig = total oops. 1994 // Emit GC store barriers for the oops we have copied and report 1995 // their number to the caller. 1996 1997 __ subs(count, count_save, count); // K = partially copied oop count 1998 __ eon(count, count, zr); // report (-1^K) to caller 1999 __ br(Assembler::EQ, L_done_pop); 2000 2001 __ BIND(L_do_card_marks); 2002 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2003 2004 __ bind(L_done_pop); 2005 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2006 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2007 2008 __ bind(L_done); 2009 __ mov(r0, count); 2010 __ leave(); 2011 __ ret(lr); 2012 2013 return start; 2014 } 2015 2016 // Perform range checks on the proposed arraycopy. 2017 // Kills temp, but nothing else. 2018 // Also, clean the sign bits of src_pos and dst_pos. 
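  //
  // Illustrative sketch of the checks emitted below (informal names; the
  // comparisons are unsigned and performed on 32-bit values):
  //
  //   if ((uint32_t)(src_pos + length) > (uint32_t)src->length()) goto L_failed;
  //   if ((uint32_t)(dst_pos + length) > (uint32_t)dst->length()) goto L_failed;
  //   src_pos = (uint32_t)src_pos;   // a 32-bit register move zeroes bits 32..63
  //   dst_pos = (uint32_t)dst_pos;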
2019 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2020 Register src_pos, // source position (c_rarg1) 2021 Register dst, // destination array oop (c_rarg2) 2022 Register dst_pos, // destination position (c_rarg3) 2023 Register length, 2024 Register temp, 2025 Label& L_failed) { 2026 BLOCK_COMMENT("arraycopy_range_checks:"); 2027 2028 assert_different_registers(rscratch1, temp); 2029 2030 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2031 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 2032 __ addw(temp, length, src_pos); 2033 __ cmpw(temp, rscratch1); 2034 __ br(Assembler::HI, L_failed); 2035 2036 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2037 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2038 __ addw(temp, length, dst_pos); 2039 __ cmpw(temp, rscratch1); 2040 __ br(Assembler::HI, L_failed); 2041 2042 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 2043 __ movw(src_pos, src_pos); 2044 __ movw(dst_pos, dst_pos); 2045 2046 BLOCK_COMMENT("arraycopy_range_checks done"); 2047 } 2048 2049 // These stubs get called from some dumb test routine. 2050 // I'll write them properly when they're called from 2051 // something that's actually doing something. 2052 static void fake_arraycopy_stub(address src, address dst, int count) { 2053 assert(count == 0, "huh?"); 2054 } 2055 2056 2057 // 2058 // Generate 'unsafe' array copy stub 2059 // Though just as safe as the other stubs, it takes an unscaled 2060 // size_t argument instead of an element count. 2061 // 2062 // Input: 2063 // c_rarg0 - source array address 2064 // c_rarg1 - destination array address 2065 // c_rarg2 - byte count, treated as ssize_t, can be zero 2066 // 2067 // Examines the alignment of the operands and dispatches 2068 // to a long, int, short, or byte copy loop.
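  //
  // Illustrative sketch of the dispatch below (informal names): the source
  // address, destination address and byte count are OR-ed together, so a low
  // bit of the result is clear only if it is clear in all three operands:
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if ((bits & (BytesPerLong - 1)) == 0) goto long_copy;   // count >>= 3
  //   if ((bits & (BytesPerInt  - 1)) == 0) goto int_copy;    // count >>= 2
  //   if ((bits & 1) == 0)                  goto short_copy;  // count >>= 1
  //   goto byte_copy;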
2069 // 2070 address generate_unsafe_copy(address byte_copy_entry, 2071 address short_copy_entry, 2072 address int_copy_entry, 2073 address long_copy_entry) { 2074 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2075 2076 Label L_long_aligned, L_int_aligned, L_short_aligned; 2077 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2078 2079 __ align(CodeEntryAlignment); 2080 StubCodeMark mark(this, stub_id); 2081 address start = __ pc(); 2082 __ enter(); // required for proper stackwalking of RuntimeStub frame 2083 2084 // bump this on entry, not on exit: 2085 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2086 2087 __ orr(rscratch1, s, d); 2088 __ orr(rscratch1, rscratch1, count); 2089 2090 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2091 __ cbz(rscratch1, L_long_aligned); 2092 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2093 __ cbz(rscratch1, L_int_aligned); 2094 __ tbz(rscratch1, 0, L_short_aligned); 2095 __ b(RuntimeAddress(byte_copy_entry)); 2096 2097 __ BIND(L_short_aligned); 2098 __ lsr(count, count, LogBytesPerShort); // size => short_count 2099 __ b(RuntimeAddress(short_copy_entry)); 2100 __ BIND(L_int_aligned); 2101 __ lsr(count, count, LogBytesPerInt); // size => int_count 2102 __ b(RuntimeAddress(int_copy_entry)); 2103 __ BIND(L_long_aligned); 2104 __ lsr(count, count, LogBytesPerLong); // size => long_count 2105 __ b(RuntimeAddress(long_copy_entry)); 2106 2107 return start; 2108 } 2109 2110 // 2111 // Generate generic array copy stubs 2112 // 2113 // Input: 2114 // c_rarg0 - src oop 2115 // c_rarg1 - src_pos (32-bits) 2116 // c_rarg2 - dst oop 2117 // c_rarg3 - dst_pos (32-bits) 2118 // c_rarg4 - element count (32-bits) 2119 // 2120 // Output: 2121 // r0 == 0 - success 2122 // r0 == -1^K - failure, where K is partial transfer count 2123 // 2124 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2125 address int_copy_entry, address oop_copy_entry, 2126 address long_copy_entry, address checkcast_copy_entry) { 2127 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2128 2129 Label L_failed, L_objArray; 2130 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2131 2132 // Input registers 2133 const Register src = c_rarg0; // source array oop 2134 const Register src_pos = c_rarg1; // source position 2135 const Register dst = c_rarg2; // destination array oop 2136 const Register dst_pos = c_rarg3; // destination position 2137 const Register length = c_rarg4; 2138 2139 2140 // Registers used as temps 2141 const Register dst_klass = c_rarg5; 2142 2143 __ align(CodeEntryAlignment); 2144 2145 StubCodeMark mark(this, stub_id); 2146 2147 address start = __ pc(); 2148 2149 __ enter(); // required for proper stackwalking of RuntimeStub frame 2150 2151 // bump this on entry, not on exit: 2152 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2153 2154 //----------------------------------------------------------------------- 2155 // Assembler stub will be used for this call to arraycopy 2156 // if the following conditions are met: 2157 // 2158 // (1) src and dst must not be null. 2159 // (2) src_pos must not be negative. 2160 // (3) dst_pos must not be negative. 2161 // (4) length must not be negative. 2162 // (5) src klass and dst klass should be the same and not null. 2163 // (6) src and dst should be arrays. 2164 // (7) src_pos + length must not exceed length of src. 2165 // (8) dst_pos + length must not exceed length of dst. 
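  //
  // Note on the failure encoding (illustrative arithmetic): -1^K is the
  // bitwise complement of the number K of elements already copied, e.g.
  //
  //   K == 0  ->  r0 == -1   (nothing copied; e.g. one of the checks below fails)
  //   K == 3  ->  r0 == -4   (three elements copied before a subtype failure)
  //
  // so the caller can recover K as ~r0.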
2166 // 2167 2168 // if (src == nullptr) return -1; 2169 __ cbz(src, L_failed); 2170 2171 // if (src_pos < 0) return -1; 2172 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2173 2174 // if (dst == nullptr) return -1; 2175 __ cbz(dst, L_failed); 2176 2177 // if (dst_pos < 0) return -1; 2178 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2179 2180 // registers used as temp 2181 const Register scratch_length = r16; // elements count to copy 2182 const Register scratch_src_klass = r17; // array klass 2183 const Register lh = r15; // layout helper 2184 2185 // if (length < 0) return -1; 2186 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2187 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2188 2189 __ load_klass(scratch_src_klass, src); 2190 #ifdef ASSERT 2191 // assert(src->klass() != nullptr); 2192 { 2193 BLOCK_COMMENT("assert klasses not null {"); 2194 Label L1, L2; 2195 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2196 __ bind(L1); 2197 __ stop("broken null klass"); 2198 __ bind(L2); 2199 __ load_klass(rscratch1, dst); 2200 __ cbz(rscratch1, L1); // this would be broken also 2201 BLOCK_COMMENT("} assert klasses not null done"); 2202 } 2203 #endif 2204 2205 // Load layout helper (32-bits) 2206 // 2207 // |array_tag| | header_size | element_type | |log2_element_size| 2208 // 32 30 24 16 8 2 0 2209 // 2210 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2211 // 2212 2213 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2214 2215 // Handle objArrays completely differently... 2216 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2217 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2218 __ movw(rscratch1, objArray_lh); 2219 __ eorw(rscratch2, lh, rscratch1); 2220 __ cbzw(rscratch2, L_objArray); 2221 2222 // if (src->klass() != dst->klass()) return -1; 2223 __ load_klass(rscratch2, dst); 2224 __ eor(rscratch2, rscratch2, scratch_src_klass); 2225 __ cbnz(rscratch2, L_failed); 2226 2227 // if (!src->is_Array()) return -1; 2228 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2229 2230 // At this point, it is known to be a typeArray (array_tag 0x3). 
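  // Illustrative decoding of the layout helper consulted below (comment-only
  // sketch; field boundaries as in the diagram above):
  //
  //   unsigned tag     = (unsigned)lh >> Klass::_lh_array_tag_shift;  // 0x3 typeArray, 0x2 objArray
  //   int array_header = (lh >> Klass::_lh_header_size_shift)
  //                      & Klass::_lh_header_size_mask;               // header size, in bytes
  //   int log2_esize   = lh & Klass::_lh_log2_element_size_mask;      // 0..3 for primitive arrays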
2231 #ifdef ASSERT 2232 { 2233 BLOCK_COMMENT("assert primitive array {"); 2234 Label L; 2235 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2236 __ cmpw(lh, rscratch2); 2237 __ br(Assembler::GE, L); 2238 __ stop("must be a primitive array"); 2239 __ bind(L); 2240 BLOCK_COMMENT("} assert primitive array done"); 2241 } 2242 #endif 2243 2244 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2245 rscratch2, L_failed); 2246 2247 // TypeArrayKlass 2248 // 2249 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2250 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2251 // 2252 2253 const Register rscratch1_offset = rscratch1; // array offset 2254 const Register r15_elsize = lh; // element size 2255 2256 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2257 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2258 __ add(src, src, rscratch1_offset); // src array offset 2259 __ add(dst, dst, rscratch1_offset); // dst array offset 2260 BLOCK_COMMENT("choose copy loop based on element size"); 2261 2262 // next registers should be set before the jump to corresponding stub 2263 const Register from = c_rarg0; // source array address 2264 const Register to = c_rarg1; // destination array address 2265 const Register count = c_rarg2; // elements count 2266 2267 // 'from', 'to', 'count' registers should be set in such order 2268 // since they are the same as 'src', 'src_pos', 'dst'. 2269 2270 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2271 2272 // The possible values of elsize are 0-3, i.e. exact_log2(element 2273 // size in bytes). We do a simple bitwise binary search. 2274 __ BIND(L_copy_bytes); 2275 __ tbnz(r15_elsize, 1, L_copy_ints); 2276 __ tbnz(r15_elsize, 0, L_copy_shorts); 2277 __ lea(from, Address(src, src_pos));// src_addr 2278 __ lea(to, Address(dst, dst_pos));// dst_addr 2279 __ movw(count, scratch_length); // length 2280 __ b(RuntimeAddress(byte_copy_entry)); 2281 2282 __ BIND(L_copy_shorts); 2283 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2284 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2285 __ movw(count, scratch_length); // length 2286 __ b(RuntimeAddress(short_copy_entry)); 2287 2288 __ BIND(L_copy_ints); 2289 __ tbnz(r15_elsize, 0, L_copy_longs); 2290 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2291 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2292 __ movw(count, scratch_length); // length 2293 __ b(RuntimeAddress(int_copy_entry)); 2294 2295 __ BIND(L_copy_longs); 2296 #ifdef ASSERT 2297 { 2298 BLOCK_COMMENT("assert long copy {"); 2299 Label L; 2300 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2301 __ cmpw(r15_elsize, LogBytesPerLong); 2302 __ br(Assembler::EQ, L); 2303 __ stop("must be long copy, but elsize is wrong"); 2304 __ bind(L); 2305 BLOCK_COMMENT("} assert long copy done"); 2306 } 2307 #endif 2308 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2309 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2310 __ movw(count, scratch_length); // length 2311 __ b(RuntimeAddress(long_copy_entry)); 2312 2313 // ObjArrayKlass 2314 __ BIND(L_objArray); 2315 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2316 2317 Label L_plain_copy, L_checkcast_copy; 2318 // test array classes for subtyping 2319 __ load_klass(r15, dst); 2320 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2321 __ br(Assembler::NE, L_checkcast_copy); 2322 2323 // Identically typed arrays can be copied without element-wise checks. 2324 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2325 rscratch2, L_failed); 2326 2327 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2328 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2329 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2330 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2331 __ movw(count, scratch_length); // length 2332 __ BIND(L_plain_copy); 2333 __ b(RuntimeAddress(oop_copy_entry)); 2334 2335 __ BIND(L_checkcast_copy); 2336 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2337 { 2338 // Before looking at dst.length, make sure dst is also an objArray. 2339 __ ldrw(rscratch1, Address(r15, lh_offset)); 2340 __ movw(rscratch2, objArray_lh); 2341 __ eorw(rscratch1, rscratch1, rscratch2); 2342 __ cbnzw(rscratch1, L_failed); 2343 2344 // It is safe to examine both src.length and dst.length. 2345 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2346 r15, L_failed); 2347 2348 __ load_klass(dst_klass, dst); // reload 2349 2350 // Marshal the base address arguments now, freeing registers. 2351 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2352 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2353 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2354 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2355 __ movw(count, length); // length (reloaded) 2356 Register sco_temp = c_rarg3; // this register is free now 2357 assert_different_registers(from, to, count, sco_temp, 2358 dst_klass, scratch_src_klass); 2359 // assert_clean_int(count, sco_temp); 2360 2361 // Generate the type check. 2362 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2363 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2364 2365 // Smashes rscratch1, rscratch2 2366 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2367 L_plain_copy); 2368 2369 // Fetch destination element klass from the ObjArrayKlass header. 2370 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2371 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2372 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2373 2374 // the checkcast_copy loop needs two extra arguments: 2375 assert(c_rarg3 == sco_temp, "#3 already in place"); 2376 // Set up arguments for checkcast_copy_entry. 2377 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2378 __ b(RuntimeAddress(checkcast_copy_entry)); 2379 } 2380 2381 __ BIND(L_failed); 2382 __ mov(r0, -1); 2383 __ leave(); // required for proper stackwalking of RuntimeStub frame 2384 __ ret(lr); 2385 2386 return start; 2387 } 2388 2389 // 2390 // Generate stub for array fill. If "aligned" is true, the 2391 // "to" address is assumed to be heapword aligned. 
2392 // 2393 // Arguments for generated stub: 2394 // to: c_rarg0 2395 // value: c_rarg1 2396 // count: c_rarg2 treated as signed 2397 // 2398 address generate_fill(StubGenStubId stub_id) { 2399 BasicType t; 2400 bool aligned; 2401 2402 switch (stub_id) { 2403 case jbyte_fill_id: 2404 t = T_BYTE; 2405 aligned = false; 2406 break; 2407 case jshort_fill_id: 2408 t = T_SHORT; 2409 aligned = false; 2410 break; 2411 case jint_fill_id: 2412 t = T_INT; 2413 aligned = false; 2414 break; 2415 case arrayof_jbyte_fill_id: 2416 t = T_BYTE; 2417 aligned = true; 2418 break; 2419 case arrayof_jshort_fill_id: 2420 t = T_SHORT; 2421 aligned = true; 2422 break; 2423 case arrayof_jint_fill_id: 2424 t = T_INT; 2425 aligned = true; 2426 break; 2427 default: 2428 ShouldNotReachHere(); 2429 }; 2430 2431 __ align(CodeEntryAlignment); 2432 StubCodeMark mark(this, stub_id); 2433 address start = __ pc(); 2434 2435 BLOCK_COMMENT("Entry:"); 2436 2437 const Register to = c_rarg0; // source array address 2438 const Register value = c_rarg1; // value 2439 const Register count = c_rarg2; // elements count 2440 2441 const Register bz_base = r10; // base for block_zero routine 2442 const Register cnt_words = r11; // temp register 2443 2444 __ enter(); 2445 2446 Label L_fill_elements, L_exit1; 2447 2448 int shift = -1; 2449 switch (t) { 2450 case T_BYTE: 2451 shift = 0; 2452 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2453 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2454 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2455 __ br(Assembler::LO, L_fill_elements); 2456 break; 2457 case T_SHORT: 2458 shift = 1; 2459 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2460 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2461 __ br(Assembler::LO, L_fill_elements); 2462 break; 2463 case T_INT: 2464 shift = 2; 2465 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2466 __ br(Assembler::LO, L_fill_elements); 2467 break; 2468 default: ShouldNotReachHere(); 2469 } 2470 2471 // Align source address at 8 bytes address boundary. 2472 Label L_skip_align1, L_skip_align2, L_skip_align4; 2473 if (!aligned) { 2474 switch (t) { 2475 case T_BYTE: 2476 // One byte misalignment happens only for byte arrays. 2477 __ tbz(to, 0, L_skip_align1); 2478 __ strb(value, Address(__ post(to, 1))); 2479 __ subw(count, count, 1); 2480 __ bind(L_skip_align1); 2481 // Fallthrough 2482 case T_SHORT: 2483 // Two bytes misalignment happens only for byte and short (char) arrays. 2484 __ tbz(to, 1, L_skip_align2); 2485 __ strh(value, Address(__ post(to, 2))); 2486 __ subw(count, count, 2 >> shift); 2487 __ bind(L_skip_align2); 2488 // Fallthrough 2489 case T_INT: 2490 // Align to 8 bytes, we know we are 4 byte aligned to start. 2491 __ tbz(to, 2, L_skip_align4); 2492 __ strw(value, Address(__ post(to, 4))); 2493 __ subw(count, count, 4 >> shift); 2494 __ bind(L_skip_align4); 2495 break; 2496 default: ShouldNotReachHere(); 2497 } 2498 } 2499 2500 // 2501 // Fill large chunks 2502 // 2503 __ lsrw(cnt_words, count, 3 - shift); // number of words 2504 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2505 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2506 if (UseBlockZeroing) { 2507 Label non_block_zeroing, rest; 2508 // If the fill value is zero we can use the fast zero_words(). 
2509 __ cbnz(value, non_block_zeroing); 2510 __ mov(bz_base, to); 2511 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2512 address tpc = __ zero_words(bz_base, cnt_words); 2513 if (tpc == nullptr) { 2514 fatal("CodeCache is full at generate_fill"); 2515 } 2516 __ b(rest); 2517 __ bind(non_block_zeroing); 2518 __ fill_words(to, cnt_words, value); 2519 __ bind(rest); 2520 } else { 2521 __ fill_words(to, cnt_words, value); 2522 } 2523 2524 // Remaining count is less than 8 bytes. Fill it by a single store. 2525 // Note that the total length is no less than 8 bytes. 2526 if (t == T_BYTE || t == T_SHORT) { 2527 Label L_exit1; 2528 __ cbzw(count, L_exit1); 2529 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2530 __ str(value, Address(to, -8)); // overwrite some elements 2531 __ bind(L_exit1); 2532 __ leave(); 2533 __ ret(lr); 2534 } 2535 2536 // Handle copies less than 8 bytes. 2537 Label L_fill_2, L_fill_4, L_exit2; 2538 __ bind(L_fill_elements); 2539 switch (t) { 2540 case T_BYTE: 2541 __ tbz(count, 0, L_fill_2); 2542 __ strb(value, Address(__ post(to, 1))); 2543 __ bind(L_fill_2); 2544 __ tbz(count, 1, L_fill_4); 2545 __ strh(value, Address(__ post(to, 2))); 2546 __ bind(L_fill_4); 2547 __ tbz(count, 2, L_exit2); 2548 __ strw(value, Address(to)); 2549 break; 2550 case T_SHORT: 2551 __ tbz(count, 0, L_fill_4); 2552 __ strh(value, Address(__ post(to, 2))); 2553 __ bind(L_fill_4); 2554 __ tbz(count, 1, L_exit2); 2555 __ strw(value, Address(to)); 2556 break; 2557 case T_INT: 2558 __ cbzw(count, L_exit2); 2559 __ strw(value, Address(to)); 2560 break; 2561 default: ShouldNotReachHere(); 2562 } 2563 __ bind(L_exit2); 2564 __ leave(); 2565 __ ret(lr); 2566 return start; 2567 } 2568 2569 address generate_unsafecopy_common_error_exit() { 2570 address start_pc = __ pc(); 2571 __ leave(); 2572 __ mov(r0, 0); 2573 __ ret(lr); 2574 return start_pc; 2575 } 2576 2577 // 2578 // Generate 'unsafe' set memory stub 2579 // Though just as safe as the other stubs, it takes an unscaled 2580 // size_t (# bytes) argument instead of an element count. 2581 // 2582 // This fill operation is atomicity preserving: as long as the 2583 // address supplied is sufficiently aligned, all writes of up to 64 2584 // bits in size are single-copy atomic. 
2585 // 2586 // Input: 2587 // c_rarg0 - destination array address 2588 // c_rarg1 - byte count (size_t) 2589 // c_rarg2 - byte value 2590 // 2591 address generate_unsafe_setmemory() { 2592 __ align(CodeEntryAlignment); 2593 StubCodeMark mark(this, StubGenStubId::unsafe_setmemory_id); 2594 address start = __ pc(); 2595 2596 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2; 2597 Label tail; 2598 2599 UnsafeMemoryAccessMark umam(this, true, false); 2600 2601 __ enter(); // required for proper stackwalking of RuntimeStub frame 2602 2603 __ dup(v0, __ T16B, value); 2604 2605 if (AvoidUnalignedAccesses) { 2606 __ cmp(count, (u1)16); 2607 __ br(__ LO, tail); 2608 2609 __ mov(rscratch1, 16); 2610 __ andr(rscratch2, dest, 15); 2611 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest 2612 __ strq(v0, Address(dest)); 2613 __ sub(count, count, rscratch1); 2614 __ add(dest, dest, rscratch1); 2615 } 2616 2617 __ subs(count, count, (u1)64); 2618 __ br(__ LO, tail); 2619 { 2620 Label again; 2621 __ bind(again); 2622 __ stpq(v0, v0, Address(dest)); 2623 __ stpq(v0, v0, Address(dest, 32)); 2624 2625 __ subs(count, count, 64); 2626 __ add(dest, dest, 64); 2627 __ br(__ HS, again); 2628 } 2629 2630 __ bind(tail); 2631 // The count of bytes is off by 64, but we don't need to correct 2632 // it because we're only going to use the least-significant few 2633 // count bits from here on. 2634 // __ add(count, count, 64); 2635 2636 { 2637 Label dont; 2638 __ tbz(count, exact_log2(32), dont); 2639 __ stpq(v0, v0, __ post(dest, 32)); 2640 __ bind(dont); 2641 } 2642 { 2643 Label dont; 2644 __ tbz(count, exact_log2(16), dont); 2645 __ strq(v0, __ post(dest, 16)); 2646 __ bind(dont); 2647 } 2648 { 2649 Label dont; 2650 __ tbz(count, exact_log2(8), dont); 2651 __ strd(v0, __ post(dest, 8)); 2652 __ bind(dont); 2653 } 2654 2655 Label finished; 2656 __ tst(count, 7); 2657 __ br(__ EQ, finished); 2658 2659 { 2660 Label dont; 2661 __ tbz(count, exact_log2(4), dont); 2662 __ strs(v0, __ post(dest, 4)); 2663 __ bind(dont); 2664 } 2665 { 2666 Label dont; 2667 __ tbz(count, exact_log2(2), dont); 2668 __ bfi(value, value, 8, 8); 2669 __ strh(value, __ post(dest, 2)); 2670 __ bind(dont); 2671 } 2672 { 2673 Label dont; 2674 __ tbz(count, exact_log2(1), dont); 2675 __ strb(value, Address(dest)); 2676 __ bind(dont); 2677 } 2678 2679 __ bind(finished); 2680 __ leave(); 2681 __ ret(lr); 2682 2683 return start; 2684 } 2685 2686 address generate_data_cache_writeback() { 2687 const Register line = c_rarg0; // address of line to write back 2688 2689 __ align(CodeEntryAlignment); 2690 2691 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id; 2692 StubCodeMark mark(this, stub_id); 2693 2694 address start = __ pc(); 2695 __ enter(); 2696 __ cache_wb(Address(line, 0)); 2697 __ leave(); 2698 __ ret(lr); 2699 2700 return start; 2701 } 2702 2703 address generate_data_cache_writeback_sync() { 2704 const Register is_pre = c_rarg0; // pre or post sync 2705 2706 __ align(CodeEntryAlignment); 2707 2708 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id; 2709 StubCodeMark mark(this, stub_id); 2710 2711 // pre wbsync is a no-op 2712 // post wbsync translates to an sfence 2713 2714 Label skip; 2715 address start = __ pc(); 2716 __ enter(); 2717 __ cbnz(is_pre, skip); 2718 __ cache_wbsync(false); 2719 __ bind(skip); 2720 __ leave(); 2721 __ ret(lr); 2722 2723 return start; 2724 } 2725 2726 void generate_arraycopy_stubs() { 2727 address entry; 2728 address entry_jbyte_arraycopy; 2729 address 
entry_jshort_arraycopy; 2730 address entry_jint_arraycopy; 2731 address entry_oop_arraycopy; 2732 address entry_jlong_arraycopy; 2733 address entry_checkcast_arraycopy; 2734 2735 address ucm_common_error_exit = generate_unsafecopy_common_error_exit(); 2736 UnsafeMemoryAccess::set_common_exit_stub_pc(ucm_common_error_exit); 2737 2738 generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2739 generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2740 2741 generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2742 generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2743 2744 generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 2745 generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2746 2747 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2748 2749 //*** jbyte 2750 // Always need aligned and unaligned versions 2751 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry); 2752 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2753 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry); 2754 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr); 2755 2756 //*** jshort 2757 // Always need aligned and unaligned versions 2758 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry); 2759 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2760 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry); 2761 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr); 2762 2763 //*** jint 2764 // Aligned versions 2765 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry); 2766 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2767 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2768 // entry_jint_arraycopy always points to the unaligned version 2769 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry); 2770 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy); 2771 2772 //*** jlong 2773 // It is always aligned 2774 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry); 2775 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2776 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2777 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2778 2779 //*** oops 2780 { 2781 // With compressed oops we need unaligned versions; notice that 2782 // we overwrite entry_oop_arraycopy. 2783 bool aligned = !UseCompressedOops; 2784 2785 StubRoutines::_arrayof_oop_disjoint_arraycopy 2786 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry); 2787 StubRoutines::_arrayof_oop_arraycopy 2788 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2789 // Aligned versions without pre-barriers 2790 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2791 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2792 StubRoutines::_arrayof_oop_arraycopy_uninit 2793 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2794 } 2795 2796 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2797 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2798 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2799 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2800 2801 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2802 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr); 2803 2804 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2805 entry_jshort_arraycopy, 2806 entry_jint_arraycopy, 2807 entry_jlong_arraycopy); 2808 2809 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2810 entry_jshort_arraycopy, 2811 entry_jint_arraycopy, 2812 entry_oop_arraycopy, 2813 entry_jlong_arraycopy, 2814 entry_checkcast_arraycopy); 2815 2816 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 2817 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 2818 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 2819 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 2820 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 2821 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 2822 } 2823 2824 void generate_math_stubs() { Unimplemented(); } 2825 2826 // Arguments: 2827 // 2828 // Inputs: 2829 // c_rarg0 - source byte array address 2830 // c_rarg1 - destination byte array address 2831 // c_rarg2 - K (key) in little endian int array 2832 // 2833 address generate_aescrypt_encryptBlock() { 2834 __ align(CodeEntryAlignment); 2835 
StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2836 StubCodeMark mark(this, stub_id); 2837 2838 const Register from = c_rarg0; // source array address 2839 const Register to = c_rarg1; // destination array address 2840 const Register key = c_rarg2; // key array address 2841 const Register keylen = rscratch1; 2842 2843 address start = __ pc(); 2844 __ enter(); 2845 2846 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2847 2848 __ aesenc_loadkeys(key, keylen); 2849 __ aesecb_encrypt(from, to, keylen); 2850 2851 __ mov(r0, 0); 2852 2853 __ leave(); 2854 __ ret(lr); 2855 2856 return start; 2857 } 2858 2859 // Arguments: 2860 // 2861 // Inputs: 2862 // c_rarg0 - source byte array address 2863 // c_rarg1 - destination byte array address 2864 // c_rarg2 - K (key) in little endian int array 2865 // 2866 address generate_aescrypt_decryptBlock() { 2867 assert(UseAES, "need AES cryptographic extension support"); 2868 __ align(CodeEntryAlignment); 2869 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2870 StubCodeMark mark(this, stub_id); 2871 Label L_doLast; 2872 2873 const Register from = c_rarg0; // source array address 2874 const Register to = c_rarg1; // destination array address 2875 const Register key = c_rarg2; // key array address 2876 const Register keylen = rscratch1; 2877 2878 address start = __ pc(); 2879 __ enter(); // required for proper stackwalking of RuntimeStub frame 2880 2881 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2882 2883 __ aesecb_decrypt(from, to, key, keylen); 2884 2885 __ mov(r0, 0); 2886 2887 __ leave(); 2888 __ ret(lr); 2889 2890 return start; 2891 } 2892 2893 // Arguments: 2894 // 2895 // Inputs: 2896 // c_rarg0 - source byte array address 2897 // c_rarg1 - destination byte array address 2898 // c_rarg2 - K (key) in little endian int array 2899 // c_rarg3 - r vector byte array address 2900 // c_rarg4 - input length 2901 // 2902 // Output: 2903 // x0 - input length 2904 // 2905 address generate_cipherBlockChaining_encryptAESCrypt() { 2906 assert(UseAES, "need AES cryptographic extension support"); 2907 __ align(CodeEntryAlignment); 2908 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id; 2909 StubCodeMark mark(this, stub_id); 2910 2911 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2912 2913 const Register from = c_rarg0; // source array address 2914 const Register to = c_rarg1; // destination array address 2915 const Register key = c_rarg2; // key array address 2916 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2917 // and left with the results of the last encryption block 2918 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2919 const Register keylen = rscratch1; 2920 2921 address start = __ pc(); 2922 2923 __ enter(); 2924 2925 __ movw(rscratch2, len_reg); 2926 2927 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2928 2929 __ ld1(v0, __ T16B, rvec); 2930 2931 __ cmpw(keylen, 52); 2932 __ br(Assembler::CC, L_loadkeys_44); 2933 __ br(Assembler::EQ, L_loadkeys_52); 2934 2935 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2936 __ rev32(v17, __ T16B, v17); 2937 __ rev32(v18, __ T16B, v18); 2938 __ BIND(L_loadkeys_52); 2939 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2940 __ rev32(v19, __ T16B, v19); 2941 __ 
rev32(v20, __ T16B, v20); 2942 __ BIND(L_loadkeys_44); 2943 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2944 __ rev32(v21, __ T16B, v21); 2945 __ rev32(v22, __ T16B, v22); 2946 __ rev32(v23, __ T16B, v23); 2947 __ rev32(v24, __ T16B, v24); 2948 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2949 __ rev32(v25, __ T16B, v25); 2950 __ rev32(v26, __ T16B, v26); 2951 __ rev32(v27, __ T16B, v27); 2952 __ rev32(v28, __ T16B, v28); 2953 __ ld1(v29, v30, v31, __ T16B, key); 2954 __ rev32(v29, __ T16B, v29); 2955 __ rev32(v30, __ T16B, v30); 2956 __ rev32(v31, __ T16B, v31); 2957 2958 __ BIND(L_aes_loop); 2959 __ ld1(v1, __ T16B, __ post(from, 16)); 2960 __ eor(v0, __ T16B, v0, v1); 2961 2962 __ br(Assembler::CC, L_rounds_44); 2963 __ br(Assembler::EQ, L_rounds_52); 2964 2965 __ aese(v0, v17); __ aesmc(v0, v0); 2966 __ aese(v0, v18); __ aesmc(v0, v0); 2967 __ BIND(L_rounds_52); 2968 __ aese(v0, v19); __ aesmc(v0, v0); 2969 __ aese(v0, v20); __ aesmc(v0, v0); 2970 __ BIND(L_rounds_44); 2971 __ aese(v0, v21); __ aesmc(v0, v0); 2972 __ aese(v0, v22); __ aesmc(v0, v0); 2973 __ aese(v0, v23); __ aesmc(v0, v0); 2974 __ aese(v0, v24); __ aesmc(v0, v0); 2975 __ aese(v0, v25); __ aesmc(v0, v0); 2976 __ aese(v0, v26); __ aesmc(v0, v0); 2977 __ aese(v0, v27); __ aesmc(v0, v0); 2978 __ aese(v0, v28); __ aesmc(v0, v0); 2979 __ aese(v0, v29); __ aesmc(v0, v0); 2980 __ aese(v0, v30); 2981 __ eor(v0, __ T16B, v0, v31); 2982 2983 __ st1(v0, __ T16B, __ post(to, 16)); 2984 2985 __ subw(len_reg, len_reg, 16); 2986 __ cbnzw(len_reg, L_aes_loop); 2987 2988 __ st1(v0, __ T16B, rvec); 2989 2990 __ mov(r0, rscratch2); 2991 2992 __ leave(); 2993 __ ret(lr); 2994 2995 return start; 2996 } 2997 2998 // Arguments: 2999 // 3000 // Inputs: 3001 // c_rarg0 - source byte array address 3002 // c_rarg1 - destination byte array address 3003 // c_rarg2 - K (key) in little endian int array 3004 // c_rarg3 - r vector byte array address 3005 // c_rarg4 - input length 3006 // 3007 // Output: 3008 // r0 - input length 3009 // 3010 address generate_cipherBlockChaining_decryptAESCrypt() { 3011 assert(UseAES, "need AES cryptographic extension support"); 3012 __ align(CodeEntryAlignment); 3013 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id; 3014 StubCodeMark mark(this, stub_id); 3015 3016 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 3017 3018 const Register from = c_rarg0; // source array address 3019 const Register to = c_rarg1; // destination array address 3020 const Register key = c_rarg2; // key array address 3021 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3022 // and left with the results of the last encryption block 3023 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3024 const Register keylen = rscratch1; 3025 3026 address start = __ pc(); 3027 3028 __ enter(); 3029 3030 __ movw(rscratch2, len_reg); 3031 3032 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3033 3034 __ ld1(v2, __ T16B, rvec); 3035 3036 __ ld1(v31, __ T16B, __ post(key, 16)); 3037 __ rev32(v31, __ T16B, v31); 3038 3039 __ cmpw(keylen, 52); 3040 __ br(Assembler::CC, L_loadkeys_44); 3041 __ br(Assembler::EQ, L_loadkeys_52); 3042 3043 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 3044 __ rev32(v17, __ T16B, v17); 3045 __ rev32(v18, __ T16B, v18); 3046 __ BIND(L_loadkeys_52); 3047 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 3048 __ rev32(v19, __ T16B, 
v19); 3049 __ rev32(v20, __ T16B, v20); 3050 __ BIND(L_loadkeys_44); 3051 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 3052 __ rev32(v21, __ T16B, v21); 3053 __ rev32(v22, __ T16B, v22); 3054 __ rev32(v23, __ T16B, v23); 3055 __ rev32(v24, __ T16B, v24); 3056 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 3057 __ rev32(v25, __ T16B, v25); 3058 __ rev32(v26, __ T16B, v26); 3059 __ rev32(v27, __ T16B, v27); 3060 __ rev32(v28, __ T16B, v28); 3061 __ ld1(v29, v30, __ T16B, key); 3062 __ rev32(v29, __ T16B, v29); 3063 __ rev32(v30, __ T16B, v30); 3064 3065 __ BIND(L_aes_loop); 3066 __ ld1(v0, __ T16B, __ post(from, 16)); 3067 __ orr(v1, __ T16B, v0, v0); 3068 3069 __ br(Assembler::CC, L_rounds_44); 3070 __ br(Assembler::EQ, L_rounds_52); 3071 3072 __ aesd(v0, v17); __ aesimc(v0, v0); 3073 __ aesd(v0, v18); __ aesimc(v0, v0); 3074 __ BIND(L_rounds_52); 3075 __ aesd(v0, v19); __ aesimc(v0, v0); 3076 __ aesd(v0, v20); __ aesimc(v0, v0); 3077 __ BIND(L_rounds_44); 3078 __ aesd(v0, v21); __ aesimc(v0, v0); 3079 __ aesd(v0, v22); __ aesimc(v0, v0); 3080 __ aesd(v0, v23); __ aesimc(v0, v0); 3081 __ aesd(v0, v24); __ aesimc(v0, v0); 3082 __ aesd(v0, v25); __ aesimc(v0, v0); 3083 __ aesd(v0, v26); __ aesimc(v0, v0); 3084 __ aesd(v0, v27); __ aesimc(v0, v0); 3085 __ aesd(v0, v28); __ aesimc(v0, v0); 3086 __ aesd(v0, v29); __ aesimc(v0, v0); 3087 __ aesd(v0, v30); 3088 __ eor(v0, __ T16B, v0, v31); 3089 __ eor(v0, __ T16B, v0, v2); 3090 3091 __ st1(v0, __ T16B, __ post(to, 16)); 3092 __ orr(v2, __ T16B, v1, v1); 3093 3094 __ subw(len_reg, len_reg, 16); 3095 __ cbnzw(len_reg, L_aes_loop); 3096 3097 __ st1(v2, __ T16B, rvec); 3098 3099 __ mov(r0, rscratch2); 3100 3101 __ leave(); 3102 __ ret(lr); 3103 3104 return start; 3105 } 3106 3107 // Big-endian 128-bit + 64-bit -> 128-bit addition. 3108 // Inputs: 128-bits. in is preserved. 3109 // The least-significant 64-bit word is in the upper dword of each vector. 3110 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 3111 // Output: result 3112 void be_add_128_64(FloatRegister result, FloatRegister in, 3113 FloatRegister inc, FloatRegister tmp) { 3114 assert_different_registers(result, tmp, inc); 3115 3116 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 3117 // input 3118 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3119 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3120 // MSD == 0 (must be!) to LSD 3121 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3122 } 3123 3124 // CTR AES crypt. 
3125 // Arguments: 3126 // 3127 // Inputs: 3128 // c_rarg0 - source byte array address 3129 // c_rarg1 - destination byte array address 3130 // c_rarg2 - K (key) in little endian int array 3131 // c_rarg3 - counter vector byte array address 3132 // c_rarg4 - input length 3133 // c_rarg5 - saved encryptedCounter start 3134 // c_rarg6 - saved used length 3135 // 3136 // Output: 3137 // r0 - input length 3138 // 3139 address generate_counterMode_AESCrypt() { 3140 const Register in = c_rarg0; 3141 const Register out = c_rarg1; 3142 const Register key = c_rarg2; 3143 const Register counter = c_rarg3; 3144 const Register saved_len = c_rarg4, len = r10; 3145 const Register saved_encrypted_ctr = c_rarg5; 3146 const Register used_ptr = c_rarg6, used = r12; 3147 3148 const Register offset = r7; 3149 const Register keylen = r11; 3150 3151 const unsigned char block_size = 16; 3152 const int bulk_width = 4; 3153 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3154 // performance with larger data sizes, but it also means that the 3155 // fast path isn't used until you have at least 8 blocks, and up 3156 // to 127 bytes of data will be executed on the slow path. For 3157 // that reason, and also so as not to blow away too much icache, 4 3158 // blocks seems like a sensible compromise. 3159 3160 // Algorithm: 3161 // 3162 // if (len == 0) { 3163 // goto DONE; 3164 // } 3165 // int result = len; 3166 // do { 3167 // if (used >= blockSize) { 3168 // if (len >= bulk_width * blockSize) { 3169 // CTR_large_block(); 3170 // if (len == 0) 3171 // goto DONE; 3172 // } 3173 // for (;;) { 3174 // 16ByteVector v0 = counter; 3175 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3176 // used = 0; 3177 // if (len < blockSize) 3178 // break; /* goto NEXT */ 3179 // 16ByteVector v1 = load16Bytes(in, offset); 3180 // v1 = v1 ^ encryptedCounter; 3181 // store16Bytes(out, offset); 3182 // used = blockSize; 3183 // offset += blockSize; 3184 // len -= blockSize; 3185 // if (len == 0) 3186 // goto DONE; 3187 // } 3188 // } 3189 // NEXT: 3190 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3191 // len--; 3192 // } while (len != 0); 3193 // DONE: 3194 // return result; 3195 // 3196 // CTR_large_block() 3197 // Wide bulk encryption of whole blocks. 
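  //
  // Note on the counter arithmetic (illustrative): the 16-byte counter is kept
  // byte-reversed in a vector register, and be_add_128_64() performs a 128-bit
  // big-endian increment, conceptually
  //
  //   uint64_t lo = ctr_low64, hi = ctr_high64;   // big-endian halves
  //   lo += inc;
  //   if (lo < inc) hi += 1;                      // carry into the high half
  //
  // with the carry detected by an unsigned vector compare (see be_add_128_64
  // above).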
3198 3199 __ align(CodeEntryAlignment); 3200 StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id; 3201 StubCodeMark mark(this, stub_id); 3202 const address start = __ pc(); 3203 __ enter(); 3204 3205 Label DONE, CTR_large_block, large_block_return; 3206 __ ldrw(used, Address(used_ptr)); 3207 __ cbzw(saved_len, DONE); 3208 3209 __ mov(len, saved_len); 3210 __ mov(offset, 0); 3211 3212 // Compute #rounds for AES based on the length of the key array 3213 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3214 3215 __ aesenc_loadkeys(key, keylen); 3216 3217 { 3218 Label L_CTR_loop, NEXT; 3219 3220 __ bind(L_CTR_loop); 3221 3222 __ cmp(used, block_size); 3223 __ br(__ LO, NEXT); 3224 3225 // Maybe we have a lot of data 3226 __ subsw(rscratch1, len, bulk_width * block_size); 3227 __ br(__ HS, CTR_large_block); 3228 __ BIND(large_block_return); 3229 __ cbzw(len, DONE); 3230 3231 // Setup the counter 3232 __ movi(v4, __ T4S, 0); 3233 __ movi(v5, __ T4S, 1); 3234 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3235 3236 // 128-bit big-endian increment 3237 __ ld1(v0, __ T16B, counter); 3238 __ rev64(v16, __ T16B, v0); 3239 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3240 __ rev64(v16, __ T16B, v16); 3241 __ st1(v16, __ T16B, counter); 3242 // Previous counter value is in v0 3243 // v4 contains { 0, 1 } 3244 3245 { 3246 // We have fewer than bulk_width blocks of data left. Encrypt 3247 // them one by one until there is less than a full block 3248 // remaining, being careful to save both the encrypted counter 3249 // and the counter. 3250 3251 Label inner_loop; 3252 __ bind(inner_loop); 3253 // Counter to encrypt is in v0 3254 __ aesecb_encrypt(noreg, noreg, keylen); 3255 __ st1(v0, __ T16B, saved_encrypted_ctr); 3256 3257 // Do we have a remaining full block? 3258 3259 __ mov(used, 0); 3260 __ cmp(len, block_size); 3261 __ br(__ LO, NEXT); 3262 3263 // Yes, we have a full block 3264 __ ldrq(v1, Address(in, offset)); 3265 __ eor(v1, __ T16B, v1, v0); 3266 __ strq(v1, Address(out, offset)); 3267 __ mov(used, block_size); 3268 __ add(offset, offset, block_size); 3269 3270 __ subw(len, len, block_size); 3271 __ cbzw(len, DONE); 3272 3273 // Increment the counter, store it back 3274 __ orr(v0, __ T16B, v16, v16); 3275 __ rev64(v16, __ T16B, v16); 3276 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3277 __ rev64(v16, __ T16B, v16); 3278 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3279 3280 __ b(inner_loop); 3281 } 3282 3283 __ BIND(NEXT); 3284 3285 // Encrypt a single byte, and loop. 3286 // We expect this to be a rare event. 
3287 __ ldrb(rscratch1, Address(in, offset)); 3288 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3289 __ eor(rscratch1, rscratch1, rscratch2); 3290 __ strb(rscratch1, Address(out, offset)); 3291 __ add(offset, offset, 1); 3292 __ add(used, used, 1); 3293 __ subw(len, len,1); 3294 __ cbnzw(len, L_CTR_loop); 3295 } 3296 3297 __ bind(DONE); 3298 __ strw(used, Address(used_ptr)); 3299 __ mov(r0, saved_len); 3300 3301 __ leave(); // required for proper stackwalking of RuntimeStub frame 3302 __ ret(lr); 3303 3304 // Bulk encryption 3305 3306 __ BIND (CTR_large_block); 3307 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3308 3309 if (bulk_width == 8) { 3310 __ sub(sp, sp, 4 * 16); 3311 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3312 } 3313 __ sub(sp, sp, 4 * 16); 3314 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3315 RegSet saved_regs = (RegSet::of(in, out, offset) 3316 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3317 __ push(saved_regs, sp); 3318 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3319 __ add(in, in, offset); 3320 __ add(out, out, offset); 3321 3322 // Keys should already be loaded into the correct registers 3323 3324 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3325 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3326 3327 // AES/CTR loop 3328 { 3329 Label L_CTR_loop; 3330 __ BIND(L_CTR_loop); 3331 3332 // Setup the counters 3333 __ movi(v8, __ T4S, 0); 3334 __ movi(v9, __ T4S, 1); 3335 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3336 3337 for (int i = 0; i < bulk_width; i++) { 3338 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3339 __ rev64(v0_ofs, __ T16B, v16); 3340 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3341 } 3342 3343 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3344 3345 // Encrypt the counters 3346 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3347 3348 if (bulk_width == 8) { 3349 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3350 } 3351 3352 // XOR the encrypted counters with the inputs 3353 for (int i = 0; i < bulk_width; i++) { 3354 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3355 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3356 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3357 } 3358 3359 // Write the encrypted data 3360 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3361 if (bulk_width == 8) { 3362 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3363 } 3364 3365 __ subw(len, len, 16 * bulk_width); 3366 __ cbnzw(len, L_CTR_loop); 3367 } 3368 3369 // Save the counter back where it goes 3370 __ rev64(v16, __ T16B, v16); 3371 __ st1(v16, __ T16B, counter); 3372 3373 __ pop(saved_regs, sp); 3374 3375 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3376 if (bulk_width == 8) { 3377 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3378 } 3379 3380 __ andr(rscratch1, len, -16 * bulk_width); 3381 __ sub(len, len, rscratch1); 3382 __ add(offset, offset, rscratch1); 3383 __ mov(used, 16); 3384 __ strw(used, Address(used_ptr)); 3385 __ b(large_block_return); 3386 3387 return start; 3388 } 3389 3390 // Vector AES Galois Counter Mode implementation. 
Parameters: 3391 // 3392 // in = c_rarg0 3393 // len = c_rarg1 3394 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3395 // out = c_rarg3 3396 // key = c_rarg4 3397 // state = c_rarg5 - GHASH.state 3398 // subkeyHtbl = c_rarg6 - powers of H 3399 // counter = c_rarg7 - 16 bytes of CTR 3400 // return - number of processed bytes 3401 address generate_galoisCounterMode_AESCrypt() { 3402 address ghash_polynomial = __ pc(); 3403 __ emit_int64(0x87); // The low-order bits of the field 3404 // polynomial (i.e. p = z^7+z^2+z+1) 3405 // repeated in the low and high parts of a 3406 // 128-bit vector 3407 __ emit_int64(0x87); 3408 3409 __ align(CodeEntryAlignment); 3410 StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id; 3411 StubCodeMark mark(this, stub_id); 3412 address start = __ pc(); 3413 __ enter(); 3414 3415 const Register in = c_rarg0; 3416 const Register len = c_rarg1; 3417 const Register ct = c_rarg2; 3418 const Register out = c_rarg3; 3419 // and updated with the incremented counter in the end 3420 3421 const Register key = c_rarg4; 3422 const Register state = c_rarg5; 3423 3424 const Register subkeyHtbl = c_rarg6; 3425 3426 const Register counter = c_rarg7; 3427 3428 const Register keylen = r10; 3429 // Save state before entering routine 3430 __ sub(sp, sp, 4 * 16); 3431 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3432 __ sub(sp, sp, 4 * 16); 3433 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3434 3435 // __ andr(len, len, -512); 3436 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3437 __ str(len, __ pre(sp, -2 * wordSize)); 3438 3439 Label DONE; 3440 __ cbz(len, DONE); 3441 3442 // Compute #rounds for AES based on the length of the key array 3443 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3444 3445 __ aesenc_loadkeys(key, keylen); 3446 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3447 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3448 3449 // AES/CTR loop 3450 { 3451 Label L_CTR_loop; 3452 __ BIND(L_CTR_loop); 3453 3454 // Setup the counters 3455 __ movi(v8, __ T4S, 0); 3456 __ movi(v9, __ T4S, 1); 3457 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3458 3459 assert(v0->encoding() < v8->encoding(), ""); 3460 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3461 FloatRegister f = as_FloatRegister(i); 3462 __ rev32(f, __ T16B, v16); 3463 __ addv(v16, __ T4S, v16, v8); 3464 } 3465 3466 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3467 3468 // Encrypt the counters 3469 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3470 3471 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3472 3473 // XOR the encrypted counters with the inputs 3474 for (int i = 0; i < 8; i++) { 3475 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3476 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3477 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3478 } 3479 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3480 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3481 3482 __ subw(len, len, 16 * 8); 3483 __ cbnzw(len, L_CTR_loop); 3484 } 3485 3486 __ rev32(v16, __ T16B, v16); 3487 __ st1(v16, __ T16B, counter); 3488 3489 __ ldr(len, Address(sp)); 3490 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3491 3492 // GHASH/CTR loop 3493 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3494 len, /*unrolls*/4); 3495 3496 
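    // Worked example (illustrative): for len == 1000 the mask of -16 * 8 == -128
    // applied on entry rounds the length down to 896 == 56 * 16, so 56 counter
    // blocks are encrypted and ghash-ed above and the stub reports 896 processed
    // bytes; the caller sees from the return value how much input remains.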
#ifdef ASSERT 3497 { Label L; 3498 __ cmp(len, (unsigned char)0); 3499 __ br(Assembler::EQ, L); 3500 __ stop("stubGenerator: abort"); 3501 __ bind(L); 3502 } 3503 #endif 3504 3505 __ bind(DONE); 3506 // Return the number of bytes processed 3507 __ ldr(r0, __ post(sp, 2 * wordSize)); 3508 3509 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3510 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3511 3512 __ leave(); // required for proper stackwalking of RuntimeStub frame 3513 __ ret(lr); 3514 return start; 3515 } 3516 3517 class Cached64Bytes { 3518 private: 3519 MacroAssembler *_masm; 3520 Register _regs[8]; 3521 3522 public: 3523 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3524 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3525 auto it = rs.begin(); 3526 for (auto &r: _regs) { 3527 r = *it; 3528 ++it; 3529 } 3530 } 3531 3532 void gen_loads(Register base) { 3533 for (int i = 0; i < 8; i += 2) { 3534 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3535 } 3536 } 3537 3538 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3539 void extract_u32(Register dest, int i) { 3540 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3541 } 3542 }; 3543 3544 // Utility routines for md5. 3545 // Clobbers r10 and r11. 3546 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3547 int k, int s, int t) { 3548 Register rscratch3 = r10; 3549 Register rscratch4 = r11; 3550 3551 __ eorw(rscratch3, r3, r4); 3552 __ movw(rscratch2, t); 3553 __ andw(rscratch3, rscratch3, r2); 3554 __ addw(rscratch4, r1, rscratch2); 3555 reg_cache.extract_u32(rscratch1, k); 3556 __ eorw(rscratch3, rscratch3, r4); 3557 __ addw(rscratch4, rscratch4, rscratch1); 3558 __ addw(rscratch3, rscratch3, rscratch4); 3559 __ rorw(rscratch2, rscratch3, 32 - s); 3560 __ addw(r1, rscratch2, r2); 3561 } 3562 3563 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3564 int k, int s, int t) { 3565 Register rscratch3 = r10; 3566 Register rscratch4 = r11; 3567 3568 reg_cache.extract_u32(rscratch1, k); 3569 __ movw(rscratch2, t); 3570 __ addw(rscratch4, r1, rscratch2); 3571 __ addw(rscratch4, rscratch4, rscratch1); 3572 __ bicw(rscratch2, r3, r4); 3573 __ andw(rscratch3, r2, r4); 3574 __ addw(rscratch2, rscratch2, rscratch4); 3575 __ addw(rscratch2, rscratch2, rscratch3); 3576 __ rorw(rscratch2, rscratch2, 32 - s); 3577 __ addw(r1, rscratch2, r2); 3578 } 3579 3580 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3581 int k, int s, int t) { 3582 Register rscratch3 = r10; 3583 Register rscratch4 = r11; 3584 3585 __ eorw(rscratch3, r3, r4); 3586 __ movw(rscratch2, t); 3587 __ addw(rscratch4, r1, rscratch2); 3588 reg_cache.extract_u32(rscratch1, k); 3589 __ eorw(rscratch3, rscratch3, r2); 3590 __ addw(rscratch4, rscratch4, rscratch1); 3591 __ addw(rscratch3, rscratch3, rscratch4); 3592 __ rorw(rscratch2, rscratch3, 32 - s); 3593 __ addw(r1, rscratch2, r2); 3594 } 3595 3596 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3597 int k, int s, int t) { 3598 Register rscratch3 = r10; 3599 Register rscratch4 = r11; 3600 3601 __ movw(rscratch3, t); 3602 __ ornw(rscratch2, r2, r4); 3603 __ addw(rscratch4, r1, rscratch3); 3604 reg_cache.extract_u32(rscratch1, k); 3605 __ eorw(rscratch3, rscratch2, r3); 3606 __ addw(rscratch4, rscratch4, rscratch1); 3607 __ addw(rscratch3, rscratch3, rscratch4); 3608 __ 
rorw(rscratch2, rscratch3, 32 - s); 3609 __ addw(r1, rscratch2, r2); 3610 } 3611 3612 // Arguments: 3613 // 3614 // Inputs: 3615 // c_rarg0 - byte[] source+offset 3616 // c_rarg1 - int[] SHA.state 3617 // c_rarg2 - int offset 3618 // c_rarg3 - int limit 3619 // 3620 address generate_md5_implCompress(StubGenStubId stub_id) { 3621 bool multi_block; 3622 switch (stub_id) { 3623 case md5_implCompress_id: 3624 multi_block = false; 3625 break; 3626 case md5_implCompressMB_id: 3627 multi_block = true; 3628 break; 3629 default: 3630 ShouldNotReachHere(); 3631 } 3632 __ align(CodeEntryAlignment); 3633 3634 StubCodeMark mark(this, stub_id); 3635 address start = __ pc(); 3636 3637 Register buf = c_rarg0; 3638 Register state = c_rarg1; 3639 Register ofs = c_rarg2; 3640 Register limit = c_rarg3; 3641 Register a = r4; 3642 Register b = r5; 3643 Register c = r6; 3644 Register d = r7; 3645 Register rscratch3 = r10; 3646 Register rscratch4 = r11; 3647 3648 Register state_regs[2] = { r12, r13 }; 3649 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3650 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3651 3652 __ push(saved_regs, sp); 3653 3654 __ ldp(state_regs[0], state_regs[1], Address(state)); 3655 __ ubfx(a, state_regs[0], 0, 32); 3656 __ ubfx(b, state_regs[0], 32, 32); 3657 __ ubfx(c, state_regs[1], 0, 32); 3658 __ ubfx(d, state_regs[1], 32, 32); 3659 3660 Label md5_loop; 3661 __ BIND(md5_loop); 3662 3663 reg_cache.gen_loads(buf); 3664 3665 // Round 1 3666 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3667 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3668 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3669 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3670 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3671 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3672 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3673 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3674 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3675 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3676 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3677 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3678 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3679 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3680 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3681 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3682 3683 // Round 2 3684 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3685 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3686 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3687 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3688 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3689 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3690 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3691 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3692 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3693 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3694 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3695 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3696 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3697 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3698 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3699 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3700 3701 // Round 3 3702 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3703 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3704 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3705 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3706 
md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3707 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3708 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3709 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3710 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3711 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3712 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3713 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3714 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3715 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3716 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3717 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3718 3719 // Round 4 3720 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3721 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3722 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3723 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3724 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3725 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3726 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3727 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3728 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3729 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3730 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3731 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3732 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3733 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3734 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3735 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3736 3737 __ addw(a, state_regs[0], a); 3738 __ ubfx(rscratch2, state_regs[0], 32, 32); 3739 __ addw(b, rscratch2, b); 3740 __ addw(c, state_regs[1], c); 3741 __ ubfx(rscratch4, state_regs[1], 32, 32); 3742 __ addw(d, rscratch4, d); 3743 3744 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3745 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3746 3747 if (multi_block) { 3748 __ add(buf, buf, 64); 3749 __ add(ofs, ofs, 64); 3750 __ cmp(ofs, limit); 3751 __ br(Assembler::LE, md5_loop); 3752 __ mov(c_rarg0, ofs); // return ofs 3753 } 3754 3755 // write hash values back in the correct order 3756 __ stp(state_regs[0], state_regs[1], Address(state)); 3757 3758 __ pop(saved_regs, sp); 3759 3760 __ ret(lr); 3761 3762 return start; 3763 } 3764 3765 // Arguments: 3766 // 3767 // Inputs: 3768 // c_rarg0 - byte[] source+offset 3769 // c_rarg1 - int[] SHA.state 3770 // c_rarg2 - int offset 3771 // c_rarg3 - int limit 3772 // 3773 address generate_sha1_implCompress(StubGenStubId stub_id) { 3774 bool multi_block; 3775 switch (stub_id) { 3776 case sha1_implCompress_id: 3777 multi_block = false; 3778 break; 3779 case sha1_implCompressMB_id: 3780 multi_block = true; 3781 break; 3782 default: 3783 ShouldNotReachHere(); 3784 } 3785 3786 __ align(CodeEntryAlignment); 3787 3788 StubCodeMark mark(this, stub_id); 3789 address start = __ pc(); 3790 3791 Register buf = c_rarg0; 3792 Register state = c_rarg1; 3793 Register ofs = c_rarg2; 3794 Register limit = c_rarg3; 3795 3796 Label keys; 3797 Label sha1_loop; 3798 3799 // load the keys into v0..v3 3800 __ adr(rscratch1, keys); 3801 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3802 // load 5 words state into v6, v7 3803 __ ldrq(v6, Address(state, 0)); 3804 __ ldrs(v7, Address(state, 16)); 3805 3806 3807 __ BIND(sha1_loop); 3808 // load 64 bytes of data into v16..v19 3809 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3810 __ rev32(v16, __ T16B, v16); 3811 __ rev32(v17, __ T16B, v17); 3812 __ rev32(v18, __ T16B, v18); 3813 __ rev32(v19, __ T16B, v19); 3814 3815 // do the sha1 3816 __ addv(v4, __ T4S, v16, v0); 3817 __ orr(v20, __ T16B, v6, v6); 3818 3819 FloatRegister d0 = v16; 3820 FloatRegister d1 = v17; 3821 FloatRegister d2 = v18; 3822 FloatRegister d3 = v19; 3823 3824 for (int round = 0; round < 20; round++) { 3825 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3826 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3827 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3828 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3829 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3830 3831 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3832 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3833 __ sha1h(tmp2, __ T4S, v20); 3834 if (round < 5) 3835 __ sha1c(v20, __ T4S, tmp3, tmp4); 3836 else if (round < 10 || round >= 15) 3837 __ sha1p(v20, __ T4S, tmp3, tmp4); 3838 else 3839 __ sha1m(v20, __ T4S, tmp3, tmp4); 3840 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3841 3842 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3843 } 3844 3845 __ addv(v7, __ T2S, v7, v21); 3846 __ addv(v6, __ T4S, v6, v20); 3847 3848 if (multi_block) { 3849 __ add(ofs, ofs, 64); 3850 __ cmp(ofs, limit); 3851 __ br(Assembler::LE, sha1_loop); 3852 __ mov(c_rarg0, ofs); // return ofs 3853 } 3854 3855 __ strq(v6, Address(state, 0)); 3856 __ strs(v7, Address(state, 16)); 3857 3858 __ ret(lr); 3859 3860 __ bind(keys); 3861 __ emit_int32(0x5a827999); 3862 __ emit_int32(0x6ed9eba1); 3863 __ emit_int32(0x8f1bbcdc); 3864 __ emit_int32(0xca62c1d6); 3865 3866 return start; 3867 } 3868 3869 3870 // Arguments: 3871 // 3872 // Inputs: 3873 // c_rarg0 - byte[] source+offset 3874 // c_rarg1 - int[] SHA.state 3875 // c_rarg2 - int offset 3876 // c_rarg3 - int limit 3877 // 3878 address generate_sha256_implCompress(StubGenStubId stub_id) { 3879 bool multi_block; 3880 switch (stub_id) { 3881 case sha256_implCompress_id: 3882 multi_block = false; 3883 break; 3884 case sha256_implCompressMB_id: 3885 multi_block = true; 3886 break; 3887 default: 3888 ShouldNotReachHere(); 3889 } 3890 3891 static const uint32_t round_consts[64] = { 3892 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3893 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3894 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3895 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3896 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3897 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3898 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3899 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3900 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3901 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3902 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3903 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3904 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3905 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3906 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3907 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3908 }; 3909 3910 __ align(CodeEntryAlignment); 3911 3912 StubCodeMark mark(this, stub_id); 3913 address start = __ pc(); 3914 3915 Register buf = c_rarg0; 3916 Register state = c_rarg1; 3917 Register ofs = c_rarg2; 3918 Register limit = c_rarg3; 3919 3920 Label sha1_loop; 3921 3922 __ stpd(v8, v9, __ pre(sp, -32)); 3923 __ stpd(v10, v11, Address(sp, 16)); 3924 3925 // dga == v0 3926 // dgb == v1 3927 // dg0 == v2 3928 // dg1 == v3 3929 
// dg2 == v4 3930 // t0 == v6 3931 // t1 == v7 3932 3933 // load 16 keys to v16..v31 3934 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3935 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3936 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3937 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3938 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3939 3940 // load 8 words (256 bits) state 3941 __ ldpq(v0, v1, state); 3942 3943 __ BIND(sha1_loop); 3944 // load 64 bytes of data into v8..v11 3945 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3946 __ rev32(v8, __ T16B, v8); 3947 __ rev32(v9, __ T16B, v9); 3948 __ rev32(v10, __ T16B, v10); 3949 __ rev32(v11, __ T16B, v11); 3950 3951 __ addv(v6, __ T4S, v8, v16); 3952 __ orr(v2, __ T16B, v0, v0); 3953 __ orr(v3, __ T16B, v1, v1); 3954 3955 FloatRegister d0 = v8; 3956 FloatRegister d1 = v9; 3957 FloatRegister d2 = v10; 3958 FloatRegister d3 = v11; 3959 3960 3961 for (int round = 0; round < 16; round++) { 3962 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3963 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3964 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3965 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3966 3967 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3968 __ orr(v4, __ T16B, v2, v2); 3969 if (round < 15) 3970 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3971 __ sha256h(v2, __ T4S, v3, tmp2); 3972 __ sha256h2(v3, __ T4S, v4, tmp2); 3973 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3974 3975 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3976 } 3977 3978 __ addv(v0, __ T4S, v0, v2); 3979 __ addv(v1, __ T4S, v1, v3); 3980 3981 if (multi_block) { 3982 __ add(ofs, ofs, 64); 3983 __ cmp(ofs, limit); 3984 __ br(Assembler::LE, sha1_loop); 3985 __ mov(c_rarg0, ofs); // return ofs 3986 } 3987 3988 __ ldpd(v10, v11, Address(sp, 16)); 3989 __ ldpd(v8, v9, __ post(sp, 32)); 3990 3991 __ stpq(v0, v1, state); 3992 3993 __ ret(lr); 3994 3995 return start; 3996 } 3997 3998 // Double rounds for sha512. 
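  // Each call below emits two of the 80 SHA-512 rounds using the SHA512
  // crypto extension instructions (sha512h/sha512h2). For double-rounds
  // below 32 the message schedule is also extended via sha512su0/sha512su1,
  // and for double-rounds below 36 the next pair of round constants is
  // preloaded through the table pointer in rscratch2.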
3999 void sha512_dround(int dr, 4000 FloatRegister vi0, FloatRegister vi1, 4001 FloatRegister vi2, FloatRegister vi3, 4002 FloatRegister vi4, FloatRegister vrc0, 4003 FloatRegister vrc1, FloatRegister vin0, 4004 FloatRegister vin1, FloatRegister vin2, 4005 FloatRegister vin3, FloatRegister vin4) { 4006 if (dr < 36) { 4007 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 4008 } 4009 __ addv(v5, __ T2D, vrc0, vin0); 4010 __ ext(v6, __ T16B, vi2, vi3, 8); 4011 __ ext(v5, __ T16B, v5, v5, 8); 4012 __ ext(v7, __ T16B, vi1, vi2, 8); 4013 __ addv(vi3, __ T2D, vi3, v5); 4014 if (dr < 32) { 4015 __ ext(v5, __ T16B, vin3, vin4, 8); 4016 __ sha512su0(vin0, __ T2D, vin1); 4017 } 4018 __ sha512h(vi3, __ T2D, v6, v7); 4019 if (dr < 32) { 4020 __ sha512su1(vin0, __ T2D, vin2, v5); 4021 } 4022 __ addv(vi4, __ T2D, vi1, vi3); 4023 __ sha512h2(vi3, __ T2D, vi1, vi0); 4024 } 4025 4026 // Arguments: 4027 // 4028 // Inputs: 4029 // c_rarg0 - byte[] source+offset 4030 // c_rarg1 - int[] SHA.state 4031 // c_rarg2 - int offset 4032 // c_rarg3 - int limit 4033 // 4034 address generate_sha512_implCompress(StubGenStubId stub_id) { 4035 bool multi_block; 4036 switch (stub_id) { 4037 case sha512_implCompress_id: 4038 multi_block = false; 4039 break; 4040 case sha512_implCompressMB_id: 4041 multi_block = true; 4042 break; 4043 default: 4044 ShouldNotReachHere(); 4045 } 4046 4047 static const uint64_t round_consts[80] = { 4048 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 4049 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 4050 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 4051 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 4052 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 4053 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 4054 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 4055 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 4056 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 4057 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 4058 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 4059 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 4060 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 4061 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 4062 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 4063 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 4064 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 4065 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 4066 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 4067 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 4068 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 4069 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 4070 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 4071 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 4072 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 4073 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 4074 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 4075 }; 4076 4077 __ align(CodeEntryAlignment); 4078 4079 StubCodeMark mark(this, stub_id); 4080 address start = __ pc(); 4081 4082 Register buf = c_rarg0; 4083 Register state = c_rarg1; 4084 Register ofs = c_rarg2; 4085 Register limit = c_rarg3; 4086 4087 __ stpd(v8, v9, __ pre(sp, -64)); 4088 __ stpd(v10, v11, Address(sp, 
16)); 4089 __ stpd(v12, v13, Address(sp, 32)); 4090 __ stpd(v14, v15, Address(sp, 48)); 4091 4092 Label sha512_loop; 4093 4094 // load state 4095 __ ld1(v8, v9, v10, v11, __ T2D, state); 4096 4097 // load first 4 round constants 4098 __ lea(rscratch1, ExternalAddress((address)round_consts)); 4099 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 4100 4101 __ BIND(sha512_loop); 4102 // load 128B of data into v12..v19 4103 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 4104 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 4105 __ rev64(v12, __ T16B, v12); 4106 __ rev64(v13, __ T16B, v13); 4107 __ rev64(v14, __ T16B, v14); 4108 __ rev64(v15, __ T16B, v15); 4109 __ rev64(v16, __ T16B, v16); 4110 __ rev64(v17, __ T16B, v17); 4111 __ rev64(v18, __ T16B, v18); 4112 __ rev64(v19, __ T16B, v19); 4113 4114 __ mov(rscratch2, rscratch1); 4115 4116 __ mov(v0, __ T16B, v8); 4117 __ mov(v1, __ T16B, v9); 4118 __ mov(v2, __ T16B, v10); 4119 __ mov(v3, __ T16B, v11); 4120 4121 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4122 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4123 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4124 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4125 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4126 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4127 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4128 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4129 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4130 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4131 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4132 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4133 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4134 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 4135 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4136 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4137 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4138 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4139 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4140 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4141 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4142 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4143 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4144 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4145 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4146 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4147 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4148 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4149 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4150 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4151 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4152 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4153 sha512_dround(32, v2, v3, v1, 
v4, v0, v28, v24, v12, v0, v0, v0, v0);
    sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
    sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
    sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
    sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
    sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
    sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
    sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);

    __ addv(v8, __ T2D, v8, v0);
    __ addv(v9, __ T2D, v9, v1);
    __ addv(v10, __ T2D, v10, v2);
    __ addv(v11, __ T2D, v11, v3);

    if (multi_block) {
      __ add(ofs, ofs, 128);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha512_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ st1(v8, v9, v10, v11, __ T2D, state);

    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ ret(lr);

    return start;
  }

  // Execute one round of Keccak for two computations in parallel.
  // One of the states should be loaded into the lower halves of
  // the vector registers v0-v24, the other should be loaded into
  // the upper halves of those registers. The ld1r instruction loads
  // the round constant into both halves of register v31.
  // Intermediate results c0...c4 and d0...d4 are computed
  // in registers v25...v30.
  // All vector instructions that are used operate on both register
  // halves in parallel.
  // If only a single computation is needed, it suffices to load only
  // the lower halves.
  void keccak_round(Register rscratch1) {
    __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
    __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
    __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
    __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
    __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
    __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
    __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
    __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
    __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
    __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22

    __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
    __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
    __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
    __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
    __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)

    __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
    __ xar(v29, __ T2D, v1, v25, (64 - 1));   // a10' = rol((a1^d1), 1)
    __ xar(v1, __ T2D, v6, v25, (64 - 44));   // a1 = rol((a6^d1), 44)
    __ xar(v6, __ T2D, v9, v28, (64 - 20));   // a6 = rol((a9^d4), 20)
    __ xar(v9, __ T2D, v22, v26, (64 - 61));  // a9 = rol((a22^d2), 61)
    __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
    __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
    __ xar(v31, __ T2D, v2, v26, (64 - 62));  // a20' = rol((a2^d2), 62)
    __ xar(v2, __ T2D, v12, v26, (64 - 43));  // a2 = rol((a12^d2), 43)
    __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
    __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
    __ xar(v19, __ T2D, v23,
v27, (64 - 56)); // a19 = rol((a23^d3), 56) 4226 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41) 4227 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27) 4228 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14) 4229 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2) 4230 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55) 4231 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45) 4232 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36) 4233 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28) 4234 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21) 4235 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15) 4236 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10) 4237 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6) 4238 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3) 4239 4240 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22') 4241 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23) 4242 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24) 4243 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20') 4244 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21') 4245 4246 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i] 4247 4248 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19) 4249 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15') 4250 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16) 4251 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17') 4252 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18') 4253 4254 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4255 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4256 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4257 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4258 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4259 4260 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4261 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4262 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4263 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4264 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4265 4266 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4267 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4268 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4269 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4270 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4271 4272 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4273 } 4274 4275 // Arguments: 4276 // 4277 // Inputs: 4278 // c_rarg0 - byte[] source+offset 4279 // c_rarg1 - byte[] SHA.state 4280 // c_rarg2 - int block_size 4281 // c_rarg3 - int offset 4282 // c_rarg4 - int limit 4283 // 4284 address generate_sha3_implCompress(StubGenStubId stub_id) { 4285 bool multi_block; 4286 switch (stub_id) { 4287 case sha3_implCompress_id: 4288 multi_block = false; 4289 break; 4290 case sha3_implCompressMB_id: 4291 multi_block = true; 4292 break; 4293 default: 4294 ShouldNotReachHere(); 4295 } 4296 4297 static const uint64_t round_consts[24] = { 4298 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4299 0x8000000080008000L, 
0x000000000000808BL, 0x0000000080000001L, 4300 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4301 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4302 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4303 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4304 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4305 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4306 }; 4307 4308 __ align(CodeEntryAlignment); 4309 4310 StubCodeMark mark(this, stub_id); 4311 address start = __ pc(); 4312 4313 Register buf = c_rarg0; 4314 Register state = c_rarg1; 4315 Register block_size = c_rarg2; 4316 Register ofs = c_rarg3; 4317 Register limit = c_rarg4; 4318 4319 Label sha3_loop, rounds24_loop; 4320 Label sha3_512_or_sha3_384, shake128; 4321 4322 __ stpd(v8, v9, __ pre(sp, -64)); 4323 __ stpd(v10, v11, Address(sp, 16)); 4324 __ stpd(v12, v13, Address(sp, 32)); 4325 __ stpd(v14, v15, Address(sp, 48)); 4326 4327 // load state 4328 __ add(rscratch1, state, 32); 4329 __ ld1(v0, v1, v2, v3, __ T1D, state); 4330 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4331 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4332 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4333 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4334 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4335 __ ld1(v24, __ T1D, rscratch1); 4336 4337 __ BIND(sha3_loop); 4338 4339 // 24 keccak rounds 4340 __ movw(rscratch2, 24); 4341 4342 // load round_constants base 4343 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4344 4345 // load input 4346 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4347 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4348 __ eor(v0, __ T8B, v0, v25); 4349 __ eor(v1, __ T8B, v1, v26); 4350 __ eor(v2, __ T8B, v2, v27); 4351 __ eor(v3, __ T8B, v3, v28); 4352 __ eor(v4, __ T8B, v4, v29); 4353 __ eor(v5, __ T8B, v5, v30); 4354 __ eor(v6, __ T8B, v6, v31); 4355 4356 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4357 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4358 4359 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4360 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4361 __ eor(v7, __ T8B, v7, v25); 4362 __ eor(v8, __ T8B, v8, v26); 4363 __ eor(v9, __ T8B, v9, v27); 4364 __ eor(v10, __ T8B, v10, v28); 4365 __ eor(v11, __ T8B, v11, v29); 4366 __ eor(v12, __ T8B, v12, v30); 4367 __ eor(v13, __ T8B, v13, v31); 4368 4369 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4370 __ eor(v14, __ T8B, v14, v25); 4371 __ eor(v15, __ T8B, v15, v26); 4372 __ eor(v16, __ T8B, v16, v27); 4373 4374 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4375 __ andw(c_rarg5, block_size, 48); 4376 __ cbzw(c_rarg5, rounds24_loop); 4377 4378 __ tbnz(block_size, 5, shake128); 4379 // block_size == 144, bit5 == 0, SHA3-224 4380 __ ldrd(v28, __ post(buf, 8)); 4381 __ eor(v17, __ T8B, v17, v28); 4382 __ b(rounds24_loop); 4383 4384 __ BIND(shake128); 4385 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4386 __ eor(v17, __ T8B, v17, v28); 4387 __ eor(v18, __ T8B, v18, v29); 4388 __ eor(v19, __ T8B, v19, v30); 4389 __ eor(v20, __ T8B, v20, v31); 4390 __ b(rounds24_loop); // block_size == 168, SHAKE128 4391 4392 __ BIND(sha3_512_or_sha3_384); 4393 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4394 __ eor(v7, __ T8B, v7, v25); 4395 __ eor(v8, __ T8B, v8, v26); 4396 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4397 4398 // SHA3-384 4399 __ ld1(v27, v28, 
v29, v30, __ T8B, __ post(buf, 32)); 4400 __ eor(v9, __ T8B, v9, v27); 4401 __ eor(v10, __ T8B, v10, v28); 4402 __ eor(v11, __ T8B, v11, v29); 4403 __ eor(v12, __ T8B, v12, v30); 4404 4405 __ BIND(rounds24_loop); 4406 __ subw(rscratch2, rscratch2, 1); 4407 4408 keccak_round(rscratch1); 4409 4410 __ cbnzw(rscratch2, rounds24_loop); 4411 4412 if (multi_block) { 4413 __ add(ofs, ofs, block_size); 4414 __ cmp(ofs, limit); 4415 __ br(Assembler::LE, sha3_loop); 4416 __ mov(c_rarg0, ofs); // return ofs 4417 } 4418 4419 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4420 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4421 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4422 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4423 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4424 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4425 __ st1(v24, __ T1D, state); 4426 4427 // restore callee-saved registers 4428 __ ldpd(v14, v15, Address(sp, 48)); 4429 __ ldpd(v12, v13, Address(sp, 32)); 4430 __ ldpd(v10, v11, Address(sp, 16)); 4431 __ ldpd(v8, v9, __ post(sp, 64)); 4432 4433 __ ret(lr); 4434 4435 return start; 4436 } 4437 4438 // Inputs: 4439 // c_rarg0 - long[] state0 4440 // c_rarg1 - long[] state1 4441 address generate_double_keccak() { 4442 static const uint64_t round_consts[24] = { 4443 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4444 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4445 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4446 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4447 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4448 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4449 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4450 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4451 }; 4452 4453 // Implements the double_keccak() method of the 4454 // sun.secyrity.provider.SHA3Parallel class 4455 __ align(CodeEntryAlignment); 4456 StubCodeMark mark(this, "StubRoutines", "double_keccak"); 4457 address start = __ pc(); 4458 __ enter(); 4459 4460 Register state0 = c_rarg0; 4461 Register state1 = c_rarg1; 4462 4463 Label rounds24_loop; 4464 4465 // save callee-saved registers 4466 __ stpd(v8, v9, __ pre(sp, -64)); 4467 __ stpd(v10, v11, Address(sp, 16)); 4468 __ stpd(v12, v13, Address(sp, 32)); 4469 __ stpd(v14, v15, Address(sp, 48)); 4470 4471 // load states 4472 __ add(rscratch1, state0, 32); 4473 __ ld4(v0, v1, v2, v3, __ D, 0, state0); 4474 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32)); 4475 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32)); 4476 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32)); 4477 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32)); 4478 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32)); 4479 __ ld1(v24, __ D, 0, rscratch1); 4480 __ add(rscratch1, state1, 32); 4481 __ ld4(v0, v1, v2, v3, __ D, 1, state1); 4482 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32)); 4483 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32)); 4484 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32)); 4485 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32)); 4486 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32)); 4487 __ ld1(v24, __ D, 1, rscratch1); 4488 4489 // 24 keccak rounds 4490 __ movw(rscratch2, 24); 4491 4492 // load round_constants base 4493 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4494 4495 __ BIND(rounds24_loop); 4496 
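    // Each iteration applies one Keccak-f[1600] round to both states at
    // once: state0 lives in doubleword lane 0 and state1 in lane 1 of
    // v0..v24, and keccak_round operates on both 64-bit halves in
    // parallel. rscratch2 counts the 24 rounds down to zero.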
__ subw(rscratch2, rscratch2, 1); 4497 keccak_round(rscratch1); 4498 __ cbnzw(rscratch2, rounds24_loop); 4499 4500 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32)); 4501 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32)); 4502 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32)); 4503 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32)); 4504 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32)); 4505 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32)); 4506 __ st1(v24, __ D, 0, state0); 4507 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32)); 4508 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32)); 4509 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32)); 4510 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32)); 4511 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32)); 4512 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32)); 4513 __ st1(v24, __ D, 1, state1); 4514 4515 // restore callee-saved vector registers 4516 __ ldpd(v14, v15, Address(sp, 48)); 4517 __ ldpd(v12, v13, Address(sp, 32)); 4518 __ ldpd(v10, v11, Address(sp, 16)); 4519 __ ldpd(v8, v9, __ post(sp, 64)); 4520 4521 __ leave(); // required for proper stackwalking of RuntimeStub frame 4522 __ mov(r0, zr); // return 0 4523 __ ret(lr); 4524 4525 return start; 4526 } 4527 4528 // ChaCha20 block function. This version parallelizes the 32-bit 4529 // state elements on each of 16 vectors, producing 4 blocks of 4530 // keystream at a time. 4531 // 4532 // state (int[16]) = c_rarg0 4533 // keystream (byte[256]) = c_rarg1 4534 // return - number of bytes of produced keystream (always 256) 4535 // 4536 // This implementation takes each 32-bit integer from the state 4537 // array and broadcasts it across all 4 32-bit lanes of a vector register 4538 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes 4539 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors, 4540 // the quarter round schedule is implemented as outlined in RFC 7539 section 4541 // 2.3. However, instead of sequentially processing the 3 quarter round 4542 // operations represented by one QUARTERROUND function, we instead stack all 4543 // the adds, xors and left-rotations from the first 4 quarter rounds together 4544 // and then do the same for the second set of 4 quarter rounds. This removes 4545 // some latency that would otherwise be incurred by waiting for an add to 4546 // complete before performing an xor (which depends on the result of the 4547 // add), etc. An adjustment happens between the first and second groups of 4 4548 // quarter rounds, but this is done only in the inputs to the macro functions 4549 // that generate the assembly instructions - these adjustments themselves are 4550 // not part of the resulting assembly. 4551 // The 4 registers v0-v3 are used during the quarter round operations as 4552 // scratch registers. Once the 20 rounds are complete, these 4 scratch 4553 // registers become the vectors involved in adding the start state back onto 4554 // the post-QR working state. After the adds are complete, each of the 16 4555 // vectors write their first lane back to the keystream buffer, followed 4556 // by the second lane from all vectors and so on. 4557 address generate_chacha20Block_blockpar() { 4558 Label L_twoRounds, L_cc20_const; 4559 // The constant data is broken into two 128-bit segments to be loaded 4560 // onto FloatRegisters. The first 128 bits are a counter add overlay 4561 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 
4562 // The second 128-bits is a table constant used for 8-bit left rotations. 4563 __ BIND(L_cc20_const); 4564 __ emit_int64(0x0000000100000000UL); 4565 __ emit_int64(0x0000000300000002UL); 4566 __ emit_int64(0x0605040702010003UL); 4567 __ emit_int64(0x0E0D0C0F0A09080BUL); 4568 4569 __ align(CodeEntryAlignment); 4570 StubGenStubId stub_id = StubGenStubId::chacha20Block_id; 4571 StubCodeMark mark(this, stub_id); 4572 address start = __ pc(); 4573 __ enter(); 4574 4575 int i, j; 4576 const Register state = c_rarg0; 4577 const Register keystream = c_rarg1; 4578 const Register loopCtr = r10; 4579 const Register tmpAddr = r11; 4580 const FloatRegister ctrAddOverlay = v28; 4581 const FloatRegister lrot8Tbl = v29; 4582 4583 // Organize SIMD registers in an array that facilitates 4584 // putting repetitive opcodes into loop structures. It is 4585 // important that each grouping of 4 registers is monotonically 4586 // increasing to support the requirements of multi-register 4587 // instructions (e.g. ld4r, st4, etc.) 4588 const FloatRegister workSt[16] = { 4589 v4, v5, v6, v7, v16, v17, v18, v19, 4590 v20, v21, v22, v23, v24, v25, v26, v27 4591 }; 4592 4593 // Pull in constant data. The first 16 bytes are the add overlay 4594 // which is applied to the vector holding the counter (state[12]). 4595 // The second 16 bytes is the index register for the 8-bit left 4596 // rotation tbl instruction. 4597 __ adr(tmpAddr, L_cc20_const); 4598 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr)); 4599 4600 // Load from memory and interlace across 16 SIMD registers, 4601 // With each word from memory being broadcast to all lanes of 4602 // each successive SIMD register. 4603 // Addr(0) -> All lanes in workSt[i] 4604 // Addr(4) -> All lanes workSt[i + 1], etc. 4605 __ mov(tmpAddr, state); 4606 for (i = 0; i < 16; i += 4) { 4607 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4608 __ post(tmpAddr, 16)); 4609 } 4610 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4611 4612 // Before entering the loop, create 5 4-register arrays. These 4613 // will hold the 4 registers that represent the a/b/c/d fields 4614 // in the quarter round operation. For instance the "b" field 4615 // for the first 4 quarter round operations is the set of v16/v17/v18/v19, 4616 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16 4617 // since it is part of a diagonal organization. The aSet and scratch 4618 // register sets are defined at declaration time because they do not change 4619 // organization at any point during the 20-round processing. 
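    // For reference, each block of cc20_qr_* calls below applies the
    // RFC 7539 quarter round to four (a, b, c, d) register sets at once:
    //
    //   a += b;  d ^= a;  d <<<= 16;
    //   c += d;  b ^= c;  b <<<= 12;
    //   a += b;  d ^= a;  d <<<= 8;
    //   c += d;  b ^= c;  b <<<= 7;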
    FloatRegister aSet[4] = { v4, v5, v6, v7 };
    FloatRegister bSet[4];
    FloatRegister cSet[4];
    FloatRegister dSet[4];
    FloatRegister scratch[4] = { v0, v1, v2, v3 };

    // Set up the 10 iteration loop and perform all 8 quarter round ops
    __ mov(loopCtr, 10);
    __ BIND(L_twoRounds);

    // Set to columnar organization and do the following 4 quarter-rounds:
    // QUARTERROUND(0, 4, 8, 12)
    // QUARTERROUND(1, 5, 9, 13)
    // QUARTERROUND(2, 6, 10, 14)
    // QUARTERROUND(3, 7, 11, 15)
    __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
    __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
    __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7

    // Set to diagonal organization and do the next 4 quarter-rounds:
    // QUARTERROUND(0, 5, 10, 15)
    // QUARTERROUND(1, 6, 11, 12)
    // QUARTERROUND(2, 7, 8, 13)
    // QUARTERROUND(3, 4, 9, 14)
    __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
    __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
    __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7

    // Decrement and iterate
    __ sub(loopCtr, loopCtr, 1);
    __ cbnz(loopCtr, L_twoRounds);

    __ mov(tmpAddr, state);

    // Add the starting state back to the post-loop keystream
    // state. We read/interlace the state array from memory into
    // 4 registers similar to what we did in the beginning. Then
    // add the counter overlay onto workSt[12] at the end.
    for (i = 0; i < 16; i += 4) {
      __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
      __ addv(workSt[i], __ T4S, workSt[i], v0);
      __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
      __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
      __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
    }
    __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay

    // Write working state into the keystream buffer. This is accomplished
    // by taking the lane "i" from each of the four vectors and writing
    // it to consecutive 4-byte offsets, then post-incrementing by 16 and
    // repeating with the next 4 vectors until all 16 vectors have been used.
4703 // Then move to the next lane and repeat the process until all lanes have 4704 // been written. 4705 for (i = 0; i < 4; i++) { 4706 for (j = 0; j < 16; j += 4) { 4707 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4708 __ post(keystream, 16)); 4709 } 4710 } 4711 4712 __ mov(r0, 256); // Return length of output keystream 4713 __ leave(); 4714 __ ret(lr); 4715 4716 return start; 4717 } 4718 4719 // Helpers to schedule parallel operation bundles across vector 4720 // register sequences of size 2, 4 or 8. 4721 4722 // Implement various primitive computations across vector sequences 4723 4724 template<int N> 4725 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4726 const VSeq<N>& v1, const VSeq<N>& v2) { 4727 // output must not be constant 4728 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4729 // output cannot overwrite pending inputs 4730 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4731 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4732 for (int i = 0; i < N; i++) { 4733 __ addv(v[i], T, v1[i], v2[i]); 4734 } 4735 } 4736 4737 template<int N> 4738 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4739 const VSeq<N>& v1, const VSeq<N>& v2) { 4740 // output must not be constant 4741 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4742 // output cannot overwrite pending inputs 4743 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4744 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4745 for (int i = 0; i < N; i++) { 4746 __ subv(v[i], T, v1[i], v2[i]); 4747 } 4748 } 4749 4750 template<int N> 4751 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4752 const VSeq<N>& v1, const VSeq<N>& v2) { 4753 // output must not be constant 4754 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4755 // output cannot overwrite pending inputs 4756 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4757 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4758 for (int i = 0; i < N; i++) { 4759 __ mulv(v[i], T, v1[i], v2[i]); 4760 } 4761 } 4762 4763 template<int N> 4764 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) { 4765 // output must not be constant 4766 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4767 // output cannot overwrite pending inputs 4768 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4769 for (int i = 0; i < N; i++) { 4770 __ negr(v[i], T, v1[i]); 4771 } 4772 } 4773 4774 template<int N> 4775 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4776 const VSeq<N>& v1, int shift) { 4777 // output must not be constant 4778 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4779 // output cannot overwrite pending inputs 4780 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4781 for (int i = 0; i < N; i++) { 4782 __ sshr(v[i], T, v1[i], shift); 4783 } 4784 } 4785 4786 template<int N> 4787 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4788 // output must not be constant 4789 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4790 // output cannot overwrite pending inputs 4791 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4792 assert(!vs_write_before_read(v, v2), "output 
overwrites input"); 4793 for (int i = 0; i < N; i++) { 4794 __ andr(v[i], __ T16B, v1[i], v2[i]); 4795 } 4796 } 4797 4798 template<int N> 4799 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4800 // output must not be constant 4801 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4802 // output cannot overwrite pending inputs 4803 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4804 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4805 for (int i = 0; i < N; i++) { 4806 __ orr(v[i], __ T16B, v1[i], v2[i]); 4807 } 4808 } 4809 4810 template<int N> 4811 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) { 4812 // output must not be constant 4813 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4814 // output cannot overwrite pending inputs 4815 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4816 for (int i = 0; i < N; i++) { 4817 __ notr(v[i], __ T16B, v1[i]); 4818 } 4819 } 4820 4821 template<int N> 4822 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) { 4823 // output must not be constant 4824 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4825 // output cannot overwrite pending inputs 4826 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4827 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4828 for (int i = 0; i < N; i++) { 4829 __ sqdmulh(v[i], T, v1[i], v2[i]); 4830 } 4831 } 4832 4833 template<int N> 4834 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) { 4835 // output must not be constant 4836 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4837 // output cannot overwrite pending inputs 4838 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4839 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4840 for (int i = 0; i < N; i++) { 4841 __ mlsv(v[i], T, v1[i], v2[i]); 4842 } 4843 } 4844 4845 // load N/2 successive pairs of quadword values from memory in order 4846 // into N successive vector registers of the sequence via the 4847 // address supplied in base. 
  template<int N>
  void vs_ldpq(const VSeq<N>& v, Register base) {
    for (int i = 0; i < N; i += 2) {
      __ ldpq(v[i], v[i+1], Address(base, 16 * i));
    }
  }

  // load N/2 successive pairs of quadword values from memory in order
  // into N vector registers of the sequence via the address supplied
  // in base using post-increment addressing
  template<int N>
  void vs_ldpq_post(const VSeq<N>& v, Register base) {
    static_assert((N & (N - 1)) == 0, "sequence length must be even");
    for (int i = 0; i < N; i += 2) {
      __ ldpq(v[i], v[i+1], __ post(base, 32));
    }
  }

  // store N successive vector registers of the sequence into N/2
  // successive pairs of quadword memory locations via the address
  // supplied in base using post-increment addressing
  template<int N>
  void vs_stpq_post(const VSeq<N>& v, Register base) {
    static_assert((N & (N - 1)) == 0, "sequence length must be even");
    for (int i = 0; i < N; i += 2) {
      __ stpq(v[i], v[i+1], __ post(base, 32));
    }
  }

  // load N/2 pairs of quadword values from memory de-interleaved into
  // N vector registers 2 at a time via the address supplied in base
  // using post-increment addressing.
  template<int N>
  void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert((N & (N - 1)) == 0, "sequence length must be even");
    for (int i = 0; i < N; i += 2) {
      __ ld2(v[i], v[i+1], T, __ post(base, 32));
    }
  }

  // store N vector registers interleaved into N/2 pairs of quadword
  // memory locations via the address supplied in base using
  // post-increment addressing.
  template<int N>
  void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert((N & (N - 1)) == 0, "sequence length must be even");
    for (int i = 0; i < N; i += 2) {
      __ st2(v[i], v[i+1], T, __ post(base, 32));
    }
  }

  // load N quadword values from memory de-interleaved into N vector
  // registers 3 elements at a time via the address supplied in base.
  template<int N>
  void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
    for (int i = 0; i < N; i += 3) {
      __ ld3(v[i], v[i+1], v[i+2], T, base);
    }
  }

  // load N quadword values from memory de-interleaved into N vector
  // registers 3 elements at a time via the address supplied in base
  // using post-increment addressing.
  template<int N>
  void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
    for (int i = 0; i < N; i += 3) {
      __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
    }
  }

  // load N/2 pairs of quadword values from memory into N vector
  // registers via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
  // offsets array
  template<int N>
  void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
    for (int i = 0; i < N/2; i++) {
      __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
    }
  }

  // store N vector registers into N/2 pairs of quadword memory
  // locations via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
  // offsets array
  template<int N>
  void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
    for (int i = 0; i < N/2; i++) {
      __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
    }
  }

  // load N single quadword values from memory into N vector registers
  // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
  // array
  template<int N>
  void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
                      int start, int (&offsets)[N]) {
    for (int i = 0; i < N; i++) {
      __ ldr(v[i], T, Address(base, start + offsets[i]));
    }
  }

  // store N vector registers into N single quadword memory locations
  // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
  // array
  template<int N>
  void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
                      int start, int (&offsets)[N]) {
    for (int i = 0; i < N; i++) {
      __ str(v[i], T, Address(base, start + offsets[i]));
    }
  }

  // load N/2 pairs of quadword values from memory de-interleaved into
  // N vector registers 2 at a time via the address supplied in base
  // with each pair indexed using the start offset plus the
  // corresponding entry in the offsets array
  template<int N>
  void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
                      Register tmp, int start, int (&offsets)[N/2]) {
    for (int i = 0; i < N/2; i++) {
      __ add(tmp, base, start + offsets[i]);
      __ ld2(v[2*i], v[2*i+1], T, tmp);
    }
  }

  // store N vector registers 2 at a time interleaved into N/2 pairs
  // of quadword memory locations via the address supplied in base
  // with each pair indexed using the start offset plus the
  // corresponding entry in the offsets array
  template<int N>
  void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
                      Register tmp, int start, int (&offsets)[N/2]) {
    for (int i = 0; i < N/2; i++) {
      __ add(tmp, base, start + offsets[i]);
      __ st2(v[2*i], v[2*i+1], T, tmp);
    }
  }

  // Helper routines for various flavours of Montgomery multiply

  // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
  // multiplications in parallel
  //

  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
  //
  // Computes 4x4S results or 4x8H results
  //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
  // Outputs: va - 4x4S or 4x8H vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // va must have a non-zero delta i.e. it must not be a constant vseq.
  // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
  void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
                   Assembler::SIMD_Arrangement T,
                   const VSeq<4>& vtmp, const VSeq<2>& vq) {
    assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
    assert(!va.is_constant(), "output vector must identify 4 different registers");

    // schedule 4 streams of instructions across the vector sequences
    for (int i = 0; i < 4; i++) {
      __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
      __ mulv(va[i], T, vb[i], vc[i]);      // aLow = lo32(b * c)
    }

    for (int i = 0; i < 4; i++) {
      __ mulv(va[i], T, va[i], vq[0]);      // m = aLow * qinv
    }

    for (int i = 0; i < 4; i++) {
      __ sqdmulh(va[i], T, va[i], vq[1]);   // n = hi32(2 * m * q)
    }

    for (int i = 0; i < 4; i++) {
      __ shsubv(va[i], T, vtmp[i], va[i]);  // a = (aHigh - n) / 2
    }
  }

  // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
  // multiplications in parallel
  //

  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
  //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // va must have a non-zero delta i.e. it must not be a constant vseq.
  // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
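  //
  // For reference, a scalar sketch of what the four instruction
  // groups used by these helpers compute per 16-bit lane (purely
  // illustrative, not emitted code; q and qinv stand for the two
  // constants held in vq):
  //
  //   int16_t montmul16(int16_t b, int16_t c, int16_t q, int16_t qinv) {
  //     int16_t aHigh = (int16_t)(((int32_t)b * c) >> 15);  // sqdmulh: hi16(2 * b * c)
  //     int16_t aLow  = (int16_t)((int32_t)b * c);          // mulv:    lo16(b * c)
  //     int16_t m     = (int16_t)(aLow * qinv);             // mulv by vq[0]
  //     int16_t n     = (int16_t)(((int32_t)m * q) >> 15);  // sqdmulh by vq[1]
  //     return (int16_t)((aHigh - n) >> 1);                 // shsubv
  //   }
  //
  // i.e. a signed Montgomery reduction of the product b * c with
  // R = 2^16; the 32-bit case is the same with the shifts widened.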
5066 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 5067 Assembler::SIMD_Arrangement T, 5068 const VSeq<2>& vtmp, const VSeq<2>& vq) { 5069 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul"); 5070 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5071 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5072 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5073 5074 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5075 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5076 5077 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5078 5079 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5080 assert(vs_disjoint(va, vb), "va and vb overlap"); 5081 assert(vs_disjoint(va, vq), "va and vq overlap"); 5082 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5083 assert(!va.is_constant(), "output vector must identify 2 different registers"); 5084 5085 // schedule 2 streams of instructions across the vector sequences 5086 for (int i = 0; i < 2; i++) { 5087 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c) 5088 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c) 5089 } 5090 5091 for (int i = 0; i < 2; i++) { 5092 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv 5093 } 5094 5095 for (int i = 0; i < 2; i++) { 5096 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q) 5097 } 5098 5099 for (int i = 0; i < 2; i++) { 5100 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2 5101 } 5102 } 5103 5104 // Perform 16 16-bit Montgomery multiplications in parallel. 5105 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 5106 const VSeq<2>& vtmp, const VSeq<2>& vq) { 5107 // Use the helper routine to schedule a 2x8H Montgomery multiply. 5108 // It will assert that the register use is valid 5109 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq); 5110 } 5111 5112 // Perform 32 16-bit Montgomery multiplications in parallel. 5113 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 5114 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5115 // Use the helper routine to schedule a 4x8H Montgomery multiply. 5116 // It will assert that the register use is valid 5117 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq); 5118 } 5119 5120 // Perform 64 16-bit Montgomery multiplications in parallel. 5121 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 5122 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5123 // Schedule two successive 4x8H multiplies via the montmul helper 5124 // on the front and back halves of va, vb and vc. The helper will 5125 // assert that the register use has no overlap conflicts on each 5126 // individual call but we also need to ensure that the necessary 5127 // disjoint/equality constraints are met across both calls. 5128 5129 // vb, vc, vtmp and vq must be disjoint. 
va must either be 5130 // disjoint from all other registers or equal vc 5131 5132 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5133 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5134 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5135 5136 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5137 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5138 5139 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5140 5141 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5142 assert(vs_disjoint(va, vb), "va and vb overlap"); 5143 assert(vs_disjoint(va, vq), "va and vq overlap"); 5144 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5145 5146 // we multiply the front and back halves of each sequence 4 at a 5147 // time because 5148 // 5149 // 1) we are currently only able to get 4-way instruction 5150 // parallelism at best 5151 // 5152 // 2) we need registers for the constants in vq and temporary 5153 // scratch registers to hold intermediate results so vtmp can only 5154 // be a VSeq<4> which means we only have 4 scratch slots 5155 5156 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq); 5157 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq); 5158 } 5159 5160 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1, 5161 const VSeq<4>& vc, 5162 const VSeq<4>& vtmp, 5163 const VSeq<2>& vq) { 5164 // compute a = montmul(a1, c) 5165 kyber_montmul32(vc, va1, vc, vtmp, vq); 5166 // ouptut a1 = a0 - a 5167 vs_subv(va1, __ T8H, va0, vc); 5168 // and a0 = a0 + a 5169 vs_addv(va0, __ T8H, va0, vc); 5170 } 5171 5172 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1, 5173 const VSeq<4>& vb, 5174 const VSeq<4>& vtmp1, 5175 const VSeq<4>& vtmp2, 5176 const VSeq<2>& vq) { 5177 // compute c = a0 - a1 5178 vs_subv(vtmp1, __ T8H, va0, va1); 5179 // output a0 = a0 + a1 5180 vs_addv(va0, __ T8H, va0, va1); 5181 // output a1 = b montmul c 5182 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq); 5183 } 5184 5185 void load64shorts(const VSeq<8>& v, Register shorts) { 5186 vs_ldpq_post(v, shorts); 5187 } 5188 5189 void load32shorts(const VSeq<4>& v, Register shorts) { 5190 vs_ldpq_post(v, shorts); 5191 } 5192 5193 void store64shorts(VSeq<8> v, Register tmpAddr) { 5194 vs_stpq_post(v, tmpAddr); 5195 } 5196 5197 // Kyber NTT function. 5198 // Implements 5199 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {} 5200 // 5201 // coeffs (short[256]) = c_rarg0 5202 // ntt_zetas (short[256]) = c_rarg1 5203 address generate_kyberNtt() { 5204 5205 __ align(CodeEntryAlignment); 5206 StubGenStubId stub_id = StubGenStubId::kyberNtt_id; 5207 StubCodeMark mark(this, stub_id); 5208 address start = __ pc(); 5209 __ enter(); 5210 5211 const Register coeffs = c_rarg0; 5212 const Register zetas = c_rarg1; 5213 5214 const Register kyberConsts = r10; 5215 const Register tmpAddr = r11; 5216 5217 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5218 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5219 VSeq<2> vq(30); // n.b. constants overlap vs3 5220 5221 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5222 // load the montmul constants 5223 vs_ldpq(vq, kyberConsts); 5224 5225 // Each level corresponds to an iteration of the outermost loop of the 5226 // Java method seilerNTT(int[] coeffs). There are some differences 5227 // from what is done in the seilerNTT() method, though: 5228 // 1. 
The computation is using 16-bit signed values, we do not convert them 5229 // to ints here. 5230 // 2. The zetas are delivered in a bigger array, 128 zetas are stored in 5231 // this array for each level, it is easier that way to fill up the vector 5232 // registers. 5233 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery 5234 // multiplications (this is because that way there should not be any 5235 // overflow during the inverse NTT computation), here we usr R = 2^16 so 5236 // that we can use the 16-bit arithmetic in the vector unit. 5237 // 5238 // On each level, we fill up the vector registers in such a way that the 5239 // array elements that need to be multiplied by the zetas go into one 5240 // set of vector registers while the corresponding ones that don't need to 5241 // be multiplied, go into another set. 5242 // We can do 32 Montgomery multiplications in parallel, using 12 vector 5243 // registers interleaving the steps of 4 identical computations, 5244 // each done on 8 16-bit values per register. 5245 5246 // At levels 0-3 the coefficients multiplied by or added/subtracted 5247 // to the zetas occur in discrete blocks whose size is some multiple 5248 // of 32. 5249 5250 // level 0 5251 __ add(tmpAddr, coeffs, 256); 5252 load64shorts(vs1, tmpAddr); 5253 load64shorts(vs2, zetas); 5254 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5255 __ add(tmpAddr, coeffs, 0); 5256 load64shorts(vs1, tmpAddr); 5257 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5258 vs_addv(vs1, __ T8H, vs1, vs2); 5259 __ add(tmpAddr, coeffs, 0); 5260 vs_stpq_post(vs1, tmpAddr); 5261 __ add(tmpAddr, coeffs, 256); 5262 vs_stpq_post(vs3, tmpAddr); 5263 // restore montmul constants 5264 vs_ldpq(vq, kyberConsts); 5265 load64shorts(vs1, tmpAddr); 5266 load64shorts(vs2, zetas); 5267 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5268 __ add(tmpAddr, coeffs, 128); 5269 load64shorts(vs1, tmpAddr); 5270 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5271 vs_addv(vs1, __ T8H, vs1, vs2); 5272 __ add(tmpAddr, coeffs, 128); 5273 store64shorts(vs1, tmpAddr); 5274 __ add(tmpAddr, coeffs, 384); 5275 store64shorts(vs3, tmpAddr); 5276 5277 // level 1 5278 // restore montmul constants 5279 vs_ldpq(vq, kyberConsts); 5280 __ add(tmpAddr, coeffs, 128); 5281 load64shorts(vs1, tmpAddr); 5282 load64shorts(vs2, zetas); 5283 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5284 __ add(tmpAddr, coeffs, 0); 5285 load64shorts(vs1, tmpAddr); 5286 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5287 vs_addv(vs1, __ T8H, vs1, vs2); 5288 __ add(tmpAddr, coeffs, 0); 5289 store64shorts(vs1, tmpAddr); 5290 store64shorts(vs3, tmpAddr); 5291 vs_ldpq(vq, kyberConsts); 5292 __ add(tmpAddr, coeffs, 384); 5293 load64shorts(vs1, tmpAddr); 5294 load64shorts(vs2, zetas); 5295 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5296 __ add(tmpAddr, coeffs, 256); 5297 load64shorts(vs1, tmpAddr); 5298 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5299 vs_addv(vs1, __ T8H, vs1, vs2); 5300 __ add(tmpAddr, coeffs, 256); 5301 store64shorts(vs1, tmpAddr); 5302 store64shorts(vs3, tmpAddr); 5303 5304 // level 2 5305 vs_ldpq(vq, kyberConsts); 5306 int offsets1[4] = { 0, 32, 128, 160 }; 5307 vs_ldpq_indexed(vs1, coeffs, 64, offsets1); 5308 load64shorts(vs2, zetas); 5309 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5310 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5311 // kyber_subv_addv64(); 5312 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5313 vs_addv(vs1, __ T8H, vs1, vs2); 5314 __ add(tmpAddr, coeffs, 0); 5315 vs_stpq_post(vs_front(vs1), tmpAddr); 5316 vs_stpq_post(vs_front(vs3), tmpAddr); 5317 vs_stpq_post(vs_back(vs1), tmpAddr); 5318 vs_stpq_post(vs_back(vs3), tmpAddr); 5319 vs_ldpq(vq, kyberConsts); 5320 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1); 5321 load64shorts(vs2, zetas); 5322 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5323 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5324 // kyber_subv_addv64(); 5325 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5326 vs_addv(vs1, __ T8H, vs1, vs2); 5327 __ add(tmpAddr, coeffs, 256); 5328 vs_stpq_post(vs_front(vs1), tmpAddr); 5329 vs_stpq_post(vs_front(vs3), tmpAddr); 5330 vs_stpq_post(vs_back(vs1), tmpAddr); 5331 vs_stpq_post(vs_back(vs3), tmpAddr); 5332 5333 // level 3 5334 vs_ldpq(vq, kyberConsts); 5335 int offsets2[4] = { 0, 64, 128, 192 }; 5336 vs_ldpq_indexed(vs1, coeffs, 32, offsets2); 5337 load64shorts(vs2, zetas); 5338 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5339 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5340 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5341 vs_addv(vs1, __ T8H, vs1, vs2); 5342 vs_stpq_indexed(vs1, coeffs, 0, offsets2); 5343 vs_stpq_indexed(vs3, coeffs, 32, offsets2); 5344 5345 vs_ldpq(vq, kyberConsts); 5346 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2); 5347 load64shorts(vs2, zetas); 5348 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5349 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5350 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5351 vs_addv(vs1, __ T8H, vs1, vs2); 5352 vs_stpq_indexed(vs1, coeffs, 256, offsets2); 5353 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2); 5354 5355 // level 4 5356 // At level 4 coefficients occur in 8 discrete blocks of size 16 5357 // so they are loaded using employing an ldr at 8 distinct offsets. 5358 5359 vs_ldpq(vq, kyberConsts); 5360 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5361 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3); 5362 load64shorts(vs2, zetas); 5363 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5364 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5365 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5366 vs_addv(vs1, __ T8H, vs1, vs2); 5367 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5368 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3); 5369 5370 vs_ldpq(vq, kyberConsts); 5371 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3); 5372 load64shorts(vs2, zetas); 5373 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5374 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5375 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5376 vs_addv(vs1, __ T8H, vs1, vs2); 5377 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5378 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3); 5379 5380 // level 5 5381 // At level 5 related coefficients occur in discrete blocks of size 8 so 5382 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 
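    // To make the ld2 layout concrete (a sketch only, nothing extra
    // is emitted here): a single
    //   __ ld2(v0, v1, __ T2D, addr);
    // reads 32 bytes holding four adjacent 4-short blocks B0 B1 B2 B3
    // and de-interleaves them at doubleword granularity, leaving
    //   v0 = <B0, B2> and v1 = <B1, B3>
    // so the odd-numbered blocks, which are the ones that need the
    // zeta multiply at this level, all land in the second register of
    // each pair i.e. in vs_odd(vs1).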
5383 5384 vs_ldpq(vq, kyberConsts); 5385 int offsets4[4] = { 0, 32, 64, 96 }; 5386 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5387 load32shorts(vs_front(vs2), zetas); 5388 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5389 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5390 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5391 load32shorts(vs_front(vs2), zetas); 5392 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5393 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5394 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5395 load32shorts(vs_front(vs2), zetas); 5396 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5397 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5398 5399 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5400 load32shorts(vs_front(vs2), zetas); 5401 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5402 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5403 5404 // level 6 5405 // At level 6 related coefficients occur in discrete blocks of size 4 so 5406 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5407 5408 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5409 load32shorts(vs_front(vs2), zetas); 5410 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5411 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5412 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5413 // __ ldpq(v18, v19, __ post(zetas, 32)); 5414 load32shorts(vs_front(vs2), zetas); 5415 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5416 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5417 5418 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5419 load32shorts(vs_front(vs2), zetas); 5420 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5421 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5422 5423 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5424 load32shorts(vs_front(vs2), zetas); 5425 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5426 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5427 5428 __ leave(); // required for proper stackwalking of RuntimeStub frame 5429 __ mov(r0, zr); // return 0 5430 __ ret(lr); 5431 5432 return start; 5433 } 5434 5435 // Kyber Inverse NTT function 5436 // Implements 5437 // static int implKyberInverseNtt(short[] poly, short[] zetas) {} 5438 // 5439 // coeffs (short[256]) = c_rarg0 5440 // ntt_zetas (short[256]) = c_rarg1 5441 address generate_kyberInverseNtt() { 5442 5443 __ align(CodeEntryAlignment); 5444 StubGenStubId stub_id = StubGenStubId::kyberInverseNtt_id; 5445 StubCodeMark mark(this, stub_id); 5446 address start = __ pc(); 5447 __ enter(); 5448 5449 const Register coeffs = c_rarg0; 5450 const Register zetas = c_rarg1; 5451 5452 const Register kyberConsts = r10; 5453 const Register tmpAddr = r11; 5454 const Register tmpAddr2 = c_rarg2; 5455 5456 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5457 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5458 VSeq<2> vq(30); // n.b. 
constants overlap vs3 5459 5460 __ lea(kyberConsts, 5461 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5462 5463 // level 0 5464 // At level 0 related coefficients occur in discrete blocks of size 4 so 5465 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5466 5467 vs_ldpq(vq, kyberConsts); 5468 int offsets4[4] = { 0, 32, 64, 96 }; 5469 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5470 load32shorts(vs_front(vs2), zetas); 5471 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5472 vs_front(vs2), vs_back(vs2), vtmp, vq); 5473 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5474 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5475 load32shorts(vs_front(vs2), zetas); 5476 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5477 vs_front(vs2), vs_back(vs2), vtmp, vq); 5478 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5479 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5480 load32shorts(vs_front(vs2), zetas); 5481 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5482 vs_front(vs2), vs_back(vs2), vtmp, vq); 5483 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5484 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5485 load32shorts(vs_front(vs2), zetas); 5486 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5487 vs_front(vs2), vs_back(vs2), vtmp, vq); 5488 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5489 5490 // level 1 5491 // At level 1 related coefficients occur in discrete blocks of size 8 so 5492 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 5493 5494 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5495 load32shorts(vs_front(vs2), zetas); 5496 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5497 vs_front(vs2), vs_back(vs2), vtmp, vq); 5498 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5499 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5500 load32shorts(vs_front(vs2), zetas); 5501 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5502 vs_front(vs2), vs_back(vs2), vtmp, vq); 5503 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5504 5505 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5506 load32shorts(vs_front(vs2), zetas); 5507 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5508 vs_front(vs2), vs_back(vs2), vtmp, vq); 5509 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5510 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5511 load32shorts(vs_front(vs2), zetas); 5512 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5513 vs_front(vs2), vs_back(vs2), vtmp, vq); 5514 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5515 5516 // level 2 5517 // At level 2 coefficients occur in 8 discrete blocks of size 16 5518 // so they are loaded using employing an ldr at 8 distinct offsets. 5519 5520 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5521 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5522 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3); 5523 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5524 vs_subv(vs1, __ T8H, vs1, vs2); 5525 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3); 5526 load64shorts(vs2, zetas); 5527 vs_ldpq(vq, kyberConsts); 5528 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5529 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3); 5530 5531 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5532 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5533 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5534 vs_subv(vs1, __ T8H, vs1, vs2); 5535 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3); 5536 load64shorts(vs2, zetas); 5537 vs_ldpq(vq, kyberConsts); 5538 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5539 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5540 5541 // Barrett reduction at indexes where overflow may happen 5542 5543 // load q and the multiplier for the Barrett reduction 5544 __ add(tmpAddr, kyberConsts, 16); 5545 vs_ldpq(vq, tmpAddr); 5546 5547 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences 5548 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants 5549 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul 5550 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5551 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5552 vs_sshr(vs2, __ T8H, vs2, 11); 5553 vs_mlsv(vs1, __ T8H, vs2, vq1); 5554 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5555 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5556 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5557 vs_sshr(vs2, __ T8H, vs2, 11); 5558 vs_mlsv(vs1, __ T8H, vs2, vq1); 5559 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5560 5561 // level 3 5562 // From level 3 upwards coefficients occur in discrete blocks whose size is 5563 // some multiple of 32 so can be loaded using ldpq and suitable indexes. 5564 5565 int offsets2[4] = { 0, 64, 128, 192 }; 5566 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5567 vs_ldpq_indexed(vs2, coeffs, 32, offsets2); 5568 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5569 vs_subv(vs1, __ T8H, vs1, vs2); 5570 vs_stpq_indexed(vs3, coeffs, 0, offsets2); 5571 load64shorts(vs2, zetas); 5572 vs_ldpq(vq, kyberConsts); 5573 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5574 vs_stpq_indexed(vs2, coeffs, 32, offsets2); 5575 5576 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5577 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5578 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5579 vs_subv(vs1, __ T8H, vs1, vs2); 5580 vs_stpq_indexed(vs3, coeffs, 256, offsets2); 5581 load64shorts(vs2, zetas); 5582 vs_ldpq(vq, kyberConsts); 5583 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5584 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5585 5586 // level 4 5587 5588 int offsets1[4] = { 0, 32, 128, 160 }; 5589 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5590 vs_ldpq_indexed(vs2, coeffs, 64, offsets1); 5591 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5592 vs_subv(vs1, __ T8H, vs1, vs2); 5593 vs_stpq_indexed(vs3, coeffs, 0, offsets1); 5594 load64shorts(vs2, zetas); 5595 vs_ldpq(vq, kyberConsts); 5596 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5597 vs_stpq_indexed(vs2, coeffs, 64, offsets1); 5598 5599 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5600 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5601 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5602 vs_subv(vs1, __ T8H, vs1, vs2); 5603 vs_stpq_indexed(vs3, coeffs, 256, offsets1); 5604 load64shorts(vs2, zetas); 5605 vs_ldpq(vq, kyberConsts); 5606 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5607 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5608 5609 // level 5 5610 5611 __ add(tmpAddr, coeffs, 0); 5612 load64shorts(vs1, tmpAddr); 5613 __ add(tmpAddr, coeffs, 128); 5614 load64shorts(vs2, tmpAddr); 5615 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5616 vs_subv(vs1, __ T8H, vs1, vs2); 5617 __ add(tmpAddr, coeffs, 0); 5618 store64shorts(vs3, tmpAddr); 5619 load64shorts(vs2, zetas); 5620 vs_ldpq(vq, kyberConsts); 5621 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5622 __ add(tmpAddr, coeffs, 128); 5623 store64shorts(vs2, tmpAddr); 5624 5625 load64shorts(vs1, tmpAddr); 5626 __ add(tmpAddr, coeffs, 384); 5627 load64shorts(vs2, tmpAddr); 5628 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5629 vs_subv(vs1, __ T8H, vs1, vs2); 5630 __ add(tmpAddr, coeffs, 256); 5631 store64shorts(vs3, tmpAddr); 5632 load64shorts(vs2, zetas); 5633 vs_ldpq(vq, kyberConsts); 5634 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5635 __ add(tmpAddr, coeffs, 384); 5636 store64shorts(vs2, tmpAddr); 5637 5638 // Barrett reduction at indexes where overflow may happen 5639 5640 // load q and the multiplier for the Barrett reduction 5641 __ add(tmpAddr, kyberConsts, 16); 5642 vs_ldpq(vq, tmpAddr); 5643 5644 int offsets0[2] = { 0, 256 }; 5645 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5646 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5647 vs_sshr(vs2, __ T8H, vs2, 11); 5648 vs_mlsv(vs1, __ T8H, vs2, vq1); 5649 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5650 5651 // level 6 5652 5653 __ add(tmpAddr, coeffs, 0); 5654 load64shorts(vs1, tmpAddr); 5655 __ add(tmpAddr, coeffs, 256); 5656 load64shorts(vs2, tmpAddr); 5657 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5658 vs_subv(vs1, __ T8H, vs1, vs2); 5659 __ add(tmpAddr, coeffs, 0); 5660 store64shorts(vs3, tmpAddr); 5661 load64shorts(vs2, zetas); 5662 vs_ldpq(vq, kyberConsts); 5663 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5664 __ add(tmpAddr, coeffs, 256); 5665 store64shorts(vs2, tmpAddr); 5666 5667 __ add(tmpAddr, coeffs, 128); 5668 load64shorts(vs1, tmpAddr); 5669 __ add(tmpAddr, coeffs, 384); 5670 load64shorts(vs2, tmpAddr); 5671 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5672 vs_subv(vs1, __ T8H, vs1, vs2); 5673 __ add(tmpAddr, coeffs, 128); 5674 store64shorts(vs3, tmpAddr); 5675 load64shorts(vs2, zetas); 5676 vs_ldpq(vq, kyberConsts); 5677 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5678 __ add(tmpAddr, coeffs, 384); 5679 store64shorts(vs2, tmpAddr); 5680 5681 // multiply by 2^-n 5682 5683 // load toMont(2^-n mod q) 5684 __ add(tmpAddr, kyberConsts, 48); 5685 __ ldr(v29, __ Q, tmpAddr); 5686 5687 vs_ldpq(vq, kyberConsts); 5688 __ add(tmpAddr, coeffs, 0); 5689 load64shorts(vs1, tmpAddr); 5690 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5691 __ add(tmpAddr, coeffs, 0); 5692 store64shorts(vs2, tmpAddr); 5693 5694 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so 5695 load64shorts(vs1, tmpAddr); 5696 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5697 __ add(tmpAddr, coeffs, 128); 5698 store64shorts(vs2, tmpAddr); 5699 5700 // now tmpAddr contains coeffs + 256 5701 load64shorts(vs1, tmpAddr); 5702 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5703 __ add(tmpAddr, coeffs, 256); 5704 store64shorts(vs2, tmpAddr); 5705 5706 // now tmpAddr contains coeffs + 384 5707 load64shorts(vs1, tmpAddr); 5708 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5709 __ add(tmpAddr, coeffs, 384); 5710 store64shorts(vs2, tmpAddr); 5711 5712 __ leave(); // required for proper stackwalking of RuntimeStub frame 5713 __ mov(r0, zr); // return 0 5714 __ ret(lr); 5715 5716 return start; 5717 } 5718 5719 // Kyber multiply polynomials in the NTT domain. 5720 // Implements 5721 // static int implKyberNttMult( 5722 // short[] result, short[] ntta, short[] nttb, short[] zetas) {} 5723 // 5724 // result (short[256]) = c_rarg0 5725 // ntta (short[256]) = c_rarg1 5726 // nttb (short[256]) = c_rarg2 5727 // zetas (short[128]) = c_rarg3 5728 address generate_kyberNttMult() { 5729 5730 __ align(CodeEntryAlignment); 5731 StubGenStubId stub_id = StubGenStubId::kyberNttMult_id; 5732 StubCodeMark mark(this, stub_id); 5733 address start = __ pc(); 5734 __ enter(); 5735 5736 const Register result = c_rarg0; 5737 const Register ntta = c_rarg1; 5738 const Register nttb = c_rarg2; 5739 const Register zetas = c_rarg3; 5740 5741 const Register kyberConsts = r10; 5742 const Register limit = r11; 5743 5744 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps 5745 VSeq<4> vs3(16), vs4(20); 5746 VSeq<2> vq(30); // pair of constants for montmul: q, qinv 5747 VSeq<2> vz(28); // pair of zetas 5748 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ 5749 5750 __ lea(kyberConsts, 5751 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5752 5753 Label kyberNttMult_loop; 5754 5755 __ add(limit, result, 512); 5756 5757 // load q and qinv 5758 vs_ldpq(vq, kyberConsts); 5759 5760 // load R^2 mod q (to convert back from Montgomery representation) 5761 __ add(kyberConsts, kyberConsts, 64); 5762 __ ldr(v27, __ Q, kyberConsts); 5763 5764 __ BIND(kyberNttMult_loop); 5765 5766 // load 16 zetas 5767 vs_ldpq_post(vz, zetas); 5768 5769 // load 2 sets of 32 coefficients from the two input arrays 5770 // interleaved as shorts. i.e. pairs of shorts adjacent in memory 5771 // are striped across pairs of vector registers 5772 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H 5773 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H 5774 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H 5775 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H 5776 5777 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1) 5778 // i.e. 
montmul the first and second halves of vs1 in order and 5779 // then with one sequence reversed storing the two results in vs3 5780 // 5781 // vs3[0] <- montmul(a0, b0) 5782 // vs3[1] <- montmul(a1, b1) 5783 // vs3[2] <- montmul(a0, b1) 5784 // vs3[3] <- montmul(a1, b0) 5785 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq); 5786 kyber_montmul16(vs_back(vs3), 5787 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq); 5788 5789 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3) 5790 // i.e. montmul the first and second halves of vs4 in order and 5791 // then with one sequence reversed storing the two results in vs1 5792 // 5793 // vs1[0] <- montmul(a2, b2) 5794 // vs1[1] <- montmul(a3, b3) 5795 // vs1[2] <- montmul(a2, b3) 5796 // vs1[3] <- montmul(a3, b2) 5797 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq); 5798 kyber_montmul16(vs_back(vs1), 5799 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq); 5800 5801 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta. 5802 // We can schedule two montmuls at a time if we use a suitable vector 5803 // sequence <vs3[1], vs1[1]>. 5804 int delta = vs1[1]->encoding() - vs3[1]->encoding(); 5805 VSeq<2> vs5(vs3[1], delta); 5806 5807 // vs3[1] <- montmul(montmul(a1, b1), z0) 5808 // vs1[1] <- montmul(montmul(a3, b3), z1) 5809 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq); 5810 5811 // add results in pairs storing in vs3 5812 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0); 5813 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0); 5814 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3)); 5815 5816 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1); 5817 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2); 5818 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1)); 5819 5820 // vs1 <- montmul(vs3, montRSquareModQ) 5821 kyber_montmul32(vs1, vs3, vc, vs2, vq); 5822 5823 // store back the two pairs of result vectors de-interleaved as 8H elements 5824 // i.e. storing each pairs of shorts striped across a register pair adjacent 5825 // in memory 5826 vs_st2_post(vs1, __ T8H, result); 5827 5828 __ cmp(result, limit); 5829 __ br(Assembler::NE, kyberNttMult_loop); 5830 5831 __ leave(); // required for proper stackwalking of RuntimeStub frame 5832 __ mov(r0, zr); // return 0 5833 __ ret(lr); 5834 5835 return start; 5836 } 5837 5838 // Kyber add 2 polynomials. 5839 // Implements 5840 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {} 5841 // 5842 // result (short[256]) = c_rarg0 5843 // a (short[256]) = c_rarg1 5844 // b (short[256]) = c_rarg2 5845 address generate_kyberAddPoly_2() { 5846 5847 __ align(CodeEntryAlignment); 5848 StubGenStubId stub_id = StubGenStubId::kyberAddPoly_2_id; 5849 StubCodeMark mark(this, stub_id); 5850 address start = __ pc(); 5851 __ enter(); 5852 5853 const Register result = c_rarg0; 5854 const Register a = c_rarg1; 5855 const Register b = c_rarg2; 5856 5857 const Register kyberConsts = r11; 5858 5859 // We sum 256 sets of values in total i.e. 32 x 8H quadwords. 5860 // So, we can load, add and store the data in 3 groups of 11, 5861 // 11 and 10 at a time i.e. we need to map sets of 10 or 11 5862 // registers. A further constraint is that the mapping needs 5863 // to skip callee saves. So, we allocate the register 5864 // sequences using two 8 sequences, two 2 sequences and two 5865 // single registers. 
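    // The allocation below gives each pass of the loop 11 quadwords
    // (10 on the final pass) from each input (a sketch of the
    // register map, for orientation only):
    //   a -> v0-v7, v16-v17 and v28
    //   b -> v18-v25, v26-v27 and v29
    //   q -> v31, used via the delta-0 constant sequences
    // and the scalar effect per element is simply (illustrative
    // only): result[i] = (short)(a[i] + b[i] + q).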
5866 VSeq<8> vs1_1(0); 5867 VSeq<2> vs1_2(16); 5868 FloatRegister vs1_3 = v28; 5869 VSeq<8> vs2_1(18); 5870 VSeq<2> vs2_2(26); 5871 FloatRegister vs2_3 = v29; 5872 5873 // two constant vector sequences 5874 VSeq<8> vc_1(31, 0); 5875 VSeq<2> vc_2(31, 0); 5876 5877 FloatRegister vc_3 = v31; 5878 __ lea(kyberConsts, 5879 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5880 5881 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5882 for (int i = 0; i < 3; i++) { 5883 // load 80 or 88 values from a into vs1_1/2/3 5884 vs_ldpq_post(vs1_1, a); 5885 vs_ldpq_post(vs1_2, a); 5886 if (i < 2) { 5887 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5888 } 5889 // load 80 or 88 values from b into vs2_1/2/3 5890 vs_ldpq_post(vs2_1, b); 5891 vs_ldpq_post(vs2_2, b); 5892 if (i < 2) { 5893 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5894 } 5895 // sum 80 or 88 values across vs1 and vs2 into vs1 5896 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5897 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5898 if (i < 2) { 5899 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5900 } 5901 // add constant to all 80 or 88 results 5902 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 5903 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 5904 if (i < 2) { 5905 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 5906 } 5907 // store 80 or 88 values 5908 vs_stpq_post(vs1_1, result); 5909 vs_stpq_post(vs1_2, result); 5910 if (i < 2) { 5911 __ str(vs1_3, __ Q, __ post(result, 16)); 5912 } 5913 } 5914 5915 __ leave(); // required for proper stackwalking of RuntimeStub frame 5916 __ mov(r0, zr); // return 0 5917 __ ret(lr); 5918 5919 return start; 5920 } 5921 5922 // Kyber add 3 polynomials. 5923 // Implements 5924 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {} 5925 // 5926 // result (short[256]) = c_rarg0 5927 // a (short[256]) = c_rarg1 5928 // b (short[256]) = c_rarg2 5929 // c (short[256]) = c_rarg3 5930 address generate_kyberAddPoly_3() { 5931 5932 __ align(CodeEntryAlignment); 5933 StubGenStubId stub_id = StubGenStubId::kyberAddPoly_3_id; 5934 StubCodeMark mark(this, stub_id); 5935 address start = __ pc(); 5936 __ enter(); 5937 5938 const Register result = c_rarg0; 5939 const Register a = c_rarg1; 5940 const Register b = c_rarg2; 5941 const Register c = c_rarg3; 5942 5943 const Register kyberConsts = r11; 5944 5945 // As above we sum 256 sets of values in total i.e. 32 x 8H 5946 // quadwords. So, we can load, add and store the data in 3 5947 // groups of 11, 11 and 10 at a time i.e. we need to map sets 5948 // of 10 or 11 registers. A further constraint is that the 5949 // mapping needs to skip callee saves. So, we allocate the 5950 // register sequences using two 8 sequences, two 2 sequences 5951 // and two single registers. 
5952 VSeq<8> vs1_1(0); 5953 VSeq<2> vs1_2(16); 5954 FloatRegister vs1_3 = v28; 5955 VSeq<8> vs2_1(18); 5956 VSeq<2> vs2_2(26); 5957 FloatRegister vs2_3 = v29; 5958 5959 // two constant vector sequences 5960 VSeq<8> vc_1(31, 0); 5961 VSeq<2> vc_2(31, 0); 5962 5963 FloatRegister vc_3 = v31; 5964 5965 __ lea(kyberConsts, 5966 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5967 5968 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5969 for (int i = 0; i < 3; i++) { 5970 // load 80 or 88 values from a into vs1_1/2/3 5971 vs_ldpq_post(vs1_1, a); 5972 vs_ldpq_post(vs1_2, a); 5973 if (i < 2) { 5974 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5975 } 5976 // load 80 or 88 values from b into vs2_1/2/3 5977 vs_ldpq_post(vs2_1, b); 5978 vs_ldpq_post(vs2_2, b); 5979 if (i < 2) { 5980 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5981 } 5982 // sum 80 or 88 values across vs1 and vs2 into vs1 5983 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5984 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5985 if (i < 2) { 5986 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5987 } 5988 // load 80 or 88 values from c into vs2_1/2/3 5989 vs_ldpq_post(vs2_1, c); 5990 vs_ldpq_post(vs2_2, c); 5991 if (i < 2) { 5992 __ ldr(vs2_3, __ Q, __ post(c, 16)); 5993 } 5994 // sum 80 or 88 values across vs1 and vs2 into vs1 5995 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5996 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5997 if (i < 2) { 5998 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5999 } 6000 // add constant to all 80 or 88 results 6001 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 6002 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 6003 if (i < 2) { 6004 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 6005 } 6006 // store 80 or 88 values 6007 vs_stpq_post(vs1_1, result); 6008 vs_stpq_post(vs1_2, result); 6009 if (i < 2) { 6010 __ str(vs1_3, __ Q, __ post(result, 16)); 6011 } 6012 } 6013 6014 __ leave(); // required for proper stackwalking of RuntimeStub frame 6015 __ mov(r0, zr); // return 0 6016 __ ret(lr); 6017 6018 return start; 6019 } 6020 6021 // Kyber parse XOF output to polynomial coefficient candidates 6022 // or decodePoly(12, ...). 6023 // Implements 6024 // static int implKyber12To16( 6025 // byte[] condensed, int index, short[] parsed, int parsedLength) {} 6026 // 6027 // (parsedLength or (parsedLength - 48) must be divisible by 64.) 6028 // 6029 // condensed (byte[]) = c_rarg0 6030 // condensedIndex = c_rarg1 6031 // parsed (short[112 or 256]) = c_rarg2 6032 // parsedLength (112 or 256) = c_rarg3 6033 address generate_kyber12To16() { 6034 Label L_F00, L_loop, L_end; 6035 6036 __ BIND(L_F00); 6037 __ emit_int64(0x0f000f000f000f00); 6038 __ emit_int64(0x0f000f000f000f00); 6039 6040 __ align(CodeEntryAlignment); 6041 StubGenStubId stub_id = StubGenStubId::kyber12To16_id; 6042 StubCodeMark mark(this, stub_id); 6043 address start = __ pc(); 6044 __ enter(); 6045 6046 const Register condensed = c_rarg0; 6047 const Register condensedOffs = c_rarg1; 6048 const Register parsed = c_rarg2; 6049 const Register parsedLength = c_rarg3; 6050 6051 const Register tmpAddr = r11; 6052 6053 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B 6054 // quadwords so we need a 6 vector sequence for the inputs. 6055 // Parsing produces 64 shorts, employing two 8 vector 6056 // sequences to store and combine the intermediate data. 
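    // For each 3-byte group (b0, b1, b2) of the condensed input the
    // scalar equivalent of the unpacking performed below is
    // (illustrative only, not emitted code):
    //   s0 = (short)(b0 | ((b1 & 0x0f) << 8));
    //   s1 = (short)((b1 >> 4) | (b2 << 4));
    // The vector code does this for 16 groups at a time by widening
    // the three byte stripes to 8H lanes and recombining them with
    // shl, ushr, andr and addv.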
    VSeq<6> vin(24);
    VSeq<8> va(0), vb(16);

    __ adr(tmpAddr, L_F00);
    __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
    __ add(condensed, condensed, condensedOffs);

    __ BIND(L_loop);
    // load 96 (6 x 16B) byte values
    vs_ld3_post(vin, __ T16B, condensed);

    // The front half of sequence vin (vin[0], vin[1] and vin[2])
    // holds 48 (16x3) contiguous bytes from memory striped
    // horizontally across each of the 16 byte lanes. Equivalently,
    // that is 16 pairs of 12-bit integers. Likewise the back half
    // holds the next 48 bytes in the same arrangement.

    // Each vector in the front half can also be viewed as a vertical
    // strip across the 16 pairs of 12 bit integers. Each byte in
    // vin[0] stores the low 8 bits of the first int in a pair. Each
    // byte in vin[1] stores the high 4 bits of the first int and the
    // low 4 bits of the second int. Each byte in vin[2] stores the
    // high 8 bits of the second int. Likewise the vectors in second
    // half.

    // Converting the data to 16-bit shorts requires first of all
    // expanding each of the 6 x 16B vectors into 6 corresponding
    // pairs of 8H vectors. Mask, shift and add operations on the
    // resulting vector pairs can be used to combine 4 and 8 bit
    // parts of related 8H vector elements.
    //
    // The middle vectors (vin[2] and vin[5]) are actually expanded
    // twice, one copy manipulated to provide the lower 4 bits
    // belonging to the first short in a pair and another copy
    // manipulated to provide the higher 4 bits belonging to the
    // second short in a pair. This is why the vector sequences va
    // and vb used to hold the expanded 8H elements are of length 8.

    // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
    // n.b. target elements 2 and 3 duplicate elements 4 and 5
    __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
    __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
    __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
    __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
    __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
    __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);

    // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
    // and vb[4:5]
    __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
    __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
    __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
    __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
    __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
    __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);

    // shift lo byte of copy 1 of the middle stripe into the high byte
    __ shl(va[2], __ T8H, va[2], 8);
    __ shl(va[3], __ T8H, va[3], 8);
    __ shl(vb[2], __ T8H, vb[2], 8);
    __ shl(vb[3], __ T8H, vb[3], 8);

    // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
    // time pre-shifted by 4 to ensure top bits of input 12-bit int
    // are in bit positions [4..11].
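    // (The immediate 4 on these ushll/ushll2 calls performs the
    // widen-and-shift in one go: each byte lane b2 becomes the 16-bit
    // value b2 << 4, i.e. the (b2 << 4) term of the scalar sketch
    // given above.)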
6122 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6123 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6124 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6125 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4); 6126 6127 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and 6128 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of 6129 // copy2 6130 __ andr(va[2], __ T16B, va[2], v31); 6131 __ andr(va[3], __ T16B, va[3], v31); 6132 __ ushr(va[4], __ T8H, va[4], 4); 6133 __ ushr(va[5], __ T8H, va[5], 4); 6134 __ andr(vb[2], __ T16B, vb[2], v31); 6135 __ andr(vb[3], __ T16B, vb[3], v31); 6136 __ ushr(vb[4], __ T8H, vb[4], 4); 6137 __ ushr(vb[5], __ T8H, vb[5], 4); 6138 6139 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and 6140 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair 6141 // n.b. the ordering ensures: i) inputs are consumed before they 6142 // are overwritten ii) the order of 16-bit results across successive 6143 // pairs of vectors in va and then vb reflects the order of the 6144 // corresponding 12-bit inputs 6145 __ addv(va[0], __ T8H, va[0], va[2]); 6146 __ addv(va[2], __ T8H, va[1], va[3]); 6147 __ addv(va[1], __ T8H, va[4], va[6]); 6148 __ addv(va[3], __ T8H, va[5], va[7]); 6149 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6150 __ addv(vb[2], __ T8H, vb[1], vb[3]); 6151 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6152 __ addv(vb[3], __ T8H, vb[5], vb[7]); 6153 6154 // store 64 results interleaved as shorts 6155 vs_st2_post(vs_front(va), __ T8H, parsed); 6156 vs_st2_post(vs_front(vb), __ T8H, parsed); 6157 6158 __ sub(parsedLength, parsedLength, 64); 6159 __ cmp(parsedLength, (u1)64); 6160 __ br(Assembler::GE, L_loop); 6161 __ cbz(parsedLength, L_end); 6162 6163 // if anything is left it should be a final 72 bytes of input 6164 // i.e. a final 48 12-bit values. so we handle this by loading 6165 // 48 bytes into all 16B lanes of front(vin) and only 24 6166 // bytes into the lower 8B lane of back(vin) 6167 vs_ld3_post(vs_front(vin), __ T16B, condensed); 6168 vs_ld3(vs_back(vin), __ T8B, condensed); 6169 6170 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 6171 // n.b. target elements 2 and 3 of va duplicate elements 4 and 6172 // 5 and target element 2 of vb duplicates element 4. 6173 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6174 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6175 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6176 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6177 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6178 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6179 6180 // This time expand just the lower 8 lanes 6181 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6182 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6183 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6184 6185 // shift lo byte of copy 1 of the middle stripe into the high byte 6186 __ shl(va[2], __ T8H, va[2], 8); 6187 __ shl(va[3], __ T8H, va[3], 8); 6188 __ shl(vb[2], __ T8H, vb[2], 8); 6189 6190 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into 6191 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit 6192 // int are in bit positions [4..11]. 
    __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
    __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
    __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);

    // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
    // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
    // copy2
    __ andr(va[2], __ T16B, va[2], v31);
    __ andr(va[3], __ T16B, va[3], v31);
    __ ushr(va[4], __ T8H, va[4], 4);
    __ ushr(va[5], __ T8H, va[5], 4);
    __ andr(vb[2], __ T16B, vb[2], v31);
    __ ushr(vb[4], __ T8H, vb[4], 4);

    // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
    // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair

    // n.b. ordering ensures: i) inputs are consumed before they are
    // overwritten ii) order of 16-bit results across successive
    // pairs of vectors in va and then lower half of vb reflects order
    // of corresponding 12-bit inputs
    __ addv(va[0], __ T8H, va[0], va[2]);
    __ addv(va[2], __ T8H, va[1], va[3]);
    __ addv(va[1], __ T8H, va[4], va[6]);
    __ addv(va[3], __ T8H, va[5], va[7]);
    __ addv(vb[0], __ T8H, vb[0], vb[2]);
    __ addv(vb[1], __ T8H, vb[4], vb[6]);

    // store 48 results interleaved as shorts
    vs_st2_post(vs_front(va), __ T8H, parsed);
    vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);

    __ BIND(L_end);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }

  // Kyber Barrett reduce function.
  // Implements
  // static int implKyberBarrettReduce(short[] coeffs) {}
  //
  // coeffs (short[256]) = c_rarg0
  address generate_kyberBarrettReduce() {

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::kyberBarrettReduce_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register coeffs = c_rarg0;

    const Register kyberConsts = r10;
    const Register result = r11;

    // As above we process 256 sets of values in total i.e. 32 x
    // 8H quadwords. So, we can load, add and store the data in 3
    // groups of 11, 11 and 10 at a time i.e. we need to map sets
    // of 10 or 11 registers. A further constraint is that the
    // mapping needs to skip callee saves. So, we allocate the
    // register sequences using two 8 sequences, two 2 sequences
    // and two single registers.
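    // Scalar sketch of the reduction applied to each 16-bit
    // coefficient x below (illustrative only, not emitted code; m is
    // the Barrett multiplier and q the Kyber modulus, both taken from
    // the constants table):
    //   int16_t t = (int16_t)(((int32_t)x * m) >> 26);
    //   x = (int16_t)(x - t * q);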
6261 VSeq<8> vs1_1(0); 6262 VSeq<2> vs1_2(16); 6263 FloatRegister vs1_3 = v28; 6264 VSeq<8> vs2_1(18); 6265 VSeq<2> vs2_2(26); 6266 FloatRegister vs2_3 = v29; 6267 6268 // we also need a pair of corresponding constant sequences 6269 6270 VSeq<8> vc1_1(30, 0); 6271 VSeq<2> vc1_2(30, 0); 6272 FloatRegister vc1_3 = v30; // for kyber_q 6273 6274 VSeq<8> vc2_1(31, 0); 6275 VSeq<2> vc2_2(31, 0); 6276 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier 6277 6278 __ add(result, coeffs, 0); 6279 __ lea(kyberConsts, 6280 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 6281 6282 // load q and the multiplier for the Barrett reduction 6283 __ add(kyberConsts, kyberConsts, 16); 6284 __ ldpq(vc1_3, vc2_3, kyberConsts); 6285 6286 for (int i = 0; i < 3; i++) { 6287 // load 80 or 88 coefficients 6288 vs_ldpq_post(vs1_1, coeffs); 6289 vs_ldpq_post(vs1_2, coeffs); 6290 if (i < 2) { 6291 __ ldr(vs1_3, __ Q, __ post(coeffs, 16)); 6292 } 6293 6294 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16 6295 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1); 6296 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2); 6297 if (i < 2) { 6298 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3); 6299 } 6300 6301 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26 6302 vs_sshr(vs2_1, __ T8H, vs2_1, 11); 6303 vs_sshr(vs2_2, __ T8H, vs2_2, 11); 6304 if (i < 2) { 6305 __ sshr(vs2_3, __ T8H, vs2_3, 11); 6306 } 6307 6308 // vs1 <- vs1 - vs2 * kyber_q 6309 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1); 6310 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2); 6311 if (i < 2) { 6312 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3); 6313 } 6314 6315 vs_stpq_post(vs1_1, result); 6316 vs_stpq_post(vs1_2, result); 6317 if (i < 2) { 6318 __ str(vs1_3, __ Q, __ post(result, 16)); 6319 } 6320 } 6321 6322 __ leave(); // required for proper stackwalking of RuntimeStub frame 6323 __ mov(r0, zr); // return 0 6324 __ ret(lr); 6325 6326 return start; 6327 } 6328 6329 6330 // Dilithium-specific montmul helper routines that generate parallel 6331 // code for, respectively, a single 4x4s vector sequence montmul or 6332 // two such multiplies in a row. 6333 6334 // Perform 16 32-bit Montgomery multiplications in parallel 6335 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 6336 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6337 // Use the helper routine to schedule a 4x4S Montgomery multiply. 6338 // It will assert that the register use is valid 6339 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq); 6340 } 6341 6342 // Perform 2x16 32-bit Montgomery multiplications in parallel 6343 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 6344 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6345 // Schedule two successive 4x4S multiplies via the montmul helper 6346 // on the front and back halves of va, vb and vc. The helper will 6347 // assert that the register use has no overlap conflicts on each 6348 // individual call but we also need to ensure that the necessary 6349 // disjoint/equality constraints are met across both calls. 6350 6351 // vb, vc, vtmp and vq must be disjoint. 
  // va must either be
  // disjoint from all other registers or equal vc

    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");

    // We multiply the front and back halves of each sequence 4 at a
    // time because
    //
    // 1) we are currently only able to get 4-way instruction
    // parallelism at best
    //
    // 2) we need registers for the constants in vq and temporary
    // scratch registers to hold intermediate results so vtmp can only
    // be a VSeq<4> which means we only have 4 scratch slots.

    vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
    vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
  }

  // Perform combined montmul then add/sub on 4x4S vectors.
  void dilithium_montmul16_sub_add(
          const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
          const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // compute a = montmul(a1, c)
    dilithium_montmul16(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
    vs_subv(va1, __ T4S, va0, vc);
    // and a0 = a0 + a
    vs_addv(va0, __ T4S, va0, vc);
  }

  // Perform combined add/sub then montmul on 4x4S vectors.
  void dilithium_sub_add_montmul16(
          const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
          const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
    // compute c = a0 - a1
    vs_subv(vtmp1, __ T4S, va0, va1);
    // output a0 = a0 + a1
    vs_addv(va0, __ T4S, va0, va1);
    // output a1 = b montmul c
    dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
  }

  // At these levels, the indices that correspond to the 'j's (and 'j+l's)
  // in the Java implementation come in sequences of at least 8, so we
  // can use ldpq to collect the corresponding data into pairs of vector
  // registers.
  // We collect the coefficients corresponding to the 'j+l' indexes into
  // the vector registers v0-v7, the zetas into the vector registers v16-v23
  // then we do the (Montgomery) multiplications by the zetas in parallel
  // into v16-v23, load the coeffs corresponding to the 'j' indexes into
  // v0-v7, then do the additions into v24-v31 and the subtractions into
  // v0-v7 and finally save the results back to the coeffs array.
  void dilithiumNttLevel0_4(const Register dilithiumConsts,
                            const Register coeffs, const Register zetas) {
    int c1 = 0;
    int c2 = 512;
    int startIncr;
    // don't use callee save registers v8 - v15
    VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
    VSeq<2> vq(30);                   // n.b.
constants overlap vs3 6425 int offsets[4] = { 0, 32, 64, 96 }; 6426 6427 for (int level = 0; level < 5; level++) { 6428 int c1Start = c1; 6429 int c2Start = c2; 6430 if (level == 3) { 6431 offsets[1] = 32; 6432 offsets[2] = 128; 6433 offsets[3] = 160; 6434 } else if (level == 4) { 6435 offsets[1] = 64; 6436 offsets[2] = 128; 6437 offsets[3] = 192; 6438 } 6439 6440 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a 6441 // time at 4 different offsets and multiply them in order by the 6442 // next set of input values. So we employ indexed load and store 6443 // pair instructions with arrangement 4S. 6444 for (int i = 0; i < 4; i++) { 6445 // reload q and qinv 6446 vs_ldpq(vq, dilithiumConsts); // qInv, q 6447 // load 8x4S coefficients via second start pos == c2 6448 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets); 6449 // load next 8x4S inputs == b 6450 vs_ldpq_post(vs2, zetas); 6451 // compute a == c2 * b mod MONT_Q 6452 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6453 // load 8x4s coefficients via first start pos == c1 6454 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6455 // compute a1 = c1 + a 6456 vs_addv(vs3, __ T4S, vs1, vs2); 6457 // compute a2 = c1 - a 6458 vs_subv(vs1, __ T4S, vs1, vs2); 6459 // output a1 and a2 6460 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6461 vs_stpq_indexed(vs1, coeffs, c2Start, offsets); 6462 6463 int k = 4 * level + i; 6464 6465 if (k > 7) { 6466 startIncr = 256; 6467 } else if (k == 5) { 6468 startIncr = 384; 6469 } else { 6470 startIncr = 128; 6471 } 6472 6473 c1Start += startIncr; 6474 c2Start += startIncr; 6475 } 6476 6477 c2 /= 2; 6478 } 6479 } 6480 6481 // Dilithium NTT function except for the final "normalization" to |coeff| < Q. 6482 // Implements the method 6483 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {} 6484 // of the Java class sun.security.provider 6485 // 6486 // coeffs (int[256]) = c_rarg0 6487 // zetas (int[256]) = c_rarg1 6488 address generate_dilithiumAlmostNtt() { 6489 6490 __ align(CodeEntryAlignment); 6491 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id; 6492 StubCodeMark mark(this, stub_id); 6493 address start = __ pc(); 6494 __ enter(); 6495 6496 const Register coeffs = c_rarg0; 6497 const Register zetas = c_rarg1; 6498 6499 const Register tmpAddr = r9; 6500 const Register dilithiumConsts = r10; 6501 const Register result = r11; 6502 // don't use callee save registers v8 - v15 6503 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6504 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6505 VSeq<2> vq(30); // n.b. constants overlap vs3 6506 int offsets[4] = { 0, 32, 64, 96}; 6507 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6508 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6509 __ add(result, coeffs, 0); 6510 __ lea(dilithiumConsts, 6511 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6512 6513 // Each level represents one iteration of the outer for loop of the Java version. 6514 6515 // level 0-4 6516 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas); 6517 6518 // level 5 6519 6520 // At level 5 the coefficients we need to combine with the zetas 6521 // are grouped in memory in blocks of size 4. So, for both sets of 6522 // coefficients we load 4 adjacent values at 8 different offsets 6523 // using an indexed ldr with register variant Q and multiply them 6524 // in sequence order by the next set of inputs. Likewise we store 6525 // the resuls using an indexed str with register variant Q. 
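    // For orientation (nothing extra is emitted here): each ldr with
    // register variant Q picks up one block of 4 ints, offsets2
    // selecting the 'j' blocks and offsets1 the matching 'j+l'
    // blocks; e.g. in the first iteration the first pair combined is
    // coeffs[0..3] (offset 0) with coeffs[4..7] (offset 16).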
6526 for (int i = 0; i < 1024; i += 256) { 6527 // reload constants q, qinv each iteration as they get clobbered later 6528 vs_ldpq(vq, dilithiumConsts); // qInv, q 6529 // load 32 (8x4S) coefficients via first offsets = c1 6530 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6531 // load next 32 (8x4S) inputs = b 6532 vs_ldpq_post(vs2, zetas); 6533 // a = b montmul c1 6534 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6535 // load 32 (8x4S) coefficients via second offsets = c2 6536 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2); 6537 // add/sub with result of multiply 6538 vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a 6539 vs_subv(vs1, __ T4S, vs1, vs2); // a2 = c2 - a 6540 // write back new coefficients using same offsets 6541 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2); 6542 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1); 6543 } 6544 6545 // level 6 6546 // At level 6 the coefficients we need to combine with the zetas 6547 // are grouped in memory in pairs, the first two being montmul 6548 // inputs and the second add/sub inputs. We can still implement 6549 // the montmul+sub+add using 4-way parallelism but only if we 6550 // combine the coefficients with the zetas 16 at a time. We load 8 6551 // adjacent values at 4 different offsets using an ld2 load with 6552 // arrangement 2D. That interleaves the lower and upper halves of 6553 // each pair of quadwords into successive vector registers. We 6554 // then need to montmul the 4 even elements of the coefficients 6555 // register sequence by the zetas in order and then add/sub the 4 6556 // odd elements of the coefficients register sequence. We use an 6557 // equivalent st2 operation to store the results back into memory 6558 // de-interleaved. 6559 for (int i = 0; i < 1024; i += 128) { 6560 // reload constants q, qinv each iteration as they get clobbered later 6561 vs_ldpq(vq, dilithiumConsts); // qInv, q 6562 // load interleaved 16 (4x2D) coefficients via offsets 6563 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6564 // load next 16 (4x4S) inputs 6565 vs_ldpq_post(vs_front(vs2), zetas); 6566 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6567 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6568 vs_front(vs2), vtmp, vq); 6569 // store interleaved 16 (4x2D) coefficients via offsets 6570 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6571 } 6572 6573 // level 7 6574 // At level 7 the coefficients we need to combine with the zetas 6575 // occur singly with montmul inputs alternating with add/sub 6576 // inputs. Once again we can use 4-way parallelism to combine 16 6577 // zetas at a time. However, we have to load 8 adjacent values at 6578 // 4 different offsets using an ld2 load with arrangement 4S. That 6579 // interleaves the odd words of each pair into one 6580 // coefficients vector register and the even words of the pair 6581 // into the next register. We then need to montmul the 4 even 6582 // elements of the coefficients register sequence by the zetas in 6583 // order and then add/sub the 4 odd elements of the coefficients 6584 // register sequence. We use an equivalent st2 operation to store 6585 // the results back into memory de-interleaved.
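    // Illustrative note (not generated code): with arrangement 4S an ld2
    // de-interleaves adjacent words, i.e. for each group of 8 words it
    // behaves like the scalar loop
    //
    //   for (int k = 0; k < 4; k++) {
    //     lo[k] = m[2 * k];         // even word of each pair
    //     hi[k] = m[2 * k + 1];     // odd word of each pair
    //   }
    //
    // and the matching st2 re-interleaves lo/hi on the way back to memory.
    // That is what lets the montmul inputs and the add/sub inputs of each
    // coefficient pair be processed as separate register sequences below
    // (lo/hi are illustrative names only, not registers used by the stub).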
6586 6587 for (int i = 0; i < 1024; i += 128) { 6588 // reload constants q, qinv each iteration as they get clobbered later 6589 vs_ldpq(vq, dilithiumConsts); // qInv, q 6590 // load interleaved 16 (4x4S) coefficients via offsets 6591 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6592 // load next 16 (4x4S) inputs 6593 vs_ldpq_post(vs_front(vs2), zetas); 6594 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6595 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6596 vs_front(vs2), vtmp, vq); 6597 // store interleaved 16 (4x4S) coefficients via offsets 6598 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6599 } 6600 __ leave(); // required for proper stackwalking of RuntimeStub frame 6601 __ mov(r0, zr); // return 0 6602 __ ret(lr); 6603 6604 return start; 6605 } 6606 6607 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6608 // in the Java implementation come in sequences of at least 8, so we 6609 // can use ldpq to collect the corresponding data into pairs of vector 6610 // registers 6611 // We collect the coefficients that correspond to the 'j's into vs1 6612 // the coefficiets that correspond to the 'j+l's into vs2 then 6613 // do the additions into vs3 and the subtractions into vs1 then 6614 // save the result of the additions, load the zetas into vs2 6615 // do the (Montgomery) multiplications by zeta in parallel into vs2 6616 // finally save the results back to the coeffs array 6617 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts, 6618 const Register coeffs, const Register zetas) { 6619 int c1 = 0; 6620 int c2 = 32; 6621 int startIncr; 6622 int offsets[4]; 6623 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6624 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6625 VSeq<2> vq(30); // n.b. constants overlap vs3 6626 6627 offsets[0] = 0; 6628 6629 for (int level = 3; level < 8; level++) { 6630 int c1Start = c1; 6631 int c2Start = c2; 6632 if (level == 3) { 6633 offsets[1] = 64; 6634 offsets[2] = 128; 6635 offsets[3] = 192; 6636 } else if (level == 4) { 6637 offsets[1] = 32; 6638 offsets[2] = 128; 6639 offsets[3] = 160; 6640 } else { 6641 offsets[1] = 32; 6642 offsets[2] = 64; 6643 offsets[3] = 96; 6644 } 6645 6646 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a 6647 // time at 4 different offsets and multiply them in order by the 6648 // next set of input values. So we employ indexed load and store 6649 // pair instructions with arrangement 4S. 6650 for (int i = 0; i < 4; i++) { 6651 // load v1 32 (8x4S) coefficients relative to first start index 6652 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6653 // load v2 32 (8x4S) coefficients relative to second start index 6654 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets); 6655 // a0 = v1 + v2 -- n.b. 
clobbers vqs 6656 vs_addv(vs3, __ T4S, vs1, vs2); 6657 // a1 = v1 - v2 6658 vs_subv(vs1, __ T4S, vs1, vs2); 6659 // save a1 relative to first start index 6660 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6661 // load constants q, qinv each iteration as they get clobbered above 6662 vs_ldpq(vq, dilithiumConsts); // qInv, q 6663 // load b next 32 (8x4S) inputs 6664 vs_ldpq_post(vs2, zetas); 6665 // a = a1 montmul b 6666 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6667 // save a relative to second start index 6668 vs_stpq_indexed(vs2, coeffs, c2Start, offsets); 6669 6670 int k = 4 * level + i; 6671 6672 if (k < 24) { 6673 startIncr = 256; 6674 } else if (k == 25) { 6675 startIncr = 384; 6676 } else { 6677 startIncr = 128; 6678 } 6679 6680 c1Start += startIncr; 6681 c2Start += startIncr; 6682 } 6683 6684 c2 *= 2; 6685 } 6686 } 6687 6688 // Dilithium Inverse NTT function except the final mod Q division by 2^256. 6689 // Implements the method 6690 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of 6691 // the sun.security.provider.ML_DSA class. 6692 // 6693 // coeffs (int[256]) = c_rarg0 6694 // zetas (int[256]) = c_rarg1 6695 address generate_dilithiumAlmostInverseNtt() { 6696 6697 __ align(CodeEntryAlignment); 6698 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id; 6699 StubCodeMark mark(this, stub_id); 6700 address start = __ pc(); 6701 __ enter(); 6702 6703 const Register coeffs = c_rarg0; 6704 const Register zetas = c_rarg1; 6705 6706 const Register tmpAddr = r9; 6707 const Register dilithiumConsts = r10; 6708 const Register result = r11; 6709 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6710 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6711 VSeq<2> vq(30); // n.b. constants overlap vs3 6712 int offsets[4] = { 0, 32, 64, 96 }; 6713 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6714 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6715 6716 __ add(result, coeffs, 0); 6717 __ lea(dilithiumConsts, 6718 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6719 6720 // Each level represents one iteration of the outer for loop of the Java version 6721 6722 // level 0 6723 // At level 0 we need to interleave adjacent quartets of 6724 // coefficients before we multiply and add/sub by the next 16 6725 // zetas just as we did for level 7 in the multiply code. So we 6726 // load and store the values using an ld2/st2 with arrangement 4S. 6727 for (int i = 0; i < 1024; i += 128) { 6728 // load constants q, qinv 6729 // n.b. this can be moved out of the loop as they do not get 6730 // clobbered by first two loops 6731 vs_ldpq(vq, dilithiumConsts); // qInv, q 6732 // a0/a1 load interleaved 32 (8x4S) coefficients 6733 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6734 // b load next 32 (8x4S) inputs 6735 vs_ldpq_post(vs_front(vs2), zetas); 6736 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6737 // n.b. second half of vs2 provides temporary register storage 6738 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6739 vs_front(vs2), vs_back(vs2), vtmp, vq); 6740 // a0/a1 store interleaved 32 (8x4S) coefficients 6741 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6742 } 6743 6744 // level 1 6745 // At level 1 we need to interleave pairs of adjacent pairs of 6746 // coefficients before we multiply by the next 16 zetas just as we 6747 // did for level 6 in the multiply code. 
So we load and store the 6748 // values an ld2/st2 with arrangement 2D. 6749 for (int i = 0; i < 1024; i += 128) { 6750 // a0/a1 load interleaved 32 (8x2D) coefficients 6751 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6752 // b load next 16 (4x4S) inputs 6753 vs_ldpq_post(vs_front(vs2), zetas); 6754 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6755 // n.b. second half of vs2 provides temporary register storage 6756 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6757 vs_front(vs2), vs_back(vs2), vtmp, vq); 6758 // a0/a1 store interleaved 32 (8x2D) coefficients 6759 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6760 } 6761 6762 // level 2 6763 // At level 2 coefficients come in blocks of 4. So, we load 4 6764 // adjacent coefficients at 8 distinct offsets for both the first 6765 // and second coefficient sequences, using an ldr with register 6766 // variant Q then combine them with next set of 32 zetas. Likewise 6767 // we store the results using an str with register variant Q. 6768 for (int i = 0; i < 1024; i += 256) { 6769 // c0 load 32 (8x4S) coefficients via first offsets 6770 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6771 // c1 load 32 (8x4S) coefficients via second offsets 6772 vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2); 6773 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3 6774 vs_addv(vs3, __ T4S, vs1, vs2); 6775 // c = c0 - c1 6776 vs_subv(vs1, __ T4S, vs1, vs2); 6777 // store a0 32 (8x4S) coefficients via first offsets 6778 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1); 6779 // b load 32 (8x4S) next inputs 6780 vs_ldpq_post(vs2, zetas); 6781 // reload constants q, qinv -- they were clobbered earlier 6782 vs_ldpq(vq, dilithiumConsts); // qInv, q 6783 // compute a1 = b montmul c 6784 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6785 // store a1 32 (8x4S) coefficients via second offsets 6786 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2); 6787 } 6788 6789 // level 3-7 6790 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas); 6791 6792 __ leave(); // required for proper stackwalking of RuntimeStub frame 6793 __ mov(r0, zr); // return 0 6794 __ ret(lr); 6795 6796 return start; 6797 } 6798 6799 // Dilithium multiply polynomials in the NTT domain. 6800 // Straightforward implementation of the method 6801 // static int implDilithiumNttMult( 6802 // int[] result, int[] ntta, int[] nttb {} of 6803 // the sun.security.provider.ML_DSA class. 6804 // 6805 // result (int[256]) = c_rarg0 6806 // poly1 (int[256]) = c_rarg1 6807 // poly2 (int[256]) = c_rarg2 6808 address generate_dilithiumNttMult() { 6809 6810 __ align(CodeEntryAlignment); 6811 StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id; 6812 StubCodeMark mark(this, stub_id); 6813 address start = __ pc(); 6814 __ enter(); 6815 6816 Label L_loop; 6817 6818 const Register result = c_rarg0; 6819 const Register poly1 = c_rarg1; 6820 const Register poly2 = c_rarg2; 6821 6822 const Register dilithiumConsts = r10; 6823 const Register len = r11; 6824 6825 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6826 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6827 VSeq<2> vq(30); // n.b. 
constants overlap vs3 6828 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE 6829 6830 __ lea(dilithiumConsts, 6831 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6832 6833 // load constants q, qinv 6834 vs_ldpq(vq, dilithiumConsts); // qInv, q 6835 // load constant rSquare into v29 6836 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare 6837 6838 __ mov(len, zr); 6839 __ add(len, len, 1024); 6840 6841 __ BIND(L_loop); 6842 6843 // b load 32 (8x4S) next inputs from poly1 6844 vs_ldpq_post(vs1, poly1); 6845 // c load 32 (8x4S) next inputs from poly2 6846 vs_ldpq_post(vs2, poly2); 6847 // compute a = b montmul c 6848 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6849 // compute a = rsquare montmul a 6850 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq); 6851 // save a 32 (8x4S) results 6852 vs_stpq_post(vs2, result); 6853 6854 __ sub(len, len, 128); 6855 __ cmp(len, (u1)128); 6856 __ br(Assembler::GE, L_loop); 6857 6858 __ leave(); // required for proper stackwalking of RuntimeStub frame 6859 __ mov(r0, zr); // return 0 6860 __ ret(lr); 6861 6862 return start; 6863 } 6864 6865 // Dilithium Montgomery multiply an array by a constant. 6866 // A straightforward implementation of the method 6867 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {} 6868 // of the sun.security.provider.ML_DSA class 6869 // 6870 // coeffs (int[256]) = c_rarg0 6871 // constant (int) = c_rarg1 6872 address generate_dilithiumMontMulByConstant() { 6873 6874 __ align(CodeEntryAlignment); 6875 StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id; 6876 StubCodeMark mark(this, stub_id); 6877 address start = __ pc(); 6878 __ enter(); 6879 6880 Label L_loop; 6881 6882 const Register coeffs = c_rarg0; 6883 const Register constant = c_rarg1; 6884 6885 const Register dilithiumConsts = r10; 6886 const Register result = r11; 6887 const Register len = r12; 6888 6889 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6890 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6891 VSeq<2> vq(30); // n.b. constants overlap vs3 6892 VSeq<8> vconst(29, 0); // for montmul by constant 6893 6894 // results track inputs 6895 __ add(result, coeffs, 0); 6896 __ lea(dilithiumConsts, 6897 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6898 6899 // load constants q, qinv -- they do not get clobbered by first two loops 6900 vs_ldpq(vq, dilithiumConsts); // qInv, q 6901 // copy caller supplied constant across vconst 6902 __ dup(vconst[0], __ T4S, constant); 6903 __ mov(len, zr); 6904 __ add(len, len, 1024); 6905 6906 __ BIND(L_loop); 6907 6908 // load next 32 inputs 6909 vs_ldpq_post(vs2, coeffs); 6910 // mont mul by constant 6911 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq); 6912 // write next 32 results 6913 vs_stpq_post(vs2, result); 6914 6915 __ sub(len, len, 128); 6916 __ cmp(len, (u1)128); 6917 __ br(Assembler::GE, L_loop); 6918 6919 __ leave(); // required for proper stackwalking of RuntimeStub frame 6920 __ mov(r0, zr); // return 0 6921 __ ret(lr); 6922 6923 return start; 6924 } 6925 6926 // Dilithium decompose poly.
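  // For reference, the per-coefficient scalar computation (transcribed from
  // the step-by-step comments in the body below; an illustrative sketch only,
  // with q, qadd, twoGamma2 and multiplier as set up by the stub) is:
  //
  //   rplus -= ((rplus + qadd) >> 23) * q;
  //   rplus += (rplus >> 31) & q;
  //   int quotient = (rplus * multiplier) >> 22;
  //   int r0 = rplus - quotient * twoGamma2;
  //   int mask = (twoGamma2 - r0) >> 22;
  //   r0 -= mask & twoGamma2;  quotient += mask & 1;
  //   mask = (twoGamma2 / 2 - r0) >> 31;
  //   r0 -= mask & twoGamma2;  quotient += mask & 1;
  //   int r1 = rplus - r0 - (q - 1);
  //   r1 = (r1 | -r1) >> 31;   // 0 if rplus - r0 == q - 1, -1 otherwise
  //   r0 += ~r1;
  //   r1 &= quotient;
  //   lowPart[m] = r0;  highPart[m] = r1;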
6927 // Implements the method 6928 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {} 6929 // of the sun.security.provider.ML_DSA class 6930 // 6931 // input (int[256]) = c_rarg0 6932 // lowPart (int[256]) = c_rarg1 6933 // highPart (int[256]) = c_rarg2 6934 // twoGamma2 (int) = c_rarg3 6935 // multiplier (int) = c_rarg4 6936 address generate_dilithiumDecomposePoly() { 6937 6938 __ align(CodeEntryAlignment); 6939 StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id; 6940 StubCodeMark mark(this, stub_id); 6941 address start = __ pc(); 6942 Label L_loop; 6943 6944 const Register input = c_rarg0; 6945 const Register lowPart = c_rarg1; 6946 const Register highPart = c_rarg2; 6947 const Register twoGamma2 = c_rarg3; 6948 const Register multiplier = c_rarg4; 6949 6950 const Register len = r9; 6951 const Register dilithiumConsts = r10; 6952 const Register tmp = r11; 6953 6954 // 6 independent sets of 4x4s values 6955 VSeq<4> vs1(0), vs2(4), vs3(8); 6956 VSeq<4> vs4(12), vs5(16), vtmp(20); 6957 6958 // 7 constants for cross-multiplying 6959 VSeq<4> one(25, 0); 6960 VSeq<4> qminus1(26, 0); 6961 VSeq<4> g2(27, 0); 6962 VSeq<4> twog2(28, 0); 6963 VSeq<4> mult(29, 0); 6964 VSeq<4> q(30, 0); 6965 VSeq<4> qadd(31, 0); 6966 6967 __ enter(); 6968 6969 __ lea(dilithiumConsts, 6970 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6971 6972 // save callee-saved registers 6973 __ stpd(v8, v9, __ pre(sp, -64)); 6974 __ stpd(v10, v11, Address(sp, 16)); 6975 __ stpd(v12, v13, Address(sp, 32)); 6976 __ stpd(v14, v15, Address(sp, 48)); 6977 6978 // populate constant registers 6979 __ mov(tmp, zr); 6980 __ add(tmp, tmp, 1); 6981 __ dup(one[0], __ T4S, tmp); // 1 6982 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q 6983 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce 6984 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2 6985 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce 6986 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1 6987 __ sshr(g2[0], __ T4S, v28, 1); // gamma2 6988 6989 __ mov(len, zr); 6990 __ add(len, len, 1024); 6991 6992 __ BIND(L_loop); 6993 6994 // load next 4x4S inputs interleaved: rplus --> vs1 6995 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64)); 6996 6997 // rplus = rplus - ((rplus + qadd) >> 23) * q 6998 vs_addv(vtmp, __ T4S, vs1, qadd); 6999 vs_sshr(vtmp, __ T4S, vtmp, 23); 7000 vs_mulv(vtmp, __ T4S, vtmp, q); 7001 vs_subv(vs1, __ T4S, vs1, vtmp); 7002 7003 // rplus = rplus + ((rplus >> 31) & dilithium_q); 7004 vs_sshr(vtmp, __ T4S, vs1, 31); 7005 vs_andr(vtmp, vtmp, q); 7006 vs_addv(vs1, __ T4S, vs1, vtmp); 7007 7008 // quotient --> vs2 7009 // int quotient = (rplus * multiplier) >> 22; 7010 vs_mulv(vtmp, __ T4S, vs1, mult); 7011 vs_sshr(vs2, __ T4S, vtmp, 22); 7012 7013 // r0 --> vs3 7014 // int r0 = rplus - quotient * twoGamma2; 7015 vs_mulv(vtmp, __ T4S, vs2, twog2); 7016 vs_subv(vs3, __ T4S, vs1, vtmp); 7017 7018 // mask --> vs4 7019 // int mask = (twoGamma2 - r0) >> 22; 7020 vs_subv(vtmp, __ T4S, twog2, vs3); 7021 vs_sshr(vs4, __ T4S, vtmp, 22); 7022 7023 // r0 -= (mask & twoGamma2); 7024 vs_andr(vtmp, vs4, twog2); 7025 vs_subv(vs3, __ T4S, vs3, vtmp); 7026 7027 // quotient += (mask & 1); 7028 vs_andr(vtmp, vs4, one); 7029 vs_addv(vs2, __ T4S, vs2, vtmp); 7030 7031 // mask = (twoGamma2 / 2 - r0) >> 31; 7032 vs_subv(vtmp, __ T4S, g2, vs3); 7033 vs_sshr(vs4, __ T4S, vtmp, 31); 7034 7035 // r0 -= (mask & twoGamma2); 7036 vs_andr(vtmp, vs4, twog2); 7037 
vs_subv(vs3, __ T4S, vs3, vtmp); 7038 7039 // quotient += (mask & 1); 7040 vs_andr(vtmp, vs4, one); 7041 vs_addv(vs2, __ T4S, vs2, vtmp); 7042 7043 // r1 --> vs5 7044 // int r1 = rplus - r0 - (dilithium_q - 1); 7045 vs_subv(vtmp, __ T4S, vs1, vs3); 7046 vs_subv(vs5, __ T4S, vtmp, qminus1); 7047 7048 // r1 --> vs1 (overwriting rplus) 7049 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 7050 vs_negr(vtmp, __ T4S, vs5); 7051 vs_orr(vtmp, vs5, vtmp); 7052 vs_sshr(vs1, __ T4S, vtmp, 31); 7053 7054 // r0 += ~r1; 7055 vs_notr(vtmp, vs1); 7056 vs_addv(vs3, __ T4S, vs3, vtmp); 7057 7058 // r1 = r1 & quotient; 7059 vs_andr(vs1, vs2, vs1); 7060 7061 // store results inteleaved 7062 // lowPart[m] = r0; 7063 // highPart[m] = r1; 7064 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64)); 7065 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64)); 7066 7067 __ sub(len, len, 64); 7068 __ cmp(len, (u1)64); 7069 __ br(Assembler::GE, L_loop); 7070 7071 // restore callee-saved vector registers 7072 __ ldpd(v14, v15, Address(sp, 48)); 7073 __ ldpd(v12, v13, Address(sp, 32)); 7074 __ ldpd(v10, v11, Address(sp, 16)); 7075 __ ldpd(v8, v9, __ post(sp, 64)); 7076 7077 __ leave(); // required for proper stackwalking of RuntimeStub frame 7078 __ mov(r0, zr); // return 0 7079 __ ret(lr); 7080 7081 return start; 7082 } 7083 7084 /** 7085 * Arguments: 7086 * 7087 * Inputs: 7088 * c_rarg0 - int crc 7089 * c_rarg1 - byte* buf 7090 * c_rarg2 - int length 7091 * 7092 * Output: 7093 * rax - int crc result 7094 */ 7095 address generate_updateBytesCRC32() { 7096 assert(UseCRC32Intrinsics, "what are we doing here?"); 7097 7098 __ align(CodeEntryAlignment); 7099 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id; 7100 StubCodeMark mark(this, stub_id); 7101 7102 address start = __ pc(); 7103 7104 const Register crc = c_rarg0; // crc 7105 const Register buf = c_rarg1; // source java byte array address 7106 const Register len = c_rarg2; // length 7107 const Register table0 = c_rarg3; // crc_table address 7108 const Register table1 = c_rarg4; 7109 const Register table2 = c_rarg5; 7110 const Register table3 = c_rarg6; 7111 const Register tmp3 = c_rarg7; 7112 7113 BLOCK_COMMENT("Entry:"); 7114 __ enter(); // required for proper stackwalking of RuntimeStub frame 7115 7116 __ kernel_crc32(crc, buf, len, 7117 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7118 7119 __ leave(); // required for proper stackwalking of RuntimeStub frame 7120 __ ret(lr); 7121 7122 return start; 7123 } 7124 7125 /** 7126 * Arguments: 7127 * 7128 * Inputs: 7129 * c_rarg0 - int crc 7130 * c_rarg1 - byte* buf 7131 * c_rarg2 - int length 7132 * c_rarg3 - int* table 7133 * 7134 * Output: 7135 * r0 - int crc result 7136 */ 7137 address generate_updateBytesCRC32C() { 7138 assert(UseCRC32CIntrinsics, "what are we doing here?"); 7139 7140 __ align(CodeEntryAlignment); 7141 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id; 7142 StubCodeMark mark(this, stub_id); 7143 7144 address start = __ pc(); 7145 7146 const Register crc = c_rarg0; // crc 7147 const Register buf = c_rarg1; // source java byte array address 7148 const Register len = c_rarg2; // length 7149 const Register table0 = c_rarg3; // crc_table address 7150 const Register table1 = c_rarg4; 7151 const Register table2 = c_rarg5; 7152 const Register table3 = c_rarg6; 7153 const Register tmp3 = c_rarg7; 7154 7155 BLOCK_COMMENT("Entry:"); 7156 __ enter(); // required for proper stackwalking of RuntimeStub frame 7157 
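    // Illustrative note (not generated code): kernel_crc32c computes the
    // CRC-32C (Castagnoli) checksum. A bit-at-a-time reference of the same
    // function -- assuming the usual reflected polynomial constant
    // 0x82F63B78 -- looks like:
    //
    //   uint32_t c = ~crc;
    //   for (int i = 0; i < len; i++) {
    //     c ^= (uint8_t)buf[i];
    //     for (int k = 0; k < 8; k++)
    //       c = (c >> 1) ^ (0x82F63B78u & (0u - (c & 1u)));
    //   }
    //   return ~c;
    //
    // The kernel below is intended as the optimized equivalent of this loop.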
7158 __ kernel_crc32c(crc, buf, len, 7159 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7160 7161 __ leave(); // required for proper stackwalking of RuntimeStub frame 7162 __ ret(lr); 7163 7164 return start; 7165 } 7166 7167 /*** 7168 * Arguments: 7169 * 7170 * Inputs: 7171 * c_rarg0 - int adler 7172 * c_rarg1 - byte* buff 7173 * c_rarg2 - int len 7174 * 7175 * Output: 7176 * c_rarg0 - int adler result 7177 */ 7178 address generate_updateBytesAdler32() { 7179 __ align(CodeEntryAlignment); 7180 StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id; 7181 StubCodeMark mark(this, stub_id); 7182 address start = __ pc(); 7183 7184 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 7185 7186 // Aliases 7187 Register adler = c_rarg0; 7188 Register s1 = c_rarg0; 7189 Register s2 = c_rarg3; 7190 Register buff = c_rarg1; 7191 Register len = c_rarg2; 7192 Register nmax = r4; 7193 Register base = r5; 7194 Register count = r6; 7195 Register temp0 = rscratch1; 7196 Register temp1 = rscratch2; 7197 FloatRegister vbytes = v0; 7198 FloatRegister vs1acc = v1; 7199 FloatRegister vs2acc = v2; 7200 FloatRegister vtable = v3; 7201 7202 // Max number of bytes we can process before having to take the mod 7203 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 7204 uint64_t BASE = 0xfff1; 7205 uint64_t NMAX = 0x15B0; 7206 7207 __ mov(base, BASE); 7208 __ mov(nmax, NMAX); 7209 7210 // Load accumulation coefficients for the upper 16 bits 7211 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 7212 __ ld1(vtable, __ T16B, Address(temp0)); 7213 7214 // s1 is initialized to the lower 16 bits of adler 7215 // s2 is initialized to the upper 16 bits of adler 7216 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 7217 __ uxth(s1, adler); // s1 = (adler & 0xffff) 7218 7219 // The pipelined loop needs at least 16 elements for 1 iteration 7220 // It does check this, but it is more effective to skip to the cleanup loop 7221 __ cmp(len, (u1)16); 7222 __ br(Assembler::HS, L_nmax); 7223 __ cbz(len, L_combine); 7224 7225 __ bind(L_simple_by1_loop); 7226 __ ldrb(temp0, Address(__ post(buff, 1))); 7227 __ add(s1, s1, temp0); 7228 __ add(s2, s2, s1); 7229 __ subs(len, len, 1); 7230 __ br(Assembler::HI, L_simple_by1_loop); 7231 7232 // s1 = s1 % BASE 7233 __ subs(temp0, s1, base); 7234 __ csel(s1, temp0, s1, Assembler::HS); 7235 7236 // s2 = s2 % BASE 7237 __ lsr(temp0, s2, 16); 7238 __ lsl(temp1, temp0, 4); 7239 __ sub(temp1, temp1, temp0); 7240 __ add(s2, temp1, s2, ext::uxth); 7241 7242 __ subs(temp0, s2, base); 7243 __ csel(s2, temp0, s2, Assembler::HS); 7244 7245 __ b(L_combine); 7246 7247 __ bind(L_nmax); 7248 __ subs(len, len, nmax); 7249 __ sub(count, nmax, 16); 7250 __ br(Assembler::LO, L_by16); 7251 7252 __ bind(L_nmax_loop); 7253 7254 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7255 vbytes, vs1acc, vs2acc, vtable); 7256 7257 __ subs(count, count, 16); 7258 __ br(Assembler::HS, L_nmax_loop); 7259 7260 // s1 = s1 % BASE 7261 __ lsr(temp0, s1, 16); 7262 __ lsl(temp1, temp0, 4); 7263 __ sub(temp1, temp1, temp0); 7264 __ add(temp1, temp1, s1, ext::uxth); 7265 7266 __ lsr(temp0, temp1, 16); 7267 __ lsl(s1, temp0, 4); 7268 __ sub(s1, s1, temp0); 7269 __ add(s1, s1, temp1, ext:: uxth); 7270 7271 __ subs(temp0, s1, base); 7272 __ csel(s1, temp0, s1, Assembler::HS); 7273 7274 // s2 = s2 % BASE 7275 __ lsr(temp0, s2, 16); 7276 __ lsl(temp1, temp0, 4); 7277 __ 
sub(temp1, temp1, temp0); 7278 __ add(temp1, temp1, s2, ext::uxth); 7279 7280 __ lsr(temp0, temp1, 16); 7281 __ lsl(s2, temp0, 4); 7282 __ sub(s2, s2, temp0); 7283 __ add(s2, s2, temp1, ext:: uxth); 7284 7285 __ subs(temp0, s2, base); 7286 __ csel(s2, temp0, s2, Assembler::HS); 7287 7288 __ subs(len, len, nmax); 7289 __ sub(count, nmax, 16); 7290 __ br(Assembler::HS, L_nmax_loop); 7291 7292 __ bind(L_by16); 7293 __ adds(len, len, count); 7294 __ br(Assembler::LO, L_by1); 7295 7296 __ bind(L_by16_loop); 7297 7298 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7299 vbytes, vs1acc, vs2acc, vtable); 7300 7301 __ subs(len, len, 16); 7302 __ br(Assembler::HS, L_by16_loop); 7303 7304 __ bind(L_by1); 7305 __ adds(len, len, 15); 7306 __ br(Assembler::LO, L_do_mod); 7307 7308 __ bind(L_by1_loop); 7309 __ ldrb(temp0, Address(__ post(buff, 1))); 7310 __ add(s1, temp0, s1); 7311 __ add(s2, s2, s1); 7312 __ subs(len, len, 1); 7313 __ br(Assembler::HS, L_by1_loop); 7314 7315 __ bind(L_do_mod); 7316 // s1 = s1 % BASE 7317 __ lsr(temp0, s1, 16); 7318 __ lsl(temp1, temp0, 4); 7319 __ sub(temp1, temp1, temp0); 7320 __ add(temp1, temp1, s1, ext::uxth); 7321 7322 __ lsr(temp0, temp1, 16); 7323 __ lsl(s1, temp0, 4); 7324 __ sub(s1, s1, temp0); 7325 __ add(s1, s1, temp1, ext:: uxth); 7326 7327 __ subs(temp0, s1, base); 7328 __ csel(s1, temp0, s1, Assembler::HS); 7329 7330 // s2 = s2 % BASE 7331 __ lsr(temp0, s2, 16); 7332 __ lsl(temp1, temp0, 4); 7333 __ sub(temp1, temp1, temp0); 7334 __ add(temp1, temp1, s2, ext::uxth); 7335 7336 __ lsr(temp0, temp1, 16); 7337 __ lsl(s2, temp0, 4); 7338 __ sub(s2, s2, temp0); 7339 __ add(s2, s2, temp1, ext:: uxth); 7340 7341 __ subs(temp0, s2, base); 7342 __ csel(s2, temp0, s2, Assembler::HS); 7343 7344 // Combine lower bits and higher bits 7345 __ bind(L_combine); 7346 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 7347 7348 __ ret(lr); 7349 7350 return start; 7351 } 7352 7353 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 7354 Register temp0, Register temp1, FloatRegister vbytes, 7355 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 7356 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 7357 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 7358 // In non-vectorized code, we update s1 and s2 as: 7359 // s1 <- s1 + b1 7360 // s2 <- s2 + s1 7361 // s1 <- s1 + b2 7362 // s2 <- s2 + b1 7363 // ... 7364 // s1 <- s1 + b16 7365 // s2 <- s2 + s1 7366 // Putting above assignments together, we have: 7367 // s1_new = s1 + b1 + b2 + ... + b16 7368 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 7369 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 7370 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 7371 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 7372 7373 // s2 = s2 + s1 * 16 7374 __ add(s2, s2, s1, Assembler::LSL, 4); 7375 7376 // vs1acc = b1 + b2 + b3 + ... + b16 7377 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 7378 __ umullv(vs2acc, __ T8B, vtable, vbytes); 7379 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 7380 __ uaddlv(vs1acc, __ T16B, vbytes); 7381 __ uaddlv(vs2acc, __ T8H, vs2acc); 7382 7383 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 7384 __ fmovd(temp0, vs1acc); 7385 __ fmovd(temp1, vs2acc); 7386 __ add(s1, s1, temp0); 7387 __ add(s2, s2, temp1); 7388 } 7389 7390 /** 7391 * Arguments: 7392 * 7393 * Input: 7394 * c_rarg0 - x address 7395 * c_rarg1 - x length 7396 * c_rarg2 - y address 7397 * c_rarg3 - y length 7398 * c_rarg4 - z address 7399 */ 7400 address generate_multiplyToLen() { 7401 __ align(CodeEntryAlignment); 7402 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 7403 StubCodeMark mark(this, stub_id); 7404 7405 address start = __ pc(); 7406 const Register x = r0; 7407 const Register xlen = r1; 7408 const Register y = r2; 7409 const Register ylen = r3; 7410 const Register z = r4; 7411 7412 const Register tmp0 = r5; 7413 const Register tmp1 = r10; 7414 const Register tmp2 = r11; 7415 const Register tmp3 = r12; 7416 const Register tmp4 = r13; 7417 const Register tmp5 = r14; 7418 const Register tmp6 = r15; 7419 const Register tmp7 = r16; 7420 7421 BLOCK_COMMENT("Entry:"); 7422 __ enter(); // required for proper stackwalking of RuntimeStub frame 7423 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7424 __ leave(); // required for proper stackwalking of RuntimeStub frame 7425 __ ret(lr); 7426 7427 return start; 7428 } 7429 7430 address generate_squareToLen() { 7431 // squareToLen algorithm for sizes 1..127 described in java code works 7432 // faster than multiply_to_len on some CPUs and slower on others, but 7433 // multiply_to_len shows a bit better overall results 7434 __ align(CodeEntryAlignment); 7435 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 7436 StubCodeMark mark(this, stub_id); 7437 address start = __ pc(); 7438 7439 const Register x = r0; 7440 const Register xlen = r1; 7441 const Register z = r2; 7442 const Register y = r4; // == x 7443 const Register ylen = r5; // == xlen 7444 7445 const Register tmp0 = r3; 7446 const Register tmp1 = r10; 7447 const Register tmp2 = r11; 7448 const Register tmp3 = r12; 7449 const Register tmp4 = r13; 7450 const Register tmp5 = r14; 7451 const Register tmp6 = r15; 7452 const Register tmp7 = r16; 7453 7454 RegSet spilled_regs = RegSet::of(y, ylen); 7455 BLOCK_COMMENT("Entry:"); 7456 __ enter(); 7457 __ push(spilled_regs, sp); 7458 __ mov(y, x); 7459 __ mov(ylen, xlen); 7460 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7461 __ pop(spilled_regs, sp); 7462 __ leave(); 7463 __ ret(lr); 7464 return start; 7465 } 7466 7467 address generate_mulAdd() { 7468 __ align(CodeEntryAlignment); 7469 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 7470 StubCodeMark mark(this, stub_id); 7471 7472 address start = __ pc(); 7473 7474 const Register out = r0; 7475 const Register in = r1; 7476 const Register offset = r2; 7477 const Register len = r3; 7478 const Register k = r4; 7479 7480 BLOCK_COMMENT("Entry:"); 7481 __ enter(); 7482 __ mul_add(out, in, offset, len, k); 7483 __ leave(); 7484 __ ret(lr); 7485 7486 return start; 7487 } 7488 7489 // Arguments: 7490 // 7491 // Input: 7492 // c_rarg0 - newArr address 7493 // c_rarg1 - oldArr address 7494 // c_rarg2 - newIdx 7495 // c_rarg3 - shiftCount 7496 // c_rarg4 - numIter 7497 // 7498 address generate_bigIntegerRightShift() { 7499 __ align(CodeEntryAlignment); 7500 StubGenStubId stub_id = 
StubGenStubId::bigIntegerRightShiftWorker_id; 7501 StubCodeMark mark(this, stub_id); 7502 address start = __ pc(); 7503 7504 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7505 7506 Register newArr = c_rarg0; 7507 Register oldArr = c_rarg1; 7508 Register newIdx = c_rarg2; 7509 Register shiftCount = c_rarg3; 7510 Register numIter = c_rarg4; 7511 Register idx = numIter; 7512 7513 Register newArrCur = rscratch1; 7514 Register shiftRevCount = rscratch2; 7515 Register oldArrCur = r13; 7516 Register oldArrNext = r14; 7517 7518 FloatRegister oldElem0 = v0; 7519 FloatRegister oldElem1 = v1; 7520 FloatRegister newElem = v2; 7521 FloatRegister shiftVCount = v3; 7522 FloatRegister shiftVRevCount = v4; 7523 7524 __ cbz(idx, Exit); 7525 7526 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7527 7528 // left shift count 7529 __ movw(shiftRevCount, 32); 7530 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7531 7532 // numIter too small to allow a 4-words SIMD loop, rolling back 7533 __ cmp(numIter, (u1)4); 7534 __ br(Assembler::LT, ShiftThree); 7535 7536 __ dup(shiftVCount, __ T4S, shiftCount); 7537 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7538 __ negr(shiftVCount, __ T4S, shiftVCount); 7539 7540 __ BIND(ShiftSIMDLoop); 7541 7542 // Calculate the load addresses 7543 __ sub(idx, idx, 4); 7544 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7545 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7546 __ add(oldArrCur, oldArrNext, 4); 7547 7548 // Load 4 words and process 7549 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 7550 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 7551 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7552 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7553 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7554 __ st1(newElem, __ T4S, Address(newArrCur)); 7555 7556 __ cmp(idx, (u1)4); 7557 __ br(Assembler::LT, ShiftTwoLoop); 7558 __ b(ShiftSIMDLoop); 7559 7560 __ BIND(ShiftTwoLoop); 7561 __ cbz(idx, Exit); 7562 __ cmp(idx, (u1)1); 7563 __ br(Assembler::EQ, ShiftOne); 7564 7565 // Calculate the load addresses 7566 __ sub(idx, idx, 2); 7567 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7568 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7569 __ add(oldArrCur, oldArrNext, 4); 7570 7571 // Load 2 words and process 7572 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 7573 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 7574 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7575 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7576 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7577 __ st1(newElem, __ T2S, Address(newArrCur)); 7578 __ b(ShiftTwoLoop); 7579 7580 __ BIND(ShiftThree); 7581 __ tbz(idx, 1, ShiftOne); 7582 __ tbz(idx, 0, ShiftTwo); 7583 __ ldrw(r10, Address(oldArr, 12)); 7584 __ ldrw(r11, Address(oldArr, 8)); 7585 __ lsrvw(r10, r10, shiftCount); 7586 __ lslvw(r11, r11, shiftRevCount); 7587 __ orrw(r12, r10, r11); 7588 __ strw(r12, Address(newArr, 8)); 7589 7590 __ BIND(ShiftTwo); 7591 __ ldrw(r10, Address(oldArr, 8)); 7592 __ ldrw(r11, Address(oldArr, 4)); 7593 __ lsrvw(r10, r10, shiftCount); 7594 __ lslvw(r11, r11, shiftRevCount); 7595 __ orrw(r12, r10, r11); 7596 __ strw(r12, Address(newArr, 4)); 7597 7598 __ BIND(ShiftOne); 7599 __ ldrw(r10, Address(oldArr, 4)); 7600 __ ldrw(r11, Address(oldArr)); 7601 __ lsrvw(r10, r10, shiftCount); 7602 __ lslvw(r11, r11, shiftRevCount); 7603 __ orrw(r12, r10, r11); 7604 __ strw(r12, Address(newArr)); 7605 7606 __ BIND(Exit); 7607 __ ret(lr); 7608 7609 return start; 7610 } 7611 7612 // 
Arguments: 7613 // 7614 // Input: 7615 // c_rarg0 - newArr address 7616 // c_rarg1 - oldArr address 7617 // c_rarg2 - newIdx 7618 // c_rarg3 - shiftCount 7619 // c_rarg4 - numIter 7620 // 7621 address generate_bigIntegerLeftShift() { 7622 __ align(CodeEntryAlignment); 7623 StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id; 7624 StubCodeMark mark(this, stub_id); 7625 address start = __ pc(); 7626 7627 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7628 7629 Register newArr = c_rarg0; 7630 Register oldArr = c_rarg1; 7631 Register newIdx = c_rarg2; 7632 Register shiftCount = c_rarg3; 7633 Register numIter = c_rarg4; 7634 7635 Register shiftRevCount = rscratch1; 7636 Register oldArrNext = rscratch2; 7637 7638 FloatRegister oldElem0 = v0; 7639 FloatRegister oldElem1 = v1; 7640 FloatRegister newElem = v2; 7641 FloatRegister shiftVCount = v3; 7642 FloatRegister shiftVRevCount = v4; 7643 7644 __ cbz(numIter, Exit); 7645 7646 __ add(oldArrNext, oldArr, 4); 7647 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7648 7649 // right shift count 7650 __ movw(shiftRevCount, 32); 7651 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7652 7653 // numIter too small to allow a 4-words SIMD loop, rolling back 7654 __ cmp(numIter, (u1)4); 7655 __ br(Assembler::LT, ShiftThree); 7656 7657 __ dup(shiftVCount, __ T4S, shiftCount); 7658 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7659 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 7660 7661 __ BIND(ShiftSIMDLoop); 7662 7663 // load 4 words and process 7664 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 7665 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 7666 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7667 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7668 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7669 __ st1(newElem, __ T4S, __ post(newArr, 16)); 7670 __ sub(numIter, numIter, 4); 7671 7672 __ cmp(numIter, (u1)4); 7673 __ br(Assembler::LT, ShiftTwoLoop); 7674 __ b(ShiftSIMDLoop); 7675 7676 __ BIND(ShiftTwoLoop); 7677 __ cbz(numIter, Exit); 7678 __ cmp(numIter, (u1)1); 7679 __ br(Assembler::EQ, ShiftOne); 7680 7681 // load 2 words and process 7682 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 7683 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 7684 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7685 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7686 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7687 __ st1(newElem, __ T2S, __ post(newArr, 8)); 7688 __ sub(numIter, numIter, 2); 7689 __ b(ShiftTwoLoop); 7690 7691 __ BIND(ShiftThree); 7692 __ ldrw(r10, __ post(oldArr, 4)); 7693 __ ldrw(r11, __ post(oldArrNext, 4)); 7694 __ lslvw(r10, r10, shiftCount); 7695 __ lsrvw(r11, r11, shiftRevCount); 7696 __ orrw(r12, r10, r11); 7697 __ strw(r12, __ post(newArr, 4)); 7698 __ tbz(numIter, 1, Exit); 7699 __ tbz(numIter, 0, ShiftOne); 7700 7701 __ BIND(ShiftTwo); 7702 __ ldrw(r10, __ post(oldArr, 4)); 7703 __ ldrw(r11, __ post(oldArrNext, 4)); 7704 __ lslvw(r10, r10, shiftCount); 7705 __ lsrvw(r11, r11, shiftRevCount); 7706 __ orrw(r12, r10, r11); 7707 __ strw(r12, __ post(newArr, 4)); 7708 7709 __ BIND(ShiftOne); 7710 __ ldrw(r10, Address(oldArr)); 7711 __ ldrw(r11, Address(oldArrNext)); 7712 __ lslvw(r10, r10, shiftCount); 7713 __ lsrvw(r11, r11, shiftRevCount); 7714 __ orrw(r12, r10, r11); 7715 __ strw(r12, Address(newArr)); 7716 7717 __ BIND(Exit); 7718 __ ret(lr); 7719 7720 return start; 7721 } 7722 7723 address generate_count_positives(address &count_positives_long) { 7724 const u1 
large_loop_size = 64; 7725 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 7726 int dcache_line = VM_Version::dcache_line_size(); 7727 7728 Register ary1 = r1, len = r2, result = r0; 7729 7730 __ align(CodeEntryAlignment); 7731 7732 StubGenStubId stub_id = StubGenStubId::count_positives_id; 7733 StubCodeMark mark(this, stub_id); 7734 7735 address entry = __ pc(); 7736 7737 __ enter(); 7738 // precondition: a copy of len is already in result 7739 // __ mov(result, len); 7740 7741 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 7742 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 7743 7744 __ cmp(len, (u1)15); 7745 __ br(Assembler::GT, LEN_OVER_15); 7746 // The only case when execution falls into this code is when pointer is near 7747 // the end of memory page and we have to avoid reading next page 7748 __ add(ary1, ary1, len); 7749 __ subs(len, len, 8); 7750 __ br(Assembler::GT, LEN_OVER_8); 7751 __ ldr(rscratch2, Address(ary1, -8)); 7752 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 7753 __ lsrv(rscratch2, rscratch2, rscratch1); 7754 __ tst(rscratch2, UPPER_BIT_MASK); 7755 __ csel(result, zr, result, Assembler::NE); 7756 __ leave(); 7757 __ ret(lr); 7758 __ bind(LEN_OVER_8); 7759 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 7760 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 7761 __ tst(rscratch2, UPPER_BIT_MASK); 7762 __ br(Assembler::NE, RET_NO_POP); 7763 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 7764 __ lsrv(rscratch1, rscratch1, rscratch2); 7765 __ tst(rscratch1, UPPER_BIT_MASK); 7766 __ bind(RET_NO_POP); 7767 __ csel(result, zr, result, Assembler::NE); 7768 __ leave(); 7769 __ ret(lr); 7770 7771 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 7772 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 7773 7774 count_positives_long = __ pc(); // 2nd entry point 7775 7776 __ enter(); 7777 7778 __ bind(LEN_OVER_15); 7779 __ push(spilled_regs, sp); 7780 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 7781 __ cbz(rscratch2, ALIGNED); 7782 __ ldp(tmp6, tmp1, Address(ary1)); 7783 __ mov(tmp5, 16); 7784 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 7785 __ add(ary1, ary1, rscratch1); 7786 __ orr(tmp6, tmp6, tmp1); 7787 __ tst(tmp6, UPPER_BIT_MASK); 7788 __ br(Assembler::NE, RET_ADJUST); 7789 __ sub(len, len, rscratch1); 7790 7791 __ bind(ALIGNED); 7792 __ cmp(len, large_loop_size); 7793 __ br(Assembler::LT, CHECK_16); 7794 // Perform 16-byte load as early return in pre-loop to handle situation 7795 // when initially aligned large array has negative values at starting bytes, 7796 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 7797 // slower. Cases with negative bytes further ahead won't be affected that 7798 // much. In fact, it'll be faster due to early loads, less instructions and 7799 // less branches in LARGE_LOOP. 
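    // Illustrative note (not generated code): the negative-byte test used by
    // this stub relies on a (signed) byte being negative exactly when its top
    // bit is set, so after OR-combining several 8-byte chunks a single test
    // against UPPER_BIT_MASK reports whether any byte seen so far was
    // negative. In scalar form:
    //
    //   bool any_negative(uint64_t chunk) {
    //     return (chunk & 0x8080808080808080ULL) != 0;
    //   }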
7800 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 7801 __ sub(len, len, 16); 7802 __ orr(tmp6, tmp6, tmp1); 7803 __ tst(tmp6, UPPER_BIT_MASK); 7804 __ br(Assembler::NE, RET_ADJUST_16); 7805 __ cmp(len, large_loop_size); 7806 __ br(Assembler::LT, CHECK_16); 7807 7808 if (SoftwarePrefetchHintDistance >= 0 7809 && SoftwarePrefetchHintDistance >= dcache_line) { 7810 // initial prefetch 7811 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 7812 } 7813 __ bind(LARGE_LOOP); 7814 if (SoftwarePrefetchHintDistance >= 0) { 7815 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 7816 } 7817 // Issue load instructions first, since it can save few CPU/MEM cycles, also 7818 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 7819 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 7820 // instructions per cycle and have less branches, but this approach disables 7821 // early return, thus, all 64 bytes are loaded and checked every time. 7822 __ ldp(tmp2, tmp3, Address(ary1)); 7823 __ ldp(tmp4, tmp5, Address(ary1, 16)); 7824 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 7825 __ ldp(tmp6, tmp1, Address(ary1, 48)); 7826 __ add(ary1, ary1, large_loop_size); 7827 __ sub(len, len, large_loop_size); 7828 __ orr(tmp2, tmp2, tmp3); 7829 __ orr(tmp4, tmp4, tmp5); 7830 __ orr(rscratch1, rscratch1, rscratch2); 7831 __ orr(tmp6, tmp6, tmp1); 7832 __ orr(tmp2, tmp2, tmp4); 7833 __ orr(rscratch1, rscratch1, tmp6); 7834 __ orr(tmp2, tmp2, rscratch1); 7835 __ tst(tmp2, UPPER_BIT_MASK); 7836 __ br(Assembler::NE, RET_ADJUST_LONG); 7837 __ cmp(len, large_loop_size); 7838 __ br(Assembler::GE, LARGE_LOOP); 7839 7840 __ bind(CHECK_16); // small 16-byte load pre-loop 7841 __ cmp(len, (u1)16); 7842 __ br(Assembler::LT, POST_LOOP16); 7843 7844 __ bind(LOOP16); // small 16-byte load loop 7845 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 7846 __ sub(len, len, 16); 7847 __ orr(tmp2, tmp2, tmp3); 7848 __ tst(tmp2, UPPER_BIT_MASK); 7849 __ br(Assembler::NE, RET_ADJUST_16); 7850 __ cmp(len, (u1)16); 7851 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 7852 7853 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 7854 __ cmp(len, (u1)8); 7855 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 7856 __ ldr(tmp3, Address(__ post(ary1, 8))); 7857 __ tst(tmp3, UPPER_BIT_MASK); 7858 __ br(Assembler::NE, RET_ADJUST); 7859 __ sub(len, len, 8); 7860 7861 __ bind(POST_LOOP16_LOAD_TAIL); 7862 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 7863 __ ldr(tmp1, Address(ary1)); 7864 __ mov(tmp2, 64); 7865 __ sub(tmp4, tmp2, len, __ LSL, 3); 7866 __ lslv(tmp1, tmp1, tmp4); 7867 __ tst(tmp1, UPPER_BIT_MASK); 7868 __ br(Assembler::NE, RET_ADJUST); 7869 // Fallthrough 7870 7871 __ bind(RET_LEN); 7872 __ pop(spilled_regs, sp); 7873 __ leave(); 7874 __ ret(lr); 7875 7876 // difference result - len is the count of guaranteed to be 7877 // positive bytes 7878 7879 __ bind(RET_ADJUST_LONG); 7880 __ add(len, len, (u1)(large_loop_size - 16)); 7881 __ bind(RET_ADJUST_16); 7882 __ add(len, len, 16); 7883 __ bind(RET_ADJUST); 7884 __ pop(spilled_regs, sp); 7885 __ leave(); 7886 __ sub(result, result, len); 7887 __ ret(lr); 7888 7889 return entry; 7890 } 7891 7892 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 7893 bool usePrefetch, Label &NOT_EQUAL) { 7894 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 7895 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 7896 tmp7 = r12, tmp8 = r13; 7897 Label LOOP; 7898 
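    // Illustrative note (not generated code): each trip round LOOP compares
    // 8 words (64 bytes) from each array. Differences are detected with
    // eor/orr so a single cbnz per 16-byte pair suffices; in scalar form one
    // such pair is roughly
    //
    //   uint64_t d = (a1[i] ^ a2[i]) | (a1[i + 1] ^ a2[i + 1]);
    //   if (d != 0) goto NOT_EQUAL;
    //
    // with the loads for the next pair issued before the current comparison
    // so that memory latency is overlapped with the checks.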
7899 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 7900 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 7901 __ bind(LOOP); 7902 if (usePrefetch) { 7903 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 7904 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 7905 } 7906 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 7907 __ eor(tmp1, tmp1, tmp2); 7908 __ eor(tmp3, tmp3, tmp4); 7909 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 7910 __ orr(tmp1, tmp1, tmp3); 7911 __ cbnz(tmp1, NOT_EQUAL); 7912 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 7913 __ eor(tmp5, tmp5, tmp6); 7914 __ eor(tmp7, tmp7, tmp8); 7915 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 7916 __ orr(tmp5, tmp5, tmp7); 7917 __ cbnz(tmp5, NOT_EQUAL); 7918 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 7919 __ eor(tmp1, tmp1, tmp2); 7920 __ eor(tmp3, tmp3, tmp4); 7921 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 7922 __ orr(tmp1, tmp1, tmp3); 7923 __ cbnz(tmp1, NOT_EQUAL); 7924 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 7925 __ eor(tmp5, tmp5, tmp6); 7926 __ sub(cnt1, cnt1, 8 * wordSize); 7927 __ eor(tmp7, tmp7, tmp8); 7928 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 7929 // tmp6 is not used. MacroAssembler::subs is used here (rather than 7930 // cmp) because subs allows an unlimited range of immediate operand. 7931 __ subs(tmp6, cnt1, loopThreshold); 7932 __ orr(tmp5, tmp5, tmp7); 7933 __ cbnz(tmp5, NOT_EQUAL); 7934 __ br(__ GE, LOOP); 7935 // post-loop 7936 __ eor(tmp1, tmp1, tmp2); 7937 __ eor(tmp3, tmp3, tmp4); 7938 __ orr(tmp1, tmp1, tmp3); 7939 __ sub(cnt1, cnt1, 2 * wordSize); 7940 __ cbnz(tmp1, NOT_EQUAL); 7941 } 7942 7943 void generate_large_array_equals_loop_simd(int loopThreshold, 7944 bool usePrefetch, Label &NOT_EQUAL) { 7945 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 7946 tmp2 = rscratch2; 7947 Label LOOP; 7948 7949 __ bind(LOOP); 7950 if (usePrefetch) { 7951 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 7952 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 7953 } 7954 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 7955 __ sub(cnt1, cnt1, 8 * wordSize); 7956 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 7957 __ subs(tmp1, cnt1, loopThreshold); 7958 __ eor(v0, __ T16B, v0, v4); 7959 __ eor(v1, __ T16B, v1, v5); 7960 __ eor(v2, __ T16B, v2, v6); 7961 __ eor(v3, __ T16B, v3, v7); 7962 __ orr(v0, __ T16B, v0, v1); 7963 __ orr(v1, __ T16B, v2, v3); 7964 __ orr(v0, __ T16B, v0, v1); 7965 __ umov(tmp1, v0, __ D, 0); 7966 __ umov(tmp2, v0, __ D, 1); 7967 __ orr(tmp1, tmp1, tmp2); 7968 __ cbnz(tmp1, NOT_EQUAL); 7969 __ br(__ GE, LOOP); 7970 } 7971 7972 // a1 = r1 - array1 address 7973 // a2 = r2 - array2 address 7974 // result = r0 - return value. Already contains "false" 7975 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 7976 // r3-r5 are reserved temporary registers 7977 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 7978 address generate_large_array_equals() { 7979 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 7980 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 7981 tmp7 = r12, tmp8 = r13; 7982 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 7983 SMALL_LOOP, POST_LOOP; 7984 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 7985 // calculate if at least 32 prefetched bytes are used 7986 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 7987 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 7988 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 7989 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 7990 tmp5, tmp6, tmp7, tmp8); 7991 7992 __ align(CodeEntryAlignment); 7993 7994 StubGenStubId stub_id = StubGenStubId::large_array_equals_id; 7995 StubCodeMark mark(this, stub_id); 7996 7997 address entry = __ pc(); 7998 __ enter(); 7999 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 8000 // also advance pointers to use post-increment instead of pre-increment 8001 __ add(a1, a1, wordSize); 8002 __ add(a2, a2, wordSize); 8003 if (AvoidUnalignedAccesses) { 8004 // both implementations (SIMD/nonSIMD) are using relatively large load 8005 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 8006 // on some CPUs in case of address is not at least 16-byte aligned. 8007 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 8008 // load if needed at least for 1st address and make if 16-byte aligned. 8009 Label ALIGNED16; 8010 __ tbz(a1, 3, ALIGNED16); 8011 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8012 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8013 __ sub(cnt1, cnt1, wordSize); 8014 __ eor(tmp1, tmp1, tmp2); 8015 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 8016 __ bind(ALIGNED16); 8017 } 8018 if (UseSIMDForArrayEquals) { 8019 if (SoftwarePrefetchHintDistance >= 0) { 8020 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8021 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8022 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 8023 /* prfm = */ true, NOT_EQUAL); 8024 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8025 __ br(__ LT, TAIL); 8026 } 8027 __ bind(NO_PREFETCH_LARGE_LOOP); 8028 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 8029 /* prfm = */ false, NOT_EQUAL); 8030 } else { 8031 __ push(spilled_regs, sp); 8032 if (SoftwarePrefetchHintDistance >= 0) { 8033 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8034 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8035 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 8036 /* prfm = */ true, NOT_EQUAL); 8037 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8038 __ br(__ LT, TAIL); 8039 } 8040 __ bind(NO_PREFETCH_LARGE_LOOP); 8041 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 8042 /* prfm = */ false, NOT_EQUAL); 8043 } 8044 __ bind(TAIL); 8045 __ cbz(cnt1, EQUAL); 8046 __ subs(cnt1, cnt1, wordSize); 8047 __ br(__ LE, POST_LOOP); 8048 __ bind(SMALL_LOOP); 8049 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8050 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8051 __ subs(cnt1, cnt1, wordSize); 8052 __ eor(tmp1, tmp1, tmp2); 8053 __ cbnz(tmp1, NOT_EQUAL); 8054 __ br(__ GT, SMALL_LOOP); 8055 __ bind(POST_LOOP); 8056 __ ldr(tmp1, Address(a1, cnt1)); 8057 __ ldr(tmp2, Address(a2, cnt1)); 8058 __ eor(tmp1, tmp1, tmp2); 8059 __ cbnz(tmp1, NOT_EQUAL); 8060 __ bind(EQUAL); 8061 __ mov(result, true); 8062 __ bind(NOT_EQUAL); 8063 if (!UseSIMDForArrayEquals) { 8064 __ pop(spilled_regs, sp); 8065 } 8066 __ bind(NOT_EQUAL_NO_POP); 8067 __ leave(); 8068 __ ret(lr); 8069 return entry; 8070 } 8071 8072 // result = r0 - return value. Contains initial hashcode value on entry. 
8073 // ary = r1 - array address 8074 // cnt = r2 - elements count 8075 // Clobbers: v0-v13, rscratch1, rscratch2 8076 address generate_large_arrays_hashcode(BasicType eltype) { 8077 const Register result = r0, ary = r1, cnt = r2; 8078 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 8079 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 8080 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 8081 const FloatRegister vpowm = v13; 8082 8083 ARRAYS_HASHCODE_REGISTERS; 8084 8085 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 8086 8087 unsigned int vf; // vectorization factor 8088 bool multiply_by_halves; 8089 Assembler::SIMD_Arrangement load_arrangement; 8090 switch (eltype) { 8091 case T_BOOLEAN: 8092 case T_BYTE: 8093 load_arrangement = Assembler::T8B; 8094 multiply_by_halves = true; 8095 vf = 8; 8096 break; 8097 case T_CHAR: 8098 case T_SHORT: 8099 load_arrangement = Assembler::T8H; 8100 multiply_by_halves = true; 8101 vf = 8; 8102 break; 8103 case T_INT: 8104 load_arrangement = Assembler::T4S; 8105 multiply_by_halves = false; 8106 vf = 4; 8107 break; 8108 default: 8109 ShouldNotReachHere(); 8110 } 8111 8112 // Unroll factor 8113 const unsigned uf = 4; 8114 8115 // Effective vectorization factor 8116 const unsigned evf = vf * uf; 8117 8118 __ align(CodeEntryAlignment); 8119 8120 StubGenStubId stub_id; 8121 switch (eltype) { 8122 case T_BOOLEAN: 8123 stub_id = StubGenStubId::large_arrays_hashcode_boolean_id; 8124 break; 8125 case T_BYTE: 8126 stub_id = StubGenStubId::large_arrays_hashcode_byte_id; 8127 break; 8128 case T_CHAR: 8129 stub_id = StubGenStubId::large_arrays_hashcode_char_id; 8130 break; 8131 case T_SHORT: 8132 stub_id = StubGenStubId::large_arrays_hashcode_short_id; 8133 break; 8134 case T_INT: 8135 stub_id = StubGenStubId::large_arrays_hashcode_int_id; 8136 break; 8137 default: 8138 stub_id = StubGenStubId::NO_STUBID; 8139 ShouldNotReachHere(); 8140 }; 8141 8142 StubCodeMark mark(this, stub_id); 8143 8144 address entry = __ pc(); 8145 __ enter(); 8146 8147 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 8148 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 8149 // value shouldn't change throughout both loops. 8150 __ movw(rscratch1, intpow(31U, 3)); 8151 __ mov(vpow, Assembler::S, 0, rscratch1); 8152 __ movw(rscratch1, intpow(31U, 2)); 8153 __ mov(vpow, Assembler::S, 1, rscratch1); 8154 __ movw(rscratch1, intpow(31U, 1)); 8155 __ mov(vpow, Assembler::S, 2, rscratch1); 8156 __ movw(rscratch1, intpow(31U, 0)); 8157 __ mov(vpow, Assembler::S, 3, rscratch1); 8158 8159 __ mov(vmul0, Assembler::T16B, 0); 8160 __ mov(vmul0, Assembler::S, 3, result); 8161 8162 __ andr(rscratch2, cnt, (uf - 1) * vf); 8163 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 8164 8165 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf)); 8166 __ mov(vpowm, Assembler::S, 0, rscratch1); 8167 8168 // SMALL LOOP 8169 __ bind(SMALL_LOOP); 8170 8171 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype)))); 8172 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8173 __ subsw(rscratch2, rscratch2, vf); 8174 8175 if (load_arrangement == Assembler::T8B) { 8176 // Extend 8B to 8H to be able to use vector multiply 8177 // instructions 8178 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8179 if (is_signed_subword_type(eltype)) { 8180 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8181 } else { 8182 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8183 } 8184 } 8185 8186 switch (load_arrangement) { 8187 case Assembler::T4S: 8188 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8189 break; 8190 case Assembler::T8B: 8191 case Assembler::T8H: 8192 assert(is_subword_type(eltype), "subword type expected"); 8193 if (is_signed_subword_type(eltype)) { 8194 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8195 } else { 8196 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8197 } 8198 break; 8199 default: 8200 __ should_not_reach_here(); 8201 } 8202 8203 // Process the upper half of a vector 8204 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8205 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8206 if (is_signed_subword_type(eltype)) { 8207 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8208 } else { 8209 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8210 } 8211 } 8212 8213 __ br(Assembler::HI, SMALL_LOOP); 8214 8215 // SMALL LOOP'S EPILOQUE 8216 __ lsr(rscratch2, cnt, exact_log2(evf)); 8217 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER); 8218 8219 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8220 __ addv(vmul0, Assembler::T4S, vmul0); 8221 __ umov(result, vmul0, Assembler::S, 0); 8222 8223 // TAIL 8224 __ bind(TAIL); 8225 8226 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs 8227 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs. 8228 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 8229 __ andr(rscratch2, cnt, vf - 1); 8230 __ bind(TAIL_SHORTCUT); 8231 __ adr(rscratch1, BR_BASE); 8232 // For Cortex-A53 offset is 4 because 2 nops are generated. 8233 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3); 8234 __ movw(rscratch2, 0x1f); 8235 __ br(rscratch1); 8236 8237 for (size_t i = 0; i < vf - 1; ++i) { 8238 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 8239 eltype); 8240 __ maddw(result, result, rscratch2, rscratch1); 8241 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler). 8242 // Generate 2nd nop to have 4 instructions per iteration. 
8243 if (VM_Version::supports_a53mac()) { 8244 __ nop(); 8245 } 8246 } 8247 __ bind(BR_BASE); 8248 8249 __ leave(); 8250 __ ret(lr); 8251 8252 // LARGE LOOP 8253 __ bind(LARGE_LOOP_PREHEADER); 8254 8255 __ lsr(rscratch2, cnt, exact_log2(evf)); 8256 8257 if (multiply_by_halves) { 8258 // 31^4 - multiplier between lower and upper parts of a register 8259 __ movw(rscratch1, intpow(31U, vf / 2)); 8260 __ mov(vpowm, Assembler::S, 1, rscratch1); 8261 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 8262 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 8263 __ mov(vpowm, Assembler::S, 0, rscratch1); 8264 } else { 8265 // 31^16 8266 __ movw(rscratch1, intpow(31U, evf)); 8267 __ mov(vpowm, Assembler::S, 0, rscratch1); 8268 } 8269 8270 __ mov(vmul3, Assembler::T16B, 0); 8271 __ mov(vmul2, Assembler::T16B, 0); 8272 __ mov(vmul1, Assembler::T16B, 0); 8273 8274 __ bind(LARGE_LOOP); 8275 8276 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 8277 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 8278 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 8279 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8280 8281 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 8282 Address(__ post(ary, evf * type2aelembytes(eltype)))); 8283 8284 if (load_arrangement == Assembler::T8B) { 8285 // Extend 8B to 8H to be able to use vector multiply 8286 // instructions 8287 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8288 if (is_signed_subword_type(eltype)) { 8289 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8290 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8291 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8292 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8293 } else { 8294 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8295 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8296 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8297 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8298 } 8299 } 8300 8301 switch (load_arrangement) { 8302 case Assembler::T4S: 8303 __ addv(vmul3, load_arrangement, vmul3, vdata3); 8304 __ addv(vmul2, load_arrangement, vmul2, vdata2); 8305 __ addv(vmul1, load_arrangement, vmul1, vdata1); 8306 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8307 break; 8308 case Assembler::T8B: 8309 case Assembler::T8H: 8310 assert(is_subword_type(eltype), "subword type expected"); 8311 if (is_signed_subword_type(eltype)) { 8312 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8313 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8314 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8315 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8316 } else { 8317 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8318 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8319 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8320 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8321 } 8322 break; 8323 default: 8324 __ should_not_reach_here(); 8325 } 8326 8327 // Process the upper half of a vector 8328 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8329 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 8330 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 8331 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 8332 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 8333 if (is_signed_subword_type(eltype)) { 8334 __ saddwv2(vmul3, 
vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8335 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8336 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8337 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8338 } else { 8339 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8340 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8341 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8342 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8343 } 8344 } 8345 8346 __ subsw(rscratch2, rscratch2, 1); 8347 __ br(Assembler::HI, LARGE_LOOP); 8348 8349 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 8350 __ addv(vmul3, Assembler::T4S, vmul3); 8351 __ umov(result, vmul3, Assembler::S, 0); 8352 8353 __ mov(rscratch2, intpow(31U, vf)); 8354 8355 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 8356 __ addv(vmul2, Assembler::T4S, vmul2); 8357 __ umov(rscratch1, vmul2, Assembler::S, 0); 8358 __ maddw(result, result, rscratch2, rscratch1); 8359 8360 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 8361 __ addv(vmul1, Assembler::T4S, vmul1); 8362 __ umov(rscratch1, vmul1, Assembler::S, 0); 8363 __ maddw(result, result, rscratch2, rscratch1); 8364 8365 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8366 __ addv(vmul0, Assembler::T4S, vmul0); 8367 __ umov(rscratch1, vmul0, Assembler::S, 0); 8368 __ maddw(result, result, rscratch2, rscratch1); 8369 8370 __ andr(rscratch2, cnt, vf - 1); 8371 __ cbnz(rscratch2, TAIL_SHORTCUT); 8372 8373 __ leave(); 8374 __ ret(lr); 8375 8376 return entry; 8377 } 8378 8379 address generate_dsin_dcos(bool isCos) { 8380 __ align(CodeEntryAlignment); 8381 StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id); 8382 StubCodeMark mark(this, stub_id); 8383 address start = __ pc(); 8384 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 8385 (address)StubRoutines::aarch64::_two_over_pi, 8386 (address)StubRoutines::aarch64::_pio2, 8387 (address)StubRoutines::aarch64::_dsin_coef, 8388 (address)StubRoutines::aarch64::_dcos_coef); 8389 return start; 8390 } 8391 8392 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 8393 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 8394 Label &DIFF2) { 8395 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 8396 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 8397 8398 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 8399 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8400 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 8401 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 8402 8403 __ fmovd(tmpL, vtmp3); 8404 __ eor(rscratch2, tmp3, tmpL); 8405 __ cbnz(rscratch2, DIFF2); 8406 8407 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8408 __ umov(tmpL, vtmp3, __ D, 1); 8409 __ eor(rscratch2, tmpU, tmpL); 8410 __ cbnz(rscratch2, DIFF1); 8411 8412 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 8413 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8414 __ fmovd(tmpL, vtmp); 8415 __ eor(rscratch2, tmp3, tmpL); 8416 __ cbnz(rscratch2, DIFF2); 8417 8418 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8419 __ umov(tmpL, vtmp, __ D, 1); 8420 __ eor(rscratch2, tmpU, tmpL); 8421 __ cbnz(rscratch2, DIFF1); 8422 } 8423 8424 // r0 = result 8425 // r1 = str1 8426 // r2 = cnt1 8427 // r3 = str2 8428 // r4 = cnt2 8429 // r10 = tmp1 8430 // r11 = tmp2 8431 address generate_compare_long_string_different_encoding(bool isLU) { 8432 __ align(CodeEntryAlignment); 8433 StubGenStubId stub_id = (isLU ? 
StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id); 8434 StubCodeMark mark(this, stub_id); 8435 address entry = __ pc(); 8436 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 8437 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 8438 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 8439 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8440 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 8441 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 8442 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 8443 8444 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 8445 8446 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 8447 // cnt2 == amount of characters left to compare 8448 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 8449 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8450 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 8451 __ add(str2, str2, isLU ? wordSize : wordSize/2); 8452 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 8453 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 8454 __ eor(rscratch2, tmp1, tmp2); 8455 __ mov(rscratch1, tmp2); 8456 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 8457 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 8458 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 8459 __ push(spilled_regs, sp); 8460 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 8461 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 8462 8463 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8464 8465 if (SoftwarePrefetchHintDistance >= 0) { 8466 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8467 __ br(__ LT, NO_PREFETCH); 8468 __ bind(LARGE_LOOP_PREFETCH); 8469 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 8470 __ mov(tmp4, 2); 8471 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8472 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 8473 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8474 __ subs(tmp4, tmp4, 1); 8475 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 8476 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8477 __ mov(tmp4, 2); 8478 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 8479 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8480 __ subs(tmp4, tmp4, 1); 8481 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 8482 __ sub(cnt2, cnt2, 64); 8483 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8484 __ br(__ GE, LARGE_LOOP_PREFETCH); 8485 } 8486 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 8487 __ bind(NO_PREFETCH); 8488 __ subs(cnt2, cnt2, 16); 8489 __ br(__ LT, TAIL); 8490 __ align(OptoLoopAlignment); 8491 __ bind(SMALL_LOOP); // smaller loop 8492 __ subs(cnt2, cnt2, 16); 8493 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8494 __ br(__ GE, SMALL_LOOP); 8495 __ cmn(cnt2, (u1)16); 8496 __ br(__ EQ, LOAD_LAST); 8497 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 8498 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 8499 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 8500 __ ldr(tmp3, Address(cnt1, -8)); 8501 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 8502 __ b(LOAD_LAST); 8503 __ bind(DIFF2); 8504 __ mov(tmpU, tmp3); 8505 __ bind(DIFF1); 8506 __ pop(spilled_regs, sp); 8507 __ b(CALCULATE_DIFFERENCE); 8508 __ bind(LOAD_LAST); 8509 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by 
compare_string_16_x_LU. 8510 // No need to load it again 8511 __ mov(tmpU, tmp3); 8512 __ pop(spilled_regs, sp); 8513 8514 // tmp2 points to the address of the last 4 Latin1 characters right now 8515 __ ldrs(vtmp, Address(tmp2)); 8516 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8517 __ fmovd(tmpL, vtmp); 8518 8519 __ eor(rscratch2, tmpU, tmpL); 8520 __ cbz(rscratch2, DONE); 8521 8522 // Find the first different characters in the longwords and 8523 // compute their difference. 8524 __ bind(CALCULATE_DIFFERENCE); 8525 __ rev(rscratch2, rscratch2); 8526 __ clz(rscratch2, rscratch2); 8527 __ andr(rscratch2, rscratch2, -16); 8528 __ lsrv(tmp1, tmp1, rscratch2); 8529 __ uxthw(tmp1, tmp1); 8530 __ lsrv(rscratch1, rscratch1, rscratch2); 8531 __ uxthw(rscratch1, rscratch1); 8532 __ subw(result, tmp1, rscratch1); 8533 __ bind(DONE); 8534 __ ret(lr); 8535 return entry; 8536 } 8537 8538 // r0 = input (float16) 8539 // v0 = result (float) 8540 // v1 = temporary float register 8541 address generate_float16ToFloat() { 8542 __ align(CodeEntryAlignment); 8543 StubGenStubId stub_id = StubGenStubId::hf2f_id; 8544 StubCodeMark mark(this, stub_id); 8545 address entry = __ pc(); 8546 BLOCK_COMMENT("Entry:"); 8547 __ flt16_to_flt(v0, r0, v1); 8548 __ ret(lr); 8549 return entry; 8550 } 8551 8552 // v0 = input (float) 8553 // r0 = result (float16) 8554 // v1 = temporary float register 8555 address generate_floatToFloat16() { 8556 __ align(CodeEntryAlignment); 8557 StubGenStubId stub_id = StubGenStubId::f2hf_id; 8558 StubCodeMark mark(this, stub_id); 8559 address entry = __ pc(); 8560 BLOCK_COMMENT("Entry:"); 8561 __ flt_to_flt16(r0, v0, v1); 8562 __ ret(lr); 8563 return entry; 8564 } 8565 8566 address generate_method_entry_barrier() { 8567 __ align(CodeEntryAlignment); 8568 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 8569 StubCodeMark mark(this, stub_id); 8570 8571 Label deoptimize_label; 8572 8573 address start = __ pc(); 8574 8575 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 8576 8577 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 8578 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8579 // We can get here despite the nmethod being good, if we have not 8580 // yet applied our cross modification fence (or data fence). 
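      // Descriptive summary of the instructions that follow: re-load the
      // global patching epoch, publish it into this thread's epoch slot
      // (which sits 4 bytes past the disarmed guard value), then issue
      // isb + LoadLoad so that subsequent instruction and data fetches are
      // ordered after that update.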
8581 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 8582 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 8583 __ ldrw(rscratch2, rscratch2); 8584 __ strw(rscratch2, thread_epoch_addr); 8585 __ isb(); 8586 __ membar(__ LoadLoad); 8587 } 8588 8589 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 8590 8591 __ enter(); 8592 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 8593 8594 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 8595 8596 __ push_call_clobbered_registers(); 8597 8598 __ mov(c_rarg0, rscratch2); 8599 __ call_VM_leaf 8600 (CAST_FROM_FN_PTR 8601 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 8602 8603 __ reset_last_Java_frame(true); 8604 8605 __ mov(rscratch1, r0); 8606 8607 __ pop_call_clobbered_registers(); 8608 8609 __ cbnz(rscratch1, deoptimize_label); 8610 8611 __ leave(); 8612 __ ret(lr); 8613 8614 __ BIND(deoptimize_label); 8615 8616 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 8617 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 8618 8619 __ mov(sp, rscratch1); 8620 __ br(rscratch2); 8621 8622 return start; 8623 } 8624 8625 // r0 = result 8626 // r1 = str1 8627 // r2 = cnt1 8628 // r3 = str2 8629 // r4 = cnt2 8630 // r10 = tmp1 8631 // r11 = tmp2 8632 address generate_compare_long_string_same_encoding(bool isLL) { 8633 __ align(CodeEntryAlignment); 8634 StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id); 8635 StubCodeMark mark(this, stub_id); 8636 address entry = __ pc(); 8637 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8638 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 8639 8640 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 8641 8642 // exit from large loop when less than 64 bytes left to read or we're about 8643 // to prefetch memory behind array border 8644 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 8645 8646 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 8647 __ eor(rscratch2, tmp1, tmp2); 8648 __ cbnz(rscratch2, CAL_DIFFERENCE); 8649 8650 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 8651 // update pointers, because of previous read 8652 __ add(str1, str1, wordSize); 8653 __ add(str2, str2, wordSize); 8654 if (SoftwarePrefetchHintDistance >= 0) { 8655 __ align(OptoLoopAlignment); 8656 __ bind(LARGE_LOOP_PREFETCH); 8657 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 8658 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 8659 8660 for (int i = 0; i < 4; i++) { 8661 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 8662 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 8663 __ cmp(tmp1, tmp2); 8664 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 8665 __ br(Assembler::NE, DIFF); 8666 } 8667 __ sub(cnt2, cnt2, isLL ? 64 : 32); 8668 __ add(str1, str1, 64); 8669 __ add(str2, str2, 64); 8670 __ subs(rscratch2, cnt2, largeLoopExitCondition); 8671 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 8672 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 8673 } 8674 8675 __ subs(rscratch1, cnt2, isLL ? 
16 : 8); 8676 __ br(Assembler::LE, LESS16); 8677 __ align(OptoLoopAlignment); 8678 __ bind(LOOP_COMPARE16); 8679 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 8680 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 8681 __ cmp(tmp1, tmp2); 8682 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 8683 __ br(Assembler::NE, DIFF); 8684 __ sub(cnt2, cnt2, isLL ? 16 : 8); 8685 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 8686 __ br(Assembler::LT, LESS16); 8687 8688 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 8689 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 8690 __ cmp(tmp1, tmp2); 8691 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 8692 __ br(Assembler::NE, DIFF); 8693 __ sub(cnt2, cnt2, isLL ? 16 : 8); 8694 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 8695 __ br(Assembler::GE, LOOP_COMPARE16); 8696 __ cbz(cnt2, LENGTH_DIFF); 8697 8698 __ bind(LESS16); 8699 // each 8 compare 8700 __ subs(cnt2, cnt2, isLL ? 8 : 4); 8701 __ br(Assembler::LE, LESS8); 8702 __ ldr(tmp1, Address(__ post(str1, 8))); 8703 __ ldr(tmp2, Address(__ post(str2, 8))); 8704 __ eor(rscratch2, tmp1, tmp2); 8705 __ cbnz(rscratch2, CAL_DIFFERENCE); 8706 __ sub(cnt2, cnt2, isLL ? 8 : 4); 8707 8708 __ bind(LESS8); // directly load last 8 bytes 8709 if (!isLL) { 8710 __ add(cnt2, cnt2, cnt2); 8711 } 8712 __ ldr(tmp1, Address(str1, cnt2)); 8713 __ ldr(tmp2, Address(str2, cnt2)); 8714 __ eor(rscratch2, tmp1, tmp2); 8715 __ cbz(rscratch2, LENGTH_DIFF); 8716 __ b(CAL_DIFFERENCE); 8717 8718 __ bind(DIFF); 8719 __ cmp(tmp1, tmp2); 8720 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 8721 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 8722 // reuse rscratch2 register for the result of eor instruction 8723 __ eor(rscratch2, tmp1, tmp2); 8724 8725 __ bind(CAL_DIFFERENCE); 8726 __ rev(rscratch2, rscratch2); 8727 __ clz(rscratch2, rscratch2); 8728 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 8729 __ lsrv(tmp1, tmp1, rscratch2); 8730 __ lsrv(tmp2, tmp2, rscratch2); 8731 if (isLL) { 8732 __ uxtbw(tmp1, tmp1); 8733 __ uxtbw(tmp2, tmp2); 8734 } else { 8735 __ uxthw(tmp1, tmp1); 8736 __ uxthw(tmp2, tmp2); 8737 } 8738 __ subw(result, tmp1, tmp2); 8739 8740 __ bind(LENGTH_DIFF); 8741 __ ret(lr); 8742 return entry; 8743 } 8744 8745 enum string_compare_mode { 8746 LL, 8747 LU, 8748 UL, 8749 UU, 8750 }; 8751 8752 // The following registers are declared in aarch64.ad 8753 // r0 = result 8754 // r1 = str1 8755 // r2 = cnt1 8756 // r3 = str2 8757 // r4 = cnt2 8758 // r10 = tmp1 8759 // r11 = tmp2 8760 // z0 = ztmp1 8761 // z1 = ztmp2 8762 // p0 = pgtmp1 8763 // p1 = pgtmp2 8764 address generate_compare_long_string_sve(string_compare_mode mode) { 8765 StubGenStubId stub_id; 8766 switch (mode) { 8767 case LL: stub_id = StubGenStubId::compare_long_string_LL_id; break; 8768 case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break; 8769 case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break; 8770 case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break; 8771 default: ShouldNotReachHere(); 8772 } 8773 8774 __ align(CodeEntryAlignment); 8775 address entry = __ pc(); 8776 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8777 tmp1 = r10, tmp2 = r11; 8778 8779 Label LOOP, DONE, MISMATCH; 8780 Register vec_len = tmp1; 8781 Register idx = tmp2; 8782 // The minimum of the string lengths has been stored in cnt2. 
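    // A scalar sketch of what the whilelt-predicated loop below computes
    // (illustration only):
    //
    //   for (size_t i = 0; i < min_len; i++) {
    //     if (s1[i] != s2[i]) return s1[i] - s2[i];
    //   }
    //   // no mismatch: fall through to DONE with whatever the caller
    //   // pre-loaded into r0 (typically the length difference)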
8783 Register cnt = cnt2; 8784 FloatRegister ztmp1 = z0, ztmp2 = z1; 8785 PRegister pgtmp1 = p0, pgtmp2 = p1; 8786 8787 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 8788 switch (mode) { \ 8789 case LL: \ 8790 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 8791 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 8792 break; \ 8793 case LU: \ 8794 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 8795 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 8796 break; \ 8797 case UL: \ 8798 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 8799 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 8800 break; \ 8801 case UU: \ 8802 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 8803 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 8804 break; \ 8805 default: \ 8806 ShouldNotReachHere(); \ 8807 } 8808 8809 StubCodeMark mark(this, stub_id); 8810 8811 __ mov(idx, 0); 8812 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 8813 8814 if (mode == LL) { 8815 __ sve_cntb(vec_len); 8816 } else { 8817 __ sve_cnth(vec_len); 8818 } 8819 8820 __ sub(rscratch1, cnt, vec_len); 8821 8822 __ bind(LOOP); 8823 8824 // main loop 8825 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 8826 __ add(idx, idx, vec_len); 8827 // Compare strings. 8828 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 8829 __ br(__ NE, MISMATCH); 8830 __ cmp(idx, rscratch1); 8831 __ br(__ LT, LOOP); 8832 8833 // post loop, last iteration 8834 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 8835 8836 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 8837 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 8838 __ br(__ EQ, DONE); 8839 8840 __ bind(MISMATCH); 8841 8842 // Crop the vector to find its location. 8843 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 8844 // Extract the first different characters of each string. 8845 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 8846 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 8847 8848 // Compute the difference of the first different characters. 
    __ sub(result, rscratch1, rscratch2);

    __ bind(DONE);
    __ ret(lr);
#undef LOAD_PAIR
    return entry;
  }

  void generate_compare_long_strings() {
    if (UseSVE == 0) {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_same_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_same_encoding(false);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_different_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_different_encoding(false);
    } else {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_sve(LL);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_sve(UU);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_sve(LU);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_sve(UL);
    }
  }

  // R0 = result
  // R1 = str2
  // R2 = cnt1
  // R3 = str1
  // R4 = cnt2
  // Clobbers: rscratch1, rscratch2, v0, v1, rflags
  //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  //    length >= 8) in order to skip the initial load (helps on systems with
  //    a single load pipeline)
  // 2) we can use a "fast" algorithm for finding a single character (the
  //    first pattern symbol) with fewer branches (one branch per loaded
  //    register instead of one branch per symbol); this is where constants
  //    like 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff
  //    come from
  // 3) after loading and analyzing the 1st register of the source string, it
  //    can be used to search for every occurrence of the 1st character,
  //    saving a few loads compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  //    re-uses/re-initializes/compresses register values, which makes the
  //    code larger and a bit less readable; however, most of the extra
  //    operations are issued during loads or branches, so the penalty is
  //    minimal
  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
    StubGenStubId stub_id;
    if (str1_isL) {
      if (str2_isL) {
        stub_id = StubGenStubId::string_indexof_linear_ll_id;
      } else {
        stub_id = StubGenStubId::string_indexof_linear_ul_id;
      }
    } else {
      if (str2_isL) {
        ShouldNotReachHere();
      } else {
        stub_id = StubGenStubId::string_indexof_linear_uu_id;
      }
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();

    int str1_chr_size = str1_isL ? 1 : 2;
    int str2_chr_size = str2_isL ? 1 : 2;
    int str1_chr_shift = str1_isL ? 0 : 1;
    int str2_chr_shift = str2_isL ?
0 : 1; 8923 bool isL = str1_isL && str2_isL; 8924 // parameters 8925 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 8926 // temporary registers 8927 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 8928 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 8929 // redefinitions 8930 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 8931 8932 __ push(spilled_regs, sp); 8933 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 8934 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 8935 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 8936 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 8937 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 8938 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 8939 // Read whole register from str1. It is safe, because length >=8 here 8940 __ ldr(ch1, Address(str1)); 8941 // Read whole register from str2. It is safe, because length >=8 here 8942 __ ldr(ch2, Address(str2)); 8943 __ sub(cnt2, cnt2, cnt1); 8944 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 8945 if (str1_isL != str2_isL) { 8946 __ eor(v0, __ T16B, v0, v0); 8947 } 8948 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 8949 __ mul(first, first, tmp1); 8950 // check if we have less than 1 register to check 8951 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 8952 if (str1_isL != str2_isL) { 8953 __ fmovd(v1, ch1); 8954 } 8955 __ br(__ LE, L_SMALL); 8956 __ eor(ch2, first, ch2); 8957 if (str1_isL != str2_isL) { 8958 __ zip1(v1, __ T16B, v1, v0); 8959 } 8960 __ sub(tmp2, ch2, tmp1); 8961 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8962 __ bics(tmp2, tmp2, ch2); 8963 if (str1_isL != str2_isL) { 8964 __ fmovd(ch1, v1); 8965 } 8966 __ br(__ NE, L_HAS_ZERO); 8967 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 8968 __ add(result, result, wordSize/str2_chr_size); 8969 __ add(str2, str2, wordSize); 8970 __ br(__ LT, L_POST_LOOP); 8971 __ BIND(L_LOOP); 8972 __ ldr(ch2, Address(str2)); 8973 __ eor(ch2, first, ch2); 8974 __ sub(tmp2, ch2, tmp1); 8975 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8976 __ bics(tmp2, tmp2, ch2); 8977 __ br(__ NE, L_HAS_ZERO); 8978 __ BIND(L_LOOP_PROCEED); 8979 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 8980 __ add(str2, str2, wordSize); 8981 __ add(result, result, wordSize/str2_chr_size); 8982 __ br(__ GE, L_LOOP); 8983 __ BIND(L_POST_LOOP); 8984 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 8985 __ br(__ LE, NOMATCH); 8986 __ ldr(ch2, Address(str2)); 8987 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 8988 __ eor(ch2, first, ch2); 8989 __ sub(tmp2, ch2, tmp1); 8990 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8991 __ mov(tmp4, -1); // all bits set 8992 __ b(L_SMALL_PROCEED); 8993 __ align(OptoLoopAlignment); 8994 __ BIND(L_SMALL); 8995 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 8996 __ eor(ch2, first, ch2); 8997 if (str1_isL != str2_isL) { 8998 __ zip1(v1, __ T16B, v1, v0); 8999 } 9000 __ sub(tmp2, ch2, tmp1); 9001 __ mov(tmp4, -1); // all bits set 9002 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9003 if (str1_isL != str2_isL) { 9004 __ fmovd(ch1, v1); // move converted 4 symbols 9005 } 9006 __ BIND(L_SMALL_PROCEED); 9007 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
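    // The bic/ands below complete the classic SWAR zero-byte test on the
    // eor'ed data (a sketch of the identity being used; the halfword
    // variants of the constants apply for UTF-16):
    //
    //   haszero(x) = (x - 0x0101..01) & ~(x | 0x7f7f..7f)
    //
    // Every byte of str2 that matched the first pattern character leaves a
    // set high bit in tmp2; tmp4 then masks off the lanes that lie beyond
    // the valid data.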
9008 __ bic(tmp2, tmp2, ch2); 9009 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 9010 __ rbit(tmp2, tmp2); 9011 __ br(__ EQ, NOMATCH); 9012 __ BIND(L_SMALL_HAS_ZERO_LOOP); 9013 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 9014 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 9015 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 9016 if (str2_isL) { // LL 9017 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9018 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9019 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9020 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9021 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9022 } else { 9023 __ mov(ch2, 0xE); // all bits in byte set except last one 9024 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9025 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9026 __ lslv(tmp2, tmp2, tmp4); 9027 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9028 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9029 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9030 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9031 } 9032 __ cmp(ch1, ch2); 9033 __ mov(tmp4, wordSize/str2_chr_size); 9034 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9035 __ BIND(L_SMALL_CMP_LOOP); 9036 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9037 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9038 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9039 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9040 __ add(tmp4, tmp4, 1); 9041 __ cmp(tmp4, cnt1); 9042 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 9043 __ cmp(first, ch2); 9044 __ br(__ EQ, L_SMALL_CMP_LOOP); 9045 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 9046 __ cbz(tmp2, NOMATCH); // no more matches. exit 9047 __ clz(tmp4, tmp2); 9048 __ add(result, result, 1); // advance index 9049 __ add(str2, str2, str2_chr_size); // advance pointer 9050 __ b(L_SMALL_HAS_ZERO_LOOP); 9051 __ align(OptoLoopAlignment); 9052 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 9053 __ cmp(first, ch2); 9054 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9055 __ b(DONE); 9056 __ align(OptoLoopAlignment); 9057 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 9058 if (str2_isL) { // LL 9059 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9060 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9061 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9062 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9063 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9064 } else { 9065 __ mov(ch2, 0xE); // all bits in byte set except last one 9066 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9067 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9068 __ lslv(tmp2, tmp2, tmp4); 9069 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9070 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9071 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9072 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9073 } 9074 __ cmp(ch1, ch2); 9075 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9076 __ b(DONE); 9077 __ align(OptoLoopAlignment); 9078 __ BIND(L_HAS_ZERO); 9079 __ rbit(tmp2, tmp2); 9080 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 9081 // Now, perform compression of counters(cnt2 and cnt1) into one register. 9082 // It's fine because both counters are 32bit and are not changed in this 9083 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 9084 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 9085 __ sub(result, result, 1); 9086 __ BIND(L_HAS_ZERO_LOOP); 9087 __ mov(cnt1, wordSize/str2_chr_size); 9088 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9089 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 9090 if (str2_isL) { 9091 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9092 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9093 __ lslv(tmp2, tmp2, tmp4); 9094 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9095 __ add(tmp4, tmp4, 1); 9096 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9097 __ lsl(tmp2, tmp2, 1); 9098 __ mov(tmp4, wordSize/str2_chr_size); 9099 } else { 9100 __ mov(ch2, 0xE); 9101 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9102 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9103 __ lslv(tmp2, tmp2, tmp4); 9104 __ add(tmp4, tmp4, 1); 9105 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9106 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9107 __ lsl(tmp2, tmp2, 1); 9108 __ mov(tmp4, wordSize/str2_chr_size); 9109 __ sub(str2, str2, str2_chr_size); 9110 } 9111 __ cmp(ch1, ch2); 9112 __ mov(tmp4, wordSize/str2_chr_size); 9113 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9114 __ BIND(L_CMP_LOOP); 9115 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9116 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9117 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9118 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9119 __ add(tmp4, tmp4, 1); 9120 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9121 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 9122 __ cmp(cnt1, ch2); 9123 __ br(__ EQ, L_CMP_LOOP); 9124 __ BIND(L_CMP_LOOP_NOMATCH); 9125 // here we're not matched 9126 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 9127 __ clz(tmp4, tmp2); 9128 __ add(str2, str2, str2_chr_size); // advance pointer 9129 __ b(L_HAS_ZERO_LOOP); 9130 __ align(OptoLoopAlignment); 9131 __ BIND(L_CMP_LOOP_LAST_CMP); 9132 __ cmp(cnt1, ch2); 9133 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9134 __ b(DONE); 9135 __ align(OptoLoopAlignment); 9136 __ BIND(L_CMP_LOOP_LAST_CMP2); 9137 if (str2_isL) { 9138 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9139 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9140 __ lslv(tmp2, tmp2, tmp4); 9141 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9142 __ add(tmp4, tmp4, 1); 9143 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9144 __ lsl(tmp2, tmp2, 1); 9145 } else { 9146 __ mov(ch2, 0xE); 9147 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9148 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
9149 __ lslv(tmp2, tmp2, tmp4); 9150 __ add(tmp4, tmp4, 1); 9151 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9152 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9153 __ lsl(tmp2, tmp2, 1); 9154 __ sub(str2, str2, str2_chr_size); 9155 } 9156 __ cmp(ch1, ch2); 9157 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9158 __ b(DONE); 9159 __ align(OptoLoopAlignment); 9160 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 9161 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 9162 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 9163 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 9164 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 9165 // result by analyzed characters value, so, we can just reset lower bits 9166 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 9167 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 9168 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 9169 // index of last analyzed substring inside current octet. So, str2 in at 9170 // respective start address. We need to advance it to next octet 9171 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 9172 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 9173 __ bfm(result, zr, 0, 2 - str2_chr_shift); 9174 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 9175 __ movw(cnt2, cnt2); 9176 __ b(L_LOOP_PROCEED); 9177 __ align(OptoLoopAlignment); 9178 __ BIND(NOMATCH); 9179 __ mov(result, -1); 9180 __ BIND(DONE); 9181 __ pop(spilled_regs, sp); 9182 __ ret(lr); 9183 return entry; 9184 } 9185 9186 void generate_string_indexof_stubs() { 9187 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 9188 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 9189 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 9190 } 9191 9192 void inflate_and_store_2_fp_registers(bool generatePrfm, 9193 FloatRegister src1, FloatRegister src2) { 9194 Register dst = r1; 9195 __ zip1(v1, __ T16B, src1, v0); 9196 __ zip2(v2, __ T16B, src1, v0); 9197 if (generatePrfm) { 9198 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 9199 } 9200 __ zip1(v3, __ T16B, src2, v0); 9201 __ zip2(v4, __ T16B, src2, v0); 9202 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 9203 } 9204 9205 // R0 = src 9206 // R1 = dst 9207 // R2 = len 9208 // R3 = len >> 3 9209 // V0 = 0 9210 // v1 = loaded 8 bytes 9211 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 9212 address generate_large_byte_array_inflate() { 9213 __ align(CodeEntryAlignment); 9214 StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id; 9215 StubCodeMark mark(this, stub_id); 9216 address entry = __ pc(); 9217 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 9218 Register src = r0, dst = r1, len = r2, octetCounter = r3; 9219 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 9220 9221 // do one more 8-byte read to have address 16-byte aligned in most cases 9222 // also use single store instruction 9223 __ ldrd(v2, __ post(src, 8)); 9224 __ sub(octetCounter, octetCounter, 2); 9225 __ zip1(v1, __ T16B, v1, v0); 9226 __ zip1(v2, __ T16B, v2, v0); 9227 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 9228 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9229 __ subs(rscratch1, octetCounter, large_loop_threshold); 9230 __ br(__ LE, LOOP_START); 9231 __ 
b(LOOP_PRFM_START); 9232 __ bind(LOOP_PRFM); 9233 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9234 __ bind(LOOP_PRFM_START); 9235 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 9236 __ sub(octetCounter, octetCounter, 8); 9237 __ subs(rscratch1, octetCounter, large_loop_threshold); 9238 inflate_and_store_2_fp_registers(true, v3, v4); 9239 inflate_and_store_2_fp_registers(true, v5, v6); 9240 __ br(__ GT, LOOP_PRFM); 9241 __ cmp(octetCounter, (u1)8); 9242 __ br(__ LT, DONE); 9243 __ bind(LOOP); 9244 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9245 __ bind(LOOP_START); 9246 __ sub(octetCounter, octetCounter, 8); 9247 __ cmp(octetCounter, (u1)8); 9248 inflate_and_store_2_fp_registers(false, v3, v4); 9249 inflate_and_store_2_fp_registers(false, v5, v6); 9250 __ br(__ GE, LOOP); 9251 __ bind(DONE); 9252 __ ret(lr); 9253 return entry; 9254 } 9255 9256 /** 9257 * Arguments: 9258 * 9259 * Input: 9260 * c_rarg0 - current state address 9261 * c_rarg1 - H key address 9262 * c_rarg2 - data address 9263 * c_rarg3 - number of blocks 9264 * 9265 * Output: 9266 * Updated state at c_rarg0 9267 */ 9268 address generate_ghash_processBlocks() { 9269 // Bafflingly, GCM uses little-endian for the byte order, but 9270 // big-endian for the bit order. For example, the polynomial 1 is 9271 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 9272 // 9273 // So, we must either reverse the bytes in each word and do 9274 // everything big-endian or reverse the bits in each byte and do 9275 // it little-endian. On AArch64 it's more idiomatic to reverse 9276 // the bits in each byte (we have an instruction, RBIT, to do 9277 // that) and keep the data in little-endian bit order through the 9278 // calculation, bit-reversing the inputs and outputs. 9279 9280 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id; 9281 StubCodeMark mark(this, stub_id); 9282 __ align(wordSize * 2); 9283 address p = __ pc(); 9284 __ emit_int64(0x87); // The low-order bits of the field 9285 // polynomial (i.e. 
p = z^7+z^2+z+1) 9286 // repeated in the low and high parts of a 9287 // 128-bit vector 9288 __ emit_int64(0x87); 9289 9290 __ align(CodeEntryAlignment); 9291 address start = __ pc(); 9292 9293 Register state = c_rarg0; 9294 Register subkeyH = c_rarg1; 9295 Register data = c_rarg2; 9296 Register blocks = c_rarg3; 9297 9298 FloatRegister vzr = v30; 9299 __ eor(vzr, __ T16B, vzr, vzr); // zero register 9300 9301 __ ldrq(v24, p); // The field polynomial 9302 9303 __ ldrq(v0, Address(state)); 9304 __ ldrq(v1, Address(subkeyH)); 9305 9306 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 9307 __ rbit(v0, __ T16B, v0); 9308 __ rev64(v1, __ T16B, v1); 9309 __ rbit(v1, __ T16B, v1); 9310 9311 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 9312 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 9313 9314 { 9315 Label L_ghash_loop; 9316 __ bind(L_ghash_loop); 9317 9318 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 9319 // reversing each byte 9320 __ rbit(v2, __ T16B, v2); 9321 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 9322 9323 // Multiply state in v2 by subkey in v1 9324 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 9325 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 9326 /*temps*/v6, v3, /*reuse/clobber b*/v2); 9327 // Reduce v7:v5 by the field polynomial 9328 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 9329 9330 __ sub(blocks, blocks, 1); 9331 __ cbnz(blocks, L_ghash_loop); 9332 } 9333 9334 // The bit-reversed result is at this point in v0 9335 __ rev64(v0, __ T16B, v0); 9336 __ rbit(v0, __ T16B, v0); 9337 9338 __ st1(v0, __ T16B, state); 9339 __ ret(lr); 9340 9341 return start; 9342 } 9343 9344 address generate_ghash_processBlocks_wide() { 9345 address small = generate_ghash_processBlocks(); 9346 9347 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id; 9348 StubCodeMark mark(this, stub_id); 9349 __ align(wordSize * 2); 9350 address p = __ pc(); 9351 __ emit_int64(0x87); // The low-order bits of the field 9352 // polynomial (i.e. p = z^7+z^2+z+1) 9353 // repeated in the low and high parts of a 9354 // 128-bit vector 9355 __ emit_int64(0x87); 9356 9357 __ align(CodeEntryAlignment); 9358 address start = __ pc(); 9359 9360 Register state = c_rarg0; 9361 Register subkeyH = c_rarg1; 9362 Register data = c_rarg2; 9363 Register blocks = c_rarg3; 9364 9365 const int unroll = 4; 9366 9367 __ cmp(blocks, (unsigned char)(unroll * 2)); 9368 __ br(__ LT, small); 9369 9370 if (unroll > 1) { 9371 // Save state before entering routine 9372 __ sub(sp, sp, 4 * 16); 9373 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 9374 __ sub(sp, sp, 4 * 16); 9375 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 9376 } 9377 9378 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 9379 9380 if (unroll > 1) { 9381 // And restore state 9382 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 9383 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 9384 } 9385 9386 __ cmp(blocks, (unsigned char)0); 9387 __ br(__ GT, small); 9388 9389 __ ret(lr); 9390 9391 return start; 9392 } 9393 9394 void generate_base64_encode_simdround(Register src, Register dst, 9395 FloatRegister codec, u8 size) { 9396 9397 FloatRegister in0 = v4, in1 = v5, in2 = v6; 9398 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 9399 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 9400 9401 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 9402 9403 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 9404 9405 __ ushr(ind0, arrangement, in0, 2); 9406 9407 __ ushr(ind1, arrangement, in1, 2); 9408 __ shl(in0, arrangement, in0, 6); 9409 __ orr(ind1, arrangement, ind1, in0); 9410 __ ushr(ind1, arrangement, ind1, 2); 9411 9412 __ ushr(ind2, arrangement, in2, 4); 9413 __ shl(in1, arrangement, in1, 4); 9414 __ orr(ind2, arrangement, in1, ind2); 9415 __ ushr(ind2, arrangement, ind2, 2); 9416 9417 __ shl(ind3, arrangement, in2, 2); 9418 __ ushr(ind3, arrangement, ind3, 2); 9419 9420 __ tbl(out0, arrangement, codec, 4, ind0); 9421 __ tbl(out1, arrangement, codec, 4, ind1); 9422 __ tbl(out2, arrangement, codec, 4, ind2); 9423 __ tbl(out3, arrangement, codec, 4, ind3); 9424 9425 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 9426 } 9427 9428 /** 9429 * Arguments: 9430 * 9431 * Input: 9432 * c_rarg0 - src_start 9433 * c_rarg1 - src_offset 9434 * c_rarg2 - src_length 9435 * c_rarg3 - dest_start 9436 * c_rarg4 - dest_offset 9437 * c_rarg5 - isURL 9438 * 9439 */ 9440 address generate_base64_encodeBlock() { 9441 9442 static const char toBase64[64] = { 9443 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9444 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9445 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9446 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9447 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 9448 }; 9449 9450 static const char toBase64URL[64] = { 9451 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9452 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9453 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9454 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9455 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 9456 }; 9457 9458 __ align(CodeEntryAlignment); 9459 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 9460 StubCodeMark mark(this, stub_id); 9461 address start = __ pc(); 9462 9463 Register src = c_rarg0; // source array 9464 Register soff = c_rarg1; // source start offset 9465 Register send = c_rarg2; // source end offset 9466 Register dst = c_rarg3; // dest array 9467 Register doff = c_rarg4; // position for writing to dest array 9468 Register isURL = c_rarg5; // Base64 or URL character set 9469 9470 // c_rarg6 and c_rarg7 are free to use as temps 9471 Register codec = c_rarg6; 9472 Register length = c_rarg7; 9473 9474 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 9475 9476 __ add(src, src, soff); 9477 __ add(dst, dst, doff); 9478 __ sub(length, send, soff); 9479 9480 // load the codec base address 9481 __ lea(codec, ExternalAddress((address) toBase64)); 9482 __ cbz(isURL, ProcessData); 9483 __ lea(codec, ExternalAddress((address) toBase64URL)); 9484 9485 __ BIND(ProcessData); 9486 9487 // too short to formup a SIMD loop, roll back 9488 __ cmp(length, (u1)24); 9489 __ br(Assembler::LT, Process3B); 9490 9491 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 9492 9493 __ BIND(Process48B); 9494 __ cmp(length, (u1)48); 9495 __ br(Assembler::LT, Process24B); 9496 generate_base64_encode_simdround(src, dst, v0, 16); 9497 __ sub(length, length, 48); 9498 __ b(Process48B); 9499 9500 __ BIND(Process24B); 9501 __ cmp(length, (u1)24); 9502 __ br(Assembler::LT, SIMDExit); 9503 generate_base64_encode_simdround(src, dst, v0, 8); 9504 __ sub(length, length, 24); 9505 9506 __ BIND(SIMDExit); 9507 
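    // Scalar tail: whatever the SIMD rounds above did not consume is encoded
    // three bytes at a time.  A sketch of one iteration of the Process3B
    // loop below (illustration only):
    //
    //   uint32_t v = (b0 << 16) | (b1 << 8) | b2;
    //   *dst++ = codec[(v >> 18) & 0x3f];
    //   *dst++ = codec[(v >> 12) & 0x3f];
    //   *dst++ = codec[(v >>  6) & 0x3f];
    //   *dst++ = codec[ v        & 0x3f];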
__ cbz(length, Exit); 9508 9509 __ BIND(Process3B); 9510 // 3 src bytes, 24 bits 9511 __ ldrb(r10, __ post(src, 1)); 9512 __ ldrb(r11, __ post(src, 1)); 9513 __ ldrb(r12, __ post(src, 1)); 9514 __ orrw(r11, r11, r10, Assembler::LSL, 8); 9515 __ orrw(r12, r12, r11, Assembler::LSL, 8); 9516 // codec index 9517 __ ubfmw(r15, r12, 18, 23); 9518 __ ubfmw(r14, r12, 12, 17); 9519 __ ubfmw(r13, r12, 6, 11); 9520 __ andw(r12, r12, 63); 9521 // get the code based on the codec 9522 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 9523 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 9524 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 9525 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 9526 __ strb(r15, __ post(dst, 1)); 9527 __ strb(r14, __ post(dst, 1)); 9528 __ strb(r13, __ post(dst, 1)); 9529 __ strb(r12, __ post(dst, 1)); 9530 __ sub(length, length, 3); 9531 __ cbnz(length, Process3B); 9532 9533 __ BIND(Exit); 9534 __ ret(lr); 9535 9536 return start; 9537 } 9538 9539 void generate_base64_decode_simdround(Register src, Register dst, 9540 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 9541 9542 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 9543 FloatRegister out0 = v20, out1 = v21, out2 = v22; 9544 9545 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 9546 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 9547 9548 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 9549 9550 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 9551 9552 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 9553 9554 // we need unsigned saturating subtract, to make sure all input values 9555 // in range [0, 63] will have 0U value in the higher half lookup 9556 __ uqsubv(decH0, __ T16B, in0, v27); 9557 __ uqsubv(decH1, __ T16B, in1, v27); 9558 __ uqsubv(decH2, __ T16B, in2, v27); 9559 __ uqsubv(decH3, __ T16B, in3, v27); 9560 9561 // lower half lookup 9562 __ tbl(decL0, arrangement, codecL, 4, in0); 9563 __ tbl(decL1, arrangement, codecL, 4, in1); 9564 __ tbl(decL2, arrangement, codecL, 4, in2); 9565 __ tbl(decL3, arrangement, codecL, 4, in3); 9566 9567 // higher half lookup 9568 __ tbx(decH0, arrangement, codecH, 4, decH0); 9569 __ tbx(decH1, arrangement, codecH, 4, decH1); 9570 __ tbx(decH2, arrangement, codecH, 4, decH2); 9571 __ tbx(decH3, arrangement, codecH, 4, decH3); 9572 9573 // combine lower and higher 9574 __ orr(decL0, arrangement, decL0, decH0); 9575 __ orr(decL1, arrangement, decL1, decH1); 9576 __ orr(decL2, arrangement, decL2, decH2); 9577 __ orr(decL3, arrangement, decL3, decH3); 9578 9579 // check illegal inputs, value larger than 63 (maximum of 6 bits) 9580 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 9581 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 9582 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 9583 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 9584 __ orr(in0, arrangement, decH0, decH1); 9585 __ orr(in1, arrangement, decH2, decH3); 9586 __ orr(in2, arrangement, in0, in1); 9587 __ umaxv(in3, arrangement, in2); 9588 __ umov(rscratch2, in3, __ B, 0); 9589 9590 // get the data to output 9591 __ shl(out0, arrangement, decL0, 2); 9592 __ ushr(out1, arrangement, decL1, 4); 9593 __ orr(out0, arrangement, out0, out1); 9594 __ shl(out1, arrangement, decL1, 4); 9595 __ ushr(out2, arrangement, decL2, 2); 9596 __ orr(out1, arrangement, out1, out2); 9597 __ shl(out2, arrangement, decL2, 6); 9598 __ orr(out2, arrangement, out2, decL3); 9599 9600 __ 
cbz(rscratch2, NoIllegalData); 9601 9602 // handle illegal input 9603 __ umov(r10, in2, __ D, 0); 9604 if (size == 16) { 9605 __ cbnz(r10, ErrorInLowerHalf); 9606 9607 // illegal input is in higher half, store the lower half now. 9608 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 9609 9610 __ umov(r10, in2, __ D, 1); 9611 __ umov(r11, out0, __ D, 1); 9612 __ umov(r12, out1, __ D, 1); 9613 __ umov(r13, out2, __ D, 1); 9614 __ b(StoreLegalData); 9615 9616 __ BIND(ErrorInLowerHalf); 9617 } 9618 __ umov(r11, out0, __ D, 0); 9619 __ umov(r12, out1, __ D, 0); 9620 __ umov(r13, out2, __ D, 0); 9621 9622 __ BIND(StoreLegalData); 9623 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 9624 __ strb(r11, __ post(dst, 1)); 9625 __ strb(r12, __ post(dst, 1)); 9626 __ strb(r13, __ post(dst, 1)); 9627 __ lsr(r10, r10, 8); 9628 __ lsr(r11, r11, 8); 9629 __ lsr(r12, r12, 8); 9630 __ lsr(r13, r13, 8); 9631 __ b(StoreLegalData); 9632 9633 __ BIND(NoIllegalData); 9634 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 9635 } 9636 9637 9638 /** 9639 * Arguments: 9640 * 9641 * Input: 9642 * c_rarg0 - src_start 9643 * c_rarg1 - src_offset 9644 * c_rarg2 - src_length 9645 * c_rarg3 - dest_start 9646 * c_rarg4 - dest_offset 9647 * c_rarg5 - isURL 9648 * c_rarg6 - isMIME 9649 * 9650 */ 9651 address generate_base64_decodeBlock() { 9652 9653 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 9654 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 9655 // titled "Base64 decoding". 9656 9657 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 9658 // except the trailing character '=' is also treated illegal value in this intrinsic. That 9659 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
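    // As a consequence, the first '=' (or any other byte that maps to 255u)
    // makes the stub take the early Exit path: it returns the number of
    // bytes already written (dst - doff), and the Java-level decoder can
    // then deal with padding and malformed input.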
9660 static const uint8_t fromBase64ForNoSIMD[256] = { 9661 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9662 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9663 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 9664 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9665 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 9666 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 9667 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 9668 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 9669 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9670 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9671 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9672 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9673 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9674 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9675 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9676 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9677 }; 9678 9679 static const uint8_t fromBase64URLForNoSIMD[256] = { 9680 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9681 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9682 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 9683 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9684 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 9685 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 9686 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 9687 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 9688 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9689 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9690 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9691 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9692 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9693 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9694 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9695 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9696 }; 9697 9698 // A legal value of base64 code is in range [0, 127]. We need two lookups 9699 // with tbl/tbx and combine them to get the decode data. The 1st table vector 9700 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 9701 // table vector lookup use tbx, out of range indices are unchanged in 9702 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 9703 // The value of index 64 is set to 0, so that we know that we already get the 9704 // decoded data with the 1st lookup. 9705 static const uint8_t fromBase64ForSIMD[128] = { 9706 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9707 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9708 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 9709 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9710 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 9711 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 9712 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 9713 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 9714 }; 9715 9716 static const uint8_t fromBase64URLForSIMD[128] = { 9717 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9718 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9719 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 9720 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9721 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 9722 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 9723 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 9724 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 9725 }; 9726 9727 __ align(CodeEntryAlignment); 9728 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 9729 StubCodeMark mark(this, stub_id); 9730 address start = __ pc(); 9731 9732 Register src = c_rarg0; // source array 9733 Register soff = c_rarg1; // source start offset 9734 Register send = c_rarg2; // source end offset 9735 Register dst = c_rarg3; // dest array 9736 Register doff = c_rarg4; // position for writing to dest array 9737 Register isURL = c_rarg5; // Base64 or URL character set 9738 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 9739 9740 Register length = send; // reuse send as length of source data to process 9741 9742 Register simd_codec = c_rarg6; 9743 Register nosimd_codec = c_rarg7; 9744 9745 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 9746 9747 __ enter(); 9748 9749 __ add(src, src, soff); 9750 __ add(dst, dst, doff); 9751 9752 __ mov(doff, dst); 9753 9754 __ sub(length, send, soff); 9755 __ bfm(length, zr, 0, 1); 9756 9757 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 9758 __ cbz(isURL, ProcessData); 9759 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 9760 9761 __ BIND(ProcessData); 9762 __ mov(rscratch1, length); 9763 __ cmp(length, (u1)144); // 144 = 80 + 64 9764 __ br(Assembler::LT, Process4B); 9765 9766 // In the MIME case, the line length cannot be more than 76 9767 // bytes (see RFC 2045). This is too short a block for SIMD 9768 // to be worthwhile, so we use non-SIMD here. 
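    // With 79 in rscratch1 the Process4B loop below runs exactly 20 times,
    // i.e. it pre-processes 80 bytes without SIMD and exits with
    // rscratch1 == -1 (see the comment after the loop).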
9769 __ movw(rscratch1, 79); 9770 9771 __ BIND(Process4B); 9772 __ ldrw(r14, __ post(src, 4)); 9773 __ ubfxw(r10, r14, 0, 8); 9774 __ ubfxw(r11, r14, 8, 8); 9775 __ ubfxw(r12, r14, 16, 8); 9776 __ ubfxw(r13, r14, 24, 8); 9777 // get the de-code 9778 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 9779 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 9780 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 9781 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 9782 // error detection, 255u indicates an illegal input 9783 __ orrw(r14, r10, r11); 9784 __ orrw(r15, r12, r13); 9785 __ orrw(r14, r14, r15); 9786 __ tbnz(r14, 7, Exit); 9787 // recover the data 9788 __ lslw(r14, r10, 10); 9789 __ bfiw(r14, r11, 4, 6); 9790 __ bfmw(r14, r12, 2, 5); 9791 __ rev16w(r14, r14); 9792 __ bfiw(r13, r12, 6, 2); 9793 __ strh(r14, __ post(dst, 2)); 9794 __ strb(r13, __ post(dst, 1)); 9795 // non-simd loop 9796 __ subsw(rscratch1, rscratch1, 4); 9797 __ br(Assembler::GT, Process4B); 9798 9799 // if exiting from PreProcess80B, rscratch1 == -1; 9800 // otherwise, rscratch1 == 0. 9801 __ cbzw(rscratch1, Exit); 9802 __ sub(length, length, 80); 9803 9804 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 9805 __ cbz(isURL, SIMDEnter); 9806 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 9807 9808 __ BIND(SIMDEnter); 9809 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 9810 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 9811 __ mov(rscratch1, 63); 9812 __ dup(v27, __ T16B, rscratch1); 9813 9814 __ BIND(Process64B); 9815 __ cmp(length, (u1)64); 9816 __ br(Assembler::LT, Process32B); 9817 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 9818 __ sub(length, length, 64); 9819 __ b(Process64B); 9820 9821 __ BIND(Process32B); 9822 __ cmp(length, (u1)32); 9823 __ br(Assembler::LT, SIMDExit); 9824 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 9825 __ sub(length, length, 32); 9826 __ b(Process32B); 9827 9828 __ BIND(SIMDExit); 9829 __ cbz(length, Exit); 9830 __ movw(rscratch1, length); 9831 __ b(Process4B); 9832 9833 __ BIND(Exit); 9834 __ sub(c_rarg0, dst, doff); 9835 9836 __ leave(); 9837 __ ret(lr); 9838 9839 return start; 9840 } 9841 9842 // Support for spin waits. 9843 address generate_spin_wait() { 9844 __ align(CodeEntryAlignment); 9845 StubGenStubId stub_id = StubGenStubId::spin_wait_id; 9846 StubCodeMark mark(this, stub_id); 9847 address start = __ pc(); 9848 9849 __ spin_wait(); 9850 __ ret(lr); 9851 9852 return start; 9853 } 9854 9855 void generate_lookup_secondary_supers_table_stub() { 9856 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 9857 StubCodeMark mark(this, stub_id); 9858 9859 const Register 9860 r_super_klass = r0, 9861 r_array_base = r1, 9862 r_array_length = r2, 9863 r_array_index = r3, 9864 r_sub_klass = r4, 9865 r_bitmap = rscratch2, 9866 result = r5; 9867 const FloatRegister 9868 vtemp = v0; 9869 9870 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 9871 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 9872 Label L_success; 9873 __ enter(); 9874 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 9875 r_array_base, r_array_length, r_array_index, 9876 vtemp, result, slot, 9877 /*stub_is_near*/true); 9878 __ leave(); 9879 __ ret(lr); 9880 } 9881 } 9882 9883 // Slow path implementation for UseSecondarySupersTable. 
9884 address generate_lookup_secondary_supers_table_slow_path_stub() { 9885 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 9886 StubCodeMark mark(this, stub_id); 9887 9888 address start = __ pc(); 9889 const Register 9890 r_super_klass = r0, // argument 9891 r_array_base = r1, // argument 9892 temp1 = r2, // temp 9893 r_array_index = r3, // argument 9894 r_bitmap = rscratch2, // argument 9895 result = r5; // argument 9896 9897 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 9898 __ ret(lr); 9899 9900 return start; 9901 } 9902 9903 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 9904 9905 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 9906 // 9907 // If LSE is in use, generate LSE versions of all the stubs. The 9908 // non-LSE versions are in atomic_aarch64.S. 9909 9910 // class AtomicStubMark records the entry point of a stub and the 9911 // stub pointer which will point to it. The stub pointer is set to 9912 // the entry point when ~AtomicStubMark() is called, which must be 9913 // after ICache::invalidate_range. This ensures safe publication of 9914 // the generated code. 9915 class AtomicStubMark { 9916 address _entry_point; 9917 aarch64_atomic_stub_t *_stub; 9918 MacroAssembler *_masm; 9919 public: 9920 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 9921 _masm = masm; 9922 __ align(32); 9923 _entry_point = __ pc(); 9924 _stub = stub; 9925 } 9926 ~AtomicStubMark() { 9927 *_stub = (aarch64_atomic_stub_t)_entry_point; 9928 } 9929 }; 9930 9931 // NB: For memory_order_conservative we need a trailing membar after 9932 // LSE atomic operations but not a leading membar. 9933 // 9934 // We don't need a leading membar because a clause in the Arm ARM 9935 // says: 9936 // 9937 // Barrier-ordered-before 9938 // 9939 // Barrier instructions order prior Memory effects before subsequent 9940 // Memory effects generated by the same Observer. A read or a write 9941 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 9942 // Observer if and only if RW1 appears in program order before RW 2 9943 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 9944 // instruction with both Acquire and Release semantics. 9945 // 9946 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 9947 // and Release semantics, therefore we don't need a leading 9948 // barrier. However, there is no corresponding Barrier-ordered-after 9949 // relationship, therefore we need a trailing membar to prevent a 9950 // later store or load from being reordered with the store in an 9951 // atomic instruction. 9952 // 9953 // This was checked by using the herd7 consistency model simulator 9954 // (http://diy.inria.fr/) with this test case: 9955 // 9956 // AArch64 LseCas 9957 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 9958 // P0 | P1; 9959 // LDR W4, [X2] | MOV W3, #0; 9960 // DMB LD | MOV W4, #1; 9961 // LDR W3, [X1] | CASAL W3, W4, [X1]; 9962 // | DMB ISH; 9963 // | STR W4, [X2]; 9964 // exists 9965 // (0:X3=0 /\ 0:X4=1) 9966 // 9967 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 9968 // with the store to x in P1. Without the DMB in P1 this may happen. 9969 // 9970 // At the time of writing we don't know of any AArch64 hardware that 9971 // reorders stores in this way, but the Reference Manual permits it. 
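  // Viewed through the aarch64_atomic_stub_t signature used at the end of this
  // file, the conservative 8-byte CAS stub generated below behaves roughly like
  // this C++ sketch (intent only, not the exact instruction sequence):
  //
  //   uint64_t cmpxchg_8_conservative(volatile void *ptr,
  //                                   uint64_t compare_val, uint64_t exchange_val) {
  //     uint64_t prev = compare_val;
  //     // casal: single LSE instruction with acquire+release semantics
  //     __atomic_compare_exchange_n((volatile uint64_t*)ptr, &prev, exchange_val,
  //                                 false, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
  //     __atomic_thread_fence(__ATOMIC_SEQ_CST); // trailing membar discussed above
  //     return prev;                             // old value, returned in r0
  //   }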
9972 9973 void gen_cas_entry(Assembler::operand_size size, 9974 atomic_memory_order order) { 9975 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 9976 exchange_val = c_rarg2; 9977 bool acquire, release; 9978 switch (order) { 9979 case memory_order_relaxed: 9980 acquire = false; 9981 release = false; 9982 break; 9983 case memory_order_release: 9984 acquire = false; 9985 release = true; 9986 break; 9987 default: 9988 acquire = true; 9989 release = true; 9990 break; 9991 } 9992 __ mov(prev, compare_val); 9993 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 9994 if (order == memory_order_conservative) { 9995 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 9996 } 9997 if (size == Assembler::xword) { 9998 __ mov(r0, prev); 9999 } else { 10000 __ movw(r0, prev); 10001 } 10002 __ ret(lr); 10003 } 10004 10005 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 10006 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10007 // If not relaxed, then default to conservative. Relaxed is the only 10008 // case we use enough to be worth specializing. 10009 if (order == memory_order_relaxed) { 10010 __ ldadd(size, incr, prev, addr); 10011 } else { 10012 __ ldaddal(size, incr, prev, addr); 10013 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10014 } 10015 if (size == Assembler::xword) { 10016 __ mov(r0, prev); 10017 } else { 10018 __ movw(r0, prev); 10019 } 10020 __ ret(lr); 10021 } 10022 10023 void gen_swpal_entry(Assembler::operand_size size) { 10024 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10025 __ swpal(size, incr, prev, addr); 10026 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10027 if (size == Assembler::xword) { 10028 __ mov(r0, prev); 10029 } else { 10030 __ movw(r0, prev); 10031 } 10032 __ ret(lr); 10033 } 10034 10035 void generate_atomic_entry_points() { 10036 if (! 
UseLSE) { 10037 return; 10038 } 10039 __ align(CodeEntryAlignment); 10040 StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id; 10041 StubCodeMark mark(this, stub_id); 10042 address first_entry = __ pc(); 10043 10044 // ADD, memory_order_conservative 10045 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 10046 gen_ldadd_entry(Assembler::word, memory_order_conservative); 10047 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 10048 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 10049 10050 // ADD, memory_order_relaxed 10051 AtomicStubMark mark_fetch_add_4_relaxed 10052 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 10053 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 10054 AtomicStubMark mark_fetch_add_8_relaxed 10055 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 10056 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 10057 10058 // XCHG, memory_order_conservative 10059 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 10060 gen_swpal_entry(Assembler::word); 10061 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 10062 gen_swpal_entry(Assembler::xword); 10063 10064 // CAS, memory_order_conservative 10065 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 10066 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 10067 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 10068 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 10069 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 10070 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 10071 10072 // CAS, memory_order_relaxed 10073 AtomicStubMark mark_cmpxchg_1_relaxed 10074 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 10075 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 10076 AtomicStubMark mark_cmpxchg_4_relaxed 10077 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 10078 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 10079 AtomicStubMark mark_cmpxchg_8_relaxed 10080 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 10081 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 10082 10083 AtomicStubMark mark_cmpxchg_4_release 10084 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 10085 gen_cas_entry(MacroAssembler::word, memory_order_release); 10086 AtomicStubMark mark_cmpxchg_8_release 10087 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 10088 gen_cas_entry(MacroAssembler::xword, memory_order_release); 10089 10090 AtomicStubMark mark_cmpxchg_4_seq_cst 10091 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 10092 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 10093 AtomicStubMark mark_cmpxchg_8_seq_cst 10094 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 10095 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 10096 10097 ICache::invalidate_range(first_entry, __ pc() - first_entry); 10098 } 10099 #endif // LINUX 10100 10101 address generate_cont_thaw(Continuation::thaw_kind kind) { 10102 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 10103 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 10104 10105 address start = __ pc(); 10106 10107 if (return_barrier) { 10108 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 10109 __ mov(sp, rscratch1); 10110 } 10111 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), 
Assembler::EQ, "incorrect sp"); 10112 10113 if (return_barrier) { 10114 // preserve possible return value from a method returning to the return barrier 10115 __ fmovd(rscratch1, v0); 10116 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10117 } 10118 10119 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 10120 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 10121 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 10122 10123 if (return_barrier) { 10124 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10125 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10126 __ fmovd(v0, rscratch1); 10127 } 10128 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10129 10130 10131 Label thaw_success; 10132 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 10133 __ cbnz(rscratch2, thaw_success); 10134 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 10135 __ br(rscratch1); 10136 __ bind(thaw_success); 10137 10138 // make room for the thawed frames 10139 __ sub(rscratch1, sp, rscratch2); 10140 __ andr(rscratch1, rscratch1, -16); // align 10141 __ mov(sp, rscratch1); 10142 10143 if (return_barrier) { 10144 // save original return value -- again 10145 __ fmovd(rscratch1, v0); 10146 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10147 } 10148 10149 // If we want, we can templatize thaw by kind, and have three different entries 10150 __ movw(c_rarg1, (uint32_t)kind); 10151 10152 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 10153 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 10154 10155 if (return_barrier) { 10156 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10157 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10158 __ fmovd(v0, rscratch1); 10159 } else { 10160 __ mov(r0, zr); // return 0 (success) from doYield 10161 } 10162 10163 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 10164 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 10165 __ mov(rfp, sp); 10166 10167 if (return_barrier_exception) { 10168 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 10169 __ authenticate_return_address(c_rarg1); 10170 __ verify_oop(r0); 10171 // save return value containing the exception oop in callee-saved R19 10172 __ mov(r19, r0); 10173 10174 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 10175 10176 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
10177 // __ reinitialize_ptrue(); 10178 10179 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 10180 10181 __ mov(r1, r0); // the exception handler 10182 __ mov(r0, r19); // restore return value containing the exception oop 10183 __ verify_oop(r0); 10184 10185 __ leave(); 10186 __ mov(r3, lr); 10187 __ br(r1); // the exception handler 10188 } else { 10189 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 10190 __ leave(); 10191 __ ret(lr); 10192 } 10193 10194 return start; 10195 } 10196 10197 address generate_cont_thaw() { 10198 if (!Continuations::enabled()) return nullptr; 10199 10200 StubGenStubId stub_id = StubGenStubId::cont_thaw_id; 10201 StubCodeMark mark(this, stub_id); 10202 address start = __ pc(); 10203 generate_cont_thaw(Continuation::thaw_top); 10204 return start; 10205 } 10206 10207 address generate_cont_returnBarrier() { 10208 if (!Continuations::enabled()) return nullptr; 10209 10210 // TODO: will probably need multiple return barriers depending on return type 10211 StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id; 10212 StubCodeMark mark(this, stub_id); 10213 address start = __ pc(); 10214 10215 generate_cont_thaw(Continuation::thaw_return_barrier); 10216 10217 return start; 10218 } 10219 10220 address generate_cont_returnBarrier_exception() { 10221 if (!Continuations::enabled()) return nullptr; 10222 10223 StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id; 10224 StubCodeMark mark(this, stub_id); 10225 address start = __ pc(); 10226 10227 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 10228 10229 return start; 10230 } 10231 10232 address generate_cont_preempt_stub() { 10233 if (!Continuations::enabled()) return nullptr; 10234 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 10235 StubCodeMark mark(this, stub_id); 10236 address start = __ pc(); 10237 10238 __ reset_last_Java_frame(true); 10239 10240 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 10241 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 10242 __ mov(sp, rscratch2); 10243 10244 Label preemption_cancelled; 10245 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 10246 __ cbnz(rscratch1, preemption_cancelled); 10247 10248 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 10249 SharedRuntime::continuation_enter_cleanup(_masm); 10250 __ leave(); 10251 __ ret(lr); 10252 10253 // We acquired the monitor after freezing the frames so call thaw to continue execution. 10254 __ bind(preemption_cancelled); 10255 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 10256 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 10257 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 10258 __ ldr(rscratch1, Address(rscratch1)); 10259 __ br(rscratch1); 10260 10261 return start; 10262 } 10263 10264 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 10265 // are represented as long[5], with BITS_PER_LIMB = 26. 10266 // Pack five 26-bit limbs into three 64-bit registers. 
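  // A C sketch of this packing (limbs[] is the long[5] representation used by
  // the Java code; each element holds one 26-bit limb):
  //
  //   void pack_26(uint64_t *d0, uint64_t *d1, uint64_t *d2, const uint64_t limbs[5]) {
  //     *d0 = limbs[0] | (limbs[1] << 26) | (limbs[2] << 52);           // bits 0..63
  //     *d1 = (limbs[2] >> 12) | (limbs[3] << 14) | (limbs[4] << 40);   // bits 64..127
  //     *d2 = limbs[4] >> 24;                                           // bits 128..129
  //   }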
10267 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 10268 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 10269 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 10270 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 10271 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 10272 10273 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 10274 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 10275 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 10276 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 10277 10278 if (dest2->is_valid()) { 10279 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 10280 } else { 10281 #ifdef ASSERT 10282 Label OK; 10283 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 10284 __ br(__ EQ, OK); 10285 __ stop("high bits of Poly1305 integer should be zero"); 10286 __ should_not_reach_here(); 10287 __ bind(OK); 10288 #endif 10289 } 10290 } 10291 10292 // As above, but return only a 128-bit integer, packed into two 10293 // 64-bit registers. 10294 void pack_26(Register dest0, Register dest1, Register src) { 10295 pack_26(dest0, dest1, noreg, src); 10296 } 10297 10298 // Multiply and multiply-accumulate unsigned 64-bit registers. 10299 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 10300 __ mul(prod_lo, n, m); 10301 __ umulh(prod_hi, n, m); 10302 } 10303 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 10304 wide_mul(rscratch1, rscratch2, n, m); 10305 __ adds(sum_lo, sum_lo, rscratch1); 10306 __ adc(sum_hi, sum_hi, rscratch2); 10307 } 10308 10309 // Poly1305, RFC 7539 10310 10311 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 10312 // description of the tricks used to simplify and accelerate this 10313 // computation. 10314 10315 address generate_poly1305_processBlocks() { 10316 __ align(CodeEntryAlignment); 10317 StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id; 10318 StubCodeMark mark(this, stub_id); 10319 address start = __ pc(); 10320 Label here; 10321 __ enter(); 10322 RegSet callee_saved = RegSet::range(r19, r28); 10323 __ push(callee_saved, sp); 10324 10325 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 10326 10327 // Arguments 10328 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 10329 10330 // R_n is the 128-bit randomly-generated key, packed into two 10331 // registers. The caller passes this key to us as long[5], with 10332 // BITS_PER_LIMB = 26. 
10333 const Register R_0 = *++regs, R_1 = *++regs; 10334 pack_26(R_0, R_1, r_start); 10335 10336 // RR_n is (R_n >> 2) * 5 10337 const Register RR_0 = *++regs, RR_1 = *++regs; 10338 __ lsr(RR_0, R_0, 2); 10339 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 10340 __ lsr(RR_1, R_1, 2); 10341 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 10342 10343 // U_n is the current checksum 10344 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 10345 pack_26(U_0, U_1, U_2, acc_start); 10346 10347 static constexpr int BLOCK_LENGTH = 16; 10348 Label DONE, LOOP; 10349 10350 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10351 __ br(Assembler::LT, DONE); { 10352 __ bind(LOOP); 10353 10354 // S_n is to be the sum of U_n and the next block of data 10355 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 10356 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 10357 __ adds(S_0, U_0, S_0); 10358 __ adcs(S_1, U_1, S_1); 10359 __ adc(S_2, U_2, zr); 10360 __ add(S_2, S_2, 1); 10361 10362 const Register U_0HI = *++regs, U_1HI = *++regs; 10363 10364 // NB: this logic depends on some of the special properties of 10365 // Poly1305 keys. In particular, because we know that the top 10366 // four bits of R_0 and R_1 are zero, we can add together 10367 // partial products without any risk of needing to propagate a 10368 // carry out. 10369 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 10370 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 10371 __ andr(U_2, R_0, 3); 10372 __ mul(U_2, S_2, U_2); 10373 10374 // Recycle registers S_0, S_1, S_2 10375 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 10376 10377 // Partial reduction mod 2**130 - 5 10378 __ adds(U_1, U_0HI, U_1); 10379 __ adc(U_2, U_1HI, U_2); 10380 // Sum now in U_2:U_1:U_0. 10381 // Dead: U_0HI, U_1HI. 10382 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 10383 10384 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 10385 10386 // First, U_2:U_1:U_0 += (U_2 >> 2) 10387 __ lsr(rscratch1, U_2, 2); 10388 __ andr(U_2, U_2, (u8)3); 10389 __ adds(U_0, U_0, rscratch1); 10390 __ adcs(U_1, U_1, zr); 10391 __ adc(U_2, U_2, zr); 10392 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 10393 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 10394 __ adcs(U_1, U_1, zr); 10395 __ adc(U_2, U_2, zr); 10396 10397 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 10398 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10399 __ br(~ Assembler::LT, LOOP); 10400 } 10401 10402 // Further reduce modulo 2^130 - 5 10403 __ lsr(rscratch1, U_2, 2); 10404 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 10405 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 10406 __ adcs(U_1, U_1, zr); 10407 __ andr(U_2, U_2, (u1)3); 10408 __ adc(U_2, U_2, zr); 10409 10410 // Unpack the sum into five 26-bit limbs and write to memory. 
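  // In C, the store-back below is approximately the inverse of pack_26 above
  // (sketch; acc[] is the long[5] accumulator):
  //
  //   acc[0] =  U_0        & 0x3ffffff;              // bits 0..25
  //   acc[1] = (U_0 >> 26) & 0x3ffffff;              // bits 26..51
  //   acc[2] = (U_0 >> 52) | ((U_1 & 0x3fff) << 12); // bits 52..77
  //   acc[3] = (U_1 >> 14) & 0x3ffffff;              // bits 78..103
  //   acc[4] = (U_1 >> 40) | ((U_2 & 0x7) << 24);    // bits 104..130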
10411 __ ubfiz(rscratch1, U_0, 0, 26); 10412 __ ubfx(rscratch2, U_0, 26, 26); 10413 __ stp(rscratch1, rscratch2, Address(acc_start)); 10414 __ ubfx(rscratch1, U_0, 52, 12); 10415 __ bfi(rscratch1, U_1, 12, 14); 10416 __ ubfx(rscratch2, U_1, 14, 26); 10417 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 10418 __ ubfx(rscratch1, U_1, 40, 24); 10419 __ bfi(rscratch1, U_2, 24, 3); 10420 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 10421 10422 __ bind(DONE); 10423 __ pop(callee_saved, sp); 10424 __ leave(); 10425 __ ret(lr); 10426 10427 return start; 10428 } 10429 10430 // exception handler for upcall stubs 10431 address generate_upcall_stub_exception_handler() { 10432 StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id; 10433 StubCodeMark mark(this, stub_id); 10434 address start = __ pc(); 10435 10436 // Native caller has no idea how to handle exceptions, 10437 // so we just crash here. Up to callee to catch exceptions. 10438 __ verify_oop(r0); 10439 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 10440 __ blr(rscratch1); 10441 __ should_not_reach_here(); 10442 10443 return start; 10444 } 10445 10446 // load Method* target of MethodHandle 10447 // j_rarg0 = jobject receiver 10448 // rmethod = result 10449 address generate_upcall_stub_load_target() { 10450 StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id; 10451 StubCodeMark mark(this, stub_id); 10452 address start = __ pc(); 10453 10454 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 10455 // Load target method from receiver 10456 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 10457 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 10458 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 10459 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 10460 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 10461 noreg, noreg); 10462 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 10463 10464 __ ret(lr); 10465 10466 return start; 10467 } 10468 10469 #undef __ 10470 #define __ masm-> 10471 10472 class MontgomeryMultiplyGenerator : public MacroAssembler { 10473 10474 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 10475 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 10476 10477 RegSet _toSave; 10478 bool _squaring; 10479 10480 public: 10481 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 10482 : MacroAssembler(as->code()), _squaring(squaring) { 10483 10484 // Register allocation 10485 10486 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 10487 Pa_base = *regs; // Argument registers 10488 if (squaring) 10489 Pb_base = Pa_base; 10490 else 10491 Pb_base = *++regs; 10492 Pn_base = *++regs; 10493 Rlen= *++regs; 10494 inv = *++regs; 10495 Pm_base = *++regs; 10496 10497 // Working registers: 10498 Ra = *++regs; // The current digit of a, b, n, and m. 10499 Rb = *++regs; 10500 Rm = *++regs; 10501 Rn = *++regs; 10502 10503 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 10504 Pb = *++regs; 10505 Pm = *++regs; 10506 Pn = *++regs; 10507 10508 t0 = *++regs; // Three registers which form a 10509 t1 = *++regs; // triple-precision accumuator. 
10510 t2 = *++regs; 10511 10512 Ri = *++regs; // Inner and outer loop indexes. 10513 Rj = *++regs; 10514 10515 Rhi_ab = *++regs; // Product registers: low and high parts 10516 Rlo_ab = *++regs; // of a*b and m*n. 10517 Rhi_mn = *++regs; 10518 Rlo_mn = *++regs; 10519 10520 // r19 and up are callee-saved. 10521 _toSave = RegSet::range(r19, *regs) + Pm_base; 10522 } 10523 10524 private: 10525 void save_regs() { 10526 push(_toSave, sp); 10527 } 10528 10529 void restore_regs() { 10530 pop(_toSave, sp); 10531 } 10532 10533 template <typename T> 10534 void unroll_2(Register count, T block) { 10535 Label loop, end, odd; 10536 tbnz(count, 0, odd); 10537 cbz(count, end); 10538 align(16); 10539 bind(loop); 10540 (this->*block)(); 10541 bind(odd); 10542 (this->*block)(); 10543 subs(count, count, 2); 10544 br(Assembler::GT, loop); 10545 bind(end); 10546 } 10547 10548 template <typename T> 10549 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 10550 Label loop, end, odd; 10551 tbnz(count, 0, odd); 10552 cbz(count, end); 10553 align(16); 10554 bind(loop); 10555 (this->*block)(d, s, tmp); 10556 bind(odd); 10557 (this->*block)(d, s, tmp); 10558 subs(count, count, 2); 10559 br(Assembler::GT, loop); 10560 bind(end); 10561 } 10562 10563 void pre1(RegisterOrConstant i) { 10564 block_comment("pre1"); 10565 // Pa = Pa_base; 10566 // Pb = Pb_base + i; 10567 // Pm = Pm_base; 10568 // Pn = Pn_base + i; 10569 // Ra = *Pa; 10570 // Rb = *Pb; 10571 // Rm = *Pm; 10572 // Rn = *Pn; 10573 ldr(Ra, Address(Pa_base)); 10574 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10575 ldr(Rm, Address(Pm_base)); 10576 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10577 lea(Pa, Address(Pa_base)); 10578 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10579 lea(Pm, Address(Pm_base)); 10580 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10581 10582 // Zero the m*n result. 10583 mov(Rhi_mn, zr); 10584 mov(Rlo_mn, zr); 10585 } 10586 10587 // The core multiply-accumulate step of a Montgomery 10588 // multiplication. The idea is to schedule operations as a 10589 // pipeline so that instructions with long latencies (loads and 10590 // multiplies) have time to complete before their results are 10591 // used. This most benefits in-order implementations of the 10592 // architecture but out-of-order ones also benefit. 10593 void step() { 10594 block_comment("step"); 10595 // MACC(Ra, Rb, t0, t1, t2); 10596 // Ra = *++Pa; 10597 // Rb = *--Pb; 10598 umulh(Rhi_ab, Ra, Rb); 10599 mul(Rlo_ab, Ra, Rb); 10600 ldr(Ra, pre(Pa, wordSize)); 10601 ldr(Rb, pre(Pb, -wordSize)); 10602 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 10603 // previous iteration. 
10604 // MACC(Rm, Rn, t0, t1, t2); 10605 // Rm = *++Pm; 10606 // Rn = *--Pn; 10607 umulh(Rhi_mn, Rm, Rn); 10608 mul(Rlo_mn, Rm, Rn); 10609 ldr(Rm, pre(Pm, wordSize)); 10610 ldr(Rn, pre(Pn, -wordSize)); 10611 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10612 } 10613 10614 void post1() { 10615 block_comment("post1"); 10616 10617 // MACC(Ra, Rb, t0, t1, t2); 10618 // Ra = *++Pa; 10619 // Rb = *--Pb; 10620 umulh(Rhi_ab, Ra, Rb); 10621 mul(Rlo_ab, Ra, Rb); 10622 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10623 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10624 10625 // *Pm = Rm = t0 * inv; 10626 mul(Rm, t0, inv); 10627 str(Rm, Address(Pm)); 10628 10629 // MACC(Rm, Rn, t0, t1, t2); 10630 // t0 = t1; t1 = t2; t2 = 0; 10631 umulh(Rhi_mn, Rm, Rn); 10632 10633 #ifndef PRODUCT 10634 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 10635 { 10636 mul(Rlo_mn, Rm, Rn); 10637 add(Rlo_mn, t0, Rlo_mn); 10638 Label ok; 10639 cbz(Rlo_mn, ok); { 10640 stop("broken Montgomery multiply"); 10641 } bind(ok); 10642 } 10643 #endif 10644 // We have very carefully set things up so that 10645 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 10646 // the lower half of Rm * Rn because we know the result already: 10647 // it must be -t0. t0 + (-t0) must generate a carry iff 10648 // t0 != 0. So, rather than do a mul and an adds we just set 10649 // the carry flag iff t0 is nonzero. 10650 // 10651 // mul(Rlo_mn, Rm, Rn); 10652 // adds(zr, t0, Rlo_mn); 10653 subs(zr, t0, 1); // Set carry iff t0 is nonzero 10654 adcs(t0, t1, Rhi_mn); 10655 adc(t1, t2, zr); 10656 mov(t2, zr); 10657 } 10658 10659 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 10660 block_comment("pre2"); 10661 // Pa = Pa_base + i-len; 10662 // Pb = Pb_base + len; 10663 // Pm = Pm_base + i-len; 10664 // Pn = Pn_base + len; 10665 10666 if (i.is_register()) { 10667 sub(Rj, i.as_register(), len); 10668 } else { 10669 mov(Rj, i.as_constant()); 10670 sub(Rj, Rj, len); 10671 } 10672 // Rj == i-len 10673 10674 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 10675 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 10676 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 10677 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 10678 10679 // Ra = *++Pa; 10680 // Rb = *--Pb; 10681 // Rm = *++Pm; 10682 // Rn = *--Pn; 10683 ldr(Ra, pre(Pa, wordSize)); 10684 ldr(Rb, pre(Pb, -wordSize)); 10685 ldr(Rm, pre(Pm, wordSize)); 10686 ldr(Rn, pre(Pn, -wordSize)); 10687 10688 mov(Rhi_mn, zr); 10689 mov(Rlo_mn, zr); 10690 } 10691 10692 void post2(RegisterOrConstant i, RegisterOrConstant len) { 10693 block_comment("post2"); 10694 if (i.is_constant()) { 10695 mov(Rj, i.as_constant()-len.as_constant()); 10696 } else { 10697 sub(Rj, i.as_register(), len); 10698 } 10699 10700 adds(t0, t0, Rlo_mn); // The pending m*n, low part 10701 10702 // As soon as we know the least significant digit of our result, 10703 // store it. 10704 // Pm_base[i-len] = t0; 10705 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 10706 10707 // t0 = t1; t1 = t2; t2 = 0; 10708 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 10709 adc(t1, t2, zr); 10710 mov(t2, zr); 10711 } 10712 10713 // A carry in t0 after Montgomery multiplication means that we 10714 // should subtract multiples of n from our result in m. We'll 10715 // keep doing that until there is no carry. 
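  // In C, one pass of this is approximately (a sketch of the sub() helper
  // referenced in the pseudocode comments; julong is an unsigned 64-bit type):
  //
  //   static julong sub(julong Pm[], const julong Pn[], julong t0, int len) {
  //     julong borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       julong m = Pm[i], n = Pn[i];
  //       Pm[i] = m - n - borrow;
  //       borrow = (m < n || (borrow && m == n)) ? 1 : 0;
  //     }
  //     return t0 - borrow;   // caller repeats while the result is nonzero
  //   }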
10716 void normalize(RegisterOrConstant len) { 10717 block_comment("normalize"); 10718 // while (t0) 10719 // t0 = sub(Pm_base, Pn_base, t0, len); 10720 Label loop, post, again; 10721 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 10722 cbz(t0, post); { 10723 bind(again); { 10724 mov(i, zr); 10725 mov(cnt, len); 10726 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 10727 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10728 subs(zr, zr, zr); // set carry flag, i.e. no borrow 10729 align(16); 10730 bind(loop); { 10731 sbcs(Rm, Rm, Rn); 10732 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 10733 add(i, i, 1); 10734 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 10735 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10736 sub(cnt, cnt, 1); 10737 } cbnz(cnt, loop); 10738 sbc(t0, t0, zr); 10739 } cbnz(t0, again); 10740 } bind(post); 10741 } 10742 10743 // Move memory at s to d, reversing words. 10744 // Increments d to end of copied memory 10745 // Destroys tmp1, tmp2 10746 // Preserves len 10747 // Leaves s pointing to the address which was in d at start 10748 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 10749 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 10750 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 10751 10752 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 10753 mov(tmp1, len); 10754 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 10755 sub(s, d, len, ext::uxtw, LogBytesPerWord); 10756 } 10757 // where 10758 void reverse1(Register d, Register s, Register tmp) { 10759 ldr(tmp, pre(s, -wordSize)); 10760 ror(tmp, tmp, 32); 10761 str(tmp, post(d, wordSize)); 10762 } 10763 10764 void step_squaring() { 10765 // An extra ACC 10766 step(); 10767 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10768 } 10769 10770 void last_squaring(RegisterOrConstant i) { 10771 Label dont; 10772 // if ((i & 1) == 0) { 10773 tbnz(i.as_register(), 0, dont); { 10774 // MACC(Ra, Rb, t0, t1, t2); 10775 // Ra = *++Pa; 10776 // Rb = *--Pb; 10777 umulh(Rhi_ab, Ra, Rb); 10778 mul(Rlo_ab, Ra, Rb); 10779 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10780 } bind(dont); 10781 } 10782 10783 void extra_step_squaring() { 10784 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10785 10786 // MACC(Rm, Rn, t0, t1, t2); 10787 // Rm = *++Pm; 10788 // Rn = *--Pn; 10789 umulh(Rhi_mn, Rm, Rn); 10790 mul(Rlo_mn, Rm, Rn); 10791 ldr(Rm, pre(Pm, wordSize)); 10792 ldr(Rn, pre(Pn, -wordSize)); 10793 } 10794 10795 void post1_squaring() { 10796 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10797 10798 // *Pm = Rm = t0 * inv; 10799 mul(Rm, t0, inv); 10800 str(Rm, Address(Pm)); 10801 10802 // MACC(Rm, Rn, t0, t1, t2); 10803 // t0 = t1; t1 = t2; t2 = 0; 10804 umulh(Rhi_mn, Rm, Rn); 10805 10806 #ifndef PRODUCT 10807 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 10808 { 10809 mul(Rlo_mn, Rm, Rn); 10810 add(Rlo_mn, t0, Rlo_mn); 10811 Label ok; 10812 cbz(Rlo_mn, ok); { 10813 stop("broken Montgomery multiply"); 10814 } bind(ok); 10815 } 10816 #endif 10817 // We have very carefully set things up so that 10818 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 10819 // the lower half of Rm * Rn because we know the result already: 10820 // it must be -t0. t0 + (-t0) must generate a carry iff 10821 // t0 != 0. So, rather than do a mul and an adds we just set 10822 // the carry flag iff t0 is nonzero. 
10823 // 10824 // mul(Rlo_mn, Rm, Rn); 10825 // adds(zr, t0, Rlo_mn); 10826 subs(zr, t0, 1); // Set carry iff t0 is nonzero 10827 adcs(t0, t1, Rhi_mn); 10828 adc(t1, t2, zr); 10829 mov(t2, zr); 10830 } 10831 10832 void acc(Register Rhi, Register Rlo, 10833 Register t0, Register t1, Register t2) { 10834 adds(t0, t0, Rlo); 10835 adcs(t1, t1, Rhi); 10836 adc(t2, t2, zr); 10837 } 10838 10839 public: 10840 /** 10841 * Fast Montgomery multiplication. The derivation of the 10842 * algorithm is in A Cryptographic Library for the Motorola 10843 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 10844 * 10845 * Arguments: 10846 * 10847 * Inputs for multiplication: 10848 * c_rarg0 - int array elements a 10849 * c_rarg1 - int array elements b 10850 * c_rarg2 - int array elements n (the modulus) 10851 * c_rarg3 - int length 10852 * c_rarg4 - int inv 10853 * c_rarg5 - int array elements m (the result) 10854 * 10855 * Inputs for squaring: 10856 * c_rarg0 - int array elements a 10857 * c_rarg1 - int array elements n (the modulus) 10858 * c_rarg2 - int length 10859 * c_rarg3 - int inv 10860 * c_rarg4 - int array elements m (the result) 10861 * 10862 */ 10863 address generate_multiply() { 10864 Label argh, nothing; 10865 bind(argh); 10866 stop("MontgomeryMultiply total_allocation must be <= 8192"); 10867 10868 align(CodeEntryAlignment); 10869 address entry = pc(); 10870 10871 cbzw(Rlen, nothing); 10872 10873 enter(); 10874 10875 // Make room. 10876 cmpw(Rlen, 512); 10877 br(Assembler::HI, argh); 10878 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 10879 andr(sp, Ra, -2 * wordSize); 10880 10881 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 10882 10883 { 10884 // Copy input args, reversing as we go. We use Ra as a 10885 // temporary variable. 10886 reverse(Ra, Pa_base, Rlen, t0, t1); 10887 if (!_squaring) 10888 reverse(Ra, Pb_base, Rlen, t0, t1); 10889 reverse(Ra, Pn_base, Rlen, t0, t1); 10890 } 10891 10892 // Push all call-saved registers and also Pm_base which we'll need 10893 // at the end. 
10894 save_regs(); 10895 10896 #ifndef PRODUCT 10897 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 10898 { 10899 ldr(Rn, Address(Pn_base, 0)); 10900 mul(Rlo_mn, Rn, inv); 10901 subs(zr, Rlo_mn, -1); 10902 Label ok; 10903 br(EQ, ok); { 10904 stop("broken inverse in Montgomery multiply"); 10905 } bind(ok); 10906 } 10907 #endif 10908 10909 mov(Pm_base, Ra); 10910 10911 mov(t0, zr); 10912 mov(t1, zr); 10913 mov(t2, zr); 10914 10915 block_comment("for (int i = 0; i < len; i++) {"); 10916 mov(Ri, zr); { 10917 Label loop, end; 10918 cmpw(Ri, Rlen); 10919 br(Assembler::GE, end); 10920 10921 bind(loop); 10922 pre1(Ri); 10923 10924 block_comment(" for (j = i; j; j--) {"); { 10925 movw(Rj, Ri); 10926 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 10927 } block_comment(" } // j"); 10928 10929 post1(); 10930 addw(Ri, Ri, 1); 10931 cmpw(Ri, Rlen); 10932 br(Assembler::LT, loop); 10933 bind(end); 10934 block_comment("} // i"); 10935 } 10936 10937 block_comment("for (int i = len; i < 2*len; i++) {"); 10938 mov(Ri, Rlen); { 10939 Label loop, end; 10940 cmpw(Ri, Rlen, Assembler::LSL, 1); 10941 br(Assembler::GE, end); 10942 10943 bind(loop); 10944 pre2(Ri, Rlen); 10945 10946 block_comment(" for (j = len*2-i-1; j; j--) {"); { 10947 lslw(Rj, Rlen, 1); 10948 subw(Rj, Rj, Ri); 10949 subw(Rj, Rj, 1); 10950 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 10951 } block_comment(" } // j"); 10952 10953 post2(Ri, Rlen); 10954 addw(Ri, Ri, 1); 10955 cmpw(Ri, Rlen, Assembler::LSL, 1); 10956 br(Assembler::LT, loop); 10957 bind(end); 10958 } 10959 block_comment("} // i"); 10960 10961 normalize(Rlen); 10962 10963 mov(Ra, Pm_base); // Save Pm_base in Ra 10964 restore_regs(); // Restore caller's Pm_base 10965 10966 // Copy our result into caller's Pm_base 10967 reverse(Pm_base, Ra, Rlen, t0, t1); 10968 10969 leave(); 10970 bind(nothing); 10971 ret(lr); 10972 10973 return entry; 10974 } 10975 // In C, approximately: 10976 10977 // void 10978 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 10979 // julong Pn_base[], julong Pm_base[], 10980 // julong inv, int len) { 10981 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 10982 // julong *Pa, *Pb, *Pn, *Pm; 10983 // julong Ra, Rb, Rn, Rm; 10984 10985 // int i; 10986 10987 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 10988 10989 // for (i = 0; i < len; i++) { 10990 // int j; 10991 10992 // Pa = Pa_base; 10993 // Pb = Pb_base + i; 10994 // Pm = Pm_base; 10995 // Pn = Pn_base + i; 10996 10997 // Ra = *Pa; 10998 // Rb = *Pb; 10999 // Rm = *Pm; 11000 // Rn = *Pn; 11001 11002 // int iters = i; 11003 // for (j = 0; iters--; j++) { 11004 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11005 // MACC(Ra, Rb, t0, t1, t2); 11006 // Ra = *++Pa; 11007 // Rb = *--Pb; 11008 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11009 // MACC(Rm, Rn, t0, t1, t2); 11010 // Rm = *++Pm; 11011 // Rn = *--Pn; 11012 // } 11013 11014 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 11015 // MACC(Ra, Rb, t0, t1, t2); 11016 // *Pm = Rm = t0 * inv; 11017 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11018 // MACC(Rm, Rn, t0, t1, t2); 11019 11020 // assert(t0 == 0, "broken Montgomery multiply"); 11021 11022 // t0 = t1; t1 = t2; t2 = 0; 11023 // } 11024 11025 // for (i = len; i < 2*len; i++) { 11026 // int j; 11027 11028 // Pa = Pa_base + i-len; 11029 // Pb = Pb_base + len; 11030 // Pm = Pm_base + i-len; 11031 // Pn = Pn_base + len; 11032 11033 // Ra = *++Pa; 11034 // Rb = 
*--Pb; 11035 // Rm = *++Pm; 11036 // Rn = *--Pn; 11037 11038 // int iters = len*2-i-1; 11039 // for (j = i-len+1; iters--; j++) { 11040 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11041 // MACC(Ra, Rb, t0, t1, t2); 11042 // Ra = *++Pa; 11043 // Rb = *--Pb; 11044 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11045 // MACC(Rm, Rn, t0, t1, t2); 11046 // Rm = *++Pm; 11047 // Rn = *--Pn; 11048 // } 11049 11050 // Pm_base[i-len] = t0; 11051 // t0 = t1; t1 = t2; t2 = 0; 11052 // } 11053 11054 // while (t0) 11055 // t0 = sub(Pm_base, Pn_base, t0, len); 11056 // } 11057 11058 /** 11059 * Fast Montgomery squaring. This uses asymptotically 25% fewer 11060 * multiplies than Montgomery multiplication so it should be up to 11061 * 25% faster. However, its loop control is more complex and it 11062 * may actually run slower on some machines. 11063 * 11064 * Arguments: 11065 * 11066 * Inputs: 11067 * c_rarg0 - int array elements a 11068 * c_rarg1 - int array elements n (the modulus) 11069 * c_rarg2 - int length 11070 * c_rarg3 - int inv 11071 * c_rarg4 - int array elements m (the result) 11072 * 11073 */ 11074 address generate_square() { 11075 Label argh; 11076 bind(argh); 11077 stop("MontgomeryMultiply total_allocation must be <= 8192"); 11078 11079 align(CodeEntryAlignment); 11080 address entry = pc(); 11081 11082 enter(); 11083 11084 // Make room. 11085 cmpw(Rlen, 512); 11086 br(Assembler::HI, argh); 11087 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 11088 andr(sp, Ra, -2 * wordSize); 11089 11090 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 11091 11092 { 11093 // Copy input args, reversing as we go. We use Ra as a 11094 // temporary variable. 11095 reverse(Ra, Pa_base, Rlen, t0, t1); 11096 reverse(Ra, Pn_base, Rlen, t0, t1); 11097 } 11098 11099 // Push all call-saved registers and also Pm_base which we'll need 11100 // at the end. 
11101 save_regs(); 11102 11103 mov(Pm_base, Ra); 11104 11105 mov(t0, zr); 11106 mov(t1, zr); 11107 mov(t2, zr); 11108 11109 block_comment("for (int i = 0; i < len; i++) {"); 11110 mov(Ri, zr); { 11111 Label loop, end; 11112 bind(loop); 11113 cmp(Ri, Rlen); 11114 br(Assembler::GE, end); 11115 11116 pre1(Ri); 11117 11118 block_comment("for (j = (i+1)/2; j; j--) {"); { 11119 add(Rj, Ri, 1); 11120 lsr(Rj, Rj, 1); 11121 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11122 } block_comment(" } // j"); 11123 11124 last_squaring(Ri); 11125 11126 block_comment(" for (j = i/2; j; j--) {"); { 11127 lsr(Rj, Ri, 1); 11128 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11129 } block_comment(" } // j"); 11130 11131 post1_squaring(); 11132 add(Ri, Ri, 1); 11133 cmp(Ri, Rlen); 11134 br(Assembler::LT, loop); 11135 11136 bind(end); 11137 block_comment("} // i"); 11138 } 11139 11140 block_comment("for (int i = len; i < 2*len; i++) {"); 11141 mov(Ri, Rlen); { 11142 Label loop, end; 11143 bind(loop); 11144 cmp(Ri, Rlen, Assembler::LSL, 1); 11145 br(Assembler::GE, end); 11146 11147 pre2(Ri, Rlen); 11148 11149 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 11150 lsl(Rj, Rlen, 1); 11151 sub(Rj, Rj, Ri); 11152 sub(Rj, Rj, 1); 11153 lsr(Rj, Rj, 1); 11154 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11155 } block_comment(" } // j"); 11156 11157 last_squaring(Ri); 11158 11159 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 11160 lsl(Rj, Rlen, 1); 11161 sub(Rj, Rj, Ri); 11162 lsr(Rj, Rj, 1); 11163 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11164 } block_comment(" } // j"); 11165 11166 post2(Ri, Rlen); 11167 add(Ri, Ri, 1); 11168 cmp(Ri, Rlen, Assembler::LSL, 1); 11169 11170 br(Assembler::LT, loop); 11171 bind(end); 11172 block_comment("} // i"); 11173 } 11174 11175 normalize(Rlen); 11176 11177 mov(Ra, Pm_base); // Save Pm_base in Ra 11178 restore_regs(); // Restore caller's Pm_base 11179 11180 // Copy our result into caller's Pm_base 11181 reverse(Pm_base, Ra, Rlen, t0, t1); 11182 11183 leave(); 11184 ret(lr); 11185 11186 return entry; 11187 } 11188 // In C, approximately: 11189 11190 // void 11191 // montgomery_square(julong Pa_base[], julong Pn_base[], 11192 // julong Pm_base[], julong inv, int len) { 11193 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11194 // julong *Pa, *Pb, *Pn, *Pm; 11195 // julong Ra, Rb, Rn, Rm; 11196 11197 // int i; 11198 11199 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11200 11201 // for (i = 0; i < len; i++) { 11202 // int j; 11203 11204 // Pa = Pa_base; 11205 // Pb = Pa_base + i; 11206 // Pm = Pm_base; 11207 // Pn = Pn_base + i; 11208 11209 // Ra = *Pa; 11210 // Rb = *Pb; 11211 // Rm = *Pm; 11212 // Rn = *Pn; 11213 11214 // int iters = (i+1)/2; 11215 // for (j = 0; iters--; j++) { 11216 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11217 // MACC2(Ra, Rb, t0, t1, t2); 11218 // Ra = *++Pa; 11219 // Rb = *--Pb; 11220 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11221 // MACC(Rm, Rn, t0, t1, t2); 11222 // Rm = *++Pm; 11223 // Rn = *--Pn; 11224 // } 11225 // if ((i & 1) == 0) { 11226 // assert(Ra == Pa_base[j], "must be"); 11227 // MACC(Ra, Ra, t0, t1, t2); 11228 // } 11229 // iters = i/2; 11230 // assert(iters == i-j, "must be"); 11231 // for (; iters--; j++) { 11232 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11233 // MACC(Rm, Rn, t0, t1, t2); 11234 // Rm = *++Pm; 11235 // Rn = *--Pn; 11236 // } 11237 11238 // 
*Pm = Rm = t0 * inv; 11239 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11240 // MACC(Rm, Rn, t0, t1, t2); 11241 11242 // assert(t0 == 0, "broken Montgomery multiply"); 11243 11244 // t0 = t1; t1 = t2; t2 = 0; 11245 // } 11246 11247 // for (i = len; i < 2*len; i++) { 11248 // int start = i-len+1; 11249 // int end = start + (len - start)/2; 11250 // int j; 11251 11252 // Pa = Pa_base + i-len; 11253 // Pb = Pa_base + len; 11254 // Pm = Pm_base + i-len; 11255 // Pn = Pn_base + len; 11256 11257 // Ra = *++Pa; 11258 // Rb = *--Pb; 11259 // Rm = *++Pm; 11260 // Rn = *--Pn; 11261 11262 // int iters = (2*len-i-1)/2; 11263 // assert(iters == end-start, "must be"); 11264 // for (j = start; iters--; j++) { 11265 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11266 // MACC2(Ra, Rb, t0, t1, t2); 11267 // Ra = *++Pa; 11268 // Rb = *--Pb; 11269 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11270 // MACC(Rm, Rn, t0, t1, t2); 11271 // Rm = *++Pm; 11272 // Rn = *--Pn; 11273 // } 11274 // if ((i & 1) == 0) { 11275 // assert(Ra == Pa_base[j], "must be"); 11276 // MACC(Ra, Ra, t0, t1, t2); 11277 // } 11278 // iters = (2*len-i)/2; 11279 // assert(iters == len-j, "must be"); 11280 // for (; iters--; j++) { 11281 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11282 // MACC(Rm, Rn, t0, t1, t2); 11283 // Rm = *++Pm; 11284 // Rn = *--Pn; 11285 // } 11286 // Pm_base[i-len] = t0; 11287 // t0 = t1; t1 = t2; t2 = 0; 11288 // } 11289 11290 // while (t0) 11291 // t0 = sub(Pm_base, Pn_base, t0, len); 11292 // } 11293 }; 11294 11295 // Initialization 11296 void generate_initial_stubs() { 11297 // Generate initial stubs and initializes the entry points 11298 11299 // entry points that exist in all platforms Note: This is code 11300 // that could be shared among different platforms - however the 11301 // benefit seems to be smaller than the disadvantage of having a 11302 // much more complicated generator structure. See also comment in 11303 // stubRoutines.hpp. 11304 11305 StubRoutines::_forward_exception_entry = generate_forward_exception(); 11306 11307 StubRoutines::_call_stub_entry = 11308 generate_call_stub(StubRoutines::_call_stub_return_address); 11309 11310 // is referenced by megamorphic call 11311 StubRoutines::_catch_exception_entry = generate_catch_exception(); 11312 11313 // Initialize table for copy memory (arraycopy) check. 
11314 if (UnsafeMemoryAccess::_table == nullptr) { 11315 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory 11316 } 11317 11318 if (UseCRC32Intrinsics) { 11319 // set table address before stub generation which uses it 11320 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 11321 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 11322 } 11323 11324 if (UseCRC32CIntrinsics) { 11325 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 11326 } 11327 11328 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 11329 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 11330 } 11331 11332 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 11333 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 11334 } 11335 11336 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) && 11337 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) { 11338 StubRoutines::_hf2f = generate_float16ToFloat(); 11339 StubRoutines::_f2hf = generate_floatToFloat16(); 11340 } 11341 } 11342 11343 void generate_continuation_stubs() { 11344 // Continuation stubs: 11345 StubRoutines::_cont_thaw = generate_cont_thaw(); 11346 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 11347 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 11348 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub(); 11349 } 11350 11351 void generate_final_stubs() { 11352 // support for verify_oop (must happen after universe_init) 11353 if (VerifyOops) { 11354 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 11355 } 11356 11357 // arraycopy stubs used by compilers 11358 generate_arraycopy_stubs(); 11359 11360 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 11361 11362 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 11363 11364 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 11365 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 11366 11367 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 11368 11369 generate_atomic_entry_points(); 11370 11371 #endif // LINUX 11372 11373 #ifdef COMPILER2 11374 if (UseSecondarySupersTable) { 11375 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub(); 11376 if (! InlineSecondarySupersTest) { 11377 generate_lookup_secondary_supers_table_stub(); 11378 } 11379 } 11380 #endif 11381 11382 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(); 11383 11384 StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated 11385 } 11386 11387 void generate_compiler_stubs() { 11388 #if COMPILER2_OR_JVMCI 11389 11390 if (UseSVE == 0) { 11391 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id); 11392 } 11393 11394 // array equals stub for large arrays. 11395 if (!UseSimpleArrayEquals) { 11396 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 11397 } 11398 11399 // arrays_hashcode stub for large arrays.
11400 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN); 11401 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE); 11402 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR); 11403 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT); 11404 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT); 11405 11406 // byte_array_inflate stub for large arrays. 11407 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 11408 11409 // countPositives stub for large arrays. 11410 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long); 11411 11412 generate_compare_long_strings(); 11413 11414 generate_string_indexof_stubs(); 11415 11416 #ifdef COMPILER2 11417 if (UseMultiplyToLenIntrinsic) { 11418 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 11419 } 11420 11421 if (UseSquareToLenIntrinsic) { 11422 StubRoutines::_squareToLen = generate_squareToLen(); 11423 } 11424 11425 if (UseMulAddIntrinsic) { 11426 StubRoutines::_mulAdd = generate_mulAdd(); 11427 } 11428 11429 if (UseSIMDForBigIntegerShiftIntrinsics) { 11430 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 11431 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 11432 } 11433 11434 if (UseMontgomeryMultiplyIntrinsic) { 11435 StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id; 11436 StubCodeMark mark(this, stub_id); 11437 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 11438 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 11439 } 11440 11441 if (UseMontgomerySquareIntrinsic) { 11442 StubGenStubId stub_id = StubGenStubId::montgomerySquare_id; 11443 StubCodeMark mark(this, stub_id); 11444 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 11445 // We use generate_multiply() rather than generate_square() 11446 // because it's faster for the sizes of modulus we care about. 
11447 StubRoutines::_montgomerySquare = g.generate_multiply(); 11448 } 11449 11450 #endif // COMPILER2 11451 11452 if (UseChaCha20Intrinsics) { 11453 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar(); 11454 } 11455 11456 if (UseKyberIntrinsics) { 11457 StubRoutines::_kyberNtt = generate_kyberNtt(); 11458 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt(); 11459 StubRoutines::_kyberNttMult = generate_kyberNttMult(); 11460 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2(); 11461 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3(); 11462 StubRoutines::_kyber12To16 = generate_kyber12To16(); 11463 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce(); 11464 } 11465 11466 if (UseDilithiumIntrinsics) { 11467 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt(); 11468 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt(); 11469 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult(); 11470 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant(); 11471 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly(); 11472 } 11473 11474 if (UseBASE64Intrinsics) { 11475 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 11476 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 11477 } 11478 11479 // data cache line writeback 11480 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 11481 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 11482 11483 if (UseAESIntrinsics) { 11484 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 11485 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 11486 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 11487 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 11488 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 11489 } 11490 if (UseGHASHIntrinsics) { 11491 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 11492 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 11493 } 11494 if (UseAESIntrinsics && UseGHASHIntrinsics) { 11495 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 11496 } 11497 11498 if (UseMD5Intrinsics) { 11499 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubGenStubId::md5_implCompress_id); 11500 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id); 11501 } 11502 if (UseSHA1Intrinsics) { 11503 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id); 11504 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id); 11505 } 11506 if (UseSHA256Intrinsics) { 11507 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id); 11508 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id); 11509 } 11510 if (UseSHA512Intrinsics) { 11511 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id); 11512 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id); 11513 } 11514 if (UseSHA3Intrinsics) { 11515 StubRoutines::_sha3_implCompress = 
generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id); 11516 StubRoutines::_double_keccak = generate_double_keccak(); 11517 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id); 11518 } 11519 11520 if (UsePoly1305Intrinsics) { 11521 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 11522 } 11523 11524 // generate Adler32 intrinsics code 11525 if (UseAdler32Intrinsics) { 11526 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 11527 } 11528 11529 #endif // COMPILER2_OR_JVMCI 11530 } 11531 11532 public: 11533 StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) { 11534 switch(blob_id) { 11535 case initial_id: 11536 generate_initial_stubs(); 11537 break; 11538 case continuation_id: 11539 generate_continuation_stubs(); 11540 break; 11541 case compiler_id: 11542 generate_compiler_stubs(); 11543 break; 11544 case final_id: 11545 generate_final_stubs(); 11546 break; 11547 default: 11548 fatal("unexpected blob id: %d", blob_id); 11549 break; 11550 }; 11551 } 11552 }; // end class declaration 11553 11554 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) { 11555 StubGenerator g(code, blob_id); 11556 } 11557 11558 11559 #if defined (LINUX) 11560 11561 // Define pointers to atomic stubs and initialize them to point to the 11562 // code in atomic_aarch64.S. 11563 11564 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 11565 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 11566 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 11567 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 11568 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 11569 11570 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 11571 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 11572 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 11573 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 11574 DEFAULT_ATOMIC_OP(xchg, 4, ) 11575 DEFAULT_ATOMIC_OP(xchg, 8, ) 11576 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 11577 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 11578 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 11579 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 11580 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 11581 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 11582 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 11583 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 11584 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 11585 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 11586 11587 #undef DEFAULT_ATOMIC_OP 11588 11589 #endif // LINUX
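// For reference, DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) above expands to
// approximately:
//
//   extern "C" uint64_t aarch64_atomic_cmpxchg_8_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_8_relaxed_impl
//     = aarch64_atomic_cmpxchg_8_relaxed_default_impl;
//
// Each stub pointer therefore starts out targeting the generic implementation
// in atomic_aarch64.S and, when LSE is available, is repointed to the
// corresponding generated stub by ~AtomicStubMark() in
// generate_atomic_entry_points().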