1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "asm/register.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "code/aotCodeCache.hpp" 31 #include "compiler/oopMap.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/barrierSetAssembler.hpp" 34 #include "gc/shared/gc_globals.hpp" 35 #include "gc/shared/tlab_globals.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "memory/universe.hpp" 38 #include "nativeInst_aarch64.hpp" 39 #include "oops/instanceOop.hpp" 40 #include "oops/method.hpp" 41 #include "oops/objArrayKlass.hpp" 42 #include "oops/oop.inline.hpp" 43 #include "prims/methodHandles.hpp" 44 #include "prims/upcallLinker.hpp" 45 #include "runtime/arguments.hpp" 46 #include "runtime/atomic.hpp" 47 #include "runtime/continuation.hpp" 48 #include "runtime/continuationEntry.inline.hpp" 49 #include "runtime/frame.inline.hpp" 50 #include "runtime/handles.inline.hpp" 51 #include "runtime/javaThread.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/stubCodeGenerator.hpp" 54 #include "runtime/stubRoutines.hpp" 55 #include "utilities/align.hpp" 56 #include "utilities/checkedCast.hpp" 57 #include "utilities/debug.hpp" 58 #include "utilities/globalDefinitions.hpp" 59 #include "utilities/intpow.hpp" 60 #include "utilities/powerOfTwo.hpp" 61 #ifdef COMPILER2 62 #include "opto/runtime.hpp" 63 #endif 64 #if INCLUDE_ZGC 65 #include "gc/z/zThreadLocalData.hpp" 66 #endif 67 68 // Declaration and definition of StubGenerator (no .hpp file). 
69 // For a more detailed description of the stub routine structure 70 // see the comment in stubRoutines.hpp 71 72 #undef __ 73 #define __ _masm-> 74 75 #ifdef PRODUCT 76 #define BLOCK_COMMENT(str) /* nothing */ 77 #else 78 #define BLOCK_COMMENT(str) __ block_comment(str) 79 #endif 80 81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 82 83 // Stub Code definitions 84 85 class StubGenerator: public StubCodeGenerator { 86 private: 87 88 #ifdef PRODUCT 89 #define inc_counter_np(counter) ((void)0) 90 #else 91 void inc_counter_np_(uint& counter) { 92 __ incrementw(ExternalAddress((address)&counter)); 93 } 94 #define inc_counter_np(counter) \ 95 BLOCK_COMMENT("inc_counter " #counter); \ 96 inc_counter_np_(counter); 97 #endif 98 99 // Call stubs are used to call Java from C 100 // 101 // Arguments: 102 // c_rarg0: call wrapper address address 103 // c_rarg1: result address 104 // c_rarg2: result type BasicType 105 // c_rarg3: method Method* 106 // c_rarg4: (interpreter) entry point address 107 // c_rarg5: parameters intptr_t* 108 // c_rarg6: parameter size (in words) int 109 // c_rarg7: thread Thread* 110 // 111 // There is no return from the stub itself as any Java result 112 // is written to result 113 // 114 // we save r30 (lr) as the return PC at the base of the frame and 115 // link r29 (fp) below it as the frame pointer installing sp (r31) 116 // into fp. 117 // 118 // we save r0-r7, which accounts for all the c arguments. 119 // 120 // TODO: strictly do we need to save them all? they are treated as 121 // volatile by C so could we omit saving the ones we are going to 122 // place in global registers (thread? method?) or those we only use 123 // during setup of the Java call? 124 // 125 // we don't need to save r8 which C uses as an indirect result location 126 // return register. 127 // 128 // we don't need to save r9-r15 which both C and Java treat as 129 // volatile 130 // 131 // we don't need to save r16-18 because Java does not use them 132 // 133 // we save r19-r28 which Java uses as scratch registers and C 134 // expects to be callee-save 135 // 136 // we save the bottom 64 bits of each value stored in v8-v15; it is 137 // the responsibility of the caller to preserve larger values. 138 // 139 // so the stub frame looks like this when we enter Java code 140 // 141 // [ return_from_Java ] <--- sp 142 // [ argument word n ] 143 // ... 
144 // -29 [ argument word 1 ] 145 // -28 [ saved Floating-point Control Register ] 146 // -26 [ saved v15 ] <--- sp_after_call 147 // -25 [ saved v14 ] 148 // -24 [ saved v13 ] 149 // -23 [ saved v12 ] 150 // -22 [ saved v11 ] 151 // -21 [ saved v10 ] 152 // -20 [ saved v9 ] 153 // -19 [ saved v8 ] 154 // -18 [ saved r28 ] 155 // -17 [ saved r27 ] 156 // -16 [ saved r26 ] 157 // -15 [ saved r25 ] 158 // -14 [ saved r24 ] 159 // -13 [ saved r23 ] 160 // -12 [ saved r22 ] 161 // -11 [ saved r21 ] 162 // -10 [ saved r20 ] 163 // -9 [ saved r19 ] 164 // -8 [ call wrapper (r0) ] 165 // -7 [ result (r1) ] 166 // -6 [ result type (r2) ] 167 // -5 [ method (r3) ] 168 // -4 [ entry point (r4) ] 169 // -3 [ parameters (r5) ] 170 // -2 [ parameter size (r6) ] 171 // -1 [ thread (r7) ] 172 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 173 // 1 [ saved lr (r30) ] 174 175 // Call stub stack layout word offsets from fp 176 enum call_stub_layout { 177 sp_after_call_off = -28, 178 179 fpcr_off = sp_after_call_off, 180 d15_off = -26, 181 d13_off = -24, 182 d11_off = -22, 183 d9_off = -20, 184 185 r28_off = -18, 186 r26_off = -16, 187 r24_off = -14, 188 r22_off = -12, 189 r20_off = -10, 190 call_wrapper_off = -8, 191 result_off = -7, 192 result_type_off = -6, 193 method_off = -5, 194 entry_point_off = -4, 195 parameter_size_off = -2, 196 thread_off = -1, 197 fp_f = 0, 198 retaddr_off = 1, 199 }; 200 201 address generate_call_stub(address& return_address) { 202 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 203 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 204 "adjust this code"); 205 206 StubGenStubId stub_id = StubGenStubId::call_stub_id; 207 StubCodeMark mark(this, stub_id); 208 address start = __ pc(); 209 210 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 211 212 const Address fpcr_save (rfp, fpcr_off * wordSize); 213 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 214 const Address result (rfp, result_off * wordSize); 215 const Address result_type (rfp, result_type_off * wordSize); 216 const Address method (rfp, method_off * wordSize); 217 const Address entry_point (rfp, entry_point_off * wordSize); 218 const Address parameter_size(rfp, parameter_size_off * wordSize); 219 220 const Address thread (rfp, thread_off * wordSize); 221 222 const Address d15_save (rfp, d15_off * wordSize); 223 const Address d13_save (rfp, d13_off * wordSize); 224 const Address d11_save (rfp, d11_off * wordSize); 225 const Address d9_save (rfp, d9_off * wordSize); 226 227 const Address r28_save (rfp, r28_off * wordSize); 228 const Address r26_save (rfp, r26_off * wordSize); 229 const Address r24_save (rfp, r24_off * wordSize); 230 const Address r22_save (rfp, r22_off * wordSize); 231 const Address r20_save (rfp, r20_off * wordSize); 232 233 // stub code 234 235 address aarch64_entry = __ pc(); 236 237 // set up frame and move sp to end of save area 238 __ enter(); 239 __ sub(sp, rfp, -sp_after_call_off * wordSize); 240 241 // save register parameters and Java scratch/global registers 242 // n.b. 
we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7, thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9, v8, d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b.
this assumes Java returns an integral result in r0 331 // and a floating result in j_farg0 332 __ ldr(j_rarg2, result); 333 Label is_long, is_float, is_double, exit; 334 __ ldr(j_rarg1, result_type); 335 __ cmp(j_rarg1, (u1)T_OBJECT); 336 __ br(Assembler::EQ, is_long); 337 __ cmp(j_rarg1, (u1)T_LONG); 338 __ br(Assembler::EQ, is_long); 339 __ cmp(j_rarg1, (u1)T_FLOAT); 340 __ br(Assembler::EQ, is_float); 341 __ cmp(j_rarg1, (u1)T_DOUBLE); 342 __ br(Assembler::EQ, is_double); 343 344 // handle T_INT case 345 __ strw(r0, Address(j_rarg2)); 346 347 __ BIND(exit); 348 349 // pop parameters 350 __ sub(esp, rfp, -sp_after_call_off * wordSize); 351 352 #ifdef ASSERT 353 // verify that threads correspond 354 { 355 Label L, S; 356 __ ldr(rscratch1, thread); 357 __ cmp(rthread, rscratch1); 358 __ br(Assembler::NE, S); 359 __ get_thread(rscratch1); 360 __ cmp(rthread, rscratch1); 361 __ br(Assembler::EQ, L); 362 __ BIND(S); 363 __ stop("StubRoutines::call_stub: threads must correspond"); 364 __ BIND(L); 365 } 366 #endif 367 368 __ pop_cont_fastpath(rthread); 369 370 // restore callee-save registers 371 __ ldpd(v15, v14, d15_save); 372 __ ldpd(v13, v12, d13_save); 373 __ ldpd(v11, v10, d11_save); 374 __ ldpd(v9, v8, d9_save); 375 376 __ ldp(r28, r27, r28_save); 377 __ ldp(r26, r25, r26_save); 378 __ ldp(r24, r23, r24_save); 379 __ ldp(r22, r21, r22_save); 380 __ ldp(r20, r19, r20_save); 381 382 // restore fpcr 383 __ ldr(rscratch1, fpcr_save); 384 __ set_fpcr(rscratch1); 385 386 __ ldp(c_rarg0, c_rarg1, call_wrapper); 387 __ ldrw(c_rarg2, result_type); 388 __ ldr(c_rarg3, method); 389 __ ldp(c_rarg4, c_rarg5, entry_point); 390 __ ldp(c_rarg6, c_rarg7, parameter_size); 391 392 // leave frame and return to caller 393 __ leave(); 394 __ ret(lr); 395 396 // handle return types different from T_INT 397 398 __ BIND(is_long); 399 __ str(r0, Address(j_rarg2, 0)); 400 __ br(Assembler::AL, exit); 401 402 __ BIND(is_float); 403 __ strs(j_farg0, Address(j_rarg2, 0)); 404 __ br(Assembler::AL, exit); 405 406 __ BIND(is_double); 407 __ strd(j_farg0, Address(j_rarg2, 0)); 408 __ br(Assembler::AL, exit); 409 410 return start; 411 } 412 413 // Return point for a Java call if there's an exception thrown in 414 // Java code. The exception is caught and transformed into a 415 // pending exception stored in JavaThread that can be tested from 416 // within the VM. 417 // 418 // Note: Usually the parameters are removed by the callee. In case 419 // of an exception crossing an activation frame boundary, that is 420 // not the case if the callee is compiled code => need to setup the 421 // rsp. 
422 // 423 // r0: exception oop 424 425 address generate_catch_exception() { 426 StubGenStubId stub_id = StubGenStubId::catch_exception_id; 427 StubCodeMark mark(this, stub_id); 428 address start = __ pc(); 429 430 // same as in generate_call_stub(): 431 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 432 const Address thread (rfp, thread_off * wordSize); 433 434 #ifdef ASSERT 435 // verify that threads correspond 436 { 437 Label L, S; 438 __ ldr(rscratch1, thread); 439 __ cmp(rthread, rscratch1); 440 __ br(Assembler::NE, S); 441 __ get_thread(rscratch1); 442 __ cmp(rthread, rscratch1); 443 __ br(Assembler::EQ, L); 444 __ bind(S); 445 __ stop("StubRoutines::catch_exception: threads must correspond"); 446 __ bind(L); 447 } 448 #endif 449 450 // set pending exception 451 __ verify_oop(r0); 452 453 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 454 __ mov(rscratch1, (address)__FILE__); 455 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 456 __ movw(rscratch1, (int)__LINE__); 457 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 458 459 // complete return to VM 460 assert(StubRoutines::_call_stub_return_address != nullptr, 461 "_call_stub_return_address must have been generated before"); 462 __ b(StubRoutines::_call_stub_return_address); 463 464 return start; 465 } 466 467 // Continuation point for runtime calls returning with a pending 468 // exception. The pending exception check happened in the runtime 469 // or native call stub. The pending exception in Thread is 470 // converted into a Java-level exception. 471 // 472 // Contract with Java-level exception handlers: 473 // r0: exception 474 // r3: throwing pc 475 // 476 // NOTE: At entry of this stub, exception-pc must be in LR !! 477 478 // NOTE: this is always used as a jump target within generated code 479 // so it just needs to be generated code with no x86 prolog 480 481 address generate_forward_exception() { 482 StubGenStubId stub_id = StubGenStubId::forward_exception_id; 483 StubCodeMark mark(this, stub_id); 484 address start = __ pc(); 485 486 // Upon entry, LR points to the return address returning into 487 // Java (interpreted or compiled) code; i.e., the return address 488 // becomes the throwing pc. 489 // 490 // Arguments pushed before the runtime call are still on the stack 491 // but the exception handler will reset the stack pointer -> 492 // ignore them. A potential result in registers can be ignored as 493 // well. 494 495 #ifdef ASSERT 496 // make sure this code is only executed if there is a pending exception 497 { 498 Label L; 499 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 500 __ cbnz(rscratch1, L); 501 __ stop("StubRoutines::forward exception: no pending exception (1)"); 502 __ bind(L); 503 } 504 #endif 505 506 // compute exception handler into r19 507 508 // call the VM to find the handler address associated with the 509 // caller address. pass thread in r0 and caller pc (ret address) 510 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 511 // the stack. 512 __ mov(c_rarg1, lr); 513 // lr will be trashed by the VM call so we move it to R19 514 // (callee-saved) because we also need to pass it to the handler 515 // returned by this call. 
516 __ mov(r19, lr); 517 BLOCK_COMMENT("call exception_handler_for_return_address"); 518 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 519 SharedRuntime::exception_handler_for_return_address), 520 rthread, c_rarg1); 521 // Reinitialize the ptrue predicate register, in case the external runtime 522 // call clobbers ptrue reg, as we may return to SVE compiled code. 523 __ reinitialize_ptrue(); 524 525 // we should not really care that lr is no longer the callee 526 // address. we saved the value the handler needs in r19 so we can 527 // just copy it to r3. however, the C2 handler will push its own 528 // frame and then calls into the VM and the VM code asserts that 529 // the PC for the frame above the handler belongs to a compiled 530 // Java method. So, we restore lr here to satisfy that assert. 531 __ mov(lr, r19); 532 // setup r0 & r3 & clear pending exception 533 __ mov(r3, r19); 534 __ mov(r19, r0); 535 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 536 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 537 538 #ifdef ASSERT 539 // make sure exception is set 540 { 541 Label L; 542 __ cbnz(r0, L); 543 __ stop("StubRoutines::forward exception: no pending exception (2)"); 544 __ bind(L); 545 } 546 #endif 547 548 // continue at exception handler 549 // r0: exception 550 // r3: throwing pc 551 // r19: exception handler 552 __ verify_oop(r0); 553 __ br(r19); 554 555 return start; 556 } 557 558 // Non-destructive plausibility checks for oops 559 // 560 // Arguments: 561 // r0: oop to verify 562 // rscratch1: error message 563 // 564 // Stack after saving c_rarg3: 565 // [tos + 0]: saved c_rarg3 566 // [tos + 1]: saved c_rarg2 567 // [tos + 2]: saved lr 568 // [tos + 3]: saved rscratch2 569 // [tos + 4]: saved r0 570 // [tos + 5]: saved rscratch1 571 address generate_verify_oop() { 572 StubGenStubId stub_id = StubGenStubId::verify_oop_id; 573 StubCodeMark mark(this, stub_id); 574 address start = __ pc(); 575 576 Label exit, error; 577 578 // save c_rarg2 and c_rarg3 579 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 580 581 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 582 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 583 __ ldr(c_rarg3, Address(c_rarg2)); 584 __ add(c_rarg3, c_rarg3, 1); 585 __ str(c_rarg3, Address(c_rarg2)); 586 587 // object is in r0 588 // make sure object is 'reasonable' 589 __ cbz(r0, exit); // if obj is null it is OK 590 591 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 592 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 593 594 // return if everything seems ok 595 __ bind(exit); 596 597 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 598 __ ret(lr); 599 600 // handle errors 601 __ bind(error); 602 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 603 604 __ push(RegSet::range(r0, r29), sp); 605 // debug(char* msg, int64_t pc, int64_t regs[]) 606 __ mov(c_rarg0, rscratch1); // pass address of error message 607 __ mov(c_rarg1, lr); // pass return address 608 __ mov(c_rarg2, sp); // pass address of regs on stack 609 #ifndef PRODUCT 610 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 611 #endif 612 BLOCK_COMMENT("call MacroAssembler::debug"); 613 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 614 __ blr(rscratch1); 615 __ hlt(0); 616 617 return start; 618 } 619 620 // Generate indices for iota vector. 
621 address generate_iota_indices(StubGenStubId stub_id) { 622 __ align(CodeEntryAlignment); 623 StubCodeMark mark(this, stub_id); 624 address start = __ pc(); 625 // B 626 __ emit_data64(0x0706050403020100, relocInfo::none); 627 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 628 // H 629 __ emit_data64(0x0003000200010000, relocInfo::none); 630 __ emit_data64(0x0007000600050004, relocInfo::none); 631 // S 632 __ emit_data64(0x0000000100000000, relocInfo::none); 633 __ emit_data64(0x0000000300000002, relocInfo::none); 634 // D 635 __ emit_data64(0x0000000000000000, relocInfo::none); 636 __ emit_data64(0x0000000000000001, relocInfo::none); 637 // S - FP 638 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 639 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 640 // D - FP 641 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 642 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 643 return start; 644 } 645 646 // The inner part of zero_words(). This is the bulk operation, 647 // zeroing words in blocks, possibly using DC ZVA to do it. The 648 // caller is responsible for zeroing the last few words. 649 // 650 // Inputs: 651 // r10: the HeapWord-aligned base address of an array to zero. 652 // r11: the count in HeapWords, r11 > 0. 653 // 654 // Returns r10 and r11, adjusted for the caller to clear. 655 // r10: the base address of the tail of words left to clear. 656 // r11: the number of words in the tail. 657 // r11 < MacroAssembler::zero_words_block_size. 658 659 address generate_zero_blocks() { 660 Label done; 661 Label base_aligned; 662 663 Register base = r10, cnt = r11; 664 665 __ align(CodeEntryAlignment); 666 StubGenStubId stub_id = StubGenStubId::zero_blocks_id; 667 StubCodeMark mark(this, stub_id); 668 address start = __ pc(); 669 670 if (UseBlockZeroing) { 671 int zva_length = VM_Version::zva_length(); 672 673 // Ensure ZVA length can be divided by 16. This is required by 674 // the subsequent operations. 675 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 676 677 __ tbz(base, 3, base_aligned); 678 __ str(zr, Address(__ post(base, 8))); 679 __ sub(cnt, cnt, 1); 680 __ bind(base_aligned); 681 682 // Ensure count >= zva_length * 2 so that it still deserves a zva after 683 // alignment. 684 Label small; 685 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 686 __ subs(rscratch1, cnt, low_limit >> 3); 687 __ br(Assembler::LT, small); 688 __ zero_dcache_blocks(base, cnt); 689 __ bind(small); 690 } 691 692 { 693 // Number of stp instructions we'll unroll 694 const int unroll = 695 MacroAssembler::zero_words_block_size / 2; 696 // Clear the remaining blocks. 697 Label loop; 698 __ subs(cnt, cnt, unroll * 2); 699 __ br(Assembler::LT, done); 700 __ bind(loop); 701 for (int i = 0; i < unroll; i++) 702 __ stp(zr, zr, __ post(base, 16)); 703 __ subs(cnt, cnt, unroll * 2); 704 __ br(Assembler::GE, loop); 705 __ bind(done); 706 __ add(cnt, cnt, unroll * 2); 707 } 708 709 __ ret(lr); 710 711 return start; 712 } 713 714 715 typedef enum { 716 copy_forwards = 1, 717 copy_backwards = -1 718 } copy_direction; 719 720 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 721 // for arraycopy stubs. 
722 class ArrayCopyBarrierSetHelper : StackObj { 723 BarrierSetAssembler* _bs_asm; 724 MacroAssembler* _masm; 725 DecoratorSet _decorators; 726 BasicType _type; 727 Register _gct1; 728 Register _gct2; 729 Register _gct3; 730 FloatRegister _gcvt1; 731 FloatRegister _gcvt2; 732 FloatRegister _gcvt3; 733 734 public: 735 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 736 DecoratorSet decorators, 737 BasicType type, 738 Register gct1, 739 Register gct2, 740 Register gct3, 741 FloatRegister gcvt1, 742 FloatRegister gcvt2, 743 FloatRegister gcvt3) 744 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 745 _masm(masm), 746 _decorators(decorators), 747 _type(type), 748 _gct1(gct1), 749 _gct2(gct2), 750 _gct3(gct3), 751 _gcvt1(gcvt1), 752 _gcvt2(gcvt2), 753 _gcvt3(gcvt3) { 754 } 755 756 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 757 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 758 dst1, dst2, src, 759 _gct1, _gct2, _gcvt1); 760 } 761 762 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 763 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 764 dst, src1, src2, 765 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 766 } 767 768 void copy_load_at_16(Register dst1, Register dst2, Address src) { 769 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 770 dst1, dst2, src, 771 _gct1); 772 } 773 774 void copy_store_at_16(Address dst, Register src1, Register src2) { 775 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 776 dst, src1, src2, 777 _gct1, _gct2, _gct3); 778 } 779 780 void copy_load_at_8(Register dst, Address src) { 781 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 782 dst, noreg, src, 783 _gct1); 784 } 785 786 void copy_store_at_8(Address dst, Register src) { 787 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 788 dst, src, noreg, 789 _gct1, _gct2, _gct3); 790 } 791 }; 792 793 // Bulk copy of blocks of 8 words. 794 // 795 // count is a count of words. 796 // 797 // Precondition: count >= 8 798 // 799 // Postconditions: 800 // 801 // The least significant bit of count contains the remaining count 802 // of words to copy. The rest of count is trash. 803 // 804 // s and d are adjusted to point to the remaining words to copy 805 // 806 void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) { 807 BasicType type; 808 copy_direction direction; 809 810 switch (stub_id) { 811 case copy_byte_f_id: 812 direction = copy_forwards; 813 type = T_BYTE; 814 break; 815 case copy_byte_b_id: 816 direction = copy_backwards; 817 type = T_BYTE; 818 break; 819 case copy_oop_f_id: 820 direction = copy_forwards; 821 type = T_OBJECT; 822 break; 823 case copy_oop_b_id: 824 direction = copy_backwards; 825 type = T_OBJECT; 826 break; 827 case copy_oop_uninit_f_id: 828 direction = copy_forwards; 829 type = T_OBJECT; 830 break; 831 case copy_oop_uninit_b_id: 832 direction = copy_backwards; 833 type = T_OBJECT; 834 break; 835 default: 836 ShouldNotReachHere(); 837 } 838 839 int unit = wordSize * direction; 840 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset, we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
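      //
      // An illustrative walkthrough (not part of the upstream commentary),
      // assuming the forwards case where unit == 8: after the adjustments
      // below (s -= 16, d -= 8) the first round of loads at {2, 4, 6, 8} * unit
      // reads the 64-byte block at the original s+0..s+63 into t0..t7, and the
      // stores at {1, 2, 4, 6, 8} * unit write one word at the original d+0,
      // three pairs at d+8, d+24 and d+40, and a final word at d+56 -- so each
      // paired store lands on a 16-byte-aligned address even though d itself
      // is only 8-byte aligned.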
989 990 if (direction == copy_forwards) { 991 __ sub(s, s, 16); 992 __ sub(d, d, 8); 993 } 994 995 // Fill 8 registers 996 // 997 // for forwards copy s was offset by -16 from the original input 998 // value of s so the register contents are at these offsets 999 // relative to the 64 bit block addressed by that original input 1000 // and so on for each successive 64 byte block when s is updated 1001 // 1002 // t0 at offset 0, t1 at offset 8 1003 // t2 at offset 16, t3 at offset 24 1004 // t4 at offset 32, t5 at offset 40 1005 // t6 at offset 48, t7 at offset 56 1006 1007 // for backwards copy s was not offset so the register contents 1008 // are at these offsets into the preceding 64 byte block 1009 // relative to that original input and so on for each successive 1010 // preceding 64 byte block when s is updated. this explains the 1011 // slightly counter-intuitive looking pattern of register usage 1012 // in the stp instructions for backwards copy. 1013 // 1014 // t0 at offset -16, t1 at offset -8 1015 // t2 at offset -32, t3 at offset -24 1016 // t4 at offset -48, t5 at offset -40 1017 // t6 at offset -64, t7 at offset -56 1018 1019 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1020 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1021 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1022 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1023 1024 __ subs(count, count, 16); 1025 __ br(Assembler::LO, drain); 1026 1027 int prefetch = PrefetchCopyIntervalInBytes; 1028 bool use_stride = false; 1029 if (direction == copy_backwards) { 1030 use_stride = prefetch > 256; 1031 prefetch = -prefetch; 1032 if (use_stride) __ mov(stride, prefetch); 1033 } 1034 1035 __ bind(again); 1036 1037 if (PrefetchCopyIntervalInBytes > 0) 1038 __ prfm(use_stride ? 
                Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = g_uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
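    // Illustrative example (not from the original comments): for a jbyte copy
    // (granularity == 1) with a tail count of 13 == 0b1101, the tests below
    // copy 8 bytes (bit 3), then 4 bytes (bit 2), skip the 2-byte move
    // (bit 1 is clear) and finish with a single byte (bit 0), 13 bytes in
    // total and never more than four moves.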
1182 1183 __ tbz(count, 3 - exact_log2(granularity), Lword); 1184 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1185 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1186 __ bind(Lword); 1187 1188 if (granularity <= sizeof (jint)) { 1189 __ tbz(count, 2 - exact_log2(granularity), Lint); 1190 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1191 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1192 __ bind(Lint); 1193 } 1194 1195 if (granularity <= sizeof (jshort)) { 1196 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1197 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1198 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1199 __ bind(Lshort); 1200 } 1201 1202 if (granularity <= sizeof (jbyte)) { 1203 __ tbz(count, 0, Lbyte); 1204 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1205 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1206 __ bind(Lbyte); 1207 } 1208 } 1209 1210 Label copy_f, copy_b; 1211 Label copy_obj_f, copy_obj_b; 1212 Label copy_obj_uninit_f, copy_obj_uninit_b; 1213 1214 // All-singing all-dancing memory copy. 1215 // 1216 // Copy count units of memory from s to d. The size of a unit is 1217 // step, which can be positive or negative depending on the direction 1218 // of copy. If is_aligned is false, we align the source address. 1219 // 1220 1221 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1222 Register s, Register d, Register count, int step) { 1223 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1224 bool is_backwards = step < 0; 1225 unsigned int granularity = g_uabs(step); 1226 const Register t0 = r3, t1 = r4; 1227 1228 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1229 // load all the data before writing anything 1230 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1231 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1232 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1233 const Register send = r17, dend = r16; 1234 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1235 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1236 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1237 1238 if (PrefetchCopyIntervalInBytes > 0) 1239 __ prfm(Address(s, 0), PLDL1KEEP); 1240 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the range 65-80 bytes.
      // For such cases, using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
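      // A concrete example of the case described above (illustrative only):
      // a 70-byte jbyte copy (granularity == 1) lands here with count <= 80,
      // so it finishes with the two 32-byte stores at d+0 and d+32 plus one
      // 16-byte load/store pair covering dend-16..dend-1, instead of loading
      // and storing a third 32-byte pair.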
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the
          // same byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way, we can align the
    // source address on a 2-word boundary.

    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // cannot be used as a temp register, as it contains the count.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
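      // (Illustrative note, not in the original: for an arrayof_jint copy
      // with s % 16 == 8, the tbz below falls through, one word -- two
      // jints -- is moved and count drops by wordSize/granularity == 2
      // before the bulk word copy takes over.)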
1396 __ tbz(s, exact_log2(wordSize), aligned); 1397 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1398 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1399 __ sub(count, count, wordSize/granularity); 1400 } else { 1401 if (is_backwards) { 1402 __ andr(r15, s, 2 * wordSize - 1); 1403 } else { 1404 __ neg(r15, s); 1405 __ andr(r15, r15, 2 * wordSize - 1); 1406 } 1407 // r15 is the byte adjustment needed to align s. 1408 __ cbz(r15, aligned); 1409 int shift = exact_log2(granularity); 1410 if (shift > 0) { 1411 __ lsr(r15, r15, shift); 1412 } 1413 __ sub(count, count, r15); 1414 1415 #if 0 1416 // ?? This code is only correct for a disjoint copy. It may or 1417 // may not make sense to use it in that case. 1418 1419 // Copy the first pair; s and d may not be aligned. 1420 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1421 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1422 1423 // Align s and d, adjust count 1424 if (is_backwards) { 1425 __ sub(s, s, r15); 1426 __ sub(d, d, r15); 1427 } else { 1428 __ add(s, s, r15); 1429 __ add(d, d, r15); 1430 } 1431 #else 1432 copy_memory_small(decorators, type, s, d, r15, step); 1433 #endif 1434 } 1435 1436 __ bind(aligned); 1437 1438 // s is now 2-word-aligned. 1439 1440 // We have a count of units and some trailing bytes. Adjust the 1441 // count and do a bulk copy of words. If the shift is zero 1442 // perform a move instead to benefit from zero latency moves. 1443 int shift = exact_log2(wordSize/granularity); 1444 if (shift > 0) { 1445 __ lsr(r15, count, shift); 1446 } else { 1447 __ mov(r15, count); 1448 } 1449 if (direction == copy_forwards) { 1450 if (type != T_OBJECT) { 1451 __ bl(copy_f); 1452 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1453 __ bl(copy_obj_uninit_f); 1454 } else { 1455 __ bl(copy_obj_f); 1456 } 1457 } else { 1458 if (type != T_OBJECT) { 1459 __ bl(copy_b); 1460 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1461 __ bl(copy_obj_uninit_b); 1462 } else { 1463 __ bl(copy_obj_b); 1464 } 1465 } 1466 1467 // And the tail. 1468 copy_memory_small(decorators, type, s, d, count, step); 1469 1470 if (granularity >= 8) __ bind(copy8); 1471 if (granularity >= 4) __ bind(copy4); 1472 __ bind(finish); 1473 } 1474 1475 1476 void clobber_registers() { 1477 #ifdef ASSERT 1478 RegSet clobbered 1479 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1480 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1481 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1482 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1483 __ mov(*it, rscratch1); 1484 } 1485 #endif 1486 1487 } 1488 1489 // Scan over array at a for count oops, verifying each one. 1490 // Preserves a and count, clobbers rscratch1 and rscratch2. 
1491 void verify_oop_array (int size, Register a, Register count, Register temp) { 1492 Label loop, end; 1493 __ mov(rscratch1, a); 1494 __ mov(rscratch2, zr); 1495 __ bind(loop); 1496 __ cmp(rscratch2, count); 1497 __ br(Assembler::HS, end); 1498 if (size == wordSize) { 1499 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1500 __ verify_oop(temp); 1501 } else { 1502 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1503 __ decode_heap_oop(temp); // calls verify_oop 1504 } 1505 __ add(rscratch2, rscratch2, 1); 1506 __ b(loop); 1507 __ bind(end); 1508 } 1509 1510 // Arguments: 1511 // stub_id - is used to name the stub and identify all details of 1512 // how to perform the copy. 1513 // 1514 // entry - is assigned to the stub's post push entry point unless 1515 // it is null 1516 // 1517 // Inputs: 1518 // c_rarg0 - source array address 1519 // c_rarg1 - destination array address 1520 // c_rarg2 - element count, treated as ssize_t, can be zero 1521 // 1522 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1523 // the hardware handle it. The two dwords within qwords that span 1524 // cache line boundaries will still be loaded and stored atomically. 1525 // 1526 // Side Effects: entry is set to the (post push) entry point so it 1527 // can be used by the corresponding conjoint copy 1528 // method 1529 // 1530 address generate_disjoint_copy(StubGenStubId stub_id, address *entry) { 1531 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1532 RegSet saved_reg = RegSet::of(s, d, count); 1533 int size; 1534 bool aligned; 1535 bool is_oop; 1536 bool dest_uninitialized; 1537 switch (stub_id) { 1538 case jbyte_disjoint_arraycopy_id: 1539 size = sizeof(jbyte); 1540 aligned = false; 1541 is_oop = false; 1542 dest_uninitialized = false; 1543 break; 1544 case arrayof_jbyte_disjoint_arraycopy_id: 1545 size = sizeof(jbyte); 1546 aligned = true; 1547 is_oop = false; 1548 dest_uninitialized = false; 1549 break; 1550 case jshort_disjoint_arraycopy_id: 1551 size = sizeof(jshort); 1552 aligned = false; 1553 is_oop = false; 1554 dest_uninitialized = false; 1555 break; 1556 case arrayof_jshort_disjoint_arraycopy_id: 1557 size = sizeof(jshort); 1558 aligned = true; 1559 is_oop = false; 1560 dest_uninitialized = false; 1561 break; 1562 case jint_disjoint_arraycopy_id: 1563 size = sizeof(jint); 1564 aligned = false; 1565 is_oop = false; 1566 dest_uninitialized = false; 1567 break; 1568 case arrayof_jint_disjoint_arraycopy_id: 1569 size = sizeof(jint); 1570 aligned = true; 1571 is_oop = false; 1572 dest_uninitialized = false; 1573 break; 1574 case jlong_disjoint_arraycopy_id: 1575 // since this is always aligned we can (should!) use the same 1576 // stub as for case arrayof_jlong_disjoint_arraycopy 1577 ShouldNotReachHere(); 1578 break; 1579 case arrayof_jlong_disjoint_arraycopy_id: 1580 size = sizeof(jlong); 1581 aligned = true; 1582 is_oop = false; 1583 dest_uninitialized = false; 1584 break; 1585 case oop_disjoint_arraycopy_id: 1586 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1587 aligned = !UseCompressedOops; 1588 is_oop = true; 1589 dest_uninitialized = false; 1590 break; 1591 case arrayof_oop_disjoint_arraycopy_id: 1592 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1593 aligned = !UseCompressedOops; 1594 is_oop = true; 1595 dest_uninitialized = false; 1596 break; 1597 case oop_disjoint_arraycopy_uninit_id: 1598 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    case arrayof_oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
      break;
    }

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   stub_id - is used to name the stub and identify all details of
  //             how to perform the copy.
  //
  //   nooverlap_target - identifies the (post push) entry for the
  //             corresponding disjoint copy routine which can be
  //             jumped to if the ranges do not actually overlap
  //
  //   entry - is assigned to the stub's post push entry point unless
  //           it is null
  //
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it. The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1681 // 1682 // Side Effects: 1683 // entry is set to the no-overlap entry point so it can be used by 1684 // some other conjoint copy method 1685 // 1686 address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 1687 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1688 RegSet saved_regs = RegSet::of(s, d, count); 1689 int size; 1690 bool aligned; 1691 bool is_oop; 1692 bool dest_uninitialized; 1693 switch (stub_id) { 1694 case jbyte_arraycopy_id: 1695 size = sizeof(jbyte); 1696 aligned = false; 1697 is_oop = false; 1698 dest_uninitialized = false; 1699 break; 1700 case arrayof_jbyte_arraycopy_id: 1701 size = sizeof(jbyte); 1702 aligned = true; 1703 is_oop = false; 1704 dest_uninitialized = false; 1705 break; 1706 case jshort_arraycopy_id: 1707 size = sizeof(jshort); 1708 aligned = false; 1709 is_oop = false; 1710 dest_uninitialized = false; 1711 break; 1712 case arrayof_jshort_arraycopy_id: 1713 size = sizeof(jshort); 1714 aligned = true; 1715 is_oop = false; 1716 dest_uninitialized = false; 1717 break; 1718 case jint_arraycopy_id: 1719 size = sizeof(jint); 1720 aligned = false; 1721 is_oop = false; 1722 dest_uninitialized = false; 1723 break; 1724 case arrayof_jint_arraycopy_id: 1725 size = sizeof(jint); 1726 aligned = true; 1727 is_oop = false; 1728 dest_uninitialized = false; 1729 break; 1730 case jlong_arraycopy_id: 1731 // since this is always aligned we can (should!) use the same 1732 // stub as for case arrayof_jlong_disjoint_arraycopy 1733 ShouldNotReachHere(); 1734 break; 1735 case arrayof_jlong_arraycopy_id: 1736 size = sizeof(jlong); 1737 aligned = true; 1738 is_oop = false; 1739 dest_uninitialized = false; 1740 break; 1741 case oop_arraycopy_id: 1742 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1743 aligned = !UseCompressedOops; 1744 is_oop = true; 1745 dest_uninitialized = false; 1746 break; 1747 case arrayof_oop_arraycopy_id: 1748 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1749 aligned = !UseCompressedOops; 1750 is_oop = true; 1751 dest_uninitialized = false; 1752 break; 1753 case oop_arraycopy_uninit_id: 1754 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1755 aligned = !UseCompressedOops; 1756 is_oop = true; 1757 dest_uninitialized = true; 1758 break; 1759 case arrayof_oop_arraycopy_uninit_id: 1760 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1761 aligned = !UseCompressedOops; 1762 is_oop = true; 1763 dest_uninitialized = true; 1764 break; 1765 default: 1766 ShouldNotReachHere(); 1767 } 1768 1769 StubCodeMark mark(this, stub_id); 1770 address start = __ pc(); 1771 __ enter(); 1772 1773 if (entry != nullptr) { 1774 *entry = __ pc(); 1775 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1776 BLOCK_COMMENT("Entry:"); 1777 } 1778 1779 // use fwd copy when (d-s) above_equal (count*size) 1780 __ sub(rscratch1, d, s); 1781 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1782 __ br(Assembler::HS, nooverlap_target); 1783 1784 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1785 if (dest_uninitialized) { 1786 decorators |= IS_DEST_UNINITIALIZED; 1787 } 1788 if (aligned) { 1789 decorators |= ARRAYCOPY_ALIGNED; 1790 } 1791 1792 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1793 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1794 1795 if (is_oop) { 1796 // save regs before copy_memory 1797 __ push(RegSet::of(d, count), sp); 1798 } 1799 { 1800 // UnsafeMemoryAccess page error: continue after unsafe access 1801 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1802 UnsafeMemoryAccessMark umam(this, add_entry, true); 1803 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1804 } 1805 if (is_oop) { 1806 __ pop(RegSet::of(d, count), sp); 1807 if (VerifyOops) 1808 verify_oop_array(size, d, count, r16); 1809 } 1810 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1811 __ leave(); 1812 __ mov(r0, zr); // return 0 1813 __ ret(lr); 1814 return start; 1815 } 1816 1817 // Helper for generating a dynamic type check. 1818 // Smashes rscratch1, rscratch2. 1819 void generate_type_check(Register sub_klass, 1820 Register super_check_offset, 1821 Register super_klass, 1822 Register temp1, 1823 Register temp2, 1824 Register result, 1825 Label& L_success) { 1826 assert_different_registers(sub_klass, super_check_offset, super_klass); 1827 1828 BLOCK_COMMENT("type_check:"); 1829 1830 Label L_miss; 1831 1832 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1833 super_check_offset); 1834 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1835 1836 // Fall through on failure! 
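// (Roughly, assuming the usual HotSpot subtype-check scheme: the fast
//  path above compares sub_klass against super_klass and against the
//  word at super_check_offset; when that slot is only the secondary-
//  supers cache, the slow path scans the secondary supers array.
//  Only if both fail do we fall through to L_miss below.)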
1837 __ BIND(L_miss); 1838 } 1839 1840 // 1841 // Generate checkcasting array copy stub 1842 // 1843 // Input: 1844 // c_rarg0 - source array address 1845 // c_rarg1 - destination array address 1846 // c_rarg2 - element count, treated as ssize_t, can be zero 1847 // c_rarg3 - size_t ckoff (super_check_offset) 1848 // c_rarg4 - oop ckval (super_klass) 1849 // 1850 // Output: 1851 // r0 == 0 - success 1852 // r0 == -1^K - failure, where K is partial transfer count 1853 // 1854 address generate_checkcast_copy(StubGenStubId stub_id, address *entry) { 1855 bool dest_uninitialized; 1856 switch (stub_id) { 1857 case checkcast_arraycopy_id: 1858 dest_uninitialized = false; 1859 break; 1860 case checkcast_arraycopy_uninit_id: 1861 dest_uninitialized = true; 1862 break; 1863 default: 1864 ShouldNotReachHere(); 1865 } 1866 1867 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1868 1869 // Input registers (after setup_arg_regs) 1870 const Register from = c_rarg0; // source array address 1871 const Register to = c_rarg1; // destination array address 1872 const Register count = c_rarg2; // elementscount 1873 const Register ckoff = c_rarg3; // super_check_offset 1874 const Register ckval = c_rarg4; // super_klass 1875 1876 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1877 RegSet wb_post_saved_regs = RegSet::of(count); 1878 1879 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1880 const Register copied_oop = r22; // actual oop copied 1881 const Register count_save = r21; // orig elementscount 1882 const Register start_to = r20; // destination array start address 1883 const Register r19_klass = r19; // oop._klass 1884 1885 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1886 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1887 1888 //--------------------------------------------------------------- 1889 // Assembler stub will be used for this call to arraycopy 1890 // if the two arrays are subtypes of Object[] but the 1891 // destination array type is not equal to or a supertype 1892 // of the source type. Each element must be separately 1893 // checked. 1894 1895 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1896 copied_oop, r19_klass, count_save); 1897 1898 __ align(CodeEntryAlignment); 1899 StubCodeMark mark(this, stub_id); 1900 address start = __ pc(); 1901 1902 __ enter(); // required for proper stackwalking of RuntimeStub frame 1903 1904 #ifdef ASSERT 1905 // caller guarantees that the arrays really are different 1906 // otherwise, we would have to make conjoint checks 1907 { Label L; 1908 __ b(L); // conjoint check not yet implemented 1909 __ stop("checkcast_copy within a single array"); 1910 __ bind(L); 1911 } 1912 #endif //ASSERT 1913 1914 // Caller of this entry point must set up the argument registers. 1915 if (entry != nullptr) { 1916 *entry = __ pc(); 1917 BLOCK_COMMENT("Entry:"); 1918 } 1919 1920 // Empty array: Nothing to do. 1921 __ cbz(count, L_done); 1922 __ push(RegSet::of(r19, r20, r21, r22), sp); 1923 1924 #ifdef ASSERT 1925 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1926 // The ckoff and ckval must be mutually consistent, 1927 // even though caller generates both. 
1928 { Label L; 1929 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1930 __ ldrw(start_to, Address(ckval, sco_offset)); 1931 __ cmpw(ckoff, start_to); 1932 __ br(Assembler::EQ, L); 1933 __ stop("super_check_offset inconsistent"); 1934 __ bind(L); 1935 } 1936 #endif //ASSERT 1937 1938 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1939 bool is_oop = true; 1940 int element_size = UseCompressedOops ? 4 : 8; 1941 if (dest_uninitialized) { 1942 decorators |= IS_DEST_UNINITIALIZED; 1943 } 1944 1945 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1946 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1947 1948 // save the original count 1949 __ mov(count_save, count); 1950 1951 // Copy from low to high addresses 1952 __ mov(start_to, to); // Save destination array start address 1953 __ b(L_load_element); 1954 1955 // ======== begin loop ======== 1956 // (Loop is rotated; its entry is L_load_element.) 1957 // Loop control: 1958 // for (; count != 0; count--) { 1959 // copied_oop = load_heap_oop(from++); 1960 // ... generate_type_check ...; 1961 // store_heap_oop(to++, copied_oop); 1962 // } 1963 __ align(OptoLoopAlignment); 1964 1965 __ BIND(L_store_element); 1966 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1967 __ post(to, element_size), copied_oop, noreg, 1968 gct1, gct2, gct3); 1969 __ sub(count, count, 1); 1970 __ cbz(count, L_do_card_marks); 1971 1972 // ======== loop entry is here ======== 1973 __ BIND(L_load_element); 1974 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1975 copied_oop, noreg, __ post(from, element_size), 1976 gct1); 1977 __ cbz(copied_oop, L_store_element); 1978 1979 __ load_klass(r19_klass, copied_oop);// query the object klass 1980 1981 BLOCK_COMMENT("type_check:"); 1982 generate_type_check(/*sub_klass*/r19_klass, 1983 /*super_check_offset*/ckoff, 1984 /*super_klass*/ckval, 1985 /*r_array_base*/gct1, 1986 /*temp2*/gct2, 1987 /*result*/r10, L_store_element); 1988 1989 // Fall through on failure! 1990 1991 // ======== end loop ======== 1992 1993 // It was a real error; we must depend on the caller to finish the job. 1994 // Register count = remaining oops, count_orig = total oops. 1995 // Emit GC store barriers for the oops we have copied and report 1996 // their number to the caller. 1997 1998 __ subs(count, count_save, count); // K = partially copied oop count 1999 __ eon(count, count, zr); // report (-1^K) to caller 2000 __ br(Assembler::EQ, L_done_pop); 2001 2002 __ BIND(L_do_card_marks); 2003 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2004 2005 __ bind(L_done_pop); 2006 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2007 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2008 2009 __ bind(L_done); 2010 __ mov(r0, count); 2011 __ leave(); 2012 __ ret(lr); 2013 2014 return start; 2015 } 2016 2017 // Perform range checks on the proposed arraycopy. 2018 // Kills temp, but nothing else. 2019 // Also, clean the sign bits of src_pos and dst_pos. 
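//
// Roughly, in C-like form (illustrative only; length() stands for the
// arrayOop length field loaded below):
//
//   if ((uint32_t)(src_pos + length) > (uint32_t)src->length()) goto L_failed;
//   if ((uint32_t)(dst_pos + length) > (uint32_t)dst->length()) goto L_failed;
//   src_pos = (uint32_t)src_pos;   // 32-bit moves clear the high words
//   dst_pos = (uint32_t)dst_pos;
//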
2020 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2021 Register src_pos, // source position (c_rarg1) 2022 Register dst, // destination array oo (c_rarg2) 2023 Register dst_pos, // destination position (c_rarg3) 2024 Register length, 2025 Register temp, 2026 Label& L_failed) { 2027 BLOCK_COMMENT("arraycopy_range_checks:"); 2028 2029 assert_different_registers(rscratch1, temp); 2030 2031 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2032 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 2033 __ addw(temp, length, src_pos); 2034 __ cmpw(temp, rscratch1); 2035 __ br(Assembler::HI, L_failed); 2036 2037 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2038 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2039 __ addw(temp, length, dst_pos); 2040 __ cmpw(temp, rscratch1); 2041 __ br(Assembler::HI, L_failed); 2042 2043 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 2044 __ movw(src_pos, src_pos); 2045 __ movw(dst_pos, dst_pos); 2046 2047 BLOCK_COMMENT("arraycopy_range_checks done"); 2048 } 2049 2050 // These stubs get called from some dumb test routine. 2051 // I'll write them properly when they're called from 2052 // something that's actually doing something. 2053 static void fake_arraycopy_stub(address src, address dst, int count) { 2054 assert(count == 0, "huh?"); 2055 } 2056 2057 2058 // 2059 // Generate 'unsafe' array copy stub 2060 // Though just as safe as the other stubs, it takes an unscaled 2061 // size_t argument instead of an element count. 2062 // 2063 // Input: 2064 // c_rarg0 - source array address 2065 // c_rarg1 - destination array address 2066 // c_rarg2 - byte count, treated as ssize_t, can be zero 2067 // 2068 // Examines the alignment of the operands and dispatches 2069 // to a long, int, short, or byte copy loop. 
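//
// The dispatch below is equivalent to this illustrative C (not the
// generated code itself):
//
//   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
//   if      ((bits & (BytesPerLong - 1)) == 0) long_copy (count >> LogBytesPerLong);
//   else if ((bits & (BytesPerInt  - 1)) == 0) int_copy  (count >> LogBytesPerInt);
//   else if ((bits & 1) == 0)                  short_copy(count >> LogBytesPerShort);
//   else                                       byte_copy (count);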
2070 // 2071 address generate_unsafe_copy(address byte_copy_entry, 2072 address short_copy_entry, 2073 address int_copy_entry, 2074 address long_copy_entry) { 2075 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2076 2077 Label L_long_aligned, L_int_aligned, L_short_aligned; 2078 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2079 2080 __ align(CodeEntryAlignment); 2081 StubCodeMark mark(this, stub_id); 2082 address start = __ pc(); 2083 __ enter(); // required for proper stackwalking of RuntimeStub frame 2084 2085 // bump this on entry, not on exit: 2086 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2087 2088 __ orr(rscratch1, s, d); 2089 __ orr(rscratch1, rscratch1, count); 2090 2091 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2092 __ cbz(rscratch1, L_long_aligned); 2093 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2094 __ cbz(rscratch1, L_int_aligned); 2095 __ tbz(rscratch1, 0, L_short_aligned); 2096 __ b(RuntimeAddress(byte_copy_entry)); 2097 2098 __ BIND(L_short_aligned); 2099 __ lsr(count, count, LogBytesPerShort); // size => short_count 2100 __ b(RuntimeAddress(short_copy_entry)); 2101 __ BIND(L_int_aligned); 2102 __ lsr(count, count, LogBytesPerInt); // size => int_count 2103 __ b(RuntimeAddress(int_copy_entry)); 2104 __ BIND(L_long_aligned); 2105 __ lsr(count, count, LogBytesPerLong); // size => long_count 2106 __ b(RuntimeAddress(long_copy_entry)); 2107 2108 return start; 2109 } 2110 2111 // 2112 // Generate generic array copy stubs 2113 // 2114 // Input: 2115 // c_rarg0 - src oop 2116 // c_rarg1 - src_pos (32-bits) 2117 // c_rarg2 - dst oop 2118 // c_rarg3 - dst_pos (32-bits) 2119 // c_rarg4 - element count (32-bits) 2120 // 2121 // Output: 2122 // r0 == 0 - success 2123 // r0 == -1^K - failure, where K is partial transfer count 2124 // 2125 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2126 address int_copy_entry, address oop_copy_entry, 2127 address long_copy_entry, address checkcast_copy_entry) { 2128 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2129 2130 Label L_failed, L_objArray; 2131 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2132 2133 // Input registers 2134 const Register src = c_rarg0; // source array oop 2135 const Register src_pos = c_rarg1; // source position 2136 const Register dst = c_rarg2; // destination array oop 2137 const Register dst_pos = c_rarg3; // destination position 2138 const Register length = c_rarg4; 2139 2140 2141 // Registers used as temps 2142 const Register dst_klass = c_rarg5; 2143 2144 __ align(CodeEntryAlignment); 2145 2146 StubCodeMark mark(this, stub_id); 2147 2148 address start = __ pc(); 2149 2150 __ enter(); // required for proper stackwalking of RuntimeStub frame 2151 2152 // bump this on entry, not on exit: 2153 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2154 2155 //----------------------------------------------------------------------- 2156 // Assembler stub will be used for this call to arraycopy 2157 // if the following conditions are met: 2158 // 2159 // (1) src and dst must not be null. 2160 // (2) src_pos must not be negative. 2161 // (3) dst_pos must not be negative. 2162 // (4) length must not be negative. 2163 // (5) src klass and dst klass should be the same and not null. 2164 // (6) src and dst should be arrays. 2165 // (7) src_pos + length must not exceed length of src. 2166 // (8) dst_pos + length must not exceed length of dst. 
2167 // 2168 2169 // if (src == nullptr) return -1; 2170 __ cbz(src, L_failed); 2171 2172 // if (src_pos < 0) return -1; 2173 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2174 2175 // if (dst == nullptr) return -1; 2176 __ cbz(dst, L_failed); 2177 2178 // if (dst_pos < 0) return -1; 2179 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2180 2181 // registers used as temp 2182 const Register scratch_length = r16; // elements count to copy 2183 const Register scratch_src_klass = r17; // array klass 2184 const Register lh = r15; // layout helper 2185 2186 // if (length < 0) return -1; 2187 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2188 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2189 2190 __ load_klass(scratch_src_klass, src); 2191 #ifdef ASSERT 2192 // assert(src->klass() != nullptr); 2193 { 2194 BLOCK_COMMENT("assert klasses not null {"); 2195 Label L1, L2; 2196 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2197 __ bind(L1); 2198 __ stop("broken null klass"); 2199 __ bind(L2); 2200 __ load_klass(rscratch1, dst); 2201 __ cbz(rscratch1, L1); // this would be broken also 2202 BLOCK_COMMENT("} assert klasses not null done"); 2203 } 2204 #endif 2205 2206 // Load layout helper (32-bits) 2207 // 2208 // |array_tag| | header_size | element_type | |log2_element_size| 2209 // 32 30 24 16 8 2 0 2210 // 2211 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2212 // 2213 2214 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2215 2216 // Handle objArrays completely differently... 2217 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2218 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2219 __ movw(rscratch1, objArray_lh); 2220 __ eorw(rscratch2, lh, rscratch1); 2221 __ cbzw(rscratch2, L_objArray); 2222 2223 // if (src->klass() != dst->klass()) return -1; 2224 __ load_klass(rscratch2, dst); 2225 __ eor(rscratch2, rscratch2, scratch_src_klass); 2226 __ cbnz(rscratch2, L_failed); 2227 2228 // if (!src->is_Array()) return -1; 2229 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2230 2231 // At this point, it is known to be a typeArray (array_tag 0x3). 
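// (Illustrative decoding of the layout helper, matching the bit layout
//  sketched above; roughly what the code below extracts:)
//
//   int header_size = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
//   int log2_elsize =  lh & _lh_log2_element_size_mask;
//   src_addr = src + header_size + (src_pos << log2_elsize);
//   dst_addr = dst + header_size + (dst_pos << log2_elsize);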
2232 #ifdef ASSERT 2233 { 2234 BLOCK_COMMENT("assert primitive array {"); 2235 Label L; 2236 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2237 __ cmpw(lh, rscratch2); 2238 __ br(Assembler::GE, L); 2239 __ stop("must be a primitive array"); 2240 __ bind(L); 2241 BLOCK_COMMENT("} assert primitive array done"); 2242 } 2243 #endif 2244 2245 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2246 rscratch2, L_failed); 2247 2248 // TypeArrayKlass 2249 // 2250 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2251 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2252 // 2253 2254 const Register rscratch1_offset = rscratch1; // array offset 2255 const Register r15_elsize = lh; // element size 2256 2257 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2258 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2259 __ add(src, src, rscratch1_offset); // src array offset 2260 __ add(dst, dst, rscratch1_offset); // dst array offset 2261 BLOCK_COMMENT("choose copy loop based on element size"); 2262 2263 // next registers should be set before the jump to corresponding stub 2264 const Register from = c_rarg0; // source array address 2265 const Register to = c_rarg1; // destination array address 2266 const Register count = c_rarg2; // elements count 2267 2268 // 'from', 'to', 'count' registers should be set in such order 2269 // since they are the same as 'src', 'src_pos', 'dst'. 2270 2271 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2272 2273 // The possible values of elsize are 0-3, i.e. exact_log2(element 2274 // size in bytes). We do a simple bitwise binary search. 2275 __ BIND(L_copy_bytes); 2276 __ tbnz(r15_elsize, 1, L_copy_ints); 2277 __ tbnz(r15_elsize, 0, L_copy_shorts); 2278 __ lea(from, Address(src, src_pos));// src_addr 2279 __ lea(to, Address(dst, dst_pos));// dst_addr 2280 __ movw(count, scratch_length); // length 2281 __ b(RuntimeAddress(byte_copy_entry)); 2282 2283 __ BIND(L_copy_shorts); 2284 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2285 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2286 __ movw(count, scratch_length); // length 2287 __ b(RuntimeAddress(short_copy_entry)); 2288 2289 __ BIND(L_copy_ints); 2290 __ tbnz(r15_elsize, 0, L_copy_longs); 2291 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2292 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2293 __ movw(count, scratch_length); // length 2294 __ b(RuntimeAddress(int_copy_entry)); 2295 2296 __ BIND(L_copy_longs); 2297 #ifdef ASSERT 2298 { 2299 BLOCK_COMMENT("assert long copy {"); 2300 Label L; 2301 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2302 __ cmpw(r15_elsize, LogBytesPerLong); 2303 __ br(Assembler::EQ, L); 2304 __ stop("must be long copy, but elsize is wrong"); 2305 __ bind(L); 2306 BLOCK_COMMENT("} assert long copy done"); 2307 } 2308 #endif 2309 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2310 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2311 __ movw(count, scratch_length); // length 2312 __ b(RuntimeAddress(long_copy_entry)); 2313 2314 // ObjArrayKlass 2315 __ BIND(L_objArray); 2316 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2317 2318 Label L_plain_copy, L_checkcast_copy; 2319 // test array classes for subtyping 2320 __ load_klass(r15, dst); 2321 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2322 __ br(Assembler::NE, L_checkcast_copy); 2323 2324 // Identically typed arrays can be copied without element-wise checks. 2325 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2326 rscratch2, L_failed); 2327 2328 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2329 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2330 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2331 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2332 __ movw(count, scratch_length); // length 2333 __ BIND(L_plain_copy); 2334 __ b(RuntimeAddress(oop_copy_entry)); 2335 2336 __ BIND(L_checkcast_copy); 2337 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2338 { 2339 // Before looking at dst.length, make sure dst is also an objArray. 2340 __ ldrw(rscratch1, Address(r15, lh_offset)); 2341 __ movw(rscratch2, objArray_lh); 2342 __ eorw(rscratch1, rscratch1, rscratch2); 2343 __ cbnzw(rscratch1, L_failed); 2344 2345 // It is safe to examine both src.length and dst.length. 2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2347 r15, L_failed); 2348 2349 __ load_klass(dst_klass, dst); // reload 2350 2351 // Marshal the base address arguments now, freeing registers. 2352 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2353 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2354 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2355 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2356 __ movw(count, length); // length (reloaded) 2357 Register sco_temp = c_rarg3; // this register is free now 2358 assert_different_registers(from, to, count, sco_temp, 2359 dst_klass, scratch_src_klass); 2360 // assert_clean_int(count, sco_temp); 2361 2362 // Generate the type check. 2363 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2364 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2365 2366 // Smashes rscratch1, rscratch2 2367 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2368 L_plain_copy); 2369 2370 // Fetch destination element klass from the ObjArrayKlass header. 2371 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2372 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2373 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2374 2375 // the checkcast_copy loop needs two extra arguments: 2376 assert(c_rarg3 == sco_temp, "#3 already in place"); 2377 // Set up arguments for checkcast_copy_entry. 2378 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2379 __ b(RuntimeAddress(checkcast_copy_entry)); 2380 } 2381 2382 __ BIND(L_failed); 2383 __ mov(r0, -1); 2384 __ leave(); // required for proper stackwalking of RuntimeStub frame 2385 __ ret(lr); 2386 2387 return start; 2388 } 2389 2390 // 2391 // Generate stub for array fill. If "aligned" is true, the 2392 // "to" address is assumed to be heapword aligned. 
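// An informal outline of the strategy used below (illustrative, not a
// specification):
//
//   replicate value to 64 bits (bfi 8->16->32, then 32->64);
//   if (!aligned) store 1/2/4 leading bytes to reach 8-byte alignment;
//   fill whole 8-byte words via fill_words (or zero_words when the
//   value is zero and UseBlockZeroing is set);
//   for byte/short fills, finish with one possibly overlapping 8-byte
//   store at the end - safe because at least 8 bytes are being filled.
//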
2393 // 2394 // Arguments for generated stub: 2395 // to: c_rarg0 2396 // value: c_rarg1 2397 // count: c_rarg2 treated as signed 2398 // 2399 address generate_fill(StubGenStubId stub_id) { 2400 BasicType t; 2401 bool aligned; 2402 2403 switch (stub_id) { 2404 case jbyte_fill_id: 2405 t = T_BYTE; 2406 aligned = false; 2407 break; 2408 case jshort_fill_id: 2409 t = T_SHORT; 2410 aligned = false; 2411 break; 2412 case jint_fill_id: 2413 t = T_INT; 2414 aligned = false; 2415 break; 2416 case arrayof_jbyte_fill_id: 2417 t = T_BYTE; 2418 aligned = true; 2419 break; 2420 case arrayof_jshort_fill_id: 2421 t = T_SHORT; 2422 aligned = true; 2423 break; 2424 case arrayof_jint_fill_id: 2425 t = T_INT; 2426 aligned = true; 2427 break; 2428 default: 2429 ShouldNotReachHere(); 2430 }; 2431 2432 __ align(CodeEntryAlignment); 2433 StubCodeMark mark(this, stub_id); 2434 address start = __ pc(); 2435 2436 BLOCK_COMMENT("Entry:"); 2437 2438 const Register to = c_rarg0; // source array address 2439 const Register value = c_rarg1; // value 2440 const Register count = c_rarg2; // elements count 2441 2442 const Register bz_base = r10; // base for block_zero routine 2443 const Register cnt_words = r11; // temp register 2444 2445 __ enter(); 2446 2447 Label L_fill_elements, L_exit1; 2448 2449 int shift = -1; 2450 switch (t) { 2451 case T_BYTE: 2452 shift = 0; 2453 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2454 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2455 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2456 __ br(Assembler::LO, L_fill_elements); 2457 break; 2458 case T_SHORT: 2459 shift = 1; 2460 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2461 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2462 __ br(Assembler::LO, L_fill_elements); 2463 break; 2464 case T_INT: 2465 shift = 2; 2466 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2467 __ br(Assembler::LO, L_fill_elements); 2468 break; 2469 default: ShouldNotReachHere(); 2470 } 2471 2472 // Align source address at 8 bytes address boundary. 2473 Label L_skip_align1, L_skip_align2, L_skip_align4; 2474 if (!aligned) { 2475 switch (t) { 2476 case T_BYTE: 2477 // One byte misalignment happens only for byte arrays. 2478 __ tbz(to, 0, L_skip_align1); 2479 __ strb(value, Address(__ post(to, 1))); 2480 __ subw(count, count, 1); 2481 __ bind(L_skip_align1); 2482 // Fallthrough 2483 case T_SHORT: 2484 // Two bytes misalignment happens only for byte and short (char) arrays. 2485 __ tbz(to, 1, L_skip_align2); 2486 __ strh(value, Address(__ post(to, 2))); 2487 __ subw(count, count, 2 >> shift); 2488 __ bind(L_skip_align2); 2489 // Fallthrough 2490 case T_INT: 2491 // Align to 8 bytes, we know we are 4 byte aligned to start. 2492 __ tbz(to, 2, L_skip_align4); 2493 __ strw(value, Address(__ post(to, 4))); 2494 __ subw(count, count, 4 >> shift); 2495 __ bind(L_skip_align4); 2496 break; 2497 default: ShouldNotReachHere(); 2498 } 2499 } 2500 2501 // 2502 // Fill large chunks 2503 // 2504 __ lsrw(cnt_words, count, 3 - shift); // number of words 2505 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2506 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2507 if (UseBlockZeroing) { 2508 Label non_block_zeroing, rest; 2509 // If the fill value is zero we can use the fast zero_words(). 
2510 __ cbnz(value, non_block_zeroing); 2511 __ mov(bz_base, to); 2512 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2513 address tpc = __ zero_words(bz_base, cnt_words); 2514 if (tpc == nullptr) { 2515 fatal("CodeCache is full at generate_fill"); 2516 } 2517 __ b(rest); 2518 __ bind(non_block_zeroing); 2519 __ fill_words(to, cnt_words, value); 2520 __ bind(rest); 2521 } else { 2522 __ fill_words(to, cnt_words, value); 2523 } 2524 2525 // Remaining count is less than 8 bytes. Fill it by a single store. 2526 // Note that the total length is no less than 8 bytes. 2527 if (t == T_BYTE || t == T_SHORT) { 2528 Label L_exit1; 2529 __ cbzw(count, L_exit1); 2530 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2531 __ str(value, Address(to, -8)); // overwrite some elements 2532 __ bind(L_exit1); 2533 __ leave(); 2534 __ ret(lr); 2535 } 2536 2537 // Handle copies less than 8 bytes. 2538 Label L_fill_2, L_fill_4, L_exit2; 2539 __ bind(L_fill_elements); 2540 switch (t) { 2541 case T_BYTE: 2542 __ tbz(count, 0, L_fill_2); 2543 __ strb(value, Address(__ post(to, 1))); 2544 __ bind(L_fill_2); 2545 __ tbz(count, 1, L_fill_4); 2546 __ strh(value, Address(__ post(to, 2))); 2547 __ bind(L_fill_4); 2548 __ tbz(count, 2, L_exit2); 2549 __ strw(value, Address(to)); 2550 break; 2551 case T_SHORT: 2552 __ tbz(count, 0, L_fill_4); 2553 __ strh(value, Address(__ post(to, 2))); 2554 __ bind(L_fill_4); 2555 __ tbz(count, 1, L_exit2); 2556 __ strw(value, Address(to)); 2557 break; 2558 case T_INT: 2559 __ cbzw(count, L_exit2); 2560 __ strw(value, Address(to)); 2561 break; 2562 default: ShouldNotReachHere(); 2563 } 2564 __ bind(L_exit2); 2565 __ leave(); 2566 __ ret(lr); 2567 return start; 2568 } 2569 2570 address generate_unsafecopy_common_error_exit() { 2571 address start_pc = __ pc(); 2572 __ leave(); 2573 __ mov(r0, 0); 2574 __ ret(lr); 2575 return start_pc; 2576 } 2577 2578 // 2579 // Generate 'unsafe' set memory stub 2580 // Though just as safe as the other stubs, it takes an unscaled 2581 // size_t (# bytes) argument instead of an element count. 2582 // 2583 // This fill operation is atomicity preserving: as long as the 2584 // address supplied is sufficiently aligned, all writes of up to 64 2585 // bits in size are single-copy atomic. 
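// For reference, the head-alignment step below is roughly this
// illustrative C (not the generated code itself):
//
//   if (AvoidUnalignedAccesses && count >= 16) {
//     store16(dest, v);                      // one unaligned 16-byte store
//     size_t skip = 16 - ((uintptr_t)dest & 15);
//     dest += skip; count -= skip;           // main loop is now 16-aligned
//   }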
2586 // 2587 // Input: 2588 // c_rarg0 - destination array address 2589 // c_rarg1 - byte count (size_t) 2590 // c_rarg2 - byte value 2591 // 2592 address generate_unsafe_setmemory() { 2593 __ align(CodeEntryAlignment); 2594 StubCodeMark mark(this, StubGenStubId::unsafe_setmemory_id); 2595 address start = __ pc(); 2596 2597 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2; 2598 Label tail; 2599 2600 UnsafeMemoryAccessMark umam(this, true, false); 2601 2602 __ enter(); // required for proper stackwalking of RuntimeStub frame 2603 2604 __ dup(v0, __ T16B, value); 2605 2606 if (AvoidUnalignedAccesses) { 2607 __ cmp(count, (u1)16); 2608 __ br(__ LO, tail); 2609 2610 __ mov(rscratch1, 16); 2611 __ andr(rscratch2, dest, 15); 2612 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest 2613 __ strq(v0, Address(dest)); 2614 __ sub(count, count, rscratch1); 2615 __ add(dest, dest, rscratch1); 2616 } 2617 2618 __ subs(count, count, (u1)64); 2619 __ br(__ LO, tail); 2620 { 2621 Label again; 2622 __ bind(again); 2623 __ stpq(v0, v0, Address(dest)); 2624 __ stpq(v0, v0, Address(dest, 32)); 2625 2626 __ subs(count, count, 64); 2627 __ add(dest, dest, 64); 2628 __ br(__ HS, again); 2629 } 2630 2631 __ bind(tail); 2632 // The count of bytes is off by 64, but we don't need to correct 2633 // it because we're only going to use the least-significant few 2634 // count bits from here on. 2635 // __ add(count, count, 64); 2636 2637 { 2638 Label dont; 2639 __ tbz(count, exact_log2(32), dont); 2640 __ stpq(v0, v0, __ post(dest, 32)); 2641 __ bind(dont); 2642 } 2643 { 2644 Label dont; 2645 __ tbz(count, exact_log2(16), dont); 2646 __ strq(v0, __ post(dest, 16)); 2647 __ bind(dont); 2648 } 2649 { 2650 Label dont; 2651 __ tbz(count, exact_log2(8), dont); 2652 __ strd(v0, __ post(dest, 8)); 2653 __ bind(dont); 2654 } 2655 2656 Label finished; 2657 __ tst(count, 7); 2658 __ br(__ EQ, finished); 2659 2660 { 2661 Label dont; 2662 __ tbz(count, exact_log2(4), dont); 2663 __ strs(v0, __ post(dest, 4)); 2664 __ bind(dont); 2665 } 2666 { 2667 Label dont; 2668 __ tbz(count, exact_log2(2), dont); 2669 __ bfi(value, value, 8, 8); 2670 __ strh(value, __ post(dest, 2)); 2671 __ bind(dont); 2672 } 2673 { 2674 Label dont; 2675 __ tbz(count, exact_log2(1), dont); 2676 __ strb(value, Address(dest)); 2677 __ bind(dont); 2678 } 2679 2680 __ bind(finished); 2681 __ leave(); 2682 __ ret(lr); 2683 2684 return start; 2685 } 2686 2687 address generate_data_cache_writeback() { 2688 const Register line = c_rarg0; // address of line to write back 2689 2690 __ align(CodeEntryAlignment); 2691 2692 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id; 2693 StubCodeMark mark(this, stub_id); 2694 2695 address start = __ pc(); 2696 __ enter(); 2697 __ cache_wb(Address(line, 0)); 2698 __ leave(); 2699 __ ret(lr); 2700 2701 return start; 2702 } 2703 2704 address generate_data_cache_writeback_sync() { 2705 const Register is_pre = c_rarg0; // pre or post sync 2706 2707 __ align(CodeEntryAlignment); 2708 2709 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id; 2710 StubCodeMark mark(this, stub_id); 2711 2712 // pre wbsync is a no-op 2713 // post wbsync translates to an sfence 2714 2715 Label skip; 2716 address start = __ pc(); 2717 __ enter(); 2718 __ cbnz(is_pre, skip); 2719 __ cache_wbsync(false); 2720 __ bind(skip); 2721 __ leave(); 2722 __ ret(lr); 2723 2724 return start; 2725 } 2726 2727 void generate_arraycopy_stubs() { 2728 address entry; 2729 address entry_jbyte_arraycopy; 2730 address 
entry_jshort_arraycopy; 2731 address entry_jint_arraycopy; 2732 address entry_oop_arraycopy; 2733 address entry_jlong_arraycopy; 2734 address entry_checkcast_arraycopy; 2735 2736 address ucm_common_error_exit = generate_unsafecopy_common_error_exit(); 2737 UnsafeMemoryAccess::set_common_exit_stub_pc(ucm_common_error_exit); 2738 2739 generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2740 generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2741 2742 generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2743 generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2744 2745 generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 2746 generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2747 2748 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2749 2750 //*** jbyte 2751 // Always need aligned and unaligned versions 2752 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry); 2753 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2754 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry); 2755 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr); 2756 2757 //*** jshort 2758 // Always need aligned and unaligned versions 2759 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry); 2760 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2761 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry); 2762 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr); 2763 2764 //*** jint 2765 // Aligned versions 2766 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry); 2767 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2768 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2769 // entry_jint_arraycopy always points to the unaligned version 2770 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry); 2771 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy); 2772 2773 //*** jlong 2774 // It is always aligned 2775 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry); 2776 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2777 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2778 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2779 2780 //*** oops 2781 { 2782 // With compressed oops we need unaligned versions; notice that 2783 // we overwrite entry_oop_arraycopy. 2784 bool aligned = !UseCompressedOops; 2785 2786 StubRoutines::_arrayof_oop_disjoint_arraycopy 2787 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry); 2788 StubRoutines::_arrayof_oop_arraycopy 2789 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2790 // Aligned versions without pre-barriers 2791 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2792 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2793 StubRoutines::_arrayof_oop_arraycopy_uninit 2794 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2795 } 2796 2797 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2798 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2799 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2800 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2801 2802 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2803 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr); 2804 2805 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2806 entry_jshort_arraycopy, 2807 entry_jint_arraycopy, 2808 entry_jlong_arraycopy); 2809 2810 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2811 entry_jshort_arraycopy, 2812 entry_jint_arraycopy, 2813 entry_oop_arraycopy, 2814 entry_jlong_arraycopy, 2815 entry_checkcast_arraycopy); 2816 2817 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 2818 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 2819 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 2820 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 2821 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 2822 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 2823 } 2824 2825 void generate_math_stubs() { Unimplemented(); } 2826 2827 // Arguments: 2828 // 2829 // Inputs: 2830 // c_rarg0 - source byte array address 2831 // c_rarg1 - destination byte array address 2832 // c_rarg2 - K (key) in little endian int array 2833 // 2834 address generate_aescrypt_encryptBlock() { 2835 __ align(CodeEntryAlignment); 2836 
StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2837 StubCodeMark mark(this, stub_id); 2838 2839 const Register from = c_rarg0; // source array address 2840 const Register to = c_rarg1; // destination array address 2841 const Register key = c_rarg2; // key array address 2842 const Register keylen = rscratch1; 2843 2844 address start = __ pc(); 2845 __ enter(); 2846 2847 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2848 2849 __ aesenc_loadkeys(key, keylen); 2850 __ aesecb_encrypt(from, to, keylen); 2851 2852 __ mov(r0, 0); 2853 2854 __ leave(); 2855 __ ret(lr); 2856 2857 return start; 2858 } 2859 2860 // Arguments: 2861 // 2862 // Inputs: 2863 // c_rarg0 - source byte array address 2864 // c_rarg1 - destination byte array address 2865 // c_rarg2 - K (key) in little endian int array 2866 // 2867 address generate_aescrypt_decryptBlock() { 2868 assert(UseAES, "need AES cryptographic extension support"); 2869 __ align(CodeEntryAlignment); 2870 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2871 StubCodeMark mark(this, stub_id); 2872 Label L_doLast; 2873 2874 const Register from = c_rarg0; // source array address 2875 const Register to = c_rarg1; // destination array address 2876 const Register key = c_rarg2; // key array address 2877 const Register keylen = rscratch1; 2878 2879 address start = __ pc(); 2880 __ enter(); // required for proper stackwalking of RuntimeStub frame 2881 2882 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2883 2884 __ aesecb_decrypt(from, to, key, keylen); 2885 2886 __ mov(r0, 0); 2887 2888 __ leave(); 2889 __ ret(lr); 2890 2891 return start; 2892 } 2893 2894 // Arguments: 2895 // 2896 // Inputs: 2897 // c_rarg0 - source byte array address 2898 // c_rarg1 - destination byte array address 2899 // c_rarg2 - K (key) in little endian int array 2900 // c_rarg3 - r vector byte array address 2901 // c_rarg4 - input length 2902 // 2903 // Output: 2904 // x0 - input length 2905 // 2906 address generate_cipherBlockChaining_encryptAESCrypt() { 2907 assert(UseAES, "need AES cryptographic extension support"); 2908 __ align(CodeEntryAlignment); 2909 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id; 2910 StubCodeMark mark(this, stub_id); 2911 2912 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2913 2914 const Register from = c_rarg0; // source array address 2915 const Register to = c_rarg1; // destination array address 2916 const Register key = c_rarg2; // key array address 2917 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2918 // and left with the results of the last encryption block 2919 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2920 const Register keylen = rscratch1; 2921 2922 address start = __ pc(); 2923 2924 __ enter(); 2925 2926 __ movw(rscratch2, len_reg); 2927 2928 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2929 2930 __ ld1(v0, __ T16B, rvec); 2931 2932 __ cmpw(keylen, 52); 2933 __ br(Assembler::CC, L_loadkeys_44); 2934 __ br(Assembler::EQ, L_loadkeys_52); 2935 2936 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2937 __ rev32(v17, __ T16B, v17); 2938 __ rev32(v18, __ T16B, v18); 2939 __ BIND(L_loadkeys_52); 2940 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2941 __ rev32(v19, __ T16B, v19); 2942 __ 
rev32(v20, __ T16B, v20); 2943 __ BIND(L_loadkeys_44); 2944 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2945 __ rev32(v21, __ T16B, v21); 2946 __ rev32(v22, __ T16B, v22); 2947 __ rev32(v23, __ T16B, v23); 2948 __ rev32(v24, __ T16B, v24); 2949 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2950 __ rev32(v25, __ T16B, v25); 2951 __ rev32(v26, __ T16B, v26); 2952 __ rev32(v27, __ T16B, v27); 2953 __ rev32(v28, __ T16B, v28); 2954 __ ld1(v29, v30, v31, __ T16B, key); 2955 __ rev32(v29, __ T16B, v29); 2956 __ rev32(v30, __ T16B, v30); 2957 __ rev32(v31, __ T16B, v31); 2958 2959 __ BIND(L_aes_loop); 2960 __ ld1(v1, __ T16B, __ post(from, 16)); 2961 __ eor(v0, __ T16B, v0, v1); 2962 2963 __ br(Assembler::CC, L_rounds_44); 2964 __ br(Assembler::EQ, L_rounds_52); 2965 2966 __ aese(v0, v17); __ aesmc(v0, v0); 2967 __ aese(v0, v18); __ aesmc(v0, v0); 2968 __ BIND(L_rounds_52); 2969 __ aese(v0, v19); __ aesmc(v0, v0); 2970 __ aese(v0, v20); __ aesmc(v0, v0); 2971 __ BIND(L_rounds_44); 2972 __ aese(v0, v21); __ aesmc(v0, v0); 2973 __ aese(v0, v22); __ aesmc(v0, v0); 2974 __ aese(v0, v23); __ aesmc(v0, v0); 2975 __ aese(v0, v24); __ aesmc(v0, v0); 2976 __ aese(v0, v25); __ aesmc(v0, v0); 2977 __ aese(v0, v26); __ aesmc(v0, v0); 2978 __ aese(v0, v27); __ aesmc(v0, v0); 2979 __ aese(v0, v28); __ aesmc(v0, v0); 2980 __ aese(v0, v29); __ aesmc(v0, v0); 2981 __ aese(v0, v30); 2982 __ eor(v0, __ T16B, v0, v31); 2983 2984 __ st1(v0, __ T16B, __ post(to, 16)); 2985 2986 __ subw(len_reg, len_reg, 16); 2987 __ cbnzw(len_reg, L_aes_loop); 2988 2989 __ st1(v0, __ T16B, rvec); 2990 2991 __ mov(r0, rscratch2); 2992 2993 __ leave(); 2994 __ ret(lr); 2995 2996 return start; 2997 } 2998 2999 // Arguments: 3000 // 3001 // Inputs: 3002 // c_rarg0 - source byte array address 3003 // c_rarg1 - destination byte array address 3004 // c_rarg2 - K (key) in little endian int array 3005 // c_rarg3 - r vector byte array address 3006 // c_rarg4 - input length 3007 // 3008 // Output: 3009 // r0 - input length 3010 // 3011 address generate_cipherBlockChaining_decryptAESCrypt() { 3012 assert(UseAES, "need AES cryptographic extension support"); 3013 __ align(CodeEntryAlignment); 3014 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id; 3015 StubCodeMark mark(this, stub_id); 3016 3017 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 3018 3019 const Register from = c_rarg0; // source array address 3020 const Register to = c_rarg1; // destination array address 3021 const Register key = c_rarg2; // key array address 3022 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3023 // and left with the results of the last encryption block 3024 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3025 const Register keylen = rscratch1; 3026 3027 address start = __ pc(); 3028 3029 __ enter(); 3030 3031 __ movw(rscratch2, len_reg); 3032 3033 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3034 3035 __ ld1(v2, __ T16B, rvec); 3036 3037 __ ld1(v31, __ T16B, __ post(key, 16)); 3038 __ rev32(v31, __ T16B, v31); 3039 3040 __ cmpw(keylen, 52); 3041 __ br(Assembler::CC, L_loadkeys_44); 3042 __ br(Assembler::EQ, L_loadkeys_52); 3043 3044 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 3045 __ rev32(v17, __ T16B, v17); 3046 __ rev32(v18, __ T16B, v18); 3047 __ BIND(L_loadkeys_52); 3048 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 3049 __ rev32(v19, __ T16B, 
v19); 3050 __ rev32(v20, __ T16B, v20); 3051 __ BIND(L_loadkeys_44); 3052 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 3053 __ rev32(v21, __ T16B, v21); 3054 __ rev32(v22, __ T16B, v22); 3055 __ rev32(v23, __ T16B, v23); 3056 __ rev32(v24, __ T16B, v24); 3057 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 3058 __ rev32(v25, __ T16B, v25); 3059 __ rev32(v26, __ T16B, v26); 3060 __ rev32(v27, __ T16B, v27); 3061 __ rev32(v28, __ T16B, v28); 3062 __ ld1(v29, v30, __ T16B, key); 3063 __ rev32(v29, __ T16B, v29); 3064 __ rev32(v30, __ T16B, v30); 3065 3066 __ BIND(L_aes_loop); 3067 __ ld1(v0, __ T16B, __ post(from, 16)); 3068 __ orr(v1, __ T16B, v0, v0); 3069 3070 __ br(Assembler::CC, L_rounds_44); 3071 __ br(Assembler::EQ, L_rounds_52); 3072 3073 __ aesd(v0, v17); __ aesimc(v0, v0); 3074 __ aesd(v0, v18); __ aesimc(v0, v0); 3075 __ BIND(L_rounds_52); 3076 __ aesd(v0, v19); __ aesimc(v0, v0); 3077 __ aesd(v0, v20); __ aesimc(v0, v0); 3078 __ BIND(L_rounds_44); 3079 __ aesd(v0, v21); __ aesimc(v0, v0); 3080 __ aesd(v0, v22); __ aesimc(v0, v0); 3081 __ aesd(v0, v23); __ aesimc(v0, v0); 3082 __ aesd(v0, v24); __ aesimc(v0, v0); 3083 __ aesd(v0, v25); __ aesimc(v0, v0); 3084 __ aesd(v0, v26); __ aesimc(v0, v0); 3085 __ aesd(v0, v27); __ aesimc(v0, v0); 3086 __ aesd(v0, v28); __ aesimc(v0, v0); 3087 __ aesd(v0, v29); __ aesimc(v0, v0); 3088 __ aesd(v0, v30); 3089 __ eor(v0, __ T16B, v0, v31); 3090 __ eor(v0, __ T16B, v0, v2); 3091 3092 __ st1(v0, __ T16B, __ post(to, 16)); 3093 __ orr(v2, __ T16B, v1, v1); 3094 3095 __ subw(len_reg, len_reg, 16); 3096 __ cbnzw(len_reg, L_aes_loop); 3097 3098 __ st1(v2, __ T16B, rvec); 3099 3100 __ mov(r0, rscratch2); 3101 3102 __ leave(); 3103 __ ret(lr); 3104 3105 return start; 3106 } 3107 3108 // Big-endian 128-bit + 64-bit -> 128-bit addition. 3109 // Inputs: 128-bits. in is preserved. 3110 // The least-significant 64-bit word is in the upper dword of each vector. 3111 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 3112 // Output: result 3113 void be_add_128_64(FloatRegister result, FloatRegister in, 3114 FloatRegister inc, FloatRegister tmp) { 3115 assert_different_registers(result, tmp, inc); 3116 3117 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 3118 // input 3119 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3120 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3121 // MSD == 0 (must be!) to LSD 3122 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3123 } 3124 3125 // CTR AES crypt. 
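// (For orientation: standard CTR mode, i.e. each 16-byte block is
//  C[i] = P[i] ^ E_K(counter_i), with the 128-bit counter incremented
//  big-endian by one per block using be_add_128_64 above.)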
3126 // Arguments: 3127 // 3128 // Inputs: 3129 // c_rarg0 - source byte array address 3130 // c_rarg1 - destination byte array address 3131 // c_rarg2 - K (key) in little endian int array 3132 // c_rarg3 - counter vector byte array address 3133 // c_rarg4 - input length 3134 // c_rarg5 - saved encryptedCounter start 3135 // c_rarg6 - saved used length 3136 // 3137 // Output: 3138 // r0 - input length 3139 // 3140 address generate_counterMode_AESCrypt() { 3141 const Register in = c_rarg0; 3142 const Register out = c_rarg1; 3143 const Register key = c_rarg2; 3144 const Register counter = c_rarg3; 3145 const Register saved_len = c_rarg4, len = r10; 3146 const Register saved_encrypted_ctr = c_rarg5; 3147 const Register used_ptr = c_rarg6, used = r12; 3148 3149 const Register offset = r7; 3150 const Register keylen = r11; 3151 3152 const unsigned char block_size = 16; 3153 const int bulk_width = 4; 3154 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3155 // performance with larger data sizes, but it also means that the 3156 // fast path isn't used until you have at least 8 blocks, and up 3157 // to 127 bytes of data will be executed on the slow path. For 3158 // that reason, and also so as not to blow away too much icache, 4 3159 // blocks seems like a sensible compromise. 3160 3161 // Algorithm: 3162 // 3163 // if (len == 0) { 3164 // goto DONE; 3165 // } 3166 // int result = len; 3167 // do { 3168 // if (used >= blockSize) { 3169 // if (len >= bulk_width * blockSize) { 3170 // CTR_large_block(); 3171 // if (len == 0) 3172 // goto DONE; 3173 // } 3174 // for (;;) { 3175 // 16ByteVector v0 = counter; 3176 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3177 // used = 0; 3178 // if (len < blockSize) 3179 // break; /* goto NEXT */ 3180 // 16ByteVector v1 = load16Bytes(in, offset); 3181 // v1 = v1 ^ encryptedCounter; 3182 // store16Bytes(out, offset); 3183 // used = blockSize; 3184 // offset += blockSize; 3185 // len -= blockSize; 3186 // if (len == 0) 3187 // goto DONE; 3188 // } 3189 // } 3190 // NEXT: 3191 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3192 // len--; 3193 // } while (len != 0); 3194 // DONE: 3195 // return result; 3196 // 3197 // CTR_large_block() 3198 // Wide bulk encryption of whole blocks. 
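//
// The 128-bit big-endian counter increment used throughout (see
// be_add_128_64 above) behaves like this illustrative C on two 64-bit
// halves:
//
//   uint64_t lo = counter_lo + inc;          // add to the low half
//   uint64_t hi = counter_hi + (lo < inc);   // propagate the carry
//
// The bulk path (CTR_large_block) only consumes whole groups of
// bulk_width blocks; any remainder is finished by the per-block loop.
//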
3199 3200 __ align(CodeEntryAlignment); 3201 StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id; 3202 StubCodeMark mark(this, stub_id); 3203 const address start = __ pc(); 3204 __ enter(); 3205 3206 Label DONE, CTR_large_block, large_block_return; 3207 __ ldrw(used, Address(used_ptr)); 3208 __ cbzw(saved_len, DONE); 3209 3210 __ mov(len, saved_len); 3211 __ mov(offset, 0); 3212 3213 // Compute #rounds for AES based on the length of the key array 3214 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3215 3216 __ aesenc_loadkeys(key, keylen); 3217 3218 { 3219 Label L_CTR_loop, NEXT; 3220 3221 __ bind(L_CTR_loop); 3222 3223 __ cmp(used, block_size); 3224 __ br(__ LO, NEXT); 3225 3226 // Maybe we have a lot of data 3227 __ subsw(rscratch1, len, bulk_width * block_size); 3228 __ br(__ HS, CTR_large_block); 3229 __ BIND(large_block_return); 3230 __ cbzw(len, DONE); 3231 3232 // Setup the counter 3233 __ movi(v4, __ T4S, 0); 3234 __ movi(v5, __ T4S, 1); 3235 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3236 3237 // 128-bit big-endian increment 3238 __ ld1(v0, __ T16B, counter); 3239 __ rev64(v16, __ T16B, v0); 3240 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3241 __ rev64(v16, __ T16B, v16); 3242 __ st1(v16, __ T16B, counter); 3243 // Previous counter value is in v0 3244 // v4 contains { 0, 1 } 3245 3246 { 3247 // We have fewer than bulk_width blocks of data left. Encrypt 3248 // them one by one until there is less than a full block 3249 // remaining, being careful to save both the encrypted counter 3250 // and the counter. 3251 3252 Label inner_loop; 3253 __ bind(inner_loop); 3254 // Counter to encrypt is in v0 3255 __ aesecb_encrypt(noreg, noreg, keylen); 3256 __ st1(v0, __ T16B, saved_encrypted_ctr); 3257 3258 // Do we have a remaining full block? 3259 3260 __ mov(used, 0); 3261 __ cmp(len, block_size); 3262 __ br(__ LO, NEXT); 3263 3264 // Yes, we have a full block 3265 __ ldrq(v1, Address(in, offset)); 3266 __ eor(v1, __ T16B, v1, v0); 3267 __ strq(v1, Address(out, offset)); 3268 __ mov(used, block_size); 3269 __ add(offset, offset, block_size); 3270 3271 __ subw(len, len, block_size); 3272 __ cbzw(len, DONE); 3273 3274 // Increment the counter, store it back 3275 __ orr(v0, __ T16B, v16, v16); 3276 __ rev64(v16, __ T16B, v16); 3277 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3278 __ rev64(v16, __ T16B, v16); 3279 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3280 3281 __ b(inner_loop); 3282 } 3283 3284 __ BIND(NEXT); 3285 3286 // Encrypt a single byte, and loop. 3287 // We expect this to be a rare event. 
3288 __ ldrb(rscratch1, Address(in, offset)); 3289 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3290 __ eor(rscratch1, rscratch1, rscratch2); 3291 __ strb(rscratch1, Address(out, offset)); 3292 __ add(offset, offset, 1); 3293 __ add(used, used, 1); 3294 __ subw(len, len,1); 3295 __ cbnzw(len, L_CTR_loop); 3296 } 3297 3298 __ bind(DONE); 3299 __ strw(used, Address(used_ptr)); 3300 __ mov(r0, saved_len); 3301 3302 __ leave(); // required for proper stackwalking of RuntimeStub frame 3303 __ ret(lr); 3304 3305 // Bulk encryption 3306 3307 __ BIND (CTR_large_block); 3308 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3309 3310 if (bulk_width == 8) { 3311 __ sub(sp, sp, 4 * 16); 3312 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3313 } 3314 __ sub(sp, sp, 4 * 16); 3315 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3316 RegSet saved_regs = (RegSet::of(in, out, offset) 3317 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3318 __ push(saved_regs, sp); 3319 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3320 __ add(in, in, offset); 3321 __ add(out, out, offset); 3322 3323 // Keys should already be loaded into the correct registers 3324 3325 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3326 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3327 3328 // AES/CTR loop 3329 { 3330 Label L_CTR_loop; 3331 __ BIND(L_CTR_loop); 3332 3333 // Setup the counters 3334 __ movi(v8, __ T4S, 0); 3335 __ movi(v9, __ T4S, 1); 3336 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3337 3338 for (int i = 0; i < bulk_width; i++) { 3339 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3340 __ rev64(v0_ofs, __ T16B, v16); 3341 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3342 } 3343 3344 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3345 3346 // Encrypt the counters 3347 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3348 3349 if (bulk_width == 8) { 3350 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3351 } 3352 3353 // XOR the encrypted counters with the inputs 3354 for (int i = 0; i < bulk_width; i++) { 3355 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3356 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3357 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3358 } 3359 3360 // Write the encrypted data 3361 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3362 if (bulk_width == 8) { 3363 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3364 } 3365 3366 __ subw(len, len, 16 * bulk_width); 3367 __ cbnzw(len, L_CTR_loop); 3368 } 3369 3370 // Save the counter back where it goes 3371 __ rev64(v16, __ T16B, v16); 3372 __ st1(v16, __ T16B, counter); 3373 3374 __ pop(saved_regs, sp); 3375 3376 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3377 if (bulk_width == 8) { 3378 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3379 } 3380 3381 __ andr(rscratch1, len, -16 * bulk_width); 3382 __ sub(len, len, rscratch1); 3383 __ add(offset, offset, rscratch1); 3384 __ mov(used, 16); 3385 __ strw(used, Address(used_ptr)); 3386 __ b(large_block_return); 3387 3388 return start; 3389 } 3390 3391 // Vector AES Galois Counter Mode implementation. 
Parameters: 3392 // 3393 // in = c_rarg0 3394 // len = c_rarg1 3395 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3396 // out = c_rarg3 3397 // key = c_rarg4 3398 // state = c_rarg5 - GHASH.state 3399 // subkeyHtbl = c_rarg6 - powers of H 3400 // counter = c_rarg7 - 16 bytes of CTR 3401 // return - number of processed bytes 3402 address generate_galoisCounterMode_AESCrypt() { 3403 address ghash_polynomial = __ pc(); 3404 __ emit_int64(0x87); // The low-order bits of the field 3405 // polynomial (i.e. p = z^7+z^2+z+1) 3406 // repeated in the low and high parts of a 3407 // 128-bit vector 3408 __ emit_int64(0x87); 3409 3410 __ align(CodeEntryAlignment); 3411 StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id; 3412 StubCodeMark mark(this, stub_id); 3413 address start = __ pc(); 3414 __ enter(); 3415 3416 const Register in = c_rarg0; 3417 const Register len = c_rarg1; 3418 const Register ct = c_rarg2; 3419 const Register out = c_rarg3; 3420 // and updated with the incremented counter in the end 3421 3422 const Register key = c_rarg4; 3423 const Register state = c_rarg5; 3424 3425 const Register subkeyHtbl = c_rarg6; 3426 3427 const Register counter = c_rarg7; 3428 3429 const Register keylen = r10; 3430 // Save state before entering routine 3431 __ sub(sp, sp, 4 * 16); 3432 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3433 __ sub(sp, sp, 4 * 16); 3434 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3435 3436 // __ andr(len, len, -512); 3437 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3438 __ str(len, __ pre(sp, -2 * wordSize)); 3439 3440 Label DONE; 3441 __ cbz(len, DONE); 3442 3443 // Compute #rounds for AES based on the length of the key array 3444 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3445 3446 __ aesenc_loadkeys(key, keylen); 3447 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3448 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3449 3450 // AES/CTR loop 3451 { 3452 Label L_CTR_loop; 3453 __ BIND(L_CTR_loop); 3454 3455 // Setup the counters 3456 __ movi(v8, __ T4S, 0); 3457 __ movi(v9, __ T4S, 1); 3458 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3459 3460 assert(v0->encoding() < v8->encoding(), ""); 3461 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3462 FloatRegister f = as_FloatRegister(i); 3463 __ rev32(f, __ T16B, v16); 3464 __ addv(v16, __ T4S, v16, v8); 3465 } 3466 3467 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3468 3469 // Encrypt the counters 3470 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3471 3472 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3473 3474 // XOR the encrypted counters with the inputs 3475 for (int i = 0; i < 8; i++) { 3476 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3477 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3478 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3479 } 3480 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3481 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3482 3483 __ subw(len, len, 16 * 8); 3484 __ cbnzw(len, L_CTR_loop); 3485 } 3486 3487 __ rev32(v16, __ T16B, v16); 3488 __ st1(v16, __ T16B, counter); 3489 3490 __ ldr(len, Address(sp)); 3491 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3492 3493 // GHASH/CTR loop 3494 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3495 len, /*unrolls*/4); 3496 3497 
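    // Reference of the GHASH update performed above (sketch, not
    // generated code): for each 16-byte ciphertext block C[i],
    //   state = (state ^ C[i]) * H   in GF(2^128),
    // where the multiplication is reduced modulo x^128 + x^7 + x^2 + x + 1,
    // i.e. the polynomial whose low-order bits (0x87) were emitted at
    // ghash_polynomial above.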
#ifdef ASSERT 3498 { Label L; 3499 __ cmp(len, (unsigned char)0); 3500 __ br(Assembler::EQ, L); 3501 __ stop("stubGenerator: abort"); 3502 __ bind(L); 3503 } 3504 #endif 3505 3506 __ bind(DONE); 3507 // Return the number of bytes processed 3508 __ ldr(r0, __ post(sp, 2 * wordSize)); 3509 3510 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3511 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3512 3513 __ leave(); // required for proper stackwalking of RuntimeStub frame 3514 __ ret(lr); 3515 return start; 3516 } 3517 3518 class Cached64Bytes { 3519 private: 3520 MacroAssembler *_masm; 3521 Register _regs[8]; 3522 3523 public: 3524 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3525 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3526 auto it = rs.begin(); 3527 for (auto &r: _regs) { 3528 r = *it; 3529 ++it; 3530 } 3531 } 3532 3533 void gen_loads(Register base) { 3534 for (int i = 0; i < 8; i += 2) { 3535 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3536 } 3537 } 3538 3539 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3540 void extract_u32(Register dest, int i) { 3541 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3542 } 3543 }; 3544 3545 // Utility routines for md5. 3546 // Clobbers r10 and r11. 3547 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3548 int k, int s, int t) { 3549 Register rscratch3 = r10; 3550 Register rscratch4 = r11; 3551 3552 __ eorw(rscratch3, r3, r4); 3553 __ movw(rscratch2, t); 3554 __ andw(rscratch3, rscratch3, r2); 3555 __ addw(rscratch4, r1, rscratch2); 3556 reg_cache.extract_u32(rscratch1, k); 3557 __ eorw(rscratch3, rscratch3, r4); 3558 __ addw(rscratch4, rscratch4, rscratch1); 3559 __ addw(rscratch3, rscratch3, rscratch4); 3560 __ rorw(rscratch2, rscratch3, 32 - s); 3561 __ addw(r1, rscratch2, r2); 3562 } 3563 3564 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3565 int k, int s, int t) { 3566 Register rscratch3 = r10; 3567 Register rscratch4 = r11; 3568 3569 reg_cache.extract_u32(rscratch1, k); 3570 __ movw(rscratch2, t); 3571 __ addw(rscratch4, r1, rscratch2); 3572 __ addw(rscratch4, rscratch4, rscratch1); 3573 __ bicw(rscratch2, r3, r4); 3574 __ andw(rscratch3, r2, r4); 3575 __ addw(rscratch2, rscratch2, rscratch4); 3576 __ addw(rscratch2, rscratch2, rscratch3); 3577 __ rorw(rscratch2, rscratch2, 32 - s); 3578 __ addw(r1, rscratch2, r2); 3579 } 3580 3581 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3582 int k, int s, int t) { 3583 Register rscratch3 = r10; 3584 Register rscratch4 = r11; 3585 3586 __ eorw(rscratch3, r3, r4); 3587 __ movw(rscratch2, t); 3588 __ addw(rscratch4, r1, rscratch2); 3589 reg_cache.extract_u32(rscratch1, k); 3590 __ eorw(rscratch3, rscratch3, r2); 3591 __ addw(rscratch4, rscratch4, rscratch1); 3592 __ addw(rscratch3, rscratch3, rscratch4); 3593 __ rorw(rscratch2, rscratch3, 32 - s); 3594 __ addw(r1, rscratch2, r2); 3595 } 3596 3597 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3598 int k, int s, int t) { 3599 Register rscratch3 = r10; 3600 Register rscratch4 = r11; 3601 3602 __ movw(rscratch3, t); 3603 __ ornw(rscratch2, r2, r4); 3604 __ addw(rscratch4, r1, rscratch3); 3605 reg_cache.extract_u32(rscratch1, k); 3606 __ eorw(rscratch3, rscratch2, r3); 3607 __ addw(rscratch4, rscratch4, rscratch1); 3608 __ addw(rscratch3, rscratch3, rscratch4); 3609 __ 
rorw(rscratch2, rscratch3, 32 - s); 3610 __ addw(r1, rscratch2, r2); 3611 } 3612 3613 // Arguments: 3614 // 3615 // Inputs: 3616 // c_rarg0 - byte[] source+offset 3617 // c_rarg1 - int[] SHA.state 3618 // c_rarg2 - int offset 3619 // c_rarg3 - int limit 3620 // 3621 address generate_md5_implCompress(StubGenStubId stub_id) { 3622 bool multi_block; 3623 switch (stub_id) { 3624 case md5_implCompress_id: 3625 multi_block = false; 3626 break; 3627 case md5_implCompressMB_id: 3628 multi_block = true; 3629 break; 3630 default: 3631 ShouldNotReachHere(); 3632 } 3633 __ align(CodeEntryAlignment); 3634 3635 StubCodeMark mark(this, stub_id); 3636 address start = __ pc(); 3637 3638 Register buf = c_rarg0; 3639 Register state = c_rarg1; 3640 Register ofs = c_rarg2; 3641 Register limit = c_rarg3; 3642 Register a = r4; 3643 Register b = r5; 3644 Register c = r6; 3645 Register d = r7; 3646 Register rscratch3 = r10; 3647 Register rscratch4 = r11; 3648 3649 Register state_regs[2] = { r12, r13 }; 3650 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3651 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3652 3653 __ push(saved_regs, sp); 3654 3655 __ ldp(state_regs[0], state_regs[1], Address(state)); 3656 __ ubfx(a, state_regs[0], 0, 32); 3657 __ ubfx(b, state_regs[0], 32, 32); 3658 __ ubfx(c, state_regs[1], 0, 32); 3659 __ ubfx(d, state_regs[1], 32, 32); 3660 3661 Label md5_loop; 3662 __ BIND(md5_loop); 3663 3664 reg_cache.gen_loads(buf); 3665 3666 // Round 1 3667 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3668 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3669 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3670 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3671 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3672 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3673 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3674 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3675 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3676 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3677 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3678 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3679 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3680 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3681 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3682 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3683 3684 // Round 2 3685 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3686 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3687 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3688 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3689 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3690 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3691 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3692 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3693 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3694 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3695 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3696 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3697 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3698 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3699 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3700 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3701 3702 // Round 3 3703 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3704 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3705 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3706 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3707 
md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3708 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3709 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3710 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3711 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3712 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3713 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3714 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3715 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3716 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3717 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3718 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3719 3720 // Round 4 3721 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3722 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3723 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3724 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3725 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3726 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3727 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3728 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3729 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3730 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3731 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3732 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3733 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3734 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3735 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3736 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3737 3738 __ addw(a, state_regs[0], a); 3739 __ ubfx(rscratch2, state_regs[0], 32, 32); 3740 __ addw(b, rscratch2, b); 3741 __ addw(c, state_regs[1], c); 3742 __ ubfx(rscratch4, state_regs[1], 32, 32); 3743 __ addw(d, rscratch4, d); 3744 3745 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3746 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3747 3748 if (multi_block) { 3749 __ add(buf, buf, 64); 3750 __ add(ofs, ofs, 64); 3751 __ cmp(ofs, limit); 3752 __ br(Assembler::LE, md5_loop); 3753 __ mov(c_rarg0, ofs); // return ofs 3754 } 3755 3756 // write hash values back in the correct order 3757 __ stp(state_regs[0], state_regs[1], Address(state)); 3758 3759 __ pop(saved_regs, sp); 3760 3761 __ ret(lr); 3762 3763 return start; 3764 } 3765 3766 // Arguments: 3767 // 3768 // Inputs: 3769 // c_rarg0 - byte[] source+offset 3770 // c_rarg1 - int[] SHA.state 3771 // c_rarg2 - int offset 3772 // c_rarg3 - int limit 3773 // 3774 address generate_sha1_implCompress(StubGenStubId stub_id) { 3775 bool multi_block; 3776 switch (stub_id) { 3777 case sha1_implCompress_id: 3778 multi_block = false; 3779 break; 3780 case sha1_implCompressMB_id: 3781 multi_block = true; 3782 break; 3783 default: 3784 ShouldNotReachHere(); 3785 } 3786 3787 __ align(CodeEntryAlignment); 3788 3789 StubCodeMark mark(this, stub_id); 3790 address start = __ pc(); 3791 3792 Register buf = c_rarg0; 3793 Register state = c_rarg1; 3794 Register ofs = c_rarg2; 3795 Register limit = c_rarg3; 3796 3797 Label keys; 3798 Label sha1_loop; 3799 3800 // load the keys into v0..v3 3801 __ adr(rscratch1, keys); 3802 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3803 // load 5 words state into v6, v7 3804 __ ldrq(v6, Address(state, 0)); 3805 __ ldrs(v7, Address(state, 16)); 3806 3807 3808 __ BIND(sha1_loop); 3809 // load 64 bytes of data into v16..v19 3810 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3811 __ rev32(v16, __ T16B, v16); 3812 __ rev32(v17, __ T16B, v17); 3813 __ rev32(v18, __ T16B, v18); 3814 __ rev32(v19, __ T16B, v19); 3815 3816 // do the sha1 3817 __ addv(v4, __ T4S, v16, v0); 3818 __ orr(v20, __ T16B, v6, v6); 3819 3820 FloatRegister d0 = v16; 3821 FloatRegister d1 = v17; 3822 FloatRegister d2 = v18; 3823 FloatRegister d3 = v19; 3824 3825 for (int round = 0; round < 20; round++) { 3826 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3827 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3828 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3829 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3830 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3831 3832 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3833 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3834 __ sha1h(tmp2, __ T4S, v20); 3835 if (round < 5) 3836 __ sha1c(v20, __ T4S, tmp3, tmp4); 3837 else if (round < 10 || round >= 15) 3838 __ sha1p(v20, __ T4S, tmp3, tmp4); 3839 else 3840 __ sha1m(v20, __ T4S, tmp3, tmp4); 3841 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3842 3843 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3844 } 3845 3846 __ addv(v7, __ T2S, v7, v21); 3847 __ addv(v6, __ T4S, v6, v20); 3848 3849 if (multi_block) { 3850 __ add(ofs, ofs, 64); 3851 __ cmp(ofs, limit); 3852 __ br(Assembler::LE, sha1_loop); 3853 __ mov(c_rarg0, ofs); // return ofs 3854 } 3855 3856 __ strq(v6, Address(state, 0)); 3857 __ strs(v7, Address(state, 16)); 3858 3859 __ ret(lr); 3860 3861 __ bind(keys); 3862 __ emit_int32(0x5a827999); 3863 __ emit_int32(0x6ed9eba1); 3864 __ emit_int32(0x8f1bbcdc); 3865 __ emit_int32(0xca62c1d6); 3866 3867 return start; 3868 } 3869 3870 3871 // Arguments: 3872 // 3873 // Inputs: 3874 // c_rarg0 - byte[] source+offset 3875 // c_rarg1 - int[] SHA.state 3876 // c_rarg2 - int offset 3877 // c_rarg3 - int limit 3878 // 3879 address generate_sha256_implCompress(StubGenStubId stub_id) { 3880 bool multi_block; 3881 switch (stub_id) { 3882 case sha256_implCompress_id: 3883 multi_block = false; 3884 break; 3885 case sha256_implCompressMB_id: 3886 multi_block = true; 3887 break; 3888 default: 3889 ShouldNotReachHere(); 3890 } 3891 3892 static const uint32_t round_consts[64] = { 3893 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3894 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3895 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3896 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3897 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3898 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3899 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3900 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3901 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3902 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3903 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3904 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3905 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3906 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3907 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3908 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3909 }; 3910 3911 __ align(CodeEntryAlignment); 3912 3913 StubCodeMark mark(this, stub_id); 3914 address start = __ pc(); 3915 3916 Register buf = c_rarg0; 3917 Register state = c_rarg1; 3918 Register ofs = c_rarg2; 3919 Register limit = c_rarg3; 3920 3921 Label sha1_loop; 3922 3923 __ stpd(v8, v9, __ pre(sp, -32)); 3924 __ stpd(v10, v11, Address(sp, 16)); 3925 3926 // dga == v0 3927 // dgb == v1 3928 // dg0 == v2 3929 // dg1 == v3 3930 
// dg2 == v4 3931 // t0 == v6 3932 // t1 == v7 3933 3934 // load 16 keys to v16..v31 3935 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3936 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3937 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3938 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3939 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3940 3941 // load 8 words (256 bits) state 3942 __ ldpq(v0, v1, state); 3943 3944 __ BIND(sha1_loop); 3945 // load 64 bytes of data into v8..v11 3946 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3947 __ rev32(v8, __ T16B, v8); 3948 __ rev32(v9, __ T16B, v9); 3949 __ rev32(v10, __ T16B, v10); 3950 __ rev32(v11, __ T16B, v11); 3951 3952 __ addv(v6, __ T4S, v8, v16); 3953 __ orr(v2, __ T16B, v0, v0); 3954 __ orr(v3, __ T16B, v1, v1); 3955 3956 FloatRegister d0 = v8; 3957 FloatRegister d1 = v9; 3958 FloatRegister d2 = v10; 3959 FloatRegister d3 = v11; 3960 3961 3962 for (int round = 0; round < 16; round++) { 3963 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3964 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3965 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3966 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3967 3968 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3969 __ orr(v4, __ T16B, v2, v2); 3970 if (round < 15) 3971 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3972 __ sha256h(v2, __ T4S, v3, tmp2); 3973 __ sha256h2(v3, __ T4S, v4, tmp2); 3974 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3975 3976 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3977 } 3978 3979 __ addv(v0, __ T4S, v0, v2); 3980 __ addv(v1, __ T4S, v1, v3); 3981 3982 if (multi_block) { 3983 __ add(ofs, ofs, 64); 3984 __ cmp(ofs, limit); 3985 __ br(Assembler::LE, sha1_loop); 3986 __ mov(c_rarg0, ofs); // return ofs 3987 } 3988 3989 __ ldpd(v10, v11, Address(sp, 16)); 3990 __ ldpd(v8, v9, __ post(sp, 32)); 3991 3992 __ stpq(v0, v1, state); 3993 3994 __ ret(lr); 3995 3996 return start; 3997 } 3998 3999 // Double rounds for sha512. 
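  // Each call below retires two of the 80 SHA-512 rounds using the
  // sha512h/sha512h2 instructions and, while message words remain,
  // extends the message schedule with sha512su0/sha512su1. A scalar
  // sketch of the schedule step those instructions implement
  // (illustrative only):
  //   w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16]
  //   sigma0(x) = ror(x, 1) ^ ror(x, 8) ^ (x >> 7)
  //   sigma1(x) = ror(x, 19) ^ ror(x, 61) ^ (x >> 6)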
4000 void sha512_dround(int dr, 4001 FloatRegister vi0, FloatRegister vi1, 4002 FloatRegister vi2, FloatRegister vi3, 4003 FloatRegister vi4, FloatRegister vrc0, 4004 FloatRegister vrc1, FloatRegister vin0, 4005 FloatRegister vin1, FloatRegister vin2, 4006 FloatRegister vin3, FloatRegister vin4) { 4007 if (dr < 36) { 4008 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 4009 } 4010 __ addv(v5, __ T2D, vrc0, vin0); 4011 __ ext(v6, __ T16B, vi2, vi3, 8); 4012 __ ext(v5, __ T16B, v5, v5, 8); 4013 __ ext(v7, __ T16B, vi1, vi2, 8); 4014 __ addv(vi3, __ T2D, vi3, v5); 4015 if (dr < 32) { 4016 __ ext(v5, __ T16B, vin3, vin4, 8); 4017 __ sha512su0(vin0, __ T2D, vin1); 4018 } 4019 __ sha512h(vi3, __ T2D, v6, v7); 4020 if (dr < 32) { 4021 __ sha512su1(vin0, __ T2D, vin2, v5); 4022 } 4023 __ addv(vi4, __ T2D, vi1, vi3); 4024 __ sha512h2(vi3, __ T2D, vi1, vi0); 4025 } 4026 4027 // Arguments: 4028 // 4029 // Inputs: 4030 // c_rarg0 - byte[] source+offset 4031 // c_rarg1 - int[] SHA.state 4032 // c_rarg2 - int offset 4033 // c_rarg3 - int limit 4034 // 4035 address generate_sha512_implCompress(StubGenStubId stub_id) { 4036 bool multi_block; 4037 switch (stub_id) { 4038 case sha512_implCompress_id: 4039 multi_block = false; 4040 break; 4041 case sha512_implCompressMB_id: 4042 multi_block = true; 4043 break; 4044 default: 4045 ShouldNotReachHere(); 4046 } 4047 4048 static const uint64_t round_consts[80] = { 4049 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 4050 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 4051 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 4052 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 4053 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 4054 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 4055 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 4056 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 4057 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 4058 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 4059 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 4060 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 4061 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 4062 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 4063 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 4064 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 4065 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 4066 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 4067 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 4068 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 4069 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 4070 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 4071 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 4072 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 4073 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 4074 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 4075 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 4076 }; 4077 4078 __ align(CodeEntryAlignment); 4079 4080 StubCodeMark mark(this, stub_id); 4081 address start = __ pc(); 4082 4083 Register buf = c_rarg0; 4084 Register state = c_rarg1; 4085 Register ofs = c_rarg2; 4086 Register limit = c_rarg3; 4087 4088 __ stpd(v8, v9, __ pre(sp, -64)); 4089 __ stpd(v10, v11, Address(sp, 
16)); 4090 __ stpd(v12, v13, Address(sp, 32)); 4091 __ stpd(v14, v15, Address(sp, 48)); 4092 4093 Label sha512_loop; 4094 4095 // load state 4096 __ ld1(v8, v9, v10, v11, __ T2D, state); 4097 4098 // load first 4 round constants 4099 __ lea(rscratch1, ExternalAddress((address)round_consts)); 4100 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 4101 4102 __ BIND(sha512_loop); 4103 // load 128B of data into v12..v19 4104 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 4105 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 4106 __ rev64(v12, __ T16B, v12); 4107 __ rev64(v13, __ T16B, v13); 4108 __ rev64(v14, __ T16B, v14); 4109 __ rev64(v15, __ T16B, v15); 4110 __ rev64(v16, __ T16B, v16); 4111 __ rev64(v17, __ T16B, v17); 4112 __ rev64(v18, __ T16B, v18); 4113 __ rev64(v19, __ T16B, v19); 4114 4115 __ mov(rscratch2, rscratch1); 4116 4117 __ mov(v0, __ T16B, v8); 4118 __ mov(v1, __ T16B, v9); 4119 __ mov(v2, __ T16B, v10); 4120 __ mov(v3, __ T16B, v11); 4121 4122 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4123 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4124 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4125 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4126 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4127 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4128 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4129 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4130 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4131 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4132 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4133 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4134 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4135 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 4136 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4137 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4138 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4139 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4140 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4141 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4142 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4143 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4144 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4145 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4146 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4147 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4148 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4149 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4150 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4151 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4152 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4153 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4154 sha512_dround(32, v2, v3, v1, 
                      v4, v0, v28, v24, v12, v0, v0, v0, v0);
    sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
    sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
    sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
    sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
    sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
    sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
    sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);

    __ addv(v8, __ T2D, v8, v0);
    __ addv(v9, __ T2D, v9, v1);
    __ addv(v10, __ T2D, v10, v2);
    __ addv(v11, __ T2D, v11, v3);

    if (multi_block) {
      __ add(ofs, ofs, 128);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha512_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ st1(v8, v9, v10, v11, __ T2D, state);

    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ ret(lr);

    return start;
  }

  // Execute one round of keccak for two computations in parallel.
  // One of the states should be loaded into the lower halves of
  // the vector registers v0-v24, the other should be loaded into
  // the upper halves of those registers. The ld1r instruction loads
  // the round constant into both halves of register v31.
  // Intermediate results c0...c4 and d0...d4 are computed
  // in registers v25...v30.
  // All vector instructions that are used operate on both register
  // halves in parallel.
  // If only a single computation is needed, one can load only the lower halves.
  void keccak_round(Register rscratch1) {
    __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
    __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
    __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
    __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
    __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
    __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
    __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
    __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
    __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
    __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22

    __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
    __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
    __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
    __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
    __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)

    __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
    __ xar(v29, __ T2D, v1, v25, (64 - 1));   // a10' = rol((a1^d1), 1)
    __ xar(v1, __ T2D, v6, v25, (64 - 44));   // a1 = rol((a6^d1), 44)
    __ xar(v6, __ T2D, v9, v28, (64 - 20));   // a6 = rol((a9^d4), 20)
    __ xar(v9, __ T2D, v22, v26, (64 - 61));  // a9 = rol((a22^d2), 61)
    __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
    __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
    __ xar(v31, __ T2D, v2, v26, (64 - 62));  // a20' = rol((a2^d2), 62)
    __ xar(v2, __ T2D, v12, v26, (64 - 43));  // a2 = rol((a12^d2), 43)
    __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
    __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
    __ xar(v19, __ T2D, v23,
v27, (64 - 56)); // a19 = rol((a23^d3), 56) 4227 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41) 4228 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27) 4229 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14) 4230 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2) 4231 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55) 4232 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45) 4233 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36) 4234 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28) 4235 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21) 4236 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15) 4237 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10) 4238 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6) 4239 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3) 4240 4241 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22') 4242 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23) 4243 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24) 4244 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20') 4245 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21') 4246 4247 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i] 4248 4249 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19) 4250 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15') 4251 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16) 4252 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17') 4253 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18') 4254 4255 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4256 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4257 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4258 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4259 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4260 4261 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4262 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4263 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4264 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4265 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4266 4267 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4268 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4269 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4270 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4271 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4272 4273 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4274 } 4275 4276 // Arguments: 4277 // 4278 // Inputs: 4279 // c_rarg0 - byte[] source+offset 4280 // c_rarg1 - byte[] SHA.state 4281 // c_rarg2 - int block_size 4282 // c_rarg3 - int offset 4283 // c_rarg4 - int limit 4284 // 4285 address generate_sha3_implCompress(StubGenStubId stub_id) { 4286 bool multi_block; 4287 switch (stub_id) { 4288 case sha3_implCompress_id: 4289 multi_block = false; 4290 break; 4291 case sha3_implCompressMB_id: 4292 multi_block = true; 4293 break; 4294 default: 4295 ShouldNotReachHere(); 4296 } 4297 4298 static const uint64_t round_consts[24] = { 4299 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4300 0x8000000080008000L, 
0x000000000000808BL, 0x0000000080000001L, 4301 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4302 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4303 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4304 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4305 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4306 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4307 }; 4308 4309 __ align(CodeEntryAlignment); 4310 4311 StubCodeMark mark(this, stub_id); 4312 address start = __ pc(); 4313 4314 Register buf = c_rarg0; 4315 Register state = c_rarg1; 4316 Register block_size = c_rarg2; 4317 Register ofs = c_rarg3; 4318 Register limit = c_rarg4; 4319 4320 Label sha3_loop, rounds24_loop; 4321 Label sha3_512_or_sha3_384, shake128; 4322 4323 __ stpd(v8, v9, __ pre(sp, -64)); 4324 __ stpd(v10, v11, Address(sp, 16)); 4325 __ stpd(v12, v13, Address(sp, 32)); 4326 __ stpd(v14, v15, Address(sp, 48)); 4327 4328 // load state 4329 __ add(rscratch1, state, 32); 4330 __ ld1(v0, v1, v2, v3, __ T1D, state); 4331 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4332 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4333 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4334 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4335 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4336 __ ld1(v24, __ T1D, rscratch1); 4337 4338 __ BIND(sha3_loop); 4339 4340 // 24 keccak rounds 4341 __ movw(rscratch2, 24); 4342 4343 // load round_constants base 4344 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4345 4346 // load input 4347 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4348 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4349 __ eor(v0, __ T8B, v0, v25); 4350 __ eor(v1, __ T8B, v1, v26); 4351 __ eor(v2, __ T8B, v2, v27); 4352 __ eor(v3, __ T8B, v3, v28); 4353 __ eor(v4, __ T8B, v4, v29); 4354 __ eor(v5, __ T8B, v5, v30); 4355 __ eor(v6, __ T8B, v6, v31); 4356 4357 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4358 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4359 4360 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4361 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4362 __ eor(v7, __ T8B, v7, v25); 4363 __ eor(v8, __ T8B, v8, v26); 4364 __ eor(v9, __ T8B, v9, v27); 4365 __ eor(v10, __ T8B, v10, v28); 4366 __ eor(v11, __ T8B, v11, v29); 4367 __ eor(v12, __ T8B, v12, v30); 4368 __ eor(v13, __ T8B, v13, v31); 4369 4370 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4371 __ eor(v14, __ T8B, v14, v25); 4372 __ eor(v15, __ T8B, v15, v26); 4373 __ eor(v16, __ T8B, v16, v27); 4374 4375 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4376 __ andw(c_rarg5, block_size, 48); 4377 __ cbzw(c_rarg5, rounds24_loop); 4378 4379 __ tbnz(block_size, 5, shake128); 4380 // block_size == 144, bit5 == 0, SHA3-224 4381 __ ldrd(v28, __ post(buf, 8)); 4382 __ eor(v17, __ T8B, v17, v28); 4383 __ b(rounds24_loop); 4384 4385 __ BIND(shake128); 4386 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4387 __ eor(v17, __ T8B, v17, v28); 4388 __ eor(v18, __ T8B, v18, v29); 4389 __ eor(v19, __ T8B, v19, v30); 4390 __ eor(v20, __ T8B, v20, v31); 4391 __ b(rounds24_loop); // block_size == 168, SHAKE128 4392 4393 __ BIND(sha3_512_or_sha3_384); 4394 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4395 __ eor(v7, __ T8B, v7, v25); 4396 __ eor(v8, __ T8B, v8, v26); 4397 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4398 4399 // SHA3-384 4400 __ ld1(v27, v28, 
               v29, v30, __ T8B, __ post(buf, 32));
    __ eor(v9, __ T8B, v9, v27);
    __ eor(v10, __ T8B, v10, v28);
    __ eor(v11, __ T8B, v11, v29);
    __ eor(v12, __ T8B, v12, v30);

    __ BIND(rounds24_loop);
    __ subw(rscratch2, rscratch2, 1);

    keccak_round(rscratch1);

    __ cbnzw(rscratch2, rounds24_loop);

    if (multi_block) {
      __ add(ofs, ofs, block_size);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha3_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
    __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
    __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
    __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
    __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
    __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
    __ st1(v24, __ T1D, state);

    // restore callee-saved registers
    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ ret(lr);

    return start;
  }

  // Inputs:
  //   c_rarg0 - long[] state0
  //   c_rarg1 - long[] state1
  address generate_double_keccak() {
    static const uint64_t round_consts[24] = {
      0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
      0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
      0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
      0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
      0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
      0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
      0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
      0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
    };

    // Implements the double_keccak() method of the
    // sun.security.provider.SHA3Parallel class
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "double_keccak");
    address start = __ pc();
    __ enter();

    Register state0 = c_rarg0;
    Register state1 = c_rarg1;

    Label rounds24_loop;

    // save callee-saved registers
    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    // load states
    __ add(rscratch1, state0, 32);
    __ ld4(v0, v1, v2, v3, __ D, 0, state0);
    __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
    __ ld1(v24, __ D, 0, rscratch1);
    __ add(rscratch1, state1, 32);
    __ ld4(v0, v1, v2, v3, __ D, 1, state1);
    __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
    __ ld1(v24, __ D, 1, rscratch1);

    // 24 keccak rounds
    __ movw(rscratch2, 24);

    // load round_constants base
    __ lea(rscratch1, ExternalAddress((address) round_consts));

    __ BIND(rounds24_loop);
__ subw(rscratch2, rscratch2, 1); 4498 keccak_round(rscratch1); 4499 __ cbnzw(rscratch2, rounds24_loop); 4500 4501 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32)); 4502 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32)); 4503 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32)); 4504 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32)); 4505 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32)); 4506 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32)); 4507 __ st1(v24, __ D, 0, state0); 4508 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32)); 4509 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32)); 4510 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32)); 4511 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32)); 4512 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32)); 4513 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32)); 4514 __ st1(v24, __ D, 1, state1); 4515 4516 // restore callee-saved vector registers 4517 __ ldpd(v14, v15, Address(sp, 48)); 4518 __ ldpd(v12, v13, Address(sp, 32)); 4519 __ ldpd(v10, v11, Address(sp, 16)); 4520 __ ldpd(v8, v9, __ post(sp, 64)); 4521 4522 __ leave(); // required for proper stackwalking of RuntimeStub frame 4523 __ mov(r0, zr); // return 0 4524 __ ret(lr); 4525 4526 return start; 4527 } 4528 4529 // ChaCha20 block function. This version parallelizes the 32-bit 4530 // state elements on each of 16 vectors, producing 4 blocks of 4531 // keystream at a time. 4532 // 4533 // state (int[16]) = c_rarg0 4534 // keystream (byte[256]) = c_rarg1 4535 // return - number of bytes of produced keystream (always 256) 4536 // 4537 // This implementation takes each 32-bit integer from the state 4538 // array and broadcasts it across all 4 32-bit lanes of a vector register 4539 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes 4540 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors, 4541 // the quarter round schedule is implemented as outlined in RFC 7539 section 4542 // 2.3. However, instead of sequentially processing the 3 quarter round 4543 // operations represented by one QUARTERROUND function, we instead stack all 4544 // the adds, xors and left-rotations from the first 4 quarter rounds together 4545 // and then do the same for the second set of 4 quarter rounds. This removes 4546 // some latency that would otherwise be incurred by waiting for an add to 4547 // complete before performing an xor (which depends on the result of the 4548 // add), etc. An adjustment happens between the first and second groups of 4 4549 // quarter rounds, but this is done only in the inputs to the macro functions 4550 // that generate the assembly instructions - these adjustments themselves are 4551 // not part of the resulting assembly. 4552 // The 4 registers v0-v3 are used during the quarter round operations as 4553 // scratch registers. Once the 20 rounds are complete, these 4 scratch 4554 // registers become the vectors involved in adding the start state back onto 4555 // the post-QR working state. After the adds are complete, each of the 16 4556 // vectors write their first lane back to the keystream buffer, followed 4557 // by the second lane from all vectors and so on. 4558 address generate_chacha20Block_blockpar() { 4559 Label L_twoRounds, L_cc20_const; 4560 // The constant data is broken into two 128-bit segments to be loaded 4561 // onto FloatRegisters. The first 128 bits are a counter add overlay 4562 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 
4563 // The second 128-bits is a table constant used for 8-bit left rotations. 4564 __ BIND(L_cc20_const); 4565 __ emit_int64(0x0000000100000000UL); 4566 __ emit_int64(0x0000000300000002UL); 4567 __ emit_int64(0x0605040702010003UL); 4568 __ emit_int64(0x0E0D0C0F0A09080BUL); 4569 4570 __ align(CodeEntryAlignment); 4571 StubGenStubId stub_id = StubGenStubId::chacha20Block_id; 4572 StubCodeMark mark(this, stub_id); 4573 address start = __ pc(); 4574 __ enter(); 4575 4576 int i, j; 4577 const Register state = c_rarg0; 4578 const Register keystream = c_rarg1; 4579 const Register loopCtr = r10; 4580 const Register tmpAddr = r11; 4581 const FloatRegister ctrAddOverlay = v28; 4582 const FloatRegister lrot8Tbl = v29; 4583 4584 // Organize SIMD registers in an array that facilitates 4585 // putting repetitive opcodes into loop structures. It is 4586 // important that each grouping of 4 registers is monotonically 4587 // increasing to support the requirements of multi-register 4588 // instructions (e.g. ld4r, st4, etc.) 4589 const FloatRegister workSt[16] = { 4590 v4, v5, v6, v7, v16, v17, v18, v19, 4591 v20, v21, v22, v23, v24, v25, v26, v27 4592 }; 4593 4594 // Pull in constant data. The first 16 bytes are the add overlay 4595 // which is applied to the vector holding the counter (state[12]). 4596 // The second 16 bytes is the index register for the 8-bit left 4597 // rotation tbl instruction. 4598 __ adr(tmpAddr, L_cc20_const); 4599 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr)); 4600 4601 // Load from memory and interlace across 16 SIMD registers, 4602 // With each word from memory being broadcast to all lanes of 4603 // each successive SIMD register. 4604 // Addr(0) -> All lanes in workSt[i] 4605 // Addr(4) -> All lanes workSt[i + 1], etc. 4606 __ mov(tmpAddr, state); 4607 for (i = 0; i < 16; i += 4) { 4608 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4609 __ post(tmpAddr, 16)); 4610 } 4611 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4612 4613 // Before entering the loop, create 5 4-register arrays. These 4614 // will hold the 4 registers that represent the a/b/c/d fields 4615 // in the quarter round operation. For instance the "b" field 4616 // for the first 4 quarter round operations is the set of v16/v17/v18/v19, 4617 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16 4618 // since it is part of a diagonal organization. The aSet and scratch 4619 // register sets are defined at declaration time because they do not change 4620 // organization at any point during the 20-round processing. 
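    // For reference, one ChaCha20 quarter round on (a, b, c, d) as
    // specified in RFC 7539 (the vectorized code below performs four
    // of these at a time per macro group):
    //   a += b; d ^= a; d = rotl32(d, 16);
    //   c += d; b ^= c; b = rotl32(b, 12);
    //   a += b; d ^= a; d = rotl32(d, 8);
    //   c += d; b ^= c; b = rotl32(b, 7);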
    FloatRegister aSet[4] = { v4, v5, v6, v7 };
    FloatRegister bSet[4];
    FloatRegister cSet[4];
    FloatRegister dSet[4];
    FloatRegister scratch[4] = { v0, v1, v2, v3 };

    // Set up the 10 iteration loop and perform all 8 quarter round ops
    __ mov(loopCtr, 10);
    __ BIND(L_twoRounds);

    // Set to columnar organization and do the following 4 quarter-rounds:
    // QUARTERROUND(0, 4, 8, 12)
    // QUARTERROUND(1, 5, 9, 13)
    // QUARTERROUND(2, 6, 10, 14)
    // QUARTERROUND(3, 7, 11, 15)
    __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
    __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
    __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7

    // Set to diagonal organization and do the next 4 quarter-rounds:
    // QUARTERROUND(0, 5, 10, 15)
    // QUARTERROUND(1, 6, 11, 12)
    // QUARTERROUND(2, 7, 8, 13)
    // QUARTERROUND(3, 4, 9, 14)
    __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
    __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
    __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7

    // Decrement and iterate
    __ sub(loopCtr, loopCtr, 1);
    __ cbnz(loopCtr, L_twoRounds);

    __ mov(tmpAddr, state);

    // Add the starting state back to the post-loop keystream
    // state. We read/interlace the state array from memory into
    // 4 registers similar to what we did in the beginning. Then
    // add the counter overlay onto workSt[12] at the end.
    for (i = 0; i < 16; i += 4) {
      __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
      __ addv(workSt[i], __ T4S, workSt[i], v0);
      __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
      __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
      __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
    }
    __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay

    // Write working state into the keystream buffer. This is accomplished
    // by taking the lane "i" from each of the four vectors and writing
    // it to consecutive 4-byte offsets, then post-incrementing by 16 and
    // repeating with the next 4 vectors until all 16 vectors have been used.
4704 // Then move to the next lane and repeat the process until all lanes have 4705 // been written. 4706 for (i = 0; i < 4; i++) { 4707 for (j = 0; j < 16; j += 4) { 4708 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4709 __ post(keystream, 16)); 4710 } 4711 } 4712 4713 __ mov(r0, 256); // Return length of output keystream 4714 __ leave(); 4715 __ ret(lr); 4716 4717 return start; 4718 } 4719 4720 // Helpers to schedule parallel operation bundles across vector 4721 // register sequences of size 2, 4 or 8. 4722 4723 // Implement various primitive computations across vector sequences 4724 4725 template<int N> 4726 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4727 const VSeq<N>& v1, const VSeq<N>& v2) { 4728 // output must not be constant 4729 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4730 // output cannot overwrite pending inputs 4731 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4732 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4733 for (int i = 0; i < N; i++) { 4734 __ addv(v[i], T, v1[i], v2[i]); 4735 } 4736 } 4737 4738 template<int N> 4739 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4740 const VSeq<N>& v1, const VSeq<N>& v2) { 4741 // output must not be constant 4742 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4743 // output cannot overwrite pending inputs 4744 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4745 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4746 for (int i = 0; i < N; i++) { 4747 __ subv(v[i], T, v1[i], v2[i]); 4748 } 4749 } 4750 4751 template<int N> 4752 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4753 const VSeq<N>& v1, const VSeq<N>& v2) { 4754 // output must not be constant 4755 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4756 // output cannot overwrite pending inputs 4757 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4758 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4759 for (int i = 0; i < N; i++) { 4760 __ mulv(v[i], T, v1[i], v2[i]); 4761 } 4762 } 4763 4764 template<int N> 4765 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) { 4766 // output must not be constant 4767 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4768 // output cannot overwrite pending inputs 4769 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4770 for (int i = 0; i < N; i++) { 4771 __ negr(v[i], T, v1[i]); 4772 } 4773 } 4774 4775 template<int N> 4776 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4777 const VSeq<N>& v1, int shift) { 4778 // output must not be constant 4779 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4780 // output cannot overwrite pending inputs 4781 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4782 for (int i = 0; i < N; i++) { 4783 __ sshr(v[i], T, v1[i], shift); 4784 } 4785 } 4786 4787 template<int N> 4788 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4789 // output must not be constant 4790 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4791 // output cannot overwrite pending inputs 4792 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4793 assert(!vs_write_before_read(v, v2), "output 
overwrites input"); 4794 for (int i = 0; i < N; i++) { 4795 __ andr(v[i], __ T16B, v1[i], v2[i]); 4796 } 4797 } 4798 4799 template<int N> 4800 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4801 // output must not be constant 4802 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4803 // output cannot overwrite pending inputs 4804 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4805 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4806 for (int i = 0; i < N; i++) { 4807 __ orr(v[i], __ T16B, v1[i], v2[i]); 4808 } 4809 } 4810 4811 template<int N> 4812 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) { 4813 // output must not be constant 4814 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4815 // output cannot overwrite pending inputs 4816 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4817 for (int i = 0; i < N; i++) { 4818 __ notr(v[i], __ T16B, v1[i]); 4819 } 4820 } 4821 4822 template<int N> 4823 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) { 4824 // output must not be constant 4825 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4826 // output cannot overwrite pending inputs 4827 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4828 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4829 for (int i = 0; i < N; i++) { 4830 __ sqdmulh(v[i], T, v1[i], v2[i]); 4831 } 4832 } 4833 4834 template<int N> 4835 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) { 4836 // output must not be constant 4837 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4838 // output cannot overwrite pending inputs 4839 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4840 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4841 for (int i = 0; i < N; i++) { 4842 __ mlsv(v[i], T, v1[i], v2[i]); 4843 } 4844 } 4845 4846 // load N/2 successive pairs of quadword values from memory in order 4847 // into N successive vector registers of the sequence via the 4848 // address supplied in base. 
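  // For example (illustrative only), with N == 4 the template below
  // emits two ldpq instructions, filling v[0]..v[3] of the sequence.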
4849 template<int N> 4850 void vs_ldpq(const VSeq<N>& v, Register base) { 4851 for (int i = 0; i < N; i += 2) { 4852 __ ldpq(v[i], v[i+1], Address(base, 32 * i)); 4853 } 4854 } 4855 4856 // load N/2 successive pairs of quadword values from memory in order 4857 // into N vector registers of the sequence via the address supplied 4858 // in base using post-increment addressing 4859 template<int N> 4860 void vs_ldpq_post(const VSeq<N>& v, Register base) { 4861 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4862 for (int i = 0; i < N; i += 2) { 4863 __ ldpq(v[i], v[i+1], __ post(base, 32)); 4864 } 4865 } 4866 4867 // store N successive vector registers of the sequence into N/2 4868 // successive pairs of quadword memory locations via the address 4869 // supplied in base using post-increment addressing 4870 template<int N> 4871 void vs_stpq_post(const VSeq<N>& v, Register base) { 4872 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4873 for (int i = 0; i < N; i += 2) { 4874 __ stpq(v[i], v[i+1], __ post(base, 32)); 4875 } 4876 } 4877 4878 // load N/2 pairs of quadword values from memory de-interleaved into 4879 // N vector registers 2 at a time via the address supplied in base 4880 // using post-increment addressing. 4881 template<int N> 4882 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4883 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4884 for (int i = 0; i < N; i += 2) { 4885 __ ld2(v[i], v[i+1], T, __ post(base, 32)); 4886 } 4887 } 4888 4889 // store N vector registers interleaved into N/2 pairs of quadword 4890 // memory locations via the address supplied in base using 4891 // post-increment addressing. 4892 template<int N> 4893 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4894 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4895 for (int i = 0; i < N; i += 2) { 4896 __ st2(v[i], v[i+1], T, __ post(base, 32)); 4897 } 4898 } 4899 4900 // load N quadword values from memory de-interleaved into N vector 4901 // registers 3 elements at a time via the address supplied in base. 4902 template<int N> 4903 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4904 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4905 for (int i = 0; i < N; i += 3) { 4906 __ ld3(v[i], v[i+1], v[i+2], T, base); 4907 } 4908 } 4909 4910 // load N quadword values from memory de-interleaved into N vector 4911 // registers 3 elements at a time via the address supplied in base 4912 // using post-increment addressing. 
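  // For example (illustrative only), with N == 6 the template below
  // emits two ld3 instructions, each post-incrementing base by 48 bytes.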
4913 template<int N> 4914 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4915 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4916 for (int i = 0; i < N; i += 3) { 4917 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48)); 4918 } 4919 } 4920 4921 // load N/2 pairs of quadword values from memory into N vector 4922 // registers via the address supplied in base with each pair indexed 4923 // using the the start offset plus the corresponding entry in the 4924 // offsets array 4925 template<int N> 4926 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) { 4927 for (int i = 0; i < N/2; i++) { 4928 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i])); 4929 } 4930 } 4931 4932 // store N vector registers into N/2 pairs of quadword memory 4933 // locations via the address supplied in base with each pair indexed 4934 // using the the start offset plus the corresponding entry in the 4935 // offsets array 4936 template<int N> 4937 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) { 4938 for (int i = 0; i < N/2; i++) { 4939 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i])); 4940 } 4941 } 4942 4943 // load N single quadword values from memory into N vector registers 4944 // via the address supplied in base with each value indexed using 4945 // the the start offset plus the corresponding entry in the offsets 4946 // array 4947 template<int N> 4948 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base, 4949 int start, int (&offsets)[N]) { 4950 for (int i = 0; i < N; i++) { 4951 __ ldr(v[i], T, Address(base, start + offsets[i])); 4952 } 4953 } 4954 4955 // store N vector registers into N single quadword memory locations 4956 // via the address supplied in base with each value indexed using 4957 // the the start offset plus the corresponding entry in the offsets 4958 // array 4959 template<int N> 4960 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base, 4961 int start, int (&offsets)[N]) { 4962 for (int i = 0; i < N; i++) { 4963 __ str(v[i], T, Address(base, start + offsets[i])); 4964 } 4965 } 4966 4967 // load N/2 pairs of quadword values from memory de-interleaved into 4968 // N vector registers 2 at a time via the address supplied in base 4969 // with each pair indexed using the the start offset plus the 4970 // corresponding entry in the offsets array 4971 template<int N> 4972 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base, 4973 Register tmp, int start, int (&offsets)[N/2]) { 4974 for (int i = 0; i < N/2; i++) { 4975 __ add(tmp, base, start + offsets[i]); 4976 __ ld2(v[2*i], v[2*i+1], T, tmp); 4977 } 4978 } 4979 4980 // store N vector registers 2 at a time interleaved into N/2 pairs 4981 // of quadword memory locations via the address supplied in base 4982 // with each pair indexed using the the start offset plus the 4983 // corresponding entry in the offsets array 4984 template<int N> 4985 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base, 4986 Register tmp, int start, int (&offsets)[N/2]) { 4987 for (int i = 0; i < N/2; i++) { 4988 __ add(tmp, base, start + offsets[i]); 4989 __ st2(v[2*i], v[2*i+1], T, tmp); 4990 } 4991 } 4992 4993 // Helper routines for various flavours of Montgomery multiply 4994 4995 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery 4996 // multiplications in parallel 4997 // 4998 4999 // See the 
montMul() method of the sun.security.provider.ML_DSA
5000 // class.
5001 //
5002 // Computes 4x4S results or 4x8H results
5003 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5004 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5005 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5006 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5007 // Outputs: va - 4x4S or 4x8H vector register sequences
5008 // vb, vc, vtmp and vq must all be disjoint
5009 // va must be disjoint from all other inputs/temps or must equal vc
5010 // va must have a non-zero delta i.e. it must not be a constant vseq.
5011 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5012 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5013 Assembler::SIMD_Arrangement T,
5014 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5015 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5016 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5017 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5018 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5019
5020 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5021 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5022
5023 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5024
5025 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5026 assert(vs_disjoint(va, vb), "va and vb overlap");
5027 assert(vs_disjoint(va, vq), "va and vq overlap");
5028 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5029 assert(!va.is_constant(), "output vector must identify 4 different registers");
5030
5031 // schedule 4 streams of instructions across the vector sequences
5032 for (int i = 0; i < 4; i++) {
5033 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5034 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5035 }
5036
5037 for (int i = 0; i < 4; i++) {
5038 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5039 }
5040
5041 for (int i = 0; i < 4; i++) {
5042 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5043 }
5044
5045 for (int i = 0; i < 4; i++) {
5046 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5047 }
5048 }
5049
5050 // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
5051 // multiplications in parallel
5052 //
5053
5054 // See the montMul() method of the sun.security.provider.ML_DSA
5055 // class.
5056 //
5057 // Computes 2x4S results or 2x8H results
5058 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5059 // Inputs: vb, vc - 2x4S or 2x8H vector register sequences
5060 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5061 // Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
5062 // Outputs: va - 2x4S or 2x8H vector register sequences
5063 // vb, vc, vtmp and vq must all be disjoint
5064 // va must be disjoint from all other inputs/temps or must equal vc
5065 // va must have a non-zero delta i.e. it must not be a constant vseq.
5066 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
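//
// For reference, an illustrative scalar sketch of the per-lane
// arithmetic that this helper and vs_montmul4 above schedule across
// the sequences (names mirror the inline comments in the loops;
// hi()/lo() denote the high/low 16 or 32 bits selected by
// sqdmulh/mulv for arrangement T8H/T4S respectively):
//
//   aHigh = hi(2 * b * c)        // sqdmulh vb, vc
//   aLow  = lo(b * c)            // mulv vb, vc
//   m     = lo(aLow * qinv)      // mulv by vq[0]
//   n     = hi(2 * m * q)        // sqdmulh by vq[1]
//   a     = (aHigh - n) / 2      // shsubv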
5067 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 5068 Assembler::SIMD_Arrangement T, 5069 const VSeq<2>& vtmp, const VSeq<2>& vq) { 5070 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul"); 5071 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5072 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5073 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5074 5075 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5076 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5077 5078 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5079 5080 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5081 assert(vs_disjoint(va, vb), "va and vb overlap"); 5082 assert(vs_disjoint(va, vq), "va and vq overlap"); 5083 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5084 assert(!va.is_constant(), "output vector must identify 2 different registers"); 5085 5086 // schedule 2 streams of instructions across the vector sequences 5087 for (int i = 0; i < 2; i++) { 5088 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c) 5089 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c) 5090 } 5091 5092 for (int i = 0; i < 2; i++) { 5093 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv 5094 } 5095 5096 for (int i = 0; i < 2; i++) { 5097 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q) 5098 } 5099 5100 for (int i = 0; i < 2; i++) { 5101 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2 5102 } 5103 } 5104 5105 // Perform 16 16-bit Montgomery multiplications in parallel. 5106 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc, 5107 const VSeq<2>& vtmp, const VSeq<2>& vq) { 5108 // Use the helper routine to schedule a 2x8H Montgomery multiply. 5109 // It will assert that the register use is valid 5110 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq); 5111 } 5112 5113 // Perform 32 16-bit Montgomery multiplications in parallel. 5114 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 5115 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5116 // Use the helper routine to schedule a 4x8H Montgomery multiply. 5117 // It will assert that the register use is valid 5118 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq); 5119 } 5120 5121 // Perform 64 16-bit Montgomery multiplications in parallel. 5122 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 5123 const VSeq<4>& vtmp, const VSeq<2>& vq) { 5124 // Schedule two successive 4x8H multiplies via the montmul helper 5125 // on the front and back halves of va, vb and vc. The helper will 5126 // assert that the register use has no overlap conflicts on each 5127 // individual call but we also need to ensure that the necessary 5128 // disjoint/equality constraints are met across both calls. 5129 5130 // vb, vc, vtmp and vq must be disjoint. 
va must either be 5131 // disjoint from all other registers or equal vc 5132 5133 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5134 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5135 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5136 5137 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5138 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5139 5140 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5141 5142 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5143 assert(vs_disjoint(va, vb), "va and vb overlap"); 5144 assert(vs_disjoint(va, vq), "va and vq overlap"); 5145 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5146 5147 // we multiply the front and back halves of each sequence 4 at a 5148 // time because 5149 // 5150 // 1) we are currently only able to get 4-way instruction 5151 // parallelism at best 5152 // 5153 // 2) we need registers for the constants in vq and temporary 5154 // scratch registers to hold intermediate results so vtmp can only 5155 // be a VSeq<4> which means we only have 4 scratch slots 5156 5157 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq); 5158 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq); 5159 } 5160 5161 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1, 5162 const VSeq<4>& vc, 5163 const VSeq<4>& vtmp, 5164 const VSeq<2>& vq) { 5165 // compute a = montmul(a1, c) 5166 kyber_montmul32(vc, va1, vc, vtmp, vq); 5167 // ouptut a1 = a0 - a 5168 vs_subv(va1, __ T8H, va0, vc); 5169 // and a0 = a0 + a 5170 vs_addv(va0, __ T8H, va0, vc); 5171 } 5172 5173 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1, 5174 const VSeq<4>& vb, 5175 const VSeq<4>& vtmp1, 5176 const VSeq<4>& vtmp2, 5177 const VSeq<2>& vq) { 5178 // compute c = a0 - a1 5179 vs_subv(vtmp1, __ T8H, va0, va1); 5180 // output a0 = a0 + a1 5181 vs_addv(va0, __ T8H, va0, va1); 5182 // output a1 = b montmul c 5183 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq); 5184 } 5185 5186 void load64shorts(const VSeq<8>& v, Register shorts) { 5187 vs_ldpq_post(v, shorts); 5188 } 5189 5190 void load32shorts(const VSeq<4>& v, Register shorts) { 5191 vs_ldpq_post(v, shorts); 5192 } 5193 5194 void store64shorts(VSeq<8> v, Register tmpAddr) { 5195 vs_stpq_post(v, tmpAddr); 5196 } 5197 5198 // Kyber NTT function. 5199 // Implements 5200 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {} 5201 // 5202 // coeffs (short[256]) = c_rarg0 5203 // ntt_zetas (short[256]) = c_rarg1 5204 address generate_kyberNtt() { 5205 5206 __ align(CodeEntryAlignment); 5207 StubGenStubId stub_id = StubGenStubId::kyberNtt_id; 5208 StubCodeMark mark(this, stub_id); 5209 address start = __ pc(); 5210 __ enter(); 5211 5212 const Register coeffs = c_rarg0; 5213 const Register zetas = c_rarg1; 5214 5215 const Register kyberConsts = r10; 5216 const Register tmpAddr = r11; 5217 5218 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5219 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5220 VSeq<2> vq(30); // n.b. constants overlap vs3 5221 5222 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5223 // load the montmul constants 5224 vs_ldpq(vq, kyberConsts); 5225 5226 // Each level corresponds to an iteration of the outermost loop of the 5227 // Java method seilerNTT(int[] coeffs). There are some differences 5228 // from what is done in the seilerNTT() method, though: 5229 // 1. 
The computation is using 16-bit signed values, we do not convert them 5230 // to ints here. 5231 // 2. The zetas are delivered in a bigger array, 128 zetas are stored in 5232 // this array for each level, it is easier that way to fill up the vector 5233 // registers. 5234 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery 5235 // multiplications (this is because that way there should not be any 5236 // overflow during the inverse NTT computation), here we usr R = 2^16 so 5237 // that we can use the 16-bit arithmetic in the vector unit. 5238 // 5239 // On each level, we fill up the vector registers in such a way that the 5240 // array elements that need to be multiplied by the zetas go into one 5241 // set of vector registers while the corresponding ones that don't need to 5242 // be multiplied, go into another set. 5243 // We can do 32 Montgomery multiplications in parallel, using 12 vector 5244 // registers interleaving the steps of 4 identical computations, 5245 // each done on 8 16-bit values per register. 5246 5247 // At levels 0-3 the coefficients multiplied by or added/subtracted 5248 // to the zetas occur in discrete blocks whose size is some multiple 5249 // of 32. 5250 5251 // level 0 5252 __ add(tmpAddr, coeffs, 256); 5253 load64shorts(vs1, tmpAddr); 5254 load64shorts(vs2, zetas); 5255 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5256 __ add(tmpAddr, coeffs, 0); 5257 load64shorts(vs1, tmpAddr); 5258 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5259 vs_addv(vs1, __ T8H, vs1, vs2); 5260 __ add(tmpAddr, coeffs, 0); 5261 vs_stpq_post(vs1, tmpAddr); 5262 __ add(tmpAddr, coeffs, 256); 5263 vs_stpq_post(vs3, tmpAddr); 5264 // restore montmul constants 5265 vs_ldpq(vq, kyberConsts); 5266 load64shorts(vs1, tmpAddr); 5267 load64shorts(vs2, zetas); 5268 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5269 __ add(tmpAddr, coeffs, 128); 5270 load64shorts(vs1, tmpAddr); 5271 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5272 vs_addv(vs1, __ T8H, vs1, vs2); 5273 __ add(tmpAddr, coeffs, 128); 5274 store64shorts(vs1, tmpAddr); 5275 __ add(tmpAddr, coeffs, 384); 5276 store64shorts(vs3, tmpAddr); 5277 5278 // level 1 5279 // restore montmul constants 5280 vs_ldpq(vq, kyberConsts); 5281 __ add(tmpAddr, coeffs, 128); 5282 load64shorts(vs1, tmpAddr); 5283 load64shorts(vs2, zetas); 5284 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5285 __ add(tmpAddr, coeffs, 0); 5286 load64shorts(vs1, tmpAddr); 5287 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5288 vs_addv(vs1, __ T8H, vs1, vs2); 5289 __ add(tmpAddr, coeffs, 0); 5290 store64shorts(vs1, tmpAddr); 5291 store64shorts(vs3, tmpAddr); 5292 vs_ldpq(vq, kyberConsts); 5293 __ add(tmpAddr, coeffs, 384); 5294 load64shorts(vs1, tmpAddr); 5295 load64shorts(vs2, zetas); 5296 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5297 __ add(tmpAddr, coeffs, 256); 5298 load64shorts(vs1, tmpAddr); 5299 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5300 vs_addv(vs1, __ T8H, vs1, vs2); 5301 __ add(tmpAddr, coeffs, 256); 5302 store64shorts(vs1, tmpAddr); 5303 store64shorts(vs3, tmpAddr); 5304 5305 // level 2 5306 vs_ldpq(vq, kyberConsts); 5307 int offsets1[4] = { 0, 32, 128, 160 }; 5308 vs_ldpq_indexed(vs1, coeffs, 64, offsets1); 5309 load64shorts(vs2, zetas); 5310 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5311 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5312 // kyber_subv_addv64(); 5313 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5314 vs_addv(vs1, __ T8H, vs1, vs2); 5315 __ add(tmpAddr, coeffs, 0); 5316 vs_stpq_post(vs_front(vs1), tmpAddr); 5317 vs_stpq_post(vs_front(vs3), tmpAddr); 5318 vs_stpq_post(vs_back(vs1), tmpAddr); 5319 vs_stpq_post(vs_back(vs3), tmpAddr); 5320 vs_ldpq(vq, kyberConsts); 5321 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1); 5322 load64shorts(vs2, zetas); 5323 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5324 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5325 // kyber_subv_addv64(); 5326 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5327 vs_addv(vs1, __ T8H, vs1, vs2); 5328 __ add(tmpAddr, coeffs, 256); 5329 vs_stpq_post(vs_front(vs1), tmpAddr); 5330 vs_stpq_post(vs_front(vs3), tmpAddr); 5331 vs_stpq_post(vs_back(vs1), tmpAddr); 5332 vs_stpq_post(vs_back(vs3), tmpAddr); 5333 5334 // level 3 5335 vs_ldpq(vq, kyberConsts); 5336 int offsets2[4] = { 0, 64, 128, 192 }; 5337 vs_ldpq_indexed(vs1, coeffs, 32, offsets2); 5338 load64shorts(vs2, zetas); 5339 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5340 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5341 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5342 vs_addv(vs1, __ T8H, vs1, vs2); 5343 vs_stpq_indexed(vs1, coeffs, 0, offsets2); 5344 vs_stpq_indexed(vs3, coeffs, 32, offsets2); 5345 5346 vs_ldpq(vq, kyberConsts); 5347 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2); 5348 load64shorts(vs2, zetas); 5349 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5350 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5351 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5352 vs_addv(vs1, __ T8H, vs1, vs2); 5353 vs_stpq_indexed(vs1, coeffs, 256, offsets2); 5354 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2); 5355 5356 // level 4 5357 // At level 4 coefficients occur in 8 discrete blocks of size 16 5358 // so they are loaded using employing an ldr at 8 distinct offsets. 5359 5360 vs_ldpq(vq, kyberConsts); 5361 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5362 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3); 5363 load64shorts(vs2, zetas); 5364 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5365 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5366 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5367 vs_addv(vs1, __ T8H, vs1, vs2); 5368 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5369 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3); 5370 5371 vs_ldpq(vq, kyberConsts); 5372 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3); 5373 load64shorts(vs2, zetas); 5374 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5375 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5376 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5377 vs_addv(vs1, __ T8H, vs1, vs2); 5378 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5379 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3); 5380 5381 // level 5 5382 // At level 5 related coefficients occur in discrete blocks of size 8 so 5383 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 
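// n.b. the 2D arrangement de-interleaves at 64-bit granularity, so
// after each ld2 the even-numbered registers of vs1 hold the
// coefficients that are only added/subtracted while the odd-numbered
// registers hold those multiplied by the zetas (see the
// kyber_montmul32_sub_add calls below).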
5384 5385 vs_ldpq(vq, kyberConsts); 5386 int offsets4[4] = { 0, 32, 64, 96 }; 5387 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5388 load32shorts(vs_front(vs2), zetas); 5389 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5390 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5391 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5392 load32shorts(vs_front(vs2), zetas); 5393 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5394 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5395 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5396 load32shorts(vs_front(vs2), zetas); 5397 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5398 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5399 5400 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5401 load32shorts(vs_front(vs2), zetas); 5402 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5403 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5404 5405 // level 6 5406 // At level 6 related coefficients occur in discrete blocks of size 4 so 5407 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5408 5409 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5410 load32shorts(vs_front(vs2), zetas); 5411 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5412 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5413 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5414 // __ ldpq(v18, v19, __ post(zetas, 32)); 5415 load32shorts(vs_front(vs2), zetas); 5416 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5417 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5418 5419 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5420 load32shorts(vs_front(vs2), zetas); 5421 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5422 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5423 5424 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5425 load32shorts(vs_front(vs2), zetas); 5426 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5427 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5428 5429 __ leave(); // required for proper stackwalking of RuntimeStub frame 5430 __ mov(r0, zr); // return 0 5431 __ ret(lr); 5432 5433 return start; 5434 } 5435 5436 // Kyber Inverse NTT function 5437 // Implements 5438 // static int implKyberInverseNtt(short[] poly, short[] zetas) {} 5439 // 5440 // coeffs (short[256]) = c_rarg0 5441 // ntt_zetas (short[256]) = c_rarg1 5442 address generate_kyberInverseNtt() { 5443 5444 __ align(CodeEntryAlignment); 5445 StubGenStubId stub_id = StubGenStubId::kyberInverseNtt_id; 5446 StubCodeMark mark(this, stub_id); 5447 address start = __ pc(); 5448 __ enter(); 5449 5450 const Register coeffs = c_rarg0; 5451 const Register zetas = c_rarg1; 5452 5453 const Register kyberConsts = r10; 5454 const Register tmpAddr = r11; 5455 const Register tmpAddr2 = c_rarg2; 5456 5457 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5458 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5459 VSeq<2> vq(30); // n.b. 
constants overlap vs3 5460 5461 __ lea(kyberConsts, 5462 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5463 5464 // level 0 5465 // At level 0 related coefficients occur in discrete blocks of size 4 so 5466 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5467 5468 vs_ldpq(vq, kyberConsts); 5469 int offsets4[4] = { 0, 32, 64, 96 }; 5470 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5471 load32shorts(vs_front(vs2), zetas); 5472 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5473 vs_front(vs2), vs_back(vs2), vtmp, vq); 5474 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5475 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5476 load32shorts(vs_front(vs2), zetas); 5477 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5478 vs_front(vs2), vs_back(vs2), vtmp, vq); 5479 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5480 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5481 load32shorts(vs_front(vs2), zetas); 5482 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5483 vs_front(vs2), vs_back(vs2), vtmp, vq); 5484 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5485 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5486 load32shorts(vs_front(vs2), zetas); 5487 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5488 vs_front(vs2), vs_back(vs2), vtmp, vq); 5489 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5490 5491 // level 1 5492 // At level 1 related coefficients occur in discrete blocks of size 8 so 5493 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 5494 5495 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5496 load32shorts(vs_front(vs2), zetas); 5497 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5498 vs_front(vs2), vs_back(vs2), vtmp, vq); 5499 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5500 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5501 load32shorts(vs_front(vs2), zetas); 5502 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5503 vs_front(vs2), vs_back(vs2), vtmp, vq); 5504 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5505 5506 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5507 load32shorts(vs_front(vs2), zetas); 5508 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5509 vs_front(vs2), vs_back(vs2), vtmp, vq); 5510 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5511 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5512 load32shorts(vs_front(vs2), zetas); 5513 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5514 vs_front(vs2), vs_back(vs2), vtmp, vq); 5515 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5516 5517 // level 2 5518 // At level 2 coefficients occur in 8 discrete blocks of size 16 5519 // so they are loaded using employing an ldr at 8 distinct offsets. 5520 5521 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5522 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5523 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3); 5524 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5525 vs_subv(vs1, __ T8H, vs1, vs2); 5526 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3); 5527 load64shorts(vs2, zetas); 5528 vs_ldpq(vq, kyberConsts); 5529 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5530 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3); 5531 5532 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5533 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5534 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5535 vs_subv(vs1, __ T8H, vs1, vs2); 5536 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3); 5537 load64shorts(vs2, zetas); 5538 vs_ldpq(vq, kyberConsts); 5539 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5540 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5541 5542 // Barrett reduction at indexes where overflow may happen 5543 5544 // load q and the multiplier for the Barrett reduction 5545 __ add(tmpAddr, kyberConsts, 16); 5546 vs_ldpq(vq, tmpAddr); 5547 5548 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences 5549 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants 5550 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul 5551 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5552 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5553 vs_sshr(vs2, __ T8H, vs2, 11); 5554 vs_mlsv(vs1, __ T8H, vs2, vq1); 5555 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5556 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5557 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5558 vs_sshr(vs2, __ T8H, vs2, 11); 5559 vs_mlsv(vs1, __ T8H, vs2, vq1); 5560 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5561 5562 // level 3 5563 // From level 3 upwards coefficients occur in discrete blocks whose size is 5564 // some multiple of 32 so can be loaded using ldpq and suitable indexes. 5565 5566 int offsets2[4] = { 0, 64, 128, 192 }; 5567 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5568 vs_ldpq_indexed(vs2, coeffs, 32, offsets2); 5569 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5570 vs_subv(vs1, __ T8H, vs1, vs2); 5571 vs_stpq_indexed(vs3, coeffs, 0, offsets2); 5572 load64shorts(vs2, zetas); 5573 vs_ldpq(vq, kyberConsts); 5574 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5575 vs_stpq_indexed(vs2, coeffs, 32, offsets2); 5576 5577 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5578 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5579 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5580 vs_subv(vs1, __ T8H, vs1, vs2); 5581 vs_stpq_indexed(vs3, coeffs, 256, offsets2); 5582 load64shorts(vs2, zetas); 5583 vs_ldpq(vq, kyberConsts); 5584 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5585 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5586 5587 // level 4 5588 5589 int offsets1[4] = { 0, 32, 128, 160 }; 5590 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5591 vs_ldpq_indexed(vs2, coeffs, 64, offsets1); 5592 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5593 vs_subv(vs1, __ T8H, vs1, vs2); 5594 vs_stpq_indexed(vs3, coeffs, 0, offsets1); 5595 load64shorts(vs2, zetas); 5596 vs_ldpq(vq, kyberConsts); 5597 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5598 vs_stpq_indexed(vs2, coeffs, 64, offsets1); 5599 5600 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5601 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5602 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5603 vs_subv(vs1, __ T8H, vs1, vs2); 5604 vs_stpq_indexed(vs3, coeffs, 256, offsets1); 5605 load64shorts(vs2, zetas); 5606 vs_ldpq(vq, kyberConsts); 5607 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5608 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5609 5610 // level 5 5611 5612 __ add(tmpAddr, coeffs, 0); 5613 load64shorts(vs1, tmpAddr); 5614 __ add(tmpAddr, coeffs, 128); 5615 load64shorts(vs2, tmpAddr); 5616 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5617 vs_subv(vs1, __ T8H, vs1, vs2); 5618 __ add(tmpAddr, coeffs, 0); 5619 store64shorts(vs3, tmpAddr); 5620 load64shorts(vs2, zetas); 5621 vs_ldpq(vq, kyberConsts); 5622 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5623 __ add(tmpAddr, coeffs, 128); 5624 store64shorts(vs2, tmpAddr); 5625 5626 load64shorts(vs1, tmpAddr); 5627 __ add(tmpAddr, coeffs, 384); 5628 load64shorts(vs2, tmpAddr); 5629 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5630 vs_subv(vs1, __ T8H, vs1, vs2); 5631 __ add(tmpAddr, coeffs, 256); 5632 store64shorts(vs3, tmpAddr); 5633 load64shorts(vs2, zetas); 5634 vs_ldpq(vq, kyberConsts); 5635 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5636 __ add(tmpAddr, coeffs, 384); 5637 store64shorts(vs2, tmpAddr); 5638 5639 // Barrett reduction at indexes where overflow may happen 5640 5641 // load q and the multiplier for the Barrett reduction 5642 __ add(tmpAddr, kyberConsts, 16); 5643 vs_ldpq(vq, tmpAddr); 5644 5645 int offsets0[2] = { 0, 256 }; 5646 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5647 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5648 vs_sshr(vs2, __ T8H, vs2, 11); 5649 vs_mlsv(vs1, __ T8H, vs2, vq1); 5650 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5651 5652 // level 6 5653 5654 __ add(tmpAddr, coeffs, 0); 5655 load64shorts(vs1, tmpAddr); 5656 __ add(tmpAddr, coeffs, 256); 5657 load64shorts(vs2, tmpAddr); 5658 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5659 vs_subv(vs1, __ T8H, vs1, vs2); 5660 __ add(tmpAddr, coeffs, 0); 5661 store64shorts(vs3, tmpAddr); 5662 load64shorts(vs2, zetas); 5663 vs_ldpq(vq, kyberConsts); 5664 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5665 __ add(tmpAddr, coeffs, 256); 5666 store64shorts(vs2, tmpAddr); 5667 5668 __ add(tmpAddr, coeffs, 128); 5669 load64shorts(vs1, tmpAddr); 5670 __ add(tmpAddr, coeffs, 384); 5671 load64shorts(vs2, tmpAddr); 5672 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5673 vs_subv(vs1, __ T8H, vs1, vs2); 5674 __ add(tmpAddr, coeffs, 128); 5675 store64shorts(vs3, tmpAddr); 5676 load64shorts(vs2, zetas); 5677 vs_ldpq(vq, kyberConsts); 5678 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5679 __ add(tmpAddr, coeffs, 384); 5680 store64shorts(vs2, tmpAddr); 5681 5682 // multiply by 2^-n 5683 5684 // load toMont(2^-n mod q) 5685 __ add(tmpAddr, kyberConsts, 48); 5686 __ ldr(v29, __ Q, tmpAddr); 5687 5688 vs_ldpq(vq, kyberConsts); 5689 __ add(tmpAddr, coeffs, 0); 5690 load64shorts(vs1, tmpAddr); 5691 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5692 __ add(tmpAddr, coeffs, 0); 5693 store64shorts(vs2, tmpAddr); 5694 5695 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so 5696 load64shorts(vs1, tmpAddr); 5697 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5698 __ add(tmpAddr, coeffs, 128); 5699 store64shorts(vs2, tmpAddr); 5700 5701 // now tmpAddr contains coeffs + 256 5702 load64shorts(vs1, tmpAddr); 5703 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5704 __ add(tmpAddr, coeffs, 256); 5705 store64shorts(vs2, tmpAddr); 5706 5707 // now tmpAddr contains coeffs + 384 5708 load64shorts(vs1, tmpAddr); 5709 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5710 __ add(tmpAddr, coeffs, 384); 5711 store64shorts(vs2, tmpAddr); 5712 5713 __ leave(); // required for proper stackwalking of RuntimeStub frame 5714 __ mov(r0, zr); // return 0 5715 __ ret(lr); 5716 5717 return start; 5718 } 5719 5720 // Kyber multiply polynomials in the NTT domain. 5721 // Implements 5722 // static int implKyberNttMult( 5723 // short[] result, short[] ntta, short[] nttb, short[] zetas) {} 5724 // 5725 // result (short[256]) = c_rarg0 5726 // ntta (short[256]) = c_rarg1 5727 // nttb (short[256]) = c_rarg2 5728 // zetas (short[128]) = c_rarg3 5729 address generate_kyberNttMult() { 5730 5731 __ align(CodeEntryAlignment); 5732 StubGenStubId stub_id = StubGenStubId::kyberNttMult_id; 5733 StubCodeMark mark(this, stub_id); 5734 address start = __ pc(); 5735 __ enter(); 5736 5737 const Register result = c_rarg0; 5738 const Register ntta = c_rarg1; 5739 const Register nttb = c_rarg2; 5740 const Register zetas = c_rarg3; 5741 5742 const Register kyberConsts = r10; 5743 const Register limit = r11; 5744 5745 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps 5746 VSeq<4> vs3(16), vs4(20); 5747 VSeq<2> vq(30); // pair of constants for montmul: q, qinv 5748 VSeq<2> vz(28); // pair of zetas 5749 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ 5750 5751 __ lea(kyberConsts, 5752 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5753 5754 Label kyberNttMult_loop; 5755 5756 __ add(limit, result, 512); 5757 5758 // load q and qinv 5759 vs_ldpq(vq, kyberConsts); 5760 5761 // load R^2 mod q (to convert back from Montgomery representation) 5762 __ add(kyberConsts, kyberConsts, 64); 5763 __ ldr(v27, __ Q, kyberConsts); 5764 5765 __ BIND(kyberNttMult_loop); 5766 5767 // load 16 zetas 5768 vs_ldpq_post(vz, zetas); 5769 5770 // load 2 sets of 32 coefficients from the two input arrays 5771 // interleaved as shorts. i.e. pairs of shorts adjacent in memory 5772 // are striped across pairs of vector registers 5773 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H 5774 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H 5775 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H 5776 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H 5777 5778 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1) 5779 // i.e. 
montmul the first and second halves of vs1 in order and 5780 // then with one sequence reversed storing the two results in vs3 5781 // 5782 // vs3[0] <- montmul(a0, b0) 5783 // vs3[1] <- montmul(a1, b1) 5784 // vs3[2] <- montmul(a0, b1) 5785 // vs3[3] <- montmul(a1, b0) 5786 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq); 5787 kyber_montmul16(vs_back(vs3), 5788 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq); 5789 5790 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3) 5791 // i.e. montmul the first and second halves of vs4 in order and 5792 // then with one sequence reversed storing the two results in vs1 5793 // 5794 // vs1[0] <- montmul(a2, b2) 5795 // vs1[1] <- montmul(a3, b3) 5796 // vs1[2] <- montmul(a2, b3) 5797 // vs1[3] <- montmul(a3, b2) 5798 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq); 5799 kyber_montmul16(vs_back(vs1), 5800 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq); 5801 5802 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta. 5803 // We can schedule two montmuls at a time if we use a suitable vector 5804 // sequence <vs3[1], vs1[1]>. 5805 int delta = vs1[1]->encoding() - vs3[1]->encoding(); 5806 VSeq<2> vs5(vs3[1], delta); 5807 5808 // vs3[1] <- montmul(montmul(a1, b1), z0) 5809 // vs1[1] <- montmul(montmul(a3, b3), z1) 5810 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq); 5811 5812 // add results in pairs storing in vs3 5813 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0); 5814 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0); 5815 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3)); 5816 5817 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1); 5818 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2); 5819 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1)); 5820 5821 // vs1 <- montmul(vs3, montRSquareModQ) 5822 kyber_montmul32(vs1, vs3, vc, vs2, vq); 5823 5824 // store back the two pairs of result vectors de-interleaved as 8H elements 5825 // i.e. storing each pairs of shorts striped across a register pair adjacent 5826 // in memory 5827 vs_st2_post(vs1, __ T8H, result); 5828 5829 __ cmp(result, limit); 5830 __ br(Assembler::NE, kyberNttMult_loop); 5831 5832 __ leave(); // required for proper stackwalking of RuntimeStub frame 5833 __ mov(r0, zr); // return 0 5834 __ ret(lr); 5835 5836 return start; 5837 } 5838 5839 // Kyber add 2 polynomials. 5840 // Implements 5841 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {} 5842 // 5843 // result (short[256]) = c_rarg0 5844 // a (short[256]) = c_rarg1 5845 // b (short[256]) = c_rarg2 5846 address generate_kyberAddPoly_2() { 5847 5848 __ align(CodeEntryAlignment); 5849 StubGenStubId stub_id = StubGenStubId::kyberAddPoly_2_id; 5850 StubCodeMark mark(this, stub_id); 5851 address start = __ pc(); 5852 __ enter(); 5853 5854 const Register result = c_rarg0; 5855 const Register a = c_rarg1; 5856 const Register b = c_rarg2; 5857 5858 const Register kyberConsts = r11; 5859 5860 // We sum 256 sets of values in total i.e. 32 x 8H quadwords. 5861 // So, we can load, add and store the data in 3 groups of 11, 5862 // 11 and 10 at a time i.e. we need to map sets of 10 or 11 5863 // registers. A further constraint is that the mapping needs 5864 // to skip callee saves. So, we allocate the register 5865 // sequences using two 8 sequences, two 2 sequences and two 5866 // single registers. 
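// As an illustrative breakdown: 256 shorts are 32 quadwords, handled
// below as 11 + 11 + 10 quadwords per pass, i.e. 8 (vs1_1) + 2 (vs1_2)
// registers on every pass plus 1 (vs1_3) on the first two passes only.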
5867 VSeq<8> vs1_1(0); 5868 VSeq<2> vs1_2(16); 5869 FloatRegister vs1_3 = v28; 5870 VSeq<8> vs2_1(18); 5871 VSeq<2> vs2_2(26); 5872 FloatRegister vs2_3 = v29; 5873 5874 // two constant vector sequences 5875 VSeq<8> vc_1(31, 0); 5876 VSeq<2> vc_2(31, 0); 5877 5878 FloatRegister vc_3 = v31; 5879 __ lea(kyberConsts, 5880 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5881 5882 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5883 for (int i = 0; i < 3; i++) { 5884 // load 80 or 88 values from a into vs1_1/2/3 5885 vs_ldpq_post(vs1_1, a); 5886 vs_ldpq_post(vs1_2, a); 5887 if (i < 2) { 5888 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5889 } 5890 // load 80 or 88 values from b into vs2_1/2/3 5891 vs_ldpq_post(vs2_1, b); 5892 vs_ldpq_post(vs2_2, b); 5893 if (i < 2) { 5894 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5895 } 5896 // sum 80 or 88 values across vs1 and vs2 into vs1 5897 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5898 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5899 if (i < 2) { 5900 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5901 } 5902 // add constant to all 80 or 88 results 5903 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 5904 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 5905 if (i < 2) { 5906 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 5907 } 5908 // store 80 or 88 values 5909 vs_stpq_post(vs1_1, result); 5910 vs_stpq_post(vs1_2, result); 5911 if (i < 2) { 5912 __ str(vs1_3, __ Q, __ post(result, 16)); 5913 } 5914 } 5915 5916 __ leave(); // required for proper stackwalking of RuntimeStub frame 5917 __ mov(r0, zr); // return 0 5918 __ ret(lr); 5919 5920 return start; 5921 } 5922 5923 // Kyber add 3 polynomials. 5924 // Implements 5925 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {} 5926 // 5927 // result (short[256]) = c_rarg0 5928 // a (short[256]) = c_rarg1 5929 // b (short[256]) = c_rarg2 5930 // c (short[256]) = c_rarg3 5931 address generate_kyberAddPoly_3() { 5932 5933 __ align(CodeEntryAlignment); 5934 StubGenStubId stub_id = StubGenStubId::kyberAddPoly_3_id; 5935 StubCodeMark mark(this, stub_id); 5936 address start = __ pc(); 5937 __ enter(); 5938 5939 const Register result = c_rarg0; 5940 const Register a = c_rarg1; 5941 const Register b = c_rarg2; 5942 const Register c = c_rarg3; 5943 5944 const Register kyberConsts = r11; 5945 5946 // As above we sum 256 sets of values in total i.e. 32 x 8H 5947 // quadwords. So, we can load, add and store the data in 3 5948 // groups of 11, 11 and 10 at a time i.e. we need to map sets 5949 // of 10 or 11 registers. A further constraint is that the 5950 // mapping needs to skip callee saves. So, we allocate the 5951 // register sequences using two 8 sequences, two 2 sequences 5952 // and two single registers. 
5953 VSeq<8> vs1_1(0); 5954 VSeq<2> vs1_2(16); 5955 FloatRegister vs1_3 = v28; 5956 VSeq<8> vs2_1(18); 5957 VSeq<2> vs2_2(26); 5958 FloatRegister vs2_3 = v29; 5959 5960 // two constant vector sequences 5961 VSeq<8> vc_1(31, 0); 5962 VSeq<2> vc_2(31, 0); 5963 5964 FloatRegister vc_3 = v31; 5965 5966 __ lea(kyberConsts, 5967 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5968 5969 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5970 for (int i = 0; i < 3; i++) { 5971 // load 80 or 88 values from a into vs1_1/2/3 5972 vs_ldpq_post(vs1_1, a); 5973 vs_ldpq_post(vs1_2, a); 5974 if (i < 2) { 5975 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5976 } 5977 // load 80 or 88 values from b into vs2_1/2/3 5978 vs_ldpq_post(vs2_1, b); 5979 vs_ldpq_post(vs2_2, b); 5980 if (i < 2) { 5981 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5982 } 5983 // sum 80 or 88 values across vs1 and vs2 into vs1 5984 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5985 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5986 if (i < 2) { 5987 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5988 } 5989 // load 80 or 88 values from c into vs2_1/2/3 5990 vs_ldpq_post(vs2_1, c); 5991 vs_ldpq_post(vs2_2, c); 5992 if (i < 2) { 5993 __ ldr(vs2_3, __ Q, __ post(c, 16)); 5994 } 5995 // sum 80 or 88 values across vs1 and vs2 into vs1 5996 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5997 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5998 if (i < 2) { 5999 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 6000 } 6001 // add constant to all 80 or 88 results 6002 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 6003 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 6004 if (i < 2) { 6005 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 6006 } 6007 // store 80 or 88 values 6008 vs_stpq_post(vs1_1, result); 6009 vs_stpq_post(vs1_2, result); 6010 if (i < 2) { 6011 __ str(vs1_3, __ Q, __ post(result, 16)); 6012 } 6013 } 6014 6015 __ leave(); // required for proper stackwalking of RuntimeStub frame 6016 __ mov(r0, zr); // return 0 6017 __ ret(lr); 6018 6019 return start; 6020 } 6021 6022 // Kyber parse XOF output to polynomial coefficient candidates 6023 // or decodePoly(12, ...). 6024 // Implements 6025 // static int implKyber12To16( 6026 // byte[] condensed, int index, short[] parsed, int parsedLength) {} 6027 // 6028 // (parsedLength or (parsedLength - 48) must be divisible by 64.) 6029 // 6030 // condensed (byte[]) = c_rarg0 6031 // condensedIndex = c_rarg1 6032 // parsed (short[112 or 256]) = c_rarg2 6033 // parsedLength (112 or 256) = c_rarg3 6034 address generate_kyber12To16() { 6035 Label L_F00, L_loop, L_end; 6036 6037 __ BIND(L_F00); 6038 __ emit_int64(0x0f000f000f000f00); 6039 __ emit_int64(0x0f000f000f000f00); 6040 6041 __ align(CodeEntryAlignment); 6042 StubGenStubId stub_id = StubGenStubId::kyber12To16_id; 6043 StubCodeMark mark(this, stub_id); 6044 address start = __ pc(); 6045 __ enter(); 6046 6047 const Register condensed = c_rarg0; 6048 const Register condensedOffs = c_rarg1; 6049 const Register parsed = c_rarg2; 6050 const Register parsedLength = c_rarg3; 6051 6052 const Register tmpAddr = r11; 6053 6054 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B 6055 // quadwords so we need a 6 vector sequence for the inputs. 6056 // Parsing produces 64 shorts, employing two 8 vector 6057 // sequences to store and combine the intermediate data. 
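// As an illustrative scalar sketch, each 3-byte group (b0, b1, b2) of
// the condensed input encodes two 12-bit values which the vector code
// below reassembles as
//   first  = b0 | ((b1 & 0x0f) << 8)
//   second = (b1 >> 4) | (b2 << 4)
// (see the lane-by-lane description of vin[0..2] further down).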
6058 VSeq<6> vin(24); 6059 VSeq<8> va(0), vb(16); 6060 6061 __ adr(tmpAddr, L_F00); 6062 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00 6063 __ add(condensed, condensed, condensedOffs); 6064 6065 __ BIND(L_loop); 6066 // load 96 (6 x 16B) byte values 6067 vs_ld3_post(vin, __ T16B, condensed); 6068 6069 // The front half of sequence vin (vin[0], vin[1] and vin[2]) 6070 // holds 48 (16x3) contiguous bytes from memory striped 6071 // horizontally across each of the 16 byte lanes. Equivalently, 6072 // that is 16 pairs of 12-bit integers. Likewise the back half 6073 // holds the next 48 bytes in the same arrangement. 6074 6075 // Each vector in the front half can also be viewed as a vertical 6076 // strip across the 16 pairs of 12 bit integers. Each byte in 6077 // vin[0] stores the low 8 bits of the first int in a pair. Each 6078 // byte in vin[1] stores the high 4 bits of the first int and the 6079 // low 4 bits of the second int. Each byte in vin[2] stores the 6080 // high 8 bits of the second int. Likewise the vectors in second 6081 // half. 6082 6083 // Converting the data to 16-bit shorts requires first of all 6084 // expanding each of the 6 x 16B vectors into 6 corresponding 6085 // pairs of 8H vectors. Mask, shift and add operations on the 6086 // resulting vector pairs can be used to combine 4 and 8 bit 6087 // parts of related 8H vector elements. 6088 // 6089 // The middle vectors (vin[2] and vin[5]) are actually expanded 6090 // twice, one copy manipulated to provide the lower 4 bits 6091 // belonging to the first short in a pair and another copy 6092 // manipulated to provide the higher 4 bits belonging to the 6093 // second short in a pair. This is why the the vector sequences va 6094 // and vb used to hold the expanded 8H elements are of length 8. 6095 6096 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 6097 // n.b. target elements 2 and 3 duplicate elements 4 and 5 6098 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6099 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6100 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6101 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6102 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6103 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6104 6105 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3] 6106 // and vb[4:5] 6107 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6108 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0); 6109 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6110 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0); 6111 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6112 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0); 6113 6114 // shift lo byte of copy 1 of the middle stripe into the high byte 6115 __ shl(va[2], __ T8H, va[2], 8); 6116 __ shl(va[3], __ T8H, va[3], 8); 6117 __ shl(vb[2], __ T8H, vb[2], 8); 6118 __ shl(vb[3], __ T8H, vb[3], 8); 6119 6120 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this 6121 // time pre-shifted by 4 to ensure top bits of input 12-bit int 6122 // are in bit positions [4..11]. 
6123 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6124 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6125 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6126 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4); 6127 6128 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and 6129 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of 6130 // copy2 6131 __ andr(va[2], __ T16B, va[2], v31); 6132 __ andr(va[3], __ T16B, va[3], v31); 6133 __ ushr(va[4], __ T8H, va[4], 4); 6134 __ ushr(va[5], __ T8H, va[5], 4); 6135 __ andr(vb[2], __ T16B, vb[2], v31); 6136 __ andr(vb[3], __ T16B, vb[3], v31); 6137 __ ushr(vb[4], __ T8H, vb[4], 4); 6138 __ ushr(vb[5], __ T8H, vb[5], 4); 6139 6140 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and 6141 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair 6142 // n.b. the ordering ensures: i) inputs are consumed before they 6143 // are overwritten ii) the order of 16-bit results across successive 6144 // pairs of vectors in va and then vb reflects the order of the 6145 // corresponding 12-bit inputs 6146 __ addv(va[0], __ T8H, va[0], va[2]); 6147 __ addv(va[2], __ T8H, va[1], va[3]); 6148 __ addv(va[1], __ T8H, va[4], va[6]); 6149 __ addv(va[3], __ T8H, va[5], va[7]); 6150 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6151 __ addv(vb[2], __ T8H, vb[1], vb[3]); 6152 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6153 __ addv(vb[3], __ T8H, vb[5], vb[7]); 6154 6155 // store 64 results interleaved as shorts 6156 vs_st2_post(vs_front(va), __ T8H, parsed); 6157 vs_st2_post(vs_front(vb), __ T8H, parsed); 6158 6159 __ sub(parsedLength, parsedLength, 64); 6160 __ cmp(parsedLength, (u1)64); 6161 __ br(Assembler::GE, L_loop); 6162 __ cbz(parsedLength, L_end); 6163 6164 // if anything is left it should be a final 72 bytes of input 6165 // i.e. a final 48 12-bit values. so we handle this by loading 6166 // 48 bytes into all 16B lanes of front(vin) and only 24 6167 // bytes into the lower 8B lane of back(vin) 6168 vs_ld3_post(vs_front(vin), __ T16B, condensed); 6169 vs_ld3(vs_back(vin), __ T8B, condensed); 6170 6171 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 6172 // n.b. target elements 2 and 3 of va duplicate elements 4 and 6173 // 5 and target element 2 of vb duplicates element 4. 6174 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6175 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6176 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6177 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6178 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6179 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6180 6181 // This time expand just the lower 8 lanes 6182 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6183 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6184 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6185 6186 // shift lo byte of copy 1 of the middle stripe into the high byte 6187 __ shl(va[2], __ T8H, va[2], 8); 6188 __ shl(va[3], __ T8H, va[3], 8); 6189 __ shl(vb[2], __ T8H, vb[2], 8); 6190 6191 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into 6192 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit 6193 // int are in bit positions [4..11]. 
6194 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6195 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6196 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6197 6198 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and 6199 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of 6200 // copy2 6201 __ andr(va[2], __ T16B, va[2], v31); 6202 __ andr(va[3], __ T16B, va[3], v31); 6203 __ ushr(va[4], __ T8H, va[4], 4); 6204 __ ushr(va[5], __ T8H, va[5], 4); 6205 __ andr(vb[2], __ T16B, vb[2], v31); 6206 __ ushr(vb[4], __ T8H, vb[4], 4); 6207 6208 6209 6210 // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and 6211 // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair 6212 6213 // n.b. ordering ensures: i) inputs are consumed before they are 6214 // overwritten ii) order of 16-bit results across succsessive 6215 // pairs of vectors in va and then lower half of vb reflects order 6216 // of corresponding 12-bit inputs 6217 __ addv(va[0], __ T8H, va[0], va[2]); 6218 __ addv(va[2], __ T8H, va[1], va[3]); 6219 __ addv(va[1], __ T8H, va[4], va[6]); 6220 __ addv(va[3], __ T8H, va[5], va[7]); 6221 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6222 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6223 6224 // store 48 results interleaved as shorts 6225 vs_st2_post(vs_front(va), __ T8H, parsed); 6226 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed); 6227 6228 __ BIND(L_end); 6229 6230 __ leave(); // required for proper stackwalking of RuntimeStub frame 6231 __ mov(r0, zr); // return 0 6232 __ ret(lr); 6233 6234 return start; 6235 } 6236 6237 // Kyber Barrett reduce function. 6238 // Implements 6239 // static int implKyberBarrettReduce(short[] coeffs) {} 6240 // 6241 // coeffs (short[256]) = c_rarg0 6242 address generate_kyberBarrettReduce() { 6243 6244 __ align(CodeEntryAlignment); 6245 StubGenStubId stub_id = StubGenStubId::kyberBarrettReduce_id; 6246 StubCodeMark mark(this, stub_id); 6247 address start = __ pc(); 6248 __ enter(); 6249 6250 const Register coeffs = c_rarg0; 6251 6252 const Register kyberConsts = r10; 6253 const Register result = r11; 6254 6255 // As above we process 256 sets of values in total i.e. 32 x 6256 // 8H quadwords. So, we can load, add and store the data in 3 6257 // groups of 11, 11 and 10 at a time i.e. we need to map sets 6258 // of 10 or 11 registers. A further constraint is that the 6259 // mapping needs to skip callee saves. So, we allocate the 6260 // register sequences using two 8 sequences, two 2 sequences 6261 // and two single registers. 
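// For reference, the per-lane Barrett step generated below is, as an
// illustrative scalar sketch mirroring the inline comments:
//   t = (x * kyberBarrettMultiplier) >> 26  // sqdmulh (>> 16) then sshr 11
//   x = x - t * kyber_q                     // mlsv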
6262 VSeq<8> vs1_1(0); 6263 VSeq<2> vs1_2(16); 6264 FloatRegister vs1_3 = v28; 6265 VSeq<8> vs2_1(18); 6266 VSeq<2> vs2_2(26); 6267 FloatRegister vs2_3 = v29; 6268 6269 // we also need a pair of corresponding constant sequences 6270 6271 VSeq<8> vc1_1(30, 0); 6272 VSeq<2> vc1_2(30, 0); 6273 FloatRegister vc1_3 = v30; // for kyber_q 6274 6275 VSeq<8> vc2_1(31, 0); 6276 VSeq<2> vc2_2(31, 0); 6277 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier 6278 6279 __ add(result, coeffs, 0); 6280 __ lea(kyberConsts, 6281 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 6282 6283 // load q and the multiplier for the Barrett reduction 6284 __ add(kyberConsts, kyberConsts, 16); 6285 __ ldpq(vc1_3, vc2_3, kyberConsts); 6286 6287 for (int i = 0; i < 3; i++) { 6288 // load 80 or 88 coefficients 6289 vs_ldpq_post(vs1_1, coeffs); 6290 vs_ldpq_post(vs1_2, coeffs); 6291 if (i < 2) { 6292 __ ldr(vs1_3, __ Q, __ post(coeffs, 16)); 6293 } 6294 6295 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16 6296 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1); 6297 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2); 6298 if (i < 2) { 6299 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3); 6300 } 6301 6302 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26 6303 vs_sshr(vs2_1, __ T8H, vs2_1, 11); 6304 vs_sshr(vs2_2, __ T8H, vs2_2, 11); 6305 if (i < 2) { 6306 __ sshr(vs2_3, __ T8H, vs2_3, 11); 6307 } 6308 6309 // vs1 <- vs1 - vs2 * kyber_q 6310 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1); 6311 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2); 6312 if (i < 2) { 6313 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3); 6314 } 6315 6316 vs_stpq_post(vs1_1, result); 6317 vs_stpq_post(vs1_2, result); 6318 if (i < 2) { 6319 __ str(vs1_3, __ Q, __ post(result, 16)); 6320 } 6321 } 6322 6323 __ leave(); // required for proper stackwalking of RuntimeStub frame 6324 __ mov(r0, zr); // return 0 6325 __ ret(lr); 6326 6327 return start; 6328 } 6329 6330 6331 // Dilithium-specific montmul helper routines that generate parallel 6332 // code for, respectively, a single 4x4s vector sequence montmul or 6333 // two such multiplies in a row. 6334 6335 // Perform 16 32-bit Montgomery multiplications in parallel 6336 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 6337 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6338 // Use the helper routine to schedule a 4x4S Montgomery multiply. 6339 // It will assert that the register use is valid 6340 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq); 6341 } 6342 6343 // Perform 2x16 32-bit Montgomery multiplications in parallel 6344 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 6345 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6346 // Schedule two successive 4x4S multiplies via the montmul helper 6347 // on the front and back halves of va, vb and vc. The helper will 6348 // assert that the register use has no overlap conflicts on each 6349 // individual call but we also need to ensure that the necessary 6350 // disjoint/equality constraints are met across both calls. 6351 6352 // vb, vc, vtmp and vq must be disjoint. 
va must either be 6353 // disjoint from all other registers or equal vc 6354 6355 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 6356 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 6357 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 6358 6359 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 6360 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 6361 6362 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 6363 6364 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 6365 assert(vs_disjoint(va, vb), "va and vb overlap"); 6366 assert(vs_disjoint(va, vq), "va and vq overlap"); 6367 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 6368 6369 // We multiply the front and back halves of each sequence 4 at a 6370 // time because 6371 // 6372 // 1) we are currently only able to get 4-way instruction 6373 // parallelism at best 6374 // 6375 // 2) we need registers for the constants in vq and temporary 6376 // scratch registers to hold intermediate results so vtmp can only 6377 // be a VSeq<4> which means we only have 4 scratch slots. 6378 6379 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq); 6380 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq); 6381 } 6382 6383 // Perform combined montmul then add/sub on 4x4S vectors. 6384 void dilithium_montmul16_sub_add( 6385 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc, 6386 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6387 // compute a = montmul(a1, c) 6388 dilithium_montmul16(vc, va1, vc, vtmp, vq); 6389 // ouptut a1 = a0 - a 6390 vs_subv(va1, __ T4S, va0, vc); 6391 // and a0 = a0 + a 6392 vs_addv(va0, __ T4S, va0, vc); 6393 } 6394 6395 // Perform combined add/sub then montul on 4x4S vectors. 6396 void dilithium_sub_add_montmul16( 6397 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb, 6398 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) { 6399 // compute c = a0 - a1 6400 vs_subv(vtmp1, __ T4S, va0, va1); 6401 // output a0 = a0 + a1 6402 vs_addv(va0, __ T4S, va0, va1); 6403 // output a1 = b montmul c 6404 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq); 6405 } 6406 6407 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6408 // in the Java implementation come in sequences of at least 8, so we 6409 // can use ldpq to collect the corresponding data into pairs of vector 6410 // registers. 6411 // We collect the coefficients corresponding to the 'j+l' indexes into 6412 // the vector registers v0-v7, the zetas into the vector registers v16-v23 6413 // then we do the (Montgomery) multiplications by the zetas in parallel 6414 // into v16-v23, load the coeffs corresponding to the 'j' indexes into 6415 // v0-v7, then do the additions into v24-v31 and the subtractions into 6416 // v0-v7 and finally save the results back to the coeffs array. 6417 void dilithiumNttLevel0_4(const Register dilithiumConsts, 6418 const Register coeffs, const Register zetas) { 6419 int c1 = 0; 6420 int c2 = 512; 6421 int startIncr; 6422 // don't use callee save registers v8 - v15 6423 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6424 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6425 VSeq<2> vq(30); // n.b. 
constants overlap vs3 6426 int offsets[4] = { 0, 32, 64, 96 }; 6427 6428 for (int level = 0; level < 5; level++) { 6429 int c1Start = c1; 6430 int c2Start = c2; 6431 if (level == 3) { 6432 offsets[1] = 32; 6433 offsets[2] = 128; 6434 offsets[3] = 160; 6435 } else if (level == 4) { 6436 offsets[1] = 64; 6437 offsets[2] = 128; 6438 offsets[3] = 192; 6439 } 6440 6441 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a 6442 // time at 4 different offsets and multiply them in order by the 6443 // next set of input values. So we employ indexed load and store 6444 // pair instructions with arrangement 4S. 6445 for (int i = 0; i < 4; i++) { 6446 // reload q and qinv 6447 vs_ldpq(vq, dilithiumConsts); // qInv, q 6448 // load 8x4S coefficients via second start pos == c2 6449 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets); 6450 // load next 8x4S inputs == b 6451 vs_ldpq_post(vs2, zetas); 6452 // compute a == c2 * b mod MONT_Q 6453 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6454 // load 8x4s coefficients via first start pos == c1 6455 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6456 // compute a1 = c1 + a 6457 vs_addv(vs3, __ T4S, vs1, vs2); 6458 // compute a2 = c1 - a 6459 vs_subv(vs1, __ T4S, vs1, vs2); 6460 // output a1 and a2 6461 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6462 vs_stpq_indexed(vs1, coeffs, c2Start, offsets); 6463 6464 int k = 4 * level + i; 6465 6466 if (k > 7) { 6467 startIncr = 256; 6468 } else if (k == 5) { 6469 startIncr = 384; 6470 } else { 6471 startIncr = 128; 6472 } 6473 6474 c1Start += startIncr; 6475 c2Start += startIncr; 6476 } 6477 6478 c2 /= 2; 6479 } 6480 } 6481 6482 // Dilithium NTT function except for the final "normalization" to |coeff| < Q. 6483 // Implements the method 6484 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {} 6485 // of the Java class sun.security.provider 6486 // 6487 // coeffs (int[256]) = c_rarg0 6488 // zetas (int[256]) = c_rarg1 6489 address generate_dilithiumAlmostNtt() { 6490 6491 __ align(CodeEntryAlignment); 6492 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id; 6493 StubCodeMark mark(this, stub_id); 6494 address start = __ pc(); 6495 __ enter(); 6496 6497 const Register coeffs = c_rarg0; 6498 const Register zetas = c_rarg1; 6499 6500 const Register tmpAddr = r9; 6501 const Register dilithiumConsts = r10; 6502 const Register result = r11; 6503 // don't use callee save registers v8 - v15 6504 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6505 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6506 VSeq<2> vq(30); // n.b. constants overlap vs3 6507 int offsets[4] = { 0, 32, 64, 96}; 6508 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6509 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6510 __ add(result, coeffs, 0); 6511 __ lea(dilithiumConsts, 6512 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6513 6514 // Each level represents one iteration of the outer for loop of the Java version. 6515 6516 // level 0-4 6517 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas); 6518 6519 // level 5 6520 6521 // At level 5 the coefficients we need to combine with the zetas 6522 // are grouped in memory in blocks of size 4. So, for both sets of 6523 // coefficients we load 4 adjacent values at 8 different offsets 6524 // using an indexed ldr with register variant Q and multiply them 6525 // in sequence order by the next set of inputs. Likewise we store 6526 // the resuls using an indexed str with register variant Q. 
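    // For orientation, a rough scalar sketch of what one block of a
    // forward NTT level computes (this is not the generated code;
    // montMul is just an illustrative name for the Montgomery product
    // a * b * R^-1 mod q used on the Java side, with q = 8380417, and
    // start, l and zeta are schematic):
    //
    //   for (int j = start; j < start + l; j++) {
    //     int t = montMul(zeta, coeffs[j + l]);
    //     coeffs[j + l] = coeffs[j] - t;
    //     coeffs[j]     = coeffs[j] + t;
    //   }
    //
    // All butterflies in a block share one zeta. The level 5, 6 and 7
    // loops below (like the level 0-4 helper above) perform 32 or 16
    // such butterflies per iteration; the levels differ only in how the
    // coeffs[j] and coeffs[j + l] operands are gathered from and
    // scattered back to memory.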
    for (int i = 0; i < 1024; i += 256) {
      // reload constants q, qinv each iteration as they get clobbered later
      vs_ldpq(vq, dilithiumConsts); // qInv, q
      // load 32 (8x4S) coefficients via first offsets = c1
      vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
      // load next 32 (8x4S) inputs = b
      vs_ldpq_post(vs2, zetas);
      // a = b montmul c1
      dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
      // load 32 (8x4S) coefficients via second offsets = c2
      vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
      // add/sub with result of multiply
      vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a
      vs_subv(vs1, __ T4S, vs1, vs2); // a2 = c2 - a
      // write back new coefficients using same offsets
      vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
      vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
    }

    // level 6
    // At level 6 the coefficients we need to combine with the zetas
    // are grouped in memory in pairs, the first two being montmul
    // inputs and the second add/sub inputs. We can still implement
    // the montmul+sub+add using 4-way parallelism but only if we
    // combine the coefficients with the zetas 16 at a time. We load 8
    // adjacent values at 4 different offsets using an ld2 load with
    // arrangement 2D. That interleaves the lower and upper halves of
    // each pair of quadwords into successive vector registers. We
    // then need to montmul the 4 even elements of the coefficients
    // register sequence by the zetas in order and then add/sub the 4
    // odd elements of the coefficients register sequence. We use an
    // equivalent st2 operation to store the results back into memory
    // de-interleaved.
    for (int i = 0; i < 1024; i += 128) {
      // reload constants q, qinv each iteration as they get clobbered later
      vs_ldpq(vq, dilithiumConsts); // qInv, q
      // load interleaved 16 (4x2D) coefficients via offsets
      vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
      // load next 16 (4x4S) inputs
      vs_ldpq_post(vs_front(vs2), zetas);
      // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
      dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
                                  vs_front(vs2), vtmp, vq);
      // store interleaved 16 (4x2D) coefficients via offsets
      vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
    }

    // level 7
    // At level 7 the coefficients we need to combine with the zetas
    // occur singly, with montmul inputs alternating with add/sub
    // inputs. Once again we can use 4-way parallelism to combine 16
    // zetas at a time. However, we have to load 8 adjacent values at
    // 4 different offsets using an ld2 load with arrangement 4S. That
    // interleaves the odd words of each pair into one
    // coefficients vector register and the even words of the pair
    // into the next register. We then need to montmul the 4 even
    // elements of the coefficients register sequence by the zetas in
    // order and then add/sub the 4 odd elements of the coefficients
    // register sequence. We use an equivalent st2 operation to store
    // the results back into memory de-interleaved.
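    // For reference, the ld2/st2 pattern relied on above and below
    // works as follows (a sketch of the instruction semantics only, not
    // of the vs_ld2_indexed/vs_st2_indexed helpers): an ld2 of two 4S
    // registers from eight consecutive 32-bit words m[0..7]
    // de-interleaves them as
    //
    //   first register  <- { m[0], m[2], m[4], m[6] }
    //   second register <- { m[1], m[3], m[5], m[7] }
    //
    // and the matching st2 re-interleaves them on the way back to
    // memory. With arrangement 2D the split happens at 64-bit
    // granularity instead, so adjacent pairs of 32-bit coefficients
    // travel between memory and the two registers as single units.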
6587 6588 for (int i = 0; i < 1024; i += 128) { 6589 // reload constants q, qinv each iteration as they get clobbered later 6590 vs_ldpq(vq, dilithiumConsts); // qInv, q 6591 // load interleaved 16 (4x4S) coefficients via offsets 6592 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6593 // load next 16 (4x4S) inputs 6594 vs_ldpq_post(vs_front(vs2), zetas); 6595 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6596 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6597 vs_front(vs2), vtmp, vq); 6598 // store interleaved 16 (4x4S) coefficients via offsets 6599 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6600 } 6601 __ leave(); // required for proper stackwalking of RuntimeStub frame 6602 __ mov(r0, zr); // return 0 6603 __ ret(lr); 6604 6605 return start; 6606 } 6607 6608 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6609 // in the Java implementation come in sequences of at least 8, so we 6610 // can use ldpq to collect the corresponding data into pairs of vector 6611 // registers 6612 // We collect the coefficients that correspond to the 'j's into vs1 6613 // the coefficiets that correspond to the 'j+l's into vs2 then 6614 // do the additions into vs3 and the subtractions into vs1 then 6615 // save the result of the additions, load the zetas into vs2 6616 // do the (Montgomery) multiplications by zeta in parallel into vs2 6617 // finally save the results back to the coeffs array 6618 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts, 6619 const Register coeffs, const Register zetas) { 6620 int c1 = 0; 6621 int c2 = 32; 6622 int startIncr; 6623 int offsets[4]; 6624 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6625 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6626 VSeq<2> vq(30); // n.b. constants overlap vs3 6627 6628 offsets[0] = 0; 6629 6630 for (int level = 3; level < 8; level++) { 6631 int c1Start = c1; 6632 int c2Start = c2; 6633 if (level == 3) { 6634 offsets[1] = 64; 6635 offsets[2] = 128; 6636 offsets[3] = 192; 6637 } else if (level == 4) { 6638 offsets[1] = 32; 6639 offsets[2] = 128; 6640 offsets[3] = 160; 6641 } else { 6642 offsets[1] = 32; 6643 offsets[2] = 64; 6644 offsets[3] = 96; 6645 } 6646 6647 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a 6648 // time at 4 different offsets and multiply them in order by the 6649 // next set of input values. So we employ indexed load and store 6650 // pair instructions with arrangement 4S. 6651 for (int i = 0; i < 4; i++) { 6652 // load v1 32 (8x4S) coefficients relative to first start index 6653 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6654 // load v2 32 (8x4S) coefficients relative to second start index 6655 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets); 6656 // a0 = v1 + v2 -- n.b. 
clobbers vqs 6657 vs_addv(vs3, __ T4S, vs1, vs2); 6658 // a1 = v1 - v2 6659 vs_subv(vs1, __ T4S, vs1, vs2); 6660 // save a1 relative to first start index 6661 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6662 // load constants q, qinv each iteration as they get clobbered above 6663 vs_ldpq(vq, dilithiumConsts); // qInv, q 6664 // load b next 32 (8x4S) inputs 6665 vs_ldpq_post(vs2, zetas); 6666 // a = a1 montmul b 6667 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6668 // save a relative to second start index 6669 vs_stpq_indexed(vs2, coeffs, c2Start, offsets); 6670 6671 int k = 4 * level + i; 6672 6673 if (k < 24) { 6674 startIncr = 256; 6675 } else if (k == 25) { 6676 startIncr = 384; 6677 } else { 6678 startIncr = 128; 6679 } 6680 6681 c1Start += startIncr; 6682 c2Start += startIncr; 6683 } 6684 6685 c2 *= 2; 6686 } 6687 } 6688 6689 // Dilithium Inverse NTT function except the final mod Q division by 2^256. 6690 // Implements the method 6691 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of 6692 // the sun.security.provider.ML_DSA class. 6693 // 6694 // coeffs (int[256]) = c_rarg0 6695 // zetas (int[256]) = c_rarg1 6696 address generate_dilithiumAlmostInverseNtt() { 6697 6698 __ align(CodeEntryAlignment); 6699 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id; 6700 StubCodeMark mark(this, stub_id); 6701 address start = __ pc(); 6702 __ enter(); 6703 6704 const Register coeffs = c_rarg0; 6705 const Register zetas = c_rarg1; 6706 6707 const Register tmpAddr = r9; 6708 const Register dilithiumConsts = r10; 6709 const Register result = r11; 6710 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6711 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6712 VSeq<2> vq(30); // n.b. constants overlap vs3 6713 int offsets[4] = { 0, 32, 64, 96 }; 6714 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6715 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6716 6717 __ add(result, coeffs, 0); 6718 __ lea(dilithiumConsts, 6719 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6720 6721 // Each level represents one iteration of the outer for loop of the Java version 6722 6723 // level 0 6724 // At level 0 we need to interleave adjacent quartets of 6725 // coefficients before we multiply and add/sub by the next 16 6726 // zetas just as we did for level 7 in the multiply code. So we 6727 // load and store the values using an ld2/st2 with arrangement 4S. 6728 for (int i = 0; i < 1024; i += 128) { 6729 // load constants q, qinv 6730 // n.b. this can be moved out of the loop as they do not get 6731 // clobbered by first two loops 6732 vs_ldpq(vq, dilithiumConsts); // qInv, q 6733 // a0/a1 load interleaved 32 (8x4S) coefficients 6734 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6735 // b load next 32 (8x4S) inputs 6736 vs_ldpq_post(vs_front(vs2), zetas); 6737 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6738 // n.b. second half of vs2 provides temporary register storage 6739 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6740 vs_front(vs2), vs_back(vs2), vtmp, vq); 6741 // a0/a1 store interleaved 32 (8x4S) coefficients 6742 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6743 } 6744 6745 // level 1 6746 // At level 1 we need to interleave pairs of adjacent pairs of 6747 // coefficients before we multiply by the next 16 zetas just as we 6748 // did for level 6 in the multiply code. 
So we load and store the 6749 // values an ld2/st2 with arrangement 2D. 6750 for (int i = 0; i < 1024; i += 128) { 6751 // a0/a1 load interleaved 32 (8x2D) coefficients 6752 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6753 // b load next 16 (4x4S) inputs 6754 vs_ldpq_post(vs_front(vs2), zetas); 6755 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6756 // n.b. second half of vs2 provides temporary register storage 6757 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6758 vs_front(vs2), vs_back(vs2), vtmp, vq); 6759 // a0/a1 store interleaved 32 (8x2D) coefficients 6760 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6761 } 6762 6763 // level 2 6764 // At level 2 coefficients come in blocks of 4. So, we load 4 6765 // adjacent coefficients at 8 distinct offsets for both the first 6766 // and second coefficient sequences, using an ldr with register 6767 // variant Q then combine them with next set of 32 zetas. Likewise 6768 // we store the results using an str with register variant Q. 6769 for (int i = 0; i < 1024; i += 256) { 6770 // c0 load 32 (8x4S) coefficients via first offsets 6771 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6772 // c1 load 32 (8x4S) coefficients via second offsets 6773 vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2); 6774 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3 6775 vs_addv(vs3, __ T4S, vs1, vs2); 6776 // c = c0 - c1 6777 vs_subv(vs1, __ T4S, vs1, vs2); 6778 // store a0 32 (8x4S) coefficients via first offsets 6779 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1); 6780 // b load 32 (8x4S) next inputs 6781 vs_ldpq_post(vs2, zetas); 6782 // reload constants q, qinv -- they were clobbered earlier 6783 vs_ldpq(vq, dilithiumConsts); // qInv, q 6784 // compute a1 = b montmul c 6785 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6786 // store a1 32 (8x4S) coefficients via second offsets 6787 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2); 6788 } 6789 6790 // level 3-7 6791 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas); 6792 6793 __ leave(); // required for proper stackwalking of RuntimeStub frame 6794 __ mov(r0, zr); // return 0 6795 __ ret(lr); 6796 6797 return start; 6798 } 6799 6800 // Dilithium multiply polynomials in the NTT domain. 6801 // Straightforward implementation of the method 6802 // static int implDilithiumNttMult( 6803 // int[] result, int[] ntta, int[] nttb {} of 6804 // the sun.security.provider.ML_DSA class. 6805 // 6806 // result (int[256]) = c_rarg0 6807 // poly1 (int[256]) = c_rarg1 6808 // poly2 (int[256]) = c_rarg2 6809 address generate_dilithiumNttMult() { 6810 6811 __ align(CodeEntryAlignment); 6812 StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id; 6813 StubCodeMark mark(this, stub_id); 6814 address start = __ pc(); 6815 __ enter(); 6816 6817 Label L_loop; 6818 6819 const Register result = c_rarg0; 6820 const Register poly1 = c_rarg1; 6821 const Register poly2 = c_rarg2; 6822 6823 const Register dilithiumConsts = r10; 6824 const Register len = r11; 6825 6826 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6827 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6828 VSeq<2> vq(30); // n.b. 
constants overlap vs3 6829 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE 6830 6831 __ lea(dilithiumConsts, 6832 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6833 6834 // load constants q, qinv 6835 vs_ldpq(vq, dilithiumConsts); // qInv, q 6836 // load constant rSquare into v29 6837 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare 6838 6839 __ mov(len, zr); 6840 __ add(len, len, 1024); 6841 6842 __ BIND(L_loop); 6843 6844 // b load 32 (8x4S) next inputs from poly1 6845 vs_ldpq_post(vs1, poly1); 6846 // c load 32 (8x4S) next inputs from poly2 6847 vs_ldpq_post(vs2, poly2); 6848 // compute a = b montmul c 6849 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6850 // compute a = rsquare montmul a 6851 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq); 6852 // save a 32 (8x4S) results 6853 vs_stpq_post(vs2, result); 6854 6855 __ sub(len, len, 128); 6856 __ cmp(len, (u1)128); 6857 __ br(Assembler::GE, L_loop); 6858 6859 __ leave(); // required for proper stackwalking of RuntimeStub frame 6860 __ mov(r0, zr); // return 0 6861 __ ret(lr); 6862 6863 return start; 6864 } 6865 6866 // Dilithium Motgomery multiply an array by a constant. 6867 // A straightforward implementation of the method 6868 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {} 6869 // of the sun.security.provider.MLDSA class 6870 // 6871 // coeffs (int[256]) = c_rarg0 6872 // constant (int) = c_rarg1 6873 address generate_dilithiumMontMulByConstant() { 6874 6875 __ align(CodeEntryAlignment); 6876 StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id; 6877 StubCodeMark mark(this, stub_id); 6878 address start = __ pc(); 6879 __ enter(); 6880 6881 Label L_loop; 6882 6883 const Register coeffs = c_rarg0; 6884 const Register constant = c_rarg1; 6885 6886 const Register dilithiumConsts = r10; 6887 const Register result = r11; 6888 const Register len = r12; 6889 6890 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6891 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6892 VSeq<2> vq(30); // n.b. constants overlap vs3 6893 VSeq<8> vconst(29, 0); // for montmul by constant 6894 6895 // results track inputs 6896 __ add(result, coeffs, 0); 6897 __ lea(dilithiumConsts, 6898 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6899 6900 // load constants q, qinv -- they do not get clobbered by first two loops 6901 vs_ldpq(vq, dilithiumConsts); // qInv, q 6902 // copy caller supplied constant across vconst 6903 __ dup(vconst[0], __ T4S, constant); 6904 __ mov(len, zr); 6905 __ add(len, len, 1024); 6906 6907 __ BIND(L_loop); 6908 6909 // load next 32 inputs 6910 vs_ldpq_post(vs2, coeffs); 6911 // mont mul by constant 6912 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq); 6913 // write next 32 results 6914 vs_stpq_post(vs2, result); 6915 6916 __ sub(len, len, 128); 6917 __ cmp(len, (u1)128); 6918 __ br(Assembler::GE, L_loop); 6919 6920 __ leave(); // required for proper stackwalking of RuntimeStub frame 6921 __ mov(r0, zr); // return 0 6922 __ ret(lr); 6923 6924 return start; 6925 } 6926 6927 // Dilithium decompose poly. 
6928 // Implements the method 6929 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {} 6930 // of the sun.security.provider.ML_DSA class 6931 // 6932 // input (int[256]) = c_rarg0 6933 // lowPart (int[256]) = c_rarg1 6934 // highPart (int[256]) = c_rarg2 6935 // twoGamma2 (int) = c_rarg3 6936 // multiplier (int) = c_rarg4 6937 address generate_dilithiumDecomposePoly() { 6938 6939 __ align(CodeEntryAlignment); 6940 StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id; 6941 StubCodeMark mark(this, stub_id); 6942 address start = __ pc(); 6943 Label L_loop; 6944 6945 const Register input = c_rarg0; 6946 const Register lowPart = c_rarg1; 6947 const Register highPart = c_rarg2; 6948 const Register twoGamma2 = c_rarg3; 6949 const Register multiplier = c_rarg4; 6950 6951 const Register len = r9; 6952 const Register dilithiumConsts = r10; 6953 const Register tmp = r11; 6954 6955 // 6 independent sets of 4x4s values 6956 VSeq<4> vs1(0), vs2(4), vs3(8); 6957 VSeq<4> vs4(12), vs5(16), vtmp(20); 6958 6959 // 7 constants for cross-multiplying 6960 VSeq<4> one(25, 0); 6961 VSeq<4> qminus1(26, 0); 6962 VSeq<4> g2(27, 0); 6963 VSeq<4> twog2(28, 0); 6964 VSeq<4> mult(29, 0); 6965 VSeq<4> q(30, 0); 6966 VSeq<4> qadd(31, 0); 6967 6968 __ enter(); 6969 6970 __ lea(dilithiumConsts, 6971 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6972 6973 // save callee-saved registers 6974 __ stpd(v8, v9, __ pre(sp, -64)); 6975 __ stpd(v10, v11, Address(sp, 16)); 6976 __ stpd(v12, v13, Address(sp, 32)); 6977 __ stpd(v14, v15, Address(sp, 48)); 6978 6979 // populate constant registers 6980 __ mov(tmp, zr); 6981 __ add(tmp, tmp, 1); 6982 __ dup(one[0], __ T4S, tmp); // 1 6983 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q 6984 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce 6985 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2 6986 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce 6987 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1 6988 __ sshr(g2[0], __ T4S, v28, 1); // gamma2 6989 6990 __ mov(len, zr); 6991 __ add(len, len, 1024); 6992 6993 __ BIND(L_loop); 6994 6995 // load next 4x4S inputs interleaved: rplus --> vs1 6996 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64)); 6997 6998 // rplus = rplus - ((rplus + qadd) >> 23) * q 6999 vs_addv(vtmp, __ T4S, vs1, qadd); 7000 vs_sshr(vtmp, __ T4S, vtmp, 23); 7001 vs_mulv(vtmp, __ T4S, vtmp, q); 7002 vs_subv(vs1, __ T4S, vs1, vtmp); 7003 7004 // rplus = rplus + ((rplus >> 31) & dilithium_q); 7005 vs_sshr(vtmp, __ T4S, vs1, 31); 7006 vs_andr(vtmp, vtmp, q); 7007 vs_addv(vs1, __ T4S, vs1, vtmp); 7008 7009 // quotient --> vs2 7010 // int quotient = (rplus * multiplier) >> 22; 7011 vs_mulv(vtmp, __ T4S, vs1, mult); 7012 vs_sshr(vs2, __ T4S, vtmp, 22); 7013 7014 // r0 --> vs3 7015 // int r0 = rplus - quotient * twoGamma2; 7016 vs_mulv(vtmp, __ T4S, vs2, twog2); 7017 vs_subv(vs3, __ T4S, vs1, vtmp); 7018 7019 // mask --> vs4 7020 // int mask = (twoGamma2 - r0) >> 22; 7021 vs_subv(vtmp, __ T4S, twog2, vs3); 7022 vs_sshr(vs4, __ T4S, vtmp, 22); 7023 7024 // r0 -= (mask & twoGamma2); 7025 vs_andr(vtmp, vs4, twog2); 7026 vs_subv(vs3, __ T4S, vs3, vtmp); 7027 7028 // quotient += (mask & 1); 7029 vs_andr(vtmp, vs4, one); 7030 vs_addv(vs2, __ T4S, vs2, vtmp); 7031 7032 // mask = (twoGamma2 / 2 - r0) >> 31; 7033 vs_subv(vtmp, __ T4S, g2, vs3); 7034 vs_sshr(vs4, __ T4S, vtmp, 31); 7035 7036 // r0 -= (mask & twoGamma2); 7037 vs_andr(vtmp, vs4, twog2); 7038 
vs_subv(vs3, __ T4S, vs3, vtmp); 7039 7040 // quotient += (mask & 1); 7041 vs_andr(vtmp, vs4, one); 7042 vs_addv(vs2, __ T4S, vs2, vtmp); 7043 7044 // r1 --> vs5 7045 // int r1 = rplus - r0 - (dilithium_q - 1); 7046 vs_subv(vtmp, __ T4S, vs1, vs3); 7047 vs_subv(vs5, __ T4S, vtmp, qminus1); 7048 7049 // r1 --> vs1 (overwriting rplus) 7050 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 7051 vs_negr(vtmp, __ T4S, vs5); 7052 vs_orr(vtmp, vs5, vtmp); 7053 vs_sshr(vs1, __ T4S, vtmp, 31); 7054 7055 // r0 += ~r1; 7056 vs_notr(vtmp, vs1); 7057 vs_addv(vs3, __ T4S, vs3, vtmp); 7058 7059 // r1 = r1 & quotient; 7060 vs_andr(vs1, vs2, vs1); 7061 7062 // store results inteleaved 7063 // lowPart[m] = r0; 7064 // highPart[m] = r1; 7065 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64)); 7066 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64)); 7067 7068 __ sub(len, len, 64); 7069 __ cmp(len, (u1)64); 7070 __ br(Assembler::GE, L_loop); 7071 7072 // restore callee-saved vector registers 7073 __ ldpd(v14, v15, Address(sp, 48)); 7074 __ ldpd(v12, v13, Address(sp, 32)); 7075 __ ldpd(v10, v11, Address(sp, 16)); 7076 __ ldpd(v8, v9, __ post(sp, 64)); 7077 7078 __ leave(); // required for proper stackwalking of RuntimeStub frame 7079 __ mov(r0, zr); // return 0 7080 __ ret(lr); 7081 7082 return start; 7083 } 7084 7085 /** 7086 * Arguments: 7087 * 7088 * Inputs: 7089 * c_rarg0 - int crc 7090 * c_rarg1 - byte* buf 7091 * c_rarg2 - int length 7092 * 7093 * Output: 7094 * rax - int crc result 7095 */ 7096 address generate_updateBytesCRC32() { 7097 assert(UseCRC32Intrinsics, "what are we doing here?"); 7098 7099 __ align(CodeEntryAlignment); 7100 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id; 7101 StubCodeMark mark(this, stub_id); 7102 7103 address start = __ pc(); 7104 7105 const Register crc = c_rarg0; // crc 7106 const Register buf = c_rarg1; // source java byte array address 7107 const Register len = c_rarg2; // length 7108 const Register table0 = c_rarg3; // crc_table address 7109 const Register table1 = c_rarg4; 7110 const Register table2 = c_rarg5; 7111 const Register table3 = c_rarg6; 7112 const Register tmp3 = c_rarg7; 7113 7114 BLOCK_COMMENT("Entry:"); 7115 __ enter(); // required for proper stackwalking of RuntimeStub frame 7116 7117 __ kernel_crc32(crc, buf, len, 7118 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7119 7120 __ leave(); // required for proper stackwalking of RuntimeStub frame 7121 __ ret(lr); 7122 7123 return start; 7124 } 7125 7126 /** 7127 * Arguments: 7128 * 7129 * Inputs: 7130 * c_rarg0 - int crc 7131 * c_rarg1 - byte* buf 7132 * c_rarg2 - int length 7133 * c_rarg3 - int* table 7134 * 7135 * Output: 7136 * r0 - int crc result 7137 */ 7138 address generate_updateBytesCRC32C() { 7139 assert(UseCRC32CIntrinsics, "what are we doing here?"); 7140 7141 __ align(CodeEntryAlignment); 7142 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id; 7143 StubCodeMark mark(this, stub_id); 7144 7145 address start = __ pc(); 7146 7147 const Register crc = c_rarg0; // crc 7148 const Register buf = c_rarg1; // source java byte array address 7149 const Register len = c_rarg2; // length 7150 const Register table0 = c_rarg3; // crc_table address 7151 const Register table1 = c_rarg4; 7152 const Register table2 = c_rarg5; 7153 const Register table3 = c_rarg6; 7154 const Register tmp3 = c_rarg7; 7155 7156 BLOCK_COMMENT("Entry:"); 7157 __ enter(); // required for proper stackwalking of RuntimeStub frame 7158 
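    // The heavy lifting is delegated to MacroAssembler::kernel_crc32c
    // below. For reference only (this is not the code generated here,
    // and crc32c_update_byte is just an illustrative name), one byte of
    // a CRC32C update over the Castagnoli polynomial (0x1EDC6F41,
    // reflected form 0x82F63B78) can be written as:
    //
    //   uint32_t crc32c_update_byte(uint32_t crc, uint8_t b) {
    //     crc ^= b;
    //     for (int i = 0; i < 8; i++) {
    //       crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
    //     }
    //     return crc;
    //   }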
7159 __ kernel_crc32c(crc, buf, len, 7160 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7161 7162 __ leave(); // required for proper stackwalking of RuntimeStub frame 7163 __ ret(lr); 7164 7165 return start; 7166 } 7167 7168 /*** 7169 * Arguments: 7170 * 7171 * Inputs: 7172 * c_rarg0 - int adler 7173 * c_rarg1 - byte* buff 7174 * c_rarg2 - int len 7175 * 7176 * Output: 7177 * c_rarg0 - int adler result 7178 */ 7179 address generate_updateBytesAdler32() { 7180 __ align(CodeEntryAlignment); 7181 StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id; 7182 StubCodeMark mark(this, stub_id); 7183 address start = __ pc(); 7184 7185 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 7186 7187 // Aliases 7188 Register adler = c_rarg0; 7189 Register s1 = c_rarg0; 7190 Register s2 = c_rarg3; 7191 Register buff = c_rarg1; 7192 Register len = c_rarg2; 7193 Register nmax = r4; 7194 Register base = r5; 7195 Register count = r6; 7196 Register temp0 = rscratch1; 7197 Register temp1 = rscratch2; 7198 FloatRegister vbytes = v0; 7199 FloatRegister vs1acc = v1; 7200 FloatRegister vs2acc = v2; 7201 FloatRegister vtable = v3; 7202 7203 // Max number of bytes we can process before having to take the mod 7204 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 7205 uint64_t BASE = 0xfff1; 7206 uint64_t NMAX = 0x15B0; 7207 7208 __ mov(base, BASE); 7209 __ mov(nmax, NMAX); 7210 7211 // Load accumulation coefficients for the upper 16 bits 7212 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 7213 __ ld1(vtable, __ T16B, Address(temp0)); 7214 7215 // s1 is initialized to the lower 16 bits of adler 7216 // s2 is initialized to the upper 16 bits of adler 7217 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 7218 __ uxth(s1, adler); // s1 = (adler & 0xffff) 7219 7220 // The pipelined loop needs at least 16 elements for 1 iteration 7221 // It does check this, but it is more effective to skip to the cleanup loop 7222 __ cmp(len, (u1)16); 7223 __ br(Assembler::HS, L_nmax); 7224 __ cbz(len, L_combine); 7225 7226 __ bind(L_simple_by1_loop); 7227 __ ldrb(temp0, Address(__ post(buff, 1))); 7228 __ add(s1, s1, temp0); 7229 __ add(s2, s2, s1); 7230 __ subs(len, len, 1); 7231 __ br(Assembler::HI, L_simple_by1_loop); 7232 7233 // s1 = s1 % BASE 7234 __ subs(temp0, s1, base); 7235 __ csel(s1, temp0, s1, Assembler::HS); 7236 7237 // s2 = s2 % BASE 7238 __ lsr(temp0, s2, 16); 7239 __ lsl(temp1, temp0, 4); 7240 __ sub(temp1, temp1, temp0); 7241 __ add(s2, temp1, s2, ext::uxth); 7242 7243 __ subs(temp0, s2, base); 7244 __ csel(s2, temp0, s2, Assembler::HS); 7245 7246 __ b(L_combine); 7247 7248 __ bind(L_nmax); 7249 __ subs(len, len, nmax); 7250 __ sub(count, nmax, 16); 7251 __ br(Assembler::LO, L_by16); 7252 7253 __ bind(L_nmax_loop); 7254 7255 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7256 vbytes, vs1acc, vs2acc, vtable); 7257 7258 __ subs(count, count, 16); 7259 __ br(Assembler::HS, L_nmax_loop); 7260 7261 // s1 = s1 % BASE 7262 __ lsr(temp0, s1, 16); 7263 __ lsl(temp1, temp0, 4); 7264 __ sub(temp1, temp1, temp0); 7265 __ add(temp1, temp1, s1, ext::uxth); 7266 7267 __ lsr(temp0, temp1, 16); 7268 __ lsl(s1, temp0, 4); 7269 __ sub(s1, s1, temp0); 7270 __ add(s1, s1, temp1, ext:: uxth); 7271 7272 __ subs(temp0, s1, base); 7273 __ csel(s1, temp0, s1, Assembler::HS); 7274 7275 // s2 = s2 % BASE 7276 __ lsr(temp0, s2, 16); 7277 __ lsl(temp1, temp0, 4); 7278 __ 
sub(temp1, temp1, temp0); 7279 __ add(temp1, temp1, s2, ext::uxth); 7280 7281 __ lsr(temp0, temp1, 16); 7282 __ lsl(s2, temp0, 4); 7283 __ sub(s2, s2, temp0); 7284 __ add(s2, s2, temp1, ext:: uxth); 7285 7286 __ subs(temp0, s2, base); 7287 __ csel(s2, temp0, s2, Assembler::HS); 7288 7289 __ subs(len, len, nmax); 7290 __ sub(count, nmax, 16); 7291 __ br(Assembler::HS, L_nmax_loop); 7292 7293 __ bind(L_by16); 7294 __ adds(len, len, count); 7295 __ br(Assembler::LO, L_by1); 7296 7297 __ bind(L_by16_loop); 7298 7299 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7300 vbytes, vs1acc, vs2acc, vtable); 7301 7302 __ subs(len, len, 16); 7303 __ br(Assembler::HS, L_by16_loop); 7304 7305 __ bind(L_by1); 7306 __ adds(len, len, 15); 7307 __ br(Assembler::LO, L_do_mod); 7308 7309 __ bind(L_by1_loop); 7310 __ ldrb(temp0, Address(__ post(buff, 1))); 7311 __ add(s1, temp0, s1); 7312 __ add(s2, s2, s1); 7313 __ subs(len, len, 1); 7314 __ br(Assembler::HS, L_by1_loop); 7315 7316 __ bind(L_do_mod); 7317 // s1 = s1 % BASE 7318 __ lsr(temp0, s1, 16); 7319 __ lsl(temp1, temp0, 4); 7320 __ sub(temp1, temp1, temp0); 7321 __ add(temp1, temp1, s1, ext::uxth); 7322 7323 __ lsr(temp0, temp1, 16); 7324 __ lsl(s1, temp0, 4); 7325 __ sub(s1, s1, temp0); 7326 __ add(s1, s1, temp1, ext:: uxth); 7327 7328 __ subs(temp0, s1, base); 7329 __ csel(s1, temp0, s1, Assembler::HS); 7330 7331 // s2 = s2 % BASE 7332 __ lsr(temp0, s2, 16); 7333 __ lsl(temp1, temp0, 4); 7334 __ sub(temp1, temp1, temp0); 7335 __ add(temp1, temp1, s2, ext::uxth); 7336 7337 __ lsr(temp0, temp1, 16); 7338 __ lsl(s2, temp0, 4); 7339 __ sub(s2, s2, temp0); 7340 __ add(s2, s2, temp1, ext:: uxth); 7341 7342 __ subs(temp0, s2, base); 7343 __ csel(s2, temp0, s2, Assembler::HS); 7344 7345 // Combine lower bits and higher bits 7346 __ bind(L_combine); 7347 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 7348 7349 __ ret(lr); 7350 7351 return start; 7352 } 7353 7354 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 7355 Register temp0, Register temp1, FloatRegister vbytes, 7356 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 7357 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 7358 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 7359 // In non-vectorized code, we update s1 and s2 as: 7360 // s1 <- s1 + b1 7361 // s2 <- s2 + s1 7362 // s1 <- s1 + b2 7363 // s2 <- s2 + b1 7364 // ... 7365 // s1 <- s1 + b16 7366 // s2 <- s2 + s1 7367 // Putting above assignments together, we have: 7368 // s1_new = s1 + b1 + b2 + ... + b16 7369 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 7370 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 7371 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 7372 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 7373 7374 // s2 = s2 + s1 * 16 7375 __ add(s2, s2, s1, Assembler::LSL, 4); 7376 7377 // vs1acc = b1 + b2 + b3 + ... + b16 7378 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 7379 __ umullv(vs2acc, __ T8B, vtable, vbytes); 7380 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 7381 __ uaddlv(vs1acc, __ T16B, vbytes); 7382 __ uaddlv(vs2acc, __ T8H, vs2acc); 7383 7384 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 7385 __ fmovd(temp0, vs1acc); 7386 __ fmovd(temp1, vs2acc); 7387 __ add(s1, s1, temp0); 7388 __ add(s2, s2, temp1); 7389 } 7390 7391 /** 7392 * Arguments: 7393 * 7394 * Input: 7395 * c_rarg0 - x address 7396 * c_rarg1 - x length 7397 * c_rarg2 - y address 7398 * c_rarg3 - y length 7399 * c_rarg4 - z address 7400 */ 7401 address generate_multiplyToLen() { 7402 __ align(CodeEntryAlignment); 7403 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 7404 StubCodeMark mark(this, stub_id); 7405 7406 address start = __ pc(); 7407 7408 if (AOTCodeCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) { 7409 return start; 7410 } 7411 const Register x = r0; 7412 const Register xlen = r1; 7413 const Register y = r2; 7414 const Register ylen = r3; 7415 const Register z = r4; 7416 7417 const Register tmp0 = r5; 7418 const Register tmp1 = r10; 7419 const Register tmp2 = r11; 7420 const Register tmp3 = r12; 7421 const Register tmp4 = r13; 7422 const Register tmp5 = r14; 7423 const Register tmp6 = r15; 7424 const Register tmp7 = r16; 7425 7426 BLOCK_COMMENT("Entry:"); 7427 __ enter(); // required for proper stackwalking of RuntimeStub frame 7428 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7429 __ leave(); // required for proper stackwalking of RuntimeStub frame 7430 __ ret(lr); 7431 7432 AOTCodeCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start); 7433 return start; 7434 } 7435 7436 address generate_squareToLen() { 7437 // squareToLen algorithm for sizes 1..127 described in java code works 7438 // faster than multiply_to_len on some CPUs and slower on others, but 7439 // multiply_to_len shows a bit better overall results 7440 __ align(CodeEntryAlignment); 7441 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 7442 StubCodeMark mark(this, stub_id); 7443 address start = __ pc(); 7444 7445 if (AOTCodeCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) { 7446 return start; 7447 } 7448 const Register x = r0; 7449 const Register xlen = r1; 7450 const Register z = r2; 7451 const Register y = r4; // == x 7452 const Register ylen = r5; // == xlen 7453 7454 const Register tmp0 = r3; 7455 const Register tmp1 = r10; 7456 const Register tmp2 = r11; 7457 const Register tmp3 = r12; 7458 const Register tmp4 = r13; 7459 const Register tmp5 = r14; 7460 const Register tmp6 = r15; 7461 const Register tmp7 = r16; 7462 7463 RegSet spilled_regs = RegSet::of(y, ylen); 7464 BLOCK_COMMENT("Entry:"); 7465 __ enter(); 7466 __ push(spilled_regs, sp); 7467 __ mov(y, x); 7468 __ mov(ylen, xlen); 7469 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7470 __ pop(spilled_regs, sp); 7471 __ leave(); 7472 __ ret(lr); 7473 7474 AOTCodeCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start); 7475 return start; 7476 } 7477 7478 address generate_mulAdd() { 7479 __ align(CodeEntryAlignment); 7480 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 7481 StubCodeMark mark(this, stub_id); 7482 7483 address start = __ pc(); 7484 7485 if (AOTCodeCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) { 7486 return start; 7487 } 7488 const Register out = r0; 7489 const Register in = r1; 7490 const Register offset = r2; 7491 const 
Register len = r3; 7492 const Register k = r4; 7493 7494 BLOCK_COMMENT("Entry:"); 7495 __ enter(); 7496 __ mul_add(out, in, offset, len, k); 7497 __ leave(); 7498 __ ret(lr); 7499 7500 AOTCodeCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start); 7501 return start; 7502 } 7503 7504 // Arguments: 7505 // 7506 // Input: 7507 // c_rarg0 - newArr address 7508 // c_rarg1 - oldArr address 7509 // c_rarg2 - newIdx 7510 // c_rarg3 - shiftCount 7511 // c_rarg4 - numIter 7512 // 7513 address generate_bigIntegerRightShift() { 7514 __ align(CodeEntryAlignment); 7515 StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id; 7516 StubCodeMark mark(this, stub_id); 7517 address start = __ pc(); 7518 7519 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7520 7521 Register newArr = c_rarg0; 7522 Register oldArr = c_rarg1; 7523 Register newIdx = c_rarg2; 7524 Register shiftCount = c_rarg3; 7525 Register numIter = c_rarg4; 7526 Register idx = numIter; 7527 7528 Register newArrCur = rscratch1; 7529 Register shiftRevCount = rscratch2; 7530 Register oldArrCur = r13; 7531 Register oldArrNext = r14; 7532 7533 FloatRegister oldElem0 = v0; 7534 FloatRegister oldElem1 = v1; 7535 FloatRegister newElem = v2; 7536 FloatRegister shiftVCount = v3; 7537 FloatRegister shiftVRevCount = v4; 7538 7539 __ cbz(idx, Exit); 7540 7541 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7542 7543 // left shift count 7544 __ movw(shiftRevCount, 32); 7545 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7546 7547 // numIter too small to allow a 4-words SIMD loop, rolling back 7548 __ cmp(numIter, (u1)4); 7549 __ br(Assembler::LT, ShiftThree); 7550 7551 __ dup(shiftVCount, __ T4S, shiftCount); 7552 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7553 __ negr(shiftVCount, __ T4S, shiftVCount); 7554 7555 __ BIND(ShiftSIMDLoop); 7556 7557 // Calculate the load addresses 7558 __ sub(idx, idx, 4); 7559 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7560 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7561 __ add(oldArrCur, oldArrNext, 4); 7562 7563 // Load 4 words and process 7564 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 7565 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 7566 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7567 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7568 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7569 __ st1(newElem, __ T4S, Address(newArrCur)); 7570 7571 __ cmp(idx, (u1)4); 7572 __ br(Assembler::LT, ShiftTwoLoop); 7573 __ b(ShiftSIMDLoop); 7574 7575 __ BIND(ShiftTwoLoop); 7576 __ cbz(idx, Exit); 7577 __ cmp(idx, (u1)1); 7578 __ br(Assembler::EQ, ShiftOne); 7579 7580 // Calculate the load addresses 7581 __ sub(idx, idx, 2); 7582 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7583 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7584 __ add(oldArrCur, oldArrNext, 4); 7585 7586 // Load 2 words and process 7587 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 7588 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 7589 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7590 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7591 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7592 __ st1(newElem, __ T2S, Address(newArrCur)); 7593 __ b(ShiftTwoLoop); 7594 7595 __ BIND(ShiftThree); 7596 __ tbz(idx, 1, ShiftOne); 7597 __ tbz(idx, 0, ShiftTwo); 7598 __ ldrw(r10, Address(oldArr, 12)); 7599 __ ldrw(r11, Address(oldArr, 8)); 7600 __ lsrvw(r10, r10, shiftCount); 7601 __ lslvw(r11, r11, shiftRevCount); 7602 __ orrw(r12, r10, r11); 7603 __ strw(r12, 
Address(newArr, 8)); 7604 7605 __ BIND(ShiftTwo); 7606 __ ldrw(r10, Address(oldArr, 8)); 7607 __ ldrw(r11, Address(oldArr, 4)); 7608 __ lsrvw(r10, r10, shiftCount); 7609 __ lslvw(r11, r11, shiftRevCount); 7610 __ orrw(r12, r10, r11); 7611 __ strw(r12, Address(newArr, 4)); 7612 7613 __ BIND(ShiftOne); 7614 __ ldrw(r10, Address(oldArr, 4)); 7615 __ ldrw(r11, Address(oldArr)); 7616 __ lsrvw(r10, r10, shiftCount); 7617 __ lslvw(r11, r11, shiftRevCount); 7618 __ orrw(r12, r10, r11); 7619 __ strw(r12, Address(newArr)); 7620 7621 __ BIND(Exit); 7622 __ ret(lr); 7623 7624 return start; 7625 } 7626 7627 // Arguments: 7628 // 7629 // Input: 7630 // c_rarg0 - newArr address 7631 // c_rarg1 - oldArr address 7632 // c_rarg2 - newIdx 7633 // c_rarg3 - shiftCount 7634 // c_rarg4 - numIter 7635 // 7636 address generate_bigIntegerLeftShift() { 7637 __ align(CodeEntryAlignment); 7638 StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id; 7639 StubCodeMark mark(this, stub_id); 7640 address start = __ pc(); 7641 7642 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7643 7644 Register newArr = c_rarg0; 7645 Register oldArr = c_rarg1; 7646 Register newIdx = c_rarg2; 7647 Register shiftCount = c_rarg3; 7648 Register numIter = c_rarg4; 7649 7650 Register shiftRevCount = rscratch1; 7651 Register oldArrNext = rscratch2; 7652 7653 FloatRegister oldElem0 = v0; 7654 FloatRegister oldElem1 = v1; 7655 FloatRegister newElem = v2; 7656 FloatRegister shiftVCount = v3; 7657 FloatRegister shiftVRevCount = v4; 7658 7659 __ cbz(numIter, Exit); 7660 7661 __ add(oldArrNext, oldArr, 4); 7662 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7663 7664 // right shift count 7665 __ movw(shiftRevCount, 32); 7666 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7667 7668 // numIter too small to allow a 4-words SIMD loop, rolling back 7669 __ cmp(numIter, (u1)4); 7670 __ br(Assembler::LT, ShiftThree); 7671 7672 __ dup(shiftVCount, __ T4S, shiftCount); 7673 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7674 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 7675 7676 __ BIND(ShiftSIMDLoop); 7677 7678 // load 4 words and process 7679 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 7680 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 7681 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7682 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7683 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7684 __ st1(newElem, __ T4S, __ post(newArr, 16)); 7685 __ sub(numIter, numIter, 4); 7686 7687 __ cmp(numIter, (u1)4); 7688 __ br(Assembler::LT, ShiftTwoLoop); 7689 __ b(ShiftSIMDLoop); 7690 7691 __ BIND(ShiftTwoLoop); 7692 __ cbz(numIter, Exit); 7693 __ cmp(numIter, (u1)1); 7694 __ br(Assembler::EQ, ShiftOne); 7695 7696 // load 2 words and process 7697 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 7698 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 7699 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7700 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7701 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7702 __ st1(newElem, __ T2S, __ post(newArr, 8)); 7703 __ sub(numIter, numIter, 2); 7704 __ b(ShiftTwoLoop); 7705 7706 __ BIND(ShiftThree); 7707 __ ldrw(r10, __ post(oldArr, 4)); 7708 __ ldrw(r11, __ post(oldArrNext, 4)); 7709 __ lslvw(r10, r10, shiftCount); 7710 __ lsrvw(r11, r11, shiftRevCount); 7711 __ orrw(r12, r10, r11); 7712 __ strw(r12, __ post(newArr, 4)); 7713 __ tbz(numIter, 1, Exit); 7714 __ tbz(numIter, 0, ShiftOne); 7715 7716 __ BIND(ShiftTwo); 7717 __ ldrw(r10, __ post(oldArr, 4)); 
7718 __ ldrw(r11, __ post(oldArrNext, 4)); 7719 __ lslvw(r10, r10, shiftCount); 7720 __ lsrvw(r11, r11, shiftRevCount); 7721 __ orrw(r12, r10, r11); 7722 __ strw(r12, __ post(newArr, 4)); 7723 7724 __ BIND(ShiftOne); 7725 __ ldrw(r10, Address(oldArr)); 7726 __ ldrw(r11, Address(oldArrNext)); 7727 __ lslvw(r10, r10, shiftCount); 7728 __ lsrvw(r11, r11, shiftRevCount); 7729 __ orrw(r12, r10, r11); 7730 __ strw(r12, Address(newArr)); 7731 7732 __ BIND(Exit); 7733 __ ret(lr); 7734 7735 return start; 7736 } 7737 7738 address generate_count_positives(address &count_positives_long) { 7739 const u1 large_loop_size = 64; 7740 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 7741 int dcache_line = VM_Version::dcache_line_size(); 7742 7743 Register ary1 = r1, len = r2, result = r0; 7744 7745 __ align(CodeEntryAlignment); 7746 7747 StubGenStubId stub_id = StubGenStubId::count_positives_id; 7748 StubCodeMark mark(this, stub_id); 7749 7750 address entry = __ pc(); 7751 7752 __ enter(); 7753 // precondition: a copy of len is already in result 7754 // __ mov(result, len); 7755 7756 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 7757 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 7758 7759 __ cmp(len, (u1)15); 7760 __ br(Assembler::GT, LEN_OVER_15); 7761 // The only case when execution falls into this code is when pointer is near 7762 // the end of memory page and we have to avoid reading next page 7763 __ add(ary1, ary1, len); 7764 __ subs(len, len, 8); 7765 __ br(Assembler::GT, LEN_OVER_8); 7766 __ ldr(rscratch2, Address(ary1, -8)); 7767 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 7768 __ lsrv(rscratch2, rscratch2, rscratch1); 7769 __ tst(rscratch2, UPPER_BIT_MASK); 7770 __ csel(result, zr, result, Assembler::NE); 7771 __ leave(); 7772 __ ret(lr); 7773 __ bind(LEN_OVER_8); 7774 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 7775 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 7776 __ tst(rscratch2, UPPER_BIT_MASK); 7777 __ br(Assembler::NE, RET_NO_POP); 7778 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 7779 __ lsrv(rscratch1, rscratch1, rscratch2); 7780 __ tst(rscratch1, UPPER_BIT_MASK); 7781 __ bind(RET_NO_POP); 7782 __ csel(result, zr, result, Assembler::NE); 7783 __ leave(); 7784 __ ret(lr); 7785 7786 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 7787 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 7788 7789 count_positives_long = __ pc(); // 2nd entry point 7790 7791 __ enter(); 7792 7793 __ bind(LEN_OVER_15); 7794 __ push(spilled_regs, sp); 7795 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 7796 __ cbz(rscratch2, ALIGNED); 7797 __ ldp(tmp6, tmp1, Address(ary1)); 7798 __ mov(tmp5, 16); 7799 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 7800 __ add(ary1, ary1, rscratch1); 7801 __ orr(tmp6, tmp6, tmp1); 7802 __ tst(tmp6, UPPER_BIT_MASK); 7803 __ br(Assembler::NE, RET_ADJUST); 7804 __ sub(len, len, rscratch1); 7805 7806 __ bind(ALIGNED); 7807 __ cmp(len, large_loop_size); 7808 __ br(Assembler::LT, CHECK_16); 7809 // Perform 16-byte load as early return in pre-loop to handle situation 7810 // when initially aligned large array has negative values at starting bytes, 7811 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 7812 // slower. Cases with negative bytes further ahead won't be affected that 7813 // much. 
In fact, it'll be faster due to early loads, less instructions and 7814 // less branches in LARGE_LOOP. 7815 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 7816 __ sub(len, len, 16); 7817 __ orr(tmp6, tmp6, tmp1); 7818 __ tst(tmp6, UPPER_BIT_MASK); 7819 __ br(Assembler::NE, RET_ADJUST_16); 7820 __ cmp(len, large_loop_size); 7821 __ br(Assembler::LT, CHECK_16); 7822 7823 if (SoftwarePrefetchHintDistance >= 0 7824 && SoftwarePrefetchHintDistance >= dcache_line) { 7825 // initial prefetch 7826 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 7827 } 7828 __ bind(LARGE_LOOP); 7829 if (SoftwarePrefetchHintDistance >= 0) { 7830 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 7831 } 7832 // Issue load instructions first, since it can save few CPU/MEM cycles, also 7833 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 7834 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 7835 // instructions per cycle and have less branches, but this approach disables 7836 // early return, thus, all 64 bytes are loaded and checked every time. 7837 __ ldp(tmp2, tmp3, Address(ary1)); 7838 __ ldp(tmp4, tmp5, Address(ary1, 16)); 7839 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 7840 __ ldp(tmp6, tmp1, Address(ary1, 48)); 7841 __ add(ary1, ary1, large_loop_size); 7842 __ sub(len, len, large_loop_size); 7843 __ orr(tmp2, tmp2, tmp3); 7844 __ orr(tmp4, tmp4, tmp5); 7845 __ orr(rscratch1, rscratch1, rscratch2); 7846 __ orr(tmp6, tmp6, tmp1); 7847 __ orr(tmp2, tmp2, tmp4); 7848 __ orr(rscratch1, rscratch1, tmp6); 7849 __ orr(tmp2, tmp2, rscratch1); 7850 __ tst(tmp2, UPPER_BIT_MASK); 7851 __ br(Assembler::NE, RET_ADJUST_LONG); 7852 __ cmp(len, large_loop_size); 7853 __ br(Assembler::GE, LARGE_LOOP); 7854 7855 __ bind(CHECK_16); // small 16-byte load pre-loop 7856 __ cmp(len, (u1)16); 7857 __ br(Assembler::LT, POST_LOOP16); 7858 7859 __ bind(LOOP16); // small 16-byte load loop 7860 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 7861 __ sub(len, len, 16); 7862 __ orr(tmp2, tmp2, tmp3); 7863 __ tst(tmp2, UPPER_BIT_MASK); 7864 __ br(Assembler::NE, RET_ADJUST_16); 7865 __ cmp(len, (u1)16); 7866 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 7867 7868 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 7869 __ cmp(len, (u1)8); 7870 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 7871 __ ldr(tmp3, Address(__ post(ary1, 8))); 7872 __ tst(tmp3, UPPER_BIT_MASK); 7873 __ br(Assembler::NE, RET_ADJUST); 7874 __ sub(len, len, 8); 7875 7876 __ bind(POST_LOOP16_LOAD_TAIL); 7877 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 7878 __ ldr(tmp1, Address(ary1)); 7879 __ mov(tmp2, 64); 7880 __ sub(tmp4, tmp2, len, __ LSL, 3); 7881 __ lslv(tmp1, tmp1, tmp4); 7882 __ tst(tmp1, UPPER_BIT_MASK); 7883 __ br(Assembler::NE, RET_ADJUST); 7884 // Fallthrough 7885 7886 __ bind(RET_LEN); 7887 __ pop(spilled_regs, sp); 7888 __ leave(); 7889 __ ret(lr); 7890 7891 // difference result - len is the count of guaranteed to be 7892 // positive bytes 7893 7894 __ bind(RET_ADJUST_LONG); 7895 __ add(len, len, (u1)(large_loop_size - 16)); 7896 __ bind(RET_ADJUST_16); 7897 __ add(len, len, 16); 7898 __ bind(RET_ADJUST); 7899 __ pop(spilled_regs, sp); 7900 __ leave(); 7901 __ sub(result, result, len); 7902 __ ret(lr); 7903 7904 return entry; 7905 } 7906 7907 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 7908 bool usePrefetch, Label &NOT_EQUAL) { 7909 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 7910 tmp2 = 
rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 7911 tmp7 = r12, tmp8 = r13; 7912 Label LOOP; 7913 7914 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 7915 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 7916 __ bind(LOOP); 7917 if (usePrefetch) { 7918 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 7919 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 7920 } 7921 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 7922 __ eor(tmp1, tmp1, tmp2); 7923 __ eor(tmp3, tmp3, tmp4); 7924 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 7925 __ orr(tmp1, tmp1, tmp3); 7926 __ cbnz(tmp1, NOT_EQUAL); 7927 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 7928 __ eor(tmp5, tmp5, tmp6); 7929 __ eor(tmp7, tmp7, tmp8); 7930 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 7931 __ orr(tmp5, tmp5, tmp7); 7932 __ cbnz(tmp5, NOT_EQUAL); 7933 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 7934 __ eor(tmp1, tmp1, tmp2); 7935 __ eor(tmp3, tmp3, tmp4); 7936 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 7937 __ orr(tmp1, tmp1, tmp3); 7938 __ cbnz(tmp1, NOT_EQUAL); 7939 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 7940 __ eor(tmp5, tmp5, tmp6); 7941 __ sub(cnt1, cnt1, 8 * wordSize); 7942 __ eor(tmp7, tmp7, tmp8); 7943 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 7944 // tmp6 is not used. MacroAssembler::subs is used here (rather than 7945 // cmp) because subs allows an unlimited range of immediate operand. 7946 __ subs(tmp6, cnt1, loopThreshold); 7947 __ orr(tmp5, tmp5, tmp7); 7948 __ cbnz(tmp5, NOT_EQUAL); 7949 __ br(__ GE, LOOP); 7950 // post-loop 7951 __ eor(tmp1, tmp1, tmp2); 7952 __ eor(tmp3, tmp3, tmp4); 7953 __ orr(tmp1, tmp1, tmp3); 7954 __ sub(cnt1, cnt1, 2 * wordSize); 7955 __ cbnz(tmp1, NOT_EQUAL); 7956 } 7957 7958 void generate_large_array_equals_loop_simd(int loopThreshold, 7959 bool usePrefetch, Label &NOT_EQUAL) { 7960 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 7961 tmp2 = rscratch2; 7962 Label LOOP; 7963 7964 __ bind(LOOP); 7965 if (usePrefetch) { 7966 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 7967 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 7968 } 7969 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 7970 __ sub(cnt1, cnt1, 8 * wordSize); 7971 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 7972 __ subs(tmp1, cnt1, loopThreshold); 7973 __ eor(v0, __ T16B, v0, v4); 7974 __ eor(v1, __ T16B, v1, v5); 7975 __ eor(v2, __ T16B, v2, v6); 7976 __ eor(v3, __ T16B, v3, v7); 7977 __ orr(v0, __ T16B, v0, v1); 7978 __ orr(v1, __ T16B, v2, v3); 7979 __ orr(v0, __ T16B, v0, v1); 7980 __ umov(tmp1, v0, __ D, 0); 7981 __ umov(tmp2, v0, __ D, 1); 7982 __ orr(tmp1, tmp1, tmp2); 7983 __ cbnz(tmp1, NOT_EQUAL); 7984 __ br(__ GE, LOOP); 7985 } 7986 7987 // a1 = r1 - array1 address 7988 // a2 = r2 - array2 address 7989 // result = r0 - return value. Already contains "false" 7990 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 7991 // r3-r5 are reserved temporary registers 7992 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 7993 address generate_large_array_equals() { 7994 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 7995 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 7996 tmp7 = r12, tmp8 = r13; 7997 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 7998 SMALL_LOOP, POST_LOOP; 7999 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 8000 // calculate if at least 32 prefetched bytes are used 8001 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 8002 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 8003 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 8004 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 8005 tmp5, tmp6, tmp7, tmp8); 8006 8007 __ align(CodeEntryAlignment); 8008 8009 StubGenStubId stub_id = StubGenStubId::large_array_equals_id; 8010 StubCodeMark mark(this, stub_id); 8011 8012 address entry = __ pc(); 8013 __ enter(); 8014 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 8015 // also advance pointers to use post-increment instead of pre-increment 8016 __ add(a1, a1, wordSize); 8017 __ add(a2, a2, wordSize); 8018 if (AvoidUnalignedAccesses) { 8019 // both implementations (SIMD/nonSIMD) are using relatively large load 8020 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 8021 // on some CPUs in case of address is not at least 16-byte aligned. 8022 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 8023 // load if needed at least for 1st address and make if 16-byte aligned. 8024 Label ALIGNED16; 8025 __ tbz(a1, 3, ALIGNED16); 8026 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8027 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8028 __ sub(cnt1, cnt1, wordSize); 8029 __ eor(tmp1, tmp1, tmp2); 8030 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 8031 __ bind(ALIGNED16); 8032 } 8033 if (UseSIMDForArrayEquals) { 8034 if (SoftwarePrefetchHintDistance >= 0) { 8035 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8036 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8037 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 8038 /* prfm = */ true, NOT_EQUAL); 8039 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8040 __ br(__ LT, TAIL); 8041 } 8042 __ bind(NO_PREFETCH_LARGE_LOOP); 8043 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 8044 /* prfm = */ false, NOT_EQUAL); 8045 } else { 8046 __ push(spilled_regs, sp); 8047 if (SoftwarePrefetchHintDistance >= 0) { 8048 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8049 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8050 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 8051 /* prfm = */ true, NOT_EQUAL); 8052 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8053 __ br(__ LT, TAIL); 8054 } 8055 __ bind(NO_PREFETCH_LARGE_LOOP); 8056 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 8057 /* prfm = */ false, NOT_EQUAL); 8058 } 8059 __ bind(TAIL); 8060 __ cbz(cnt1, EQUAL); 8061 __ subs(cnt1, cnt1, wordSize); 8062 __ br(__ LE, POST_LOOP); 8063 __ bind(SMALL_LOOP); 8064 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8065 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8066 __ subs(cnt1, cnt1, wordSize); 8067 __ eor(tmp1, tmp1, tmp2); 8068 __ cbnz(tmp1, NOT_EQUAL); 8069 __ br(__ GT, SMALL_LOOP); 8070 __ bind(POST_LOOP); 8071 __ ldr(tmp1, Address(a1, cnt1)); 8072 __ ldr(tmp2, Address(a2, cnt1)); 8073 __ eor(tmp1, tmp1, tmp2); 8074 __ cbnz(tmp1, NOT_EQUAL); 8075 __ bind(EQUAL); 8076 __ mov(result, true); 8077 __ bind(NOT_EQUAL); 8078 if (!UseSIMDForArrayEquals) { 8079 __ pop(spilled_regs, sp); 8080 } 8081 __ bind(NOT_EQUAL_NO_POP); 8082 __ leave(); 8083 __ ret(lr); 8084 return entry; 8085 } 8086 8087 // result = r0 - return value. Contains initial hashcode value on entry. 
8088 // ary = r1 - array address 8089 // cnt = r2 - elements count 8090 // Clobbers: v0-v13, rscratch1, rscratch2 8091 address generate_large_arrays_hashcode(BasicType eltype) { 8092 const Register result = r0, ary = r1, cnt = r2; 8093 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 8094 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 8095 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 8096 const FloatRegister vpowm = v13; 8097 8098 ARRAYS_HASHCODE_REGISTERS; 8099 8100 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 8101 8102 unsigned int vf; // vectorization factor 8103 bool multiply_by_halves; 8104 Assembler::SIMD_Arrangement load_arrangement; 8105 switch (eltype) { 8106 case T_BOOLEAN: 8107 case T_BYTE: 8108 load_arrangement = Assembler::T8B; 8109 multiply_by_halves = true; 8110 vf = 8; 8111 break; 8112 case T_CHAR: 8113 case T_SHORT: 8114 load_arrangement = Assembler::T8H; 8115 multiply_by_halves = true; 8116 vf = 8; 8117 break; 8118 case T_INT: 8119 load_arrangement = Assembler::T4S; 8120 multiply_by_halves = false; 8121 vf = 4; 8122 break; 8123 default: 8124 ShouldNotReachHere(); 8125 } 8126 8127 // Unroll factor 8128 const unsigned uf = 4; 8129 8130 // Effective vectorization factor 8131 const unsigned evf = vf * uf; 8132 8133 __ align(CodeEntryAlignment); 8134 8135 StubGenStubId stub_id; 8136 switch (eltype) { 8137 case T_BOOLEAN: 8138 stub_id = StubGenStubId::large_arrays_hashcode_boolean_id; 8139 break; 8140 case T_BYTE: 8141 stub_id = StubGenStubId::large_arrays_hashcode_byte_id; 8142 break; 8143 case T_CHAR: 8144 stub_id = StubGenStubId::large_arrays_hashcode_char_id; 8145 break; 8146 case T_SHORT: 8147 stub_id = StubGenStubId::large_arrays_hashcode_short_id; 8148 break; 8149 case T_INT: 8150 stub_id = StubGenStubId::large_arrays_hashcode_int_id; 8151 break; 8152 default: 8153 stub_id = StubGenStubId::NO_STUBID; 8154 ShouldNotReachHere(); 8155 }; 8156 8157 StubCodeMark mark(this, stub_id); 8158 8159 address entry = __ pc(); 8160 __ enter(); 8161 8162 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 8163 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 8164 // value shouldn't change throughout both loops. 8165 __ movw(rscratch1, intpow(31U, 3)); 8166 __ mov(vpow, Assembler::S, 0, rscratch1); 8167 __ movw(rscratch1, intpow(31U, 2)); 8168 __ mov(vpow, Assembler::S, 1, rscratch1); 8169 __ movw(rscratch1, intpow(31U, 1)); 8170 __ mov(vpow, Assembler::S, 2, rscratch1); 8171 __ movw(rscratch1, intpow(31U, 0)); 8172 __ mov(vpow, Assembler::S, 3, rscratch1); 8173 8174 __ mov(vmul0, Assembler::T16B, 0); 8175 __ mov(vmul0, Assembler::S, 3, result); 8176 8177 __ andr(rscratch2, cnt, (uf - 1) * vf); 8178 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 8179 8180 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf));
    __ mov(vpowm, Assembler::S, 0, rscratch1);

    // SMALL LOOP
    __ bind(SMALL_LOOP);

    __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
    __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
    __ subsw(rscratch2, rscratch2, vf);

    if (load_arrangement == Assembler::T8B) {
      // Extend 8B to 8H to be able to use vector multiply
      // instructions
      assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
      if (is_signed_subword_type(eltype)) {
        __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      } else {
        __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      }
    }

    switch (load_arrangement) {
    case Assembler::T4S:
      __ addv(vmul0, load_arrangement, vmul0, vdata0);
      break;
    case Assembler::T8B:
    case Assembler::T8H:
      assert(is_subword_type(eltype), "subword type expected");
      if (is_signed_subword_type(eltype)) {
        __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      } else {
        __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      }
      break;
    default:
      __ should_not_reach_here();
    }

    // Process the upper half of a vector
    if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
      __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
      if (is_signed_subword_type(eltype)) {
        __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      } else {
        __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      }
    }

    __ br(Assembler::HI, SMALL_LOOP);

    // SMALL LOOP'S EPILOGUE
    __ lsr(rscratch2, cnt, exact_log2(evf));
    __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);

    __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
    __ addv(vmul0, Assembler::T4S, vmul0);
    __ umov(result, vmul0, Assembler::S, 0);

    // TAIL
    __ bind(TAIL);

    // The andr performs cnt % vf. The subtract shifted by 3 offsets past
    // vf - 1 - (cnt % vf) pairs of load + madd insns, i.e. it only executes
    // cnt % vf load + madd pairs.
    assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
    __ andr(rscratch2, cnt, vf - 1);
    __ bind(TAIL_SHORTCUT);
    __ adr(rscratch1, BR_BASE);
    // For Cortex-A53 the offset is 4 because 2 nops are generated.
    __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
    __ movw(rscratch2, 0x1f);
    __ br(rscratch1);

    for (size_t i = 0; i < vf - 1; ++i) {
      __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
              eltype);
      __ maddw(result, result, rscratch2, rscratch1);
      // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
      // Generate a 2nd nop to have 4 instructions per iteration.
8258 if (VM_Version::supports_a53mac()) { 8259 __ nop(); 8260 } 8261 } 8262 __ bind(BR_BASE); 8263 8264 __ leave(); 8265 __ ret(lr); 8266 8267 // LARGE LOOP 8268 __ bind(LARGE_LOOP_PREHEADER); 8269 8270 __ lsr(rscratch2, cnt, exact_log2(evf)); 8271 8272 if (multiply_by_halves) { 8273 // 31^4 - multiplier between lower and upper parts of a register 8274 __ movw(rscratch1, intpow(31U, vf / 2)); 8275 __ mov(vpowm, Assembler::S, 1, rscratch1); 8276 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 8277 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 8278 __ mov(vpowm, Assembler::S, 0, rscratch1); 8279 } else { 8280 // 31^16 8281 __ movw(rscratch1, intpow(31U, evf)); 8282 __ mov(vpowm, Assembler::S, 0, rscratch1); 8283 } 8284 8285 __ mov(vmul3, Assembler::T16B, 0); 8286 __ mov(vmul2, Assembler::T16B, 0); 8287 __ mov(vmul1, Assembler::T16B, 0); 8288 8289 __ bind(LARGE_LOOP); 8290 8291 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 8292 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 8293 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 8294 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8295 8296 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 8297 Address(__ post(ary, evf * type2aelembytes(eltype)))); 8298 8299 if (load_arrangement == Assembler::T8B) { 8300 // Extend 8B to 8H to be able to use vector multiply 8301 // instructions 8302 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8303 if (is_signed_subword_type(eltype)) { 8304 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8305 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8306 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8307 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8308 } else { 8309 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8310 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8311 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8312 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8313 } 8314 } 8315 8316 switch (load_arrangement) { 8317 case Assembler::T4S: 8318 __ addv(vmul3, load_arrangement, vmul3, vdata3); 8319 __ addv(vmul2, load_arrangement, vmul2, vdata2); 8320 __ addv(vmul1, load_arrangement, vmul1, vdata1); 8321 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8322 break; 8323 case Assembler::T8B: 8324 case Assembler::T8H: 8325 assert(is_subword_type(eltype), "subword type expected"); 8326 if (is_signed_subword_type(eltype)) { 8327 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8328 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8329 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8330 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8331 } else { 8332 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8333 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8334 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8335 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8336 } 8337 break; 8338 default: 8339 __ should_not_reach_here(); 8340 } 8341 8342 // Process the upper half of a vector 8343 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8344 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 8345 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 8346 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 8347 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 8348 if (is_signed_subword_type(eltype)) { 8349 __ saddwv2(vmul3, 
vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8350 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8351 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8352 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8353 } else { 8354 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8355 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8356 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8357 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8358 } 8359 } 8360 8361 __ subsw(rscratch2, rscratch2, 1); 8362 __ br(Assembler::HI, LARGE_LOOP); 8363 8364 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 8365 __ addv(vmul3, Assembler::T4S, vmul3); 8366 __ umov(result, vmul3, Assembler::S, 0); 8367 8368 __ mov(rscratch2, intpow(31U, vf)); 8369 8370 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 8371 __ addv(vmul2, Assembler::T4S, vmul2); 8372 __ umov(rscratch1, vmul2, Assembler::S, 0); 8373 __ maddw(result, result, rscratch2, rscratch1); 8374 8375 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 8376 __ addv(vmul1, Assembler::T4S, vmul1); 8377 __ umov(rscratch1, vmul1, Assembler::S, 0); 8378 __ maddw(result, result, rscratch2, rscratch1); 8379 8380 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8381 __ addv(vmul0, Assembler::T4S, vmul0); 8382 __ umov(rscratch1, vmul0, Assembler::S, 0); 8383 __ maddw(result, result, rscratch2, rscratch1); 8384 8385 __ andr(rscratch2, cnt, vf - 1); 8386 __ cbnz(rscratch2, TAIL_SHORTCUT); 8387 8388 __ leave(); 8389 __ ret(lr); 8390 8391 return entry; 8392 } 8393 8394 address generate_dsin_dcos(bool isCos) { 8395 __ align(CodeEntryAlignment); 8396 StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id); 8397 StubCodeMark mark(this, stub_id); 8398 address start = __ pc(); 8399 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 8400 (address)StubRoutines::aarch64::_two_over_pi, 8401 (address)StubRoutines::aarch64::_pio2, 8402 (address)StubRoutines::aarch64::_dsin_coef, 8403 (address)StubRoutines::aarch64::_dcos_coef); 8404 return start; 8405 } 8406 8407 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 8408 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 8409 Label &DIFF2) { 8410 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 8411 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 8412 8413 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 8414 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8415 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 8416 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 8417 8418 __ fmovd(tmpL, vtmp3); 8419 __ eor(rscratch2, tmp3, tmpL); 8420 __ cbnz(rscratch2, DIFF2); 8421 8422 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8423 __ umov(tmpL, vtmp3, __ D, 1); 8424 __ eor(rscratch2, tmpU, tmpL); 8425 __ cbnz(rscratch2, DIFF1); 8426 8427 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 8428 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8429 __ fmovd(tmpL, vtmp); 8430 __ eor(rscratch2, tmp3, tmpL); 8431 __ cbnz(rscratch2, DIFF2); 8432 8433 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8434 __ umov(tmpL, vtmp, __ D, 1); 8435 __ eor(rscratch2, tmpU, tmpL); 8436 __ cbnz(rscratch2, DIFF1); 8437 } 8438 8439 // r0 = result 8440 // r1 = str1 8441 // r2 = cnt1 8442 // r3 = str2 8443 // r4 = cnt2 8444 // r10 = tmp1 8445 // r11 = tmp2 8446 address generate_compare_long_string_different_encoding(bool isLU) { 8447 __ align(CodeEntryAlignment); 8448 StubGenStubId stub_id = (isLU ? 
StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id); 8449 StubCodeMark mark(this, stub_id); 8450 address entry = __ pc(); 8451 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 8452 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 8453 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 8454 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8455 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 8456 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 8457 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 8458 8459 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 8460 8461 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 8462 // cnt2 == amount of characters left to compare 8463 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 8464 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8465 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 8466 __ add(str2, str2, isLU ? wordSize : wordSize/2); 8467 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 8468 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 8469 __ eor(rscratch2, tmp1, tmp2); 8470 __ mov(rscratch1, tmp2); 8471 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 8472 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 8473 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 8474 __ push(spilled_regs, sp); 8475 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 8476 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 8477 8478 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8479 8480 if (SoftwarePrefetchHintDistance >= 0) { 8481 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8482 __ br(__ LT, NO_PREFETCH); 8483 __ bind(LARGE_LOOP_PREFETCH); 8484 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 8485 __ mov(tmp4, 2); 8486 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8487 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 8488 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8489 __ subs(tmp4, tmp4, 1); 8490 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 8491 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8492 __ mov(tmp4, 2); 8493 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 8494 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8495 __ subs(tmp4, tmp4, 1); 8496 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 8497 __ sub(cnt2, cnt2, 64); 8498 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8499 __ br(__ GE, LARGE_LOOP_PREFETCH); 8500 } 8501 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 8502 __ bind(NO_PREFETCH); 8503 __ subs(cnt2, cnt2, 16); 8504 __ br(__ LT, TAIL); 8505 __ align(OptoLoopAlignment); 8506 __ bind(SMALL_LOOP); // smaller loop 8507 __ subs(cnt2, cnt2, 16); 8508 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8509 __ br(__ GE, SMALL_LOOP); 8510 __ cmn(cnt2, (u1)16); 8511 __ br(__ EQ, LOAD_LAST); 8512 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 8513 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 8514 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 8515 __ ldr(tmp3, Address(cnt1, -8)); 8516 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 8517 __ b(LOAD_LAST); 8518 __ bind(DIFF2); 8519 __ mov(tmpU, tmp3); 8520 __ bind(DIFF1); 8521 __ pop(spilled_regs, sp); 8522 __ b(CALCULATE_DIFFERENCE); 8523 __ bind(LOAD_LAST); 8524 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by 
compare_string_16_x_LU. 8525 // No need to load it again 8526 __ mov(tmpU, tmp3); 8527 __ pop(spilled_regs, sp); 8528 8529 // tmp2 points to the address of the last 4 Latin1 characters right now 8530 __ ldrs(vtmp, Address(tmp2)); 8531 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8532 __ fmovd(tmpL, vtmp); 8533 8534 __ eor(rscratch2, tmpU, tmpL); 8535 __ cbz(rscratch2, DONE); 8536 8537 // Find the first different characters in the longwords and 8538 // compute their difference. 8539 __ bind(CALCULATE_DIFFERENCE); 8540 __ rev(rscratch2, rscratch2); 8541 __ clz(rscratch2, rscratch2); 8542 __ andr(rscratch2, rscratch2, -16); 8543 __ lsrv(tmp1, tmp1, rscratch2); 8544 __ uxthw(tmp1, tmp1); 8545 __ lsrv(rscratch1, rscratch1, rscratch2); 8546 __ uxthw(rscratch1, rscratch1); 8547 __ subw(result, tmp1, rscratch1); 8548 __ bind(DONE); 8549 __ ret(lr); 8550 return entry; 8551 } 8552 8553 // r0 = input (float16) 8554 // v0 = result (float) 8555 // v1 = temporary float register 8556 address generate_float16ToFloat() { 8557 __ align(CodeEntryAlignment); 8558 StubGenStubId stub_id = StubGenStubId::hf2f_id; 8559 StubCodeMark mark(this, stub_id); 8560 address entry = __ pc(); 8561 BLOCK_COMMENT("Entry:"); 8562 __ flt16_to_flt(v0, r0, v1); 8563 __ ret(lr); 8564 return entry; 8565 } 8566 8567 // v0 = input (float) 8568 // r0 = result (float16) 8569 // v1 = temporary float register 8570 address generate_floatToFloat16() { 8571 __ align(CodeEntryAlignment); 8572 StubGenStubId stub_id = StubGenStubId::f2hf_id; 8573 StubCodeMark mark(this, stub_id); 8574 address entry = __ pc(); 8575 BLOCK_COMMENT("Entry:"); 8576 __ flt_to_flt16(r0, v0, v1); 8577 __ ret(lr); 8578 return entry; 8579 } 8580 8581 address generate_method_entry_barrier() { 8582 __ align(CodeEntryAlignment); 8583 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 8584 StubCodeMark mark(this, stub_id); 8585 8586 Label deoptimize_label; 8587 8588 address start = __ pc(); 8589 8590 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 8591 8592 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 8593 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8594 // We can get here despite the nmethod being good, if we have not 8595 // yet applied our cross modification fence (or data fence). 
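      // If so, copy the global patching epoch into the thread-local slot next to the
      // disarmed guard value, then synchronize the instruction stream (isb) and order
      // the following loads (LoadLoad).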
8596 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 8597 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 8598 __ ldrw(rscratch2, rscratch2); 8599 __ strw(rscratch2, thread_epoch_addr); 8600 __ isb(); 8601 __ membar(__ LoadLoad); 8602 } 8603 8604 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 8605 8606 __ enter(); 8607 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 8608 8609 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 8610 8611 __ push_call_clobbered_registers(); 8612 8613 __ mov(c_rarg0, rscratch2); 8614 __ call_VM_leaf 8615 (CAST_FROM_FN_PTR 8616 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 8617 8618 __ reset_last_Java_frame(true); 8619 8620 __ mov(rscratch1, r0); 8621 8622 __ pop_call_clobbered_registers(); 8623 8624 __ cbnz(rscratch1, deoptimize_label); 8625 8626 __ leave(); 8627 __ ret(lr); 8628 8629 __ BIND(deoptimize_label); 8630 8631 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 8632 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 8633 8634 __ mov(sp, rscratch1); 8635 __ br(rscratch2); 8636 8637 return start; 8638 } 8639 8640 // r0 = result 8641 // r1 = str1 8642 // r2 = cnt1 8643 // r3 = str2 8644 // r4 = cnt2 8645 // r10 = tmp1 8646 // r11 = tmp2 8647 address generate_compare_long_string_same_encoding(bool isLL) { 8648 __ align(CodeEntryAlignment); 8649 StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id); 8650 StubCodeMark mark(this, stub_id); 8651 address entry = __ pc(); 8652 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8653 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 8654 8655 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 8656 8657 // exit from large loop when less than 64 bytes left to read or we're about 8658 // to prefetch memory behind array border 8659 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 8660 8661 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 8662 __ eor(rscratch2, tmp1, tmp2); 8663 __ cbnz(rscratch2, CAL_DIFFERENCE); 8664 8665 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 8666 // update pointers, because of previous read 8667 __ add(str1, str1, wordSize); 8668 __ add(str2, str2, wordSize); 8669 if (SoftwarePrefetchHintDistance >= 0) { 8670 __ align(OptoLoopAlignment); 8671 __ bind(LARGE_LOOP_PREFETCH); 8672 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 8673 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 8674 8675 for (int i = 0; i < 4; i++) { 8676 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 8677 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 8678 __ cmp(tmp1, tmp2); 8679 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 8680 __ br(Assembler::NE, DIFF); 8681 } 8682 __ sub(cnt2, cnt2, isLL ? 64 : 32); 8683 __ add(str1, str1, 64); 8684 __ add(str2, str2, 64); 8685 __ subs(rscratch2, cnt2, largeLoopExitCondition); 8686 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 8687 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 8688 } 8689 8690 __ subs(rscratch1, cnt2, isLL ? 
16 : 8); 8691 __ br(Assembler::LE, LESS16); 8692 __ align(OptoLoopAlignment); 8693 __ bind(LOOP_COMPARE16); 8694 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 8695 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 8696 __ cmp(tmp1, tmp2); 8697 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 8698 __ br(Assembler::NE, DIFF); 8699 __ sub(cnt2, cnt2, isLL ? 16 : 8); 8700 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 8701 __ br(Assembler::LT, LESS16); 8702 8703 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 8704 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 8705 __ cmp(tmp1, tmp2); 8706 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 8707 __ br(Assembler::NE, DIFF); 8708 __ sub(cnt2, cnt2, isLL ? 16 : 8); 8709 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 8710 __ br(Assembler::GE, LOOP_COMPARE16); 8711 __ cbz(cnt2, LENGTH_DIFF); 8712 8713 __ bind(LESS16); 8714 // each 8 compare 8715 __ subs(cnt2, cnt2, isLL ? 8 : 4); 8716 __ br(Assembler::LE, LESS8); 8717 __ ldr(tmp1, Address(__ post(str1, 8))); 8718 __ ldr(tmp2, Address(__ post(str2, 8))); 8719 __ eor(rscratch2, tmp1, tmp2); 8720 __ cbnz(rscratch2, CAL_DIFFERENCE); 8721 __ sub(cnt2, cnt2, isLL ? 8 : 4); 8722 8723 __ bind(LESS8); // directly load last 8 bytes 8724 if (!isLL) { 8725 __ add(cnt2, cnt2, cnt2); 8726 } 8727 __ ldr(tmp1, Address(str1, cnt2)); 8728 __ ldr(tmp2, Address(str2, cnt2)); 8729 __ eor(rscratch2, tmp1, tmp2); 8730 __ cbz(rscratch2, LENGTH_DIFF); 8731 __ b(CAL_DIFFERENCE); 8732 8733 __ bind(DIFF); 8734 __ cmp(tmp1, tmp2); 8735 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 8736 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 8737 // reuse rscratch2 register for the result of eor instruction 8738 __ eor(rscratch2, tmp1, tmp2); 8739 8740 __ bind(CAL_DIFFERENCE); 8741 __ rev(rscratch2, rscratch2); 8742 __ clz(rscratch2, rscratch2); 8743 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 8744 __ lsrv(tmp1, tmp1, rscratch2); 8745 __ lsrv(tmp2, tmp2, rscratch2); 8746 if (isLL) { 8747 __ uxtbw(tmp1, tmp1); 8748 __ uxtbw(tmp2, tmp2); 8749 } else { 8750 __ uxthw(tmp1, tmp1); 8751 __ uxthw(tmp2, tmp2); 8752 } 8753 __ subw(result, tmp1, tmp2); 8754 8755 __ bind(LENGTH_DIFF); 8756 __ ret(lr); 8757 return entry; 8758 } 8759 8760 enum string_compare_mode { 8761 LL, 8762 LU, 8763 UL, 8764 UU, 8765 }; 8766 8767 // The following registers are declared in aarch64.ad 8768 // r0 = result 8769 // r1 = str1 8770 // r2 = cnt1 8771 // r3 = str2 8772 // r4 = cnt2 8773 // r10 = tmp1 8774 // r11 = tmp2 8775 // z0 = ztmp1 8776 // z1 = ztmp2 8777 // p0 = pgtmp1 8778 // p1 = pgtmp2 8779 address generate_compare_long_string_sve(string_compare_mode mode) { 8780 StubGenStubId stub_id; 8781 switch (mode) { 8782 case LL: stub_id = StubGenStubId::compare_long_string_LL_id; break; 8783 case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break; 8784 case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break; 8785 case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break; 8786 default: ShouldNotReachHere(); 8787 } 8788 8789 __ align(CodeEntryAlignment); 8790 address entry = __ pc(); 8791 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8792 tmp1 = r10, tmp2 = r11; 8793 8794 Label LOOP, DONE, MISMATCH; 8795 Register vec_len = tmp1; 8796 Register idx = tmp2; 8797 // The minimum of the string lengths has been stored in cnt2. 
8798 Register cnt = cnt2; 8799 FloatRegister ztmp1 = z0, ztmp2 = z1; 8800 PRegister pgtmp1 = p0, pgtmp2 = p1; 8801 8802 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 8803 switch (mode) { \ 8804 case LL: \ 8805 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 8806 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 8807 break; \ 8808 case LU: \ 8809 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 8810 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 8811 break; \ 8812 case UL: \ 8813 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 8814 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 8815 break; \ 8816 case UU: \ 8817 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 8818 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 8819 break; \ 8820 default: \ 8821 ShouldNotReachHere(); \ 8822 } 8823 8824 StubCodeMark mark(this, stub_id); 8825 8826 __ mov(idx, 0); 8827 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 8828 8829 if (mode == LL) { 8830 __ sve_cntb(vec_len); 8831 } else { 8832 __ sve_cnth(vec_len); 8833 } 8834 8835 __ sub(rscratch1, cnt, vec_len); 8836 8837 __ bind(LOOP); 8838 8839 // main loop 8840 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 8841 __ add(idx, idx, vec_len); 8842 // Compare strings. 8843 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 8844 __ br(__ NE, MISMATCH); 8845 __ cmp(idx, rscratch1); 8846 __ br(__ LT, LOOP); 8847 8848 // post loop, last iteration 8849 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 8850 8851 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 8852 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 8853 __ br(__ EQ, DONE); 8854 8855 __ bind(MISMATCH); 8856 8857 // Crop the vector to find its location. 8858 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 8859 // Extract the first different characters of each string. 8860 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 8861 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 8862 8863 // Compute the difference of the first different characters. 
    __ sub(result, rscratch1, rscratch2);

    __ bind(DONE);
    __ ret(lr);
#undef LOAD_PAIR
    return entry;
  }

  void generate_compare_long_strings() {
    if (UseSVE == 0) {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_same_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_same_encoding(false);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_different_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_different_encoding(false);
    } else {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_sve(LL);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_sve(UU);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_sve(LU);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_sve(UL);
    }
  }

  // R0 = result
  // R1 = str2
  // R2 = cnt1
  // R3 = str1
  // R4 = cnt2
  // Clobbers: rscratch1, rscratch2, v0, v1, rflags
  //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  //    length >= 8) in order to skip the initial load (helps on systems with
  //    a single load pipeline)
  // 2) we can use a "fast" algorithm for finding a single character, searching
  //    for the first symbol with fewer branches (1 branch per loaded register
  //    instead of a branch per symbol); this is where constants like
  //    0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it
  //    can be used to search for every 1st-character entry, saving a few loads
  //    compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  //    re-uses/re-initializes/compresses register values, which makes the code
  //    larger and a bit less readable; however, most of the extra operations
  //    are issued during loads or branches, so the penalty is minimal
  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
    StubGenStubId stub_id;
    if (str1_isL) {
      if (str2_isL) {
        stub_id = StubGenStubId::string_indexof_linear_ll_id;
      } else {
        stub_id = StubGenStubId::string_indexof_linear_ul_id;
      }
    } else {
      if (str2_isL) {
        ShouldNotReachHere();
      } else {
        stub_id = StubGenStubId::string_indexof_linear_uu_id;
      }
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();

    int str1_chr_size = str1_isL ? 1 : 2;
    int str2_chr_size = str2_isL ? 1 : 2;
    int str1_chr_shift = str1_isL ? 0 : 1;
    int str2_chr_shift = str2_isL ?
0 : 1; 8938 bool isL = str1_isL && str2_isL; 8939 // parameters 8940 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 8941 // temporary registers 8942 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 8943 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 8944 // redefinitions 8945 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 8946 8947 __ push(spilled_regs, sp); 8948 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 8949 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 8950 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 8951 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 8952 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 8953 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 8954 // Read whole register from str1. It is safe, because length >=8 here 8955 __ ldr(ch1, Address(str1)); 8956 // Read whole register from str2. It is safe, because length >=8 here 8957 __ ldr(ch2, Address(str2)); 8958 __ sub(cnt2, cnt2, cnt1); 8959 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 8960 if (str1_isL != str2_isL) { 8961 __ eor(v0, __ T16B, v0, v0); 8962 } 8963 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 8964 __ mul(first, first, tmp1); 8965 // check if we have less than 1 register to check 8966 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 8967 if (str1_isL != str2_isL) { 8968 __ fmovd(v1, ch1); 8969 } 8970 __ br(__ LE, L_SMALL); 8971 __ eor(ch2, first, ch2); 8972 if (str1_isL != str2_isL) { 8973 __ zip1(v1, __ T16B, v1, v0); 8974 } 8975 __ sub(tmp2, ch2, tmp1); 8976 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8977 __ bics(tmp2, tmp2, ch2); 8978 if (str1_isL != str2_isL) { 8979 __ fmovd(ch1, v1); 8980 } 8981 __ br(__ NE, L_HAS_ZERO); 8982 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 8983 __ add(result, result, wordSize/str2_chr_size); 8984 __ add(str2, str2, wordSize); 8985 __ br(__ LT, L_POST_LOOP); 8986 __ BIND(L_LOOP); 8987 __ ldr(ch2, Address(str2)); 8988 __ eor(ch2, first, ch2); 8989 __ sub(tmp2, ch2, tmp1); 8990 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 8991 __ bics(tmp2, tmp2, ch2); 8992 __ br(__ NE, L_HAS_ZERO); 8993 __ BIND(L_LOOP_PROCEED); 8994 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 8995 __ add(str2, str2, wordSize); 8996 __ add(result, result, wordSize/str2_chr_size); 8997 __ br(__ GE, L_LOOP); 8998 __ BIND(L_POST_LOOP); 8999 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 9000 __ br(__ LE, NOMATCH); 9001 __ ldr(ch2, Address(str2)); 9002 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9003 __ eor(ch2, first, ch2); 9004 __ sub(tmp2, ch2, tmp1); 9005 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9006 __ mov(tmp4, -1); // all bits set 9007 __ b(L_SMALL_PROCEED); 9008 __ align(OptoLoopAlignment); 9009 __ BIND(L_SMALL); 9010 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9011 __ eor(ch2, first, ch2); 9012 if (str1_isL != str2_isL) { 9013 __ zip1(v1, __ T16B, v1, v0); 9014 } 9015 __ sub(tmp2, ch2, tmp1); 9016 __ mov(tmp4, -1); // all bits set 9017 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9018 if (str1_isL != str2_isL) { 9019 __ fmovd(ch1, v1); // move converted 4 symbols 9020 } 9021 __ BIND(L_SMALL_PROCEED); 9022 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
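    // Finish the per-byte match test: keep only candidate bits within the valid
    // region (tmp4 mask), bit-reverse them for clz-based indexing, and bail out to
    // NOMATCH if no candidate first-character match remains.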
9023 __ bic(tmp2, tmp2, ch2); 9024 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 9025 __ rbit(tmp2, tmp2); 9026 __ br(__ EQ, NOMATCH); 9027 __ BIND(L_SMALL_HAS_ZERO_LOOP); 9028 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 9029 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 9030 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 9031 if (str2_isL) { // LL 9032 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9033 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9034 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9035 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9036 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9037 } else { 9038 __ mov(ch2, 0xE); // all bits in byte set except last one 9039 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9040 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9041 __ lslv(tmp2, tmp2, tmp4); 9042 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9043 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9044 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9045 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9046 } 9047 __ cmp(ch1, ch2); 9048 __ mov(tmp4, wordSize/str2_chr_size); 9049 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9050 __ BIND(L_SMALL_CMP_LOOP); 9051 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9052 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9053 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9054 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9055 __ add(tmp4, tmp4, 1); 9056 __ cmp(tmp4, cnt1); 9057 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 9058 __ cmp(first, ch2); 9059 __ br(__ EQ, L_SMALL_CMP_LOOP); 9060 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 9061 __ cbz(tmp2, NOMATCH); // no more matches. exit 9062 __ clz(tmp4, tmp2); 9063 __ add(result, result, 1); // advance index 9064 __ add(str2, str2, str2_chr_size); // advance pointer 9065 __ b(L_SMALL_HAS_ZERO_LOOP); 9066 __ align(OptoLoopAlignment); 9067 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 9068 __ cmp(first, ch2); 9069 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9070 __ b(DONE); 9071 __ align(OptoLoopAlignment); 9072 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 9073 if (str2_isL) { // LL 9074 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9075 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9076 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9077 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9078 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9079 } else { 9080 __ mov(ch2, 0xE); // all bits in byte set except last one 9081 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9082 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9083 __ lslv(tmp2, tmp2, tmp4); 9084 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9085 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9086 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9087 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9088 } 9089 __ cmp(ch1, ch2); 9090 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9091 __ b(DONE); 9092 __ align(OptoLoopAlignment); 9093 __ BIND(L_HAS_ZERO); 9094 __ rbit(tmp2, tmp2); 9095 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 9096 // Now, perform compression of counters(cnt2 and cnt1) into one register. 9097 // It's fine because both counters are 32bit and are not changed in this 9098 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 9099 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 9100 __ sub(result, result, 1); 9101 __ BIND(L_HAS_ZERO_LOOP); 9102 __ mov(cnt1, wordSize/str2_chr_size); 9103 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9104 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 9105 if (str2_isL) { 9106 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9107 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9108 __ lslv(tmp2, tmp2, tmp4); 9109 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9110 __ add(tmp4, tmp4, 1); 9111 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9112 __ lsl(tmp2, tmp2, 1); 9113 __ mov(tmp4, wordSize/str2_chr_size); 9114 } else { 9115 __ mov(ch2, 0xE); 9116 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9117 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9118 __ lslv(tmp2, tmp2, tmp4); 9119 __ add(tmp4, tmp4, 1); 9120 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9121 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9122 __ lsl(tmp2, tmp2, 1); 9123 __ mov(tmp4, wordSize/str2_chr_size); 9124 __ sub(str2, str2, str2_chr_size); 9125 } 9126 __ cmp(ch1, ch2); 9127 __ mov(tmp4, wordSize/str2_chr_size); 9128 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9129 __ BIND(L_CMP_LOOP); 9130 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9131 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9132 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9133 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9134 __ add(tmp4, tmp4, 1); 9135 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9136 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 9137 __ cmp(cnt1, ch2); 9138 __ br(__ EQ, L_CMP_LOOP); 9139 __ BIND(L_CMP_LOOP_NOMATCH); 9140 // here we're not matched 9141 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 9142 __ clz(tmp4, tmp2); 9143 __ add(str2, str2, str2_chr_size); // advance pointer 9144 __ b(L_HAS_ZERO_LOOP); 9145 __ align(OptoLoopAlignment); 9146 __ BIND(L_CMP_LOOP_LAST_CMP); 9147 __ cmp(cnt1, ch2); 9148 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9149 __ b(DONE); 9150 __ align(OptoLoopAlignment); 9151 __ BIND(L_CMP_LOOP_LAST_CMP2); 9152 if (str2_isL) { 9153 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9154 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9155 __ lslv(tmp2, tmp2, tmp4); 9156 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9157 __ add(tmp4, tmp4, 1); 9158 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9159 __ lsl(tmp2, tmp2, 1); 9160 } else { 9161 __ mov(ch2, 0xE); 9162 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9163 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
      __ lslv(tmp2, tmp2, tmp4);
      __ add(tmp4, tmp4, 1);
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
      __ lsl(tmp2, tmp2, 1);
      __ sub(str2, str2, str2_chr_size);
    }
    __ cmp(ch1, ch2);
    __ br(__ NE, L_CMP_LOOP_NOMATCH);
    __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
    // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
    //    until the L_HAS_ZERO block. The byte octet was analyzed in
    //    L_HAS_ZERO_LOOP, so result was increased by at most
    //    wordSize/str2_chr_size - 1 and the respective high bits weren't
    //    changed. L_LOOP_PROCEED will increase result by the number of
    //    analyzed characters, so we can just reset the lower bits of result
    //    here. Clear the 2 lower bits for UU/UL and 3 bits for LL.
    // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
    // 3) Advance str2 to the next str2 octet. result & 7/3 is the index of the
    //    last analyzed substring inside the current octet, so str2 is at the
    //    respective start address. We need to advance it to the next octet.
    __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
    __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
    __ bfm(result, zr, 0, 2 - str2_chr_shift);
    __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
    __ movw(cnt2, cnt2);
    __ b(L_LOOP_PROCEED);
    __ align(OptoLoopAlignment);
    __ BIND(NOMATCH);
    __ mov(result, -1);
    __ BIND(DONE);
    __ pop(spilled_regs, sp);
    __ ret(lr);
    return entry;
  }

  void generate_string_indexof_stubs() {
    StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
    StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
    StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
  }

  void inflate_and_store_2_fp_registers(bool generatePrfm,
                                        FloatRegister src1, FloatRegister src2) {
    Register dst = r1;
    __ zip1(v1, __ T16B, src1, v0);
    __ zip2(v2, __ T16B, src1, v0);
    if (generatePrfm) {
      __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
    }
    __ zip1(v3, __ T16B, src2, v0);
    __ zip2(v4, __ T16B, src2, v0);
    __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
  }

  // R0 = src
  // R1 = dst
  // R2 = len
  // R3 = len >> 3
  // V0 = 0
  // v1 = loaded 8 bytes
  // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
  address generate_large_byte_array_inflate() {
    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id;
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
    Register src = r0, dst = r1, len = r2, octetCounter = r3;
    const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;

    // do one more 8-byte read to have the address 16-byte aligned in most cases;
    // this also lets us use a single store instruction
    __ ldrd(v2, __ post(src, 8));
    __ sub(octetCounter, octetCounter, 2);
    __ zip1(v1, __ T16B, v1, v0);
    __ zip1(v2, __ T16B, v2, v0);
    __ st1(v1, v2, __ T16B, __ post(dst, 32));
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ subs(rscratch1, octetCounter, large_loop_threshold);
    __ br(__ LE, LOOP_START);
    __
b(LOOP_PRFM_START); 9247 __ bind(LOOP_PRFM); 9248 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9249 __ bind(LOOP_PRFM_START); 9250 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 9251 __ sub(octetCounter, octetCounter, 8); 9252 __ subs(rscratch1, octetCounter, large_loop_threshold); 9253 inflate_and_store_2_fp_registers(true, v3, v4); 9254 inflate_and_store_2_fp_registers(true, v5, v6); 9255 __ br(__ GT, LOOP_PRFM); 9256 __ cmp(octetCounter, (u1)8); 9257 __ br(__ LT, DONE); 9258 __ bind(LOOP); 9259 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9260 __ bind(LOOP_START); 9261 __ sub(octetCounter, octetCounter, 8); 9262 __ cmp(octetCounter, (u1)8); 9263 inflate_and_store_2_fp_registers(false, v3, v4); 9264 inflate_and_store_2_fp_registers(false, v5, v6); 9265 __ br(__ GE, LOOP); 9266 __ bind(DONE); 9267 __ ret(lr); 9268 return entry; 9269 } 9270 9271 /** 9272 * Arguments: 9273 * 9274 * Input: 9275 * c_rarg0 - current state address 9276 * c_rarg1 - H key address 9277 * c_rarg2 - data address 9278 * c_rarg3 - number of blocks 9279 * 9280 * Output: 9281 * Updated state at c_rarg0 9282 */ 9283 address generate_ghash_processBlocks() { 9284 // Bafflingly, GCM uses little-endian for the byte order, but 9285 // big-endian for the bit order. For example, the polynomial 1 is 9286 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 9287 // 9288 // So, we must either reverse the bytes in each word and do 9289 // everything big-endian or reverse the bits in each byte and do 9290 // it little-endian. On AArch64 it's more idiomatic to reverse 9291 // the bits in each byte (we have an instruction, RBIT, to do 9292 // that) and keep the data in little-endian bit order through the 9293 // calculation, bit-reversing the inputs and outputs. 9294 9295 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id; 9296 StubCodeMark mark(this, stub_id); 9297 __ align(wordSize * 2); 9298 address p = __ pc(); 9299 __ emit_int64(0x87); // The low-order bits of the field 9300 // polynomial (i.e. 
p = z^7+z^2+z+1) 9301 // repeated in the low and high parts of a 9302 // 128-bit vector 9303 __ emit_int64(0x87); 9304 9305 __ align(CodeEntryAlignment); 9306 address start = __ pc(); 9307 9308 Register state = c_rarg0; 9309 Register subkeyH = c_rarg1; 9310 Register data = c_rarg2; 9311 Register blocks = c_rarg3; 9312 9313 FloatRegister vzr = v30; 9314 __ eor(vzr, __ T16B, vzr, vzr); // zero register 9315 9316 __ ldrq(v24, p); // The field polynomial 9317 9318 __ ldrq(v0, Address(state)); 9319 __ ldrq(v1, Address(subkeyH)); 9320 9321 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 9322 __ rbit(v0, __ T16B, v0); 9323 __ rev64(v1, __ T16B, v1); 9324 __ rbit(v1, __ T16B, v1); 9325 9326 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 9327 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 9328 9329 { 9330 Label L_ghash_loop; 9331 __ bind(L_ghash_loop); 9332 9333 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 9334 // reversing each byte 9335 __ rbit(v2, __ T16B, v2); 9336 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 9337 9338 // Multiply state in v2 by subkey in v1 9339 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 9340 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 9341 /*temps*/v6, v3, /*reuse/clobber b*/v2); 9342 // Reduce v7:v5 by the field polynomial 9343 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 9344 9345 __ sub(blocks, blocks, 1); 9346 __ cbnz(blocks, L_ghash_loop); 9347 } 9348 9349 // The bit-reversed result is at this point in v0 9350 __ rev64(v0, __ T16B, v0); 9351 __ rbit(v0, __ T16B, v0); 9352 9353 __ st1(v0, __ T16B, state); 9354 __ ret(lr); 9355 9356 return start; 9357 } 9358 9359 address generate_ghash_processBlocks_wide() { 9360 address small = generate_ghash_processBlocks(); 9361 9362 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id; 9363 StubCodeMark mark(this, stub_id); 9364 __ align(wordSize * 2); 9365 address p = __ pc(); 9366 __ emit_int64(0x87); // The low-order bits of the field 9367 // polynomial (i.e. p = z^7+z^2+z+1) 9368 // repeated in the low and high parts of a 9369 // 128-bit vector 9370 __ emit_int64(0x87); 9371 9372 __ align(CodeEntryAlignment); 9373 address start = __ pc(); 9374 9375 Register state = c_rarg0; 9376 Register subkeyH = c_rarg1; 9377 Register data = c_rarg2; 9378 Register blocks = c_rarg3; 9379 9380 const int unroll = 4; 9381 9382 __ cmp(blocks, (unsigned char)(unroll * 2)); 9383 __ br(__ LT, small); 9384 9385 if (unroll > 1) { 9386 // Save state before entering routine 9387 __ sub(sp, sp, 4 * 16); 9388 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 9389 __ sub(sp, sp, 4 * 16); 9390 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 9391 } 9392 9393 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 9394 9395 if (unroll > 1) { 9396 // And restore state 9397 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 9398 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 9399 } 9400 9401 __ cmp(blocks, (unsigned char)0); 9402 __ br(__ GT, small); 9403 9404 __ ret(lr); 9405 9406 return start; 9407 } 9408 9409 void generate_base64_encode_simdround(Register src, Register dst, 9410 FloatRegister codec, u8 size) { 9411 9412 FloatRegister in0 = v4, in1 = v5, in2 = v6; 9413 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 9414 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 9415 9416 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 9417 9418 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 9419 9420 __ ushr(ind0, arrangement, in0, 2); 9421 9422 __ ushr(ind1, arrangement, in1, 2); 9423 __ shl(in0, arrangement, in0, 6); 9424 __ orr(ind1, arrangement, ind1, in0); 9425 __ ushr(ind1, arrangement, ind1, 2); 9426 9427 __ ushr(ind2, arrangement, in2, 4); 9428 __ shl(in1, arrangement, in1, 4); 9429 __ orr(ind2, arrangement, in1, ind2); 9430 __ ushr(ind2, arrangement, ind2, 2); 9431 9432 __ shl(ind3, arrangement, in2, 2); 9433 __ ushr(ind3, arrangement, ind3, 2); 9434 9435 __ tbl(out0, arrangement, codec, 4, ind0); 9436 __ tbl(out1, arrangement, codec, 4, ind1); 9437 __ tbl(out2, arrangement, codec, 4, ind2); 9438 __ tbl(out3, arrangement, codec, 4, ind3); 9439 9440 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 9441 } 9442 9443 /** 9444 * Arguments: 9445 * 9446 * Input: 9447 * c_rarg0 - src_start 9448 * c_rarg1 - src_offset 9449 * c_rarg2 - src_length 9450 * c_rarg3 - dest_start 9451 * c_rarg4 - dest_offset 9452 * c_rarg5 - isURL 9453 * 9454 */ 9455 address generate_base64_encodeBlock() { 9456 9457 static const char toBase64[64] = { 9458 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9459 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9460 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9461 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9462 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 9463 }; 9464 9465 static const char toBase64URL[64] = { 9466 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9467 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9468 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9469 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9470 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 9471 }; 9472 9473 __ align(CodeEntryAlignment); 9474 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 9475 StubCodeMark mark(this, stub_id); 9476 address start = __ pc(); 9477 9478 Register src = c_rarg0; // source array 9479 Register soff = c_rarg1; // source start offset 9480 Register send = c_rarg2; // source end offset 9481 Register dst = c_rarg3; // dest array 9482 Register doff = c_rarg4; // position for writing to dest array 9483 Register isURL = c_rarg5; // Base64 or URL character set 9484 9485 // c_rarg6 and c_rarg7 are free to use as temps 9486 Register codec = c_rarg6; 9487 Register length = c_rarg7; 9488 9489 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 9490 9491 __ add(src, src, soff); 9492 __ add(dst, dst, doff); 9493 __ sub(length, send, soff); 9494 9495 // load the codec base address 9496 __ lea(codec, ExternalAddress((address) toBase64)); 9497 __ cbz(isURL, ProcessData); 9498 __ lea(codec, ExternalAddress((address) toBase64URL)); 9499 9500 __ BIND(ProcessData); 9501 9502 // too short to formup a SIMD loop, roll back 9503 __ cmp(length, (u1)24); 9504 __ br(Assembler::LT, Process3B); 9505 9506 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 9507 9508 __ BIND(Process48B); 9509 __ cmp(length, (u1)48); 9510 __ br(Assembler::LT, Process24B); 9511 generate_base64_encode_simdround(src, dst, v0, 16); 9512 __ sub(length, length, 48); 9513 __ b(Process48B); 9514 9515 __ BIND(Process24B); 9516 __ cmp(length, (u1)24); 9517 __ br(Assembler::LT, SIMDExit); 9518 generate_base64_encode_simdround(src, dst, v0, 8); 9519 __ sub(length, length, 24); 9520 9521 __ BIND(SIMDExit); 9522 
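    // Fewer than 24 bytes remain: exit if nothing is left, otherwise encode the
    // remaining input 3 bytes at a time.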
__ cbz(length, Exit); 9523 9524 __ BIND(Process3B); 9525 // 3 src bytes, 24 bits 9526 __ ldrb(r10, __ post(src, 1)); 9527 __ ldrb(r11, __ post(src, 1)); 9528 __ ldrb(r12, __ post(src, 1)); 9529 __ orrw(r11, r11, r10, Assembler::LSL, 8); 9530 __ orrw(r12, r12, r11, Assembler::LSL, 8); 9531 // codec index 9532 __ ubfmw(r15, r12, 18, 23); 9533 __ ubfmw(r14, r12, 12, 17); 9534 __ ubfmw(r13, r12, 6, 11); 9535 __ andw(r12, r12, 63); 9536 // get the code based on the codec 9537 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 9538 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 9539 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 9540 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 9541 __ strb(r15, __ post(dst, 1)); 9542 __ strb(r14, __ post(dst, 1)); 9543 __ strb(r13, __ post(dst, 1)); 9544 __ strb(r12, __ post(dst, 1)); 9545 __ sub(length, length, 3); 9546 __ cbnz(length, Process3B); 9547 9548 __ BIND(Exit); 9549 __ ret(lr); 9550 9551 return start; 9552 } 9553 9554 void generate_base64_decode_simdround(Register src, Register dst, 9555 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 9556 9557 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 9558 FloatRegister out0 = v20, out1 = v21, out2 = v22; 9559 9560 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 9561 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 9562 9563 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 9564 9565 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 9566 9567 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 9568 9569 // we need unsigned saturating subtract, to make sure all input values 9570 // in range [0, 63] will have 0U value in the higher half lookup 9571 __ uqsubv(decH0, __ T16B, in0, v27); 9572 __ uqsubv(decH1, __ T16B, in1, v27); 9573 __ uqsubv(decH2, __ T16B, in2, v27); 9574 __ uqsubv(decH3, __ T16B, in3, v27); 9575 9576 // lower half lookup 9577 __ tbl(decL0, arrangement, codecL, 4, in0); 9578 __ tbl(decL1, arrangement, codecL, 4, in1); 9579 __ tbl(decL2, arrangement, codecL, 4, in2); 9580 __ tbl(decL3, arrangement, codecL, 4, in3); 9581 9582 // higher half lookup 9583 __ tbx(decH0, arrangement, codecH, 4, decH0); 9584 __ tbx(decH1, arrangement, codecH, 4, decH1); 9585 __ tbx(decH2, arrangement, codecH, 4, decH2); 9586 __ tbx(decH3, arrangement, codecH, 4, decH3); 9587 9588 // combine lower and higher 9589 __ orr(decL0, arrangement, decL0, decH0); 9590 __ orr(decL1, arrangement, decL1, decH1); 9591 __ orr(decL2, arrangement, decL2, decH2); 9592 __ orr(decL3, arrangement, decL3, decH3); 9593 9594 // check illegal inputs, value larger than 63 (maximum of 6 bits) 9595 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 9596 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 9597 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 9598 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 9599 __ orr(in0, arrangement, decH0, decH1); 9600 __ orr(in1, arrangement, decH2, decH3); 9601 __ orr(in2, arrangement, in0, in1); 9602 __ umaxv(in3, arrangement, in2); 9603 __ umov(rscratch2, in3, __ B, 0); 9604 9605 // get the data to output 9606 __ shl(out0, arrangement, decL0, 2); 9607 __ ushr(out1, arrangement, decL1, 4); 9608 __ orr(out0, arrangement, out0, out1); 9609 __ shl(out1, arrangement, decL1, 4); 9610 __ ushr(out2, arrangement, decL2, 2); 9611 __ orr(out1, arrangement, out1, out2); 9612 __ shl(out2, arrangement, decL2, 6); 9613 __ orr(out2, arrangement, out2, decL3); 9614 9615 __ 
cbz(rscratch2, NoIllegalData); 9616 9617 // handle illegal input 9618 __ umov(r10, in2, __ D, 0); 9619 if (size == 16) { 9620 __ cbnz(r10, ErrorInLowerHalf); 9621 9622 // illegal input is in higher half, store the lower half now. 9623 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 9624 9625 __ umov(r10, in2, __ D, 1); 9626 __ umov(r11, out0, __ D, 1); 9627 __ umov(r12, out1, __ D, 1); 9628 __ umov(r13, out2, __ D, 1); 9629 __ b(StoreLegalData); 9630 9631 __ BIND(ErrorInLowerHalf); 9632 } 9633 __ umov(r11, out0, __ D, 0); 9634 __ umov(r12, out1, __ D, 0); 9635 __ umov(r13, out2, __ D, 0); 9636 9637 __ BIND(StoreLegalData); 9638 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 9639 __ strb(r11, __ post(dst, 1)); 9640 __ strb(r12, __ post(dst, 1)); 9641 __ strb(r13, __ post(dst, 1)); 9642 __ lsr(r10, r10, 8); 9643 __ lsr(r11, r11, 8); 9644 __ lsr(r12, r12, 8); 9645 __ lsr(r13, r13, 8); 9646 __ b(StoreLegalData); 9647 9648 __ BIND(NoIllegalData); 9649 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 9650 } 9651 9652 9653 /** 9654 * Arguments: 9655 * 9656 * Input: 9657 * c_rarg0 - src_start 9658 * c_rarg1 - src_offset 9659 * c_rarg2 - src_length 9660 * c_rarg3 - dest_start 9661 * c_rarg4 - dest_offset 9662 * c_rarg5 - isURL 9663 * c_rarg6 - isMIME 9664 * 9665 */ 9666 address generate_base64_decodeBlock() { 9667 9668 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 9669 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 9670 // titled "Base64 decoding". 9671 9672 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 9673 // except the trailing character '=' is also treated illegal value in this intrinsic. That 9674 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
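    // As a rough sketch (not the exact bit-field sequence used at
    // Process4B below), each group of four input characters is decoded
    // through these tables roughly as follows; a 255u result from any
    // lookup marks an illegal character:
    //
    //   uint32_t d0 = t[src[0]], d1 = t[src[1]], d2 = t[src[2]], d3 = t[src[3]];
    //   if ((d0 | d1 | d2 | d3) & 0x80) goto Exit;           // illegal input
    //   uint32_t bits = (d0 << 18) | (d1 << 12) | (d2 << 6) | d3;
    //   dst[0] = bits >> 16; dst[1] = bits >> 8; dst[2] = bits;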
9675 static const uint8_t fromBase64ForNoSIMD[256] = { 9676 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9677 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9678 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 9679 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9680 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 9681 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 9682 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 9683 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 9684 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9685 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9686 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9687 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9688 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9689 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9690 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9691 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9692 }; 9693 9694 static const uint8_t fromBase64URLForNoSIMD[256] = { 9695 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9696 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9697 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 9698 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9699 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 9700 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 9701 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 9702 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 9703 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9704 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9705 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9706 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9707 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9708 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9709 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9710 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9711 }; 9712 9713 // A legal value of base64 code is in range [0, 127]. We need two lookups 9714 // with tbl/tbx and combine them to get the decode data. The 1st table vector 9715 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 9716 // table vector lookup use tbx, out of range indices are unchanged in 9717 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 9718 // The value of index 64 is set to 0, so that we know that we already get the 9719 // decoded data with the 1st lookup. 9720 static const uint8_t fromBase64ForSIMD[128] = { 9721 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9722 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9723 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 9724 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9725 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 9726 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 9727 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 9728 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 9729 }; 9730 9731 static const uint8_t fromBase64URLForSIMD[128] = { 9732 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9733 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 9734 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 9735 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 9736 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 9737 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 9738 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 9739 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 9740 }; 9741 9742 __ align(CodeEntryAlignment); 9743 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 9744 StubCodeMark mark(this, stub_id); 9745 address start = __ pc(); 9746 9747 Register src = c_rarg0; // source array 9748 Register soff = c_rarg1; // source start offset 9749 Register send = c_rarg2; // source end offset 9750 Register dst = c_rarg3; // dest array 9751 Register doff = c_rarg4; // position for writing to dest array 9752 Register isURL = c_rarg5; // Base64 or URL character set 9753 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 9754 9755 Register length = send; // reuse send as length of source data to process 9756 9757 Register simd_codec = c_rarg6; 9758 Register nosimd_codec = c_rarg7; 9759 9760 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 9761 9762 __ enter(); 9763 9764 __ add(src, src, soff); 9765 __ add(dst, dst, doff); 9766 9767 __ mov(doff, dst); 9768 9769 __ sub(length, send, soff); 9770 __ bfm(length, zr, 0, 1); 9771 9772 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 9773 __ cbz(isURL, ProcessData); 9774 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 9775 9776 __ BIND(ProcessData); 9777 __ mov(rscratch1, length); 9778 __ cmp(length, (u1)144); // 144 = 80 + 64 9779 __ br(Assembler::LT, Process4B); 9780 9781 // In the MIME case, the line length cannot be more than 76 9782 // bytes (see RFC 2045). This is too short a block for SIMD 9783 // to be worthwhile, so we use non-SIMD here. 
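    // A sketch of the overall control flow below (not literal code):
    //
    //   len = (send - soff) & ~3;
    //   if (len >= 144) {                      // 144 = 80 + 64
    //     scalar-decode the first 80 characters (76, the maximum MIME
    //     line length, rounded up to a multiple of 4), then decode
    //     64-byte and 32-byte SIMD rounds while they still fit;
    //   }
    //   scalar-decode whatever remains, 4 characters at a time;
    //   an illegal character exits early on every path.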
9784 __ movw(rscratch1, 79); 9785 9786 __ BIND(Process4B); 9787 __ ldrw(r14, __ post(src, 4)); 9788 __ ubfxw(r10, r14, 0, 8); 9789 __ ubfxw(r11, r14, 8, 8); 9790 __ ubfxw(r12, r14, 16, 8); 9791 __ ubfxw(r13, r14, 24, 8); 9792 // get the de-code 9793 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 9794 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 9795 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 9796 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 9797 // error detection, 255u indicates an illegal input 9798 __ orrw(r14, r10, r11); 9799 __ orrw(r15, r12, r13); 9800 __ orrw(r14, r14, r15); 9801 __ tbnz(r14, 7, Exit); 9802 // recover the data 9803 __ lslw(r14, r10, 10); 9804 __ bfiw(r14, r11, 4, 6); 9805 __ bfmw(r14, r12, 2, 5); 9806 __ rev16w(r14, r14); 9807 __ bfiw(r13, r12, 6, 2); 9808 __ strh(r14, __ post(dst, 2)); 9809 __ strb(r13, __ post(dst, 1)); 9810 // non-simd loop 9811 __ subsw(rscratch1, rscratch1, 4); 9812 __ br(Assembler::GT, Process4B); 9813 9814 // if exiting from PreProcess80B, rscratch1 == -1; 9815 // otherwise, rscratch1 == 0. 9816 __ cbzw(rscratch1, Exit); 9817 __ sub(length, length, 80); 9818 9819 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 9820 __ cbz(isURL, SIMDEnter); 9821 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 9822 9823 __ BIND(SIMDEnter); 9824 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 9825 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 9826 __ mov(rscratch1, 63); 9827 __ dup(v27, __ T16B, rscratch1); 9828 9829 __ BIND(Process64B); 9830 __ cmp(length, (u1)64); 9831 __ br(Assembler::LT, Process32B); 9832 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 9833 __ sub(length, length, 64); 9834 __ b(Process64B); 9835 9836 __ BIND(Process32B); 9837 __ cmp(length, (u1)32); 9838 __ br(Assembler::LT, SIMDExit); 9839 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 9840 __ sub(length, length, 32); 9841 __ b(Process32B); 9842 9843 __ BIND(SIMDExit); 9844 __ cbz(length, Exit); 9845 __ movw(rscratch1, length); 9846 __ b(Process4B); 9847 9848 __ BIND(Exit); 9849 __ sub(c_rarg0, dst, doff); 9850 9851 __ leave(); 9852 __ ret(lr); 9853 9854 return start; 9855 } 9856 9857 // Support for spin waits. 9858 address generate_spin_wait() { 9859 __ align(CodeEntryAlignment); 9860 StubGenStubId stub_id = StubGenStubId::spin_wait_id; 9861 StubCodeMark mark(this, stub_id); 9862 address start = __ pc(); 9863 9864 __ spin_wait(); 9865 __ ret(lr); 9866 9867 return start; 9868 } 9869 9870 void generate_lookup_secondary_supers_table_stub() { 9871 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 9872 StubCodeMark mark(this, stub_id); 9873 9874 const Register 9875 r_super_klass = r0, 9876 r_array_base = r1, 9877 r_array_length = r2, 9878 r_array_index = r3, 9879 r_sub_klass = r4, 9880 r_bitmap = rscratch2, 9881 result = r5; 9882 const FloatRegister 9883 vtemp = v0; 9884 9885 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 9886 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 9887 Label L_success; 9888 __ enter(); 9889 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 9890 r_array_base, r_array_length, r_array_index, 9891 vtemp, result, slot, 9892 /*stub_is_near*/true); 9893 __ leave(); 9894 __ ret(lr); 9895 } 9896 } 9897 9898 // Slow path implementation for UseSecondarySupersTable. 
9899 address generate_lookup_secondary_supers_table_slow_path_stub() { 9900 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 9901 StubCodeMark mark(this, stub_id); 9902 9903 address start = __ pc(); 9904 const Register 9905 r_super_klass = r0, // argument 9906 r_array_base = r1, // argument 9907 temp1 = r2, // temp 9908 r_array_index = r3, // argument 9909 r_bitmap = rscratch2, // argument 9910 result = r5; // argument 9911 9912 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 9913 __ ret(lr); 9914 9915 return start; 9916 } 9917 9918 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 9919 9920 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 9921 // 9922 // If LSE is in use, generate LSE versions of all the stubs. The 9923 // non-LSE versions are in atomic_aarch64.S. 9924 9925 // class AtomicStubMark records the entry point of a stub and the 9926 // stub pointer which will point to it. The stub pointer is set to 9927 // the entry point when ~AtomicStubMark() is called, which must be 9928 // after ICache::invalidate_range. This ensures safe publication of 9929 // the generated code. 9930 class AtomicStubMark { 9931 address _entry_point; 9932 aarch64_atomic_stub_t *_stub; 9933 MacroAssembler *_masm; 9934 public: 9935 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 9936 _masm = masm; 9937 __ align(32); 9938 _entry_point = __ pc(); 9939 _stub = stub; 9940 } 9941 ~AtomicStubMark() { 9942 *_stub = (aarch64_atomic_stub_t)_entry_point; 9943 } 9944 }; 9945 9946 // NB: For memory_order_conservative we need a trailing membar after 9947 // LSE atomic operations but not a leading membar. 9948 // 9949 // We don't need a leading membar because a clause in the Arm ARM 9950 // says: 9951 // 9952 // Barrier-ordered-before 9953 // 9954 // Barrier instructions order prior Memory effects before subsequent 9955 // Memory effects generated by the same Observer. A read or a write 9956 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 9957 // Observer if and only if RW1 appears in program order before RW 2 9958 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 9959 // instruction with both Acquire and Release semantics. 9960 // 9961 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 9962 // and Release semantics, therefore we don't need a leading 9963 // barrier. However, there is no corresponding Barrier-ordered-after 9964 // relationship, therefore we need a trailing membar to prevent a 9965 // later store or load from being reordered with the store in an 9966 // atomic instruction. 9967 // 9968 // This was checked by using the herd7 consistency model simulator 9969 // (http://diy.inria.fr/) with this test case: 9970 // 9971 // AArch64 LseCas 9972 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 9973 // P0 | P1; 9974 // LDR W4, [X2] | MOV W3, #0; 9975 // DMB LD | MOV W4, #1; 9976 // LDR W3, [X1] | CASAL W3, W4, [X1]; 9977 // | DMB ISH; 9978 // | STR W4, [X2]; 9979 // exists 9980 // (0:X3=0 /\ 0:X4=1) 9981 // 9982 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 9983 // with the store to x in P1. Without the DMB in P1 this may happen. 9984 // 9985 // At the time of writing we don't know of any AArch64 hardware that 9986 // reorders stores in this way, but the Reference Manual permits it. 
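  // As a rough sketch, the memory_order_conservative CAS stubs
  // generated below therefore look approximately like
  //
  //        mov     prev, compare_val
  //        casal   prev, exchange_val, [ptr]
  //        dmb     ish                        // trailing barrier only
  //        mov     r0, prev
  //        ret
  //
  // with analogous ldaddal/swpal forms for fetch-add and exchange;
  // the relaxed variants use the plain (non-al) instructions and omit
  // the trailing barrier.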
9987 9988 void gen_cas_entry(Assembler::operand_size size, 9989 atomic_memory_order order) { 9990 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 9991 exchange_val = c_rarg2; 9992 bool acquire, release; 9993 switch (order) { 9994 case memory_order_relaxed: 9995 acquire = false; 9996 release = false; 9997 break; 9998 case memory_order_release: 9999 acquire = false; 10000 release = true; 10001 break; 10002 default: 10003 acquire = true; 10004 release = true; 10005 break; 10006 } 10007 __ mov(prev, compare_val); 10008 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 10009 if (order == memory_order_conservative) { 10010 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10011 } 10012 if (size == Assembler::xword) { 10013 __ mov(r0, prev); 10014 } else { 10015 __ movw(r0, prev); 10016 } 10017 __ ret(lr); 10018 } 10019 10020 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 10021 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10022 // If not relaxed, then default to conservative. Relaxed is the only 10023 // case we use enough to be worth specializing. 10024 if (order == memory_order_relaxed) { 10025 __ ldadd(size, incr, prev, addr); 10026 } else { 10027 __ ldaddal(size, incr, prev, addr); 10028 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10029 } 10030 if (size == Assembler::xword) { 10031 __ mov(r0, prev); 10032 } else { 10033 __ movw(r0, prev); 10034 } 10035 __ ret(lr); 10036 } 10037 10038 void gen_swpal_entry(Assembler::operand_size size) { 10039 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10040 __ swpal(size, incr, prev, addr); 10041 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10042 if (size == Assembler::xword) { 10043 __ mov(r0, prev); 10044 } else { 10045 __ movw(r0, prev); 10046 } 10047 __ ret(lr); 10048 } 10049 10050 void generate_atomic_entry_points() { 10051 if (! 
UseLSE) { 10052 return; 10053 } 10054 __ align(CodeEntryAlignment); 10055 StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id; 10056 StubCodeMark mark(this, stub_id); 10057 address first_entry = __ pc(); 10058 10059 // ADD, memory_order_conservative 10060 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 10061 gen_ldadd_entry(Assembler::word, memory_order_conservative); 10062 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 10063 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 10064 10065 // ADD, memory_order_relaxed 10066 AtomicStubMark mark_fetch_add_4_relaxed 10067 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 10068 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 10069 AtomicStubMark mark_fetch_add_8_relaxed 10070 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 10071 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 10072 10073 // XCHG, memory_order_conservative 10074 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 10075 gen_swpal_entry(Assembler::word); 10076 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 10077 gen_swpal_entry(Assembler::xword); 10078 10079 // CAS, memory_order_conservative 10080 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 10081 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 10082 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 10083 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 10084 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 10085 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 10086 10087 // CAS, memory_order_relaxed 10088 AtomicStubMark mark_cmpxchg_1_relaxed 10089 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 10090 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 10091 AtomicStubMark mark_cmpxchg_4_relaxed 10092 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 10093 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 10094 AtomicStubMark mark_cmpxchg_8_relaxed 10095 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 10096 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 10097 10098 AtomicStubMark mark_cmpxchg_4_release 10099 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 10100 gen_cas_entry(MacroAssembler::word, memory_order_release); 10101 AtomicStubMark mark_cmpxchg_8_release 10102 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 10103 gen_cas_entry(MacroAssembler::xword, memory_order_release); 10104 10105 AtomicStubMark mark_cmpxchg_4_seq_cst 10106 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 10107 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 10108 AtomicStubMark mark_cmpxchg_8_seq_cst 10109 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 10110 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 10111 10112 ICache::invalidate_range(first_entry, __ pc() - first_entry); 10113 } 10114 #endif // LINUX 10115 10116 address generate_cont_thaw(Continuation::thaw_kind kind) { 10117 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 10118 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 10119 10120 address start = __ pc(); 10121 10122 if (return_barrier) { 10123 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 10124 __ mov(sp, rscratch1); 10125 } 10126 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), 
Assembler::EQ, "incorrect sp"); 10127 10128 if (return_barrier) { 10129 // preserve possible return value from a method returning to the return barrier 10130 __ fmovd(rscratch1, v0); 10131 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10132 } 10133 10134 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 10135 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 10136 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 10137 10138 if (return_barrier) { 10139 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10140 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10141 __ fmovd(v0, rscratch1); 10142 } 10143 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10144 10145 10146 Label thaw_success; 10147 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 10148 __ cbnz(rscratch2, thaw_success); 10149 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 10150 __ br(rscratch1); 10151 __ bind(thaw_success); 10152 10153 // make room for the thawed frames 10154 __ sub(rscratch1, sp, rscratch2); 10155 __ andr(rscratch1, rscratch1, -16); // align 10156 __ mov(sp, rscratch1); 10157 10158 if (return_barrier) { 10159 // save original return value -- again 10160 __ fmovd(rscratch1, v0); 10161 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10162 } 10163 10164 // If we want, we can templatize thaw by kind, and have three different entries 10165 __ movw(c_rarg1, (uint32_t)kind); 10166 10167 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 10168 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 10169 10170 if (return_barrier) { 10171 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10172 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10173 __ fmovd(v0, rscratch1); 10174 } else { 10175 __ mov(r0, zr); // return 0 (success) from doYield 10176 } 10177 10178 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 10179 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 10180 __ mov(rfp, sp); 10181 10182 if (return_barrier_exception) { 10183 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 10184 __ authenticate_return_address(c_rarg1); 10185 __ verify_oop(r0); 10186 // save return value containing the exception oop in callee-saved R19 10187 __ mov(r19, r0); 10188 10189 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 10190 10191 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
10192 // __ reinitialize_ptrue(); 10193 10194 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 10195 10196 __ mov(r1, r0); // the exception handler 10197 __ mov(r0, r19); // restore return value containing the exception oop 10198 __ verify_oop(r0); 10199 10200 __ leave(); 10201 __ mov(r3, lr); 10202 __ br(r1); // the exception handler 10203 } else { 10204 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 10205 __ leave(); 10206 __ ret(lr); 10207 } 10208 10209 return start; 10210 } 10211 10212 address generate_cont_thaw() { 10213 if (!Continuations::enabled()) return nullptr; 10214 10215 StubGenStubId stub_id = StubGenStubId::cont_thaw_id; 10216 StubCodeMark mark(this, stub_id); 10217 address start = __ pc(); 10218 generate_cont_thaw(Continuation::thaw_top); 10219 return start; 10220 } 10221 10222 address generate_cont_returnBarrier() { 10223 if (!Continuations::enabled()) return nullptr; 10224 10225 // TODO: will probably need multiple return barriers depending on return type 10226 StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id; 10227 StubCodeMark mark(this, stub_id); 10228 address start = __ pc(); 10229 10230 generate_cont_thaw(Continuation::thaw_return_barrier); 10231 10232 return start; 10233 } 10234 10235 address generate_cont_returnBarrier_exception() { 10236 if (!Continuations::enabled()) return nullptr; 10237 10238 StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id; 10239 StubCodeMark mark(this, stub_id); 10240 address start = __ pc(); 10241 10242 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 10243 10244 return start; 10245 } 10246 10247 address generate_cont_preempt_stub() { 10248 if (!Continuations::enabled()) return nullptr; 10249 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 10250 StubCodeMark mark(this, stub_id); 10251 address start = __ pc(); 10252 10253 __ reset_last_Java_frame(true); 10254 10255 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 10256 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 10257 __ mov(sp, rscratch2); 10258 10259 Label preemption_cancelled; 10260 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 10261 __ cbnz(rscratch1, preemption_cancelled); 10262 10263 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 10264 SharedRuntime::continuation_enter_cleanup(_masm); 10265 __ leave(); 10266 __ ret(lr); 10267 10268 // We acquired the monitor after freezing the frames so call thaw to continue execution. 10269 __ bind(preemption_cancelled); 10270 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 10271 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 10272 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 10273 __ ldr(rscratch1, Address(rscratch1)); 10274 __ br(rscratch1); 10275 10276 return start; 10277 } 10278 10279 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 10280 // are represented as long[5], with BITS_PER_LIMB = 26. 10281 // Pack five 26-bit limbs into three 64-bit registers. 
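  // In C, approximately (a sketch; assumes each limb is already reduced
  // to its 26-bit range, as the step comments below expect):
  //
  //   dest0 = src[0] | (src[1] << 26) | (src[2] << 52);
  //   dest1 = (src[2] >> 12) | (src[3] << 14) | (src[4] << 40);
  //   dest2 = src[4] >> 24;       // omitted in the two-register variant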
10282 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 10283 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 10284 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 10285 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 10286 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 10287 10288 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 10289 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 10290 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 10291 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 10292 10293 if (dest2->is_valid()) { 10294 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 10295 } else { 10296 #ifdef ASSERT 10297 Label OK; 10298 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 10299 __ br(__ EQ, OK); 10300 __ stop("high bits of Poly1305 integer should be zero"); 10301 __ should_not_reach_here(); 10302 __ bind(OK); 10303 #endif 10304 } 10305 } 10306 10307 // As above, but return only a 128-bit integer, packed into two 10308 // 64-bit registers. 10309 void pack_26(Register dest0, Register dest1, Register src) { 10310 pack_26(dest0, dest1, noreg, src); 10311 } 10312 10313 // Multiply and multiply-accumulate unsigned 64-bit registers. 10314 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 10315 __ mul(prod_lo, n, m); 10316 __ umulh(prod_hi, n, m); 10317 } 10318 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 10319 wide_mul(rscratch1, rscratch2, n, m); 10320 __ adds(sum_lo, sum_lo, rscratch1); 10321 __ adc(sum_hi, sum_hi, rscratch2); 10322 } 10323 10324 // Poly1305, RFC 7539 10325 10326 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 10327 // description of the tricks used to simplify and accelerate this 10328 // computation. 10329 10330 address generate_poly1305_processBlocks() { 10331 __ align(CodeEntryAlignment); 10332 StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id; 10333 StubCodeMark mark(this, stub_id); 10334 address start = __ pc(); 10335 Label here; 10336 __ enter(); 10337 RegSet callee_saved = RegSet::range(r19, r28); 10338 __ push(callee_saved, sp); 10339 10340 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 10341 10342 // Arguments 10343 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 10344 10345 // R_n is the 128-bit randomly-generated key, packed into two 10346 // registers. The caller passes this key to us as long[5], with 10347 // BITS_PER_LIMB = 26. 
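    // A sketch of the arithmetic relied on below, assuming the standard
    // RFC 7539 clamping of r: modulo 2^130 - 5, every multiple of 2^130
    // can be folded back using 2^130 == 5. Clamping makes R_1 a multiple
    // of 4, so a product term x * R_1 * 2^128 equals
    // (x * (R_1 >> 2)) * 2^130 and reduces to x * ((R_1 >> 2) * 5);
    // this is why RR_n = (R_n >> 2) * 5 is precomputed just below. The
    // low two bits of R_0 are folded separately via the (R_0 & 3) term
    // in the main loop.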
10348 const Register R_0 = *++regs, R_1 = *++regs; 10349 pack_26(R_0, R_1, r_start); 10350 10351 // RR_n is (R_n >> 2) * 5 10352 const Register RR_0 = *++regs, RR_1 = *++regs; 10353 __ lsr(RR_0, R_0, 2); 10354 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 10355 __ lsr(RR_1, R_1, 2); 10356 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 10357 10358 // U_n is the current checksum 10359 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 10360 pack_26(U_0, U_1, U_2, acc_start); 10361 10362 static constexpr int BLOCK_LENGTH = 16; 10363 Label DONE, LOOP; 10364 10365 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10366 __ br(Assembler::LT, DONE); { 10367 __ bind(LOOP); 10368 10369 // S_n is to be the sum of U_n and the next block of data 10370 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 10371 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 10372 __ adds(S_0, U_0, S_0); 10373 __ adcs(S_1, U_1, S_1); 10374 __ adc(S_2, U_2, zr); 10375 __ add(S_2, S_2, 1); 10376 10377 const Register U_0HI = *++regs, U_1HI = *++regs; 10378 10379 // NB: this logic depends on some of the special properties of 10380 // Poly1305 keys. In particular, because we know that the top 10381 // four bits of R_0 and R_1 are zero, we can add together 10382 // partial products without any risk of needing to propagate a 10383 // carry out. 10384 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 10385 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 10386 __ andr(U_2, R_0, 3); 10387 __ mul(U_2, S_2, U_2); 10388 10389 // Recycle registers S_0, S_1, S_2 10390 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 10391 10392 // Partial reduction mod 2**130 - 5 10393 __ adds(U_1, U_0HI, U_1); 10394 __ adc(U_2, U_1HI, U_2); 10395 // Sum now in U_2:U_1:U_0. 10396 // Dead: U_0HI, U_1HI. 10397 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 10398 10399 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 10400 10401 // First, U_2:U_1:U_0 += (U_2 >> 2) 10402 __ lsr(rscratch1, U_2, 2); 10403 __ andr(U_2, U_2, (u8)3); 10404 __ adds(U_0, U_0, rscratch1); 10405 __ adcs(U_1, U_1, zr); 10406 __ adc(U_2, U_2, zr); 10407 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 10408 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 10409 __ adcs(U_1, U_1, zr); 10410 __ adc(U_2, U_2, zr); 10411 10412 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 10413 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10414 __ br(~ Assembler::LT, LOOP); 10415 } 10416 10417 // Further reduce modulo 2^130 - 5 10418 __ lsr(rscratch1, U_2, 2); 10419 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 10420 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 10421 __ adcs(U_1, U_1, zr); 10422 __ andr(U_2, U_2, (u1)3); 10423 __ adc(U_2, U_2, zr); 10424 10425 // Unpack the sum into five 26-bit limbs and write to memory. 
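    // In C, approximately (a sketch of the bit-field moves below; acc is
    // the long[5] at acc_start):
    //
    //   acc[0] =  U_0        & ((1 << 26) - 1);
    //   acc[1] = (U_0 >> 26) & ((1 << 26) - 1);
    //   acc[2] = (U_0 >> 52) | ((U_1 & 0x3fff) << 12);
    //   acc[3] = (U_1 >> 14) & ((1 << 26) - 1);
    //   acc[4] = (U_1 >> 40) | ((U_2 & 0x7)    << 24);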
10426 __ ubfiz(rscratch1, U_0, 0, 26); 10427 __ ubfx(rscratch2, U_0, 26, 26); 10428 __ stp(rscratch1, rscratch2, Address(acc_start)); 10429 __ ubfx(rscratch1, U_0, 52, 12); 10430 __ bfi(rscratch1, U_1, 12, 14); 10431 __ ubfx(rscratch2, U_1, 14, 26); 10432 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 10433 __ ubfx(rscratch1, U_1, 40, 24); 10434 __ bfi(rscratch1, U_2, 24, 3); 10435 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 10436 10437 __ bind(DONE); 10438 __ pop(callee_saved, sp); 10439 __ leave(); 10440 __ ret(lr); 10441 10442 return start; 10443 } 10444 10445 // exception handler for upcall stubs 10446 address generate_upcall_stub_exception_handler() { 10447 StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id; 10448 StubCodeMark mark(this, stub_id); 10449 address start = __ pc(); 10450 10451 // Native caller has no idea how to handle exceptions, 10452 // so we just crash here. Up to callee to catch exceptions. 10453 __ verify_oop(r0); 10454 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 10455 __ blr(rscratch1); 10456 __ should_not_reach_here(); 10457 10458 return start; 10459 } 10460 10461 // load Method* target of MethodHandle 10462 // j_rarg0 = jobject receiver 10463 // rmethod = result 10464 address generate_upcall_stub_load_target() { 10465 StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id; 10466 StubCodeMark mark(this, stub_id); 10467 address start = __ pc(); 10468 10469 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 10470 // Load target method from receiver 10471 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 10472 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 10473 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 10474 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 10475 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 10476 noreg, noreg); 10477 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 10478 10479 __ ret(lr); 10480 10481 return start; 10482 } 10483 10484 #undef __ 10485 #define __ masm-> 10486 10487 class MontgomeryMultiplyGenerator : public MacroAssembler { 10488 10489 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 10490 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 10491 10492 RegSet _toSave; 10493 bool _squaring; 10494 10495 public: 10496 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 10497 : MacroAssembler(as->code()), _squaring(squaring) { 10498 10499 // Register allocation 10500 10501 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 10502 Pa_base = *regs; // Argument registers 10503 if (squaring) 10504 Pb_base = Pa_base; 10505 else 10506 Pb_base = *++regs; 10507 Pn_base = *++regs; 10508 Rlen= *++regs; 10509 inv = *++regs; 10510 Pm_base = *++regs; 10511 10512 // Working registers: 10513 Ra = *++regs; // The current digit of a, b, n, and m. 10514 Rb = *++regs; 10515 Rm = *++regs; 10516 Rn = *++regs; 10517 10518 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 10519 Pb = *++regs; 10520 Pm = *++regs; 10521 Pn = *++regs; 10522 10523 t0 = *++regs; // Three registers which form a 10524 t1 = *++regs; // triple-precision accumuator. 
10525 t2 = *++regs; 10526 10527 Ri = *++regs; // Inner and outer loop indexes. 10528 Rj = *++regs; 10529 10530 Rhi_ab = *++regs; // Product registers: low and high parts 10531 Rlo_ab = *++regs; // of a*b and m*n. 10532 Rhi_mn = *++regs; 10533 Rlo_mn = *++regs; 10534 10535 // r19 and up are callee-saved. 10536 _toSave = RegSet::range(r19, *regs) + Pm_base; 10537 } 10538 10539 private: 10540 void save_regs() { 10541 push(_toSave, sp); 10542 } 10543 10544 void restore_regs() { 10545 pop(_toSave, sp); 10546 } 10547 10548 template <typename T> 10549 void unroll_2(Register count, T block) { 10550 Label loop, end, odd; 10551 tbnz(count, 0, odd); 10552 cbz(count, end); 10553 align(16); 10554 bind(loop); 10555 (this->*block)(); 10556 bind(odd); 10557 (this->*block)(); 10558 subs(count, count, 2); 10559 br(Assembler::GT, loop); 10560 bind(end); 10561 } 10562 10563 template <typename T> 10564 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 10565 Label loop, end, odd; 10566 tbnz(count, 0, odd); 10567 cbz(count, end); 10568 align(16); 10569 bind(loop); 10570 (this->*block)(d, s, tmp); 10571 bind(odd); 10572 (this->*block)(d, s, tmp); 10573 subs(count, count, 2); 10574 br(Assembler::GT, loop); 10575 bind(end); 10576 } 10577 10578 void pre1(RegisterOrConstant i) { 10579 block_comment("pre1"); 10580 // Pa = Pa_base; 10581 // Pb = Pb_base + i; 10582 // Pm = Pm_base; 10583 // Pn = Pn_base + i; 10584 // Ra = *Pa; 10585 // Rb = *Pb; 10586 // Rm = *Pm; 10587 // Rn = *Pn; 10588 ldr(Ra, Address(Pa_base)); 10589 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10590 ldr(Rm, Address(Pm_base)); 10591 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10592 lea(Pa, Address(Pa_base)); 10593 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 10594 lea(Pm, Address(Pm_base)); 10595 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10596 10597 // Zero the m*n result. 10598 mov(Rhi_mn, zr); 10599 mov(Rlo_mn, zr); 10600 } 10601 10602 // The core multiply-accumulate step of a Montgomery 10603 // multiplication. The idea is to schedule operations as a 10604 // pipeline so that instructions with long latencies (loads and 10605 // multiplies) have time to complete before their results are 10606 // used. This most benefits in-order implementations of the 10607 // architecture but out-of-order ones also benefit. 10608 void step() { 10609 block_comment("step"); 10610 // MACC(Ra, Rb, t0, t1, t2); 10611 // Ra = *++Pa; 10612 // Rb = *--Pb; 10613 umulh(Rhi_ab, Ra, Rb); 10614 mul(Rlo_ab, Ra, Rb); 10615 ldr(Ra, pre(Pa, wordSize)); 10616 ldr(Rb, pre(Pb, -wordSize)); 10617 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 10618 // previous iteration. 
10619 // MACC(Rm, Rn, t0, t1, t2); 10620 // Rm = *++Pm; 10621 // Rn = *--Pn; 10622 umulh(Rhi_mn, Rm, Rn); 10623 mul(Rlo_mn, Rm, Rn); 10624 ldr(Rm, pre(Pm, wordSize)); 10625 ldr(Rn, pre(Pn, -wordSize)); 10626 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10627 } 10628 10629 void post1() { 10630 block_comment("post1"); 10631 10632 // MACC(Ra, Rb, t0, t1, t2); 10633 // Ra = *++Pa; 10634 // Rb = *--Pb; 10635 umulh(Rhi_ab, Ra, Rb); 10636 mul(Rlo_ab, Ra, Rb); 10637 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10638 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10639 10640 // *Pm = Rm = t0 * inv; 10641 mul(Rm, t0, inv); 10642 str(Rm, Address(Pm)); 10643 10644 // MACC(Rm, Rn, t0, t1, t2); 10645 // t0 = t1; t1 = t2; t2 = 0; 10646 umulh(Rhi_mn, Rm, Rn); 10647 10648 #ifndef PRODUCT 10649 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 10650 { 10651 mul(Rlo_mn, Rm, Rn); 10652 add(Rlo_mn, t0, Rlo_mn); 10653 Label ok; 10654 cbz(Rlo_mn, ok); { 10655 stop("broken Montgomery multiply"); 10656 } bind(ok); 10657 } 10658 #endif 10659 // We have very carefully set things up so that 10660 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 10661 // the lower half of Rm * Rn because we know the result already: 10662 // it must be -t0. t0 + (-t0) must generate a carry iff 10663 // t0 != 0. So, rather than do a mul and an adds we just set 10664 // the carry flag iff t0 is nonzero. 10665 // 10666 // mul(Rlo_mn, Rm, Rn); 10667 // adds(zr, t0, Rlo_mn); 10668 subs(zr, t0, 1); // Set carry iff t0 is nonzero 10669 adcs(t0, t1, Rhi_mn); 10670 adc(t1, t2, zr); 10671 mov(t2, zr); 10672 } 10673 10674 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 10675 block_comment("pre2"); 10676 // Pa = Pa_base + i-len; 10677 // Pb = Pb_base + len; 10678 // Pm = Pm_base + i-len; 10679 // Pn = Pn_base + len; 10680 10681 if (i.is_register()) { 10682 sub(Rj, i.as_register(), len); 10683 } else { 10684 mov(Rj, i.as_constant()); 10685 sub(Rj, Rj, len); 10686 } 10687 // Rj == i-len 10688 10689 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 10690 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 10691 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 10692 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 10693 10694 // Ra = *++Pa; 10695 // Rb = *--Pb; 10696 // Rm = *++Pm; 10697 // Rn = *--Pn; 10698 ldr(Ra, pre(Pa, wordSize)); 10699 ldr(Rb, pre(Pb, -wordSize)); 10700 ldr(Rm, pre(Pm, wordSize)); 10701 ldr(Rn, pre(Pn, -wordSize)); 10702 10703 mov(Rhi_mn, zr); 10704 mov(Rlo_mn, zr); 10705 } 10706 10707 void post2(RegisterOrConstant i, RegisterOrConstant len) { 10708 block_comment("post2"); 10709 if (i.is_constant()) { 10710 mov(Rj, i.as_constant()-len.as_constant()); 10711 } else { 10712 sub(Rj, i.as_register(), len); 10713 } 10714 10715 adds(t0, t0, Rlo_mn); // The pending m*n, low part 10716 10717 // As soon as we know the least significant digit of our result, 10718 // store it. 10719 // Pm_base[i-len] = t0; 10720 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 10721 10722 // t0 = t1; t1 = t2; t2 = 0; 10723 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 10724 adc(t1, t2, zr); 10725 mov(t2, zr); 10726 } 10727 10728 // A carry in t0 after Montgomery multiplication means that we 10729 // should subtract multiples of n from our result in m. We'll 10730 // keep doing that until there is no carry. 
10731 void normalize(RegisterOrConstant len) { 10732 block_comment("normalize"); 10733 // while (t0) 10734 // t0 = sub(Pm_base, Pn_base, t0, len); 10735 Label loop, post, again; 10736 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 10737 cbz(t0, post); { 10738 bind(again); { 10739 mov(i, zr); 10740 mov(cnt, len); 10741 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 10742 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10743 subs(zr, zr, zr); // set carry flag, i.e. no borrow 10744 align(16); 10745 bind(loop); { 10746 sbcs(Rm, Rm, Rn); 10747 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 10748 add(i, i, 1); 10749 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 10750 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 10751 sub(cnt, cnt, 1); 10752 } cbnz(cnt, loop); 10753 sbc(t0, t0, zr); 10754 } cbnz(t0, again); 10755 } bind(post); 10756 } 10757 10758 // Move memory at s to d, reversing words. 10759 // Increments d to end of copied memory 10760 // Destroys tmp1, tmp2 10761 // Preserves len 10762 // Leaves s pointing to the address which was in d at start 10763 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 10764 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 10765 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 10766 10767 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 10768 mov(tmp1, len); 10769 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 10770 sub(s, d, len, ext::uxtw, LogBytesPerWord); 10771 } 10772 // where 10773 void reverse1(Register d, Register s, Register tmp) { 10774 ldr(tmp, pre(s, -wordSize)); 10775 ror(tmp, tmp, 32); 10776 str(tmp, post(d, wordSize)); 10777 } 10778 10779 void step_squaring() { 10780 // An extra ACC 10781 step(); 10782 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10783 } 10784 10785 void last_squaring(RegisterOrConstant i) { 10786 Label dont; 10787 // if ((i & 1) == 0) { 10788 tbnz(i.as_register(), 0, dont); { 10789 // MACC(Ra, Rb, t0, t1, t2); 10790 // Ra = *++Pa; 10791 // Rb = *--Pb; 10792 umulh(Rhi_ab, Ra, Rb); 10793 mul(Rlo_ab, Ra, Rb); 10794 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 10795 } bind(dont); 10796 } 10797 10798 void extra_step_squaring() { 10799 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10800 10801 // MACC(Rm, Rn, t0, t1, t2); 10802 // Rm = *++Pm; 10803 // Rn = *--Pn; 10804 umulh(Rhi_mn, Rm, Rn); 10805 mul(Rlo_mn, Rm, Rn); 10806 ldr(Rm, pre(Pm, wordSize)); 10807 ldr(Rn, pre(Pn, -wordSize)); 10808 } 10809 10810 void post1_squaring() { 10811 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 10812 10813 // *Pm = Rm = t0 * inv; 10814 mul(Rm, t0, inv); 10815 str(Rm, Address(Pm)); 10816 10817 // MACC(Rm, Rn, t0, t1, t2); 10818 // t0 = t1; t1 = t2; t2 = 0; 10819 umulh(Rhi_mn, Rm, Rn); 10820 10821 #ifndef PRODUCT 10822 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 10823 { 10824 mul(Rlo_mn, Rm, Rn); 10825 add(Rlo_mn, t0, Rlo_mn); 10826 Label ok; 10827 cbz(Rlo_mn, ok); { 10828 stop("broken Montgomery multiply"); 10829 } bind(ok); 10830 } 10831 #endif 10832 // We have very carefully set things up so that 10833 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 10834 // the lower half of Rm * Rn because we know the result already: 10835 // it must be -t0. t0 + (-t0) must generate a carry iff 10836 // t0 != 0. So, rather than do a mul and an adds we just set 10837 // the carry flag iff t0 is nonzero. 
10838 // 10839 // mul(Rlo_mn, Rm, Rn); 10840 // adds(zr, t0, Rlo_mn); 10841 subs(zr, t0, 1); // Set carry iff t0 is nonzero 10842 adcs(t0, t1, Rhi_mn); 10843 adc(t1, t2, zr); 10844 mov(t2, zr); 10845 } 10846 10847 void acc(Register Rhi, Register Rlo, 10848 Register t0, Register t1, Register t2) { 10849 adds(t0, t0, Rlo); 10850 adcs(t1, t1, Rhi); 10851 adc(t2, t2, zr); 10852 } 10853 10854 public: 10855 /** 10856 * Fast Montgomery multiplication. The derivation of the 10857 * algorithm is in A Cryptographic Library for the Motorola 10858 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 10859 * 10860 * Arguments: 10861 * 10862 * Inputs for multiplication: 10863 * c_rarg0 - int array elements a 10864 * c_rarg1 - int array elements b 10865 * c_rarg2 - int array elements n (the modulus) 10866 * c_rarg3 - int length 10867 * c_rarg4 - int inv 10868 * c_rarg5 - int array elements m (the result) 10869 * 10870 * Inputs for squaring: 10871 * c_rarg0 - int array elements a 10872 * c_rarg1 - int array elements n (the modulus) 10873 * c_rarg2 - int length 10874 * c_rarg3 - int inv 10875 * c_rarg4 - int array elements m (the result) 10876 * 10877 */ 10878 address generate_multiply() { 10879 Label argh, nothing; 10880 bind(argh); 10881 stop("MontgomeryMultiply total_allocation must be <= 8192"); 10882 10883 align(CodeEntryAlignment); 10884 address entry = pc(); 10885 10886 cbzw(Rlen, nothing); 10887 10888 enter(); 10889 10890 // Make room. 10891 cmpw(Rlen, 512); 10892 br(Assembler::HI, argh); 10893 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 10894 andr(sp, Ra, -2 * wordSize); 10895 10896 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 10897 10898 { 10899 // Copy input args, reversing as we go. We use Ra as a 10900 // temporary variable. 10901 reverse(Ra, Pa_base, Rlen, t0, t1); 10902 if (!_squaring) 10903 reverse(Ra, Pb_base, Rlen, t0, t1); 10904 reverse(Ra, Pn_base, Rlen, t0, t1); 10905 } 10906 10907 // Push all call-saved registers and also Pm_base which we'll need 10908 // at the end. 
10909 save_regs(); 10910 10911 #ifndef PRODUCT 10912 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 10913 { 10914 ldr(Rn, Address(Pn_base, 0)); 10915 mul(Rlo_mn, Rn, inv); 10916 subs(zr, Rlo_mn, -1); 10917 Label ok; 10918 br(EQ, ok); { 10919 stop("broken inverse in Montgomery multiply"); 10920 } bind(ok); 10921 } 10922 #endif 10923 10924 mov(Pm_base, Ra); 10925 10926 mov(t0, zr); 10927 mov(t1, zr); 10928 mov(t2, zr); 10929 10930 block_comment("for (int i = 0; i < len; i++) {"); 10931 mov(Ri, zr); { 10932 Label loop, end; 10933 cmpw(Ri, Rlen); 10934 br(Assembler::GE, end); 10935 10936 bind(loop); 10937 pre1(Ri); 10938 10939 block_comment(" for (j = i; j; j--) {"); { 10940 movw(Rj, Ri); 10941 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 10942 } block_comment(" } // j"); 10943 10944 post1(); 10945 addw(Ri, Ri, 1); 10946 cmpw(Ri, Rlen); 10947 br(Assembler::LT, loop); 10948 bind(end); 10949 block_comment("} // i"); 10950 } 10951 10952 block_comment("for (int i = len; i < 2*len; i++) {"); 10953 mov(Ri, Rlen); { 10954 Label loop, end; 10955 cmpw(Ri, Rlen, Assembler::LSL, 1); 10956 br(Assembler::GE, end); 10957 10958 bind(loop); 10959 pre2(Ri, Rlen); 10960 10961 block_comment(" for (j = len*2-i-1; j; j--) {"); { 10962 lslw(Rj, Rlen, 1); 10963 subw(Rj, Rj, Ri); 10964 subw(Rj, Rj, 1); 10965 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 10966 } block_comment(" } // j"); 10967 10968 post2(Ri, Rlen); 10969 addw(Ri, Ri, 1); 10970 cmpw(Ri, Rlen, Assembler::LSL, 1); 10971 br(Assembler::LT, loop); 10972 bind(end); 10973 } 10974 block_comment("} // i"); 10975 10976 normalize(Rlen); 10977 10978 mov(Ra, Pm_base); // Save Pm_base in Ra 10979 restore_regs(); // Restore caller's Pm_base 10980 10981 // Copy our result into caller's Pm_base 10982 reverse(Pm_base, Ra, Rlen, t0, t1); 10983 10984 leave(); 10985 bind(nothing); 10986 ret(lr); 10987 10988 return entry; 10989 } 10990 // In C, approximately: 10991 10992 // void 10993 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 10994 // julong Pn_base[], julong Pm_base[], 10995 // julong inv, int len) { 10996 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 10997 // julong *Pa, *Pb, *Pn, *Pm; 10998 // julong Ra, Rb, Rn, Rm; 10999 11000 // int i; 11001 11002 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11003 11004 // for (i = 0; i < len; i++) { 11005 // int j; 11006 11007 // Pa = Pa_base; 11008 // Pb = Pb_base + i; 11009 // Pm = Pm_base; 11010 // Pn = Pn_base + i; 11011 11012 // Ra = *Pa; 11013 // Rb = *Pb; 11014 // Rm = *Pm; 11015 // Rn = *Pn; 11016 11017 // int iters = i; 11018 // for (j = 0; iters--; j++) { 11019 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11020 // MACC(Ra, Rb, t0, t1, t2); 11021 // Ra = *++Pa; 11022 // Rb = *--Pb; 11023 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11024 // MACC(Rm, Rn, t0, t1, t2); 11025 // Rm = *++Pm; 11026 // Rn = *--Pn; 11027 // } 11028 11029 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 11030 // MACC(Ra, Rb, t0, t1, t2); 11031 // *Pm = Rm = t0 * inv; 11032 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11033 // MACC(Rm, Rn, t0, t1, t2); 11034 11035 // assert(t0 == 0, "broken Montgomery multiply"); 11036 11037 // t0 = t1; t1 = t2; t2 = 0; 11038 // } 11039 11040 // for (i = len; i < 2*len; i++) { 11041 // int j; 11042 11043 // Pa = Pa_base + i-len; 11044 // Pb = Pb_base + len; 11045 // Pm = Pm_base + i-len; 11046 // Pn = Pn_base + len; 11047 11048 // Ra = *++Pa; 11049 // Rb = 
*--Pb; 11050 // Rm = *++Pm; 11051 // Rn = *--Pn; 11052 11053 // int iters = len*2-i-1; 11054 // for (j = i-len+1; iters--; j++) { 11055 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11056 // MACC(Ra, Rb, t0, t1, t2); 11057 // Ra = *++Pa; 11058 // Rb = *--Pb; 11059 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11060 // MACC(Rm, Rn, t0, t1, t2); 11061 // Rm = *++Pm; 11062 // Rn = *--Pn; 11063 // } 11064 11065 // Pm_base[i-len] = t0; 11066 // t0 = t1; t1 = t2; t2 = 0; 11067 // } 11068 11069 // while (t0) 11070 // t0 = sub(Pm_base, Pn_base, t0, len); 11071 // } 11072 11073 /** 11074 * Fast Montgomery squaring. This uses asymptotically 25% fewer 11075 * multiplies than Montgomery multiplication so it should be up to 11076 * 25% faster. However, its loop control is more complex and it 11077 * may actually run slower on some machines. 11078 * 11079 * Arguments: 11080 * 11081 * Inputs: 11082 * c_rarg0 - int array elements a 11083 * c_rarg1 - int array elements n (the modulus) 11084 * c_rarg2 - int length 11085 * c_rarg3 - int inv 11086 * c_rarg4 - int array elements m (the result) 11087 * 11088 */ 11089 address generate_square() { 11090 Label argh; 11091 bind(argh); 11092 stop("MontgomeryMultiply total_allocation must be <= 8192"); 11093 11094 align(CodeEntryAlignment); 11095 address entry = pc(); 11096 11097 enter(); 11098 11099 // Make room. 11100 cmpw(Rlen, 512); 11101 br(Assembler::HI, argh); 11102 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 11103 andr(sp, Ra, -2 * wordSize); 11104 11105 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 11106 11107 { 11108 // Copy input args, reversing as we go. We use Ra as a 11109 // temporary variable. 11110 reverse(Ra, Pa_base, Rlen, t0, t1); 11111 reverse(Ra, Pn_base, Rlen, t0, t1); 11112 } 11113 11114 // Push all call-saved registers and also Pm_base which we'll need 11115 // at the end. 
11116 save_regs(); 11117 11118 mov(Pm_base, Ra); 11119 11120 mov(t0, zr); 11121 mov(t1, zr); 11122 mov(t2, zr); 11123 11124 block_comment("for (int i = 0; i < len; i++) {"); 11125 mov(Ri, zr); { 11126 Label loop, end; 11127 bind(loop); 11128 cmp(Ri, Rlen); 11129 br(Assembler::GE, end); 11130 11131 pre1(Ri); 11132 11133 block_comment("for (j = (i+1)/2; j; j--) {"); { 11134 add(Rj, Ri, 1); 11135 lsr(Rj, Rj, 1); 11136 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11137 } block_comment(" } // j"); 11138 11139 last_squaring(Ri); 11140 11141 block_comment(" for (j = i/2; j; j--) {"); { 11142 lsr(Rj, Ri, 1); 11143 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11144 } block_comment(" } // j"); 11145 11146 post1_squaring(); 11147 add(Ri, Ri, 1); 11148 cmp(Ri, Rlen); 11149 br(Assembler::LT, loop); 11150 11151 bind(end); 11152 block_comment("} // i"); 11153 } 11154 11155 block_comment("for (int i = len; i < 2*len; i++) {"); 11156 mov(Ri, Rlen); { 11157 Label loop, end; 11158 bind(loop); 11159 cmp(Ri, Rlen, Assembler::LSL, 1); 11160 br(Assembler::GE, end); 11161 11162 pre2(Ri, Rlen); 11163 11164 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 11165 lsl(Rj, Rlen, 1); 11166 sub(Rj, Rj, Ri); 11167 sub(Rj, Rj, 1); 11168 lsr(Rj, Rj, 1); 11169 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11170 } block_comment(" } // j"); 11171 11172 last_squaring(Ri); 11173 11174 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 11175 lsl(Rj, Rlen, 1); 11176 sub(Rj, Rj, Ri); 11177 lsr(Rj, Rj, 1); 11178 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11179 } block_comment(" } // j"); 11180 11181 post2(Ri, Rlen); 11182 add(Ri, Ri, 1); 11183 cmp(Ri, Rlen, Assembler::LSL, 1); 11184 11185 br(Assembler::LT, loop); 11186 bind(end); 11187 block_comment("} // i"); 11188 } 11189 11190 normalize(Rlen); 11191 11192 mov(Ra, Pm_base); // Save Pm_base in Ra 11193 restore_regs(); // Restore caller's Pm_base 11194 11195 // Copy our result into caller's Pm_base 11196 reverse(Pm_base, Ra, Rlen, t0, t1); 11197 11198 leave(); 11199 ret(lr); 11200 11201 return entry; 11202 } 11203 // In C, approximately: 11204 11205 // void 11206 // montgomery_square(julong Pa_base[], julong Pn_base[], 11207 // julong Pm_base[], julong inv, int len) { 11208 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11209 // julong *Pa, *Pb, *Pn, *Pm; 11210 // julong Ra, Rb, Rn, Rm; 11211 11212 // int i; 11213 11214 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11215 11216 // for (i = 0; i < len; i++) { 11217 // int j; 11218 11219 // Pa = Pa_base; 11220 // Pb = Pa_base + i; 11221 // Pm = Pm_base; 11222 // Pn = Pn_base + i; 11223 11224 // Ra = *Pa; 11225 // Rb = *Pb; 11226 // Rm = *Pm; 11227 // Rn = *Pn; 11228 11229 // int iters = (i+1)/2; 11230 // for (j = 0; iters--; j++) { 11231 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11232 // MACC2(Ra, Rb, t0, t1, t2); 11233 // Ra = *++Pa; 11234 // Rb = *--Pb; 11235 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11236 // MACC(Rm, Rn, t0, t1, t2); 11237 // Rm = *++Pm; 11238 // Rn = *--Pn; 11239 // } 11240 // if ((i & 1) == 0) { 11241 // assert(Ra == Pa_base[j], "must be"); 11242 // MACC(Ra, Ra, t0, t1, t2); 11243 // } 11244 // iters = i/2; 11245 // assert(iters == i-j, "must be"); 11246 // for (; iters--; j++) { 11247 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11248 // MACC(Rm, Rn, t0, t1, t2); 11249 // Rm = *++Pm; 11250 // Rn = *--Pn; 11251 // } 11252 11253 // 
    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };

  // Initialization
  void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist on all platforms.  Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Initialize table for copy memory (arraycopy) check.
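    // (A brief sketch of why this table exists, based on how
    // UnsafeMemoryAccess is used elsewhere in HotSpot: it records the code
    // ranges of the unsafe copy/set stubs together with their error exits,
    // so a memory fault taken inside one of those stubs can be recognized
    // and resumed at the stub's error exit instead of crashing the VM.  The
    // capacity requested below matches the comment: 8 copyMemory variants
    // plus 4 setMemory variants.)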
    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    if (UseCRC32Intrinsics) {
      // set table address before stub generation which uses it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }
  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw             = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub     = generate_cont_preempt_stub();
  }

  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (! InlineSecondarySupersTest) {
        generate_lookup_secondary_supers_table_stub();
      }
    }
#endif

    StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // arrays_hashcode stub for large arrays.
    StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
    StubRoutines::aarch64::_large_arrays_hashcode_byte    = generate_large_arrays_hashcode(T_BYTE);
    StubRoutines::aarch64::_large_arrays_hashcode_char    = generate_large_arrays_hashcode(T_CHAR);
    StubRoutines::aarch64::_large_arrays_hashcode_int     = generate_large_arrays_hashcode(T_INT);
    StubRoutines::aarch64::_large_arrays_hashcode_short   = generate_large_arrays_hashcode(T_SHORT);

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseKyberIntrinsics) {
      StubRoutines::_kyberNtt           = generate_kyberNtt();
      StubRoutines::_kyberInverseNtt    = generate_kyberInverseNtt();
      StubRoutines::_kyberNttMult       = generate_kyberNttMult();
      StubRoutines::_kyberAddPoly_2     = generate_kyberAddPoly_2();
      StubRoutines::_kyberAddPoly_3     = generate_kyberAddPoly_3();
      StubRoutines::_kyber12To16        = generate_kyber12To16();
      StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
    }

    if (UseDilithiumIntrinsics) {
      StubRoutines::_dilithiumAlmostNtt          = generate_dilithiumAlmostNtt();
      StubRoutines::_dilithiumAlmostInverseNtt   = generate_dilithiumAlmostInverseNtt();
      StubRoutines::_dilithiumNttMult            = generate_dilithiumNttMult();
      StubRoutines::_dilithiumMontMulByConstant  = generate_dilithiumMontMulByConstant();
      StubRoutines::_dilithiumDecomposePoly      = generate_dilithiumDecomposePoly();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback      = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress   = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress   = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
      StubRoutines::_double_keccak       = generate_double_keccak();
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
    }

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
    switch(blob_id) {
    case initial_id:
      generate_initial_stubs();
      break;
    case continuation_id:
      generate_continuation_stubs();
      break;
    case compiler_id:
      generate_compiler_stubs();
      break;
    case final_id:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected blob id: %d", blob_id);
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
  StubGenerator g(code, blob_id);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                          \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                                  \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl       \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX
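
// For reference, a sketch of what one of the DEFAULT_ATOMIC_OP invocations
// above expands to.  Taking DEFAULT_ATOMIC_OP(fetch_add, 4, ) as the
// example, the preprocessor produces approximately:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// that is, a declaration of the default implementation from atomic_aarch64.S
// and a stub pointer (aarch64_atomic_stub_t comes from atomic_aarch64.hpp,
// included above) initialized to it, so that generate_atomic_entry_points()
// (called from generate_final_stubs() when LSE atomics are not guaranteed at
// build time) can later repoint these *_impl variables at generated stubs.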