1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "asm/register.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "code/aotCodeCache.hpp" 31 #include "compiler/oopMap.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/barrierSetAssembler.hpp" 34 #include "gc/shared/gc_globals.hpp" 35 #include "gc/shared/tlab_globals.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "memory/universe.hpp" 38 #include "nativeInst_aarch64.hpp" 39 #include "oops/instanceOop.hpp" 40 #include "oops/method.hpp" 41 #include "oops/objArrayKlass.hpp" 42 #include "oops/oop.inline.hpp" 43 #include "prims/methodHandles.hpp" 44 #include "prims/upcallLinker.hpp" 45 #include "runtime/arguments.hpp" 46 #include "runtime/atomicAccess.hpp" 47 #include "runtime/continuation.hpp" 48 #include "runtime/continuationEntry.inline.hpp" 49 #include "runtime/frame.inline.hpp" 50 #include "runtime/handles.inline.hpp" 51 #include "runtime/javaThread.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/stubCodeGenerator.hpp" 54 #include "runtime/stubRoutines.hpp" 55 #include "utilities/align.hpp" 56 #include "utilities/checkedCast.hpp" 57 #include "utilities/debug.hpp" 58 #include "utilities/globalDefinitions.hpp" 59 #include "utilities/intpow.hpp" 60 #include "utilities/powerOfTwo.hpp" 61 #ifdef COMPILER2 62 #include "opto/runtime.hpp" 63 #endif 64 #if INCLUDE_ZGC 65 #include "gc/z/zThreadLocalData.hpp" 66 #endif 67 68 // Declaration and definition of StubGenerator (no .hpp file). 
69 // For a more detailed description of the stub routine structure 70 // see the comment in stubRoutines.hpp 71 72 #undef __ 73 #define __ _masm-> 74 75 #ifdef PRODUCT 76 #define BLOCK_COMMENT(str) /* nothing */ 77 #else 78 #define BLOCK_COMMENT(str) __ block_comment(str) 79 #endif 80 81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 82 83 // Stub Code definitions 84 85 class StubGenerator: public StubCodeGenerator { 86 private: 87 88 #ifdef PRODUCT 89 #define inc_counter_np(counter) ((void)0) 90 #else 91 void inc_counter_np_(uint& counter) { 92 __ incrementw(ExternalAddress((address)&counter)); 93 } 94 #define inc_counter_np(counter) \ 95 BLOCK_COMMENT("inc_counter " #counter); \ 96 inc_counter_np_(counter); 97 #endif 98 99 // Call stubs are used to call Java from C 100 // 101 // Arguments: 102 // c_rarg0: call wrapper address address 103 // c_rarg1: result address 104 // c_rarg2: result type BasicType 105 // c_rarg3: method Method* 106 // c_rarg4: (interpreter) entry point address 107 // c_rarg5: parameters intptr_t* 108 // c_rarg6: parameter size (in words) int 109 // c_rarg7: thread Thread* 110 // 111 // There is no return from the stub itself as any Java result 112 // is written to result 113 // 114 // we save r30 (lr) as the return PC at the base of the frame and 115 // link r29 (fp) below it as the frame pointer installing sp (r31) 116 // into fp. 117 // 118 // we save r0-r7, which accounts for all the c arguments. 119 // 120 // TODO: strictly do we need to save them all? they are treated as 121 // volatile by C so could we omit saving the ones we are going to 122 // place in global registers (thread? method?) or those we only use 123 // during setup of the Java call? 124 // 125 // we don't need to save r8 which C uses as an indirect result location 126 // return register. 127 // 128 // we don't need to save r9-r15 which both C and Java treat as 129 // volatile 130 // 131 // we don't need to save r16-18 because Java does not use them 132 // 133 // we save r19-r28 which Java uses as scratch registers and C 134 // expects to be callee-save 135 // 136 // we save the bottom 64 bits of each value stored in v8-v15; it is 137 // the responsibility of the caller to preserve larger values. 138 // 139 // so the stub frame looks like this when we enter Java code 140 // 141 // [ return_from_Java ] <--- sp 142 // [ argument word n ] 143 // ... 
144 // -29 [ argument word 1 ] 145 // -28 [ saved Floating-point Control Register ] 146 // -26 [ saved v15 ] <--- sp_after_call 147 // -25 [ saved v14 ] 148 // -24 [ saved v13 ] 149 // -23 [ saved v12 ] 150 // -22 [ saved v11 ] 151 // -21 [ saved v10 ] 152 // -20 [ saved v9 ] 153 // -19 [ saved v8 ] 154 // -18 [ saved r28 ] 155 // -17 [ saved r27 ] 156 // -16 [ saved r26 ] 157 // -15 [ saved r25 ] 158 // -14 [ saved r24 ] 159 // -13 [ saved r23 ] 160 // -12 [ saved r22 ] 161 // -11 [ saved r21 ] 162 // -10 [ saved r20 ] 163 // -9 [ saved r19 ] 164 // -8 [ call wrapper (r0) ] 165 // -7 [ result (r1) ] 166 // -6 [ result type (r2) ] 167 // -5 [ method (r3) ] 168 // -4 [ entry point (r4) ] 169 // -3 [ parameters (r5) ] 170 // -2 [ parameter size (r6) ] 171 // -1 [ thread (r7) ] 172 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 173 // 1 [ saved lr (r30) ] 174 175 // Call stub stack layout word offsets from fp 176 enum call_stub_layout { 177 sp_after_call_off = -28, 178 179 fpcr_off = sp_after_call_off, 180 d15_off = -26, 181 d13_off = -24, 182 d11_off = -22, 183 d9_off = -20, 184 185 r28_off = -18, 186 r26_off = -16, 187 r24_off = -14, 188 r22_off = -12, 189 r20_off = -10, 190 call_wrapper_off = -8, 191 result_off = -7, 192 result_type_off = -6, 193 method_off = -5, 194 entry_point_off = -4, 195 parameter_size_off = -2, 196 thread_off = -1, 197 fp_f = 0, 198 retaddr_off = 1, 199 }; 200 201 address generate_call_stub(address& return_address) { 202 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 203 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 204 "adjust this code"); 205 206 StubId stub_id = StubId::stubgen_call_stub_id; 207 StubCodeMark mark(this, stub_id); 208 address start = __ pc(); 209 210 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 211 212 const Address fpcr_save (rfp, fpcr_off * wordSize); 213 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 214 const Address result (rfp, result_off * wordSize); 215 const Address result_type (rfp, result_type_off * wordSize); 216 const Address method (rfp, method_off * wordSize); 217 const Address entry_point (rfp, entry_point_off * wordSize); 218 const Address parameter_size(rfp, parameter_size_off * wordSize); 219 220 const Address thread (rfp, thread_off * wordSize); 221 222 const Address d15_save (rfp, d15_off * wordSize); 223 const Address d13_save (rfp, d13_off * wordSize); 224 const Address d11_save (rfp, d11_off * wordSize); 225 const Address d9_save (rfp, d9_off * wordSize); 226 227 const Address r28_save (rfp, r28_off * wordSize); 228 const Address r26_save (rfp, r26_off * wordSize); 229 const Address r24_save (rfp, r24_off * wordSize); 230 const Address r22_save (rfp, r22_off * wordSize); 231 const Address r20_save (rfp, r20_off * wordSize); 232 233 // stub code 234 235 address aarch64_entry = __ pc(); 236 237 // set up frame and move sp to end of save area 238 __ enter(); 239 __ sub(sp, rfp, -sp_after_call_off * wordSize); 240 241 // save register parameters and Java scratch/global registers 242 // n.b. 
we save thread even though it gets installed in 243 // rthread because we want to sanity check rthread later 244 __ str(c_rarg7, thread); 245 __ strw(c_rarg6, parameter_size); 246 __ stp(c_rarg4, c_rarg5, entry_point); 247 __ stp(c_rarg2, c_rarg3, result_type); 248 __ stp(c_rarg0, c_rarg1, call_wrapper); 249 250 __ stp(r20, r19, r20_save); 251 __ stp(r22, r21, r22_save); 252 __ stp(r24, r23, r24_save); 253 __ stp(r26, r25, r26_save); 254 __ stp(r28, r27, r28_save); 255 256 __ stpd(v9, v8, d9_save); 257 __ stpd(v11, v10, d11_save); 258 __ stpd(v13, v12, d13_save); 259 __ stpd(v15, v14, d15_save); 260 261 __ get_fpcr(rscratch1); 262 __ str(rscratch1, fpcr_save); 263 // Set FPCR to the state we need. We do want Round to Nearest. We 264 // don't want non-IEEE rounding modes or floating-point traps. 265 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode 266 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12) 267 __ set_fpcr(rscratch1); 268 269 // install Java thread in global register now we have saved 270 // whatever value it held 271 __ mov(rthread, c_rarg7); 272 // And method 273 __ mov(rmethod, c_rarg3); 274 275 // set up the heapbase register 276 __ reinit_heapbase(); 277 278 #ifdef ASSERT 279 // make sure we have no pending exceptions 280 { 281 Label L; 282 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 283 __ cmp(rscratch1, (u1)NULL_WORD); 284 __ br(Assembler::EQ, L); 285 __ stop("StubRoutines::call_stub: entered with pending exception"); 286 __ BIND(L); 287 } 288 #endif 289 // pass parameters if any 290 __ mov(esp, sp); 291 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 292 __ andr(sp, rscratch1, -2 * wordSize); 293 294 BLOCK_COMMENT("pass parameters if any"); 295 Label parameters_done; 296 // parameter count is still in c_rarg6 297 // and parameter pointer identifying param 1 is in c_rarg5 298 __ cbzw(c_rarg6, parameters_done); 299 300 address loop = __ pc(); 301 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 302 __ subsw(c_rarg6, c_rarg6, 1); 303 __ push(rscratch1); 304 __ br(Assembler::GT, loop); 305 306 __ BIND(parameters_done); 307 308 // call Java entry -- passing methdoOop, and current sp 309 // rmethod: Method* 310 // r19_sender_sp: sender sp 311 BLOCK_COMMENT("call Java function"); 312 __ mov(r19_sender_sp, sp); 313 __ blr(c_rarg4); 314 315 // we do this here because the notify will already have been done 316 // if we get to the next instruction via an exception 317 // 318 // n.b. adding this instruction here affects the calculation of 319 // whether or not a routine returns to the call stub (used when 320 // doing stack walks) since the normal test is to check the return 321 // pc against the address saved below. so we may need to allow for 322 // this extra instruction in the check. 323 324 // save current address for use by exception handling code 325 326 return_address = __ pc(); 327 328 // store result depending on type (everything that is not 329 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 330 // n.b. 
this assumes Java returns an integral result in r0 331 // and a floating result in j_farg0 332 __ ldr(j_rarg2, result); 333 Label is_long, is_float, is_double, exit; 334 __ ldr(j_rarg1, result_type); 335 __ cmp(j_rarg1, (u1)T_OBJECT); 336 __ br(Assembler::EQ, is_long); 337 __ cmp(j_rarg1, (u1)T_LONG); 338 __ br(Assembler::EQ, is_long); 339 __ cmp(j_rarg1, (u1)T_FLOAT); 340 __ br(Assembler::EQ, is_float); 341 __ cmp(j_rarg1, (u1)T_DOUBLE); 342 __ br(Assembler::EQ, is_double); 343 344 // handle T_INT case 345 __ strw(r0, Address(j_rarg2)); 346 347 __ BIND(exit); 348 349 // pop parameters 350 __ sub(esp, rfp, -sp_after_call_off * wordSize); 351 352 #ifdef ASSERT 353 // verify that threads correspond 354 { 355 Label L, S; 356 __ ldr(rscratch1, thread); 357 __ cmp(rthread, rscratch1); 358 __ br(Assembler::NE, S); 359 __ get_thread(rscratch1); 360 __ cmp(rthread, rscratch1); 361 __ br(Assembler::EQ, L); 362 __ BIND(S); 363 __ stop("StubRoutines::call_stub: threads must correspond"); 364 __ BIND(L); 365 } 366 #endif 367 368 __ pop_cont_fastpath(rthread); 369 370 // restore callee-save registers 371 __ ldpd(v15, v14, d15_save); 372 __ ldpd(v13, v12, d13_save); 373 __ ldpd(v11, v10, d11_save); 374 __ ldpd(v9, v8, d9_save); 375 376 __ ldp(r28, r27, r28_save); 377 __ ldp(r26, r25, r26_save); 378 __ ldp(r24, r23, r24_save); 379 __ ldp(r22, r21, r22_save); 380 __ ldp(r20, r19, r20_save); 381 382 // restore fpcr 383 __ ldr(rscratch1, fpcr_save); 384 __ set_fpcr(rscratch1); 385 386 __ ldp(c_rarg0, c_rarg1, call_wrapper); 387 __ ldrw(c_rarg2, result_type); 388 __ ldr(c_rarg3, method); 389 __ ldp(c_rarg4, c_rarg5, entry_point); 390 __ ldp(c_rarg6, c_rarg7, parameter_size); 391 392 // leave frame and return to caller 393 __ leave(); 394 __ ret(lr); 395 396 // handle return types different from T_INT 397 398 __ BIND(is_long); 399 __ str(r0, Address(j_rarg2, 0)); 400 __ br(Assembler::AL, exit); 401 402 __ BIND(is_float); 403 __ strs(j_farg0, Address(j_rarg2, 0)); 404 __ br(Assembler::AL, exit); 405 406 __ BIND(is_double); 407 __ strd(j_farg0, Address(j_rarg2, 0)); 408 __ br(Assembler::AL, exit); 409 410 return start; 411 } 412 413 // Return point for a Java call if there's an exception thrown in 414 // Java code. The exception is caught and transformed into a 415 // pending exception stored in JavaThread that can be tested from 416 // within the VM. 417 // 418 // Note: Usually the parameters are removed by the callee. In case 419 // of an exception crossing an activation frame boundary, that is 420 // not the case if the callee is compiled code => need to setup the 421 // rsp. 
422 // 423 // r0: exception oop 424 425 address generate_catch_exception() { 426 StubId stub_id = StubId::stubgen_catch_exception_id; 427 StubCodeMark mark(this, stub_id); 428 address start = __ pc(); 429 430 // same as in generate_call_stub(): 431 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 432 const Address thread (rfp, thread_off * wordSize); 433 434 #ifdef ASSERT 435 // verify that threads correspond 436 { 437 Label L, S; 438 __ ldr(rscratch1, thread); 439 __ cmp(rthread, rscratch1); 440 __ br(Assembler::NE, S); 441 __ get_thread(rscratch1); 442 __ cmp(rthread, rscratch1); 443 __ br(Assembler::EQ, L); 444 __ bind(S); 445 __ stop("StubRoutines::catch_exception: threads must correspond"); 446 __ bind(L); 447 } 448 #endif 449 450 // set pending exception 451 __ verify_oop(r0); 452 453 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 454 __ mov(rscratch1, (address)__FILE__); 455 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 456 __ movw(rscratch1, (int)__LINE__); 457 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 458 459 // complete return to VM 460 assert(StubRoutines::_call_stub_return_address != nullptr, 461 "_call_stub_return_address must have been generated before"); 462 __ b(StubRoutines::_call_stub_return_address); 463 464 return start; 465 } 466 467 // Continuation point for runtime calls returning with a pending 468 // exception. The pending exception check happened in the runtime 469 // or native call stub. The pending exception in Thread is 470 // converted into a Java-level exception. 471 // 472 // Contract with Java-level exception handlers: 473 // r0: exception 474 // r3: throwing pc 475 // 476 // NOTE: At entry of this stub, exception-pc must be in LR !! 477 478 // NOTE: this is always used as a jump target within generated code 479 // so it just needs to be generated code with no x86 prolog 480 481 address generate_forward_exception() { 482 StubId stub_id = StubId::stubgen_forward_exception_id; 483 StubCodeMark mark(this, stub_id); 484 address start = __ pc(); 485 486 // Upon entry, LR points to the return address returning into 487 // Java (interpreted or compiled) code; i.e., the return address 488 // becomes the throwing pc. 489 // 490 // Arguments pushed before the runtime call are still on the stack 491 // but the exception handler will reset the stack pointer -> 492 // ignore them. A potential result in registers can be ignored as 493 // well. 494 495 #ifdef ASSERT 496 // make sure this code is only executed if there is a pending exception 497 { 498 Label L; 499 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 500 __ cbnz(rscratch1, L); 501 __ stop("StubRoutines::forward exception: no pending exception (1)"); 502 __ bind(L); 503 } 504 #endif 505 506 // compute exception handler into r19 507 508 // call the VM to find the handler address associated with the 509 // caller address. pass thread in r0 and caller pc (ret address) 510 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 511 // the stack. 512 __ mov(c_rarg1, lr); 513 // lr will be trashed by the VM call so we move it to R19 514 // (callee-saved) because we also need to pass it to the handler 515 // returned by this call. 
516 __ mov(r19, lr); 517 BLOCK_COMMENT("call exception_handler_for_return_address"); 518 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 519 SharedRuntime::exception_handler_for_return_address), 520 rthread, c_rarg1); 521 // Reinitialize the ptrue predicate register, in case the external runtime 522 // call clobbers ptrue reg, as we may return to SVE compiled code. 523 __ reinitialize_ptrue(); 524 525 // we should not really care that lr is no longer the callee 526 // address. we saved the value the handler needs in r19 so we can 527 // just copy it to r3. however, the C2 handler will push its own 528 // frame and then calls into the VM and the VM code asserts that 529 // the PC for the frame above the handler belongs to a compiled 530 // Java method. So, we restore lr here to satisfy that assert. 531 __ mov(lr, r19); 532 // setup r0 & r3 & clear pending exception 533 __ mov(r3, r19); 534 __ mov(r19, r0); 535 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 536 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 537 538 #ifdef ASSERT 539 // make sure exception is set 540 { 541 Label L; 542 __ cbnz(r0, L); 543 __ stop("StubRoutines::forward exception: no pending exception (2)"); 544 __ bind(L); 545 } 546 #endif 547 548 // continue at exception handler 549 // r0: exception 550 // r3: throwing pc 551 // r19: exception handler 552 __ verify_oop(r0); 553 __ br(r19); 554 555 return start; 556 } 557 558 // Non-destructive plausibility checks for oops 559 // 560 // Arguments: 561 // r0: oop to verify 562 // rscratch1: error message 563 // 564 // Stack after saving c_rarg3: 565 // [tos + 0]: saved c_rarg3 566 // [tos + 1]: saved c_rarg2 567 // [tos + 2]: saved lr 568 // [tos + 3]: saved rscratch2 569 // [tos + 4]: saved r0 570 // [tos + 5]: saved rscratch1 571 address generate_verify_oop() { 572 StubId stub_id = StubId::stubgen_verify_oop_id; 573 StubCodeMark mark(this, stub_id); 574 address start = __ pc(); 575 576 Label exit, error; 577 578 // save c_rarg2 and c_rarg3 579 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 580 581 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 582 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 583 __ ldr(c_rarg3, Address(c_rarg2)); 584 __ add(c_rarg3, c_rarg3, 1); 585 __ str(c_rarg3, Address(c_rarg2)); 586 587 // object is in r0 588 // make sure object is 'reasonable' 589 __ cbz(r0, exit); // if obj is null it is OK 590 591 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 592 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 593 594 // return if everything seems ok 595 __ bind(exit); 596 597 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 598 __ ret(lr); 599 600 // handle errors 601 __ bind(error); 602 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 603 604 __ push(RegSet::range(r0, r29), sp); 605 // debug(char* msg, int64_t pc, int64_t regs[]) 606 __ mov(c_rarg0, rscratch1); // pass address of error message 607 __ mov(c_rarg1, lr); // pass return address 608 __ mov(c_rarg2, sp); // pass address of regs on stack 609 #ifndef PRODUCT 610 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 611 #endif 612 BLOCK_COMMENT("call MacroAssembler::debug"); 613 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 614 __ blr(rscratch1); 615 __ hlt(0); 616 617 return start; 618 } 619 620 // Generate indices for iota vector. 
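  // The "stub" below is just a data table; nothing in it is executed.
  // As emitted by the emit_data64 calls, it holds ascending per-lane
  // indices for each integer element size (B: 0..15, H: 0..7, S: 0..3,
  // D: 0..1) followed by the equivalent floating-point sequences
  // (0.0f..3.0f and 0.0d, 1.0d), so vector code that needs a per-lane
  // index constant can load the slice for its element type at a fixed
  // offset from the returned address.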
621 address generate_iota_indices(StubId stub_id) { 622 __ align(CodeEntryAlignment); 623 StubCodeMark mark(this, stub_id); 624 address start = __ pc(); 625 // B 626 __ emit_data64(0x0706050403020100, relocInfo::none); 627 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 628 // H 629 __ emit_data64(0x0003000200010000, relocInfo::none); 630 __ emit_data64(0x0007000600050004, relocInfo::none); 631 // S 632 __ emit_data64(0x0000000100000000, relocInfo::none); 633 __ emit_data64(0x0000000300000002, relocInfo::none); 634 // D 635 __ emit_data64(0x0000000000000000, relocInfo::none); 636 __ emit_data64(0x0000000000000001, relocInfo::none); 637 // S - FP 638 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 639 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 640 // D - FP 641 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 642 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 643 return start; 644 } 645 646 // The inner part of zero_words(). This is the bulk operation, 647 // zeroing words in blocks, possibly using DC ZVA to do it. The 648 // caller is responsible for zeroing the last few words. 649 // 650 // Inputs: 651 // r10: the HeapWord-aligned base address of an array to zero. 652 // r11: the count in HeapWords, r11 > 0. 653 // 654 // Returns r10 and r11, adjusted for the caller to clear. 655 // r10: the base address of the tail of words left to clear. 656 // r11: the number of words in the tail. 657 // r11 < MacroAssembler::zero_words_block_size. 658 659 address generate_zero_blocks() { 660 Label done; 661 Label base_aligned; 662 663 Register base = r10, cnt = r11; 664 665 __ align(CodeEntryAlignment); 666 StubId stub_id = StubId::stubgen_zero_blocks_id; 667 StubCodeMark mark(this, stub_id); 668 address start = __ pc(); 669 670 if (UseBlockZeroing) { 671 int zva_length = VM_Version::zva_length(); 672 673 // Ensure ZVA length can be divided by 16. This is required by 674 // the subsequent operations. 675 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 676 677 __ tbz(base, 3, base_aligned); 678 __ str(zr, Address(__ post(base, 8))); 679 __ sub(cnt, cnt, 1); 680 __ bind(base_aligned); 681 682 // Ensure count >= zva_length * 2 so that it still deserves a zva after 683 // alignment. 684 Label small; 685 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 686 __ subs(rscratch1, cnt, low_limit >> 3); 687 __ br(Assembler::LT, small); 688 __ zero_dcache_blocks(base, cnt); 689 __ bind(small); 690 } 691 692 { 693 // Number of stp instructions we'll unroll 694 const int unroll = 695 MacroAssembler::zero_words_block_size / 2; 696 // Clear the remaining blocks. 697 Label loop; 698 __ subs(cnt, cnt, unroll * 2); 699 __ br(Assembler::LT, done); 700 __ bind(loop); 701 for (int i = 0; i < unroll; i++) 702 __ stp(zr, zr, __ post(base, 16)); 703 __ subs(cnt, cnt, unroll * 2); 704 __ br(Assembler::GE, loop); 705 __ bind(done); 706 __ add(cnt, cnt, unroll * 2); 707 } 708 709 __ ret(lr); 710 711 return start; 712 } 713 714 715 typedef enum { 716 copy_forwards = 1, 717 copy_backwards = -1 718 } copy_direction; 719 720 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 721 // for arraycopy stubs. 
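  // The helper captures the BarrierSetAssembler, decorators, element
  // type and a fixed set of GC temp registers once, so the copy loops
  // below can simply write, for example,
  //
  //   bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  //   bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  //
  // instead of repeating the full copy_load_at/copy_store_at argument
  // lists at every call site.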
722 class ArrayCopyBarrierSetHelper : StackObj { 723 BarrierSetAssembler* _bs_asm; 724 MacroAssembler* _masm; 725 DecoratorSet _decorators; 726 BasicType _type; 727 Register _gct1; 728 Register _gct2; 729 Register _gct3; 730 FloatRegister _gcvt1; 731 FloatRegister _gcvt2; 732 FloatRegister _gcvt3; 733 734 public: 735 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 736 DecoratorSet decorators, 737 BasicType type, 738 Register gct1, 739 Register gct2, 740 Register gct3, 741 FloatRegister gcvt1, 742 FloatRegister gcvt2, 743 FloatRegister gcvt3) 744 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 745 _masm(masm), 746 _decorators(decorators), 747 _type(type), 748 _gct1(gct1), 749 _gct2(gct2), 750 _gct3(gct3), 751 _gcvt1(gcvt1), 752 _gcvt2(gcvt2), 753 _gcvt3(gcvt3) { 754 } 755 756 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 757 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 758 dst1, dst2, src, 759 _gct1, _gct2, _gcvt1); 760 } 761 762 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 763 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 764 dst, src1, src2, 765 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 766 } 767 768 void copy_load_at_16(Register dst1, Register dst2, Address src) { 769 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 770 dst1, dst2, src, 771 _gct1); 772 } 773 774 void copy_store_at_16(Address dst, Register src1, Register src2) { 775 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 776 dst, src1, src2, 777 _gct1, _gct2, _gct3); 778 } 779 780 void copy_load_at_8(Register dst, Address src) { 781 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 782 dst, noreg, src, 783 _gct1); 784 } 785 786 void copy_store_at_8(Address dst, Register src) { 787 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 788 dst, src, noreg, 789 _gct1, _gct2, _gct3); 790 } 791 }; 792 793 // Bulk copy of blocks of 8 words. 794 // 795 // count is a count of words. 796 // 797 // Precondition: count >= 8 798 // 799 // Postconditions: 800 // 801 // The least significant bit of count contains the remaining count 802 // of words to copy. The rest of count is trash. 803 // 804 // s and d are adjusted to point to the remaining words to copy 805 // 806 address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) { 807 BasicType type; 808 copy_direction direction; 809 810 switch (stub_id) { 811 case StubId::stubgen_copy_byte_f_id: 812 direction = copy_forwards; 813 type = T_BYTE; 814 break; 815 case StubId::stubgen_copy_byte_b_id: 816 direction = copy_backwards; 817 type = T_BYTE; 818 break; 819 case StubId::stubgen_copy_oop_f_id: 820 direction = copy_forwards; 821 type = T_OBJECT; 822 break; 823 case StubId::stubgen_copy_oop_b_id: 824 direction = copy_backwards; 825 type = T_OBJECT; 826 break; 827 case StubId::stubgen_copy_oop_uninit_f_id: 828 direction = copy_forwards; 829 type = T_OBJECT; 830 break; 831 case StubId::stubgen_copy_oop_uninit_b_id: 832 direction = copy_backwards; 833 type = T_OBJECT; 834 break; 835 default: 836 ShouldNotReachHere(); 837 } 838 839 int unit = wordSize * direction; 840 int bias = (UseSIMDForMemoryOps ? 
4 : 2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    address start = __ pc();

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ?
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 907 908 if (UseSIMDForMemoryOps) { 909 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 910 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 911 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 912 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 913 } else { 914 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 915 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 916 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 917 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 918 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 919 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 920 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 921 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 922 } 923 924 __ subs(count, count, 8); 925 __ br(Assembler::HS, again); 926 927 // Drain 928 __ bind(drain); 929 if (UseSIMDForMemoryOps) { 930 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 931 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 932 } else { 933 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 934 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 935 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 936 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 937 } 938 939 { 940 Label L1, L2; 941 __ tbz(count, exact_log2(4), L1); 942 if (UseSIMDForMemoryOps) { 943 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit))); 944 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1); 945 } else { 946 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 947 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 948 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 949 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3); 950 } 951 __ bind(L1); 952 953 if (direction == copy_forwards) { 954 __ add(s, s, bias); 955 __ add(d, d, bias); 956 } 957 958 __ tbz(count, 1, L2); 959 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 960 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1); 961 __ bind(L2); 962 } 963 964 __ ret(lr); 965 966 if (AvoidUnalignedAccesses) { 967 Label drain, again; 968 // Register order for storing. Order is different for backward copy. 969 970 __ bind(unaligned_copy_long); 971 972 // source address is even aligned, target odd aligned 973 // 974 // when forward copying word pairs we read long pairs at offsets 975 // {0, 2, 4, 6} (in long words). when backwards copying we read 976 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 977 // address by -2 in the forwards case so we can compute the 978 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 979 // or -1. 980 // 981 // when forward copying we need to store 1 word, 3 pairs and 982 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 983 // zero offset We adjust the destination by -1 which means we 984 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 985 // 986 // When backwards copyng we need to store 1 word, 3 pairs and 987 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 988 // offsets {1, 3, 5, 7, 8} * unit. 
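      // A concrete forwards example: d enters this path 8-byte but not
      // 16-byte aligned (bit 3 set, per the tbnz above). After the
      // d -= 8 adjustment below, the leading str at offset 1 * unit
      // fills the odd word at the original d and every subsequent stp
      // lands on a 16-byte-aligned address, which is what this
      // AvoidUnalignedAccesses variant is after.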
989 990 if (direction == copy_forwards) { 991 __ sub(s, s, 16); 992 __ sub(d, d, 8); 993 } 994 995 // Fill 8 registers 996 // 997 // for forwards copy s was offset by -16 from the original input 998 // value of s so the register contents are at these offsets 999 // relative to the 64 bit block addressed by that original input 1000 // and so on for each successive 64 byte block when s is updated 1001 // 1002 // t0 at offset 0, t1 at offset 8 1003 // t2 at offset 16, t3 at offset 24 1004 // t4 at offset 32, t5 at offset 40 1005 // t6 at offset 48, t7 at offset 56 1006 1007 // for backwards copy s was not offset so the register contents 1008 // are at these offsets into the preceding 64 byte block 1009 // relative to that original input and so on for each successive 1010 // preceding 64 byte block when s is updated. this explains the 1011 // slightly counter-intuitive looking pattern of register usage 1012 // in the stp instructions for backwards copy. 1013 // 1014 // t0 at offset -16, t1 at offset -8 1015 // t2 at offset -32, t3 at offset -24 1016 // t4 at offset -48, t5 at offset -40 1017 // t6 at offset -64, t7 at offset -56 1018 1019 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1020 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1021 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1022 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1023 1024 __ subs(count, count, 16); 1025 __ br(Assembler::LO, drain); 1026 1027 int prefetch = PrefetchCopyIntervalInBytes; 1028 bool use_stride = false; 1029 if (direction == copy_backwards) { 1030 use_stride = prefetch > 256; 1031 prefetch = -prefetch; 1032 if (use_stride) __ mov(stride, prefetch); 1033 } 1034 1035 __ bind(again); 1036 1037 if (PrefetchCopyIntervalInBytes > 0) 1038 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }

    return start;
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = g_uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do. It does a lot of jumping, resulting in several
    // mispredicted branches. It might make more sense to do this
    // with something like Duff's device with a single computed branch.
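    // How the bit tests below decode the tail: count is in units of
    // granularity bytes, so testing bit (3 - log2(granularity)) of
    // count asks whether an 8-byte word is still outstanding, the next
    // test covers a 4-byte int, then a 2-byte short, then a single
    // byte. For example, with step == 1 and count == 11 (0b1011) we
    // copy a word, skip the int, then copy a short and a byte.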
1184 1185 __ tbz(count, 3 - exact_log2(granularity), Lword); 1186 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1187 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1188 __ bind(Lword); 1189 1190 if (granularity <= sizeof (jint)) { 1191 __ tbz(count, 2 - exact_log2(granularity), Lint); 1192 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1193 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1194 __ bind(Lint); 1195 } 1196 1197 if (granularity <= sizeof (jshort)) { 1198 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1199 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1200 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1201 __ bind(Lshort); 1202 } 1203 1204 if (granularity <= sizeof (jbyte)) { 1205 __ tbz(count, 0, Lbyte); 1206 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1207 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1208 __ bind(Lbyte); 1209 } 1210 } 1211 1212 // All-singing all-dancing memory copy. 1213 // 1214 // Copy count units of memory from s to d. The size of a unit is 1215 // step, which can be positive or negative depending on the direction 1216 // of copy. If is_aligned is false, we align the source address. 1217 // 1218 1219 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1220 Register s, Register d, Register count, int step) { 1221 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1222 bool is_backwards = step < 0; 1223 unsigned int granularity = g_uabs(step); 1224 const Register t0 = r3, t1 = r4; 1225 1226 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1227 // load all the data before writing anything 1228 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1229 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1230 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1231 const Register send = r17, dend = r16; 1232 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1233 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1234 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1235 1236 if (PrefetchCopyIntervalInBytes > 0) 1237 __ prfm(Address(s, 0), PLDL1KEEP); 1238 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96 : 80) / granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
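      // Illustrative case: a 72-byte jbyte copy has count == 72 <= 80,
      // so the compare below falls through; the two 32-byte SIMD
      // stores cover bytes 0..63 and the overlapping 16-byte ldp/stp
      // pair covers the last 16 bytes ending at dend, rather than
      // issuing a third, possibly unaligned, ldpq/stpq.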
1294 if (granularity < sizeof (jint)) { 1295 Label copy96; 1296 __ cmp(count, u1(80/granularity)); 1297 __ br(Assembler::HI, copy96); 1298 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1299 1300 bs.copy_store_at_32(Address(d, 0), v0, v1); 1301 bs.copy_store_at_32(Address(d, 32), v2, v3); 1302 1303 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1304 __ b(finish); 1305 1306 __ bind(copy96); 1307 } 1308 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1309 1310 bs.copy_store_at_32(Address(d, 0), v0, v1); 1311 bs.copy_store_at_32(Address(d, 32), v2, v3); 1312 1313 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1314 } else { 1315 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1316 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1317 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1318 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1319 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1320 1321 bs.copy_store_at_16(Address(d, 0), t0, t1); 1322 bs.copy_store_at_16(Address(d, 16), t2, t3); 1323 bs.copy_store_at_16(Address(d, 32), t4, t5); 1324 bs.copy_store_at_16(Address(d, 48), t6, t7); 1325 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1326 } 1327 __ b(finish); 1328 1329 // 0..16 bytes 1330 __ bind(copy16); 1331 __ cmp(count, u1(8/granularity)); 1332 __ br(Assembler::LO, copy8); 1333 1334 // 8..16 bytes 1335 bs.copy_load_at_8(t0, Address(s, 0)); 1336 bs.copy_load_at_8(t1, Address(send, -8)); 1337 bs.copy_store_at_8(Address(d, 0), t0); 1338 bs.copy_store_at_8(Address(dend, -8), t1); 1339 __ b(finish); 1340 1341 if (granularity < 8) { 1342 // 4..7 bytes 1343 __ bind(copy8); 1344 __ tbz(count, 2 - exact_log2(granularity), copy4); 1345 __ ldrw(t0, Address(s, 0)); 1346 __ ldrw(t1, Address(send, -4)); 1347 __ strw(t0, Address(d, 0)); 1348 __ strw(t1, Address(dend, -4)); 1349 __ b(finish); 1350 if (granularity < 4) { 1351 // 0..3 bytes 1352 __ bind(copy4); 1353 __ cbz(count, finish); // get rid of 0 case 1354 if (granularity == 2) { 1355 __ ldrh(t0, Address(s, 0)); 1356 __ strh(t0, Address(d, 0)); 1357 } else { // granularity == 1 1358 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1359 // the first and last byte. 1360 // Handle the 3 byte case by loading and storing base + count/2 1361 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1362 // This does means in the 1 byte case we load/store the same 1363 // byte 3 times. 1364 __ lsr(count, count, 1); 1365 __ ldrb(t0, Address(s, 0)); 1366 __ ldrb(t1, Address(send, -1)); 1367 __ ldrb(t2, Address(s, count)); 1368 __ strb(t0, Address(d, 0)); 1369 __ strb(t1, Address(dend, -1)); 1370 __ strb(t2, Address(d, count)); 1371 } 1372 __ b(finish); 1373 } 1374 } 1375 1376 __ bind(copy_big); 1377 if (is_backwards) { 1378 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1379 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1380 } 1381 1382 // Now we've got the small case out of the way we can align the 1383 // source address on a 2-word boundary. 1384 1385 // Here we will materialize a count in r15, which is used by copy_memory_small 1386 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1387 // Up until here, we have used t9, which aliases r15, but from here on, that register 1388 // can not be used as a temp register, as it contains the count. 1389 1390 Label aligned; 1391 1392 if (is_aligned) { 1393 // We may have to adjust by 1 word to get s 2-word-aligned. 
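      // The tbz below tests bit 3 of s (exact_log2(wordSize) == 3 for
      // a 64-bit word): if it is clear, s is already 16-byte aligned;
      // otherwise we move one word across and drop wordSize/granularity
      // elements from count.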
1394 __ tbz(s, exact_log2(wordSize), aligned); 1395 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1396 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1397 __ sub(count, count, wordSize/granularity); 1398 } else { 1399 if (is_backwards) { 1400 __ andr(r15, s, 2 * wordSize - 1); 1401 } else { 1402 __ neg(r15, s); 1403 __ andr(r15, r15, 2 * wordSize - 1); 1404 } 1405 // r15 is the byte adjustment needed to align s. 1406 __ cbz(r15, aligned); 1407 int shift = exact_log2(granularity); 1408 if (shift > 0) { 1409 __ lsr(r15, r15, shift); 1410 } 1411 __ sub(count, count, r15); 1412 1413 #if 0 1414 // ?? This code is only correct for a disjoint copy. It may or 1415 // may not make sense to use it in that case. 1416 1417 // Copy the first pair; s and d may not be aligned. 1418 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1419 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1420 1421 // Align s and d, adjust count 1422 if (is_backwards) { 1423 __ sub(s, s, r15); 1424 __ sub(d, d, r15); 1425 } else { 1426 __ add(s, s, r15); 1427 __ add(d, d, r15); 1428 } 1429 #else 1430 copy_memory_small(decorators, type, s, d, r15, step); 1431 #endif 1432 } 1433 1434 __ bind(aligned); 1435 1436 // s is now 2-word-aligned. 1437 1438 // We have a count of units and some trailing bytes. Adjust the 1439 // count and do a bulk copy of words. If the shift is zero 1440 // perform a move instead to benefit from zero latency moves. 1441 int shift = exact_log2(wordSize/granularity); 1442 if (shift > 0) { 1443 __ lsr(r15, count, shift); 1444 } else { 1445 __ mov(r15, count); 1446 } 1447 if (direction == copy_forwards) { 1448 if (type != T_OBJECT) { 1449 __ bl(StubRoutines::aarch64::copy_byte_f()); 1450 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1451 __ bl(StubRoutines::aarch64::copy_oop_uninit_f()); 1452 } else { 1453 __ bl(StubRoutines::aarch64::copy_oop_f()); 1454 } 1455 } else { 1456 if (type != T_OBJECT) { 1457 __ bl(StubRoutines::aarch64::copy_byte_b()); 1458 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1459 __ bl(StubRoutines::aarch64::copy_oop_uninit_b()); 1460 } else { 1461 __ bl(StubRoutines::aarch64::copy_oop_b()); 1462 } 1463 } 1464 1465 // And the tail. 1466 copy_memory_small(decorators, type, s, d, count, step); 1467 1468 if (granularity >= 8) __ bind(copy8); 1469 if (granularity >= 4) __ bind(copy4); 1470 __ bind(finish); 1471 } 1472 1473 1474 void clobber_registers() { 1475 #ifdef ASSERT 1476 RegSet clobbered 1477 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1478 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1479 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1480 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1481 __ mov(*it, rscratch1); 1482 } 1483 #endif 1484 1485 } 1486 1487 // Scan over array at a for count oops, verifying each one. 1488 // Preserves a and count, clobbers rscratch1 and rscratch2. 
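  // In C terms the loop below is roughly
  //   for (i = 0; i < count; i++) verify_oop(a[i]);
  // loading a narrow oop and decoding it first when size != wordSize.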
1489 void verify_oop_array (int size, Register a, Register count, Register temp) { 1490 Label loop, end; 1491 __ mov(rscratch1, a); 1492 __ mov(rscratch2, zr); 1493 __ bind(loop); 1494 __ cmp(rscratch2, count); 1495 __ br(Assembler::HS, end); 1496 if (size == wordSize) { 1497 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1498 __ verify_oop(temp); 1499 } else { 1500 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1501 __ decode_heap_oop(temp); // calls verify_oop 1502 } 1503 __ add(rscratch2, rscratch2, 1); 1504 __ b(loop); 1505 __ bind(end); 1506 } 1507 1508 // Arguments: 1509 // stub_id - is used to name the stub and identify all details of 1510 // how to perform the copy. 1511 // 1512 // entry - is assigned to the stub's post push entry point unless 1513 // it is null 1514 // 1515 // Inputs: 1516 // c_rarg0 - source array address 1517 // c_rarg1 - destination array address 1518 // c_rarg2 - element count, treated as ssize_t, can be zero 1519 // 1520 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1521 // the hardware handle it. The two dwords within qwords that span 1522 // cache line boundaries will still be loaded and stored atomically. 1523 // 1524 // Side Effects: nopush_entry is set to the (post push) entry point 1525 // so it can be used by the corresponding conjoint 1526 // copy method 1527 // 1528 address generate_disjoint_copy(StubId stub_id, address *nopush_entry) { 1529 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1530 RegSet saved_reg = RegSet::of(s, d, count); 1531 int size; 1532 bool aligned; 1533 bool is_oop; 1534 bool dest_uninitialized; 1535 switch (stub_id) { 1536 case StubId::stubgen_jbyte_disjoint_arraycopy_id: 1537 size = sizeof(jbyte); 1538 aligned = false; 1539 is_oop = false; 1540 dest_uninitialized = false; 1541 break; 1542 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id: 1543 size = sizeof(jbyte); 1544 aligned = true; 1545 is_oop = false; 1546 dest_uninitialized = false; 1547 break; 1548 case StubId::stubgen_jshort_disjoint_arraycopy_id: 1549 size = sizeof(jshort); 1550 aligned = false; 1551 is_oop = false; 1552 dest_uninitialized = false; 1553 break; 1554 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id: 1555 size = sizeof(jshort); 1556 aligned = true; 1557 is_oop = false; 1558 dest_uninitialized = false; 1559 break; 1560 case StubId::stubgen_jint_disjoint_arraycopy_id: 1561 size = sizeof(jint); 1562 aligned = false; 1563 is_oop = false; 1564 dest_uninitialized = false; 1565 break; 1566 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id: 1567 size = sizeof(jint); 1568 aligned = true; 1569 is_oop = false; 1570 dest_uninitialized = false; 1571 break; 1572 case StubId::stubgen_jlong_disjoint_arraycopy_id: 1573 // since this is always aligned we can (should!) use the same 1574 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy 1575 ShouldNotReachHere(); 1576 break; 1577 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id: 1578 size = sizeof(jlong); 1579 aligned = true; 1580 is_oop = false; 1581 dest_uninitialized = false; 1582 break; 1583 case StubId::stubgen_oop_disjoint_arraycopy_id: 1584 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1585 aligned = !UseCompressedOops; 1586 is_oop = true; 1587 dest_uninitialized = false; 1588 break; 1589 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id: 1590 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1591 aligned = !UseCompressedOops; 1592 is_oop = true; 1593 dest_uninitialized = false; 1594 break; 1595 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id: 1596 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1597 aligned = !UseCompressedOops; 1598 is_oop = true; 1599 dest_uninitialized = true; 1600 break; 1601 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id: 1602 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1603 aligned = !UseCompressedOops; 1604 is_oop = true; 1605 dest_uninitialized = true; 1606 break; 1607 default: 1608 ShouldNotReachHere(); 1609 break; 1610 } 1611 1612 __ align(CodeEntryAlignment); 1613 StubCodeMark mark(this, stub_id); 1614 address start = __ pc(); 1615 __ enter(); 1616 1617 if (nopush_entry != nullptr) { 1618 *nopush_entry = __ pc(); 1619 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1620 BLOCK_COMMENT("Entry:"); 1621 } 1622 1623 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1624 if (dest_uninitialized) { 1625 decorators |= IS_DEST_UNINITIALIZED; 1626 } 1627 if (aligned) { 1628 decorators |= ARRAYCOPY_ALIGNED; 1629 } 1630 1631 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1632 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1633 1634 if (is_oop) { 1635 // save regs before copy_memory 1636 __ push(RegSet::of(d, count), sp); 1637 } 1638 { 1639 // UnsafeMemoryAccess page error: continue after unsafe access 1640 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1641 UnsafeMemoryAccessMark umam(this, add_entry, true); 1642 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1643 } 1644 1645 if (is_oop) { 1646 __ pop(RegSet::of(d, count), sp); 1647 if (VerifyOops) 1648 verify_oop_array(size, d, count, r16); 1649 } 1650 1651 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1652 1653 __ leave(); 1654 __ mov(r0, zr); // return 0 1655 __ ret(lr); 1656 return start; 1657 } 1658 1659 // Arguments: 1660 // stub_id - is used to name the stub and identify all details of 1661 // how to perform the copy. 1662 // 1663 // nooverlap_target - identifes the (post push) entry for the 1664 // corresponding disjoint copy routine which can be 1665 // jumped to if the ranges do not actually overlap 1666 // 1667 // entry - is assigned to the stub's post push entry point unless 1668 // it is null 1669 // 1670 // 1671 // Inputs: 1672 // c_rarg0 - source array address 1673 // c_rarg1 - destination array address 1674 // c_rarg2 - element count, treated as ssize_t, can be zero 1675 // 1676 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1677 // the hardware handle it. The two dwords within qwords that span 1678 // cache line boundaries will still be loaded and stored atomically. 
1679 // 1680 // Side Effects: 1681 // nopush_entry is set to the no-overlap entry point so it can be 1682 // used by some other conjoint copy method 1683 // 1684 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) { 1685 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1686 RegSet saved_regs = RegSet::of(s, d, count); 1687 int size; 1688 bool aligned; 1689 bool is_oop; 1690 bool dest_uninitialized; 1691 switch (stub_id) { 1692 case StubId::stubgen_jbyte_arraycopy_id: 1693 size = sizeof(jbyte); 1694 aligned = false; 1695 is_oop = false; 1696 dest_uninitialized = false; 1697 break; 1698 case StubId::stubgen_arrayof_jbyte_arraycopy_id: 1699 size = sizeof(jbyte); 1700 aligned = true; 1701 is_oop = false; 1702 dest_uninitialized = false; 1703 break; 1704 case StubId::stubgen_jshort_arraycopy_id: 1705 size = sizeof(jshort); 1706 aligned = false; 1707 is_oop = false; 1708 dest_uninitialized = false; 1709 break; 1710 case StubId::stubgen_arrayof_jshort_arraycopy_id: 1711 size = sizeof(jshort); 1712 aligned = true; 1713 is_oop = false; 1714 dest_uninitialized = false; 1715 break; 1716 case StubId::stubgen_jint_arraycopy_id: 1717 size = sizeof(jint); 1718 aligned = false; 1719 is_oop = false; 1720 dest_uninitialized = false; 1721 break; 1722 case StubId::stubgen_arrayof_jint_arraycopy_id: 1723 size = sizeof(jint); 1724 aligned = true; 1725 is_oop = false; 1726 dest_uninitialized = false; 1727 break; 1728 case StubId::stubgen_jlong_arraycopy_id: 1729 // since this is always aligned we can (should!) use the same 1730 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy 1731 ShouldNotReachHere(); 1732 break; 1733 case StubId::stubgen_arrayof_jlong_arraycopy_id: 1734 size = sizeof(jlong); 1735 aligned = true; 1736 is_oop = false; 1737 dest_uninitialized = false; 1738 break; 1739 case StubId::stubgen_oop_arraycopy_id: 1740 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1741 aligned = !UseCompressedOops; 1742 is_oop = true; 1743 dest_uninitialized = false; 1744 break; 1745 case StubId::stubgen_arrayof_oop_arraycopy_id: 1746 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1747 aligned = !UseCompressedOops; 1748 is_oop = true; 1749 dest_uninitialized = false; 1750 break; 1751 case StubId::stubgen_oop_arraycopy_uninit_id: 1752 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1753 aligned = !UseCompressedOops; 1754 is_oop = true; 1755 dest_uninitialized = true; 1756 break; 1757 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id: 1758 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1759 aligned = !UseCompressedOops; 1760 is_oop = true; 1761 dest_uninitialized = true; 1762 break; 1763 default: 1764 ShouldNotReachHere(); 1765 } 1766 1767 StubCodeMark mark(this, stub_id); 1768 address start = __ pc(); 1769 __ enter(); 1770 1771 if (nopush_entry != nullptr) { 1772 *nopush_entry = __ pc(); 1773 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1774 BLOCK_COMMENT("Entry:"); 1775 } 1776 1777 // use fwd copy when (d-s) above_equal (count*size) 1778 Label L_overlapping; 1779 __ sub(rscratch1, d, s); 1780 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1781 __ br(Assembler::LO, L_overlapping); 1782 __ b(RuntimeAddress(nooverlap_target)); 1783 __ bind(L_overlapping); 1784 1785 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1786 if (dest_uninitialized) { 1787 decorators |= IS_DEST_UNINITIALIZED; 1788 } 1789 if (aligned) { 1790 decorators |= ARRAYCOPY_ALIGNED; 1791 } 1792 1793 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1794 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1795 1796 if (is_oop) { 1797 // save regs before copy_memory 1798 __ push(RegSet::of(d, count), sp); 1799 } 1800 { 1801 // UnsafeMemoryAccess page error: continue after unsafe access 1802 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1803 UnsafeMemoryAccessMark umam(this, add_entry, true); 1804 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1805 } 1806 if (is_oop) { 1807 __ pop(RegSet::of(d, count), sp); 1808 if (VerifyOops) 1809 verify_oop_array(size, d, count, r16); 1810 } 1811 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1812 __ leave(); 1813 __ mov(r0, zr); // return 0 1814 __ ret(lr); 1815 return start; 1816 } 1817 1818 // Helper for generating a dynamic type check. 1819 // Smashes rscratch1, rscratch2. 1820 void generate_type_check(Register sub_klass, 1821 Register super_check_offset, 1822 Register super_klass, 1823 Register temp1, 1824 Register temp2, 1825 Register result, 1826 Label& L_success) { 1827 assert_different_registers(sub_klass, super_check_offset, super_klass); 1828 1829 BLOCK_COMMENT("type_check:"); 1830 1831 Label L_miss; 1832 1833 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1834 super_check_offset); 1835 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1836 1837 // Fall through on failure! 
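    // (Roughly: both check_klass_subtype_fast_path, which probes sub_klass
    // at super_check_offset, and check_klass_subtype_slow_path, which
    // searches the secondary supers, branch to L_success on a hit; only a
    // definite miss reaches L_miss.)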
1838 __ BIND(L_miss); 1839 } 1840 1841 // 1842 // Generate checkcasting array copy stub 1843 // 1844 // Input: 1845 // c_rarg0 - source array address 1846 // c_rarg1 - destination array address 1847 // c_rarg2 - element count, treated as ssize_t, can be zero 1848 // c_rarg3 - size_t ckoff (super_check_offset) 1849 // c_rarg4 - oop ckval (super_klass) 1850 // 1851 // Output: 1852 // r0 == 0 - success 1853 // r0 == -1^K - failure, where K is partial transfer count 1854 // 1855 address generate_checkcast_copy(StubId stub_id, address *nopush_entry) { 1856 bool dest_uninitialized; 1857 switch (stub_id) { 1858 case StubId::stubgen_checkcast_arraycopy_id: 1859 dest_uninitialized = false; 1860 break; 1861 case StubId::stubgen_checkcast_arraycopy_uninit_id: 1862 dest_uninitialized = true; 1863 break; 1864 default: 1865 ShouldNotReachHere(); 1866 } 1867 1868 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1869 1870 // Input registers (after setup_arg_regs) 1871 const Register from = c_rarg0; // source array address 1872 const Register to = c_rarg1; // destination array address 1873 const Register count = c_rarg2; // elementscount 1874 const Register ckoff = c_rarg3; // super_check_offset 1875 const Register ckval = c_rarg4; // super_klass 1876 1877 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1878 RegSet wb_post_saved_regs = RegSet::of(count); 1879 1880 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1881 const Register copied_oop = r22; // actual oop copied 1882 const Register count_save = r21; // orig elementscount 1883 const Register start_to = r20; // destination array start address 1884 const Register r19_klass = r19; // oop._klass 1885 1886 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1887 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1888 1889 //--------------------------------------------------------------- 1890 // Assembler stub will be used for this call to arraycopy 1891 // if the two arrays are subtypes of Object[] but the 1892 // destination array type is not equal to or a supertype 1893 // of the source type. Each element must be separately 1894 // checked. 1895 1896 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1897 copied_oop, r19_klass, count_save); 1898 1899 __ align(CodeEntryAlignment); 1900 StubCodeMark mark(this, stub_id); 1901 address start = __ pc(); 1902 1903 __ enter(); // required for proper stackwalking of RuntimeStub frame 1904 1905 #ifdef ASSERT 1906 // caller guarantees that the arrays really are different 1907 // otherwise, we would have to make conjoint checks 1908 { Label L; 1909 __ b(L); // conjoint check not yet implemented 1910 __ stop("checkcast_copy within a single array"); 1911 __ bind(L); 1912 } 1913 #endif //ASSERT 1914 1915 // Caller of this entry point must set up the argument registers. 1916 if (nopush_entry != nullptr) { 1917 *nopush_entry = __ pc(); 1918 BLOCK_COMMENT("Entry:"); 1919 } 1920 1921 // Empty array: Nothing to do. 1922 __ cbz(count, L_done); 1923 __ push(RegSet::of(r19, r20, r21, r22), sp); 1924 1925 #ifdef ASSERT 1926 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1927 // The ckoff and ckval must be mutually consistent, 1928 // even though caller generates both. 
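    // In effect: assert(ckoff == ckval->super_check_offset()); the code
    // below reloads the offset from ckval and compares it with the value
    // the caller passed in c_rarg3.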
1929 { Label L; 1930 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1931 __ ldrw(start_to, Address(ckval, sco_offset)); 1932 __ cmpw(ckoff, start_to); 1933 __ br(Assembler::EQ, L); 1934 __ stop("super_check_offset inconsistent"); 1935 __ bind(L); 1936 } 1937 #endif //ASSERT 1938 1939 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1940 bool is_oop = true; 1941 int element_size = UseCompressedOops ? 4 : 8; 1942 if (dest_uninitialized) { 1943 decorators |= IS_DEST_UNINITIALIZED; 1944 } 1945 1946 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1947 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1948 1949 // save the original count 1950 __ mov(count_save, count); 1951 1952 // Copy from low to high addresses 1953 __ mov(start_to, to); // Save destination array start address 1954 __ b(L_load_element); 1955 1956 // ======== begin loop ======== 1957 // (Loop is rotated; its entry is L_load_element.) 1958 // Loop control: 1959 // for (; count != 0; count--) { 1960 // copied_oop = load_heap_oop(from++); 1961 // ... generate_type_check ...; 1962 // store_heap_oop(to++, copied_oop); 1963 // } 1964 __ align(OptoLoopAlignment); 1965 1966 __ BIND(L_store_element); 1967 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1968 __ post(to, element_size), copied_oop, noreg, 1969 gct1, gct2, gct3); 1970 __ sub(count, count, 1); 1971 __ cbz(count, L_do_card_marks); 1972 1973 // ======== loop entry is here ======== 1974 __ BIND(L_load_element); 1975 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1976 copied_oop, noreg, __ post(from, element_size), 1977 gct1); 1978 __ cbz(copied_oop, L_store_element); 1979 1980 __ load_klass(r19_klass, copied_oop);// query the object klass 1981 1982 BLOCK_COMMENT("type_check:"); 1983 generate_type_check(/*sub_klass*/r19_klass, 1984 /*super_check_offset*/ckoff, 1985 /*super_klass*/ckval, 1986 /*r_array_base*/gct1, 1987 /*temp2*/gct2, 1988 /*result*/r10, L_store_element); 1989 1990 // Fall through on failure! 1991 1992 // ======== end loop ======== 1993 1994 // It was a real error; we must depend on the caller to finish the job. 1995 // Register count = remaining oops, count_orig = total oops. 1996 // Emit GC store barriers for the oops we have copied and report 1997 // their number to the caller. 1998 1999 __ subs(count, count_save, count); // K = partially copied oop count 2000 __ eon(count, count, zr); // report (-1^K) to caller 2001 __ br(Assembler::EQ, L_done_pop); 2002 2003 __ BIND(L_do_card_marks); 2004 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2005 2006 __ bind(L_done_pop); 2007 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2008 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2009 2010 __ bind(L_done); 2011 __ mov(r0, count); 2012 __ leave(); 2013 __ ret(lr); 2014 2015 return start; 2016 } 2017 2018 // Perform range checks on the proposed arraycopy. 2019 // Kills temp, but nothing else. 2020 // Also, clean the sign bits of src_pos and dst_pos. 
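  // Each bound is a single unsigned 32-bit compare (Assembler::HI): since
  // the callers have already rejected negative positions and lengths,
  // (juint)(pos + length) > (juint)array->length() is exactly the
  // out-of-bounds condition being tested.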
2021 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2022 Register src_pos, // source position (c_rarg1)
2023 Register dst, // destination array oop (c_rarg2)
2024 Register dst_pos, // destination position (c_rarg3)
2025 Register length,
2026 Register temp,
2027 Label& L_failed) {
2028 BLOCK_COMMENT("arraycopy_range_checks:");
2029
2030 assert_different_registers(rscratch1, temp);
2031
2032 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2033 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2034 __ addw(temp, length, src_pos);
2035 __ cmpw(temp, rscratch1);
2036 __ br(Assembler::HI, L_failed);
2037
2038 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2039 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2040 __ addw(temp, length, dst_pos);
2041 __ cmpw(temp, rscratch1);
2042 __ br(Assembler::HI, L_failed);
2043
2044 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2045 __ movw(src_pos, src_pos);
2046 __ movw(dst_pos, dst_pos);
2047
2048 BLOCK_COMMENT("arraycopy_range_checks done");
2049 }
2050
2051 // These stubs get called from some dumb test routine.
2052 // I'll write them properly when they're called from
2053 // something that's actually doing something.
2054 static void fake_arraycopy_stub(address src, address dst, int count) {
2055 assert(count == 0, "huh?");
2056 }
2057
2058
2059 //
2060 // Generate 'unsafe' array copy stub
2061 // Though just as safe as the other stubs, it takes an unscaled
2062 // size_t argument instead of an element count.
2063 //
2064 // Input:
2065 // c_rarg0 - source array address
2066 // c_rarg1 - destination array address
2067 // c_rarg2 - byte count, treated as ssize_t, can be zero
2068 //
2069 // Examines the alignment of the operands and dispatches
2070 // to a long, int, short, or byte copy loop.
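  // Dispatch sketch: the low bits of (s | d | count) select the widest
  // element size that keeps every access aligned:
  //
  //   if (((s | d | count) & 7) == 0)      goto long_copy_entry;  // count >>= 3
  //   else if (((s | d | count) & 3) == 0) goto int_copy_entry;   // count >>= 2
  //   else if (((s | d | count) & 1) == 0) goto short_copy_entry; // count >>= 1
  //   else                                 goto byte_copy_entry;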
2071 // 2072 address generate_unsafe_copy(address byte_copy_entry, 2073 address short_copy_entry, 2074 address int_copy_entry, 2075 address long_copy_entry) { 2076 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id; 2077 2078 Label L_long_aligned, L_int_aligned, L_short_aligned; 2079 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2080 2081 __ align(CodeEntryAlignment); 2082 StubCodeMark mark(this, stub_id); 2083 address start = __ pc(); 2084 __ enter(); // required for proper stackwalking of RuntimeStub frame 2085 2086 // bump this on entry, not on exit: 2087 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2088 2089 __ orr(rscratch1, s, d); 2090 __ orr(rscratch1, rscratch1, count); 2091 2092 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2093 __ cbz(rscratch1, L_long_aligned); 2094 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2095 __ cbz(rscratch1, L_int_aligned); 2096 __ tbz(rscratch1, 0, L_short_aligned); 2097 __ b(RuntimeAddress(byte_copy_entry)); 2098 2099 __ BIND(L_short_aligned); 2100 __ lsr(count, count, LogBytesPerShort); // size => short_count 2101 __ b(RuntimeAddress(short_copy_entry)); 2102 __ BIND(L_int_aligned); 2103 __ lsr(count, count, LogBytesPerInt); // size => int_count 2104 __ b(RuntimeAddress(int_copy_entry)); 2105 __ BIND(L_long_aligned); 2106 __ lsr(count, count, LogBytesPerLong); // size => long_count 2107 __ b(RuntimeAddress(long_copy_entry)); 2108 2109 return start; 2110 } 2111 2112 // 2113 // Generate generic array copy stubs 2114 // 2115 // Input: 2116 // c_rarg0 - src oop 2117 // c_rarg1 - src_pos (32-bits) 2118 // c_rarg2 - dst oop 2119 // c_rarg3 - dst_pos (32-bits) 2120 // c_rarg4 - element count (32-bits) 2121 // 2122 // Output: 2123 // r0 == 0 - success 2124 // r0 == -1^K - failure, where K is partial transfer count 2125 // 2126 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2127 address int_copy_entry, address oop_copy_entry, 2128 address long_copy_entry, address checkcast_copy_entry) { 2129 StubId stub_id = StubId::stubgen_generic_arraycopy_id; 2130 2131 Label L_failed, L_objArray; 2132 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2133 2134 // Input registers 2135 const Register src = c_rarg0; // source array oop 2136 const Register src_pos = c_rarg1; // source position 2137 const Register dst = c_rarg2; // destination array oop 2138 const Register dst_pos = c_rarg3; // destination position 2139 const Register length = c_rarg4; 2140 2141 2142 // Registers used as temps 2143 const Register dst_klass = c_rarg5; 2144 2145 __ align(CodeEntryAlignment); 2146 2147 StubCodeMark mark(this, stub_id); 2148 2149 address start = __ pc(); 2150 2151 __ enter(); // required for proper stackwalking of RuntimeStub frame 2152 2153 // bump this on entry, not on exit: 2154 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2155 2156 //----------------------------------------------------------------------- 2157 // Assembler stub will be used for this call to arraycopy 2158 // if the following conditions are met: 2159 // 2160 // (1) src and dst must not be null. 2161 // (2) src_pos must not be negative. 2162 // (3) dst_pos must not be negative. 2163 // (4) length must not be negative. 2164 // (5) src klass and dst klass should be the same and not null. 2165 // (6) src and dst should be arrays. 2166 // (7) src_pos + length must not exceed length of src. 2167 // (8) dst_pos + length must not exceed length of dst. 
2168 // 2169 2170 // if (src == nullptr) return -1; 2171 __ cbz(src, L_failed); 2172 2173 // if (src_pos < 0) return -1; 2174 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2175 2176 // if (dst == nullptr) return -1; 2177 __ cbz(dst, L_failed); 2178 2179 // if (dst_pos < 0) return -1; 2180 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2181 2182 // registers used as temp 2183 const Register scratch_length = r16; // elements count to copy 2184 const Register scratch_src_klass = r17; // array klass 2185 const Register lh = r15; // layout helper 2186 2187 // if (length < 0) return -1; 2188 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2189 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2190 2191 __ load_klass(scratch_src_klass, src); 2192 #ifdef ASSERT 2193 // assert(src->klass() != nullptr); 2194 { 2195 BLOCK_COMMENT("assert klasses not null {"); 2196 Label L1, L2; 2197 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2198 __ bind(L1); 2199 __ stop("broken null klass"); 2200 __ bind(L2); 2201 __ load_klass(rscratch1, dst); 2202 __ cbz(rscratch1, L1); // this would be broken also 2203 BLOCK_COMMENT("} assert klasses not null done"); 2204 } 2205 #endif 2206 2207 // Load layout helper (32-bits) 2208 // 2209 // |array_tag| | header_size | element_type | |log2_element_size| 2210 // 32 30 24 16 8 2 0 2211 // 2212 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2213 // 2214 2215 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2216 2217 // Handle objArrays completely differently... 2218 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2219 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2220 __ movw(rscratch1, objArray_lh); 2221 __ eorw(rscratch2, lh, rscratch1); 2222 __ cbzw(rscratch2, L_objArray); 2223 2224 // if (src->klass() != dst->klass()) return -1; 2225 __ load_klass(rscratch2, dst); 2226 __ eor(rscratch2, rscratch2, scratch_src_klass); 2227 __ cbnz(rscratch2, L_failed); 2228 2229 // if (!src->is_Array()) return -1; 2230 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2231 2232 // At this point, it is known to be a typeArray (array_tag 0x3). 
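    // The remaining typeArray fields of lh are decoded below: the array
    // header size via ubfx using _lh_header_size_shift/_lh_header_size_mask,
    // and the element size as a log2 value in the low bits
    // (_lh_log2_element_size_mask; the shift is asserted to be zero).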
2233 #ifdef ASSERT 2234 { 2235 BLOCK_COMMENT("assert primitive array {"); 2236 Label L; 2237 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2238 __ cmpw(lh, rscratch2); 2239 __ br(Assembler::GE, L); 2240 __ stop("must be a primitive array"); 2241 __ bind(L); 2242 BLOCK_COMMENT("} assert primitive array done"); 2243 } 2244 #endif 2245 2246 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2247 rscratch2, L_failed); 2248 2249 // TypeArrayKlass 2250 // 2251 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2252 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2253 // 2254 2255 const Register rscratch1_offset = rscratch1; // array offset 2256 const Register r15_elsize = lh; // element size 2257 2258 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2259 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2260 __ add(src, src, rscratch1_offset); // src array offset 2261 __ add(dst, dst, rscratch1_offset); // dst array offset 2262 BLOCK_COMMENT("choose copy loop based on element size"); 2263 2264 // next registers should be set before the jump to corresponding stub 2265 const Register from = c_rarg0; // source array address 2266 const Register to = c_rarg1; // destination array address 2267 const Register count = c_rarg2; // elements count 2268 2269 // 'from', 'to', 'count' registers should be set in such order 2270 // since they are the same as 'src', 'src_pos', 'dst'. 2271 2272 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2273 2274 // The possible values of elsize are 0-3, i.e. exact_log2(element 2275 // size in bytes). We do a simple bitwise binary search. 2276 __ BIND(L_copy_bytes); 2277 __ tbnz(r15_elsize, 1, L_copy_ints); 2278 __ tbnz(r15_elsize, 0, L_copy_shorts); 2279 __ lea(from, Address(src, src_pos));// src_addr 2280 __ lea(to, Address(dst, dst_pos));// dst_addr 2281 __ movw(count, scratch_length); // length 2282 __ b(RuntimeAddress(byte_copy_entry)); 2283 2284 __ BIND(L_copy_shorts); 2285 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2286 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2287 __ movw(count, scratch_length); // length 2288 __ b(RuntimeAddress(short_copy_entry)); 2289 2290 __ BIND(L_copy_ints); 2291 __ tbnz(r15_elsize, 0, L_copy_longs); 2292 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2293 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2294 __ movw(count, scratch_length); // length 2295 __ b(RuntimeAddress(int_copy_entry)); 2296 2297 __ BIND(L_copy_longs); 2298 #ifdef ASSERT 2299 { 2300 BLOCK_COMMENT("assert long copy {"); 2301 Label L; 2302 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2303 __ cmpw(r15_elsize, LogBytesPerLong); 2304 __ br(Assembler::EQ, L); 2305 __ stop("must be long copy, but elsize is wrong"); 2306 __ bind(L); 2307 BLOCK_COMMENT("} assert long copy done"); 2308 } 2309 #endif 2310 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2311 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2312 __ movw(count, scratch_length); // length 2313 __ b(RuntimeAddress(long_copy_entry)); 2314 2315 // ObjArrayKlass 2316 __ BIND(L_objArray); 2317 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2318 2319 Label L_plain_copy, L_checkcast_copy; 2320 // test array classes for subtyping 2321 __ load_klass(r15, dst); 2322 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2323 __ br(Assembler::NE, L_checkcast_copy); 2324 2325 // Identically typed arrays can be copied without element-wise checks. 2326 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2327 rscratch2, L_failed); 2328 2329 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2330 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2331 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2332 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2333 __ movw(count, scratch_length); // length 2334 __ BIND(L_plain_copy); 2335 __ b(RuntimeAddress(oop_copy_entry)); 2336 2337 __ BIND(L_checkcast_copy); 2338 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2339 { 2340 // Before looking at dst.length, make sure dst is also an objArray. 2341 __ ldrw(rscratch1, Address(r15, lh_offset)); 2342 __ movw(rscratch2, objArray_lh); 2343 __ eorw(rscratch1, rscratch1, rscratch2); 2344 __ cbnzw(rscratch1, L_failed); 2345 2346 // It is safe to examine both src.length and dst.length. 2347 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2348 r15, L_failed); 2349 2350 __ load_klass(dst_klass, dst); // reload 2351 2352 // Marshal the base address arguments now, freeing registers. 2353 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2354 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2355 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2356 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2357 __ movw(count, length); // length (reloaded) 2358 Register sco_temp = c_rarg3; // this register is free now 2359 assert_different_registers(from, to, count, sco_temp, 2360 dst_klass, scratch_src_klass); 2361 // assert_clean_int(count, sco_temp); 2362 2363 // Generate the type check. 2364 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2365 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2366 2367 // Smashes rscratch1, rscratch2 2368 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2369 L_plain_copy); 2370 2371 // Fetch destination element klass from the ObjArrayKlass header. 2372 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2373 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2374 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2375 2376 // the checkcast_copy loop needs two extra arguments: 2377 assert(c_rarg3 == sco_temp, "#3 already in place"); 2378 // Set up arguments for checkcast_copy_entry. 2379 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2380 __ b(RuntimeAddress(checkcast_copy_entry)); 2381 } 2382 2383 __ BIND(L_failed); 2384 __ mov(r0, -1); 2385 __ leave(); // required for proper stackwalking of RuntimeStub frame 2386 __ ret(lr); 2387 2388 return start; 2389 } 2390 2391 // 2392 // Generate stub for array fill. If "aligned" is true, the 2393 // "to" address is assumed to be heapword aligned. 
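  // The fill value is replicated up to 64 bits before the main loop,
  // roughly (a sketch of the bfi steps below):
  //
  //   T_BYTE:  value[15:8]  = value[7:0];  value[31:16] = value[15:0];
  //   T_SHORT: value[31:16] = value[15:0];
  //   then, for all types, value[63:32] = value[31:0] just before the
  //   word-sized fill (fill_words, or zero_words when the value is zero
  //   and UseBlockZeroing is enabled).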
2394 // 2395 // Arguments for generated stub: 2396 // to: c_rarg0 2397 // value: c_rarg1 2398 // count: c_rarg2 treated as signed 2399 // 2400 address generate_fill(StubId stub_id) { 2401 BasicType t; 2402 bool aligned; 2403 2404 switch (stub_id) { 2405 case StubId::stubgen_jbyte_fill_id: 2406 t = T_BYTE; 2407 aligned = false; 2408 break; 2409 case StubId::stubgen_jshort_fill_id: 2410 t = T_SHORT; 2411 aligned = false; 2412 break; 2413 case StubId::stubgen_jint_fill_id: 2414 t = T_INT; 2415 aligned = false; 2416 break; 2417 case StubId::stubgen_arrayof_jbyte_fill_id: 2418 t = T_BYTE; 2419 aligned = true; 2420 break; 2421 case StubId::stubgen_arrayof_jshort_fill_id: 2422 t = T_SHORT; 2423 aligned = true; 2424 break; 2425 case StubId::stubgen_arrayof_jint_fill_id: 2426 t = T_INT; 2427 aligned = true; 2428 break; 2429 default: 2430 ShouldNotReachHere(); 2431 }; 2432 2433 __ align(CodeEntryAlignment); 2434 StubCodeMark mark(this, stub_id); 2435 address start = __ pc(); 2436 2437 BLOCK_COMMENT("Entry:"); 2438 2439 const Register to = c_rarg0; // source array address 2440 const Register value = c_rarg1; // value 2441 const Register count = c_rarg2; // elements count 2442 2443 const Register bz_base = r10; // base for block_zero routine 2444 const Register cnt_words = r11; // temp register 2445 2446 __ enter(); 2447 2448 Label L_fill_elements, L_exit1; 2449 2450 int shift = -1; 2451 switch (t) { 2452 case T_BYTE: 2453 shift = 0; 2454 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2455 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2456 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2457 __ br(Assembler::LO, L_fill_elements); 2458 break; 2459 case T_SHORT: 2460 shift = 1; 2461 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2462 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2463 __ br(Assembler::LO, L_fill_elements); 2464 break; 2465 case T_INT: 2466 shift = 2; 2467 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2468 __ br(Assembler::LO, L_fill_elements); 2469 break; 2470 default: ShouldNotReachHere(); 2471 } 2472 2473 // Align source address at 8 bytes address boundary. 2474 Label L_skip_align1, L_skip_align2, L_skip_align4; 2475 if (!aligned) { 2476 switch (t) { 2477 case T_BYTE: 2478 // One byte misalignment happens only for byte arrays. 2479 __ tbz(to, 0, L_skip_align1); 2480 __ strb(value, Address(__ post(to, 1))); 2481 __ subw(count, count, 1); 2482 __ bind(L_skip_align1); 2483 // Fallthrough 2484 case T_SHORT: 2485 // Two bytes misalignment happens only for byte and short (char) arrays. 2486 __ tbz(to, 1, L_skip_align2); 2487 __ strh(value, Address(__ post(to, 2))); 2488 __ subw(count, count, 2 >> shift); 2489 __ bind(L_skip_align2); 2490 // Fallthrough 2491 case T_INT: 2492 // Align to 8 bytes, we know we are 4 byte aligned to start. 2493 __ tbz(to, 2, L_skip_align4); 2494 __ strw(value, Address(__ post(to, 4))); 2495 __ subw(count, count, 4 >> shift); 2496 __ bind(L_skip_align4); 2497 break; 2498 default: ShouldNotReachHere(); 2499 } 2500 } 2501 2502 // 2503 // Fill large chunks 2504 // 2505 __ lsrw(cnt_words, count, 3 - shift); // number of words 2506 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2507 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2508 if (UseBlockZeroing) { 2509 Label non_block_zeroing, rest; 2510 // If the fill value is zero we can use the fast zero_words(). 
2511 __ cbnz(value, non_block_zeroing); 2512 __ mov(bz_base, to); 2513 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2514 address tpc = __ zero_words(bz_base, cnt_words); 2515 if (tpc == nullptr) { 2516 fatal("CodeCache is full at generate_fill"); 2517 } 2518 __ b(rest); 2519 __ bind(non_block_zeroing); 2520 __ fill_words(to, cnt_words, value); 2521 __ bind(rest); 2522 } else { 2523 __ fill_words(to, cnt_words, value); 2524 } 2525 2526 // Remaining count is less than 8 bytes. Fill it by a single store. 2527 // Note that the total length is no less than 8 bytes. 2528 if (t == T_BYTE || t == T_SHORT) { 2529 Label L_exit1; 2530 __ cbzw(count, L_exit1); 2531 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2532 __ str(value, Address(to, -8)); // overwrite some elements 2533 __ bind(L_exit1); 2534 __ leave(); 2535 __ ret(lr); 2536 } 2537 2538 // Handle copies less than 8 bytes. 2539 Label L_fill_2, L_fill_4, L_exit2; 2540 __ bind(L_fill_elements); 2541 switch (t) { 2542 case T_BYTE: 2543 __ tbz(count, 0, L_fill_2); 2544 __ strb(value, Address(__ post(to, 1))); 2545 __ bind(L_fill_2); 2546 __ tbz(count, 1, L_fill_4); 2547 __ strh(value, Address(__ post(to, 2))); 2548 __ bind(L_fill_4); 2549 __ tbz(count, 2, L_exit2); 2550 __ strw(value, Address(to)); 2551 break; 2552 case T_SHORT: 2553 __ tbz(count, 0, L_fill_4); 2554 __ strh(value, Address(__ post(to, 2))); 2555 __ bind(L_fill_4); 2556 __ tbz(count, 1, L_exit2); 2557 __ strw(value, Address(to)); 2558 break; 2559 case T_INT: 2560 __ cbzw(count, L_exit2); 2561 __ strw(value, Address(to)); 2562 break; 2563 default: ShouldNotReachHere(); 2564 } 2565 __ bind(L_exit2); 2566 __ leave(); 2567 __ ret(lr); 2568 return start; 2569 } 2570 2571 address generate_unsafecopy_common_error_exit() { 2572 address start_pc = __ pc(); 2573 __ leave(); 2574 __ mov(r0, 0); 2575 __ ret(lr); 2576 return start_pc; 2577 } 2578 2579 // 2580 // Generate 'unsafe' set memory stub 2581 // Though just as safe as the other stubs, it takes an unscaled 2582 // size_t (# bytes) argument instead of an element count. 2583 // 2584 // This fill operation is atomicity preserving: as long as the 2585 // address supplied is sufficiently aligned, all writes of up to 64 2586 // bits in size are single-copy atomic. 
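  // The byte value is broadcast into a 16-byte vector and stored in
  // 64-byte chunks; the tail is finished by testing individual bits of
  // the remaining count, roughly:
  //
  //   if (count & 32) store 32 bytes;  if (count & 16) store 16 bytes;
  //   if (count & 8)  store 8 bytes;   if (count & 4)  store 4 bytes;
  //   if (count & 2)  store 2 bytes;   if (count & 1)  store 1 byte;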
2587 // 2588 // Input: 2589 // c_rarg0 - destination array address 2590 // c_rarg1 - byte count (size_t) 2591 // c_rarg2 - byte value 2592 // 2593 address generate_unsafe_setmemory() { 2594 __ align(CodeEntryAlignment); 2595 StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id); 2596 address start = __ pc(); 2597 2598 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2; 2599 Label tail; 2600 2601 UnsafeMemoryAccessMark umam(this, true, false); 2602 2603 __ enter(); // required for proper stackwalking of RuntimeStub frame 2604 2605 __ dup(v0, __ T16B, value); 2606 2607 if (AvoidUnalignedAccesses) { 2608 __ cmp(count, (u1)16); 2609 __ br(__ LO, tail); 2610 2611 __ mov(rscratch1, 16); 2612 __ andr(rscratch2, dest, 15); 2613 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest 2614 __ strq(v0, Address(dest)); 2615 __ sub(count, count, rscratch1); 2616 __ add(dest, dest, rscratch1); 2617 } 2618 2619 __ subs(count, count, (u1)64); 2620 __ br(__ LO, tail); 2621 { 2622 Label again; 2623 __ bind(again); 2624 __ stpq(v0, v0, Address(dest)); 2625 __ stpq(v0, v0, Address(dest, 32)); 2626 2627 __ subs(count, count, 64); 2628 __ add(dest, dest, 64); 2629 __ br(__ HS, again); 2630 } 2631 2632 __ bind(tail); 2633 // The count of bytes is off by 64, but we don't need to correct 2634 // it because we're only going to use the least-significant few 2635 // count bits from here on. 2636 // __ add(count, count, 64); 2637 2638 { 2639 Label dont; 2640 __ tbz(count, exact_log2(32), dont); 2641 __ stpq(v0, v0, __ post(dest, 32)); 2642 __ bind(dont); 2643 } 2644 { 2645 Label dont; 2646 __ tbz(count, exact_log2(16), dont); 2647 __ strq(v0, __ post(dest, 16)); 2648 __ bind(dont); 2649 } 2650 { 2651 Label dont; 2652 __ tbz(count, exact_log2(8), dont); 2653 __ strd(v0, __ post(dest, 8)); 2654 __ bind(dont); 2655 } 2656 2657 Label finished; 2658 __ tst(count, 7); 2659 __ br(__ EQ, finished); 2660 2661 { 2662 Label dont; 2663 __ tbz(count, exact_log2(4), dont); 2664 __ strs(v0, __ post(dest, 4)); 2665 __ bind(dont); 2666 } 2667 { 2668 Label dont; 2669 __ tbz(count, exact_log2(2), dont); 2670 __ bfi(value, value, 8, 8); 2671 __ strh(value, __ post(dest, 2)); 2672 __ bind(dont); 2673 } 2674 { 2675 Label dont; 2676 __ tbz(count, exact_log2(1), dont); 2677 __ strb(value, Address(dest)); 2678 __ bind(dont); 2679 } 2680 2681 __ bind(finished); 2682 __ leave(); 2683 __ ret(lr); 2684 2685 return start; 2686 } 2687 2688 address generate_data_cache_writeback() { 2689 const Register line = c_rarg0; // address of line to write back 2690 2691 __ align(CodeEntryAlignment); 2692 2693 StubId stub_id = StubId::stubgen_data_cache_writeback_id; 2694 StubCodeMark mark(this, stub_id); 2695 2696 address start = __ pc(); 2697 __ enter(); 2698 __ cache_wb(Address(line, 0)); 2699 __ leave(); 2700 __ ret(lr); 2701 2702 return start; 2703 } 2704 2705 address generate_data_cache_writeback_sync() { 2706 const Register is_pre = c_rarg0; // pre or post sync 2707 2708 __ align(CodeEntryAlignment); 2709 2710 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id; 2711 StubCodeMark mark(this, stub_id); 2712 2713 // pre wbsync is a no-op 2714 // post wbsync translates to an sfence 2715 2716 Label skip; 2717 address start = __ pc(); 2718 __ enter(); 2719 __ cbnz(is_pre, skip); 2720 __ cache_wbsync(false); 2721 __ bind(skip); 2722 __ leave(); 2723 __ ret(lr); 2724 2725 return start; 2726 } 2727 2728 void generate_arraycopy_stubs() { 2729 // Some copy stubs publish a normal entry and then a 2nd 'fallback' 2730 // entry 
immediately following their stack push. This can be used
2731 // as a post-push branch target for compatible stubs when they
2732 // identify a special case that can be handled by the fallback
2733 // stub, e.g. a disjoint copy stub may be used as a special case
2734 // fallback for its compatible conjoint copy stub.
2735 //
2736 // A nopush entry is always returned in the following local and
2737 // then published by assigning to the appropriate entry field in
2738 // class StubRoutines. The entry value is then passed to the
2739 // generator for the compatible stub. That means the entry must be
2740 // listed when saving to/restoring from the AOT cache, ensuring
2741 // that the inter-stub jumps are noted at AOT-cache save and
2742 // relocated at AOT cache load.
2743 address nopush_entry;
2744
2745 // generate the common exit first so later stubs can rely on it if
2746 // they want an UnsafeMemoryAccess exit non-local to the stub
2747 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
2748 // register the stub as the default exit with class UnsafeMemoryAccess
2749 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
2750
2751 // generate and publish aarch64-specific bulk copy routines first
2752 // so we can call them from other copy stubs
2753 StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2754 StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2755
2756 StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2757 StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2758
2759 StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2760 StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2761
2762 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2763
2764 //*** jbyte
2765 // Always need aligned and unaligned versions
2766 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
2767 // disjoint nopush entry is needed by conjoint copy
2768 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2769 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
2770 // conjoint nopush entry is needed by generic/unsafe copy
2771 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
2772 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
2773 // disjoint arrayof nopush entry is needed by conjoint copy
2774 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2775 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
2776
2777 //*** jshort
2778 // Always need aligned and unaligned versions
2779 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
2780 // disjoint nopush
entry is needed by conjoint copy 2781 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry; 2782 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry); 2783 // conjoint nopush entry is used by generic/unsafe copy 2784 StubRoutines::_jshort_arraycopy_nopush = nopush_entry; 2785 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry); 2786 // disjoint arrayof nopush entry is needed by conjoint copy 2787 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry; 2788 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr); 2789 2790 //*** jint 2791 // Aligned versions 2792 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry); 2793 // disjoint arrayof nopush entry is needed by conjoint copy 2794 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry; 2795 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr); 2796 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2797 // jint_arraycopy_nopush always points to the unaligned version 2798 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry); 2799 // disjoint nopush entry is needed by conjoint copy 2800 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry; 2801 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry); 2802 // conjoint nopush entry is needed by generic/unsafe copy 2803 StubRoutines::_jint_arraycopy_nopush = nopush_entry; 2804 2805 //*** jlong 2806 // It is always aligned 2807 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry); 2808 // disjoint arrayof nopush entry is needed by conjoint copy 2809 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry; 2810 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry); 2811 // conjoint nopush entry is needed by generic/unsafe copy 2812 StubRoutines::_jlong_arraycopy_nopush = nopush_entry; 2813 // disjoint normal/nopush and conjoint normal entries are not 2814 // generated since the arrayof versions are the same 2815 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2816 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush; 2817 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2818 2819 //*** oops 2820 { 2821 StubRoutines::_arrayof_oop_disjoint_arraycopy 2822 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry); 2823 // disjoint arrayof nopush entry is needed by conjoint copy 2824 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry; 2825 StubRoutines::_arrayof_oop_arraycopy 2826 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, 
StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry); 2827 // conjoint arrayof nopush entry is needed by generic/unsafe copy 2828 StubRoutines::_oop_arraycopy_nopush = nopush_entry; 2829 // Aligned versions without pre-barriers 2830 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2831 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry); 2832 // disjoint arrayof+uninit nopush entry is needed by conjoint copy 2833 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry; 2834 // note that we don't need a returned nopush entry because the 2835 // generic/unsafe copy does not cater for uninit arrays. 2836 StubRoutines::_arrayof_oop_arraycopy_uninit 2837 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr); 2838 } 2839 2840 // for oop copies reuse arrayof entries for non-arrayof cases 2841 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2842 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush; 2843 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2844 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2845 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush; 2846 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2847 2848 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry); 2849 // checkcast nopush entry is needed by generic copy 2850 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry; 2851 // note that we don't need a returned nopush entry because the 2852 // generic copy does not cater for uninit arrays. 
2853 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr); 2854 2855 // unsafe arraycopy may fallback on conjoint stubs 2856 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush, 2857 StubRoutines::_jshort_arraycopy_nopush, 2858 StubRoutines::_jint_arraycopy_nopush, 2859 StubRoutines::_jlong_arraycopy_nopush); 2860 2861 // generic arraycopy may fallback on conjoint stubs 2862 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush, 2863 StubRoutines::_jshort_arraycopy_nopush, 2864 StubRoutines::_jint_arraycopy_nopush, 2865 StubRoutines::_oop_arraycopy_nopush, 2866 StubRoutines::_jlong_arraycopy_nopush, 2867 StubRoutines::_checkcast_arraycopy_nopush); 2868 2869 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id); 2870 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id); 2871 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id); 2872 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id); 2873 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id); 2874 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id); 2875 } 2876 2877 void generate_math_stubs() { Unimplemented(); } 2878 2879 // Arguments: 2880 // 2881 // Inputs: 2882 // c_rarg0 - source byte array address 2883 // c_rarg1 - destination byte array address 2884 // c_rarg2 - K (key) in little endian int array 2885 // 2886 address generate_aescrypt_encryptBlock() { 2887 __ align(CodeEntryAlignment); 2888 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id; 2889 StubCodeMark mark(this, stub_id); 2890 2891 const Register from = c_rarg0; // source array address 2892 const Register to = c_rarg1; // destination array address 2893 const Register key = c_rarg2; // key array address 2894 const Register keylen = rscratch1; 2895 2896 address start = __ pc(); 2897 __ enter(); 2898 2899 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2900 2901 __ aesenc_loadkeys(key, keylen); 2902 __ aesecb_encrypt(from, to, keylen); 2903 2904 __ mov(r0, 0); 2905 2906 __ leave(); 2907 __ ret(lr); 2908 2909 return start; 2910 } 2911 2912 // Arguments: 2913 // 2914 // Inputs: 2915 // c_rarg0 - source byte array address 2916 // c_rarg1 - destination byte array address 2917 // c_rarg2 - K (key) in little endian int array 2918 // 2919 address generate_aescrypt_decryptBlock() { 2920 assert(UseAES, "need AES cryptographic extension support"); 2921 __ align(CodeEntryAlignment); 2922 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id; 2923 StubCodeMark mark(this, stub_id); 2924 Label L_doLast; 2925 2926 const Register from = c_rarg0; // source array address 2927 const Register to = c_rarg1; // destination array address 2928 const Register key = c_rarg2; // key array address 2929 const Register keylen = rscratch1; 2930 2931 address start = __ pc(); 2932 __ enter(); // required for proper stackwalking of RuntimeStub frame 2933 2934 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2935 2936 __ aesecb_decrypt(from, to, key, keylen); 2937 2938 __ mov(r0, 0); 2939 2940 __ leave(); 2941 __ ret(lr); 2942 2943 return start; 2944 } 2945 2946 // Arguments: 2947 // 2948 // Inputs: 2949 // c_rarg0 - source byte array address 2950 // 
c_rarg1 - destination byte array address 2951 // c_rarg2 - K (key) in little endian int array 2952 // c_rarg3 - r vector byte array address 2953 // c_rarg4 - input length 2954 // 2955 // Output: 2956 // x0 - input length 2957 // 2958 address generate_cipherBlockChaining_encryptAESCrypt() { 2959 assert(UseAES, "need AES cryptographic extension support"); 2960 __ align(CodeEntryAlignment); 2961 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id; 2962 StubCodeMark mark(this, stub_id); 2963 2964 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2965 2966 const Register from = c_rarg0; // source array address 2967 const Register to = c_rarg1; // destination array address 2968 const Register key = c_rarg2; // key array address 2969 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2970 // and left with the results of the last encryption block 2971 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2972 const Register keylen = rscratch1; 2973 2974 address start = __ pc(); 2975 2976 __ enter(); 2977 2978 __ movw(rscratch2, len_reg); 2979 2980 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2981 2982 __ ld1(v0, __ T16B, rvec); 2983 2984 __ cmpw(keylen, 52); 2985 __ br(Assembler::CC, L_loadkeys_44); 2986 __ br(Assembler::EQ, L_loadkeys_52); 2987 2988 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2989 __ rev32(v17, __ T16B, v17); 2990 __ rev32(v18, __ T16B, v18); 2991 __ BIND(L_loadkeys_52); 2992 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2993 __ rev32(v19, __ T16B, v19); 2994 __ rev32(v20, __ T16B, v20); 2995 __ BIND(L_loadkeys_44); 2996 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2997 __ rev32(v21, __ T16B, v21); 2998 __ rev32(v22, __ T16B, v22); 2999 __ rev32(v23, __ T16B, v23); 3000 __ rev32(v24, __ T16B, v24); 3001 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 3002 __ rev32(v25, __ T16B, v25); 3003 __ rev32(v26, __ T16B, v26); 3004 __ rev32(v27, __ T16B, v27); 3005 __ rev32(v28, __ T16B, v28); 3006 __ ld1(v29, v30, v31, __ T16B, key); 3007 __ rev32(v29, __ T16B, v29); 3008 __ rev32(v30, __ T16B, v30); 3009 __ rev32(v31, __ T16B, v31); 3010 3011 __ BIND(L_aes_loop); 3012 __ ld1(v1, __ T16B, __ post(from, 16)); 3013 __ eor(v0, __ T16B, v0, v1); 3014 3015 __ br(Assembler::CC, L_rounds_44); 3016 __ br(Assembler::EQ, L_rounds_52); 3017 3018 __ aese(v0, v17); __ aesmc(v0, v0); 3019 __ aese(v0, v18); __ aesmc(v0, v0); 3020 __ BIND(L_rounds_52); 3021 __ aese(v0, v19); __ aesmc(v0, v0); 3022 __ aese(v0, v20); __ aesmc(v0, v0); 3023 __ BIND(L_rounds_44); 3024 __ aese(v0, v21); __ aesmc(v0, v0); 3025 __ aese(v0, v22); __ aesmc(v0, v0); 3026 __ aese(v0, v23); __ aesmc(v0, v0); 3027 __ aese(v0, v24); __ aesmc(v0, v0); 3028 __ aese(v0, v25); __ aesmc(v0, v0); 3029 __ aese(v0, v26); __ aesmc(v0, v0); 3030 __ aese(v0, v27); __ aesmc(v0, v0); 3031 __ aese(v0, v28); __ aesmc(v0, v0); 3032 __ aese(v0, v29); __ aesmc(v0, v0); 3033 __ aese(v0, v30); 3034 __ eor(v0, __ T16B, v0, v31); 3035 3036 __ st1(v0, __ T16B, __ post(to, 16)); 3037 3038 __ subw(len_reg, len_reg, 16); 3039 __ cbnzw(len_reg, L_aes_loop); 3040 3041 __ st1(v0, __ T16B, rvec); 3042 3043 __ mov(r0, rscratch2); 3044 3045 __ leave(); 3046 __ ret(lr); 3047 3048 return start; 3049 } 3050 3051 // Arguments: 3052 // 3053 // Inputs: 3054 // c_rarg0 - source byte array address 3055 // c_rarg1 - destination byte array address 3056 // c_rarg2 - K (key) in little 
endian int array 3057 // c_rarg3 - r vector byte array address 3058 // c_rarg4 - input length 3059 // 3060 // Output: 3061 // r0 - input length 3062 // 3063 address generate_cipherBlockChaining_decryptAESCrypt() { 3064 assert(UseAES, "need AES cryptographic extension support"); 3065 __ align(CodeEntryAlignment); 3066 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id; 3067 StubCodeMark mark(this, stub_id); 3068 3069 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 3070 3071 const Register from = c_rarg0; // source array address 3072 const Register to = c_rarg1; // destination array address 3073 const Register key = c_rarg2; // key array address 3074 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3075 // and left with the results of the last encryption block 3076 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3077 const Register keylen = rscratch1; 3078 3079 address start = __ pc(); 3080 3081 __ enter(); 3082 3083 __ movw(rscratch2, len_reg); 3084 3085 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3086 3087 __ ld1(v2, __ T16B, rvec); 3088 3089 __ ld1(v31, __ T16B, __ post(key, 16)); 3090 __ rev32(v31, __ T16B, v31); 3091 3092 __ cmpw(keylen, 52); 3093 __ br(Assembler::CC, L_loadkeys_44); 3094 __ br(Assembler::EQ, L_loadkeys_52); 3095 3096 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 3097 __ rev32(v17, __ T16B, v17); 3098 __ rev32(v18, __ T16B, v18); 3099 __ BIND(L_loadkeys_52); 3100 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 3101 __ rev32(v19, __ T16B, v19); 3102 __ rev32(v20, __ T16B, v20); 3103 __ BIND(L_loadkeys_44); 3104 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 3105 __ rev32(v21, __ T16B, v21); 3106 __ rev32(v22, __ T16B, v22); 3107 __ rev32(v23, __ T16B, v23); 3108 __ rev32(v24, __ T16B, v24); 3109 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 3110 __ rev32(v25, __ T16B, v25); 3111 __ rev32(v26, __ T16B, v26); 3112 __ rev32(v27, __ T16B, v27); 3113 __ rev32(v28, __ T16B, v28); 3114 __ ld1(v29, v30, __ T16B, key); 3115 __ rev32(v29, __ T16B, v29); 3116 __ rev32(v30, __ T16B, v30); 3117 3118 __ BIND(L_aes_loop); 3119 __ ld1(v0, __ T16B, __ post(from, 16)); 3120 __ orr(v1, __ T16B, v0, v0); 3121 3122 __ br(Assembler::CC, L_rounds_44); 3123 __ br(Assembler::EQ, L_rounds_52); 3124 3125 __ aesd(v0, v17); __ aesimc(v0, v0); 3126 __ aesd(v0, v18); __ aesimc(v0, v0); 3127 __ BIND(L_rounds_52); 3128 __ aesd(v0, v19); __ aesimc(v0, v0); 3129 __ aesd(v0, v20); __ aesimc(v0, v0); 3130 __ BIND(L_rounds_44); 3131 __ aesd(v0, v21); __ aesimc(v0, v0); 3132 __ aesd(v0, v22); __ aesimc(v0, v0); 3133 __ aesd(v0, v23); __ aesimc(v0, v0); 3134 __ aesd(v0, v24); __ aesimc(v0, v0); 3135 __ aesd(v0, v25); __ aesimc(v0, v0); 3136 __ aesd(v0, v26); __ aesimc(v0, v0); 3137 __ aesd(v0, v27); __ aesimc(v0, v0); 3138 __ aesd(v0, v28); __ aesimc(v0, v0); 3139 __ aesd(v0, v29); __ aesimc(v0, v0); 3140 __ aesd(v0, v30); 3141 __ eor(v0, __ T16B, v0, v31); 3142 __ eor(v0, __ T16B, v0, v2); 3143 3144 __ st1(v0, __ T16B, __ post(to, 16)); 3145 __ orr(v2, __ T16B, v1, v1); 3146 3147 __ subw(len_reg, len_reg, 16); 3148 __ cbnzw(len_reg, L_aes_loop); 3149 3150 __ st1(v2, __ T16B, rvec); 3151 3152 __ mov(r0, rscratch2); 3153 3154 __ leave(); 3155 __ ret(lr); 3156 3157 return start; 3158 } 3159 3160 // Big-endian 128-bit + 64-bit -> 128-bit addition. 3161 // Inputs: 128-bits. in is preserved. 
3162 // The least-significant 64-bit word is in the upper dword of each vector. 3163 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 3164 // Output: result 3165 void be_add_128_64(FloatRegister result, FloatRegister in, 3166 FloatRegister inc, FloatRegister tmp) { 3167 assert_different_registers(result, tmp, inc); 3168 3169 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 3170 // input 3171 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3172 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3173 // MSD == 0 (must be!) to LSD 3174 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3175 } 3176 3177 // CTR AES crypt. 3178 // Arguments: 3179 // 3180 // Inputs: 3181 // c_rarg0 - source byte array address 3182 // c_rarg1 - destination byte array address 3183 // c_rarg2 - K (key) in little endian int array 3184 // c_rarg3 - counter vector byte array address 3185 // c_rarg4 - input length 3186 // c_rarg5 - saved encryptedCounter start 3187 // c_rarg6 - saved used length 3188 // 3189 // Output: 3190 // r0 - input length 3191 // 3192 address generate_counterMode_AESCrypt() { 3193 const Register in = c_rarg0; 3194 const Register out = c_rarg1; 3195 const Register key = c_rarg2; 3196 const Register counter = c_rarg3; 3197 const Register saved_len = c_rarg4, len = r10; 3198 const Register saved_encrypted_ctr = c_rarg5; 3199 const Register used_ptr = c_rarg6, used = r12; 3200 3201 const Register offset = r7; 3202 const Register keylen = r11; 3203 3204 const unsigned char block_size = 16; 3205 const int bulk_width = 4; 3206 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3207 // performance with larger data sizes, but it also means that the 3208 // fast path isn't used until you have at least 8 blocks, and up 3209 // to 127 bytes of data will be executed on the slow path. For 3210 // that reason, and also so as not to blow away too much icache, 4 3211 // blocks seems like a sensible compromise. 3212 3213 // Algorithm: 3214 // 3215 // if (len == 0) { 3216 // goto DONE; 3217 // } 3218 // int result = len; 3219 // do { 3220 // if (used >= blockSize) { 3221 // if (len >= bulk_width * blockSize) { 3222 // CTR_large_block(); 3223 // if (len == 0) 3224 // goto DONE; 3225 // } 3226 // for (;;) { 3227 // 16ByteVector v0 = counter; 3228 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3229 // used = 0; 3230 // if (len < blockSize) 3231 // break; /* goto NEXT */ 3232 // 16ByteVector v1 = load16Bytes(in, offset); 3233 // v1 = v1 ^ encryptedCounter; 3234 // store16Bytes(out, offset); 3235 // used = blockSize; 3236 // offset += blockSize; 3237 // len -= blockSize; 3238 // if (len == 0) 3239 // goto DONE; 3240 // } 3241 // } 3242 // NEXT: 3243 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3244 // len--; 3245 // } while (len != 0); 3246 // DONE: 3247 // return result; 3248 // 3249 // CTR_large_block() 3250 // Wide bulk encryption of whole blocks. 
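    // The counter is kept big-endian in memory: it is byte-reversed into
    // register form (rev64), bumped with be_add_128_64, which carries from
    // the least-significant 64-bit half into the most-significant half,
    // and reversed back before being stored to 'counter'. Conceptually:
    //
    //   counter = to_big_endian(from_big_endian(counter) + 1);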
3251 3252 __ align(CodeEntryAlignment); 3253 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id; 3254 StubCodeMark mark(this, stub_id); 3255 const address start = __ pc(); 3256 __ enter(); 3257 3258 Label DONE, CTR_large_block, large_block_return; 3259 __ ldrw(used, Address(used_ptr)); 3260 __ cbzw(saved_len, DONE); 3261 3262 __ mov(len, saved_len); 3263 __ mov(offset, 0); 3264 3265 // Compute #rounds for AES based on the length of the key array 3266 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3267 3268 __ aesenc_loadkeys(key, keylen); 3269 3270 { 3271 Label L_CTR_loop, NEXT; 3272 3273 __ bind(L_CTR_loop); 3274 3275 __ cmp(used, block_size); 3276 __ br(__ LO, NEXT); 3277 3278 // Maybe we have a lot of data 3279 __ subsw(rscratch1, len, bulk_width * block_size); 3280 __ br(__ HS, CTR_large_block); 3281 __ BIND(large_block_return); 3282 __ cbzw(len, DONE); 3283 3284 // Setup the counter 3285 __ movi(v4, __ T4S, 0); 3286 __ movi(v5, __ T4S, 1); 3287 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3288 3289 // 128-bit big-endian increment 3290 __ ld1(v0, __ T16B, counter); 3291 __ rev64(v16, __ T16B, v0); 3292 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3293 __ rev64(v16, __ T16B, v16); 3294 __ st1(v16, __ T16B, counter); 3295 // Previous counter value is in v0 3296 // v4 contains { 0, 1 } 3297 3298 { 3299 // We have fewer than bulk_width blocks of data left. Encrypt 3300 // them one by one until there is less than a full block 3301 // remaining, being careful to save both the encrypted counter 3302 // and the counter. 3303 3304 Label inner_loop; 3305 __ bind(inner_loop); 3306 // Counter to encrypt is in v0 3307 __ aesecb_encrypt(noreg, noreg, keylen); 3308 __ st1(v0, __ T16B, saved_encrypted_ctr); 3309 3310 // Do we have a remaining full block? 3311 3312 __ mov(used, 0); 3313 __ cmp(len, block_size); 3314 __ br(__ LO, NEXT); 3315 3316 // Yes, we have a full block 3317 __ ldrq(v1, Address(in, offset)); 3318 __ eor(v1, __ T16B, v1, v0); 3319 __ strq(v1, Address(out, offset)); 3320 __ mov(used, block_size); 3321 __ add(offset, offset, block_size); 3322 3323 __ subw(len, len, block_size); 3324 __ cbzw(len, DONE); 3325 3326 // Increment the counter, store it back 3327 __ orr(v0, __ T16B, v16, v16); 3328 __ rev64(v16, __ T16B, v16); 3329 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3330 __ rev64(v16, __ T16B, v16); 3331 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3332 3333 __ b(inner_loop); 3334 } 3335 3336 __ BIND(NEXT); 3337 3338 // Encrypt a single byte, and loop. 3339 // We expect this to be a rare event. 
3340 __ ldrb(rscratch1, Address(in, offset)); 3341 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3342 __ eor(rscratch1, rscratch1, rscratch2); 3343 __ strb(rscratch1, Address(out, offset)); 3344 __ add(offset, offset, 1); 3345 __ add(used, used, 1); 3346 __ subw(len, len,1); 3347 __ cbnzw(len, L_CTR_loop); 3348 } 3349 3350 __ bind(DONE); 3351 __ strw(used, Address(used_ptr)); 3352 __ mov(r0, saved_len); 3353 3354 __ leave(); // required for proper stackwalking of RuntimeStub frame 3355 __ ret(lr); 3356 3357 // Bulk encryption 3358 3359 __ BIND (CTR_large_block); 3360 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3361 3362 if (bulk_width == 8) { 3363 __ sub(sp, sp, 4 * 16); 3364 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3365 } 3366 __ sub(sp, sp, 4 * 16); 3367 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3368 RegSet saved_regs = (RegSet::of(in, out, offset) 3369 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3370 __ push(saved_regs, sp); 3371 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3372 __ add(in, in, offset); 3373 __ add(out, out, offset); 3374 3375 // Keys should already be loaded into the correct registers 3376 3377 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3378 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3379 3380 // AES/CTR loop 3381 { 3382 Label L_CTR_loop; 3383 __ BIND(L_CTR_loop); 3384 3385 // Setup the counters 3386 __ movi(v8, __ T4S, 0); 3387 __ movi(v9, __ T4S, 1); 3388 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3389 3390 for (int i = 0; i < bulk_width; i++) { 3391 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3392 __ rev64(v0_ofs, __ T16B, v16); 3393 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3394 } 3395 3396 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3397 3398 // Encrypt the counters 3399 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3400 3401 if (bulk_width == 8) { 3402 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3403 } 3404 3405 // XOR the encrypted counters with the inputs 3406 for (int i = 0; i < bulk_width; i++) { 3407 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3408 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3409 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3410 } 3411 3412 // Write the encrypted data 3413 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3414 if (bulk_width == 8) { 3415 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3416 } 3417 3418 __ subw(len, len, 16 * bulk_width); 3419 __ cbnzw(len, L_CTR_loop); 3420 } 3421 3422 // Save the counter back where it goes 3423 __ rev64(v16, __ T16B, v16); 3424 __ st1(v16, __ T16B, counter); 3425 3426 __ pop(saved_regs, sp); 3427 3428 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3429 if (bulk_width == 8) { 3430 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3431 } 3432 3433 __ andr(rscratch1, len, -16 * bulk_width); 3434 __ sub(len, len, rscratch1); 3435 __ add(offset, offset, rscratch1); 3436 __ mov(used, 16); 3437 __ strw(used, Address(used_ptr)); 3438 __ b(large_block_return); 3439 3440 return start; 3441 } 3442 3443 // Vector AES Galois Counter Mode implementation. 
Parameters: 3444 // 3445 // in = c_rarg0 3446 // len = c_rarg1 3447 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3448 // out = c_rarg3 3449 // key = c_rarg4 3450 // state = c_rarg5 - GHASH.state 3451 // subkeyHtbl = c_rarg6 - powers of H 3452 // counter = c_rarg7 - 16 bytes of CTR 3453 // return - number of processed bytes 3454 address generate_galoisCounterMode_AESCrypt() { 3455 Label ghash_polynomial; // local data generated after code 3456 3457 __ align(CodeEntryAlignment); 3458 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id; 3459 StubCodeMark mark(this, stub_id); 3460 address start = __ pc(); 3461 __ enter(); 3462 3463 const Register in = c_rarg0; 3464 const Register len = c_rarg1; 3465 const Register ct = c_rarg2; 3466 const Register out = c_rarg3; 3467 // and updated with the incremented counter in the end 3468 3469 const Register key = c_rarg4; 3470 const Register state = c_rarg5; 3471 3472 const Register subkeyHtbl = c_rarg6; 3473 3474 const Register counter = c_rarg7; 3475 3476 const Register keylen = r10; 3477 // Save state before entering routine 3478 __ sub(sp, sp, 4 * 16); 3479 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3480 __ sub(sp, sp, 4 * 16); 3481 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3482 3483 // __ andr(len, len, -512); 3484 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3485 __ str(len, __ pre(sp, -2 * wordSize)); 3486 3487 Label DONE; 3488 __ cbz(len, DONE); 3489 3490 // Compute #rounds for AES based on the length of the key array 3491 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3492 3493 __ aesenc_loadkeys(key, keylen); 3494 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3495 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3496 3497 // AES/CTR loop 3498 { 3499 Label L_CTR_loop; 3500 __ BIND(L_CTR_loop); 3501 3502 // Setup the counters 3503 __ movi(v8, __ T4S, 0); 3504 __ movi(v9, __ T4S, 1); 3505 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3506 3507 assert(v0->encoding() < v8->encoding(), ""); 3508 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3509 FloatRegister f = as_FloatRegister(i); 3510 __ rev32(f, __ T16B, v16); 3511 __ addv(v16, __ T4S, v16, v8); 3512 } 3513 3514 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3515 3516 // Encrypt the counters 3517 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3518 3519 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3520 3521 // XOR the encrypted counters with the inputs 3522 for (int i = 0; i < 8; i++) { 3523 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3524 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3525 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3526 } 3527 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3528 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3529 3530 __ subw(len, len, 16 * 8); 3531 __ cbnzw(len, L_CTR_loop); 3532 } 3533 3534 __ rev32(v16, __ T16B, v16); 3535 __ st1(v16, __ T16B, counter); 3536 3537 __ ldr(len, Address(sp)); 3538 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3539 3540 // GHASH/CTR loop 3541 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3542 len, /*unrolls*/4); 3543 3544 #ifdef ASSERT 3545 { Label L; 3546 __ cmp(len, (unsigned char)0); 3547 __ br(Assembler::EQ, L); 3548 __ stop("stubGenerator: abort"); 3549 __ bind(L); 3550 } 3551 #endif 3552 3553 __ 
bind(DONE); 3554 // Return the number of bytes processed 3555 __ ldr(r0, __ post(sp, 2 * wordSize)); 3556 3557 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3558 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3559 3560 __ leave(); // required for proper stackwalking of RuntimeStub frame 3561 __ ret(lr); 3562 3563 // bind label and generate polynomial data 3564 __ align(wordSize * 2); 3565 __ bind(ghash_polynomial); 3566 __ emit_int64(0x87); // The low-order bits of the field 3567 // polynomial (i.e. p = z^7+z^2+z+1) 3568 // repeated in the low and high parts of a 3569 // 128-bit vector 3570 __ emit_int64(0x87); 3571 3572 return start; 3573 } 3574 3575 class Cached64Bytes { 3576 private: 3577 MacroAssembler *_masm; 3578 Register _regs[8]; 3579 3580 public: 3581 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3582 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3583 auto it = rs.begin(); 3584 for (auto &r: _regs) { 3585 r = *it; 3586 ++it; 3587 } 3588 } 3589 3590 void gen_loads(Register base) { 3591 for (int i = 0; i < 8; i += 2) { 3592 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3593 } 3594 } 3595 3596 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3597 void extract_u32(Register dest, int i) { 3598 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3599 } 3600 }; 3601 3602 // Utility routines for md5. 3603 // Clobbers r10 and r11. 3604 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3605 int k, int s, int t) { 3606 Register rscratch3 = r10; 3607 Register rscratch4 = r11; 3608 3609 __ eorw(rscratch3, r3, r4); 3610 __ movw(rscratch2, t); 3611 __ andw(rscratch3, rscratch3, r2); 3612 __ addw(rscratch4, r1, rscratch2); 3613 reg_cache.extract_u32(rscratch1, k); 3614 __ eorw(rscratch3, rscratch3, r4); 3615 __ addw(rscratch4, rscratch4, rscratch1); 3616 __ addw(rscratch3, rscratch3, rscratch4); 3617 __ rorw(rscratch2, rscratch3, 32 - s); 3618 __ addw(r1, rscratch2, r2); 3619 } 3620 3621 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3622 int k, int s, int t) { 3623 Register rscratch3 = r10; 3624 Register rscratch4 = r11; 3625 3626 reg_cache.extract_u32(rscratch1, k); 3627 __ movw(rscratch2, t); 3628 __ addw(rscratch4, r1, rscratch2); 3629 __ addw(rscratch4, rscratch4, rscratch1); 3630 __ bicw(rscratch2, r3, r4); 3631 __ andw(rscratch3, r2, r4); 3632 __ addw(rscratch2, rscratch2, rscratch4); 3633 __ addw(rscratch2, rscratch2, rscratch3); 3634 __ rorw(rscratch2, rscratch2, 32 - s); 3635 __ addw(r1, rscratch2, r2); 3636 } 3637 3638 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3639 int k, int s, int t) { 3640 Register rscratch3 = r10; 3641 Register rscratch4 = r11; 3642 3643 __ eorw(rscratch3, r3, r4); 3644 __ movw(rscratch2, t); 3645 __ addw(rscratch4, r1, rscratch2); 3646 reg_cache.extract_u32(rscratch1, k); 3647 __ eorw(rscratch3, rscratch3, r2); 3648 __ addw(rscratch4, rscratch4, rscratch1); 3649 __ addw(rscratch3, rscratch3, rscratch4); 3650 __ rorw(rscratch2, rscratch3, 32 - s); 3651 __ addw(r1, rscratch2, r2); 3652 } 3653 3654 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3655 int k, int s, int t) { 3656 Register rscratch3 = r10; 3657 Register rscratch4 = r11; 3658 3659 __ movw(rscratch3, t); 3660 __ ornw(rscratch2, r2, r4); 3661 __ addw(rscratch4, r1, rscratch3); 3662 reg_cache.extract_u32(rscratch1, k); 3663 __ 
eorw(rscratch3, rscratch2, r3); 3664 __ addw(rscratch4, rscratch4, rscratch1); 3665 __ addw(rscratch3, rscratch3, rscratch4); 3666 __ rorw(rscratch2, rscratch3, 32 - s); 3667 __ addw(r1, rscratch2, r2); 3668 } 3669 3670 // Arguments: 3671 // 3672 // Inputs: 3673 // c_rarg0 - byte[] source+offset 3674 // c_rarg1 - int[] SHA.state 3675 // c_rarg2 - int offset 3676 // c_rarg3 - int limit 3677 // 3678 address generate_md5_implCompress(StubId stub_id) { 3679 bool multi_block; 3680 switch (stub_id) { 3681 case StubId::stubgen_md5_implCompress_id: 3682 multi_block = false; 3683 break; 3684 case StubId::stubgen_md5_implCompressMB_id: 3685 multi_block = true; 3686 break; 3687 default: 3688 ShouldNotReachHere(); 3689 } 3690 __ align(CodeEntryAlignment); 3691 3692 StubCodeMark mark(this, stub_id); 3693 address start = __ pc(); 3694 3695 Register buf = c_rarg0; 3696 Register state = c_rarg1; 3697 Register ofs = c_rarg2; 3698 Register limit = c_rarg3; 3699 Register a = r4; 3700 Register b = r5; 3701 Register c = r6; 3702 Register d = r7; 3703 Register rscratch3 = r10; 3704 Register rscratch4 = r11; 3705 3706 Register state_regs[2] = { r12, r13 }; 3707 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3708 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3709 3710 __ push(saved_regs, sp); 3711 3712 __ ldp(state_regs[0], state_regs[1], Address(state)); 3713 __ ubfx(a, state_regs[0], 0, 32); 3714 __ ubfx(b, state_regs[0], 32, 32); 3715 __ ubfx(c, state_regs[1], 0, 32); 3716 __ ubfx(d, state_regs[1], 32, 32); 3717 3718 Label md5_loop; 3719 __ BIND(md5_loop); 3720 3721 reg_cache.gen_loads(buf); 3722 3723 // Round 1 3724 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3725 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3726 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3727 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3728 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3729 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3730 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3731 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3732 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3733 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3734 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3735 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3736 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3737 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3738 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3739 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3740 3741 // Round 2 3742 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3743 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3744 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3745 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3746 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3747 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3748 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3749 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3750 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3751 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3752 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3753 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3754 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3755 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3756 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3757 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3758 3759 // Round 3 3760 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3761 md5_HH(reg_cache, d, a, 
b, c, 8, 11, 0x8771f681); 3762 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3763 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3764 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3765 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3766 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3767 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3768 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3769 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3770 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3771 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3772 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3773 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3774 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3775 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3776 3777 // Round 4 3778 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3779 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3780 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3781 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3782 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3783 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3784 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3785 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3786 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3787 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3788 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3789 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3790 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3791 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3792 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3793 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3794 3795 __ addw(a, state_regs[0], a); 3796 __ ubfx(rscratch2, state_regs[0], 32, 32); 3797 __ addw(b, rscratch2, b); 3798 __ addw(c, state_regs[1], c); 3799 __ ubfx(rscratch4, state_regs[1], 32, 32); 3800 __ addw(d, rscratch4, d); 3801 3802 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3803 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3804 3805 if (multi_block) { 3806 __ add(buf, buf, 64); 3807 __ add(ofs, ofs, 64); 3808 __ cmp(ofs, limit); 3809 __ br(Assembler::LE, md5_loop); 3810 __ mov(c_rarg0, ofs); // return ofs 3811 } 3812 3813 // write hash values back in the correct order 3814 __ stp(state_regs[0], state_regs[1], Address(state)); 3815 3816 __ pop(saved_regs, sp); 3817 3818 __ ret(lr); 3819 3820 return start; 3821 } 3822 3823 // Arguments: 3824 // 3825 // Inputs: 3826 // c_rarg0 - byte[] source+offset 3827 // c_rarg1 - int[] SHA.state 3828 // c_rarg2 - int offset 3829 // c_rarg3 - int limit 3830 // 3831 address generate_sha1_implCompress(StubId stub_id) { 3832 bool multi_block; 3833 switch (stub_id) { 3834 case StubId::stubgen_sha1_implCompress_id: 3835 multi_block = false; 3836 break; 3837 case StubId::stubgen_sha1_implCompressMB_id: 3838 multi_block = true; 3839 break; 3840 default: 3841 ShouldNotReachHere(); 3842 } 3843 3844 __ align(CodeEntryAlignment); 3845 3846 StubCodeMark mark(this, stub_id); 3847 address start = __ pc(); 3848 3849 Register buf = c_rarg0; 3850 Register state = c_rarg1; 3851 Register ofs = c_rarg2; 3852 Register limit = c_rarg3; 3853 3854 Label keys; 3855 Label sha1_loop; 3856 3857 // load the keys into v0..v3 3858 __ adr(rscratch1, keys); 3859 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3860 // load 5 words state into v6, v7 3861 __ ldrq(v6, Address(state, 0)); 3862 __ ldrs(v7, Address(state, 16)); 3863 3864 3865 __ BIND(sha1_loop); 3866 // load 64 bytes of data into 
v16..v19 3867 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3868 __ rev32(v16, __ T16B, v16); 3869 __ rev32(v17, __ T16B, v17); 3870 __ rev32(v18, __ T16B, v18); 3871 __ rev32(v19, __ T16B, v19); 3872 3873 // do the sha1 3874 __ addv(v4, __ T4S, v16, v0); 3875 __ orr(v20, __ T16B, v6, v6); 3876 3877 FloatRegister d0 = v16; 3878 FloatRegister d1 = v17; 3879 FloatRegister d2 = v18; 3880 FloatRegister d3 = v19; 3881 3882 for (int round = 0; round < 20; round++) { 3883 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3884 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3885 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3886 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3887 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3888 3889 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3890 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3891 __ sha1h(tmp2, __ T4S, v20); 3892 if (round < 5) 3893 __ sha1c(v20, __ T4S, tmp3, tmp4); 3894 else if (round < 10 || round >= 15) 3895 __ sha1p(v20, __ T4S, tmp3, tmp4); 3896 else 3897 __ sha1m(v20, __ T4S, tmp3, tmp4); 3898 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3899 3900 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3901 } 3902 3903 __ addv(v7, __ T2S, v7, v21); 3904 __ addv(v6, __ T4S, v6, v20); 3905 3906 if (multi_block) { 3907 __ add(ofs, ofs, 64); 3908 __ cmp(ofs, limit); 3909 __ br(Assembler::LE, sha1_loop); 3910 __ mov(c_rarg0, ofs); // return ofs 3911 } 3912 3913 __ strq(v6, Address(state, 0)); 3914 __ strs(v7, Address(state, 16)); 3915 3916 __ ret(lr); 3917 3918 __ bind(keys); 3919 __ emit_int32(0x5a827999); 3920 __ emit_int32(0x6ed9eba1); 3921 __ emit_int32(0x8f1bbcdc); 3922 __ emit_int32(0xca62c1d6); 3923 3924 return start; 3925 } 3926 3927 3928 // Arguments: 3929 // 3930 // Inputs: 3931 // c_rarg0 - byte[] source+offset 3932 // c_rarg1 - int[] SHA.state 3933 // c_rarg2 - int offset 3934 // c_rarg3 - int limit 3935 // 3936 address generate_sha256_implCompress(StubId stub_id) { 3937 bool multi_block; 3938 switch (stub_id) { 3939 case StubId::stubgen_sha256_implCompress_id: 3940 multi_block = false; 3941 break; 3942 case StubId::stubgen_sha256_implCompressMB_id: 3943 multi_block = true; 3944 break; 3945 default: 3946 ShouldNotReachHere(); 3947 } 3948 3949 static const uint32_t round_consts[64] = { 3950 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3951 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3952 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3953 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3954 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3955 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3956 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3957 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3958 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3959 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3960 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3961 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3962 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3963 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3964 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3965 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3966 }; 3967 3968 __ align(CodeEntryAlignment); 3969 3970 StubCodeMark mark(this, stub_id); 3971 address start = __ pc(); 3972 3973 Register buf = c_rarg0; 3974 Register state = c_rarg1; 3975 Register ofs = c_rarg2; 3976 Register limit = c_rarg3; 3977 3978 Label sha1_loop; 3979 3980 __ stpd(v8, v9, __ pre(sp, -32)); 3981 __ stpd(v10, v11, Address(sp, 
16)); 3982 3983 // dga == v0 3984 // dgb == v1 3985 // dg0 == v2 3986 // dg1 == v3 3987 // dg2 == v4 3988 // t0 == v6 3989 // t1 == v7 3990 3991 // load 16 keys to v16..v31 3992 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3993 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3994 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3995 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3996 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3997 3998 // load 8 words (256 bits) state 3999 __ ldpq(v0, v1, state); 4000 4001 __ BIND(sha1_loop); 4002 // load 64 bytes of data into v8..v11 4003 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 4004 __ rev32(v8, __ T16B, v8); 4005 __ rev32(v9, __ T16B, v9); 4006 __ rev32(v10, __ T16B, v10); 4007 __ rev32(v11, __ T16B, v11); 4008 4009 __ addv(v6, __ T4S, v8, v16); 4010 __ orr(v2, __ T16B, v0, v0); 4011 __ orr(v3, __ T16B, v1, v1); 4012 4013 FloatRegister d0 = v8; 4014 FloatRegister d1 = v9; 4015 FloatRegister d2 = v10; 4016 FloatRegister d3 = v11; 4017 4018 4019 for (int round = 0; round < 16; round++) { 4020 FloatRegister tmp1 = (round & 1) ? v6 : v7; 4021 FloatRegister tmp2 = (round & 1) ? v7 : v6; 4022 FloatRegister tmp3 = (round & 1) ? v2 : v4; 4023 FloatRegister tmp4 = (round & 1) ? v4 : v2; 4024 4025 if (round < 12) __ sha256su0(d0, __ T4S, d1); 4026 __ orr(v4, __ T16B, v2, v2); 4027 if (round < 15) 4028 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 4029 __ sha256h(v2, __ T4S, v3, tmp2); 4030 __ sha256h2(v3, __ T4S, v4, tmp2); 4031 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 4032 4033 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 4034 } 4035 4036 __ addv(v0, __ T4S, v0, v2); 4037 __ addv(v1, __ T4S, v1, v3); 4038 4039 if (multi_block) { 4040 __ add(ofs, ofs, 64); 4041 __ cmp(ofs, limit); 4042 __ br(Assembler::LE, sha1_loop); 4043 __ mov(c_rarg0, ofs); // return ofs 4044 } 4045 4046 __ ldpd(v10, v11, Address(sp, 16)); 4047 __ ldpd(v8, v9, __ post(sp, 32)); 4048 4049 __ stpq(v0, v1, state); 4050 4051 __ ret(lr); 4052 4053 return start; 4054 } 4055 4056 // Double rounds for sha512. 
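  // For reference, each double round below corresponds to two consecutive
  // scalar SHA-512 compression rounds (FIPS 180-4). A rough sketch of one
  // scalar round, with the usual W (message schedule) and K (round constant)
  // indexing assumed, is:
  //
  //   T1 = h + Sigma1(e) + Ch(e, f, g) + K[t] + W[t];
  //   T2 = Sigma0(a) + Maj(a, b, c);
  //   h = g; g = f; f = e; e = d + T1;
  //   d = c; c = b; b = a; a = T1 + T2;
  //
  // The sha512h/sha512h2 instructions evaluate this update for a pair of
  // rounds on T2D lanes, while sha512su0/sha512su1 extend the message
  // schedule for later rounds.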
4057 void sha512_dround(int dr, 4058 FloatRegister vi0, FloatRegister vi1, 4059 FloatRegister vi2, FloatRegister vi3, 4060 FloatRegister vi4, FloatRegister vrc0, 4061 FloatRegister vrc1, FloatRegister vin0, 4062 FloatRegister vin1, FloatRegister vin2, 4063 FloatRegister vin3, FloatRegister vin4) { 4064 if (dr < 36) { 4065 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 4066 } 4067 __ addv(v5, __ T2D, vrc0, vin0); 4068 __ ext(v6, __ T16B, vi2, vi3, 8); 4069 __ ext(v5, __ T16B, v5, v5, 8); 4070 __ ext(v7, __ T16B, vi1, vi2, 8); 4071 __ addv(vi3, __ T2D, vi3, v5); 4072 if (dr < 32) { 4073 __ ext(v5, __ T16B, vin3, vin4, 8); 4074 __ sha512su0(vin0, __ T2D, vin1); 4075 } 4076 __ sha512h(vi3, __ T2D, v6, v7); 4077 if (dr < 32) { 4078 __ sha512su1(vin0, __ T2D, vin2, v5); 4079 } 4080 __ addv(vi4, __ T2D, vi1, vi3); 4081 __ sha512h2(vi3, __ T2D, vi1, vi0); 4082 } 4083 4084 // Arguments: 4085 // 4086 // Inputs: 4087 // c_rarg0 - byte[] source+offset 4088 // c_rarg1 - int[] SHA.state 4089 // c_rarg2 - int offset 4090 // c_rarg3 - int limit 4091 // 4092 address generate_sha512_implCompress(StubId stub_id) { 4093 bool multi_block; 4094 switch (stub_id) { 4095 case StubId::stubgen_sha512_implCompress_id: 4096 multi_block = false; 4097 break; 4098 case StubId::stubgen_sha512_implCompressMB_id: 4099 multi_block = true; 4100 break; 4101 default: 4102 ShouldNotReachHere(); 4103 } 4104 4105 static const uint64_t round_consts[80] = { 4106 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 4107 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 4108 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 4109 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 4110 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 4111 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 4112 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 4113 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 4114 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 4115 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 4116 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 4117 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 4118 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 4119 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 4120 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 4121 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 4122 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 4123 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 4124 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 4125 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 4126 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 4127 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 4128 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 4129 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 4130 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 4131 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 4132 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 4133 }; 4134 4135 __ align(CodeEntryAlignment); 4136 4137 StubCodeMark mark(this, stub_id); 4138 address start = __ pc(); 4139 4140 Register buf = c_rarg0; 4141 Register state = c_rarg1; 4142 Register ofs = c_rarg2; 4143 Register limit = c_rarg3; 4144 4145 __ stpd(v8, v9, __ pre(sp, -64)); 4146 __ 
stpd(v10, v11, Address(sp, 16)); 4147 __ stpd(v12, v13, Address(sp, 32)); 4148 __ stpd(v14, v15, Address(sp, 48)); 4149 4150 Label sha512_loop; 4151 4152 // load state 4153 __ ld1(v8, v9, v10, v11, __ T2D, state); 4154 4155 // load first 4 round constants 4156 __ lea(rscratch1, ExternalAddress((address)round_consts)); 4157 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 4158 4159 __ BIND(sha512_loop); 4160 // load 128B of data into v12..v19 4161 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 4162 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 4163 __ rev64(v12, __ T16B, v12); 4164 __ rev64(v13, __ T16B, v13); 4165 __ rev64(v14, __ T16B, v14); 4166 __ rev64(v15, __ T16B, v15); 4167 __ rev64(v16, __ T16B, v16); 4168 __ rev64(v17, __ T16B, v17); 4169 __ rev64(v18, __ T16B, v18); 4170 __ rev64(v19, __ T16B, v19); 4171 4172 __ mov(rscratch2, rscratch1); 4173 4174 __ mov(v0, __ T16B, v8); 4175 __ mov(v1, __ T16B, v9); 4176 __ mov(v2, __ T16B, v10); 4177 __ mov(v3, __ T16B, v11); 4178 4179 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4180 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4181 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4182 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4183 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4184 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4185 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4186 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4187 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4188 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4189 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4190 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4191 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4192 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 4193 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4194 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4195 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4196 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4197 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4198 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4199 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4200 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4201 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4202 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4203 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4204 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4205 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4206 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4207 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4208 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4209 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4210 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4211 
sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 4212 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4213 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4214 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4215 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4216 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4217 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4218 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4219 4220 __ addv(v8, __ T2D, v8, v0); 4221 __ addv(v9, __ T2D, v9, v1); 4222 __ addv(v10, __ T2D, v10, v2); 4223 __ addv(v11, __ T2D, v11, v3); 4224 4225 if (multi_block) { 4226 __ add(ofs, ofs, 128); 4227 __ cmp(ofs, limit); 4228 __ br(Assembler::LE, sha512_loop); 4229 __ mov(c_rarg0, ofs); // return ofs 4230 } 4231 4232 __ st1(v8, v9, v10, v11, __ T2D, state); 4233 4234 __ ldpd(v14, v15, Address(sp, 48)); 4235 __ ldpd(v12, v13, Address(sp, 32)); 4236 __ ldpd(v10, v11, Address(sp, 16)); 4237 __ ldpd(v8, v9, __ post(sp, 64)); 4238 4239 __ ret(lr); 4240 4241 return start; 4242 } 4243 4244 // Execute one round of keccak of two computations in parallel. 4245 // One of the states should be loaded into the lower halves of 4246 // the vector registers v0-v24, the other should be loaded into 4247 // the upper halves of those registers. The ld1r instruction loads 4248 // the round constant into both halves of register v31. 4249 // Intermediate results c0...c5 and d0...d5 are computed 4250 // in registers v25...v30. 4251 // All vector instructions that are used operate on both register 4252 // halves in parallel. 4253 // If only a single computation is needed, one can only load the lower halves. 
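  // For reference, the scalar semantics of the Armv8 SHA3-extension
  // instructions used by keccak_round below are (an illustrative sketch;
  // rotl/rotr denote rotations applied per 64-bit lane, arrangement
  // operands elided):
  //
  //   eor3 d, n, m, a  : d = n ^ m ^ a
  //   rax1 d, n, m     : d = n ^ rotl(m, 1)
  //   xar  d, n, m, #s : d = rotr(n ^ m, s)   (so "64 - r" rotates left by r)
  //   bcax d, n, m, a  : d = n ^ (m & ~a)
  //
  // For example, the first five eor3 instructions compute the theta-step
  // column parities c0..c4, equivalent to the scalar loop
  //
  //   for (int x = 0; x < 5; x++) {
  //     c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
  //   }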
4254 void keccak_round(Register rscratch1) {
4255 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
4256 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
4257 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4258 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4259 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4260 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4261 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4262 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4263 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4264 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4265
4266 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4267 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4268 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4269 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4270 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4271
4272 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4273 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
4274 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4275 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4276 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4277 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4278 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4279 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4280 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4281 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4282 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4283 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4284 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4285 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4286 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4287 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4288 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4289 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4290 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4291 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4292 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4293 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4294 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4295 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4296 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4297
4298 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22')
4299 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4300 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4301 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4302 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4303
4304 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4305
4306 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4307 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15')
4308 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4309 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4310 __ bcax(v16, __ T16B, v16, v3, v25);
// a16 = a16 ^ (~a17' & a18') 4311 4312 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4313 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4314 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4315 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4316 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4317 4318 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4319 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4320 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4321 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4322 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4323 4324 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4325 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4326 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4327 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4328 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4329 4330 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4331 } 4332 4333 // Arguments: 4334 // 4335 // Inputs: 4336 // c_rarg0 - byte[] source+offset 4337 // c_rarg1 - byte[] SHA.state 4338 // c_rarg2 - int block_size 4339 // c_rarg3 - int offset 4340 // c_rarg4 - int limit 4341 // 4342 address generate_sha3_implCompress(StubId stub_id) { 4343 bool multi_block; 4344 switch (stub_id) { 4345 case StubId::stubgen_sha3_implCompress_id: 4346 multi_block = false; 4347 break; 4348 case StubId::stubgen_sha3_implCompressMB_id: 4349 multi_block = true; 4350 break; 4351 default: 4352 ShouldNotReachHere(); 4353 } 4354 4355 static const uint64_t round_consts[24] = { 4356 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4357 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4358 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4359 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4360 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4361 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4362 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4363 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4364 }; 4365 4366 __ align(CodeEntryAlignment); 4367 4368 StubCodeMark mark(this, stub_id); 4369 address start = __ pc(); 4370 4371 Register buf = c_rarg0; 4372 Register state = c_rarg1; 4373 Register block_size = c_rarg2; 4374 Register ofs = c_rarg3; 4375 Register limit = c_rarg4; 4376 4377 Label sha3_loop, rounds24_loop; 4378 Label sha3_512_or_sha3_384, shake128; 4379 4380 __ stpd(v8, v9, __ pre(sp, -64)); 4381 __ stpd(v10, v11, Address(sp, 16)); 4382 __ stpd(v12, v13, Address(sp, 32)); 4383 __ stpd(v14, v15, Address(sp, 48)); 4384 4385 // load state 4386 __ add(rscratch1, state, 32); 4387 __ ld1(v0, v1, v2, v3, __ T1D, state); 4388 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4389 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4390 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4391 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4392 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4393 __ ld1(v24, __ T1D, rscratch1); 4394 4395 __ BIND(sha3_loop); 4396 4397 // 24 keccak rounds 4398 __ movw(rscratch2, 24); 4399 4400 // load round_constants base 4401 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4402 4403 // load input 4404 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4405 __ ld1(v29, v30, v31, 
__ T8B, __ post(buf, 24)); 4406 __ eor(v0, __ T8B, v0, v25); 4407 __ eor(v1, __ T8B, v1, v26); 4408 __ eor(v2, __ T8B, v2, v27); 4409 __ eor(v3, __ T8B, v3, v28); 4410 __ eor(v4, __ T8B, v4, v29); 4411 __ eor(v5, __ T8B, v5, v30); 4412 __ eor(v6, __ T8B, v6, v31); 4413 4414 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4415 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4416 4417 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4418 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4419 __ eor(v7, __ T8B, v7, v25); 4420 __ eor(v8, __ T8B, v8, v26); 4421 __ eor(v9, __ T8B, v9, v27); 4422 __ eor(v10, __ T8B, v10, v28); 4423 __ eor(v11, __ T8B, v11, v29); 4424 __ eor(v12, __ T8B, v12, v30); 4425 __ eor(v13, __ T8B, v13, v31); 4426 4427 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4428 __ eor(v14, __ T8B, v14, v25); 4429 __ eor(v15, __ T8B, v15, v26); 4430 __ eor(v16, __ T8B, v16, v27); 4431 4432 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4433 __ andw(c_rarg5, block_size, 48); 4434 __ cbzw(c_rarg5, rounds24_loop); 4435 4436 __ tbnz(block_size, 5, shake128); 4437 // block_size == 144, bit5 == 0, SHA3-224 4438 __ ldrd(v28, __ post(buf, 8)); 4439 __ eor(v17, __ T8B, v17, v28); 4440 __ b(rounds24_loop); 4441 4442 __ BIND(shake128); 4443 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4444 __ eor(v17, __ T8B, v17, v28); 4445 __ eor(v18, __ T8B, v18, v29); 4446 __ eor(v19, __ T8B, v19, v30); 4447 __ eor(v20, __ T8B, v20, v31); 4448 __ b(rounds24_loop); // block_size == 168, SHAKE128 4449 4450 __ BIND(sha3_512_or_sha3_384); 4451 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4452 __ eor(v7, __ T8B, v7, v25); 4453 __ eor(v8, __ T8B, v8, v26); 4454 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4455 4456 // SHA3-384 4457 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4458 __ eor(v9, __ T8B, v9, v27); 4459 __ eor(v10, __ T8B, v10, v28); 4460 __ eor(v11, __ T8B, v11, v29); 4461 __ eor(v12, __ T8B, v12, v30); 4462 4463 __ BIND(rounds24_loop); 4464 __ subw(rscratch2, rscratch2, 1); 4465 4466 keccak_round(rscratch1); 4467 4468 __ cbnzw(rscratch2, rounds24_loop); 4469 4470 if (multi_block) { 4471 __ add(ofs, ofs, block_size); 4472 __ cmp(ofs, limit); 4473 __ br(Assembler::LE, sha3_loop); 4474 __ mov(c_rarg0, ofs); // return ofs 4475 } 4476 4477 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4478 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4479 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4480 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4481 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4482 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4483 __ st1(v24, __ T1D, state); 4484 4485 // restore callee-saved registers 4486 __ ldpd(v14, v15, Address(sp, 48)); 4487 __ ldpd(v12, v13, Address(sp, 32)); 4488 __ ldpd(v10, v11, Address(sp, 16)); 4489 __ ldpd(v8, v9, __ post(sp, 64)); 4490 4491 __ ret(lr); 4492 4493 return start; 4494 } 4495 4496 // Inputs: 4497 // c_rarg0 - long[] state0 4498 // c_rarg1 - long[] state1 4499 address generate_double_keccak() { 4500 static const uint64_t round_consts[24] = { 4501 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4502 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4503 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4504 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4505 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4506 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4507 
0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4508 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4509 };
4510
4511 // Implements the double_keccak() method of the
4512 // sun.security.provider.SHA3Parallel class
4513 __ align(CodeEntryAlignment);
4514 StubCodeMark mark(this, "StubRoutines", "double_keccak");
4515 address start = __ pc();
4516 __ enter();
4517
4518 Register state0 = c_rarg0;
4519 Register state1 = c_rarg1;
4520
4521 Label rounds24_loop;
4522
4523 // save callee-saved registers
4524 __ stpd(v8, v9, __ pre(sp, -64));
4525 __ stpd(v10, v11, Address(sp, 16));
4526 __ stpd(v12, v13, Address(sp, 32));
4527 __ stpd(v14, v15, Address(sp, 48));
4528
4529 // load states
4530 __ add(rscratch1, state0, 32);
4531 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
4532 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
4533 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
4534 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
4535 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
4536 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
4537 __ ld1(v24, __ D, 0, rscratch1);
4538 __ add(rscratch1, state1, 32);
4539 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
4540 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
4541 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
4542 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
4543 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
4544 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
4545 __ ld1(v24, __ D, 1, rscratch1);
4546
4547 // 24 keccak rounds
4548 __ movw(rscratch2, 24);
4549
4550 // load round_constants base
4551 __ lea(rscratch1, ExternalAddress((address) round_consts));
4552
4553 __ BIND(rounds24_loop);
4554 __ subw(rscratch2, rscratch2, 1);
4555 keccak_round(rscratch1);
4556 __ cbnzw(rscratch2, rounds24_loop);
4557
4558 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
4559 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
4560 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
4561 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
4562 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
4563 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
4564 __ st1(v24, __ D, 0, state0);
4565 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
4566 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
4567 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
4568 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
4569 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
4570 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
4571 __ st1(v24, __ D, 1, state1);
4572
4573 // restore callee-saved vector registers
4574 __ ldpd(v14, v15, Address(sp, 48));
4575 __ ldpd(v12, v13, Address(sp, 32));
4576 __ ldpd(v10, v11, Address(sp, 16));
4577 __ ldpd(v8, v9, __ post(sp, 64));
4578
4579 __ leave(); // required for proper stackwalking of RuntimeStub frame
4580 __ mov(r0, zr); // return 0
4581 __ ret(lr);
4582
4583 return start;
4584 }
4585
4586 // ChaCha20 block function. This version parallelizes the 32-bit
4587 // state elements on each of 16 vectors, producing 4 blocks of
4588 // keystream at a time.
4589 // 4590 // state (int[16]) = c_rarg0 4591 // keystream (byte[256]) = c_rarg1 4592 // return - number of bytes of produced keystream (always 256) 4593 // 4594 // This implementation takes each 32-bit integer from the state 4595 // array and broadcasts it across all 4 32-bit lanes of a vector register 4596 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes 4597 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors, 4598 // the quarter round schedule is implemented as outlined in RFC 7539 section 4599 // 2.3. However, instead of sequentially processing the 3 quarter round 4600 // operations represented by one QUARTERROUND function, we instead stack all 4601 // the adds, xors and left-rotations from the first 4 quarter rounds together 4602 // and then do the same for the second set of 4 quarter rounds. This removes 4603 // some latency that would otherwise be incurred by waiting for an add to 4604 // complete before performing an xor (which depends on the result of the 4605 // add), etc. An adjustment happens between the first and second groups of 4 4606 // quarter rounds, but this is done only in the inputs to the macro functions 4607 // that generate the assembly instructions - these adjustments themselves are 4608 // not part of the resulting assembly. 4609 // The 4 registers v0-v3 are used during the quarter round operations as 4610 // scratch registers. Once the 20 rounds are complete, these 4 scratch 4611 // registers become the vectors involved in adding the start state back onto 4612 // the post-QR working state. After the adds are complete, each of the 16 4613 // vectors write their first lane back to the keystream buffer, followed 4614 // by the second lane from all vectors and so on. 4615 address generate_chacha20Block_blockpar() { 4616 Label L_twoRounds, L_cc20_const; 4617 __ align(CodeEntryAlignment); 4618 StubId stub_id = StubId::stubgen_chacha20Block_id; 4619 StubCodeMark mark(this, stub_id); 4620 address start = __ pc(); 4621 __ enter(); 4622 4623 int i, j; 4624 const Register state = c_rarg0; 4625 const Register keystream = c_rarg1; 4626 const Register loopCtr = r10; 4627 const Register tmpAddr = r11; 4628 const FloatRegister ctrAddOverlay = v28; 4629 const FloatRegister lrot8Tbl = v29; 4630 4631 // Organize SIMD registers in an array that facilitates 4632 // putting repetitive opcodes into loop structures. It is 4633 // important that each grouping of 4 registers is monotonically 4634 // increasing to support the requirements of multi-register 4635 // instructions (e.g. ld4r, st4, etc.) 4636 const FloatRegister workSt[16] = { 4637 v4, v5, v6, v7, v16, v17, v18, v19, 4638 v20, v21, v22, v23, v24, v25, v26, v27 4639 }; 4640 4641 // Pull in constant data. The first 16 bytes are the add overlay 4642 // which is applied to the vector holding the counter (state[12]). 4643 // The second 16 bytes is the index register for the 8-bit left 4644 // rotation tbl instruction. 4645 __ adr(tmpAddr, L_cc20_const); 4646 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr)); 4647 4648 // Load from memory and interlace across 16 SIMD registers, 4649 // With each word from memory being broadcast to all lanes of 4650 // each successive SIMD register. 4651 // Addr(0) -> All lanes in workSt[i] 4652 // Addr(4) -> All lanes workSt[i + 1], etc. 
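    // In scalar terms, the ld4r loop below has the effect of (an
    // illustrative sketch only; workSt/state are modelled as plain arrays):
    //
    //   for (int i = 0; i < 16; i++) {
    //     for (int lane = 0; lane < 4; lane++) {
    //       workSt[i][lane] = state[i];  // broadcast state[i] to all 4 lanes
    //     }
    //   }
    //
    // so lane k of the 16 working vectors holds a complete copy of the input
    // state; the ctrAddOverlay add that follows gives lane k a block counter
    // of state[12] + k, which is why lane k produces keystream block k.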
4653 __ mov(tmpAddr, state);
4654 for (i = 0; i < 16; i += 4) {
4655 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4656 __ post(tmpAddr, 16));
4657 }
4658 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4659
4660 // Before entering the loop, create 5 4-register arrays. These
4661 // will hold the 4 registers that represent the a/b/c/d fields
4662 // in the quarter round operation. For instance the "b" field
4663 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
4664 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
4665 // since it is part of a diagonal organization. The aSet and scratch
4666 // register sets are defined at declaration time because they do not change
4667 // organization at any point during the 20-round processing.
4668 FloatRegister aSet[4] = { v4, v5, v6, v7 };
4669 FloatRegister bSet[4];
4670 FloatRegister cSet[4];
4671 FloatRegister dSet[4];
4672 FloatRegister scratch[4] = { v0, v1, v2, v3 };
4673
4674 // Set up the 10 iteration loop and perform all 8 quarter round ops
4675 __ mov(loopCtr, 10);
4676 __ BIND(L_twoRounds);
4677
4678 // Set to columnar organization and do the following 4 quarter-rounds:
4679 // QUARTERROUND(0, 4, 8, 12)
4680 // QUARTERROUND(1, 5, 9, 13)
4681 // QUARTERROUND(2, 6, 10, 14)
4682 // QUARTERROUND(3, 7, 11, 15)
4683 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
4684 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
4685 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
4686
4687 __ cc20_qr_add4(aSet, bSet); // a += b
4688 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4689 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4690
4691 __ cc20_qr_add4(cSet, dSet); // c += d
4692 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4693 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4694
4695 __ cc20_qr_add4(aSet, bSet); // a += b
4696 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4697 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4698
4699 __ cc20_qr_add4(cSet, dSet); // c += d
4700 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4701 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
4702
4703 // Set to diagonal organization and do the next 4 quarter-rounds:
4704 // QUARTERROUND(0, 5, 10, 15)
4705 // QUARTERROUND(1, 6, 11, 12)
4706 // QUARTERROUND(2, 7, 8, 13)
4707 // QUARTERROUND(3, 4, 9, 14)
4708 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
4709 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
4710 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
4711
4712 __ cc20_qr_add4(aSet, bSet); // a += b
4713 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4714 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4715
4716 __ cc20_qr_add4(cSet, dSet); // c += d
4717 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4718 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4719
4720 __ cc20_qr_add4(aSet, bSet); // a += b
4721 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4722 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4723
4724 __ cc20_qr_add4(cSet, dSet); // c += d
4725 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4726 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
4727
4728 // Decrement and iterate
4729 __ sub(loopCtr, loopCtr, 1);
4730 __ cbnz(loopCtr, L_twoRounds);
4731
4732 __ mov(tmpAddr, state);
4733
4734 // Add the starting state back to the post-loop keystream
4735 // state.
We read/interlace the state array from memory into 4736 // 4 registers similar to what we did in the beginning. Then 4737 // add the counter overlay onto workSt[12] at the end. 4738 for (i = 0; i < 16; i += 4) { 4739 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16)); 4740 __ addv(workSt[i], __ T4S, workSt[i], v0); 4741 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1); 4742 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2); 4743 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3); 4744 } 4745 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4746 4747 // Write working state into the keystream buffer. This is accomplished 4748 // by taking the lane "i" from each of the four vectors and writing 4749 // it to consecutive 4-byte offsets, then post-incrementing by 16 and 4750 // repeating with the next 4 vectors until all 16 vectors have been used. 4751 // Then move to the next lane and repeat the process until all lanes have 4752 // been written. 4753 for (i = 0; i < 4; i++) { 4754 for (j = 0; j < 16; j += 4) { 4755 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4756 __ post(keystream, 16)); 4757 } 4758 } 4759 4760 __ mov(r0, 256); // Return length of output keystream 4761 __ leave(); 4762 __ ret(lr); 4763 4764 // bind label and generate local constant data used by this stub 4765 // The constant data is broken into two 128-bit segments to be loaded 4766 // onto FloatRegisters. The first 128 bits are a counter add overlay 4767 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4768 // The second 128-bits is a table constant used for 8-bit left rotations. 4769 __ BIND(L_cc20_const); 4770 __ emit_int64(0x0000000100000000UL); 4771 __ emit_int64(0x0000000300000002UL); 4772 __ emit_int64(0x0605040702010003UL); 4773 __ emit_int64(0x0E0D0C0F0A09080BUL); 4774 4775 return start; 4776 } 4777 4778 // Helpers to schedule parallel operation bundles across vector 4779 // register sequences of size 2, 4 or 8. 
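  // For example (an illustrative sketch only -- the register choices are
  // arbitrary and VSeq is assumed to be the vector-sequence helper declared
  // in macroAssembler_aarch64.hpp):
  //
  //   VSeq<4> va(v0);               // v0, v1, v2, v3
  //   VSeq<4> vb(v4);               // v4, v5, v6, v7
  //   vs_addv(va, __ T4S, va, vb);  // emits addv(v0, T4S, v0, v4) ...
  //                                 //       addv(v3, T4S, v3, v7)
  //
  // The asserts in each helper guard against sequences where an output
  // register would overwrite an input register that a later step of the
  // bundle still needs to read.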
4780 4781 // Implement various primitive computations across vector sequences 4782 4783 template<int N> 4784 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4785 const VSeq<N>& v1, const VSeq<N>& v2) { 4786 // output must not be constant 4787 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4788 // output cannot overwrite pending inputs 4789 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4790 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4791 for (int i = 0; i < N; i++) { 4792 __ addv(v[i], T, v1[i], v2[i]); 4793 } 4794 } 4795 4796 template<int N> 4797 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4798 const VSeq<N>& v1, const VSeq<N>& v2) { 4799 // output must not be constant 4800 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4801 // output cannot overwrite pending inputs 4802 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4803 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4804 for (int i = 0; i < N; i++) { 4805 __ subv(v[i], T, v1[i], v2[i]); 4806 } 4807 } 4808 4809 template<int N> 4810 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4811 const VSeq<N>& v1, const VSeq<N>& v2) { 4812 // output must not be constant 4813 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4814 // output cannot overwrite pending inputs 4815 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4816 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4817 for (int i = 0; i < N; i++) { 4818 __ mulv(v[i], T, v1[i], v2[i]); 4819 } 4820 } 4821 4822 template<int N> 4823 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) { 4824 // output must not be constant 4825 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4826 // output cannot overwrite pending inputs 4827 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4828 for (int i = 0; i < N; i++) { 4829 __ negr(v[i], T, v1[i]); 4830 } 4831 } 4832 4833 template<int N> 4834 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4835 const VSeq<N>& v1, int shift) { 4836 // output must not be constant 4837 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4838 // output cannot overwrite pending inputs 4839 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4840 for (int i = 0; i < N; i++) { 4841 __ sshr(v[i], T, v1[i], shift); 4842 } 4843 } 4844 4845 template<int N> 4846 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4847 // output must not be constant 4848 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4849 // output cannot overwrite pending inputs 4850 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4851 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4852 for (int i = 0; i < N; i++) { 4853 __ andr(v[i], __ T16B, v1[i], v2[i]); 4854 } 4855 } 4856 4857 template<int N> 4858 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4859 // output must not be constant 4860 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4861 // output cannot overwrite pending inputs 4862 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4863 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4864 for (int 
i = 0; i < N; i++) { 4865 __ orr(v[i], __ T16B, v1[i], v2[i]); 4866 } 4867 } 4868 4869 template<int N> 4870 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) { 4871 // output must not be constant 4872 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4873 // output cannot overwrite pending inputs 4874 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4875 for (int i = 0; i < N; i++) { 4876 __ notr(v[i], __ T16B, v1[i]); 4877 } 4878 } 4879 4880 template<int N> 4881 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) { 4882 // output must not be constant 4883 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4884 // output cannot overwrite pending inputs 4885 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4886 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4887 for (int i = 0; i < N; i++) { 4888 __ sqdmulh(v[i], T, v1[i], v2[i]); 4889 } 4890 } 4891 4892 template<int N> 4893 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) { 4894 // output must not be constant 4895 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4896 // output cannot overwrite pending inputs 4897 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4898 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4899 for (int i = 0; i < N; i++) { 4900 __ mlsv(v[i], T, v1[i], v2[i]); 4901 } 4902 } 4903 4904 // load N/2 successive pairs of quadword values from memory in order 4905 // into N successive vector registers of the sequence via the 4906 // address supplied in base. 4907 template<int N> 4908 void vs_ldpq(const VSeq<N>& v, Register base) { 4909 for (int i = 0; i < N; i += 2) { 4910 __ ldpq(v[i], v[i+1], Address(base, 32 * i)); 4911 } 4912 } 4913 4914 // load N/2 successive pairs of quadword values from memory in order 4915 // into N vector registers of the sequence via the address supplied 4916 // in base using post-increment addressing 4917 template<int N> 4918 void vs_ldpq_post(const VSeq<N>& v, Register base) { 4919 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4920 for (int i = 0; i < N; i += 2) { 4921 __ ldpq(v[i], v[i+1], __ post(base, 32)); 4922 } 4923 } 4924 4925 // store N successive vector registers of the sequence into N/2 4926 // successive pairs of quadword memory locations via the address 4927 // supplied in base using post-increment addressing 4928 template<int N> 4929 void vs_stpq_post(const VSeq<N>& v, Register base) { 4930 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4931 for (int i = 0; i < N; i += 2) { 4932 __ stpq(v[i], v[i+1], __ post(base, 32)); 4933 } 4934 } 4935 4936 // load N/2 pairs of quadword values from memory de-interleaved into 4937 // N vector registers 2 at a time via the address supplied in base 4938 // using post-increment addressing. 4939 template<int N> 4940 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4941 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4942 for (int i = 0; i < N; i += 2) { 4943 __ ld2(v[i], v[i+1], T, __ post(base, 32)); 4944 } 4945 } 4946 4947 // store N vector registers interleaved into N/2 pairs of quadword 4948 // memory locations via the address supplied in base using 4949 // post-increment addressing. 
4950 template<int N> 4951 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4952 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4953 for (int i = 0; i < N; i += 2) { 4954 __ st2(v[i], v[i+1], T, __ post(base, 32)); 4955 } 4956 } 4957 4958 // load N quadword values from memory de-interleaved into N vector 4959 // registers 3 elements at a time via the address supplied in base. 4960 template<int N> 4961 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4962 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4963 for (int i = 0; i < N; i += 3) { 4964 __ ld3(v[i], v[i+1], v[i+2], T, base); 4965 } 4966 } 4967 4968 // load N quadword values from memory de-interleaved into N vector 4969 // registers 3 elements at a time via the address supplied in base 4970 // using post-increment addressing. 4971 template<int N> 4972 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4973 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4974 for (int i = 0; i < N; i += 3) { 4975 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48)); 4976 } 4977 } 4978 4979 // load N/2 pairs of quadword values from memory into N vector 4980 // registers via the address supplied in base with each pair indexed 4981 // using the the start offset plus the corresponding entry in the 4982 // offsets array 4983 template<int N> 4984 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) { 4985 for (int i = 0; i < N/2; i++) { 4986 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i])); 4987 } 4988 } 4989 4990 // store N vector registers into N/2 pairs of quadword memory 4991 // locations via the address supplied in base with each pair indexed 4992 // using the the start offset plus the corresponding entry in the 4993 // offsets array 4994 template<int N> 4995 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) { 4996 for (int i = 0; i < N/2; i++) { 4997 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i])); 4998 } 4999 } 5000 5001 // load N single quadword values from memory into N vector registers 5002 // via the address supplied in base with each value indexed using 5003 // the the start offset plus the corresponding entry in the offsets 5004 // array 5005 template<int N> 5006 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base, 5007 int start, int (&offsets)[N]) { 5008 for (int i = 0; i < N; i++) { 5009 __ ldr(v[i], T, Address(base, start + offsets[i])); 5010 } 5011 } 5012 5013 // store N vector registers into N single quadword memory locations 5014 // via the address supplied in base with each value indexed using 5015 // the the start offset plus the corresponding entry in the offsets 5016 // array 5017 template<int N> 5018 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base, 5019 int start, int (&offsets)[N]) { 5020 for (int i = 0; i < N; i++) { 5021 __ str(v[i], T, Address(base, start + offsets[i])); 5022 } 5023 } 5024 5025 // load N/2 pairs of quadword values from memory de-interleaved into 5026 // N vector registers 2 at a time via the address supplied in base 5027 // with each pair indexed using the the start offset plus the 5028 // corresponding entry in the offsets array 5029 template<int N> 5030 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base, 5031 Register tmp, int start, int (&offsets)[N/2]) { 5032 
    for (int i = 0; i < N/2; i++) {
      __ add(tmp, base, start + offsets[i]);
      __ ld2(v[2*i], v[2*i+1], T, tmp);
    }
  }

  // store N vector registers 2 at a time interleaved into N/2 pairs
  // of quadword memory locations via the address supplied in base
  // with each pair indexed using the start offset plus the
  // corresponding entry in the offsets array
  template<int N>
  void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
                      Register tmp, int start, int (&offsets)[N/2]) {
    for (int i = 0; i < N/2; i++) {
      __ add(tmp, base, start + offsets[i]);
      __ st2(v[2*i], v[2*i+1], T, tmp);
    }
  }

  // Helper routines for various flavours of Montgomery multiply

  // Perform 16 32-bit (4x4S) or 32 16-bit (4x8H) Montgomery
  // multiplications in parallel
  //

  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
  //
  // Computes 4x4S results or 4x8H results
  //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
  // Outputs: va - 4x4S or 4x8H vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // va must have a non-zero delta i.e. it must not be a constant vseq.
  // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
  void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
                   Assembler::SIMD_Arrangement T,
                   const VSeq<4>& vtmp, const VSeq<2>& vq) {
    assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
    assert(!va.is_constant(), "output vector must identify 4 different registers");

    // schedule 4 streams of instructions across the vector sequences
    for (int i = 0; i < 4; i++) {
      __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
      __ mulv(va[i], T, vb[i], vc[i]);      // aLow = lo32(b * c)
    }

    for (int i = 0; i < 4; i++) {
      __ mulv(va[i], T, va[i], vq[0]);      // m = aLow * qinv
    }

    for (int i = 0; i < 4; i++) {
      __ sqdmulh(va[i], T, va[i], vq[1]);   // n = hi32(2 * m * q)
    }

    for (int i = 0; i < 4; i++) {
      __ shsubv(va[i], T, vtmp[i], va[i]);  // a = (aHigh - n) / 2
    }
  }

  // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
  // multiplications in parallel
  //

  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
  //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // va must have a non-zero delta i.e. it must not be a constant vseq.
  // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
  void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
                   Assembler::SIMD_Arrangement T,
                   const VSeq<2>& vtmp, const VSeq<2>& vq) {
    assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
    assert(!va.is_constant(), "output vector must identify 2 different registers");

    // schedule 2 streams of instructions across the vector sequences
    for (int i = 0; i < 2; i++) {
      __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
      __ mulv(va[i], T, vb[i], vc[i]);      // aLow = lo32(b * c)
    }

    for (int i = 0; i < 2; i++) {
      __ mulv(va[i], T, va[i], vq[0]);      // m = aLow * qinv
    }

    for (int i = 0; i < 2; i++) {
      __ sqdmulh(va[i], T, va[i], vq[1]);   // n = hi32(2 * m * q)
    }

    for (int i = 0; i < 2; i++) {
      __ shsubv(va[i], T, vtmp[i], va[i]);  // a = (aHigh - n) / 2
    }
  }

  // Perform 16 16-bit Montgomery multiplications in parallel.
  void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
                       const VSeq<2>& vtmp, const VSeq<2>& vq) {
    // Use the helper routine to schedule a 2x8H Montgomery multiply.
    // It will assert that the register use is valid
    vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
  }

  // Perform 32 16-bit Montgomery multiplications in parallel.
  void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
                       const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // Use the helper routine to schedule a 4x8H Montgomery multiply.
    // It will assert that the register use is valid
    vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
  }

  // Perform 64 16-bit Montgomery multiplications in parallel.
  void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
                       const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // Schedule two successive 4x8H multiplies via the montmul helper
    // on the front and back halves of va, vb and vc. The helper will
    // assert that the register use has no overlap conflicts on each
    // individual call but we also need to ensure that the necessary
    // disjoint/equality constraints are met across both calls.

    // vb, vc, vtmp and vq must be disjoint.
    // va must either be disjoint from all other registers or equal vc

    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");

    // we multiply the front and back halves of each sequence 4 at a
    // time because
    //
    // 1) we are currently only able to get 4-way instruction
    // parallelism at best
    //
    // 2) we need registers for the constants in vq and temporary
    // scratch registers to hold intermediate results so vtmp can only
    // be a VSeq<4> which means we only have 4 scratch slots

    vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
    vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
  }

  void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
                               const VSeq<4>& vc,
                               const VSeq<4>& vtmp,
                               const VSeq<2>& vq) {
    // compute a = montmul(a1, c)
    kyber_montmul32(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
    vs_subv(va1, __ T8H, va0, vc);
    // and a0 = a0 + a
    vs_addv(va0, __ T8H, va0, vc);
  }

  void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
                               const VSeq<4>& vb,
                               const VSeq<4>& vtmp1,
                               const VSeq<4>& vtmp2,
                               const VSeq<2>& vq) {
    // compute c = a0 - a1
    vs_subv(vtmp1, __ T8H, va0, va1);
    // output a0 = a0 + a1
    vs_addv(va0, __ T8H, va0, va1);
    // output a1 = b montmul c
    kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
  }

  void load64shorts(const VSeq<8>& v, Register shorts) {
    vs_ldpq_post(v, shorts);
  }

  void load32shorts(const VSeq<4>& v, Register shorts) {
    vs_ldpq_post(v, shorts);
  }

  void store64shorts(VSeq<8> v, Register tmpAddr) {
    vs_stpq_post(v, tmpAddr);
  }

  // Kyber NTT function.
  // Implements
  // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
  //
  // coeffs (short[256]) = c_rarg0
  // ntt_zetas (short[256]) = c_rarg1
  address generate_kyberNtt() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_kyberNtt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register coeffs = c_rarg0;
    const Register zetas = c_rarg1;

    const Register kyberConsts = r10;
    const Register tmpAddr = r11;

    VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x8H inputs/outputs
    VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
    VSeq<2> vq(30);                    // n.b. constants overlap vs3

    __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
    // load the montmul constants
    vs_ldpq(vq, kyberConsts);

    // Each level corresponds to an iteration of the outermost loop of the
    // Java method seilerNTT(int[] coeffs). There are some differences
    // from what is done in the seilerNTT() method, though:
    // 1. The computation uses 16-bit signed values; we do not convert them
    //    to ints here.
    // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
    //    this array for each level, which makes it easier to fill up the
    //    vector registers.
    // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
    //    multiplications (so that there should not be any overflow during
    //    the inverse NTT computation); here we use R = 2^16 so that we can
    //    use 16-bit arithmetic in the vector unit.
    //
    // On each level, we fill up the vector registers in such a way that the
    // array elements that need to be multiplied by the zetas go into one
    // set of vector registers while the corresponding ones that don't need
    // to be multiplied go into another set.
    // We can do 32 Montgomery multiplications in parallel, using 12 vector
    // registers and interleaving the steps of 4 identical computations,
    // each done on 8 16-bit values per register.

    // At levels 0-3 the coefficients that are multiplied by the zetas, and
    // the ones they are added to or subtracted from, occur in discrete
    // blocks whose size is some multiple of 32.

    // level 0
    __ add(tmpAddr, coeffs, 256);
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 0);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 0);
    vs_stpq_post(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 256);
    vs_stpq_post(vs3, tmpAddr);
    // restore montmul constants
    vs_ldpq(vq, kyberConsts);
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 128);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 128);
    store64shorts(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 384);
    store64shorts(vs3, tmpAddr);

    // level 1
    // restore montmul constants
    vs_ldpq(vq, kyberConsts);
    __ add(tmpAddr, coeffs, 128);
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 0);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 0);
    store64shorts(vs1, tmpAddr);
    store64shorts(vs3, tmpAddr);
    vs_ldpq(vq, kyberConsts);
    __ add(tmpAddr, coeffs, 384);
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 256);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 256);
    store64shorts(vs1, tmpAddr);
    store64shorts(vs3, tmpAddr);

    // level 2
    vs_ldpq(vq, kyberConsts);
    int offsets1[4] = { 0, 32, 128, 160 };
    vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
    // kyber_subv_addv64();
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b.
trashes vq 5372 vs_addv(vs1, __ T8H, vs1, vs2); 5373 __ add(tmpAddr, coeffs, 0); 5374 vs_stpq_post(vs_front(vs1), tmpAddr); 5375 vs_stpq_post(vs_front(vs3), tmpAddr); 5376 vs_stpq_post(vs_back(vs1), tmpAddr); 5377 vs_stpq_post(vs_back(vs3), tmpAddr); 5378 vs_ldpq(vq, kyberConsts); 5379 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1); 5380 load64shorts(vs2, zetas); 5381 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5382 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5383 // kyber_subv_addv64(); 5384 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5385 vs_addv(vs1, __ T8H, vs1, vs2); 5386 __ add(tmpAddr, coeffs, 256); 5387 vs_stpq_post(vs_front(vs1), tmpAddr); 5388 vs_stpq_post(vs_front(vs3), tmpAddr); 5389 vs_stpq_post(vs_back(vs1), tmpAddr); 5390 vs_stpq_post(vs_back(vs3), tmpAddr); 5391 5392 // level 3 5393 vs_ldpq(vq, kyberConsts); 5394 int offsets2[4] = { 0, 64, 128, 192 }; 5395 vs_ldpq_indexed(vs1, coeffs, 32, offsets2); 5396 load64shorts(vs2, zetas); 5397 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5398 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5399 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5400 vs_addv(vs1, __ T8H, vs1, vs2); 5401 vs_stpq_indexed(vs1, coeffs, 0, offsets2); 5402 vs_stpq_indexed(vs3, coeffs, 32, offsets2); 5403 5404 vs_ldpq(vq, kyberConsts); 5405 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2); 5406 load64shorts(vs2, zetas); 5407 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5408 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5409 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5410 vs_addv(vs1, __ T8H, vs1, vs2); 5411 vs_stpq_indexed(vs1, coeffs, 256, offsets2); 5412 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2); 5413 5414 // level 4 5415 // At level 4 coefficients occur in 8 discrete blocks of size 16 5416 // so they are loaded using employing an ldr at 8 distinct offsets. 5417 5418 vs_ldpq(vq, kyberConsts); 5419 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5420 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3); 5421 load64shorts(vs2, zetas); 5422 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5423 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5424 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5425 vs_addv(vs1, __ T8H, vs1, vs2); 5426 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5427 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3); 5428 5429 vs_ldpq(vq, kyberConsts); 5430 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3); 5431 load64shorts(vs2, zetas); 5432 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5433 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5434 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5435 vs_addv(vs1, __ T8H, vs1, vs2); 5436 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5437 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3); 5438 5439 // level 5 5440 // At level 5 related coefficients occur in discrete blocks of size 8 so 5441 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 
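    // As an illustration (a sketch of the intended data movement, relying
    // on the usual ld2 de-interleave semantics): each
    //
    //   __ ld2(va, vb, __ T2D, addr);
    //
    // issued by vs_ld2_indexed below reads four consecutive 8-byte blocks
    // B0 B1 B2 B3 (each holding 4 shorts) and splits them across the
    // register pair as
    //
    //   va <- { B0, B2 }   // coefficients that are only added/subtracted
    //   vb <- { B1, B3 }   // coefficients that get multiplied by the zetas
    //
    // so vs_even(vs1) collects the first operands of the butterflies and
    // vs_odd(vs1) the second operands across the four indexed loads.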
5442 5443 vs_ldpq(vq, kyberConsts); 5444 int offsets4[4] = { 0, 32, 64, 96 }; 5445 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5446 load32shorts(vs_front(vs2), zetas); 5447 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5448 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5449 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5450 load32shorts(vs_front(vs2), zetas); 5451 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5452 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5453 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5454 load32shorts(vs_front(vs2), zetas); 5455 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5456 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5457 5458 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5459 load32shorts(vs_front(vs2), zetas); 5460 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5461 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5462 5463 // level 6 5464 // At level 6 related coefficients occur in discrete blocks of size 4 so 5465 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5466 5467 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5468 load32shorts(vs_front(vs2), zetas); 5469 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5470 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5471 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5472 // __ ldpq(v18, v19, __ post(zetas, 32)); 5473 load32shorts(vs_front(vs2), zetas); 5474 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5475 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5476 5477 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5478 load32shorts(vs_front(vs2), zetas); 5479 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5480 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5481 5482 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5483 load32shorts(vs_front(vs2), zetas); 5484 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5485 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5486 5487 __ leave(); // required for proper stackwalking of RuntimeStub frame 5488 __ mov(r0, zr); // return 0 5489 __ ret(lr); 5490 5491 return start; 5492 } 5493 5494 // Kyber Inverse NTT function 5495 // Implements 5496 // static int implKyberInverseNtt(short[] poly, short[] zetas) {} 5497 // 5498 // coeffs (short[256]) = c_rarg0 5499 // ntt_zetas (short[256]) = c_rarg1 5500 address generate_kyberInverseNtt() { 5501 5502 __ align(CodeEntryAlignment); 5503 StubId stub_id = StubId::stubgen_kyberInverseNtt_id; 5504 StubCodeMark mark(this, stub_id); 5505 address start = __ pc(); 5506 __ enter(); 5507 5508 const Register coeffs = c_rarg0; 5509 const Register zetas = c_rarg1; 5510 5511 const Register kyberConsts = r10; 5512 const Register tmpAddr = r11; 5513 const Register tmpAddr2 = c_rarg2; 5514 5515 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5516 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5517 VSeq<2> vq(30); // n.b. 
constants overlap vs3 5518 5519 __ lea(kyberConsts, 5520 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5521 5522 // level 0 5523 // At level 0 related coefficients occur in discrete blocks of size 4 so 5524 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5525 5526 vs_ldpq(vq, kyberConsts); 5527 int offsets4[4] = { 0, 32, 64, 96 }; 5528 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5529 load32shorts(vs_front(vs2), zetas); 5530 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5531 vs_front(vs2), vs_back(vs2), vtmp, vq); 5532 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5533 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5534 load32shorts(vs_front(vs2), zetas); 5535 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5536 vs_front(vs2), vs_back(vs2), vtmp, vq); 5537 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5538 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5539 load32shorts(vs_front(vs2), zetas); 5540 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5541 vs_front(vs2), vs_back(vs2), vtmp, vq); 5542 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5543 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5544 load32shorts(vs_front(vs2), zetas); 5545 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5546 vs_front(vs2), vs_back(vs2), vtmp, vq); 5547 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5548 5549 // level 1 5550 // At level 1 related coefficients occur in discrete blocks of size 8 so 5551 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 5552 5553 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5554 load32shorts(vs_front(vs2), zetas); 5555 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5556 vs_front(vs2), vs_back(vs2), vtmp, vq); 5557 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5558 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5559 load32shorts(vs_front(vs2), zetas); 5560 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5561 vs_front(vs2), vs_back(vs2), vtmp, vq); 5562 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5563 5564 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5565 load32shorts(vs_front(vs2), zetas); 5566 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5567 vs_front(vs2), vs_back(vs2), vtmp, vq); 5568 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5569 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5570 load32shorts(vs_front(vs2), zetas); 5571 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5572 vs_front(vs2), vs_back(vs2), vtmp, vq); 5573 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5574 5575 // level 2 5576 // At level 2 coefficients occur in 8 discrete blocks of size 16 5577 // so they are loaded using employing an ldr at 8 distinct offsets. 5578 5579 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5580 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5581 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3); 5582 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5583 vs_subv(vs1, __ T8H, vs1, vs2); 5584 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3); 5585 load64shorts(vs2, zetas); 5586 vs_ldpq(vq, kyberConsts); 5587 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5588 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3); 5589 5590 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5591 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5592 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5593 vs_subv(vs1, __ T8H, vs1, vs2); 5594 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3); 5595 load64shorts(vs2, zetas); 5596 vs_ldpq(vq, kyberConsts); 5597 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5598 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5599 5600 // Barrett reduction at indexes where overflow may happen 5601 5602 // load q and the multiplier for the Barrett reduction 5603 __ add(tmpAddr, kyberConsts, 16); 5604 vs_ldpq(vq, tmpAddr); 5605 5606 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences 5607 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants 5608 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul 5609 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5610 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5611 vs_sshr(vs2, __ T8H, vs2, 11); 5612 vs_mlsv(vs1, __ T8H, vs2, vq1); 5613 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5614 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5615 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5616 vs_sshr(vs2, __ T8H, vs2, 11); 5617 vs_mlsv(vs1, __ T8H, vs2, vq1); 5618 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5619 5620 // level 3 5621 // From level 3 upwards coefficients occur in discrete blocks whose size is 5622 // some multiple of 32 so can be loaded using ldpq and suitable indexes. 5623 5624 int offsets2[4] = { 0, 64, 128, 192 }; 5625 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5626 vs_ldpq_indexed(vs2, coeffs, 32, offsets2); 5627 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5628 vs_subv(vs1, __ T8H, vs1, vs2); 5629 vs_stpq_indexed(vs3, coeffs, 0, offsets2); 5630 load64shorts(vs2, zetas); 5631 vs_ldpq(vq, kyberConsts); 5632 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5633 vs_stpq_indexed(vs2, coeffs, 32, offsets2); 5634 5635 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5636 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5637 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5638 vs_subv(vs1, __ T8H, vs1, vs2); 5639 vs_stpq_indexed(vs3, coeffs, 256, offsets2); 5640 load64shorts(vs2, zetas); 5641 vs_ldpq(vq, kyberConsts); 5642 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5643 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5644 5645 // level 4 5646 5647 int offsets1[4] = { 0, 32, 128, 160 }; 5648 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5649 vs_ldpq_indexed(vs2, coeffs, 64, offsets1); 5650 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5651 vs_subv(vs1, __ T8H, vs1, vs2); 5652 vs_stpq_indexed(vs3, coeffs, 0, offsets1); 5653 load64shorts(vs2, zetas); 5654 vs_ldpq(vq, kyberConsts); 5655 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5656 vs_stpq_indexed(vs2, coeffs, 64, offsets1); 5657 5658 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5659 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5660 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5661 vs_subv(vs1, __ T8H, vs1, vs2); 5662 vs_stpq_indexed(vs3, coeffs, 256, offsets1); 5663 load64shorts(vs2, zetas); 5664 vs_ldpq(vq, kyberConsts); 5665 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5666 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5667 5668 // level 5 5669 5670 __ add(tmpAddr, coeffs, 0); 5671 load64shorts(vs1, tmpAddr); 5672 __ add(tmpAddr, coeffs, 128); 5673 load64shorts(vs2, tmpAddr); 5674 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5675 vs_subv(vs1, __ T8H, vs1, vs2); 5676 __ add(tmpAddr, coeffs, 0); 5677 store64shorts(vs3, tmpAddr); 5678 load64shorts(vs2, zetas); 5679 vs_ldpq(vq, kyberConsts); 5680 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5681 __ add(tmpAddr, coeffs, 128); 5682 store64shorts(vs2, tmpAddr); 5683 5684 load64shorts(vs1, tmpAddr); 5685 __ add(tmpAddr, coeffs, 384); 5686 load64shorts(vs2, tmpAddr); 5687 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5688 vs_subv(vs1, __ T8H, vs1, vs2); 5689 __ add(tmpAddr, coeffs, 256); 5690 store64shorts(vs3, tmpAddr); 5691 load64shorts(vs2, zetas); 5692 vs_ldpq(vq, kyberConsts); 5693 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5694 __ add(tmpAddr, coeffs, 384); 5695 store64shorts(vs2, tmpAddr); 5696 5697 // Barrett reduction at indexes where overflow may happen 5698 5699 // load q and the multiplier for the Barrett reduction 5700 __ add(tmpAddr, kyberConsts, 16); 5701 vs_ldpq(vq, tmpAddr); 5702 5703 int offsets0[2] = { 0, 256 }; 5704 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5705 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5706 vs_sshr(vs2, __ T8H, vs2, 11); 5707 vs_mlsv(vs1, __ T8H, vs2, vq1); 5708 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5709 5710 // level 6 5711 5712 __ add(tmpAddr, coeffs, 0); 5713 load64shorts(vs1, tmpAddr); 5714 __ add(tmpAddr, coeffs, 256); 5715 load64shorts(vs2, tmpAddr); 5716 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5717 vs_subv(vs1, __ T8H, vs1, vs2); 5718 __ add(tmpAddr, coeffs, 0); 5719 store64shorts(vs3, tmpAddr); 5720 load64shorts(vs2, zetas); 5721 vs_ldpq(vq, kyberConsts); 5722 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5723 __ add(tmpAddr, coeffs, 256); 5724 store64shorts(vs2, tmpAddr); 5725 5726 __ add(tmpAddr, coeffs, 128); 5727 load64shorts(vs1, tmpAddr); 5728 __ add(tmpAddr, coeffs, 384); 5729 load64shorts(vs2, tmpAddr); 5730 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5731 vs_subv(vs1, __ T8H, vs1, vs2); 5732 __ add(tmpAddr, coeffs, 128); 5733 store64shorts(vs3, tmpAddr); 5734 load64shorts(vs2, zetas); 5735 vs_ldpq(vq, kyberConsts); 5736 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5737 __ add(tmpAddr, coeffs, 384); 5738 store64shorts(vs2, tmpAddr); 5739 5740 // multiply by 2^-n 5741 5742 // load toMont(2^-n mod q) 5743 __ add(tmpAddr, kyberConsts, 48); 5744 __ ldr(v29, __ Q, tmpAddr); 5745 5746 vs_ldpq(vq, kyberConsts); 5747 __ add(tmpAddr, coeffs, 0); 5748 load64shorts(vs1, tmpAddr); 5749 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5750 __ add(tmpAddr, coeffs, 0); 5751 store64shorts(vs2, tmpAddr); 5752 5753 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so 5754 load64shorts(vs1, tmpAddr); 5755 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5756 __ add(tmpAddr, coeffs, 128); 5757 store64shorts(vs2, tmpAddr); 5758 5759 // now tmpAddr contains coeffs + 256 5760 load64shorts(vs1, tmpAddr); 5761 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5762 __ add(tmpAddr, coeffs, 256); 5763 store64shorts(vs2, tmpAddr); 5764 5765 // now tmpAddr contains coeffs + 384 5766 load64shorts(vs1, tmpAddr); 5767 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5768 __ add(tmpAddr, coeffs, 384); 5769 store64shorts(vs2, tmpAddr); 5770 5771 __ leave(); // required for proper stackwalking of RuntimeStub frame 5772 __ mov(r0, zr); // return 0 5773 __ ret(lr); 5774 5775 return start; 5776 } 5777 5778 // Kyber multiply polynomials in the NTT domain. 5779 // Implements 5780 // static int implKyberNttMult( 5781 // short[] result, short[] ntta, short[] nttb, short[] zetas) {} 5782 // 5783 // result (short[256]) = c_rarg0 5784 // ntta (short[256]) = c_rarg1 5785 // nttb (short[256]) = c_rarg2 5786 // zetas (short[128]) = c_rarg3 5787 address generate_kyberNttMult() { 5788 5789 __ align(CodeEntryAlignment); 5790 StubId stub_id = StubId::stubgen_kyberNttMult_id; 5791 StubCodeMark mark(this, stub_id); 5792 address start = __ pc(); 5793 __ enter(); 5794 5795 const Register result = c_rarg0; 5796 const Register ntta = c_rarg1; 5797 const Register nttb = c_rarg2; 5798 const Register zetas = c_rarg3; 5799 5800 const Register kyberConsts = r10; 5801 const Register limit = r11; 5802 5803 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps 5804 VSeq<4> vs3(16), vs4(20); 5805 VSeq<2> vq(30); // pair of constants for montmul: q, qinv 5806 VSeq<2> vz(28); // pair of zetas 5807 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ 5808 5809 __ lea(kyberConsts, 5810 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5811 5812 Label kyberNttMult_loop; 5813 5814 __ add(limit, result, 512); 5815 5816 // load q and qinv 5817 vs_ldpq(vq, kyberConsts); 5818 5819 // load R^2 mod q (to convert back from Montgomery representation) 5820 __ add(kyberConsts, kyberConsts, 64); 5821 __ ldr(v27, __ Q, kyberConsts); 5822 5823 __ BIND(kyberNttMult_loop); 5824 5825 // load 16 zetas 5826 vs_ldpq_post(vz, zetas); 5827 5828 // load 2 sets of 32 coefficients from the two input arrays 5829 // interleaved as shorts. i.e. pairs of shorts adjacent in memory 5830 // are striped across pairs of vector registers 5831 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H 5832 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H 5833 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H 5834 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H 5835 5836 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1) 5837 // i.e. 
montmul the first and second halves of vs1 in order and 5838 // then with one sequence reversed storing the two results in vs3 5839 // 5840 // vs3[0] <- montmul(a0, b0) 5841 // vs3[1] <- montmul(a1, b1) 5842 // vs3[2] <- montmul(a0, b1) 5843 // vs3[3] <- montmul(a1, b0) 5844 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq); 5845 kyber_montmul16(vs_back(vs3), 5846 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq); 5847 5848 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3) 5849 // i.e. montmul the first and second halves of vs4 in order and 5850 // then with one sequence reversed storing the two results in vs1 5851 // 5852 // vs1[0] <- montmul(a2, b2) 5853 // vs1[1] <- montmul(a3, b3) 5854 // vs1[2] <- montmul(a2, b3) 5855 // vs1[3] <- montmul(a3, b2) 5856 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq); 5857 kyber_montmul16(vs_back(vs1), 5858 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq); 5859 5860 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta. 5861 // We can schedule two montmuls at a time if we use a suitable vector 5862 // sequence <vs3[1], vs1[1]>. 5863 int delta = vs1[1]->encoding() - vs3[1]->encoding(); 5864 VSeq<2> vs5(vs3[1], delta); 5865 5866 // vs3[1] <- montmul(montmul(a1, b1), z0) 5867 // vs1[1] <- montmul(montmul(a3, b3), z1) 5868 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq); 5869 5870 // add results in pairs storing in vs3 5871 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0); 5872 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0); 5873 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3)); 5874 5875 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1); 5876 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2); 5877 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1)); 5878 5879 // vs1 <- montmul(vs3, montRSquareModQ) 5880 kyber_montmul32(vs1, vs3, vc, vs2, vq); 5881 5882 // store back the two pairs of result vectors de-interleaved as 8H elements 5883 // i.e. storing each pairs of shorts striped across a register pair adjacent 5884 // in memory 5885 vs_st2_post(vs1, __ T8H, result); 5886 5887 __ cmp(result, limit); 5888 __ br(Assembler::NE, kyberNttMult_loop); 5889 5890 __ leave(); // required for proper stackwalking of RuntimeStub frame 5891 __ mov(r0, zr); // return 0 5892 __ ret(lr); 5893 5894 return start; 5895 } 5896 5897 // Kyber add 2 polynomials. 5898 // Implements 5899 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {} 5900 // 5901 // result (short[256]) = c_rarg0 5902 // a (short[256]) = c_rarg1 5903 // b (short[256]) = c_rarg2 5904 address generate_kyberAddPoly_2() { 5905 5906 __ align(CodeEntryAlignment); 5907 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id; 5908 StubCodeMark mark(this, stub_id); 5909 address start = __ pc(); 5910 __ enter(); 5911 5912 const Register result = c_rarg0; 5913 const Register a = c_rarg1; 5914 const Register b = c_rarg2; 5915 5916 const Register kyberConsts = r11; 5917 5918 // We sum 256 sets of values in total i.e. 32 x 8H quadwords. 5919 // So, we can load, add and store the data in 3 groups of 11, 5920 // 11 and 10 at a time i.e. we need to map sets of 10 or 11 5921 // registers. A further constraint is that the mapping needs 5922 // to skip callee saves. So, we allocate the register 5923 // sequences using two 8 sequences, two 2 sequences and two 5924 // single registers. 
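    // Concretely (assuming the default VSeq register stride of 1) the
    // mapping below works out as
    //
    //   vs1: v0-v7, v16-v17, v28   (11 registers for the first operand)
    //   vs2: v18-v25, v26-v27, v29 (11 registers for the second operand)
    //   vc:  v31                   (the replicated constant)
    //
    // which leaves the callee-saved registers v8-v15 untouched.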
5925 VSeq<8> vs1_1(0); 5926 VSeq<2> vs1_2(16); 5927 FloatRegister vs1_3 = v28; 5928 VSeq<8> vs2_1(18); 5929 VSeq<2> vs2_2(26); 5930 FloatRegister vs2_3 = v29; 5931 5932 // two constant vector sequences 5933 VSeq<8> vc_1(31, 0); 5934 VSeq<2> vc_2(31, 0); 5935 5936 FloatRegister vc_3 = v31; 5937 __ lea(kyberConsts, 5938 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5939 5940 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5941 for (int i = 0; i < 3; i++) { 5942 // load 80 or 88 values from a into vs1_1/2/3 5943 vs_ldpq_post(vs1_1, a); 5944 vs_ldpq_post(vs1_2, a); 5945 if (i < 2) { 5946 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5947 } 5948 // load 80 or 88 values from b into vs2_1/2/3 5949 vs_ldpq_post(vs2_1, b); 5950 vs_ldpq_post(vs2_2, b); 5951 if (i < 2) { 5952 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5953 } 5954 // sum 80 or 88 values across vs1 and vs2 into vs1 5955 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5956 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5957 if (i < 2) { 5958 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5959 } 5960 // add constant to all 80 or 88 results 5961 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 5962 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 5963 if (i < 2) { 5964 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 5965 } 5966 // store 80 or 88 values 5967 vs_stpq_post(vs1_1, result); 5968 vs_stpq_post(vs1_2, result); 5969 if (i < 2) { 5970 __ str(vs1_3, __ Q, __ post(result, 16)); 5971 } 5972 } 5973 5974 __ leave(); // required for proper stackwalking of RuntimeStub frame 5975 __ mov(r0, zr); // return 0 5976 __ ret(lr); 5977 5978 return start; 5979 } 5980 5981 // Kyber add 3 polynomials. 5982 // Implements 5983 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {} 5984 // 5985 // result (short[256]) = c_rarg0 5986 // a (short[256]) = c_rarg1 5987 // b (short[256]) = c_rarg2 5988 // c (short[256]) = c_rarg3 5989 address generate_kyberAddPoly_3() { 5990 5991 __ align(CodeEntryAlignment); 5992 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id; 5993 StubCodeMark mark(this, stub_id); 5994 address start = __ pc(); 5995 __ enter(); 5996 5997 const Register result = c_rarg0; 5998 const Register a = c_rarg1; 5999 const Register b = c_rarg2; 6000 const Register c = c_rarg3; 6001 6002 const Register kyberConsts = r11; 6003 6004 // As above we sum 256 sets of values in total i.e. 32 x 8H 6005 // quadwords. So, we can load, add and store the data in 3 6006 // groups of 11, 11 and 10 at a time i.e. we need to map sets 6007 // of 10 or 11 registers. A further constraint is that the 6008 // mapping needs to skip callee saves. So, we allocate the 6009 // register sequences using two 8 sequences, two 2 sequences 6010 // and two single registers. 
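    // In scalar terms each lane of the loop below computes the following
    // (a sketch; kyber_q names the constant loaded from kyberConsts + 16):
    //
    //   result[i] = (short)(a[i] + b[i] + c[i] + kyber_q);
    //
    // using ordinary wrapping 16-bit arithmetic, 80 or 88 shorts at a time.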
6011 VSeq<8> vs1_1(0); 6012 VSeq<2> vs1_2(16); 6013 FloatRegister vs1_3 = v28; 6014 VSeq<8> vs2_1(18); 6015 VSeq<2> vs2_2(26); 6016 FloatRegister vs2_3 = v29; 6017 6018 // two constant vector sequences 6019 VSeq<8> vc_1(31, 0); 6020 VSeq<2> vc_2(31, 0); 6021 6022 FloatRegister vc_3 = v31; 6023 6024 __ lea(kyberConsts, 6025 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 6026 6027 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 6028 for (int i = 0; i < 3; i++) { 6029 // load 80 or 88 values from a into vs1_1/2/3 6030 vs_ldpq_post(vs1_1, a); 6031 vs_ldpq_post(vs1_2, a); 6032 if (i < 2) { 6033 __ ldr(vs1_3, __ Q, __ post(a, 16)); 6034 } 6035 // load 80 or 88 values from b into vs2_1/2/3 6036 vs_ldpq_post(vs2_1, b); 6037 vs_ldpq_post(vs2_2, b); 6038 if (i < 2) { 6039 __ ldr(vs2_3, __ Q, __ post(b, 16)); 6040 } 6041 // sum 80 or 88 values across vs1 and vs2 into vs1 6042 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 6043 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 6044 if (i < 2) { 6045 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 6046 } 6047 // load 80 or 88 values from c into vs2_1/2/3 6048 vs_ldpq_post(vs2_1, c); 6049 vs_ldpq_post(vs2_2, c); 6050 if (i < 2) { 6051 __ ldr(vs2_3, __ Q, __ post(c, 16)); 6052 } 6053 // sum 80 or 88 values across vs1 and vs2 into vs1 6054 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 6055 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 6056 if (i < 2) { 6057 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 6058 } 6059 // add constant to all 80 or 88 results 6060 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 6061 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 6062 if (i < 2) { 6063 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 6064 } 6065 // store 80 or 88 values 6066 vs_stpq_post(vs1_1, result); 6067 vs_stpq_post(vs1_2, result); 6068 if (i < 2) { 6069 __ str(vs1_3, __ Q, __ post(result, 16)); 6070 } 6071 } 6072 6073 __ leave(); // required for proper stackwalking of RuntimeStub frame 6074 __ mov(r0, zr); // return 0 6075 __ ret(lr); 6076 6077 return start; 6078 } 6079 6080 // Kyber parse XOF output to polynomial coefficient candidates 6081 // or decodePoly(12, ...). 6082 // Implements 6083 // static int implKyber12To16( 6084 // byte[] condensed, int index, short[] parsed, int parsedLength) {} 6085 // 6086 // (parsedLength or (parsedLength - 48) must be divisible by 64.) 6087 // 6088 // condensed (byte[]) = c_rarg0 6089 // condensedIndex = c_rarg1 6090 // parsed (short[112 or 256]) = c_rarg2 6091 // parsedLength (112 or 256) = c_rarg3 6092 address generate_kyber12To16() { 6093 Label L_F00, L_loop, L_end; 6094 6095 __ align(CodeEntryAlignment); 6096 StubId stub_id = StubId::stubgen_kyber12To16_id; 6097 StubCodeMark mark(this, stub_id); 6098 address start = __ pc(); 6099 __ enter(); 6100 6101 const Register condensed = c_rarg0; 6102 const Register condensedOffs = c_rarg1; 6103 const Register parsed = c_rarg2; 6104 const Register parsedLength = c_rarg3; 6105 6106 const Register tmpAddr = r11; 6107 6108 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B 6109 // quadwords so we need a 6 vector sequence for the inputs. 6110 // Parsing produces 64 shorts, employing two 8 vector 6111 // sequences to store and combine the intermediate data. 
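    // As a scalar reference (a sketch only): every 3 condensed bytes
    // b0, b1, b2 encode two 12-bit values which are reconstructed as
    //
    //   s0 = b0 | ((b1 & 0x0f) << 8);
    //   s1 = (b1 >> 4) | (b2 << 4);
    //
    // and written out as two consecutive shorts in the parsed array.
    // The vector code below does this for 64 (or, in the tail, 48)
    // values at a time.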
6112 VSeq<6> vin(24); 6113 VSeq<8> va(0), vb(16); 6114 6115 __ adr(tmpAddr, L_F00); 6116 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00 6117 __ add(condensed, condensed, condensedOffs); 6118 6119 __ BIND(L_loop); 6120 // load 96 (6 x 16B) byte values 6121 vs_ld3_post(vin, __ T16B, condensed); 6122 6123 // The front half of sequence vin (vin[0], vin[1] and vin[2]) 6124 // holds 48 (16x3) contiguous bytes from memory striped 6125 // horizontally across each of the 16 byte lanes. Equivalently, 6126 // that is 16 pairs of 12-bit integers. Likewise the back half 6127 // holds the next 48 bytes in the same arrangement. 6128 6129 // Each vector in the front half can also be viewed as a vertical 6130 // strip across the 16 pairs of 12 bit integers. Each byte in 6131 // vin[0] stores the low 8 bits of the first int in a pair. Each 6132 // byte in vin[1] stores the high 4 bits of the first int and the 6133 // low 4 bits of the second int. Each byte in vin[2] stores the 6134 // high 8 bits of the second int. Likewise the vectors in second 6135 // half. 6136 6137 // Converting the data to 16-bit shorts requires first of all 6138 // expanding each of the 6 x 16B vectors into 6 corresponding 6139 // pairs of 8H vectors. Mask, shift and add operations on the 6140 // resulting vector pairs can be used to combine 4 and 8 bit 6141 // parts of related 8H vector elements. 6142 // 6143 // The middle vectors (vin[2] and vin[5]) are actually expanded 6144 // twice, one copy manipulated to provide the lower 4 bits 6145 // belonging to the first short in a pair and another copy 6146 // manipulated to provide the higher 4 bits belonging to the 6147 // second short in a pair. This is why the the vector sequences va 6148 // and vb used to hold the expanded 8H elements are of length 8. 6149 6150 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 6151 // n.b. target elements 2 and 3 duplicate elements 4 and 5 6152 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6153 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6154 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6155 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6156 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6157 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6158 6159 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3] 6160 // and vb[4:5] 6161 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6162 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0); 6163 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6164 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0); 6165 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6166 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0); 6167 6168 // shift lo byte of copy 1 of the middle stripe into the high byte 6169 __ shl(va[2], __ T8H, va[2], 8); 6170 __ shl(va[3], __ T8H, va[3], 8); 6171 __ shl(vb[2], __ T8H, vb[2], 8); 6172 __ shl(vb[3], __ T8H, vb[3], 8); 6173 6174 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this 6175 // time pre-shifted by 4 to ensure top bits of input 12-bit int 6176 // are in bit positions [4..11]. 
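    // Worked example for a single lane (sketch): with b0 = 0x12, b1 = 0xab
    // and b2 = 0xcd the expansion above and the mask/shift/add steps below
    // produce
    //
    //   first short  = 0x12 | ((0xab & 0x0f) << 8) = 0x0b12
    //   second short = (0xab >> 4) | (0xcd << 4)   = 0x0cda
    //
    // i.e. the low byte, the shared middle byte and the high byte are
    // recombined into the two original 12-bit values.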
6177 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6178 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6179 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6180 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4); 6181 6182 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and 6183 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of 6184 // copy2 6185 __ andr(va[2], __ T16B, va[2], v31); 6186 __ andr(va[3], __ T16B, va[3], v31); 6187 __ ushr(va[4], __ T8H, va[4], 4); 6188 __ ushr(va[5], __ T8H, va[5], 4); 6189 __ andr(vb[2], __ T16B, vb[2], v31); 6190 __ andr(vb[3], __ T16B, vb[3], v31); 6191 __ ushr(vb[4], __ T8H, vb[4], 4); 6192 __ ushr(vb[5], __ T8H, vb[5], 4); 6193 6194 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and 6195 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair 6196 // n.b. the ordering ensures: i) inputs are consumed before they 6197 // are overwritten ii) the order of 16-bit results across successive 6198 // pairs of vectors in va and then vb reflects the order of the 6199 // corresponding 12-bit inputs 6200 __ addv(va[0], __ T8H, va[0], va[2]); 6201 __ addv(va[2], __ T8H, va[1], va[3]); 6202 __ addv(va[1], __ T8H, va[4], va[6]); 6203 __ addv(va[3], __ T8H, va[5], va[7]); 6204 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6205 __ addv(vb[2], __ T8H, vb[1], vb[3]); 6206 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6207 __ addv(vb[3], __ T8H, vb[5], vb[7]); 6208 6209 // store 64 results interleaved as shorts 6210 vs_st2_post(vs_front(va), __ T8H, parsed); 6211 vs_st2_post(vs_front(vb), __ T8H, parsed); 6212 6213 __ sub(parsedLength, parsedLength, 64); 6214 __ cmp(parsedLength, (u1)64); 6215 __ br(Assembler::GE, L_loop); 6216 __ cbz(parsedLength, L_end); 6217 6218 // if anything is left it should be a final 72 bytes of input 6219 // i.e. a final 48 12-bit values. so we handle this by loading 6220 // 48 bytes into all 16B lanes of front(vin) and only 24 6221 // bytes into the lower 8B lane of back(vin) 6222 vs_ld3_post(vs_front(vin), __ T16B, condensed); 6223 vs_ld3(vs_back(vin), __ T8B, condensed); 6224 6225 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 6226 // n.b. target elements 2 and 3 of va duplicate elements 4 and 6227 // 5 and target element 2 of vb duplicates element 4. 6228 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6229 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6230 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6231 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6232 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6233 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6234 6235 // This time expand just the lower 8 lanes 6236 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6237 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6238 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6239 6240 // shift lo byte of copy 1 of the middle stripe into the high byte 6241 __ shl(va[2], __ T8H, va[2], 8); 6242 __ shl(va[3], __ T8H, va[3], 8); 6243 __ shl(vb[2], __ T8H, vb[2], 8); 6244 6245 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into 6246 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit 6247 // int are in bit positions [4..11]. 
6248 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6249 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6250 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6251 6252 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and 6253 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of 6254 // copy2 6255 __ andr(va[2], __ T16B, va[2], v31); 6256 __ andr(va[3], __ T16B, va[3], v31); 6257 __ ushr(va[4], __ T8H, va[4], 4); 6258 __ ushr(va[5], __ T8H, va[5], 4); 6259 __ andr(vb[2], __ T16B, vb[2], v31); 6260 __ ushr(vb[4], __ T8H, vb[4], 4); 6261 6262 6263 6264 // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and 6265 // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair 6266 6267 // n.b. ordering ensures: i) inputs are consumed before they are 6268 // overwritten ii) order of 16-bit results across successive 6269 // pairs of vectors in va and then lower half of vb reflects order 6270 // of corresponding 12-bit inputs 6271 __ addv(va[0], __ T8H, va[0], va[2]); 6272 __ addv(va[2], __ T8H, va[1], va[3]); 6273 __ addv(va[1], __ T8H, va[4], va[6]); 6274 __ addv(va[3], __ T8H, va[5], va[7]); 6275 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6276 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6277 6278 // store 48 results interleaved as shorts 6279 vs_st2_post(vs_front(va), __ T8H, parsed); 6280 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed); 6281 6282 __ BIND(L_end); 6283 6284 __ leave(); // required for proper stackwalking of RuntimeStub frame 6285 __ mov(r0, zr); // return 0 6286 __ ret(lr); 6287 6288 // bind label and generate constant data used by this stub 6289 __ BIND(L_F00); 6290 __ emit_int64(0x0f000f000f000f00); 6291 __ emit_int64(0x0f000f000f000f00); 6292 6293 return start; 6294 } 6295 6296 // Kyber Barrett reduce function. 6297 // Implements 6298 // static int implKyberBarrettReduce(short[] coeffs) {} 6299 // 6300 // coeffs (short[256]) = c_rarg0 6301 address generate_kyberBarrettReduce() { 6302 6303 __ align(CodeEntryAlignment); 6304 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id; 6305 StubCodeMark mark(this, stub_id); 6306 address start = __ pc(); 6307 __ enter(); 6308 6309 const Register coeffs = c_rarg0; 6310 6311 const Register kyberConsts = r10; 6312 const Register result = r11; 6313 6314 // As above we process 256 sets of values in total i.e. 32 x 6315 // 8H quadwords. So, we can load, add and store the data in 3 6316 // groups of 11, 11 and 10 at a time i.e. we need to map sets 6317 // of 10 or 11 registers. A further constraint is that the 6318 // mapping needs to skip callee saves. So, we allocate the 6319 // register sequences using two 8 sequences, two 2 sequences 6320 // and two single registers.
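    // Each coefficient is reduced with a standard Barrett reduction. As a
    // scalar sketch of the per-element computation performed by the loop
    // below, assuming the usual ML-KEM values q = 3329 and multiplier
    // round(2^26 / q) = 20159 (the stub reads both constants from the
    // kyberConsts table; the helper name is illustrative only):
    //
    //   static inline int16_t kyber_barrett_reduce(int16_t a) {
    //     int16_t t = (int16_t)(((int32_t)a * 20159) >> 26);  // quotient estimate
    //     return (int16_t)(a - t * 3329);                     // a - t * q
    //   }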
6321 VSeq<8> vs1_1(0); 6322 VSeq<2> vs1_2(16); 6323 FloatRegister vs1_3 = v28; 6324 VSeq<8> vs2_1(18); 6325 VSeq<2> vs2_2(26); 6326 FloatRegister vs2_3 = v29; 6327 6328 // we also need a pair of corresponding constant sequences 6329 6330 VSeq<8> vc1_1(30, 0); 6331 VSeq<2> vc1_2(30, 0); 6332 FloatRegister vc1_3 = v30; // for kyber_q 6333 6334 VSeq<8> vc2_1(31, 0); 6335 VSeq<2> vc2_2(31, 0); 6336 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier 6337 6338 __ add(result, coeffs, 0); 6339 __ lea(kyberConsts, 6340 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 6341 6342 // load q and the multiplier for the Barrett reduction 6343 __ add(kyberConsts, kyberConsts, 16); 6344 __ ldpq(vc1_3, vc2_3, kyberConsts); 6345 6346 for (int i = 0; i < 3; i++) { 6347 // load 80 or 88 coefficients 6348 vs_ldpq_post(vs1_1, coeffs); 6349 vs_ldpq_post(vs1_2, coeffs); 6350 if (i < 2) { 6351 __ ldr(vs1_3, __ Q, __ post(coeffs, 16)); 6352 } 6353 6354 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16 6355 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1); 6356 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2); 6357 if (i < 2) { 6358 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3); 6359 } 6360 6361 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26 6362 vs_sshr(vs2_1, __ T8H, vs2_1, 11); 6363 vs_sshr(vs2_2, __ T8H, vs2_2, 11); 6364 if (i < 2) { 6365 __ sshr(vs2_3, __ T8H, vs2_3, 11); 6366 } 6367 6368 // vs1 <- vs1 - vs2 * kyber_q 6369 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1); 6370 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2); 6371 if (i < 2) { 6372 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3); 6373 } 6374 6375 vs_stpq_post(vs1_1, result); 6376 vs_stpq_post(vs1_2, result); 6377 if (i < 2) { 6378 __ str(vs1_3, __ Q, __ post(result, 16)); 6379 } 6380 } 6381 6382 __ leave(); // required for proper stackwalking of RuntimeStub frame 6383 __ mov(r0, zr); // return 0 6384 __ ret(lr); 6385 6386 return start; 6387 } 6388 6389 6390 // Dilithium-specific montmul helper routines that generate parallel 6391 // code for, respectively, a single 4x4s vector sequence montmul or 6392 // two such multiplies in a row. 6393 6394 // Perform 16 32-bit Montgomery multiplications in parallel 6395 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 6396 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6397 // Use the helper routine to schedule a 4x4S Montgomery multiply. 6398 // It will assert that the register use is valid 6399 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq); 6400 } 6401 6402 // Perform 2x16 32-bit Montgomery multiplications in parallel 6403 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 6404 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6405 // Schedule two successive 4x4S multiplies via the montmul helper 6406 // on the front and back halves of va, vb and vc. The helper will 6407 // assert that the register use has no overlap conflicts on each 6408 // individual call but we also need to ensure that the necessary 6409 // disjoint/equality constraints are met across both calls. 6410 6411 // vb, vc, vtmp and vq must be disjoint. 
va must either be 6412 // disjoint from all other registers or equal vc 6413 6414 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 6415 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 6416 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 6417 6418 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 6419 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 6420 6421 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 6422 6423 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 6424 assert(vs_disjoint(va, vb), "va and vb overlap"); 6425 assert(vs_disjoint(va, vq), "va and vq overlap"); 6426 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 6427 6428 // We multiply the front and back halves of each sequence 4 at a 6429 // time because 6430 // 6431 // 1) we are currently only able to get 4-way instruction 6432 // parallelism at best 6433 // 6434 // 2) we need registers for the constants in vq and temporary 6435 // scratch registers to hold intermediate results so vtmp can only 6436 // be a VSeq<4> which means we only have 4 scratch slots. 6437 6438 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq); 6439 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq); 6440 } 6441 6442 // Perform combined montmul then add/sub on 4x4S vectors. 6443 void dilithium_montmul16_sub_add( 6444 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc, 6445 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6446 // compute a = montmul(a1, c) 6447 dilithium_montmul16(vc, va1, vc, vtmp, vq); 6448 // output a1 = a0 - a 6449 vs_subv(va1, __ T4S, va0, vc); 6450 // and a0 = a0 + a 6451 vs_addv(va0, __ T4S, va0, vc); 6452 } 6453 6454 // Perform combined add/sub then montmul on 4x4S vectors. 6455 void dilithium_sub_add_montmul16( 6456 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb, 6457 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) { 6458 // compute c = a0 - a1 6459 vs_subv(vtmp1, __ T4S, va0, va1); 6460 // output a0 = a0 + a1 6461 vs_addv(va0, __ T4S, va0, va1); 6462 // output a1 = b montmul c 6463 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq); 6464 } 6465 6466 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6467 // in the Java implementation come in sequences of at least 8, so we 6468 // can use ldpq to collect the corresponding data into pairs of vector 6469 // registers. 6470 // We collect the coefficients corresponding to the 'j+l' indexes into 6471 // the vector registers v0-v7, the zetas into the vector registers v16-v23 6472 // then we do the (Montgomery) multiplications by the zetas in parallel 6473 // into v16-v23, load the coeffs corresponding to the 'j' indexes into 6474 // v0-v7, then do the additions into v24-v31 and the subtractions into 6475 // v0-v7 and finally save the results back to the coeffs array. 6476 void dilithiumNttLevel0_4(const Register dilithiumConsts, 6477 const Register coeffs, const Register zetas) { 6478 int c1 = 0; 6479 int c2 = 512; 6480 int startIncr; 6481 // don't use callee save registers v8 - v15 6482 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6483 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6484 VSeq<2> vq(30); // n.b.
constants overlap vs3 6485 int offsets[4] = { 0, 32, 64, 96 }; 6486 6487 for (int level = 0; level < 5; level++) { 6488 int c1Start = c1; 6489 int c2Start = c2; 6490 if (level == 3) { 6491 offsets[1] = 32; 6492 offsets[2] = 128; 6493 offsets[3] = 160; 6494 } else if (level == 4) { 6495 offsets[1] = 64; 6496 offsets[2] = 128; 6497 offsets[3] = 192; 6498 } 6499 6500 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a 6501 // time at 4 different offsets and multiply them in order by the 6502 // next set of input values. So we employ indexed load and store 6503 // pair instructions with arrangement 4S. 6504 for (int i = 0; i < 4; i++) { 6505 // reload q and qinv 6506 vs_ldpq(vq, dilithiumConsts); // qInv, q 6507 // load 8x4S coefficients via second start pos == c2 6508 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets); 6509 // load next 8x4S inputs == b 6510 vs_ldpq_post(vs2, zetas); 6511 // compute a == c2 * b mod MONT_Q 6512 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6513 // load 8x4s coefficients via first start pos == c1 6514 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6515 // compute a1 = c1 + a 6516 vs_addv(vs3, __ T4S, vs1, vs2); 6517 // compute a2 = c1 - a 6518 vs_subv(vs1, __ T4S, vs1, vs2); 6519 // output a1 and a2 6520 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6521 vs_stpq_indexed(vs1, coeffs, c2Start, offsets); 6522 6523 int k = 4 * level + i; 6524 6525 if (k > 7) { 6526 startIncr = 256; 6527 } else if (k == 5) { 6528 startIncr = 384; 6529 } else { 6530 startIncr = 128; 6531 } 6532 6533 c1Start += startIncr; 6534 c2Start += startIncr; 6535 } 6536 6537 c2 /= 2; 6538 } 6539 } 6540 6541 // Dilithium NTT function except for the final "normalization" to |coeff| < Q. 6542 // Implements the method 6543 // static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {} 6544 // of the sun.security.provider.ML_DSA class 6545 // 6546 // coeffs (int[256]) = c_rarg0 6547 // zetas (int[256]) = c_rarg1 6548 address generate_dilithiumAlmostNtt() { 6549 6550 __ align(CodeEntryAlignment); 6551 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id; 6552 StubCodeMark mark(this, stub_id); 6553 address start = __ pc(); 6554 __ enter(); 6555 6556 const Register coeffs = c_rarg0; 6557 const Register zetas = c_rarg1; 6558 6559 const Register tmpAddr = r9; 6560 const Register dilithiumConsts = r10; 6561 const Register result = r11; 6562 // don't use callee save registers v8 - v15 6563 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6564 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6565 VSeq<2> vq(30); // n.b. constants overlap vs3 6566 int offsets[4] = { 0, 32, 64, 96 }; 6567 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6568 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6569 __ add(result, coeffs, 0); 6570 __ lea(dilithiumConsts, 6571 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6572 6573 // Each level represents one iteration of the outer for loop of the Java version. 6574 6575 // level 0-4 6576 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas); 6577 6578 // level 5 6579 6580 // At level 5 the coefficients we need to combine with the zetas 6581 // are grouped in memory in blocks of size 4. So, for both sets of 6582 // coefficients we load 4 adjacent values at 8 different offsets 6583 // using an indexed ldr with register variant Q and multiply them 6584 // in sequence order by the next set of inputs. Likewise we store 6585 // the results using an indexed str with register variant Q.
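    // Each lane of the dilithium_montmul32 calls below performs a 32-bit
    // Montgomery multiplication. A scalar sketch of the per-element
    // operation, assuming the usual ML-DSA constants q = 8380417 and
    // qinv = q^-1 mod 2^32 = 58728449 (the generated code reads q and qinv
    // from the dilithiumConsts table; the helper name is illustrative only):
    //
    //   static inline int32_t montmul(int32_t a, int32_t b) {
    //     int64_t t = (int64_t)a * b;                              // 64-bit product
    //     int32_t m = (int32_t)((int64_t)(int32_t)t * 58728449);   // t * qinv mod 2^32
    //     return (int32_t)((t - (int64_t)m * 8380417) >> 32);      // (t - m * q) / 2^32
    //   }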
6586 for (int i = 0; i < 1024; i += 256) { 6587 // reload constants q, qinv each iteration as they get clobbered later 6588 vs_ldpq(vq, dilithiumConsts); // qInv, q 6589 // load 32 (8x4S) coefficients via first offsets = c1 6590 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6591 // load next 32 (8x4S) inputs = b 6592 vs_ldpq_post(vs2, zetas); 6593 // a = b montmul c1 6594 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6595 // load 32 (8x4S) coefficients via second offsets = c2 6596 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2); 6597 // add/sub with result of multiply 6598 vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a 6599 vs_subv(vs1, __ T4S, vs1, vs2); // a0 = c2 - a 6600 // write back new coefficients using same offsets 6601 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2); 6602 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1); 6603 } 6604 6605 // level 6 6606 // At level 6 the coefficients we need to combine with the zetas 6607 // are grouped in memory in pairs, the first two being montmul 6608 // inputs and the second add/sub inputs. We can still implement 6609 // the montmul+sub+add using 4-way parallelism but only if we 6610 // combine the coefficients with the zetas 16 at a time. We load 8 6611 // adjacent values at 4 different offsets using an ld2 load with 6612 // arrangement 2D. That interleaves the lower and upper halves of 6613 // each pair of quadwords into successive vector registers. We 6614 // then need to montmul the 4 even elements of the coefficients 6615 // register sequence by the zetas in order and then add/sub the 4 6616 // odd elements of the coefficients register sequence. We use an 6617 // equivalent st2 operation to store the results back into memory 6618 // de-interleaved. 6619 for (int i = 0; i < 1024; i += 128) { 6620 // reload constants q, qinv each iteration as they get clobbered later 6621 vs_ldpq(vq, dilithiumConsts); // qInv, q 6622 // load interleaved 16 (4x2D) coefficients via offsets 6623 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6624 // load next 16 (4x4S) inputs 6625 vs_ldpq_post(vs_front(vs2), zetas); 6626 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6627 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6628 vs_front(vs2), vtmp, vq); 6629 // store interleaved 16 (4x2D) coefficients via offsets 6630 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6631 } 6632 6633 // level 7 6634 // At level 7 the coefficients we need to combine with the zetas 6635 // occur singly with montmul inputs alternating with add/sub 6636 // inputs. Once again we can use 4-way parallelism to combine 16 6637 // zetas at a time. However, we have to load 8 adjacent values at 6638 // 4 different offsets using an ld2 load with arrangement 4S. That 6639 // interleaves the odd words of each pair into one 6640 // coefficients vector register and the even words of the pair 6641 // into the next register. We then need to montmul the 4 even 6642 // elements of the coefficients register sequence by the zetas in 6643 // order and then add/sub the 4 odd elements of the coefficients 6644 // register sequence. We use an equivalent st2 operation to store 6645 // the results back into memory de-interleaved.
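    // In scalar terms each zeta/coefficient pairing in this and the
    // preceding loops is the same Cooley-Tukey style butterfly that the
    // Java loop performs (names illustrative, montmul as sketched above):
    //
    //   int t = montmul(zetas[k], coeffs[j + l]);
    //   coeffs[j + l] = coeffs[j] - t;
    //   coeffs[j]     = coeffs[j] + t;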
6646 6647 for (int i = 0; i < 1024; i += 128) { 6648 // reload constants q, qinv each iteration as they get clobbered later 6649 vs_ldpq(vq, dilithiumConsts); // qInv, q 6650 // load interleaved 16 (4x4S) coefficients via offsets 6651 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6652 // load next 16 (4x4S) inputs 6653 vs_ldpq_post(vs_front(vs2), zetas); 6654 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6655 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6656 vs_front(vs2), vtmp, vq); 6657 // store interleaved 16 (4x4S) coefficients via offsets 6658 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6659 } 6660 __ leave(); // required for proper stackwalking of RuntimeStub frame 6661 __ mov(r0, zr); // return 0 6662 __ ret(lr); 6663 6664 return start; 6665 } 6666 6667 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6668 // in the Java implementation come in sequences of at least 8, so we 6669 // can use ldpq to collect the corresponding data into pairs of vector 6670 // registers. 6671 // We collect the coefficients that correspond to the 'j's into vs1, 6672 // the coefficients that correspond to the 'j+l's into vs2, then 6673 // do the additions into vs3 and the subtractions into vs1, then 6674 // save the result of the additions, load the zetas into vs2, 6675 // do the (Montgomery) multiplications by zeta in parallel into vs2 and 6676 // finally save the results back to the coeffs array. 6677 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts, 6678 const Register coeffs, const Register zetas) { 6679 int c1 = 0; 6680 int c2 = 32; 6681 int startIncr; 6682 int offsets[4]; 6683 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6684 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6685 VSeq<2> vq(30); // n.b. constants overlap vs3 6686 6687 offsets[0] = 0; 6688 6689 for (int level = 3; level < 8; level++) { 6690 int c1Start = c1; 6691 int c2Start = c2; 6692 if (level == 3) { 6693 offsets[1] = 64; 6694 offsets[2] = 128; 6695 offsets[3] = 192; 6696 } else if (level == 4) { 6697 offsets[1] = 32; 6698 offsets[2] = 128; 6699 offsets[3] = 160; 6700 } else { 6701 offsets[1] = 32; 6702 offsets[2] = 64; 6703 offsets[3] = 96; 6704 } 6705 6706 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a 6707 // time at 4 different offsets and multiply them in order by the 6708 // next set of input values. So we employ indexed load and store 6709 // pair instructions with arrangement 4S. 6710 for (int i = 0; i < 4; i++) { 6711 // load v1 32 (8x4S) coefficients relative to first start index 6712 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6713 // load v2 32 (8x4S) coefficients relative to second start index 6714 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets); 6715 // a0 = v1 + v2 -- n.b.
clobbers vq 6716 vs_addv(vs3, __ T4S, vs1, vs2); 6717 // a1 = v1 - v2 6718 vs_subv(vs1, __ T4S, vs1, vs2); 6719 // save a0 relative to first start index 6720 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6721 // load constants q, qinv each iteration as they get clobbered above 6722 vs_ldpq(vq, dilithiumConsts); // qInv, q 6723 // load b next 32 (8x4S) inputs 6724 vs_ldpq_post(vs2, zetas); 6725 // a = a1 montmul b 6726 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6727 // save a relative to second start index 6728 vs_stpq_indexed(vs2, coeffs, c2Start, offsets); 6729 6730 int k = 4 * level + i; 6731 6732 if (k < 24) { 6733 startIncr = 256; 6734 } else if (k == 25) { 6735 startIncr = 384; 6736 } else { 6737 startIncr = 128; 6738 } 6739 6740 c1Start += startIncr; 6741 c2Start += startIncr; 6742 } 6743 6744 c2 *= 2; 6745 } 6746 } 6747 6748 // Dilithium Inverse NTT function except the final mod Q division by 2^256. 6749 // Implements the method 6750 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of 6751 // the sun.security.provider.ML_DSA class. 6752 // 6753 // coeffs (int[256]) = c_rarg0 6754 // zetas (int[256]) = c_rarg1 6755 address generate_dilithiumAlmostInverseNtt() { 6756 6757 __ align(CodeEntryAlignment); 6758 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id; 6759 StubCodeMark mark(this, stub_id); 6760 address start = __ pc(); 6761 __ enter(); 6762 6763 const Register coeffs = c_rarg0; 6764 const Register zetas = c_rarg1; 6765 6766 const Register tmpAddr = r9; 6767 const Register dilithiumConsts = r10; 6768 const Register result = r11; 6769 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6770 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6771 VSeq<2> vq(30); // n.b. constants overlap vs3 6772 int offsets[4] = { 0, 32, 64, 96 }; 6773 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6774 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6775 6776 __ add(result, coeffs, 0); 6777 __ lea(dilithiumConsts, 6778 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6779 6780 // Each level represents one iteration of the outer for loop of the Java version 6781 6782 // level 0 6783 // At level 0 we need to interleave adjacent quartets of 6784 // coefficients before we multiply and add/sub by the next 16 6785 // zetas just as we did for level 7 in the multiply code. So we 6786 // load and store the values using an ld2/st2 with arrangement 4S. 6787 for (int i = 0; i < 1024; i += 128) { 6788 // load constants q, qinv 6789 // n.b. this can be moved out of the loop as they do not get 6790 // clobbered by first two loops 6791 vs_ldpq(vq, dilithiumConsts); // qInv, q 6792 // a0/a1 load interleaved 32 (8x4S) coefficients 6793 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6794 // b load next 32 (8x4S) inputs 6795 vs_ldpq_post(vs_front(vs2), zetas); 6796 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6797 // n.b. second half of vs2 provides temporary register storage 6798 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6799 vs_front(vs2), vs_back(vs2), vtmp, vq); 6800 // a0/a1 store interleaved 32 (8x4S) coefficients 6801 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6802 } 6803 6804 // level 1 6805 // At level 1 we need to interleave pairs of adjacent pairs of 6806 // coefficients before we multiply by the next 16 zetas just as we 6807 // did for level 6 in the multiply code. So we load and store the 6808 // values using an ld2/st2 with arrangement 2D.
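    // As at level 0, in scalar terms each pairing here is a
    // Gentleman-Sande style butterfly: add/sub first, then multiply the
    // difference by the zeta (names illustrative, montmul as sketched above):
    //
    //   int t = coeffs[j];
    //   coeffs[j]     = t + coeffs[j + l];
    //   coeffs[j + l] = montmul(t - coeffs[j + l], zetas[k]);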
6809 for (int i = 0; i < 1024; i += 128) { 6810 // a0/a1 load interleaved 32 (8x2D) coefficients 6811 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6812 // b load next 16 (4x4S) inputs 6813 vs_ldpq_post(vs_front(vs2), zetas); 6814 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6815 // n.b. second half of vs2 provides temporary register storage 6816 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6817 vs_front(vs2), vs_back(vs2), vtmp, vq); 6818 // a0/a1 store interleaved 32 (8x2D) coefficients 6819 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6820 } 6821 6822 // level 2 6823 // At level 2 coefficients come in blocks of 4. So, we load 4 6824 // adjacent coefficients at 8 distinct offsets for both the first 6825 // and second coefficient sequences, using an ldr with register 6826 // variant Q then combine them with next set of 32 zetas. Likewise 6827 // we store the results using an str with register variant Q. 6828 for (int i = 0; i < 1024; i += 256) { 6829 // c0 load 32 (8x4S) coefficients via first offsets 6830 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6831 // c1 load 32 (8x4S) coefficients via second offsets 6832 vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2); 6833 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3 6834 vs_addv(vs3, __ T4S, vs1, vs2); 6835 // c = c0 - c1 6836 vs_subv(vs1, __ T4S, vs1, vs2); 6837 // store a0 32 (8x4S) coefficients via first offsets 6838 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1); 6839 // b load 32 (8x4S) next inputs 6840 vs_ldpq_post(vs2, zetas); 6841 // reload constants q, qinv -- they were clobbered earlier 6842 vs_ldpq(vq, dilithiumConsts); // qInv, q 6843 // compute a1 = b montmul c 6844 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6845 // store a1 32 (8x4S) coefficients via second offsets 6846 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2); 6847 } 6848 6849 // level 3-7 6850 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas); 6851 6852 __ leave(); // required for proper stackwalking of RuntimeStub frame 6853 __ mov(r0, zr); // return 0 6854 __ ret(lr); 6855 6856 return start; 6857 } 6858 6859 // Dilithium multiply polynomials in the NTT domain. 6860 // Straightforward implementation of the method 6861 // static int implDilithiumNttMult( 6862 // int[] result, int[] ntta, int[] nttb) {} of 6863 // the sun.security.provider.ML_DSA class. 6864 // 6865 // result (int[256]) = c_rarg0 6866 // poly1 (int[256]) = c_rarg1 6867 // poly2 (int[256]) = c_rarg2 6868 address generate_dilithiumNttMult() { 6869 6870 __ align(CodeEntryAlignment); 6871 StubId stub_id = StubId::stubgen_dilithiumNttMult_id; 6872 StubCodeMark mark(this, stub_id); 6873 address start = __ pc(); 6874 __ enter(); 6875 6876 Label L_loop; 6877 6878 const Register result = c_rarg0; 6879 const Register poly1 = c_rarg1; 6880 const Register poly2 = c_rarg2; 6881 6882 const Register dilithiumConsts = r10; 6883 const Register len = r11; 6884 6885 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6886 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6887 VSeq<2> vq(30); // n.b.
constants overlap vs3 6888 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE 6889 6890 __ lea(dilithiumConsts, 6891 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6892 6893 // load constants q, qinv 6894 vs_ldpq(vq, dilithiumConsts); // qInv, q 6895 // load constant rSquare into v29 6896 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare 6897 6898 __ mov(len, zr); 6899 __ add(len, len, 1024); 6900 6901 __ BIND(L_loop); 6902 6903 // b load 32 (8x4S) next inputs from poly1 6904 vs_ldpq_post(vs1, poly1); 6905 // c load 32 (8x4S) next inputs from poly2 6906 vs_ldpq_post(vs2, poly2); 6907 // compute a = b montmul c 6908 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6909 // compute a = rsquare montmul a 6910 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq); 6911 // save a 32 (8x4S) results 6912 vs_stpq_post(vs2, result); 6913 6914 __ sub(len, len, 128); 6915 __ cmp(len, (u1)128); 6916 __ br(Assembler::GE, L_loop); 6917 6918 __ leave(); // required for proper stackwalking of RuntimeStub frame 6919 __ mov(r0, zr); // return 0 6920 __ ret(lr); 6921 6922 return start; 6923 } 6924 6925 // Dilithium Montgomery multiply an array by a constant. 6926 // A straightforward implementation of the method 6927 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {} 6928 // of the sun.security.provider.ML_DSA class 6929 // 6930 // coeffs (int[256]) = c_rarg0 6931 // constant (int) = c_rarg1 6932 address generate_dilithiumMontMulByConstant() { 6933 6934 __ align(CodeEntryAlignment); 6935 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id; 6936 StubCodeMark mark(this, stub_id); 6937 address start = __ pc(); 6938 __ enter(); 6939 6940 Label L_loop; 6941 6942 const Register coeffs = c_rarg0; 6943 const Register constant = c_rarg1; 6944 6945 const Register dilithiumConsts = r10; 6946 const Register result = r11; 6947 const Register len = r12; 6948 6949 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6950 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6951 VSeq<2> vq(30); // n.b. constants overlap vs3 6952 VSeq<8> vconst(29, 0); // for montmul by constant 6953 6954 // results track inputs 6955 __ add(result, coeffs, 0); 6956 __ lea(dilithiumConsts, 6957 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6958 6959 // load constants q, qinv -- they do not get clobbered by first two loops 6960 vs_ldpq(vq, dilithiumConsts); // qInv, q 6961 // copy caller supplied constant across vconst 6962 __ dup(vconst[0], __ T4S, constant); 6963 __ mov(len, zr); 6964 __ add(len, len, 1024); 6965 6966 __ BIND(L_loop); 6967 6968 // load next 32 inputs 6969 vs_ldpq_post(vs2, coeffs); 6970 // mont mul by constant 6971 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq); 6972 // write next 32 results 6973 vs_stpq_post(vs2, result); 6974 6975 __ sub(len, len, 128); 6976 __ cmp(len, (u1)128); 6977 __ br(Assembler::GE, L_loop); 6978 6979 __ leave(); // required for proper stackwalking of RuntimeStub frame 6980 __ mov(r0, zr); // return 0 6981 __ ret(lr); 6982 6983 return start; 6984 } 6985 6986 // Dilithium decompose poly.
6987 // Implements the method 6988 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {} 6989 // of the sun.security.provider.ML_DSA class 6990 // 6991 // input (int[256]) = c_rarg0 6992 // lowPart (int[256]) = c_rarg1 6993 // highPart (int[256]) = c_rarg2 6994 // twoGamma2 (int) = c_rarg3 6995 // multiplier (int) = c_rarg4 6996 address generate_dilithiumDecomposePoly() { 6997 6998 __ align(CodeEntryAlignment); 6999 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id; 7000 StubCodeMark mark(this, stub_id); 7001 address start = __ pc(); 7002 Label L_loop; 7003 7004 const Register input = c_rarg0; 7005 const Register lowPart = c_rarg1; 7006 const Register highPart = c_rarg2; 7007 const Register twoGamma2 = c_rarg3; 7008 const Register multiplier = c_rarg4; 7009 7010 const Register len = r9; 7011 const Register dilithiumConsts = r10; 7012 const Register tmp = r11; 7013 7014 // 6 independent sets of 4x4s values 7015 VSeq<4> vs1(0), vs2(4), vs3(8); 7016 VSeq<4> vs4(12), vs5(16), vtmp(20); 7017 7018 // 7 constants for cross-multiplying 7019 VSeq<4> one(25, 0); 7020 VSeq<4> qminus1(26, 0); 7021 VSeq<4> g2(27, 0); 7022 VSeq<4> twog2(28, 0); 7023 VSeq<4> mult(29, 0); 7024 VSeq<4> q(30, 0); 7025 VSeq<4> qadd(31, 0); 7026 7027 __ enter(); 7028 7029 __ lea(dilithiumConsts, 7030 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 7031 7032 // save callee-saved registers 7033 __ stpd(v8, v9, __ pre(sp, -64)); 7034 __ stpd(v10, v11, Address(sp, 16)); 7035 __ stpd(v12, v13, Address(sp, 32)); 7036 __ stpd(v14, v15, Address(sp, 48)); 7037 7038 // populate constant registers 7039 __ mov(tmp, zr); 7040 __ add(tmp, tmp, 1); 7041 __ dup(one[0], __ T4S, tmp); // 1 7042 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q 7043 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce 7044 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2 7045 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce 7046 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1 7047 __ sshr(g2[0], __ T4S, v28, 1); // gamma2 7048 7049 __ mov(len, zr); 7050 __ add(len, len, 1024); 7051 7052 __ BIND(L_loop); 7053 7054 // load next 4x4S inputs interleaved: rplus --> vs1 7055 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64)); 7056 7057 // rplus = rplus - ((rplus + qadd) >> 23) * q 7058 vs_addv(vtmp, __ T4S, vs1, qadd); 7059 vs_sshr(vtmp, __ T4S, vtmp, 23); 7060 vs_mulv(vtmp, __ T4S, vtmp, q); 7061 vs_subv(vs1, __ T4S, vs1, vtmp); 7062 7063 // rplus = rplus + ((rplus >> 31) & dilithium_q); 7064 vs_sshr(vtmp, __ T4S, vs1, 31); 7065 vs_andr(vtmp, vtmp, q); 7066 vs_addv(vs1, __ T4S, vs1, vtmp); 7067 7068 // quotient --> vs2 7069 // int quotient = (rplus * multiplier) >> 22; 7070 vs_mulv(vtmp, __ T4S, vs1, mult); 7071 vs_sshr(vs2, __ T4S, vtmp, 22); 7072 7073 // r0 --> vs3 7074 // int r0 = rplus - quotient * twoGamma2; 7075 vs_mulv(vtmp, __ T4S, vs2, twog2); 7076 vs_subv(vs3, __ T4S, vs1, vtmp); 7077 7078 // mask --> vs4 7079 // int mask = (twoGamma2 - r0) >> 22; 7080 vs_subv(vtmp, __ T4S, twog2, vs3); 7081 vs_sshr(vs4, __ T4S, vtmp, 22); 7082 7083 // r0 -= (mask & twoGamma2); 7084 vs_andr(vtmp, vs4, twog2); 7085 vs_subv(vs3, __ T4S, vs3, vtmp); 7086 7087 // quotient += (mask & 1); 7088 vs_andr(vtmp, vs4, one); 7089 vs_addv(vs2, __ T4S, vs2, vtmp); 7090 7091 // mask = (twoGamma2 / 2 - r0) >> 31; 7092 vs_subv(vtmp, __ T4S, g2, vs3); 7093 vs_sshr(vs4, __ T4S, vtmp, 31); 7094 7095 // r0 -= (mask & twoGamma2); 7096 vs_andr(vtmp, vs4, twog2); 7097 
vs_subv(vs3, __ T4S, vs3, vtmp); 7098 7099 // quotient += (mask & 1); 7100 vs_andr(vtmp, vs4, one); 7101 vs_addv(vs2, __ T4S, vs2, vtmp); 7102 7103 // r1 --> vs5 7104 // int r1 = rplus - r0 - (dilithium_q - 1); 7105 vs_subv(vtmp, __ T4S, vs1, vs3); 7106 vs_subv(vs5, __ T4S, vtmp, qminus1); 7107 7108 // r1 --> vs1 (overwriting rplus) 7109 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 7110 vs_negr(vtmp, __ T4S, vs5); 7111 vs_orr(vtmp, vs5, vtmp); 7112 vs_sshr(vs1, __ T4S, vtmp, 31); 7113 7114 // r0 += ~r1; 7115 vs_notr(vtmp, vs1); 7116 vs_addv(vs3, __ T4S, vs3, vtmp); 7117 7118 // r1 = r1 & quotient; 7119 vs_andr(vs1, vs2, vs1); 7120 7121 // store results inteleaved 7122 // lowPart[m] = r0; 7123 // highPart[m] = r1; 7124 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64)); 7125 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64)); 7126 7127 __ sub(len, len, 64); 7128 __ cmp(len, (u1)64); 7129 __ br(Assembler::GE, L_loop); 7130 7131 // restore callee-saved vector registers 7132 __ ldpd(v14, v15, Address(sp, 48)); 7133 __ ldpd(v12, v13, Address(sp, 32)); 7134 __ ldpd(v10, v11, Address(sp, 16)); 7135 __ ldpd(v8, v9, __ post(sp, 64)); 7136 7137 __ leave(); // required for proper stackwalking of RuntimeStub frame 7138 __ mov(r0, zr); // return 0 7139 __ ret(lr); 7140 7141 return start; 7142 } 7143 7144 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4, 7145 Register tmp0, Register tmp1, Register tmp2) { 7146 __ bic(tmp0, a2, a1); // for a0 7147 __ bic(tmp1, a3, a2); // for a1 7148 __ bic(tmp2, a4, a3); // for a2 7149 __ eor(a2, a2, tmp2); 7150 __ bic(tmp2, a0, a4); // for a3 7151 __ eor(a3, a3, tmp2); 7152 __ bic(tmp2, a1, a0); // for a4 7153 __ eor(a0, a0, tmp0); 7154 __ eor(a1, a1, tmp1); 7155 __ eor(a4, a4, tmp2); 7156 } 7157 7158 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc, 7159 Register a0, Register a1, Register a2, Register a3, Register a4, 7160 Register a5, Register a6, Register a7, Register a8, Register a9, 7161 Register a10, Register a11, Register a12, Register a13, Register a14, 7162 Register a15, Register a16, Register a17, Register a18, Register a19, 7163 Register a20, Register a21, Register a22, Register a23, Register a24, 7164 Register tmp0, Register tmp1, Register tmp2) { 7165 __ eor3(tmp1, a4, a9, a14); 7166 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4 7167 __ eor3(tmp2, a1, a6, a11); 7168 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1 7169 __ rax1(tmp2, tmp0, tmp1); // d0 7170 { 7171 7172 Register tmp3, tmp4; 7173 if (can_use_fp && can_use_r18) { 7174 tmp3 = rfp; 7175 tmp4 = r18_tls; 7176 } else { 7177 tmp3 = a4; 7178 tmp4 = a9; 7179 __ stp(tmp3, tmp4, __ pre(sp, -16)); 7180 } 7181 7182 __ eor3(tmp3, a0, a5, a10); 7183 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0 7184 __ eor(a0, a0, tmp2); 7185 __ eor(a5, a5, tmp2); 7186 __ eor(a10, a10, tmp2); 7187 __ eor(a15, a15, tmp2); 7188 __ eor(a20, a20, tmp2); // d0(tmp2) 7189 __ eor3(tmp3, a2, a7, a12); 7190 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2 7191 __ rax1(tmp3, tmp4, tmp2); // d1 7192 __ eor(a1, a1, tmp3); 7193 __ eor(a6, a6, tmp3); 7194 __ eor(a11, a11, tmp3); 7195 __ eor(a16, a16, tmp3); 7196 __ eor(a21, a21, tmp3); // d1(tmp3) 7197 __ rax1(tmp3, tmp2, tmp0); // d3 7198 __ eor3(tmp2, a3, a8, a13); 7199 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3 7200 __ eor(a3, a3, tmp3); 7201 __ eor(a8, a8, tmp3); 7202 __ eor(a13, 
a13, tmp3); 7203 __ eor(a18, a18, tmp3); 7204 __ eor(a23, a23, tmp3); 7205 __ rax1(tmp2, tmp1, tmp0); // d2 7206 __ eor(a2, a2, tmp2); 7207 __ eor(a7, a7, tmp2); 7208 __ eor(a12, a12, tmp2); 7209 __ rax1(tmp0, tmp0, tmp4); // d4 7210 if (!can_use_fp || !can_use_r18) { 7211 __ ldp(tmp3, tmp4, __ post(sp, 16)); 7212 } 7213 __ eor(a17, a17, tmp2); 7214 __ eor(a22, a22, tmp2); 7215 __ eor(a4, a4, tmp0); 7216 __ eor(a9, a9, tmp0); 7217 __ eor(a14, a14, tmp0); 7218 __ eor(a19, a19, tmp0); 7219 __ eor(a24, a24, tmp0); 7220 } 7221 7222 __ rol(tmp0, a10, 3); 7223 __ rol(a10, a1, 1); 7224 __ rol(a1, a6, 44); 7225 __ rol(a6, a9, 20); 7226 __ rol(a9, a22, 61); 7227 __ rol(a22, a14, 39); 7228 __ rol(a14, a20, 18); 7229 __ rol(a20, a2, 62); 7230 __ rol(a2, a12, 43); 7231 __ rol(a12, a13, 25); 7232 __ rol(a13, a19, 8) ; 7233 __ rol(a19, a23, 56); 7234 __ rol(a23, a15, 41); 7235 __ rol(a15, a4, 27); 7236 __ rol(a4, a24, 14); 7237 __ rol(a24, a21, 2); 7238 __ rol(a21, a8, 55); 7239 __ rol(a8, a16, 45); 7240 __ rol(a16, a5, 36); 7241 __ rol(a5, a3, 28); 7242 __ rol(a3, a18, 21); 7243 __ rol(a18, a17, 15); 7244 __ rol(a17, a11, 10); 7245 __ rol(a11, a7, 6); 7246 __ mov(a7, tmp0); 7247 7248 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2); 7249 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2); 7250 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2); 7251 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2); 7252 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2); 7253 7254 __ ldr(tmp1, __ post(rc, 8)); 7255 __ eor(a0, a0, tmp1); 7256 7257 } 7258 7259 // Arguments: 7260 // 7261 // Inputs: 7262 // c_rarg0 - byte[] source+offset 7263 // c_rarg1 - byte[] SHA.state 7264 // c_rarg2 - int block_size 7265 // c_rarg3 - int offset 7266 // c_rarg4 - int limit 7267 // 7268 address generate_sha3_implCompress_gpr(StubId stub_id) { 7269 bool multi_block; 7270 switch (stub_id) { 7271 case StubId::stubgen_sha3_implCompress_id: 7272 multi_block = false; 7273 break; 7274 case StubId::stubgen_sha3_implCompressMB_id: 7275 multi_block = true; 7276 break; 7277 default: 7278 ShouldNotReachHere(); 7279 } 7280 7281 static const uint64_t round_consts[24] = { 7282 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 7283 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 7284 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 7285 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 7286 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 7287 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 7288 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 7289 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 7290 }; 7291 7292 __ align(CodeEntryAlignment); 7293 StubCodeMark mark(this, stub_id); 7294 address start = __ pc(); 7295 7296 Register buf = c_rarg0; 7297 Register state = c_rarg1; 7298 Register block_size = c_rarg2; 7299 Register ofs = c_rarg3; 7300 Register limit = c_rarg4; 7301 7302 // use r3.r17,r19..r28 to keep a0..a24. 
7303 // a0..a24 are respective locals from SHA3.java 7304 Register a0 = r25, 7305 a1 = r26, 7306 a2 = r27, 7307 a3 = r3, 7308 a4 = r4, 7309 a5 = r5, 7310 a6 = r6, 7311 a7 = r7, 7312 a8 = rscratch1, // r8 7313 a9 = rscratch2, // r9 7314 a10 = r10, 7315 a11 = r11, 7316 a12 = r12, 7317 a13 = r13, 7318 a14 = r14, 7319 a15 = r15, 7320 a16 = r16, 7321 a17 = r17, 7322 a18 = r28, 7323 a19 = r19, 7324 a20 = r20, 7325 a21 = r21, 7326 a22 = r22, 7327 a23 = r23, 7328 a24 = r24; 7329 7330 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30; 7331 7332 Label sha3_loop, rounds24_preloop, loop_body; 7333 Label sha3_512_or_sha3_384, shake128; 7334 7335 bool can_use_r18 = false; 7336 #ifndef R18_RESERVED 7337 can_use_r18 = true; 7338 #endif 7339 bool can_use_fp = !PreserveFramePointer; 7340 7341 __ enter(); 7342 7343 // save almost all yet unsaved gpr registers on stack 7344 __ str(block_size, __ pre(sp, -128)); 7345 if (multi_block) { 7346 __ stpw(ofs, limit, Address(sp, 8)); 7347 } 7348 // 8 bytes at sp+16 will be used to keep buf 7349 __ stp(r19, r20, Address(sp, 32)); 7350 __ stp(r21, r22, Address(sp, 48)); 7351 __ stp(r23, r24, Address(sp, 64)); 7352 __ stp(r25, r26, Address(sp, 80)); 7353 __ stp(r27, r28, Address(sp, 96)); 7354 if (can_use_r18 && can_use_fp) { 7355 __ stp(r18_tls, state, Address(sp, 112)); 7356 } else { 7357 __ str(state, Address(sp, 112)); 7358 } 7359 7360 // begin sha3 calculations: loading a0..a24 from state array 7361 __ ldp(a0, a1, state); 7362 __ ldp(a2, a3, Address(state, 16)); 7363 __ ldp(a4, a5, Address(state, 32)); 7364 __ ldp(a6, a7, Address(state, 48)); 7365 __ ldp(a8, a9, Address(state, 64)); 7366 __ ldp(a10, a11, Address(state, 80)); 7367 __ ldp(a12, a13, Address(state, 96)); 7368 __ ldp(a14, a15, Address(state, 112)); 7369 __ ldp(a16, a17, Address(state, 128)); 7370 __ ldp(a18, a19, Address(state, 144)); 7371 __ ldp(a20, a21, Address(state, 160)); 7372 __ ldp(a22, a23, Address(state, 176)); 7373 __ ldr(a24, Address(state, 192)); 7374 7375 __ BIND(sha3_loop); 7376 7377 // load input 7378 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7379 __ eor(a0, a0, tmp3); 7380 __ eor(a1, a1, tmp2); 7381 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7382 __ eor(a2, a2, tmp3); 7383 __ eor(a3, a3, tmp2); 7384 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7385 __ eor(a4, a4, tmp3); 7386 __ eor(a5, a5, tmp2); 7387 __ ldr(tmp3, __ post(buf, 8)); 7388 __ eor(a6, a6, tmp3); 7389 7390 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 7391 __ tbz(block_size, 7, sha3_512_or_sha3_384); 7392 7393 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7394 __ eor(a7, a7, tmp3); 7395 __ eor(a8, a8, tmp2); 7396 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7397 __ eor(a9, a9, tmp3); 7398 __ eor(a10, a10, tmp2); 7399 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7400 __ eor(a11, a11, tmp3); 7401 __ eor(a12, a12, tmp2); 7402 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7403 __ eor(a13, a13, tmp3); 7404 __ eor(a14, a14, tmp2); 7405 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7406 __ eor(a15, a15, tmp3); 7407 __ eor(a16, a16, tmp2); 7408 7409 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 7410 __ andw(tmp2, block_size, 48); 7411 __ cbzw(tmp2, rounds24_preloop); 7412 __ tbnz(block_size, 5, shake128); 7413 // block_size == 144, bit5 == 0, SHA3-224 7414 __ ldr(tmp3, __ post(buf, 8)); 7415 __ eor(a17, a17, tmp3); 7416 __ b(rounds24_preloop); 7417 7418 __ BIND(shake128); 7419 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7420 __ eor(a17, a17, tmp3); 7421 __ eor(a18, a18, tmp2); 7422 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7423 __ eor(a19, a19,
tmp3); 7424 __ eor(a20, a20, tmp2); 7425 __ b(rounds24_preloop); // block_size == 168, SHAKE128 7426 7427 __ BIND(sha3_512_or_sha3_384); 7428 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7429 __ eor(a7, a7, tmp3); 7430 __ eor(a8, a8, tmp2); 7431 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512 7432 7433 // SHA3-384 7434 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7435 __ eor(a9, a9, tmp3); 7436 __ eor(a10, a10, tmp2); 7437 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7438 __ eor(a11, a11, tmp3); 7439 __ eor(a12, a12, tmp2); 7440 7441 __ BIND(rounds24_preloop); 7442 __ fmovs(v0, 24.0); // float loop counter, 7443 __ fmovs(v1, 1.0); // exact representation 7444 7445 __ str(buf, Address(sp, 16)); 7446 __ lea(tmp3, ExternalAddress((address) round_consts)); 7447 7448 __ BIND(loop_body); 7449 keccak_round_gpr(can_use_fp, can_use_r18, tmp3, 7450 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, 7451 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, 7452 tmp0, tmp1, tmp2); 7453 __ fsubs(v0, v0, v1); 7454 __ fcmps(v0, 0.0); 7455 __ br(__ NE, loop_body); 7456 7457 if (multi_block) { 7458 __ ldrw(block_size, sp); // block_size 7459 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit 7460 __ addw(tmp2, tmp2, block_size); 7461 __ cmpw(tmp2, tmp1); 7462 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping 7463 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping 7464 __ br(Assembler::LE, sha3_loop); 7465 __ movw(c_rarg0, tmp2); // return offset 7466 } 7467 if (can_use_fp && can_use_r18) { 7468 __ ldp(r18_tls, state, Address(sp, 112)); 7469 } else { 7470 __ ldr(state, Address(sp, 112)); 7471 } 7472 // save calculated sha3 state 7473 __ stp(a0, a1, Address(state)); 7474 __ stp(a2, a3, Address(state, 16)); 7475 __ stp(a4, a5, Address(state, 32)); 7476 __ stp(a6, a7, Address(state, 48)); 7477 __ stp(a8, a9, Address(state, 64)); 7478 __ stp(a10, a11, Address(state, 80)); 7479 __ stp(a12, a13, Address(state, 96)); 7480 __ stp(a14, a15, Address(state, 112)); 7481 __ stp(a16, a17, Address(state, 128)); 7482 __ stp(a18, a19, Address(state, 144)); 7483 __ stp(a20, a21, Address(state, 160)); 7484 __ stp(a22, a23, Address(state, 176)); 7485 __ str(a24, Address(state, 192)); 7486 7487 // restore required registers from stack 7488 __ ldp(r19, r20, Address(sp, 32)); 7489 __ ldp(r21, r22, Address(sp, 48)); 7490 __ ldp(r23, r24, Address(sp, 64)); 7491 __ ldp(r25, r26, Address(sp, 80)); 7492 __ ldp(r27, r28, Address(sp, 96)); 7493 if (can_use_fp && can_use_r18) { 7494 __ add(rfp, sp, 128); // leave() will copy rfp to sp below 7495 } // else no need to recalculate rfp, since it wasn't changed 7496 7497 __ leave(); 7498 7499 __ ret(lr); 7500 7501 return start; 7502 } 7503 7504 /** 7505 * Arguments: 7506 * 7507 * Inputs: 7508 * c_rarg0 - int crc 7509 * c_rarg1 - byte* buf 7510 * c_rarg2 - int length 7511 * 7512 * Output: 7513 * r0 - int crc result 7514 */ 7515 address generate_updateBytesCRC32() { 7516 assert(UseCRC32Intrinsics, "what are we doing here?"); 7517 7518 __ align(CodeEntryAlignment); 7519 StubId stub_id = StubId::stubgen_updateBytesCRC32_id; 7520 StubCodeMark mark(this, stub_id); 7521 7522 address start = __ pc(); 7523 7524 const Register crc = c_rarg0; // crc 7525 const Register buf = c_rarg1; // source java byte array address 7526 const Register len = c_rarg2; // length 7527 const Register table0 = c_rarg3; // crc_table address 7528 const Register table1 = c_rarg4; 7529 const Register table2 = c_rarg5; 7530 const Register table3 = c_rarg6; 7531 const Register tmp3 = c_rarg7;
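    // kernel_crc32 produces the same result as a byte-at-a-time,
    // table-driven CRC-32 over the zlib (reflected) polynomial 0xEDB88320.
    // A scalar sketch, ignoring the bit-inversion bookkeeping that the
    // macro assembler routine handles internally (names illustrative only):
    //
    //   uint32_t crc32_update(uint32_t crc, const uint8_t* buf, int len,
    //                         const uint32_t table[256]) {
    //     while (len-- > 0) {
    //       crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
    //     }
    //     return crc;
    //   }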
7532 7533 BLOCK_COMMENT("Entry:"); 7534 __ enter(); // required for proper stackwalking of RuntimeStub frame 7535 7536 __ kernel_crc32(crc, buf, len, 7537 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7538 7539 __ leave(); // required for proper stackwalking of RuntimeStub frame 7540 __ ret(lr); 7541 7542 return start; 7543 } 7544 7545 /** 7546 * Arguments: 7547 * 7548 * Inputs: 7549 * c_rarg0 - int crc 7550 * c_rarg1 - byte* buf 7551 * c_rarg2 - int length 7552 * c_rarg3 - int* table 7553 * 7554 * Output: 7555 * r0 - int crc result 7556 */ 7557 address generate_updateBytesCRC32C() { 7558 assert(UseCRC32CIntrinsics, "what are we doing here?"); 7559 7560 __ align(CodeEntryAlignment); 7561 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id; 7562 StubCodeMark mark(this, stub_id); 7563 7564 address start = __ pc(); 7565 7566 const Register crc = c_rarg0; // crc 7567 const Register buf = c_rarg1; // source java byte array address 7568 const Register len = c_rarg2; // length 7569 const Register table0 = c_rarg3; // crc_table address 7570 const Register table1 = c_rarg4; 7571 const Register table2 = c_rarg5; 7572 const Register table3 = c_rarg6; 7573 const Register tmp3 = c_rarg7; 7574 7575 BLOCK_COMMENT("Entry:"); 7576 __ enter(); // required for proper stackwalking of RuntimeStub frame 7577 7578 __ kernel_crc32c(crc, buf, len, 7579 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7580 7581 __ leave(); // required for proper stackwalking of RuntimeStub frame 7582 __ ret(lr); 7583 7584 return start; 7585 } 7586 7587 /*** 7588 * Arguments: 7589 * 7590 * Inputs: 7591 * c_rarg0 - int adler 7592 * c_rarg1 - byte* buff 7593 * c_rarg2 - int len 7594 * 7595 * Output: 7596 * c_rarg0 - int adler result 7597 */ 7598 address generate_updateBytesAdler32() { 7599 __ align(CodeEntryAlignment); 7600 StubId stub_id = StubId::stubgen_updateBytesAdler32_id; 7601 StubCodeMark mark(this, stub_id); 7602 address start = __ pc(); 7603 7604 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 7605 7606 // Aliases 7607 Register adler = c_rarg0; 7608 Register s1 = c_rarg0; 7609 Register s2 = c_rarg3; 7610 Register buff = c_rarg1; 7611 Register len = c_rarg2; 7612 Register nmax = r4; 7613 Register base = r5; 7614 Register count = r6; 7615 Register temp0 = rscratch1; 7616 Register temp1 = rscratch2; 7617 FloatRegister vbytes = v0; 7618 FloatRegister vs1acc = v1; 7619 FloatRegister vs2acc = v2; 7620 FloatRegister vtable = v3; 7621 7622 // Max number of bytes we can process before having to take the mod 7623 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 7624 uint64_t BASE = 0xfff1; 7625 uint64_t NMAX = 0x15B0; 7626 7627 __ mov(base, BASE); 7628 __ mov(nmax, NMAX); 7629 7630 // Load accumulation coefficients for the upper 16 bits 7631 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 7632 __ ld1(vtable, __ T16B, Address(temp0)); 7633 7634 // s1 is initialized to the lower 16 bits of adler 7635 // s2 is initialized to the upper 16 bits of adler 7636 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 7637 __ uxth(s1, adler); // s1 = (adler & 0xffff) 7638 7639 // The pipelined loop needs at least 16 elements for 1 iteration 7640 // It does check this, but it is more effective to skip to the cleanup loop 7641 __ cmp(len, (u1)16); 7642 __ br(Assembler::HS, L_nmax); 7643 __ cbz(len, L_combine); 7644 7645 __ bind(L_simple_by1_loop); 7646 __ ldrb(temp0, Address(__ 
post(buff, 1))); 7647 __ add(s1, s1, temp0); 7648 __ add(s2, s2, s1); 7649 __ subs(len, len, 1); 7650 __ br(Assembler::HI, L_simple_by1_loop); 7651 7652 // s1 = s1 % BASE 7653 __ subs(temp0, s1, base); 7654 __ csel(s1, temp0, s1, Assembler::HS); 7655 7656 // s2 = s2 % BASE 7657 __ lsr(temp0, s2, 16); 7658 __ lsl(temp1, temp0, 4); 7659 __ sub(temp1, temp1, temp0); 7660 __ add(s2, temp1, s2, ext::uxth); 7661 7662 __ subs(temp0, s2, base); 7663 __ csel(s2, temp0, s2, Assembler::HS); 7664 7665 __ b(L_combine); 7666 7667 __ bind(L_nmax); 7668 __ subs(len, len, nmax); 7669 __ sub(count, nmax, 16); 7670 __ br(Assembler::LO, L_by16); 7671 7672 __ bind(L_nmax_loop); 7673 7674 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7675 vbytes, vs1acc, vs2acc, vtable); 7676 7677 __ subs(count, count, 16); 7678 __ br(Assembler::HS, L_nmax_loop); 7679 7680 // s1 = s1 % BASE 7681 __ lsr(temp0, s1, 16); 7682 __ lsl(temp1, temp0, 4); 7683 __ sub(temp1, temp1, temp0); 7684 __ add(temp1, temp1, s1, ext::uxth); 7685 7686 __ lsr(temp0, temp1, 16); 7687 __ lsl(s1, temp0, 4); 7688 __ sub(s1, s1, temp0); 7689 __ add(s1, s1, temp1, ext:: uxth); 7690 7691 __ subs(temp0, s1, base); 7692 __ csel(s1, temp0, s1, Assembler::HS); 7693 7694 // s2 = s2 % BASE 7695 __ lsr(temp0, s2, 16); 7696 __ lsl(temp1, temp0, 4); 7697 __ sub(temp1, temp1, temp0); 7698 __ add(temp1, temp1, s2, ext::uxth); 7699 7700 __ lsr(temp0, temp1, 16); 7701 __ lsl(s2, temp0, 4); 7702 __ sub(s2, s2, temp0); 7703 __ add(s2, s2, temp1, ext:: uxth); 7704 7705 __ subs(temp0, s2, base); 7706 __ csel(s2, temp0, s2, Assembler::HS); 7707 7708 __ subs(len, len, nmax); 7709 __ sub(count, nmax, 16); 7710 __ br(Assembler::HS, L_nmax_loop); 7711 7712 __ bind(L_by16); 7713 __ adds(len, len, count); 7714 __ br(Assembler::LO, L_by1); 7715 7716 __ bind(L_by16_loop); 7717 7718 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7719 vbytes, vs1acc, vs2acc, vtable); 7720 7721 __ subs(len, len, 16); 7722 __ br(Assembler::HS, L_by16_loop); 7723 7724 __ bind(L_by1); 7725 __ adds(len, len, 15); 7726 __ br(Assembler::LO, L_do_mod); 7727 7728 __ bind(L_by1_loop); 7729 __ ldrb(temp0, Address(__ post(buff, 1))); 7730 __ add(s1, temp0, s1); 7731 __ add(s2, s2, s1); 7732 __ subs(len, len, 1); 7733 __ br(Assembler::HS, L_by1_loop); 7734 7735 __ bind(L_do_mod); 7736 // s1 = s1 % BASE 7737 __ lsr(temp0, s1, 16); 7738 __ lsl(temp1, temp0, 4); 7739 __ sub(temp1, temp1, temp0); 7740 __ add(temp1, temp1, s1, ext::uxth); 7741 7742 __ lsr(temp0, temp1, 16); 7743 __ lsl(s1, temp0, 4); 7744 __ sub(s1, s1, temp0); 7745 __ add(s1, s1, temp1, ext:: uxth); 7746 7747 __ subs(temp0, s1, base); 7748 __ csel(s1, temp0, s1, Assembler::HS); 7749 7750 // s2 = s2 % BASE 7751 __ lsr(temp0, s2, 16); 7752 __ lsl(temp1, temp0, 4); 7753 __ sub(temp1, temp1, temp0); 7754 __ add(temp1, temp1, s2, ext::uxth); 7755 7756 __ lsr(temp0, temp1, 16); 7757 __ lsl(s2, temp0, 4); 7758 __ sub(s2, s2, temp0); 7759 __ add(s2, s2, temp1, ext:: uxth); 7760 7761 __ subs(temp0, s2, base); 7762 __ csel(s2, temp0, s2, Assembler::HS); 7763 7764 // Combine lower bits and higher bits 7765 __ bind(L_combine); 7766 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 7767 7768 __ ret(lr); 7769 7770 return start; 7771 } 7772 7773 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 7774 Register temp0, Register temp1, FloatRegister vbytes, 7775 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 7776 // Below is a vectorized implementation of updating s1 and 
s2 for 16 bytes. 7777 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 7778 // In non-vectorized code, we update s1 and s2 as: 7779 // s1 <- s1 + b1 7780 // s2 <- s2 + s1 7781 // s1 <- s1 + b2 7782 // s2 <- s2 + s1 7783 // ... 7784 // s1 <- s1 + b16 7785 // s2 <- s2 + s1 7786 // Putting above assignments together, we have: 7787 // s1_new = s1 + b1 + b2 + ... + b16 7788 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 7789 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 7790 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 7791 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 7792 7793 // s2 = s2 + s1 * 16 7794 __ add(s2, s2, s1, Assembler::LSL, 4); 7795 7796 // vs1acc = b1 + b2 + b3 + ... + b16 7797 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1) 7798 __ umullv(vs2acc, __ T8B, vtable, vbytes); 7799 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 7800 __ uaddlv(vs1acc, __ T16B, vbytes); 7801 __ uaddlv(vs2acc, __ T8H, vs2acc); 7802 7803 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 7804 __ fmovd(temp0, vs1acc); 7805 __ fmovd(temp1, vs2acc); 7806 __ add(s1, s1, temp0); 7807 __ add(s2, s2, temp1); 7808 } 7809 7810 /** 7811 * Arguments: 7812 * 7813 * Input: 7814 * c_rarg0 - x address 7815 * c_rarg1 - x length 7816 * c_rarg2 - y address 7817 * c_rarg3 - y length 7818 * c_rarg4 - z address 7819 */ 7820 address generate_multiplyToLen() { 7821 __ align(CodeEntryAlignment); 7822 StubId stub_id = StubId::stubgen_multiplyToLen_id; 7823 StubCodeMark mark(this, stub_id); 7824 7825 address start = __ pc(); 7826 7827 if (AOTCodeCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) { 7828 return start; 7829 } 7830 const Register x = r0; 7831 const Register xlen = r1; 7832 const Register y = r2; 7833 const Register ylen = r3; 7834 const Register z = r4; 7835 7836 const Register tmp0 = r5; 7837 const Register tmp1 = r10; 7838 const Register tmp2 = r11; 7839 const Register tmp3 = r12; 7840 const Register tmp4 = r13; 7841 const Register tmp5 = r14; 7842 const Register tmp6 = r15; 7843 const Register tmp7 = r16; 7844 7845 BLOCK_COMMENT("Entry:"); 7846 __ enter(); // required for proper stackwalking of RuntimeStub frame 7847 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7848 __ leave(); // required for proper stackwalking of RuntimeStub frame 7849 __ ret(lr); 7850 7851 AOTCodeCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start); 7852 return start; 7853 } 7854 7855 address generate_squareToLen() { 7856 // squareToLen algorithm for sizes 1..127 described in java code works 7857 // faster than multiply_to_len on some CPUs and slower on others, but 7858 // multiply_to_len shows a bit better overall results 7859 __ align(CodeEntryAlignment); 7860 StubId stub_id = StubId::stubgen_squareToLen_id; 7861 StubCodeMark mark(this, stub_id); 7862 address start = __ pc(); 7863 7864 if (AOTCodeCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) { 7865 return start; 7866 } 7867 const Register x = r0; 7868 const Register xlen = r1; 7869 const Register z = r2; 7870 const Register y = r4; // == x 7871 const Register ylen = r5; // == xlen 7872 7873 const Register tmp0 = r3; 7874 const Register tmp1 = r10; 7875 const Register tmp2 = r11; 7876 const Register tmp3 = r12; 7877 const Register tmp4 = r13; 7878 const Register tmp5 = r14; 7879 const Register tmp6 = r15; 7880 const Register tmp7 = r16; 7881 7882 RegSet spilled_regs = RegSet::of(y,
ylen); 7883 BLOCK_COMMENT("Entry:"); 7884 __ enter(); 7885 __ push(spilled_regs, sp); 7886 __ mov(y, x); 7887 __ mov(ylen, xlen); 7888 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7889 __ pop(spilled_regs, sp); 7890 __ leave(); 7891 __ ret(lr); 7892 7893 AOTCodeCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start); 7894 return start; 7895 } 7896 7897 address generate_mulAdd() { 7898 __ align(CodeEntryAlignment); 7899 StubId stub_id = StubId::stubgen_mulAdd_id; 7900 StubCodeMark mark(this, stub_id); 7901 7902 address start = __ pc(); 7903 7904 if (AOTCodeCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) { 7905 return start; 7906 } 7907 const Register out = r0; 7908 const Register in = r1; 7909 const Register offset = r2; 7910 const Register len = r3; 7911 const Register k = r4; 7912 7913 BLOCK_COMMENT("Entry:"); 7914 __ enter(); 7915 __ mul_add(out, in, offset, len, k); 7916 __ leave(); 7917 __ ret(lr); 7918 7919 AOTCodeCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start); 7920 return start; 7921 } 7922 7923 // Arguments: 7924 // 7925 // Input: 7926 // c_rarg0 - newArr address 7927 // c_rarg1 - oldArr address 7928 // c_rarg2 - newIdx 7929 // c_rarg3 - shiftCount 7930 // c_rarg4 - numIter 7931 // 7932 address generate_bigIntegerRightShift() { 7933 __ align(CodeEntryAlignment); 7934 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id; 7935 StubCodeMark mark(this, stub_id); 7936 address start = __ pc(); 7937 7938 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7939 7940 Register newArr = c_rarg0; 7941 Register oldArr = c_rarg1; 7942 Register newIdx = c_rarg2; 7943 Register shiftCount = c_rarg3; 7944 Register numIter = c_rarg4; 7945 Register idx = numIter; 7946 7947 Register newArrCur = rscratch1; 7948 Register shiftRevCount = rscratch2; 7949 Register oldArrCur = r13; 7950 Register oldArrNext = r14; 7951 7952 FloatRegister oldElem0 = v0; 7953 FloatRegister oldElem1 = v1; 7954 FloatRegister newElem = v2; 7955 FloatRegister shiftVCount = v3; 7956 FloatRegister shiftVRevCount = v4; 7957 7958 __ cbz(idx, Exit); 7959 7960 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7961 7962 // left shift count 7963 __ movw(shiftRevCount, 32); 7964 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7965 7966 // numIter too small to allow a 4-words SIMD loop, rolling back 7967 __ cmp(numIter, (u1)4); 7968 __ br(Assembler::LT, ShiftThree); 7969 7970 __ dup(shiftVCount, __ T4S, shiftCount); 7971 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7972 __ negr(shiftVCount, __ T4S, shiftVCount); 7973 7974 __ BIND(ShiftSIMDLoop); 7975 7976 // Calculate the load addresses 7977 __ sub(idx, idx, 4); 7978 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7979 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7980 __ add(oldArrCur, oldArrNext, 4); 7981 7982 // Load 4 words and process 7983 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 7984 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 7985 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7986 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7987 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7988 __ st1(newElem, __ T4S, Address(newArrCur)); 7989 7990 __ cmp(idx, (u1)4); 7991 __ br(Assembler::LT, ShiftTwoLoop); 7992 __ b(ShiftSIMDLoop); 7993 7994 __ BIND(ShiftTwoLoop); 7995 __ cbz(idx, Exit); 7996 __ cmp(idx, (u1)1); 7997 __ br(Assembler::EQ, ShiftOne); 7998 7999 // Calculate the load addresses 8000 __ sub(idx, idx, 2); 8001 __ add(oldArrNext, 
oldArr, idx, Assembler::LSL, 2); 8002 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 8003 __ add(oldArrCur, oldArrNext, 4); 8004 8005 // Load 2 words and process 8006 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 8007 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 8008 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 8009 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 8010 __ orr(newElem, __ T8B, oldElem0, oldElem1); 8011 __ st1(newElem, __ T2S, Address(newArrCur)); 8012 __ b(ShiftTwoLoop); 8013 8014 __ BIND(ShiftThree); 8015 __ tbz(idx, 1, ShiftOne); 8016 __ tbz(idx, 0, ShiftTwo); 8017 __ ldrw(r10, Address(oldArr, 12)); 8018 __ ldrw(r11, Address(oldArr, 8)); 8019 __ lsrvw(r10, r10, shiftCount); 8020 __ lslvw(r11, r11, shiftRevCount); 8021 __ orrw(r12, r10, r11); 8022 __ strw(r12, Address(newArr, 8)); 8023 8024 __ BIND(ShiftTwo); 8025 __ ldrw(r10, Address(oldArr, 8)); 8026 __ ldrw(r11, Address(oldArr, 4)); 8027 __ lsrvw(r10, r10, shiftCount); 8028 __ lslvw(r11, r11, shiftRevCount); 8029 __ orrw(r12, r10, r11); 8030 __ strw(r12, Address(newArr, 4)); 8031 8032 __ BIND(ShiftOne); 8033 __ ldrw(r10, Address(oldArr, 4)); 8034 __ ldrw(r11, Address(oldArr)); 8035 __ lsrvw(r10, r10, shiftCount); 8036 __ lslvw(r11, r11, shiftRevCount); 8037 __ orrw(r12, r10, r11); 8038 __ strw(r12, Address(newArr)); 8039 8040 __ BIND(Exit); 8041 __ ret(lr); 8042 8043 return start; 8044 } 8045 8046 // Arguments: 8047 // 8048 // Input: 8049 // c_rarg0 - newArr address 8050 // c_rarg1 - oldArr address 8051 // c_rarg2 - newIdx 8052 // c_rarg3 - shiftCount 8053 // c_rarg4 - numIter 8054 // 8055 address generate_bigIntegerLeftShift() { 8056 __ align(CodeEntryAlignment); 8057 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id; 8058 StubCodeMark mark(this, stub_id); 8059 address start = __ pc(); 8060 8061 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 8062 8063 Register newArr = c_rarg0; 8064 Register oldArr = c_rarg1; 8065 Register newIdx = c_rarg2; 8066 Register shiftCount = c_rarg3; 8067 Register numIter = c_rarg4; 8068 8069 Register shiftRevCount = rscratch1; 8070 Register oldArrNext = rscratch2; 8071 8072 FloatRegister oldElem0 = v0; 8073 FloatRegister oldElem1 = v1; 8074 FloatRegister newElem = v2; 8075 FloatRegister shiftVCount = v3; 8076 FloatRegister shiftVRevCount = v4; 8077 8078 __ cbz(numIter, Exit); 8079 8080 __ add(oldArrNext, oldArr, 4); 8081 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 8082 8083 // right shift count 8084 __ movw(shiftRevCount, 32); 8085 __ subw(shiftRevCount, shiftRevCount, shiftCount); 8086 8087 // numIter too small to allow a 4-words SIMD loop, rolling back 8088 __ cmp(numIter, (u1)4); 8089 __ br(Assembler::LT, ShiftThree); 8090 8091 __ dup(shiftVCount, __ T4S, shiftCount); 8092 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 8093 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 8094 8095 __ BIND(ShiftSIMDLoop); 8096 8097 // load 4 words and process 8098 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 8099 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 8100 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 8101 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 8102 __ orr(newElem, __ T16B, oldElem0, oldElem1); 8103 __ st1(newElem, __ T4S, __ post(newArr, 16)); 8104 __ sub(numIter, numIter, 4); 8105 8106 __ cmp(numIter, (u1)4); 8107 __ br(Assembler::LT, ShiftTwoLoop); 8108 __ b(ShiftSIMDLoop); 8109 8110 __ BIND(ShiftTwoLoop); 8111 __ cbz(numIter, Exit); 8112 __ cmp(numIter, (u1)1); 8113 __ br(Assembler::EQ, ShiftOne); 8114 8115 // 
load 2 words and process 8116 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 8117 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 8118 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 8119 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 8120 __ orr(newElem, __ T8B, oldElem0, oldElem1); 8121 __ st1(newElem, __ T2S, __ post(newArr, 8)); 8122 __ sub(numIter, numIter, 2); 8123 __ b(ShiftTwoLoop); 8124 8125 __ BIND(ShiftThree); 8126 __ ldrw(r10, __ post(oldArr, 4)); 8127 __ ldrw(r11, __ post(oldArrNext, 4)); 8128 __ lslvw(r10, r10, shiftCount); 8129 __ lsrvw(r11, r11, shiftRevCount); 8130 __ orrw(r12, r10, r11); 8131 __ strw(r12, __ post(newArr, 4)); 8132 __ tbz(numIter, 1, Exit); 8133 __ tbz(numIter, 0, ShiftOne); 8134 8135 __ BIND(ShiftTwo); 8136 __ ldrw(r10, __ post(oldArr, 4)); 8137 __ ldrw(r11, __ post(oldArrNext, 4)); 8138 __ lslvw(r10, r10, shiftCount); 8139 __ lsrvw(r11, r11, shiftRevCount); 8140 __ orrw(r12, r10, r11); 8141 __ strw(r12, __ post(newArr, 4)); 8142 8143 __ BIND(ShiftOne); 8144 __ ldrw(r10, Address(oldArr)); 8145 __ ldrw(r11, Address(oldArrNext)); 8146 __ lslvw(r10, r10, shiftCount); 8147 __ lsrvw(r11, r11, shiftRevCount); 8148 __ orrw(r12, r10, r11); 8149 __ strw(r12, Address(newArr)); 8150 8151 __ BIND(Exit); 8152 __ ret(lr); 8153 8154 return start; 8155 } 8156 8157 address generate_count_positives(address &count_positives_long) { 8158 const u1 large_loop_size = 64; 8159 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 8160 int dcache_line = VM_Version::dcache_line_size(); 8161 8162 Register ary1 = r1, len = r2, result = r0; 8163 8164 __ align(CodeEntryAlignment); 8165 8166 StubId stub_id = StubId::stubgen_count_positives_id; 8167 StubCodeMark mark(this, stub_id); 8168 8169 address entry = __ pc(); 8170 8171 __ enter(); 8172 // precondition: a copy of len is already in result 8173 // __ mov(result, len); 8174 8175 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 8176 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 8177 8178 __ cmp(len, (u1)15); 8179 __ br(Assembler::GT, LEN_OVER_15); 8180 // The only case when execution falls into this code is when pointer is near 8181 // the end of memory page and we have to avoid reading next page 8182 __ add(ary1, ary1, len); 8183 __ subs(len, len, 8); 8184 __ br(Assembler::GT, LEN_OVER_8); 8185 __ ldr(rscratch2, Address(ary1, -8)); 8186 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
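    // A rough scalar sketch of this short path (illustrative only, not the emitted
    // code): at this point len holds the original length n minus 8 (so len <= 0) and
    // ary1 points just past the last byte. The ldr above fetched the 8 bytes ending
    // at the array end; the low (8 - n) bytes of that word were loaded from memory
    // just before the array data, and the variable right shift below discards them
    // so that only real array bytes are tested against the sign-bit mask:
    //
    //   uint64_t w = *(const uint64_t*)(ary + n - 8);  // n = original length, n <= 8
    //   w >>= (8 - n) * 8;                             // drop bytes from before the array
    //   return (w & 0x8080808080808080ULL) ? 0 : n;    // any sign bit set => 0 positives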
8187 __ lsrv(rscratch2, rscratch2, rscratch1); 8188 __ tst(rscratch2, UPPER_BIT_MASK); 8189 __ csel(result, zr, result, Assembler::NE); 8190 __ leave(); 8191 __ ret(lr); 8192 __ bind(LEN_OVER_8); 8193 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 8194 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 8195 __ tst(rscratch2, UPPER_BIT_MASK); 8196 __ br(Assembler::NE, RET_NO_POP); 8197 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 8198 __ lsrv(rscratch1, rscratch1, rscratch2); 8199 __ tst(rscratch1, UPPER_BIT_MASK); 8200 __ bind(RET_NO_POP); 8201 __ csel(result, zr, result, Assembler::NE); 8202 __ leave(); 8203 __ ret(lr); 8204 8205 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 8206 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 8207 8208 count_positives_long = __ pc(); // 2nd entry point 8209 8210 __ enter(); 8211 8212 __ bind(LEN_OVER_15); 8213 __ push(spilled_regs, sp); 8214 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 8215 __ cbz(rscratch2, ALIGNED); 8216 __ ldp(tmp6, tmp1, Address(ary1)); 8217 __ mov(tmp5, 16); 8218 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 8219 __ add(ary1, ary1, rscratch1); 8220 __ orr(tmp6, tmp6, tmp1); 8221 __ tst(tmp6, UPPER_BIT_MASK); 8222 __ br(Assembler::NE, RET_ADJUST); 8223 __ sub(len, len, rscratch1); 8224 8225 __ bind(ALIGNED); 8226 __ cmp(len, large_loop_size); 8227 __ br(Assembler::LT, CHECK_16); 8228 // Perform 16-byte load as early return in pre-loop to handle situation 8229 // when initially aligned large array has negative values at starting bytes, 8230 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 8231 // slower. Cases with negative bytes further ahead won't be affected that 8232 // much. In fact, it'll be faster due to early loads, less instructions and 8233 // less branches in LARGE_LOOP. 8234 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 8235 __ sub(len, len, 16); 8236 __ orr(tmp6, tmp6, tmp1); 8237 __ tst(tmp6, UPPER_BIT_MASK); 8238 __ br(Assembler::NE, RET_ADJUST_16); 8239 __ cmp(len, large_loop_size); 8240 __ br(Assembler::LT, CHECK_16); 8241 8242 if (SoftwarePrefetchHintDistance >= 0 8243 && SoftwarePrefetchHintDistance >= dcache_line) { 8244 // initial prefetch 8245 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 8246 } 8247 __ bind(LARGE_LOOP); 8248 if (SoftwarePrefetchHintDistance >= 0) { 8249 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 8250 } 8251 // Issue load instructions first, since it can save few CPU/MEM cycles, also 8252 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 8253 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 8254 // instructions per cycle and have less branches, but this approach disables 8255 // early return, thus, all 64 bytes are loaded and checked every time. 
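    // For reference, each LARGE_LOOP iteration below is roughly equivalent to the
    // following scalar sketch (illustrative only): OR together all eight 64-bit
    // words of the 64-byte block and test the combined sign bits once.
    //
    //   uint64_t acc = 0;
    //   for (int i = 0; i < 8; i++) {
    //     acc |= ((const uint64_t*)ary1)[i];                   // 8 words == 64 bytes
    //   }
    //   ary1 += 64; len -= 64;
    //   if (acc & 0x8080808080808080ULL) goto RET_ADJUST_LONG; // a negative byte was seen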
8256 __ ldp(tmp2, tmp3, Address(ary1)); 8257 __ ldp(tmp4, tmp5, Address(ary1, 16)); 8258 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 8259 __ ldp(tmp6, tmp1, Address(ary1, 48)); 8260 __ add(ary1, ary1, large_loop_size); 8261 __ sub(len, len, large_loop_size); 8262 __ orr(tmp2, tmp2, tmp3); 8263 __ orr(tmp4, tmp4, tmp5); 8264 __ orr(rscratch1, rscratch1, rscratch2); 8265 __ orr(tmp6, tmp6, tmp1); 8266 __ orr(tmp2, tmp2, tmp4); 8267 __ orr(rscratch1, rscratch1, tmp6); 8268 __ orr(tmp2, tmp2, rscratch1); 8269 __ tst(tmp2, UPPER_BIT_MASK); 8270 __ br(Assembler::NE, RET_ADJUST_LONG); 8271 __ cmp(len, large_loop_size); 8272 __ br(Assembler::GE, LARGE_LOOP); 8273 8274 __ bind(CHECK_16); // small 16-byte load pre-loop 8275 __ cmp(len, (u1)16); 8276 __ br(Assembler::LT, POST_LOOP16); 8277 8278 __ bind(LOOP16); // small 16-byte load loop 8279 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 8280 __ sub(len, len, 16); 8281 __ orr(tmp2, tmp2, tmp3); 8282 __ tst(tmp2, UPPER_BIT_MASK); 8283 __ br(Assembler::NE, RET_ADJUST_16); 8284 __ cmp(len, (u1)16); 8285 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 8286 8287 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 8288 __ cmp(len, (u1)8); 8289 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 8290 __ ldr(tmp3, Address(__ post(ary1, 8))); 8291 __ tst(tmp3, UPPER_BIT_MASK); 8292 __ br(Assembler::NE, RET_ADJUST); 8293 __ sub(len, len, 8); 8294 8295 __ bind(POST_LOOP16_LOAD_TAIL); 8296 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 8297 __ ldr(tmp1, Address(ary1)); 8298 __ mov(tmp2, 64); 8299 __ sub(tmp4, tmp2, len, __ LSL, 3); 8300 __ lslv(tmp1, tmp1, tmp4); 8301 __ tst(tmp1, UPPER_BIT_MASK); 8302 __ br(Assembler::NE, RET_ADJUST); 8303 // Fallthrough 8304 8305 __ bind(RET_LEN); 8306 __ pop(spilled_regs, sp); 8307 __ leave(); 8308 __ ret(lr); 8309 8310 // difference result - len is the count of guaranteed to be 8311 // positive bytes 8312 8313 __ bind(RET_ADJUST_LONG); 8314 __ add(len, len, (u1)(large_loop_size - 16)); 8315 __ bind(RET_ADJUST_16); 8316 __ add(len, len, 16); 8317 __ bind(RET_ADJUST); 8318 __ pop(spilled_regs, sp); 8319 __ leave(); 8320 __ sub(result, result, len); 8321 __ ret(lr); 8322 8323 return entry; 8324 } 8325 8326 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 8327 bool usePrefetch, Label &NOT_EQUAL) { 8328 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8329 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 8330 tmp7 = r12, tmp8 = r13; 8331 Label LOOP; 8332 8333 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8334 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8335 __ bind(LOOP); 8336 if (usePrefetch) { 8337 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 8338 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 8339 } 8340 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 8341 __ eor(tmp1, tmp1, tmp2); 8342 __ eor(tmp3, tmp3, tmp4); 8343 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 8344 __ orr(tmp1, tmp1, tmp3); 8345 __ cbnz(tmp1, NOT_EQUAL); 8346 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8347 __ eor(tmp5, tmp5, tmp6); 8348 __ eor(tmp7, tmp7, tmp8); 8349 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8350 __ orr(tmp5, tmp5, tmp7); 8351 __ cbnz(tmp5, NOT_EQUAL); 8352 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 8353 __ eor(tmp1, tmp1, tmp2); 8354 __ eor(tmp3, tmp3, tmp4); 8355 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 8356 __ orr(tmp1, tmp1, tmp3); 8357 __ 
cbnz(tmp1, NOT_EQUAL); 8358 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8359 __ eor(tmp5, tmp5, tmp6); 8360 __ sub(cnt1, cnt1, 8 * wordSize); 8361 __ eor(tmp7, tmp7, tmp8); 8362 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8363 // tmp6 is not used. MacroAssembler::subs is used here (rather than 8364 // cmp) because subs allows an unlimited range of immediate operand. 8365 __ subs(tmp6, cnt1, loopThreshold); 8366 __ orr(tmp5, tmp5, tmp7); 8367 __ cbnz(tmp5, NOT_EQUAL); 8368 __ br(__ GE, LOOP); 8369 // post-loop 8370 __ eor(tmp1, tmp1, tmp2); 8371 __ eor(tmp3, tmp3, tmp4); 8372 __ orr(tmp1, tmp1, tmp3); 8373 __ sub(cnt1, cnt1, 2 * wordSize); 8374 __ cbnz(tmp1, NOT_EQUAL); 8375 } 8376 8377 void generate_large_array_equals_loop_simd(int loopThreshold, 8378 bool usePrefetch, Label &NOT_EQUAL) { 8379 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8380 tmp2 = rscratch2; 8381 Label LOOP; 8382 8383 __ bind(LOOP); 8384 if (usePrefetch) { 8385 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 8386 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 8387 } 8388 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 8389 __ sub(cnt1, cnt1, 8 * wordSize); 8390 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 8391 __ subs(tmp1, cnt1, loopThreshold); 8392 __ eor(v0, __ T16B, v0, v4); 8393 __ eor(v1, __ T16B, v1, v5); 8394 __ eor(v2, __ T16B, v2, v6); 8395 __ eor(v3, __ T16B, v3, v7); 8396 __ orr(v0, __ T16B, v0, v1); 8397 __ orr(v1, __ T16B, v2, v3); 8398 __ orr(v0, __ T16B, v0, v1); 8399 __ umov(tmp1, v0, __ D, 0); 8400 __ umov(tmp2, v0, __ D, 1); 8401 __ orr(tmp1, tmp1, tmp2); 8402 __ cbnz(tmp1, NOT_EQUAL); 8403 __ br(__ GE, LOOP); 8404 } 8405 8406 // a1 = r1 - array1 address 8407 // a2 = r2 - array2 address 8408 // result = r0 - return value. Already contains "false" 8409 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 8410 // r3-r5 are reserved temporary registers 8411 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 8412 address generate_large_array_equals() { 8413 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8414 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 8415 tmp7 = r12, tmp8 = r13; 8416 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 8417 SMALL_LOOP, POST_LOOP; 8418 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 8419 // calculate if at least 32 prefetched bytes are used 8420 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 8421 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 8422 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 8423 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 8424 tmp5, tmp6, tmp7, tmp8); 8425 8426 __ align(CodeEntryAlignment); 8427 8428 StubId stub_id = StubId::stubgen_large_array_equals_id; 8429 StubCodeMark mark(this, stub_id); 8430 8431 address entry = __ pc(); 8432 __ enter(); 8433 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 8434 // also advance pointers to use post-increment instead of pre-increment 8435 __ add(a1, a1, wordSize); 8436 __ add(a2, a2, wordSize); 8437 if (AvoidUnalignedAccesses) { 8438 // both implementations (SIMD/nonSIMD) are using relatively large load 8439 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 8440 // on some CPUs in case of address is not at least 16-byte aligned. 
8441 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 8442 // load if needed at least for 1st address and make if 16-byte aligned. 8443 Label ALIGNED16; 8444 __ tbz(a1, 3, ALIGNED16); 8445 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8446 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8447 __ sub(cnt1, cnt1, wordSize); 8448 __ eor(tmp1, tmp1, tmp2); 8449 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 8450 __ bind(ALIGNED16); 8451 } 8452 if (UseSIMDForArrayEquals) { 8453 if (SoftwarePrefetchHintDistance >= 0) { 8454 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8455 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8456 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 8457 /* prfm = */ true, NOT_EQUAL); 8458 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8459 __ br(__ LT, TAIL); 8460 } 8461 __ bind(NO_PREFETCH_LARGE_LOOP); 8462 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 8463 /* prfm = */ false, NOT_EQUAL); 8464 } else { 8465 __ push(spilled_regs, sp); 8466 if (SoftwarePrefetchHintDistance >= 0) { 8467 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8468 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8469 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 8470 /* prfm = */ true, NOT_EQUAL); 8471 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8472 __ br(__ LT, TAIL); 8473 } 8474 __ bind(NO_PREFETCH_LARGE_LOOP); 8475 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 8476 /* prfm = */ false, NOT_EQUAL); 8477 } 8478 __ bind(TAIL); 8479 __ cbz(cnt1, EQUAL); 8480 __ subs(cnt1, cnt1, wordSize); 8481 __ br(__ LE, POST_LOOP); 8482 __ bind(SMALL_LOOP); 8483 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8484 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8485 __ subs(cnt1, cnt1, wordSize); 8486 __ eor(tmp1, tmp1, tmp2); 8487 __ cbnz(tmp1, NOT_EQUAL); 8488 __ br(__ GT, SMALL_LOOP); 8489 __ bind(POST_LOOP); 8490 __ ldr(tmp1, Address(a1, cnt1)); 8491 __ ldr(tmp2, Address(a2, cnt1)); 8492 __ eor(tmp1, tmp1, tmp2); 8493 __ cbnz(tmp1, NOT_EQUAL); 8494 __ bind(EQUAL); 8495 __ mov(result, true); 8496 __ bind(NOT_EQUAL); 8497 if (!UseSIMDForArrayEquals) { 8498 __ pop(spilled_regs, sp); 8499 } 8500 __ bind(NOT_EQUAL_NO_POP); 8501 __ leave(); 8502 __ ret(lr); 8503 return entry; 8504 } 8505 8506 // result = r0 - return value. Contains initial hashcode value on entry. 
8507 // ary = r1 - array address 8508 // cnt = r2 - elements count 8509 // Clobbers: v0-v13, rscratch1, rscratch2 8510 address generate_large_arrays_hashcode(BasicType eltype) { 8511 const Register result = r0, ary = r1, cnt = r2; 8512 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 8513 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 8514 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 8515 const FloatRegister vpowm = v13; 8516 8517 ARRAYS_HASHCODE_REGISTERS; 8518 8519 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 8520 8521 unsigned int vf; // vectorization factor 8522 bool multiply_by_halves; 8523 Assembler::SIMD_Arrangement load_arrangement; 8524 switch (eltype) { 8525 case T_BOOLEAN: 8526 case T_BYTE: 8527 load_arrangement = Assembler::T8B; 8528 multiply_by_halves = true; 8529 vf = 8; 8530 break; 8531 case T_CHAR: 8532 case T_SHORT: 8533 load_arrangement = Assembler::T8H; 8534 multiply_by_halves = true; 8535 vf = 8; 8536 break; 8537 case T_INT: 8538 load_arrangement = Assembler::T4S; 8539 multiply_by_halves = false; 8540 vf = 4; 8541 break; 8542 default: 8543 ShouldNotReachHere(); 8544 } 8545 8546 // Unroll factor 8547 const unsigned uf = 4; 8548 8549 // Effective vectorization factor 8550 const unsigned evf = vf * uf; 8551 8552 __ align(CodeEntryAlignment); 8553 8554 StubId stub_id; 8555 switch (eltype) { 8556 case T_BOOLEAN: 8557 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id; 8558 break; 8559 case T_BYTE: 8560 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id; 8561 break; 8562 case T_CHAR: 8563 stub_id = StubId::stubgen_large_arrays_hashcode_char_id; 8564 break; 8565 case T_SHORT: 8566 stub_id = StubId::stubgen_large_arrays_hashcode_short_id; 8567 break; 8568 case T_INT: 8569 stub_id = StubId::stubgen_large_arrays_hashcode_int_id; 8570 break; 8571 default: 8572 stub_id = StubId::NO_STUBID; 8573 ShouldNotReachHere(); 8574 }; 8575 8576 StubCodeMark mark(this, stub_id); 8577 8578 address entry = __ pc(); 8579 __ enter(); 8580 8581 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 8582 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 8583 // value shouldn't change throughout both loops. 8584 __ movw(rscratch1, intpow(31U, 3)); 8585 __ mov(vpow, Assembler::S, 0, rscratch1); 8586 __ movw(rscratch1, intpow(31U, 2)); 8587 __ mov(vpow, Assembler::S, 1, rscratch1); 8588 __ movw(rscratch1, intpow(31U, 1)); 8589 __ mov(vpow, Assembler::S, 2, rscratch1); 8590 __ movw(rscratch1, intpow(31U, 0)); 8591 __ mov(vpow, Assembler::S, 3, rscratch1); 8592 8593 __ mov(vmul0, Assembler::T16B, 0); 8594 __ mov(vmul0, Assembler::S, 3, result); 8595 8596 __ andr(rscratch2, cnt, (uf - 1) * vf); 8597 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 8598 8599 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf)); 8600 __ mov(vpowm, Assembler::S, 0, rscratch1); 8601 8602 // SMALL LOOP 8603 __ bind(SMALL_LOOP); 8604 8605 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype)))); 8606 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8607 __ subsw(rscratch2, rscratch2, vf); 8608 8609 if (load_arrangement == Assembler::T8B) { 8610 // Extend 8B to 8H to be able to use vector multiply 8611 // instructions 8612 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8613 if (is_signed_subword_type(eltype)) { 8614 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8615 } else { 8616 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8617 } 8618 } 8619 8620 switch (load_arrangement) { 8621 case Assembler::T4S: 8622 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8623 break; 8624 case Assembler::T8B: 8625 case Assembler::T8H: 8626 assert(is_subword_type(eltype), "subword type expected"); 8627 if (is_signed_subword_type(eltype)) { 8628 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8629 } else { 8630 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8631 } 8632 break; 8633 default: 8634 __ should_not_reach_here(); 8635 } 8636 8637 // Process the upper half of a vector 8638 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8639 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8640 if (is_signed_subword_type(eltype)) { 8641 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8642 } else { 8643 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8644 } 8645 } 8646 8647 __ br(Assembler::HI, SMALL_LOOP); 8648 8649 // SMALL LOOP'S EPILOQUE 8650 __ lsr(rscratch2, cnt, exact_log2(evf)); 8651 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER); 8652 8653 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8654 __ addv(vmul0, Assembler::T4S, vmul0); 8655 __ umov(result, vmul0, Assembler::S, 0); 8656 8657 // TAIL 8658 __ bind(TAIL); 8659 8660 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs 8661 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs. 8662 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 8663 __ andr(rscratch2, cnt, vf - 1); 8664 __ bind(TAIL_SHORTCUT); 8665 __ adr(rscratch1, BR_BASE); 8666 // For Cortex-A53 offset is 4 because 2 nops are generated. 8667 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3); 8668 __ movw(rscratch2, 0x1f); 8669 __ br(rscratch1); 8670 8671 for (size_t i = 0; i < vf - 1; ++i) { 8672 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 8673 eltype); 8674 __ maddw(result, result, rscratch2, rscratch1); 8675 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler). 8676 // Generate 2nd nop to have 4 instructions per iteration. 
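    // The computed branch above relies on every iteration emitted by this loop
    // occupying a fixed number of instructions (2 normally, 4 with the Cortex-A53
    // workaround, matching the shift of 3 or 4 in the address calculation); the
    // explicit nop below preserves that invariant. In scalar terms this tail simply
    // evaluates (illustrative sketch; rscratch2 holds the constant 31):
    //
    //   for (unsigned i = 0; i < cnt % vf; i++) {
    //     result = 31 * result + a[i];
    //   }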
8677 if (VM_Version::supports_a53mac()) { 8678 __ nop(); 8679 } 8680 } 8681 __ bind(BR_BASE); 8682 8683 __ leave(); 8684 __ ret(lr); 8685 8686 // LARGE LOOP 8687 __ bind(LARGE_LOOP_PREHEADER); 8688 8689 __ lsr(rscratch2, cnt, exact_log2(evf)); 8690 8691 if (multiply_by_halves) { 8692 // 31^4 - multiplier between lower and upper parts of a register 8693 __ movw(rscratch1, intpow(31U, vf / 2)); 8694 __ mov(vpowm, Assembler::S, 1, rscratch1); 8695 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 8696 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 8697 __ mov(vpowm, Assembler::S, 0, rscratch1); 8698 } else { 8699 // 31^16 8700 __ movw(rscratch1, intpow(31U, evf)); 8701 __ mov(vpowm, Assembler::S, 0, rscratch1); 8702 } 8703 8704 __ mov(vmul3, Assembler::T16B, 0); 8705 __ mov(vmul2, Assembler::T16B, 0); 8706 __ mov(vmul1, Assembler::T16B, 0); 8707 8708 __ bind(LARGE_LOOP); 8709 8710 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 8711 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 8712 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 8713 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8714 8715 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 8716 Address(__ post(ary, evf * type2aelembytes(eltype)))); 8717 8718 if (load_arrangement == Assembler::T8B) { 8719 // Extend 8B to 8H to be able to use vector multiply 8720 // instructions 8721 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8722 if (is_signed_subword_type(eltype)) { 8723 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8724 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8725 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8726 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8727 } else { 8728 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8729 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8730 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8731 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8732 } 8733 } 8734 8735 switch (load_arrangement) { 8736 case Assembler::T4S: 8737 __ addv(vmul3, load_arrangement, vmul3, vdata3); 8738 __ addv(vmul2, load_arrangement, vmul2, vdata2); 8739 __ addv(vmul1, load_arrangement, vmul1, vdata1); 8740 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8741 break; 8742 case Assembler::T8B: 8743 case Assembler::T8H: 8744 assert(is_subword_type(eltype), "subword type expected"); 8745 if (is_signed_subword_type(eltype)) { 8746 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8747 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8748 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8749 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8750 } else { 8751 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8752 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8753 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8754 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8755 } 8756 break; 8757 default: 8758 __ should_not_reach_here(); 8759 } 8760 8761 // Process the upper half of a vector 8762 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8763 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 8764 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 8765 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 8766 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 8767 if (is_signed_subword_type(eltype)) { 8768 __ saddwv2(vmul3, 
vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8769 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8770 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8771 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8772 } else { 8773 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8774 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8775 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8776 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8777 } 8778 } 8779 8780 __ subsw(rscratch2, rscratch2, 1); 8781 __ br(Assembler::HI, LARGE_LOOP); 8782 8783 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 8784 __ addv(vmul3, Assembler::T4S, vmul3); 8785 __ umov(result, vmul3, Assembler::S, 0); 8786 8787 __ mov(rscratch2, intpow(31U, vf)); 8788 8789 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 8790 __ addv(vmul2, Assembler::T4S, vmul2); 8791 __ umov(rscratch1, vmul2, Assembler::S, 0); 8792 __ maddw(result, result, rscratch2, rscratch1); 8793 8794 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 8795 __ addv(vmul1, Assembler::T4S, vmul1); 8796 __ umov(rscratch1, vmul1, Assembler::S, 0); 8797 __ maddw(result, result, rscratch2, rscratch1); 8798 8799 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8800 __ addv(vmul0, Assembler::T4S, vmul0); 8801 __ umov(rscratch1, vmul0, Assembler::S, 0); 8802 __ maddw(result, result, rscratch2, rscratch1); 8803 8804 __ andr(rscratch2, cnt, vf - 1); 8805 __ cbnz(rscratch2, TAIL_SHORTCUT); 8806 8807 __ leave(); 8808 __ ret(lr); 8809 8810 return entry; 8811 } 8812 8813 address generate_dsin_dcos(bool isCos) { 8814 __ align(CodeEntryAlignment); 8815 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id); 8816 StubCodeMark mark(this, stub_id); 8817 address start = __ pc(); 8818 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 8819 (address)StubRoutines::aarch64::_two_over_pi, 8820 (address)StubRoutines::aarch64::_pio2, 8821 (address)StubRoutines::aarch64::_dsin_coef, 8822 (address)StubRoutines::aarch64::_dcos_coef); 8823 return start; 8824 } 8825 8826 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 8827 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 8828 Label &DIFF2) { 8829 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 8830 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 8831 8832 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 8833 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8834 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 8835 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 8836 8837 __ fmovd(tmpL, vtmp3); 8838 __ eor(rscratch2, tmp3, tmpL); 8839 __ cbnz(rscratch2, DIFF2); 8840 8841 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8842 __ umov(tmpL, vtmp3, __ D, 1); 8843 __ eor(rscratch2, tmpU, tmpL); 8844 __ cbnz(rscratch2, DIFF1); 8845 8846 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 8847 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8848 __ fmovd(tmpL, vtmp); 8849 __ eor(rscratch2, tmp3, tmpL); 8850 __ cbnz(rscratch2, DIFF2); 8851 8852 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8853 __ umov(tmpL, vtmp, __ D, 1); 8854 __ eor(rscratch2, tmpU, tmpL); 8855 __ cbnz(rscratch2, DIFF1); 8856 } 8857 8858 // r0 = result 8859 // r1 = str1 8860 // r2 = cnt1 8861 // r3 = str2 8862 // r4 = cnt2 8863 // r10 = tmp1 8864 // r11 = tmp2 8865 address generate_compare_long_string_different_encoding(bool isLU) { 8866 __ align(CodeEntryAlignment); 8867 StubId stub_id = (isLU ? 
StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id); 8868 StubCodeMark mark(this, stub_id); 8869 address entry = __ pc(); 8870 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 8871 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 8872 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 8873 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8874 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 8875 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 8876 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 8877 8878 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 8879 8880 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 8881 // cnt2 == amount of characters left to compare 8882 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 8883 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8884 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 8885 __ add(str2, str2, isLU ? wordSize : wordSize/2); 8886 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 8887 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 8888 __ eor(rscratch2, tmp1, tmp2); 8889 __ mov(rscratch1, tmp2); 8890 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 8891 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 8892 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 8893 __ push(spilled_regs, sp); 8894 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 8895 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 8896 8897 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8898 8899 if (SoftwarePrefetchHintDistance >= 0) { 8900 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8901 __ br(__ LT, NO_PREFETCH); 8902 __ bind(LARGE_LOOP_PREFETCH); 8903 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 8904 __ mov(tmp4, 2); 8905 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8906 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 8907 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8908 __ subs(tmp4, tmp4, 1); 8909 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 8910 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8911 __ mov(tmp4, 2); 8912 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 8913 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8914 __ subs(tmp4, tmp4, 1); 8915 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 8916 __ sub(cnt2, cnt2, 64); 8917 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8918 __ br(__ GE, LARGE_LOOP_PREFETCH); 8919 } 8920 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 8921 __ bind(NO_PREFETCH); 8922 __ subs(cnt2, cnt2, 16); 8923 __ br(__ LT, TAIL); 8924 __ align(OptoLoopAlignment); 8925 __ bind(SMALL_LOOP); // smaller loop 8926 __ subs(cnt2, cnt2, 16); 8927 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8928 __ br(__ GE, SMALL_LOOP); 8929 __ cmn(cnt2, (u1)16); 8930 __ br(__ EQ, LOAD_LAST); 8931 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 8932 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 8933 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 8934 __ ldr(tmp3, Address(cnt1, -8)); 8935 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 8936 __ b(LOAD_LAST); 8937 __ bind(DIFF2); 8938 __ mov(tmpU, tmp3); 8939 __ bind(DIFF1); 8940 __ pop(spilled_regs, sp); 8941 __ b(CALCULATE_DIFFERENCE); 8942 __ bind(LOAD_LAST); 8943 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by 
compare_string_16_x_LU. 8944 // No need to load it again 8945 __ mov(tmpU, tmp3); 8946 __ pop(spilled_regs, sp); 8947 8948 // tmp2 points to the address of the last 4 Latin1 characters right now 8949 __ ldrs(vtmp, Address(tmp2)); 8950 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8951 __ fmovd(tmpL, vtmp); 8952 8953 __ eor(rscratch2, tmpU, tmpL); 8954 __ cbz(rscratch2, DONE); 8955 8956 // Find the first different characters in the longwords and 8957 // compute their difference. 8958 __ bind(CALCULATE_DIFFERENCE); 8959 __ rev(rscratch2, rscratch2); 8960 __ clz(rscratch2, rscratch2); 8961 __ andr(rscratch2, rscratch2, -16); 8962 __ lsrv(tmp1, tmp1, rscratch2); 8963 __ uxthw(tmp1, tmp1); 8964 __ lsrv(rscratch1, rscratch1, rscratch2); 8965 __ uxthw(rscratch1, rscratch1); 8966 __ subw(result, tmp1, rscratch1); 8967 __ bind(DONE); 8968 __ ret(lr); 8969 return entry; 8970 } 8971 8972 // r0 = input (float16) 8973 // v0 = result (float) 8974 // v1 = temporary float register 8975 address generate_float16ToFloat() { 8976 __ align(CodeEntryAlignment); 8977 StubId stub_id = StubId::stubgen_hf2f_id; 8978 StubCodeMark mark(this, stub_id); 8979 address entry = __ pc(); 8980 BLOCK_COMMENT("Entry:"); 8981 __ flt16_to_flt(v0, r0, v1); 8982 __ ret(lr); 8983 return entry; 8984 } 8985 8986 // v0 = input (float) 8987 // r0 = result (float16) 8988 // v1 = temporary float register 8989 address generate_floatToFloat16() { 8990 __ align(CodeEntryAlignment); 8991 StubId stub_id = StubId::stubgen_f2hf_id; 8992 StubCodeMark mark(this, stub_id); 8993 address entry = __ pc(); 8994 BLOCK_COMMENT("Entry:"); 8995 __ flt_to_flt16(r0, v0, v1); 8996 __ ret(lr); 8997 return entry; 8998 } 8999 9000 address generate_method_entry_barrier() { 9001 __ align(CodeEntryAlignment); 9002 StubId stub_id = StubId::stubgen_method_entry_barrier_id; 9003 StubCodeMark mark(this, stub_id); 9004 9005 Label deoptimize_label; 9006 9007 address start = __ pc(); 9008 9009 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 9010 9011 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 9012 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 9013 // We can get here despite the nmethod being good, if we have not 9014 // yet applied our cross modification fence (or data fence). 
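    // Roughly, the code below copies the global patching epoch into this thread's
    // thread-local field and then executes an isb plus a LoadLoad barrier; this acts
    // as the cross modification fence mentioned above, making previously patched
    // instructions and data visible to this thread. (Summary only; BarrierSetNMethod
    // and the barrier set assembler document the full protocol.)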
9015 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 9016 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 9017 __ ldrw(rscratch2, rscratch2); 9018 __ strw(rscratch2, thread_epoch_addr); 9019 __ isb(); 9020 __ membar(__ LoadLoad); 9021 } 9022 9023 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 9024 9025 __ enter(); 9026 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 9027 9028 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 9029 9030 __ push_call_clobbered_registers(); 9031 9032 __ mov(c_rarg0, rscratch2); 9033 __ call_VM_leaf 9034 (CAST_FROM_FN_PTR 9035 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 9036 9037 __ reset_last_Java_frame(true); 9038 9039 __ mov(rscratch1, r0); 9040 9041 __ pop_call_clobbered_registers(); 9042 9043 __ cbnz(rscratch1, deoptimize_label); 9044 9045 __ leave(); 9046 __ ret(lr); 9047 9048 __ BIND(deoptimize_label); 9049 9050 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 9051 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 9052 9053 __ mov(sp, rscratch1); 9054 __ br(rscratch2); 9055 9056 return start; 9057 } 9058 9059 // r0 = result 9060 // r1 = str1 9061 // r2 = cnt1 9062 // r3 = str2 9063 // r4 = cnt2 9064 // r10 = tmp1 9065 // r11 = tmp2 9066 address generate_compare_long_string_same_encoding(bool isLL) { 9067 __ align(CodeEntryAlignment); 9068 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id); 9069 StubCodeMark mark(this, stub_id); 9070 address entry = __ pc(); 9071 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 9072 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 9073 9074 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 9075 9076 // exit from large loop when less than 64 bytes left to read or we're about 9077 // to prefetch memory behind array border 9078 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 9079 9080 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 9081 __ eor(rscratch2, tmp1, tmp2); 9082 __ cbnz(rscratch2, CAL_DIFFERENCE); 9083 9084 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 9085 // update pointers, because of previous read 9086 __ add(str1, str1, wordSize); 9087 __ add(str2, str2, wordSize); 9088 if (SoftwarePrefetchHintDistance >= 0) { 9089 __ align(OptoLoopAlignment); 9090 __ bind(LARGE_LOOP_PREFETCH); 9091 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 9092 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 9093 9094 for (int i = 0; i < 4; i++) { 9095 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 9096 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 9097 __ cmp(tmp1, tmp2); 9098 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9099 __ br(Assembler::NE, DIFF); 9100 } 9101 __ sub(cnt2, cnt2, isLL ? 64 : 32); 9102 __ add(str1, str1, 64); 9103 __ add(str2, str2, 64); 9104 __ subs(rscratch2, cnt2, largeLoopExitCondition); 9105 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 9106 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 9107 } 9108 9109 __ subs(rscratch1, cnt2, isLL ? 
16 : 8); 9110 __ br(Assembler::LE, LESS16); 9111 __ align(OptoLoopAlignment); 9112 __ bind(LOOP_COMPARE16); 9113 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 9114 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 9115 __ cmp(tmp1, tmp2); 9116 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9117 __ br(Assembler::NE, DIFF); 9118 __ sub(cnt2, cnt2, isLL ? 16 : 8); 9119 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 9120 __ br(Assembler::LT, LESS16); 9121 9122 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 9123 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 9124 __ cmp(tmp1, tmp2); 9125 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9126 __ br(Assembler::NE, DIFF); 9127 __ sub(cnt2, cnt2, isLL ? 16 : 8); 9128 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 9129 __ br(Assembler::GE, LOOP_COMPARE16); 9130 __ cbz(cnt2, LENGTH_DIFF); 9131 9132 __ bind(LESS16); 9133 // each 8 compare 9134 __ subs(cnt2, cnt2, isLL ? 8 : 4); 9135 __ br(Assembler::LE, LESS8); 9136 __ ldr(tmp1, Address(__ post(str1, 8))); 9137 __ ldr(tmp2, Address(__ post(str2, 8))); 9138 __ eor(rscratch2, tmp1, tmp2); 9139 __ cbnz(rscratch2, CAL_DIFFERENCE); 9140 __ sub(cnt2, cnt2, isLL ? 8 : 4); 9141 9142 __ bind(LESS8); // directly load last 8 bytes 9143 if (!isLL) { 9144 __ add(cnt2, cnt2, cnt2); 9145 } 9146 __ ldr(tmp1, Address(str1, cnt2)); 9147 __ ldr(tmp2, Address(str2, cnt2)); 9148 __ eor(rscratch2, tmp1, tmp2); 9149 __ cbz(rscratch2, LENGTH_DIFF); 9150 __ b(CAL_DIFFERENCE); 9151 9152 __ bind(DIFF); 9153 __ cmp(tmp1, tmp2); 9154 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 9155 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 9156 // reuse rscratch2 register for the result of eor instruction 9157 __ eor(rscratch2, tmp1, tmp2); 9158 9159 __ bind(CAL_DIFFERENCE); 9160 __ rev(rscratch2, rscratch2); 9161 __ clz(rscratch2, rscratch2); 9162 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 9163 __ lsrv(tmp1, tmp1, rscratch2); 9164 __ lsrv(tmp2, tmp2, rscratch2); 9165 if (isLL) { 9166 __ uxtbw(tmp1, tmp1); 9167 __ uxtbw(tmp2, tmp2); 9168 } else { 9169 __ uxthw(tmp1, tmp1); 9170 __ uxthw(tmp2, tmp2); 9171 } 9172 __ subw(result, tmp1, tmp2); 9173 9174 __ bind(LENGTH_DIFF); 9175 __ ret(lr); 9176 return entry; 9177 } 9178 9179 enum string_compare_mode { 9180 LL, 9181 LU, 9182 UL, 9183 UU, 9184 }; 9185 9186 // The following registers are declared in aarch64.ad 9187 // r0 = result 9188 // r1 = str1 9189 // r2 = cnt1 9190 // r3 = str2 9191 // r4 = cnt2 9192 // r10 = tmp1 9193 // r11 = tmp2 9194 // z0 = ztmp1 9195 // z1 = ztmp2 9196 // p0 = pgtmp1 9197 // p1 = pgtmp2 9198 address generate_compare_long_string_sve(string_compare_mode mode) { 9199 StubId stub_id; 9200 switch (mode) { 9201 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break; 9202 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break; 9203 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break; 9204 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break; 9205 default: ShouldNotReachHere(); 9206 } 9207 9208 __ align(CodeEntryAlignment); 9209 address entry = __ pc(); 9210 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 9211 tmp1 = r10, tmp2 = r11; 9212 9213 Label LOOP, DONE, MISMATCH; 9214 Register vec_len = tmp1; 9215 Register idx = tmp2; 9216 // The minimum of the string lengths has been stored in cnt2. 
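    // A scalar sketch of what this SVE stub computes (illustrative only; load_char
    // below is a stand-in for the mode-dependent ld1b/ld1h access): compare the
    // first min(cnt1, cnt2) characters and return the difference of the first
    // mismatching pair; if no mismatch is found, result is left untouched.
    //
    //   for (int i = 0; i < cnt; i++) {      // cnt == min(cnt1, cnt2)
    //     int c1 = load_char(str1, i);       // hypothetical helper: byte or halfword
    //     int c2 = load_char(str2, i);       // load, zero-extended
    //     if (c1 != c2) return c1 - c2;
    //   }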
9217 Register cnt = cnt2; 9218 FloatRegister ztmp1 = z0, ztmp2 = z1; 9219 PRegister pgtmp1 = p0, pgtmp2 = p1; 9220 9221 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 9222 switch (mode) { \ 9223 case LL: \ 9224 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 9225 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 9226 break; \ 9227 case LU: \ 9228 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 9229 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 9230 break; \ 9231 case UL: \ 9232 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 9233 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 9234 break; \ 9235 case UU: \ 9236 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 9237 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 9238 break; \ 9239 default: \ 9240 ShouldNotReachHere(); \ 9241 } 9242 9243 StubCodeMark mark(this, stub_id); 9244 9245 __ mov(idx, 0); 9246 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 9247 9248 if (mode == LL) { 9249 __ sve_cntb(vec_len); 9250 } else { 9251 __ sve_cnth(vec_len); 9252 } 9253 9254 __ sub(rscratch1, cnt, vec_len); 9255 9256 __ bind(LOOP); 9257 9258 // main loop 9259 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 9260 __ add(idx, idx, vec_len); 9261 // Compare strings. 9262 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 9263 __ br(__ NE, MISMATCH); 9264 __ cmp(idx, rscratch1); 9265 __ br(__ LT, LOOP); 9266 9267 // post loop, last iteration 9268 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 9269 9270 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 9271 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 9272 __ br(__ EQ, DONE); 9273 9274 __ bind(MISMATCH); 9275 9276 // Crop the vector to find its location. 9277 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 9278 // Extract the first different characters of each string. 9279 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 9280 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 9281 9282 // Compute the difference of the first different characters. 
9283 __ sub(result, rscratch1, rscratch2); 9284 9285 __ bind(DONE); 9286 __ ret(lr); 9287 #undef LOAD_PAIR 9288 return entry; 9289 } 9290 9291 void generate_compare_long_strings() { 9292 if (UseSVE == 0) { 9293 StubRoutines::aarch64::_compare_long_string_LL 9294 = generate_compare_long_string_same_encoding(true); 9295 StubRoutines::aarch64::_compare_long_string_UU 9296 = generate_compare_long_string_same_encoding(false); 9297 StubRoutines::aarch64::_compare_long_string_LU 9298 = generate_compare_long_string_different_encoding(true); 9299 StubRoutines::aarch64::_compare_long_string_UL 9300 = generate_compare_long_string_different_encoding(false); 9301 } else { 9302 StubRoutines::aarch64::_compare_long_string_LL 9303 = generate_compare_long_string_sve(LL); 9304 StubRoutines::aarch64::_compare_long_string_UU 9305 = generate_compare_long_string_sve(UU); 9306 StubRoutines::aarch64::_compare_long_string_LU 9307 = generate_compare_long_string_sve(LU); 9308 StubRoutines::aarch64::_compare_long_string_UL 9309 = generate_compare_long_string_sve(UL); 9310 } 9311 } 9312 9313 // R0 = result 9314 // R1 = str2 9315 // R2 = cnt1 9316 // R3 = str1 9317 // R4 = cnt2 9318 // Clobbers: rscratch1, rscratch2, v0, v1, rflags 9319 // 9320 // This generic linear code use few additional ideas, which makes it faster: 9321 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 9322 // in order to skip initial loading(help in systems with 1 ld pipeline) 9323 // 2) we can use "fast" algorithm of finding single character to search for 9324 // first symbol with less branches(1 branch per each loaded register instead 9325 // of branch for each symbol), so, this is where constants like 9326 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 9327 // 3) after loading and analyzing 1st register of source string, it can be 9328 // used to search for every 1st character entry, saving few loads in 9329 // comparison with "simplier-but-slower" implementation 9330 // 4) in order to avoid lots of push/pop operations, code below is heavily 9331 // re-using/re-initializing/compressing register values, which makes code 9332 // larger and a bit less readable, however, most of extra operations are 9333 // issued during loads or branches, so, penalty is minimal 9334 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 9335 StubId stub_id; 9336 if (str1_isL) { 9337 if (str2_isL) { 9338 stub_id = StubId::stubgen_string_indexof_linear_ll_id; 9339 } else { 9340 stub_id = StubId::stubgen_string_indexof_linear_ul_id; 9341 } 9342 } else { 9343 if (str2_isL) { 9344 ShouldNotReachHere(); 9345 } else { 9346 stub_id = StubId::stubgen_string_indexof_linear_uu_id; 9347 } 9348 } 9349 __ align(CodeEntryAlignment); 9350 StubCodeMark mark(this, stub_id); 9351 address entry = __ pc(); 9352 9353 int str1_chr_size = str1_isL ? 1 : 2; 9354 int str2_chr_size = str2_isL ? 1 : 2; 9355 int str1_chr_shift = str1_isL ? 0 : 1; 9356 int str2_chr_shift = str2_isL ? 
0 : 1; 9357 bool isL = str1_isL && str2_isL; 9358 // parameters 9359 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 9360 // temporary registers 9361 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 9362 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 9363 // redefinitions 9364 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 9365 9366 __ push(spilled_regs, sp); 9367 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 9368 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 9369 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 9370 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 9371 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 9372 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 9373 // Read whole register from str1. It is safe, because length >=8 here 9374 __ ldr(ch1, Address(str1)); 9375 // Read whole register from str2. It is safe, because length >=8 here 9376 __ ldr(ch2, Address(str2)); 9377 __ sub(cnt2, cnt2, cnt1); 9378 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 9379 if (str1_isL != str2_isL) { 9380 __ eor(v0, __ T16B, v0, v0); 9381 } 9382 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 9383 __ mul(first, first, tmp1); 9384 // check if we have less than 1 register to check 9385 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 9386 if (str1_isL != str2_isL) { 9387 __ fmovd(v1, ch1); 9388 } 9389 __ br(__ LE, L_SMALL); 9390 __ eor(ch2, first, ch2); 9391 if (str1_isL != str2_isL) { 9392 __ zip1(v1, __ T16B, v1, v0); 9393 } 9394 __ sub(tmp2, ch2, tmp1); 9395 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9396 __ bics(tmp2, tmp2, ch2); 9397 if (str1_isL != str2_isL) { 9398 __ fmovd(ch1, v1); 9399 } 9400 __ br(__ NE, L_HAS_ZERO); 9401 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 9402 __ add(result, result, wordSize/str2_chr_size); 9403 __ add(str2, str2, wordSize); 9404 __ br(__ LT, L_POST_LOOP); 9405 __ BIND(L_LOOP); 9406 __ ldr(ch2, Address(str2)); 9407 __ eor(ch2, first, ch2); 9408 __ sub(tmp2, ch2, tmp1); 9409 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9410 __ bics(tmp2, tmp2, ch2); 9411 __ br(__ NE, L_HAS_ZERO); 9412 __ BIND(L_LOOP_PROCEED); 9413 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 9414 __ add(str2, str2, wordSize); 9415 __ add(result, result, wordSize/str2_chr_size); 9416 __ br(__ GE, L_LOOP); 9417 __ BIND(L_POST_LOOP); 9418 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 9419 __ br(__ LE, NOMATCH); 9420 __ ldr(ch2, Address(str2)); 9421 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9422 __ eor(ch2, first, ch2); 9423 __ sub(tmp2, ch2, tmp1); 9424 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9425 __ mov(tmp4, -1); // all bits set 9426 __ b(L_SMALL_PROCEED); 9427 __ align(OptoLoopAlignment); 9428 __ BIND(L_SMALL); 9429 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9430 __ eor(ch2, first, ch2); 9431 if (str1_isL != str2_isL) { 9432 __ zip1(v1, __ T16B, v1, v0); 9433 } 9434 __ sub(tmp2, ch2, tmp1); 9435 __ mov(tmp4, -1); // all bits set 9436 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9437 if (str1_isL != str2_isL) { 9438 __ fmovd(ch1, v1); // move converted 4 symbols 9439 } 9440 __ BIND(L_SMALL_PROCEED); 9441 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
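    // The bic/ands below complete the classic SWAR "matching lane" test. ch2 was
    // XORed with the first pattern character broadcast into every lane, so matching
    // lanes became zero; with ones = 0x0101..01 (0x0001..0001 for UTF-16 lanes) and
    // highs = 0x8080..80 (0x8000..8000), the test is, as an illustrative sketch:
    //
    //   uint64_t matches = (v - ones) & ~v & highs;  // one high bit per matching lane
    //   matches &= lane_mask;                        // tmp4: drop lanes past the data
    //   if (matches == 0) goto NOMATCH;
    //
    // The rbit that follows reverses the bit order so clz can locate the first
    // (lowest-address) matching lane.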
9442 __ bic(tmp2, tmp2, ch2); 9443 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 9444 __ rbit(tmp2, tmp2); 9445 __ br(__ EQ, NOMATCH); 9446 __ BIND(L_SMALL_HAS_ZERO_LOOP); 9447 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 9448 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 9449 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 9450 if (str2_isL) { // LL 9451 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9452 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9453 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9454 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9455 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9456 } else { 9457 __ mov(ch2, 0xE); // all bits in byte set except last one 9458 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9459 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9460 __ lslv(tmp2, tmp2, tmp4); 9461 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9462 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9463 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9464 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9465 } 9466 __ cmp(ch1, ch2); 9467 __ mov(tmp4, wordSize/str2_chr_size); 9468 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9469 __ BIND(L_SMALL_CMP_LOOP); 9470 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9471 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9472 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9473 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9474 __ add(tmp4, tmp4, 1); 9475 __ cmp(tmp4, cnt1); 9476 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 9477 __ cmp(first, ch2); 9478 __ br(__ EQ, L_SMALL_CMP_LOOP); 9479 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 9480 __ cbz(tmp2, NOMATCH); // no more matches. exit 9481 __ clz(tmp4, tmp2); 9482 __ add(result, result, 1); // advance index 9483 __ add(str2, str2, str2_chr_size); // advance pointer 9484 __ b(L_SMALL_HAS_ZERO_LOOP); 9485 __ align(OptoLoopAlignment); 9486 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 9487 __ cmp(first, ch2); 9488 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9489 __ b(DONE); 9490 __ align(OptoLoopAlignment); 9491 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 9492 if (str2_isL) { // LL 9493 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9494 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9495 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9496 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9497 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9498 } else { 9499 __ mov(ch2, 0xE); // all bits in byte set except last one 9500 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9501 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9502 __ lslv(tmp2, tmp2, tmp4); 9503 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9504 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9505 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9506 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9507 } 9508 __ cmp(ch1, ch2); 9509 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9510 __ b(DONE); 9511 __ align(OptoLoopAlignment); 9512 __ BIND(L_HAS_ZERO); 9513 __ rbit(tmp2, tmp2); 9514 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 9515 // Now, perform compression of counters(cnt2 and cnt1) into one register. 9516 // It's fine because both counters are 32bit and are not changed in this 9517 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 9518 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 9519 __ sub(result, result, 1); 9520 __ BIND(L_HAS_ZERO_LOOP); 9521 __ mov(cnt1, wordSize/str2_chr_size); 9522 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9523 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 9524 if (str2_isL) { 9525 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9526 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9527 __ lslv(tmp2, tmp2, tmp4); 9528 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9529 __ add(tmp4, tmp4, 1); 9530 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9531 __ lsl(tmp2, tmp2, 1); 9532 __ mov(tmp4, wordSize/str2_chr_size); 9533 } else { 9534 __ mov(ch2, 0xE); 9535 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9536 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9537 __ lslv(tmp2, tmp2, tmp4); 9538 __ add(tmp4, tmp4, 1); 9539 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9540 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9541 __ lsl(tmp2, tmp2, 1); 9542 __ mov(tmp4, wordSize/str2_chr_size); 9543 __ sub(str2, str2, str2_chr_size); 9544 } 9545 __ cmp(ch1, ch2); 9546 __ mov(tmp4, wordSize/str2_chr_size); 9547 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9548 __ BIND(L_CMP_LOOP); 9549 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9550 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9551 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9552 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9553 __ add(tmp4, tmp4, 1); 9554 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9555 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 9556 __ cmp(cnt1, ch2); 9557 __ br(__ EQ, L_CMP_LOOP); 9558 __ BIND(L_CMP_LOOP_NOMATCH); 9559 // here we're not matched 9560 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 9561 __ clz(tmp4, tmp2); 9562 __ add(str2, str2, str2_chr_size); // advance pointer 9563 __ b(L_HAS_ZERO_LOOP); 9564 __ align(OptoLoopAlignment); 9565 __ BIND(L_CMP_LOOP_LAST_CMP); 9566 __ cmp(cnt1, ch2); 9567 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9568 __ b(DONE); 9569 __ align(OptoLoopAlignment); 9570 __ BIND(L_CMP_LOOP_LAST_CMP2); 9571 if (str2_isL) { 9572 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9573 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9574 __ lslv(tmp2, tmp2, tmp4); 9575 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9576 __ add(tmp4, tmp4, 1); 9577 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9578 __ lsl(tmp2, tmp2, 1); 9579 } else { 9580 __ mov(ch2, 0xE); 9581 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9582 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
9583 __ lslv(tmp2, tmp2, tmp4); 9584 __ add(tmp4, tmp4, 1); 9585 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9586 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9587 __ lsl(tmp2, tmp2, 1); 9588 __ sub(str2, str2, str2_chr_size); 9589 } 9590 __ cmp(ch1, ch2); 9591 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9592 __ b(DONE); 9593 __ align(OptoLoopAlignment); 9594 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 9595 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 9596 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 9597 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 9598 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 9599 // result by analyzed characters value, so, we can just reset lower bits 9600 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 9601 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 9602 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 9603 // index of last analyzed substring inside current octet. So, str2 in at 9604 // respective start address. We need to advance it to next octet 9605 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 9606 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 9607 __ bfm(result, zr, 0, 2 - str2_chr_shift); 9608 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 9609 __ movw(cnt2, cnt2); 9610 __ b(L_LOOP_PROCEED); 9611 __ align(OptoLoopAlignment); 9612 __ BIND(NOMATCH); 9613 __ mov(result, -1); 9614 __ BIND(DONE); 9615 __ pop(spilled_regs, sp); 9616 __ ret(lr); 9617 return entry; 9618 } 9619 9620 void generate_string_indexof_stubs() { 9621 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 9622 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 9623 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 9624 } 9625 9626 void inflate_and_store_2_fp_registers(bool generatePrfm, 9627 FloatRegister src1, FloatRegister src2) { 9628 Register dst = r1; 9629 __ zip1(v1, __ T16B, src1, v0); 9630 __ zip2(v2, __ T16B, src1, v0); 9631 if (generatePrfm) { 9632 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 9633 } 9634 __ zip1(v3, __ T16B, src2, v0); 9635 __ zip2(v4, __ T16B, src2, v0); 9636 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 9637 } 9638 9639 // R0 = src 9640 // R1 = dst 9641 // R2 = len 9642 // R3 = len >> 3 9643 // V0 = 0 9644 // v1 = loaded 8 bytes 9645 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 9646 address generate_large_byte_array_inflate() { 9647 __ align(CodeEntryAlignment); 9648 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id; 9649 StubCodeMark mark(this, stub_id); 9650 address entry = __ pc(); 9651 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 9652 Register src = r0, dst = r1, len = r2, octetCounter = r3; 9653 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 9654 9655 // do one more 8-byte read to have address 16-byte aligned in most cases 9656 // also use single store instruction 9657 __ ldrd(v2, __ post(src, 8)); 9658 __ sub(octetCounter, octetCounter, 2); 9659 __ zip1(v1, __ T16B, v1, v0); 9660 __ zip1(v2, __ T16B, v2, v0); 9661 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 9662 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9663 __ subs(rscratch1, octetCounter, large_loop_threshold); 9664 __ br(__ LE, LOOP_START); 9665 __ 
b(LOOP_PRFM_START); 9666 __ bind(LOOP_PRFM); 9667 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9668 __ bind(LOOP_PRFM_START); 9669 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 9670 __ sub(octetCounter, octetCounter, 8); 9671 __ subs(rscratch1, octetCounter, large_loop_threshold); 9672 inflate_and_store_2_fp_registers(true, v3, v4); 9673 inflate_and_store_2_fp_registers(true, v5, v6); 9674 __ br(__ GT, LOOP_PRFM); 9675 __ cmp(octetCounter, (u1)8); 9676 __ br(__ LT, DONE); 9677 __ bind(LOOP); 9678 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9679 __ bind(LOOP_START); 9680 __ sub(octetCounter, octetCounter, 8); 9681 __ cmp(octetCounter, (u1)8); 9682 inflate_and_store_2_fp_registers(false, v3, v4); 9683 inflate_and_store_2_fp_registers(false, v5, v6); 9684 __ br(__ GE, LOOP); 9685 __ bind(DONE); 9686 __ ret(lr); 9687 return entry; 9688 } 9689 9690 /** 9691 * Arguments: 9692 * 9693 * Input: 9694 * c_rarg0 - current state address 9695 * c_rarg1 - H key address 9696 * c_rarg2 - data address 9697 * c_rarg3 - number of blocks 9698 * 9699 * Output: 9700 * Updated state at c_rarg0 9701 */ 9702 address generate_ghash_processBlocks() { 9703 // Bafflingly, GCM uses little-endian for the byte order, but 9704 // big-endian for the bit order. For example, the polynomial 1 is 9705 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 9706 // 9707 // So, we must either reverse the bytes in each word and do 9708 // everything big-endian or reverse the bits in each byte and do 9709 // it little-endian. On AArch64 it's more idiomatic to reverse 9710 // the bits in each byte (we have an instruction, RBIT, to do 9711 // that) and keep the data in little-endian bit order through the 9712 // calculation, bit-reversing the inputs and outputs. 
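    //
    // In practice the rev64/rbit pair below bit-reverses each 64-bit
    // lane of the state and of subkeyH on entry, and the same pair is
    // applied to the accumulator before it is stored back, so only the
    // edges of the stub pay for GCM's reflected representation. v24 is
    // loaded with 0x87 in each 64-bit half, i.e. the low-order bits of
    // the field polynomial z^128 + z^7 + z^2 + z + 1, which
    // ghash_reduce uses to fold overflow back into the 128-bit result
    // (a bit at position 128 + k folds back into positions k, k + 1,
    // k + 2 and k + 7).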
9713 9714 StubId stub_id = StubId::stubgen_ghash_processBlocks_id; 9715 StubCodeMark mark(this, stub_id); 9716 Label polynomial; // local data generated at end of stub 9717 __ align(CodeEntryAlignment); 9718 address start = __ pc(); 9719 9720 Register state = c_rarg0; 9721 Register subkeyH = c_rarg1; 9722 Register data = c_rarg2; 9723 Register blocks = c_rarg3; 9724 9725 FloatRegister vzr = v30; 9726 __ eor(vzr, __ T16B, vzr, vzr); // zero register 9727 9728 __ adr(rscratch1, polynomial); 9729 __ ldrq(v24, rscratch1); // The field polynomial 9730 9731 __ ldrq(v0, Address(state)); 9732 __ ldrq(v1, Address(subkeyH)); 9733 9734 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 9735 __ rbit(v0, __ T16B, v0); 9736 __ rev64(v1, __ T16B, v1); 9737 __ rbit(v1, __ T16B, v1); 9738 9739 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 9740 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 9741 9742 { 9743 Label L_ghash_loop; 9744 __ bind(L_ghash_loop); 9745 9746 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 9747 // reversing each byte 9748 __ rbit(v2, __ T16B, v2); 9749 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 9750 9751 // Multiply state in v2 by subkey in v1 9752 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 9753 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 9754 /*temps*/v6, v3, /*reuse/clobber b*/v2); 9755 // Reduce v7:v5 by the field polynomial 9756 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 9757 9758 __ sub(blocks, blocks, 1); 9759 __ cbnz(blocks, L_ghash_loop); 9760 } 9761 9762 // The bit-reversed result is at this point in v0 9763 __ rev64(v0, __ T16B, v0); 9764 __ rbit(v0, __ T16B, v0); 9765 9766 __ st1(v0, __ T16B, state); 9767 __ ret(lr); 9768 9769 // bind label and generate local polynomial data 9770 __ align(wordSize * 2); 9771 __ bind(polynomial); 9772 __ emit_int64(0x87); // The low-order bits of the field 9773 // polynomial (i.e. p = z^7+z^2+z+1) 9774 // repeated in the low and high parts of a 9775 // 128-bit vector 9776 __ emit_int64(0x87); 9777 9778 return start; 9779 } 9780 9781 address generate_ghash_processBlocks_wide() { 9782 address small = generate_ghash_processBlocks(); 9783 9784 StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id; 9785 StubCodeMark mark(this, stub_id); 9786 Label polynomial; // local data generated after stub 9787 __ align(CodeEntryAlignment); 9788 address start = __ pc(); 9789 9790 Register state = c_rarg0; 9791 Register subkeyH = c_rarg1; 9792 Register data = c_rarg2; 9793 Register blocks = c_rarg3; 9794 9795 const int unroll = 4; 9796 9797 __ cmp(blocks, (unsigned char)(unroll * 2)); 9798 __ br(__ LT, small); 9799 9800 if (unroll > 1) { 9801 // Save state before entering routine 9802 __ sub(sp, sp, 4 * 16); 9803 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 9804 __ sub(sp, sp, 4 * 16); 9805 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 9806 } 9807 9808 __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll); 9809 9810 if (unroll > 1) { 9811 // And restore state 9812 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 9813 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 9814 } 9815 9816 __ cmp(blocks, (unsigned char)0); 9817 __ br(__ GT, small); 9818 9819 __ ret(lr); 9820 9821 // bind label and generate polynomial data 9822 __ align(wordSize * 2); 9823 __ bind(polynomial); 9824 __ emit_int64(0x87); // The low-order bits of the field 9825 // polynomial (i.e. 
p = z^7+z^2+z+1) 9826 // repeated in the low and high parts of a 9827 // 128-bit vector 9828 __ emit_int64(0x87); 9829 9830 return start; 9831 9832 } 9833 9834 void generate_base64_encode_simdround(Register src, Register dst, 9835 FloatRegister codec, u8 size) { 9836 9837 FloatRegister in0 = v4, in1 = v5, in2 = v6; 9838 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 9839 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 9840 9841 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 9842 9843 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 9844 9845 __ ushr(ind0, arrangement, in0, 2); 9846 9847 __ ushr(ind1, arrangement, in1, 2); 9848 __ shl(in0, arrangement, in0, 6); 9849 __ orr(ind1, arrangement, ind1, in0); 9850 __ ushr(ind1, arrangement, ind1, 2); 9851 9852 __ ushr(ind2, arrangement, in2, 4); 9853 __ shl(in1, arrangement, in1, 4); 9854 __ orr(ind2, arrangement, in1, ind2); 9855 __ ushr(ind2, arrangement, ind2, 2); 9856 9857 __ shl(ind3, arrangement, in2, 2); 9858 __ ushr(ind3, arrangement, ind3, 2); 9859 9860 __ tbl(out0, arrangement, codec, 4, ind0); 9861 __ tbl(out1, arrangement, codec, 4, ind1); 9862 __ tbl(out2, arrangement, codec, 4, ind2); 9863 __ tbl(out3, arrangement, codec, 4, ind3); 9864 9865 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 9866 } 9867 9868 /** 9869 * Arguments: 9870 * 9871 * Input: 9872 * c_rarg0 - src_start 9873 * c_rarg1 - src_offset 9874 * c_rarg2 - src_length 9875 * c_rarg3 - dest_start 9876 * c_rarg4 - dest_offset 9877 * c_rarg5 - isURL 9878 * 9879 */ 9880 address generate_base64_encodeBlock() { 9881 9882 static const char toBase64[64] = { 9883 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9884 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9885 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9886 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9887 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 9888 }; 9889 9890 static const char toBase64URL[64] = { 9891 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9892 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9893 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9894 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9895 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 9896 }; 9897 9898 __ align(CodeEntryAlignment); 9899 StubId stub_id = StubId::stubgen_base64_encodeBlock_id; 9900 StubCodeMark mark(this, stub_id); 9901 address start = __ pc(); 9902 9903 Register src = c_rarg0; // source array 9904 Register soff = c_rarg1; // source start offset 9905 Register send = c_rarg2; // source end offset 9906 Register dst = c_rarg3; // dest array 9907 Register doff = c_rarg4; // position for writing to dest array 9908 Register isURL = c_rarg5; // Base64 or URL character set 9909 9910 // c_rarg6 and c_rarg7 are free to use as temps 9911 Register codec = c_rarg6; 9912 Register length = c_rarg7; 9913 9914 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 9915 9916 __ add(src, src, soff); 9917 __ add(dst, dst, doff); 9918 __ sub(length, send, soff); 9919 9920 // load the codec base address 9921 __ lea(codec, ExternalAddress((address) toBase64)); 9922 __ cbz(isURL, ProcessData); 9923 __ lea(codec, ExternalAddress((address) toBase64URL)); 9924 9925 __ BIND(ProcessData); 9926 9927 // too short to formup a SIMD loop, roll back 9928 __ cmp(length, (u1)24); 9929 __ 
br(Assembler::LT, Process3B); 9930 9931 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 9932 9933 __ BIND(Process48B); 9934 __ cmp(length, (u1)48); 9935 __ br(Assembler::LT, Process24B); 9936 generate_base64_encode_simdround(src, dst, v0, 16); 9937 __ sub(length, length, 48); 9938 __ b(Process48B); 9939 9940 __ BIND(Process24B); 9941 __ cmp(length, (u1)24); 9942 __ br(Assembler::LT, SIMDExit); 9943 generate_base64_encode_simdround(src, dst, v0, 8); 9944 __ sub(length, length, 24); 9945 9946 __ BIND(SIMDExit); 9947 __ cbz(length, Exit); 9948 9949 __ BIND(Process3B); 9950 // 3 src bytes, 24 bits 9951 __ ldrb(r10, __ post(src, 1)); 9952 __ ldrb(r11, __ post(src, 1)); 9953 __ ldrb(r12, __ post(src, 1)); 9954 __ orrw(r11, r11, r10, Assembler::LSL, 8); 9955 __ orrw(r12, r12, r11, Assembler::LSL, 8); 9956 // codec index 9957 __ ubfmw(r15, r12, 18, 23); 9958 __ ubfmw(r14, r12, 12, 17); 9959 __ ubfmw(r13, r12, 6, 11); 9960 __ andw(r12, r12, 63); 9961 // get the code based on the codec 9962 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 9963 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 9964 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 9965 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 9966 __ strb(r15, __ post(dst, 1)); 9967 __ strb(r14, __ post(dst, 1)); 9968 __ strb(r13, __ post(dst, 1)); 9969 __ strb(r12, __ post(dst, 1)); 9970 __ sub(length, length, 3); 9971 __ cbnz(length, Process3B); 9972 9973 __ BIND(Exit); 9974 __ ret(lr); 9975 9976 return start; 9977 } 9978 9979 void generate_base64_decode_simdround(Register src, Register dst, 9980 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 9981 9982 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 9983 FloatRegister out0 = v20, out1 = v21, out2 = v22; 9984 9985 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 9986 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 9987 9988 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 9989 9990 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 9991 9992 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 9993 9994 // we need unsigned saturating subtract, to make sure all input values 9995 // in range [0, 63] will have 0U value in the higher half lookup 9996 __ uqsubv(decH0, __ T16B, in0, v27); 9997 __ uqsubv(decH1, __ T16B, in1, v27); 9998 __ uqsubv(decH2, __ T16B, in2, v27); 9999 __ uqsubv(decH3, __ T16B, in3, v27); 10000 10001 // lower half lookup 10002 __ tbl(decL0, arrangement, codecL, 4, in0); 10003 __ tbl(decL1, arrangement, codecL, 4, in1); 10004 __ tbl(decL2, arrangement, codecL, 4, in2); 10005 __ tbl(decL3, arrangement, codecL, 4, in3); 10006 10007 // higher half lookup 10008 __ tbx(decH0, arrangement, codecH, 4, decH0); 10009 __ tbx(decH1, arrangement, codecH, 4, decH1); 10010 __ tbx(decH2, arrangement, codecH, 4, decH2); 10011 __ tbx(decH3, arrangement, codecH, 4, decH3); 10012 10013 // combine lower and higher 10014 __ orr(decL0, arrangement, decL0, decH0); 10015 __ orr(decL1, arrangement, decL1, decH1); 10016 __ orr(decL2, arrangement, decL2, decH2); 10017 __ orr(decL3, arrangement, decL3, decH3); 10018 10019 // check illegal inputs, value larger than 63 (maximum of 6 bits) 10020 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 10021 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 10022 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 10023 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 10024 __ orr(in0, arrangement, decH0, decH1); 10025 __ orr(in1, arrangement, decH2, decH3); 10026 __ orr(in2, arrangement, in0, in1); 10027 __ umaxv(in3, arrangement, in2); 10028 __ umov(rscratch2, in3, __ B, 0); 10029 10030 // get the data to output 10031 __ shl(out0, arrangement, decL0, 2); 10032 __ ushr(out1, arrangement, decL1, 4); 10033 __ orr(out0, arrangement, out0, out1); 10034 __ shl(out1, arrangement, decL1, 4); 10035 __ ushr(out2, arrangement, decL2, 2); 10036 __ orr(out1, arrangement, out1, out2); 10037 __ shl(out2, arrangement, decL2, 6); 10038 __ orr(out2, arrangement, out2, decL3); 10039 10040 __ cbz(rscratch2, NoIllegalData); 10041 10042 // handle illegal input 10043 __ umov(r10, in2, __ D, 0); 10044 if (size == 16) { 10045 __ cbnz(r10, ErrorInLowerHalf); 10046 10047 // illegal input is in higher half, store the lower half now. 
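      // Each byte of in2 is now a 0x00/0xff flag recording whether the
      // corresponding group of four input characters contained an
      // illegal value; r10 holds those flags 64 bits at a time. The
      // StoreLegalData loop further down emits the three decoded bytes
      // of each clean group and stops at the first 0xff flag (the tbnz
      // on bit 5, which is set in 0xff but not in 0x00).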
10048 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 10049 10050 __ umov(r10, in2, __ D, 1); 10051 __ umov(r11, out0, __ D, 1); 10052 __ umov(r12, out1, __ D, 1); 10053 __ umov(r13, out2, __ D, 1); 10054 __ b(StoreLegalData); 10055 10056 __ BIND(ErrorInLowerHalf); 10057 } 10058 __ umov(r11, out0, __ D, 0); 10059 __ umov(r12, out1, __ D, 0); 10060 __ umov(r13, out2, __ D, 0); 10061 10062 __ BIND(StoreLegalData); 10063 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 10064 __ strb(r11, __ post(dst, 1)); 10065 __ strb(r12, __ post(dst, 1)); 10066 __ strb(r13, __ post(dst, 1)); 10067 __ lsr(r10, r10, 8); 10068 __ lsr(r11, r11, 8); 10069 __ lsr(r12, r12, 8); 10070 __ lsr(r13, r13, 8); 10071 __ b(StoreLegalData); 10072 10073 __ BIND(NoIllegalData); 10074 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 10075 } 10076 10077 10078 /** 10079 * Arguments: 10080 * 10081 * Input: 10082 * c_rarg0 - src_start 10083 * c_rarg1 - src_offset 10084 * c_rarg2 - src_length 10085 * c_rarg3 - dest_start 10086 * c_rarg4 - dest_offset 10087 * c_rarg5 - isURL 10088 * c_rarg6 - isMIME 10089 * 10090 */ 10091 address generate_base64_decodeBlock() { 10092 10093 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 10094 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 10095 // titled "Base64 decoding". 10096 10097 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 10098 // except the trailing character '=' is also treated illegal value in this intrinsic. That 10099 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 10100 static const uint8_t fromBase64ForNoSIMD[256] = { 10101 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10102 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10103 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 10104 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10105 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 10106 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 10107 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 10108 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 10109 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10110 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10111 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10112 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10113 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10114 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10115 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10116 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10117 }; 10118 10119 static const uint8_t fromBase64URLForNoSIMD[256] = { 10120 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10121 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 
255u, 255u, 255u, 255u, 10122 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 10123 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10124 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 10125 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 10126 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 10127 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 10128 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10129 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10130 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10131 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10132 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10133 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10134 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10135 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10136 }; 10137 10138 // A legal value of base64 code is in range [0, 127]. We need two lookups 10139 // with tbl/tbx and combine them to get the decode data. The 1st table vector 10140 // lookup use tbl, out of range indices are set to 0 in destination. The 2nd 10141 // table vector lookup use tbx, out of range indices are unchanged in 10142 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 10143 // The value of index 64 is set to 0, so that we know that we already get the 10144 // decoded data with the 1st lookup. 
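    // Worked example of the two-level lookup, using values from the
    // tables below (the first 64 bytes are loaded as the tbl codec,
    // the second 64 bytes as the tbx codec):
    //   '+' (0x2b): tbl -> byte 0x2b = 62; uqsub by 63 saturates to 0,
    //               so tbx picks table byte 64, whose value is 0;
    //               orr of the two halves gives 62.
    //   'z' (0x7a): tbl index 0x7a is out of range -> 0; uqsub gives
    //               0x7a - 63 = 59, so tbx picks table byte 64 + 59 =
    //               123, whose value is 51; orr gives 51.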
10145 static const uint8_t fromBase64ForSIMD[128] = { 10146 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10147 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10148 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 10149 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10150 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 10151 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 10152 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 10153 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 10154 }; 10155 10156 static const uint8_t fromBase64URLForSIMD[128] = { 10157 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10158 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10159 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 10160 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10161 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 10162 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 10163 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 10164 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 10165 }; 10166 10167 __ align(CodeEntryAlignment); 10168 StubId stub_id = StubId::stubgen_base64_decodeBlock_id; 10169 StubCodeMark mark(this, stub_id); 10170 address start = __ pc(); 10171 10172 Register src = c_rarg0; // source array 10173 Register soff = c_rarg1; // source start offset 10174 Register send = c_rarg2; // source end offset 10175 Register dst = c_rarg3; // dest array 10176 Register doff = c_rarg4; // position for writing to dest array 10177 Register isURL = c_rarg5; // Base64 or URL character set 10178 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 10179 10180 Register length = send; // reuse send as length of source data to process 10181 10182 Register simd_codec = c_rarg6; 10183 Register nosimd_codec = c_rarg7; 10184 10185 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 10186 10187 __ enter(); 10188 10189 __ add(src, src, soff); 10190 __ add(dst, dst, doff); 10191 10192 __ mov(doff, dst); 10193 10194 __ sub(length, send, soff); 10195 __ bfm(length, zr, 0, 1); 10196 10197 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 10198 __ cbz(isURL, ProcessData); 10199 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 10200 10201 __ BIND(ProcessData); 10202 __ mov(rscratch1, length); 10203 __ cmp(length, (u1)144); // 144 = 80 + 64 10204 __ br(Assembler::LT, Process4B); 10205 10206 // In the MIME case, the line length cannot be more than 76 10207 // bytes (see RFC 2045). This is too short a block for SIMD 10208 // to be worthwhile, so we use non-SIMD here. 
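    // Start the counter at 79 rather than 80: after the 4-byte loop
    // below has consumed twenty groups (80 bytes) the subsw leaves it
    // at -1 instead of 0, and the cbzw after the loop uses that to
    // distinguish this pre-processing pass from the final tail pass,
    // which ends at exactly 0.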
10209 __ movw(rscratch1, 79); 10210 10211 __ BIND(Process4B); 10212 __ ldrw(r14, __ post(src, 4)); 10213 __ ubfxw(r10, r14, 0, 8); 10214 __ ubfxw(r11, r14, 8, 8); 10215 __ ubfxw(r12, r14, 16, 8); 10216 __ ubfxw(r13, r14, 24, 8); 10217 // get the de-code 10218 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 10219 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 10220 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 10221 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 10222 // error detection, 255u indicates an illegal input 10223 __ orrw(r14, r10, r11); 10224 __ orrw(r15, r12, r13); 10225 __ orrw(r14, r14, r15); 10226 __ tbnz(r14, 7, Exit); 10227 // recover the data 10228 __ lslw(r14, r10, 10); 10229 __ bfiw(r14, r11, 4, 6); 10230 __ bfmw(r14, r12, 2, 5); 10231 __ rev16w(r14, r14); 10232 __ bfiw(r13, r12, 6, 2); 10233 __ strh(r14, __ post(dst, 2)); 10234 __ strb(r13, __ post(dst, 1)); 10235 // non-simd loop 10236 __ subsw(rscratch1, rscratch1, 4); 10237 __ br(Assembler::GT, Process4B); 10238 10239 // if exiting from PreProcess80B, rscratch1 == -1; 10240 // otherwise, rscratch1 == 0. 10241 __ cbzw(rscratch1, Exit); 10242 __ sub(length, length, 80); 10243 10244 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 10245 __ cbz(isURL, SIMDEnter); 10246 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 10247 10248 __ BIND(SIMDEnter); 10249 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 10250 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 10251 __ mov(rscratch1, 63); 10252 __ dup(v27, __ T16B, rscratch1); 10253 10254 __ BIND(Process64B); 10255 __ cmp(length, (u1)64); 10256 __ br(Assembler::LT, Process32B); 10257 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 10258 __ sub(length, length, 64); 10259 __ b(Process64B); 10260 10261 __ BIND(Process32B); 10262 __ cmp(length, (u1)32); 10263 __ br(Assembler::LT, SIMDExit); 10264 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 10265 __ sub(length, length, 32); 10266 __ b(Process32B); 10267 10268 __ BIND(SIMDExit); 10269 __ cbz(length, Exit); 10270 __ movw(rscratch1, length); 10271 __ b(Process4B); 10272 10273 __ BIND(Exit); 10274 __ sub(c_rarg0, dst, doff); 10275 10276 __ leave(); 10277 __ ret(lr); 10278 10279 return start; 10280 } 10281 10282 // Support for spin waits. 
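  // The stub body is a single MacroAssembler::spin_wait(), which is
  // expected to expand to the pause sequence configured via the
  // OnSpinWaitInst / OnSpinWaitInstCount flags (for example a run of
  // NOP, ISB or YIELD instructions).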
10283 address generate_spin_wait() { 10284 __ align(CodeEntryAlignment); 10285 StubId stub_id = StubId::stubgen_spin_wait_id; 10286 StubCodeMark mark(this, stub_id); 10287 address start = __ pc(); 10288 10289 __ spin_wait(); 10290 __ ret(lr); 10291 10292 return start; 10293 } 10294 10295 void generate_lookup_secondary_supers_table_stub() { 10296 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id; 10297 StubCodeMark mark(this, stub_id); 10298 10299 const Register 10300 r_super_klass = r0, 10301 r_array_base = r1, 10302 r_array_length = r2, 10303 r_array_index = r3, 10304 r_sub_klass = r4, 10305 r_bitmap = rscratch2, 10306 result = r5; 10307 const FloatRegister 10308 vtemp = v0; 10309 10310 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 10311 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 10312 Label L_success; 10313 __ enter(); 10314 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 10315 r_array_base, r_array_length, r_array_index, 10316 vtemp, result, slot, 10317 /*stub_is_near*/true); 10318 __ leave(); 10319 __ ret(lr); 10320 } 10321 } 10322 10323 // Slow path implementation for UseSecondarySupersTable. 10324 address generate_lookup_secondary_supers_table_slow_path_stub() { 10325 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id; 10326 StubCodeMark mark(this, stub_id); 10327 10328 address start = __ pc(); 10329 const Register 10330 r_super_klass = r0, // argument 10331 r_array_base = r1, // argument 10332 temp1 = r2, // temp 10333 r_array_index = r3, // argument 10334 r_bitmap = rscratch2, // argument 10335 result = r5; // argument 10336 10337 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 10338 __ ret(lr); 10339 10340 return start; 10341 } 10342 10343 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 10344 10345 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX. 10346 // 10347 // If LSE is in use, generate LSE versions of all the stubs. The 10348 // non-LSE versions are in atomic_aarch64.S. 10349 10350 // class AtomicStubMark records the entry point of a stub and the 10351 // stub pointer which will point to it. The stub pointer is set to 10352 // the entry point when ~AtomicStubMark() is called, which must be 10353 // after ICache::invalidate_range. This ensures safe publication of 10354 // the generated code. 10355 class AtomicStubMark { 10356 address _entry_point; 10357 aarch64_atomic_stub_t *_stub; 10358 MacroAssembler *_masm; 10359 public: 10360 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 10361 _masm = masm; 10362 __ align(32); 10363 _entry_point = __ pc(); 10364 _stub = stub; 10365 } 10366 ~AtomicStubMark() { 10367 *_stub = (aarch64_atomic_stub_t)_entry_point; 10368 } 10369 }; 10370 10371 // NB: For memory_order_conservative we need a trailing membar after 10372 // LSE atomic operations but not a leading membar. 10373 // 10374 // We don't need a leading membar because a clause in the Arm ARM 10375 // says: 10376 // 10377 // Barrier-ordered-before 10378 // 10379 // Barrier instructions order prior Memory effects before subsequent 10380 // Memory effects generated by the same Observer. A read or a write 10381 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 10382 // Observer if and only if RW1 appears in program order before RW 2 10383 // and [ ... 
] at least one of RW 1 and RW 2 is generated by an atomic 10384 // instruction with both Acquire and Release semantics. 10385 // 10386 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 10387 // and Release semantics, therefore we don't need a leading 10388 // barrier. However, there is no corresponding Barrier-ordered-after 10389 // relationship, therefore we need a trailing membar to prevent a 10390 // later store or load from being reordered with the store in an 10391 // atomic instruction. 10392 // 10393 // This was checked by using the herd7 consistency model simulator 10394 // (http://diy.inria.fr/) with this test case: 10395 // 10396 // AArch64 LseCas 10397 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 10398 // P0 | P1; 10399 // LDR W4, [X2] | MOV W3, #0; 10400 // DMB LD | MOV W4, #1; 10401 // LDR W3, [X1] | CASAL W3, W4, [X1]; 10402 // | DMB ISH; 10403 // | STR W4, [X2]; 10404 // exists 10405 // (0:X3=0 /\ 0:X4=1) 10406 // 10407 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 10408 // with the store to x in P1. Without the DMB in P1 this may happen. 10409 // 10410 // At the time of writing we don't know of any AArch64 hardware that 10411 // reorders stores in this way, but the Reference Manual permits it. 10412 10413 void gen_cas_entry(Assembler::operand_size size, 10414 atomic_memory_order order) { 10415 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 10416 exchange_val = c_rarg2; 10417 bool acquire, release; 10418 switch (order) { 10419 case memory_order_relaxed: 10420 acquire = false; 10421 release = false; 10422 break; 10423 case memory_order_release: 10424 acquire = false; 10425 release = true; 10426 break; 10427 default: 10428 acquire = true; 10429 release = true; 10430 break; 10431 } 10432 __ mov(prev, compare_val); 10433 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 10434 if (order == memory_order_conservative) { 10435 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10436 } 10437 if (size == Assembler::xword) { 10438 __ mov(r0, prev); 10439 } else { 10440 __ movw(r0, prev); 10441 } 10442 __ ret(lr); 10443 } 10444 10445 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 10446 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10447 // If not relaxed, then default to conservative. Relaxed is the only 10448 // case we use enough to be worth specializing. 10449 if (order == memory_order_relaxed) { 10450 __ ldadd(size, incr, prev, addr); 10451 } else { 10452 __ ldaddal(size, incr, prev, addr); 10453 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10454 } 10455 if (size == Assembler::xword) { 10456 __ mov(r0, prev); 10457 } else { 10458 __ movw(r0, prev); 10459 } 10460 __ ret(lr); 10461 } 10462 10463 void gen_swpal_entry(Assembler::operand_size size) { 10464 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10465 __ swpal(size, incr, prev, addr); 10466 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10467 if (size == Assembler::xword) { 10468 __ mov(r0, prev); 10469 } else { 10470 __ movw(r0, prev); 10471 } 10472 __ ret(lr); 10473 } 10474 10475 void generate_atomic_entry_points() { 10476 if (! 
UseLSE) { 10477 return; 10478 } 10479 __ align(CodeEntryAlignment); 10480 StubId stub_id = StubId::stubgen_atomic_entry_points_id; 10481 StubCodeMark mark(this, stub_id); 10482 address first_entry = __ pc(); 10483 10484 // ADD, memory_order_conservative 10485 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 10486 gen_ldadd_entry(Assembler::word, memory_order_conservative); 10487 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 10488 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 10489 10490 // ADD, memory_order_relaxed 10491 AtomicStubMark mark_fetch_add_4_relaxed 10492 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 10493 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 10494 AtomicStubMark mark_fetch_add_8_relaxed 10495 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 10496 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 10497 10498 // XCHG, memory_order_conservative 10499 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 10500 gen_swpal_entry(Assembler::word); 10501 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 10502 gen_swpal_entry(Assembler::xword); 10503 10504 // CAS, memory_order_conservative 10505 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 10506 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 10507 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 10508 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 10509 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 10510 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 10511 10512 // CAS, memory_order_relaxed 10513 AtomicStubMark mark_cmpxchg_1_relaxed 10514 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 10515 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 10516 AtomicStubMark mark_cmpxchg_4_relaxed 10517 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 10518 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 10519 AtomicStubMark mark_cmpxchg_8_relaxed 10520 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 10521 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 10522 10523 AtomicStubMark mark_cmpxchg_4_release 10524 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 10525 gen_cas_entry(MacroAssembler::word, memory_order_release); 10526 AtomicStubMark mark_cmpxchg_8_release 10527 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 10528 gen_cas_entry(MacroAssembler::xword, memory_order_release); 10529 10530 AtomicStubMark mark_cmpxchg_4_seq_cst 10531 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 10532 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 10533 AtomicStubMark mark_cmpxchg_8_seq_cst 10534 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 10535 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 10536 10537 ICache::invalidate_range(first_entry, __ pc() - first_entry); 10538 } 10539 #endif // LINUX 10540 10541 address generate_cont_thaw(Continuation::thaw_kind kind) { 10542 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 10543 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 10544 10545 address start = __ pc(); 10546 10547 if (return_barrier) { 10548 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 10549 __ mov(sp, rscratch1); 10550 } 10551 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, 
"incorrect sp"); 10552 10553 if (return_barrier) { 10554 // preserve possible return value from a method returning to the return barrier 10555 __ fmovd(rscratch1, v0); 10556 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10557 } 10558 10559 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 10560 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 10561 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 10562 10563 if (return_barrier) { 10564 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10565 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10566 __ fmovd(v0, rscratch1); 10567 } 10568 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10569 10570 10571 Label thaw_success; 10572 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 10573 __ cbnz(rscratch2, thaw_success); 10574 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 10575 __ br(rscratch1); 10576 __ bind(thaw_success); 10577 10578 // make room for the thawed frames 10579 __ sub(rscratch1, sp, rscratch2); 10580 __ andr(rscratch1, rscratch1, -16); // align 10581 __ mov(sp, rscratch1); 10582 10583 if (return_barrier) { 10584 // save original return value -- again 10585 __ fmovd(rscratch1, v0); 10586 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10587 } 10588 10589 // If we want, we can templatize thaw by kind, and have three different entries 10590 __ movw(c_rarg1, (uint32_t)kind); 10591 10592 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 10593 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 10594 10595 if (return_barrier) { 10596 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10597 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10598 __ fmovd(v0, rscratch1); 10599 } else { 10600 __ mov(r0, zr); // return 0 (success) from doYield 10601 } 10602 10603 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 10604 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 10605 __ mov(rfp, sp); 10606 10607 if (return_barrier_exception) { 10608 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 10609 __ authenticate_return_address(c_rarg1); 10610 __ verify_oop(r0); 10611 // save return value containing the exception oop in callee-saved R19 10612 __ mov(r19, r0); 10613 10614 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 10615 10616 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
10617 // __ reinitialize_ptrue(); 10618 10619 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 10620 10621 __ mov(r1, r0); // the exception handler 10622 __ mov(r0, r19); // restore return value containing the exception oop 10623 __ verify_oop(r0); 10624 10625 __ leave(); 10626 __ mov(r3, lr); 10627 __ br(r1); // the exception handler 10628 } else { 10629 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 10630 __ leave(); 10631 __ ret(lr); 10632 } 10633 10634 return start; 10635 } 10636 10637 address generate_cont_thaw() { 10638 if (!Continuations::enabled()) return nullptr; 10639 10640 StubId stub_id = StubId::stubgen_cont_thaw_id; 10641 StubCodeMark mark(this, stub_id); 10642 address start = __ pc(); 10643 generate_cont_thaw(Continuation::thaw_top); 10644 return start; 10645 } 10646 10647 address generate_cont_returnBarrier() { 10648 if (!Continuations::enabled()) return nullptr; 10649 10650 // TODO: will probably need multiple return barriers depending on return type 10651 StubId stub_id = StubId::stubgen_cont_returnBarrier_id; 10652 StubCodeMark mark(this, stub_id); 10653 address start = __ pc(); 10654 10655 generate_cont_thaw(Continuation::thaw_return_barrier); 10656 10657 return start; 10658 } 10659 10660 address generate_cont_returnBarrier_exception() { 10661 if (!Continuations::enabled()) return nullptr; 10662 10663 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id; 10664 StubCodeMark mark(this, stub_id); 10665 address start = __ pc(); 10666 10667 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 10668 10669 return start; 10670 } 10671 10672 address generate_cont_preempt_stub() { 10673 if (!Continuations::enabled()) return nullptr; 10674 StubId stub_id = StubId::stubgen_cont_preempt_id; 10675 StubCodeMark mark(this, stub_id); 10676 address start = __ pc(); 10677 10678 __ reset_last_Java_frame(true); 10679 10680 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 10681 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 10682 __ mov(sp, rscratch2); 10683 10684 Label preemption_cancelled; 10685 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 10686 __ cbnz(rscratch1, preemption_cancelled); 10687 10688 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 10689 SharedRuntime::continuation_enter_cleanup(_masm); 10690 __ leave(); 10691 __ ret(lr); 10692 10693 // We acquired the monitor after freezing the frames so call thaw to continue execution. 10694 __ bind(preemption_cancelled); 10695 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 10696 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 10697 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 10698 __ ldr(rscratch1, Address(rscratch1)); 10699 __ br(rscratch1); 10700 10701 return start; 10702 } 10703 10704 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 10705 // are represented as long[5], with BITS_PER_LIMB = 26. 10706 // Pack five 26-bit limbs into three 64-bit registers. 
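  // Sketch of the packing performed below, with limb[i] denoting the
  // i-th (nominally 26-bit) element of the long[5] at src:
  //   dest0 = limb[0] | limb[1] << 26 | limb[2] << 52    (low 12 bits of limb[2])
  //   dest1 = limb[2] >> 12 | limb[3] << 14 | limb[4] << 40   (low 24 bits of limb[4])
  //   dest2 = limb[4] >> 24                               (remaining high bits)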
10707 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 10708 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 10709 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 10710 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 10711 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 10712 10713 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 10714 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 10715 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 10716 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 10717 10718 if (dest2->is_valid()) { 10719 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 10720 } else { 10721 #ifdef ASSERT 10722 Label OK; 10723 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 10724 __ br(__ EQ, OK); 10725 __ stop("high bits of Poly1305 integer should be zero"); 10726 __ should_not_reach_here(); 10727 __ bind(OK); 10728 #endif 10729 } 10730 } 10731 10732 // As above, but return only a 128-bit integer, packed into two 10733 // 64-bit registers. 10734 void pack_26(Register dest0, Register dest1, Register src) { 10735 pack_26(dest0, dest1, noreg, src); 10736 } 10737 10738 // Multiply and multiply-accumulate unsigned 64-bit registers. 10739 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 10740 __ mul(prod_lo, n, m); 10741 __ umulh(prod_hi, n, m); 10742 } 10743 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 10744 wide_mul(rscratch1, rscratch2, n, m); 10745 __ adds(sum_lo, sum_lo, rscratch1); 10746 __ adc(sum_hi, sum_hi, rscratch2); 10747 } 10748 10749 // Poly1305, RFC 7539 10750 10751 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 10752 // description of the tricks used to simplify and accelerate this 10753 // computation. 10754 10755 address generate_poly1305_processBlocks() { 10756 __ align(CodeEntryAlignment); 10757 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id; 10758 StubCodeMark mark(this, stub_id); 10759 address start = __ pc(); 10760 Label here; 10761 __ enter(); 10762 RegSet callee_saved = RegSet::range(r19, r28); 10763 __ push(callee_saved, sp); 10764 10765 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 10766 10767 // Arguments 10768 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 10769 10770 // R_n is the 128-bit randomly-generated key, packed into two 10771 // registers. The caller passes this key to us as long[5], with 10772 // BITS_PER_LIMB = 26. 
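    // (R_0 receives the low 64 bits of the key and R_1 the high 64
    // bits; pack_26 does the long[5] -> two-register conversion. All
    // of the arithmetic below works on this 64-bit-limb form, and the
    // accumulator is converted back to 26-bit limbs just before DONE.)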
10773 const Register R_0 = *++regs, R_1 = *++regs; 10774 pack_26(R_0, R_1, r_start); 10775 10776 // RR_n is (R_n >> 2) * 5 10777 const Register RR_0 = *++regs, RR_1 = *++regs; 10778 __ lsr(RR_0, R_0, 2); 10779 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 10780 __ lsr(RR_1, R_1, 2); 10781 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 10782 10783 // U_n is the current checksum 10784 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 10785 pack_26(U_0, U_1, U_2, acc_start); 10786 10787 static constexpr int BLOCK_LENGTH = 16; 10788 Label DONE, LOOP; 10789 10790 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10791 __ br(Assembler::LT, DONE); { 10792 __ bind(LOOP); 10793 10794 // S_n is to be the sum of U_n and the next block of data 10795 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 10796 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 10797 __ adds(S_0, U_0, S_0); 10798 __ adcs(S_1, U_1, S_1); 10799 __ adc(S_2, U_2, zr); 10800 __ add(S_2, S_2, 1); 10801 10802 const Register U_0HI = *++regs, U_1HI = *++regs; 10803 10804 // NB: this logic depends on some of the special properties of 10805 // Poly1305 keys. In particular, because we know that the top 10806 // four bits of R_0 and R_1 are zero, we can add together 10807 // partial products without any risk of needing to propagate a 10808 // carry out. 10809 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 10810 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 10811 __ andr(U_2, R_0, 3); 10812 __ mul(U_2, S_2, U_2); 10813 10814 // Recycle registers S_0, S_1, S_2 10815 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 10816 10817 // Partial reduction mod 2**130 - 5 10818 __ adds(U_1, U_0HI, U_1); 10819 __ adc(U_2, U_1HI, U_2); 10820 // Sum now in U_2:U_1:U_0. 10821 // Dead: U_0HI, U_1HI. 10822 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 10823 10824 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 10825 10826 // First, U_2:U_1:U_0 += (U_2 >> 2) 10827 __ lsr(rscratch1, U_2, 2); 10828 __ andr(U_2, U_2, (u8)3); 10829 __ adds(U_0, U_0, rscratch1); 10830 __ adcs(U_1, U_1, zr); 10831 __ adc(U_2, U_2, zr); 10832 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 10833 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 10834 __ adcs(U_1, U_1, zr); 10835 __ adc(U_2, U_2, zr); 10836 10837 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 10838 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10839 __ br(~ Assembler::LT, LOOP); 10840 } 10841 10842 // Further reduce modulo 2^130 - 5 10843 __ lsr(rscratch1, U_2, 2); 10844 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 10845 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 10846 __ adcs(U_1, U_1, zr); 10847 __ andr(U_2, U_2, (u1)3); 10848 __ adc(U_2, U_2, zr); 10849 10850 // Unpack the sum into five 26-bit limbs and write to memory. 
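    // Layout written back (the inverse of pack_26):
    //   acc[0] = U_0[25:0]          acc[1] = U_0[51:26]
    //   acc[2] = U_0[63:52] | U_1[13:0] << 12
    //   acc[3] = U_1[39:14]
    //   acc[4] = U_1[63:40] | U_2[2:0] << 24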
10851 __ ubfiz(rscratch1, U_0, 0, 26); 10852 __ ubfx(rscratch2, U_0, 26, 26); 10853 __ stp(rscratch1, rscratch2, Address(acc_start)); 10854 __ ubfx(rscratch1, U_0, 52, 12); 10855 __ bfi(rscratch1, U_1, 12, 14); 10856 __ ubfx(rscratch2, U_1, 14, 26); 10857 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 10858 __ ubfx(rscratch1, U_1, 40, 24); 10859 __ bfi(rscratch1, U_2, 24, 3); 10860 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 10861 10862 __ bind(DONE); 10863 __ pop(callee_saved, sp); 10864 __ leave(); 10865 __ ret(lr); 10866 10867 return start; 10868 } 10869 10870 // exception handler for upcall stubs 10871 address generate_upcall_stub_exception_handler() { 10872 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id; 10873 StubCodeMark mark(this, stub_id); 10874 address start = __ pc(); 10875 10876 // Native caller has no idea how to handle exceptions, 10877 // so we just crash here. Up to callee to catch exceptions. 10878 __ verify_oop(r0); 10879 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 10880 __ blr(rscratch1); 10881 __ should_not_reach_here(); 10882 10883 return start; 10884 } 10885 10886 // load Method* target of MethodHandle 10887 // j_rarg0 = jobject receiver 10888 // rmethod = result 10889 address generate_upcall_stub_load_target() { 10890 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id; 10891 StubCodeMark mark(this, stub_id); 10892 address start = __ pc(); 10893 10894 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 10895 // Load target method from receiver 10896 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 10897 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 10898 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 10899 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 10900 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 10901 noreg, noreg); 10902 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 10903 10904 __ ret(lr); 10905 10906 return start; 10907 } 10908 10909 #undef __ 10910 #define __ masm-> 10911 10912 class MontgomeryMultiplyGenerator : public MacroAssembler { 10913 10914 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 10915 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 10916 10917 RegSet _toSave; 10918 bool _squaring; 10919 10920 public: 10921 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 10922 : MacroAssembler(as->code()), _squaring(squaring) { 10923 10924 // Register allocation 10925 10926 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 10927 Pa_base = *regs; // Argument registers 10928 if (squaring) 10929 Pb_base = Pa_base; 10930 else 10931 Pb_base = *++regs; 10932 Pn_base = *++regs; 10933 Rlen= *++regs; 10934 inv = *++regs; 10935 Pm_base = *++regs; 10936 10937 // Working registers: 10938 Ra = *++regs; // The current digit of a, b, n, and m. 10939 Rb = *++regs; 10940 Rm = *++regs; 10941 Rn = *++regs; 10942 10943 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 10944 Pb = *++regs; 10945 Pm = *++regs; 10946 Pn = *++regs; 10947 10948 t0 = *++regs; // Three registers which form a 10949 t1 = *++regs; // triple-precision accumuator. 
10950 t2 = *++regs; 10951 10952 Ri = *++regs; // Inner and outer loop indexes. 10953 Rj = *++regs; 10954 10955 Rhi_ab = *++regs; // Product registers: low and high parts 10956 Rlo_ab = *++regs; // of a*b and m*n. 10957 Rhi_mn = *++regs; 10958 Rlo_mn = *++regs; 10959 10960 // r19 and up are callee-saved. 10961 _toSave = RegSet::range(r19, *regs) + Pm_base; 10962 } 10963 10964 private: 10965 void save_regs() { 10966 push(_toSave, sp); 10967 } 10968 10969 void restore_regs() { 10970 pop(_toSave, sp); 10971 } 10972 10973 template <typename T> 10974 void unroll_2(Register count, T block) { 10975 Label loop, end, odd; 10976 tbnz(count, 0, odd); 10977 cbz(count, end); 10978 align(16); 10979 bind(loop); 10980 (this->*block)(); 10981 bind(odd); 10982 (this->*block)(); 10983 subs(count, count, 2); 10984 br(Assembler::GT, loop); 10985 bind(end); 10986 } 10987 10988 template <typename T> 10989 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 10990 Label loop, end, odd; 10991 tbnz(count, 0, odd); 10992 cbz(count, end); 10993 align(16); 10994 bind(loop); 10995 (this->*block)(d, s, tmp); 10996 bind(odd); 10997 (this->*block)(d, s, tmp); 10998 subs(count, count, 2); 10999 br(Assembler::GT, loop); 11000 bind(end); 11001 } 11002 11003 void pre1(RegisterOrConstant i) { 11004 block_comment("pre1"); 11005 // Pa = Pa_base; 11006 // Pb = Pb_base + i; 11007 // Pm = Pm_base; 11008 // Pn = Pn_base + i; 11009 // Ra = *Pa; 11010 // Rb = *Pb; 11011 // Rm = *Pm; 11012 // Rn = *Pn; 11013 ldr(Ra, Address(Pa_base)); 11014 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 11015 ldr(Rm, Address(Pm_base)); 11016 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11017 lea(Pa, Address(Pa_base)); 11018 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 11019 lea(Pm, Address(Pm_base)); 11020 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11021 11022 // Zero the m*n result. 11023 mov(Rhi_mn, zr); 11024 mov(Rlo_mn, zr); 11025 } 11026 11027 // The core multiply-accumulate step of a Montgomery 11028 // multiplication. The idea is to schedule operations as a 11029 // pipeline so that instructions with long latencies (loads and 11030 // multiplies) have time to complete before their results are 11031 // used. This most benefits in-order implementations of the 11032 // architecture but out-of-order ones also benefit. 11033 void step() { 11034 block_comment("step"); 11035 // MACC(Ra, Rb, t0, t1, t2); 11036 // Ra = *++Pa; 11037 // Rb = *--Pb; 11038 umulh(Rhi_ab, Ra, Rb); 11039 mul(Rlo_ab, Ra, Rb); 11040 ldr(Ra, pre(Pa, wordSize)); 11041 ldr(Rb, pre(Pb, -wordSize)); 11042 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 11043 // previous iteration. 
11044 // MACC(Rm, Rn, t0, t1, t2); 11045 // Rm = *++Pm; 11046 // Rn = *--Pn; 11047 umulh(Rhi_mn, Rm, Rn); 11048 mul(Rlo_mn, Rm, Rn); 11049 ldr(Rm, pre(Pm, wordSize)); 11050 ldr(Rn, pre(Pn, -wordSize)); 11051 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11052 } 11053 11054 void post1() { 11055 block_comment("post1"); 11056 11057 // MACC(Ra, Rb, t0, t1, t2); 11058 // Ra = *++Pa; 11059 // Rb = *--Pb; 11060 umulh(Rhi_ab, Ra, Rb); 11061 mul(Rlo_ab, Ra, Rb); 11062 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11063 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11064 11065 // *Pm = Rm = t0 * inv; 11066 mul(Rm, t0, inv); 11067 str(Rm, Address(Pm)); 11068 11069 // MACC(Rm, Rn, t0, t1, t2); 11070 // t0 = t1; t1 = t2; t2 = 0; 11071 umulh(Rhi_mn, Rm, Rn); 11072 11073 #ifndef PRODUCT 11074 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 11075 { 11076 mul(Rlo_mn, Rm, Rn); 11077 add(Rlo_mn, t0, Rlo_mn); 11078 Label ok; 11079 cbz(Rlo_mn, ok); { 11080 stop("broken Montgomery multiply"); 11081 } bind(ok); 11082 } 11083 #endif 11084 // We have very carefully set things up so that 11085 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 11086 // the lower half of Rm * Rn because we know the result already: 11087 // it must be -t0. t0 + (-t0) must generate a carry iff 11088 // t0 != 0. So, rather than do a mul and an adds we just set 11089 // the carry flag iff t0 is nonzero. 11090 // 11091 // mul(Rlo_mn, Rm, Rn); 11092 // adds(zr, t0, Rlo_mn); 11093 subs(zr, t0, 1); // Set carry iff t0 is nonzero 11094 adcs(t0, t1, Rhi_mn); 11095 adc(t1, t2, zr); 11096 mov(t2, zr); 11097 } 11098 11099 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 11100 block_comment("pre2"); 11101 // Pa = Pa_base + i-len; 11102 // Pb = Pb_base + len; 11103 // Pm = Pm_base + i-len; 11104 // Pn = Pn_base + len; 11105 11106 if (i.is_register()) { 11107 sub(Rj, i.as_register(), len); 11108 } else { 11109 mov(Rj, i.as_constant()); 11110 sub(Rj, Rj, len); 11111 } 11112 // Rj == i-len 11113 11114 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 11115 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 11116 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 11117 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 11118 11119 // Ra = *++Pa; 11120 // Rb = *--Pb; 11121 // Rm = *++Pm; 11122 // Rn = *--Pn; 11123 ldr(Ra, pre(Pa, wordSize)); 11124 ldr(Rb, pre(Pb, -wordSize)); 11125 ldr(Rm, pre(Pm, wordSize)); 11126 ldr(Rn, pre(Pn, -wordSize)); 11127 11128 mov(Rhi_mn, zr); 11129 mov(Rlo_mn, zr); 11130 } 11131 11132 void post2(RegisterOrConstant i, RegisterOrConstant len) { 11133 block_comment("post2"); 11134 if (i.is_constant()) { 11135 mov(Rj, i.as_constant()-len.as_constant()); 11136 } else { 11137 sub(Rj, i.as_register(), len); 11138 } 11139 11140 adds(t0, t0, Rlo_mn); // The pending m*n, low part 11141 11142 // As soon as we know the least significant digit of our result, 11143 // store it. 11144 // Pm_base[i-len] = t0; 11145 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 11146 11147 // t0 = t1; t1 = t2; t2 = 0; 11148 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 11149 adc(t1, t2, zr); 11150 mov(t2, zr); 11151 } 11152 11153 // A carry in t0 after Montgomery multiplication means that we 11154 // should subtract multiples of n from our result in m. We'll 11155 // keep doing that until there is no carry. 
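  // In pseudo-code, the loop generated below is roughly:
  //   while (t0 != 0) {
  //     borrow = 0;
  //     for (i = 0; i < len; i++)
  //       (borrow, m[i]) = m[i] - n[i] - borrow;
  //     t0 -= borrow;
  //   }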
11156 void normalize(RegisterOrConstant len) { 11157 block_comment("normalize"); 11158 // while (t0) 11159 // t0 = sub(Pm_base, Pn_base, t0, len); 11160 Label loop, post, again; 11161 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 11162 cbz(t0, post); { 11163 bind(again); { 11164 mov(i, zr); 11165 mov(cnt, len); 11166 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11167 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11168 subs(zr, zr, zr); // set carry flag, i.e. no borrow 11169 align(16); 11170 bind(loop); { 11171 sbcs(Rm, Rm, Rn); 11172 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11173 add(i, i, 1); 11174 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11175 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11176 sub(cnt, cnt, 1); 11177 } cbnz(cnt, loop); 11178 sbc(t0, t0, zr); 11179 } cbnz(t0, again); 11180 } bind(post); 11181 } 11182 11183 // Move memory at s to d, reversing words. 11184 // Increments d to end of copied memory 11185 // Destroys tmp1, tmp2 11186 // Preserves len 11187 // Leaves s pointing to the address which was in d at start 11188 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 11189 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 11190 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 11191 11192 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 11193 mov(tmp1, len); 11194 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 11195 sub(s, d, len, ext::uxtw, LogBytesPerWord); 11196 } 11197 // where 11198 void reverse1(Register d, Register s, Register tmp) { 11199 ldr(tmp, pre(s, -wordSize)); 11200 ror(tmp, tmp, 32); 11201 str(tmp, post(d, wordSize)); 11202 } 11203 11204 void step_squaring() { 11205 // An extra ACC 11206 step(); 11207 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11208 } 11209 11210 void last_squaring(RegisterOrConstant i) { 11211 Label dont; 11212 // if ((i & 1) == 0) { 11213 tbnz(i.as_register(), 0, dont); { 11214 // MACC(Ra, Rb, t0, t1, t2); 11215 // Ra = *++Pa; 11216 // Rb = *--Pb; 11217 umulh(Rhi_ab, Ra, Rb); 11218 mul(Rlo_ab, Ra, Rb); 11219 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11220 } bind(dont); 11221 } 11222 11223 void extra_step_squaring() { 11224 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11225 11226 // MACC(Rm, Rn, t0, t1, t2); 11227 // Rm = *++Pm; 11228 // Rn = *--Pn; 11229 umulh(Rhi_mn, Rm, Rn); 11230 mul(Rlo_mn, Rm, Rn); 11231 ldr(Rm, pre(Pm, wordSize)); 11232 ldr(Rn, pre(Pn, -wordSize)); 11233 } 11234 11235 void post1_squaring() { 11236 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11237 11238 // *Pm = Rm = t0 * inv; 11239 mul(Rm, t0, inv); 11240 str(Rm, Address(Pm)); 11241 11242 // MACC(Rm, Rn, t0, t1, t2); 11243 // t0 = t1; t1 = t2; t2 = 0; 11244 umulh(Rhi_mn, Rm, Rn); 11245 11246 #ifndef PRODUCT 11247 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 11248 { 11249 mul(Rlo_mn, Rm, Rn); 11250 add(Rlo_mn, t0, Rlo_mn); 11251 Label ok; 11252 cbz(Rlo_mn, ok); { 11253 stop("broken Montgomery multiply"); 11254 } bind(ok); 11255 } 11256 #endif 11257 // We have very carefully set things up so that 11258 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 11259 // the lower half of Rm * Rn because we know the result already: 11260 // it must be -t0. t0 + (-t0) must generate a carry iff 11261 // t0 != 0. So, rather than do a mul and an adds we just set 11262 // the carry flag iff t0 is nonzero. 
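// (To spell the equivalence out: the skipped adds(zr, t0, -t0) would
// produce a carry exactly when t0 + (2^64 - t0) wraps past 2^64, i.e.
// when t0 != 0; subs(zr, t0, 1) sets the carry exactly when t0 >= 1,
// which is the same condition.)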
11263 // 11264 // mul(Rlo_mn, Rm, Rn); 11265 // adds(zr, t0, Rlo_mn); 11266 subs(zr, t0, 1); // Set carry iff t0 is nonzero 11267 adcs(t0, t1, Rhi_mn); 11268 adc(t1, t2, zr); 11269 mov(t2, zr); 11270 } 11271 11272 void acc(Register Rhi, Register Rlo, 11273 Register t0, Register t1, Register t2) { 11274 adds(t0, t0, Rlo); 11275 adcs(t1, t1, Rhi); 11276 adc(t2, t2, zr); 11277 } 11278 11279 public: 11280 /** 11281 * Fast Montgomery multiplication. The derivation of the 11282 * algorithm is in A Cryptographic Library for the Motorola 11283 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 11284 * 11285 * Arguments: 11286 * 11287 * Inputs for multiplication: 11288 * c_rarg0 - int array elements a 11289 * c_rarg1 - int array elements b 11290 * c_rarg2 - int array elements n (the modulus) 11291 * c_rarg3 - int length 11292 * c_rarg4 - int inv 11293 * c_rarg5 - int array elements m (the result) 11294 * 11295 * Inputs for squaring: 11296 * c_rarg0 - int array elements a 11297 * c_rarg1 - int array elements n (the modulus) 11298 * c_rarg2 - int length 11299 * c_rarg3 - int inv 11300 * c_rarg4 - int array elements m (the result) 11301 * 11302 */ 11303 address generate_multiply() { 11304 Label argh, nothing; 11305 bind(argh); 11306 stop("MontgomeryMultiply total_allocation must be <= 8192"); 11307 11308 align(CodeEntryAlignment); 11309 address entry = pc(); 11310 11311 cbzw(Rlen, nothing); 11312 11313 enter(); 11314 11315 // Make room. 11316 cmpw(Rlen, 512); 11317 br(Assembler::HI, argh); 11318 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 11319 andr(sp, Ra, -2 * wordSize); 11320 11321 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 11322 11323 { 11324 // Copy input args, reversing as we go. We use Ra as a 11325 // temporary variable. 11326 reverse(Ra, Pa_base, Rlen, t0, t1); 11327 if (!_squaring) 11328 reverse(Ra, Pb_base, Rlen, t0, t1); 11329 reverse(Ra, Pn_base, Rlen, t0, t1); 11330 } 11331 11332 // Push all call-saved registers and also Pm_base which we'll need 11333 // at the end. 
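// (Sizing note on the scratch area carved out of the stack above: it
// holds the reversed copies of the inputs plus the result -- at most
// four arrays of Rlen ints, i.e. 4 * 4 * Rlen bytes. Rlen was checked
// to be <= 512 by the cmpw/br(HI, argh) above, so the area is at most
// 8192 bytes, which is the bound quoted in the stop() message at the
// argh label.)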
11334 save_regs(); 11335 11336 #ifndef PRODUCT 11337 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 11338 { 11339 ldr(Rn, Address(Pn_base, 0)); 11340 mul(Rlo_mn, Rn, inv); 11341 subs(zr, Rlo_mn, -1); 11342 Label ok; 11343 br(EQ, ok); { 11344 stop("broken inverse in Montgomery multiply"); 11345 } bind(ok); 11346 } 11347 #endif 11348 11349 mov(Pm_base, Ra); 11350 11351 mov(t0, zr); 11352 mov(t1, zr); 11353 mov(t2, zr); 11354 11355 block_comment("for (int i = 0; i < len; i++) {"); 11356 mov(Ri, zr); { 11357 Label loop, end; 11358 cmpw(Ri, Rlen); 11359 br(Assembler::GE, end); 11360 11361 bind(loop); 11362 pre1(Ri); 11363 11364 block_comment(" for (j = i; j; j--) {"); { 11365 movw(Rj, Ri); 11366 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 11367 } block_comment(" } // j"); 11368 11369 post1(); 11370 addw(Ri, Ri, 1); 11371 cmpw(Ri, Rlen); 11372 br(Assembler::LT, loop); 11373 bind(end); 11374 block_comment("} // i"); 11375 } 11376 11377 block_comment("for (int i = len; i < 2*len; i++) {"); 11378 mov(Ri, Rlen); { 11379 Label loop, end; 11380 cmpw(Ri, Rlen, Assembler::LSL, 1); 11381 br(Assembler::GE, end); 11382 11383 bind(loop); 11384 pre2(Ri, Rlen); 11385 11386 block_comment(" for (j = len*2-i-1; j; j--) {"); { 11387 lslw(Rj, Rlen, 1); 11388 subw(Rj, Rj, Ri); 11389 subw(Rj, Rj, 1); 11390 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 11391 } block_comment(" } // j"); 11392 11393 post2(Ri, Rlen); 11394 addw(Ri, Ri, 1); 11395 cmpw(Ri, Rlen, Assembler::LSL, 1); 11396 br(Assembler::LT, loop); 11397 bind(end); 11398 } 11399 block_comment("} // i"); 11400 11401 normalize(Rlen); 11402 11403 mov(Ra, Pm_base); // Save Pm_base in Ra 11404 restore_regs(); // Restore caller's Pm_base 11405 11406 // Copy our result into caller's Pm_base 11407 reverse(Pm_base, Ra, Rlen, t0, t1); 11408 11409 leave(); 11410 bind(nothing); 11411 ret(lr); 11412 11413 return entry; 11414 } 11415 // In C, approximately: 11416 11417 // void 11418 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 11419 // julong Pn_base[], julong Pm_base[], 11420 // julong inv, int len) { 11421 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11422 // julong *Pa, *Pb, *Pn, *Pm; 11423 // julong Ra, Rb, Rn, Rm; 11424 11425 // int i; 11426 11427 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11428 11429 // for (i = 0; i < len; i++) { 11430 // int j; 11431 11432 // Pa = Pa_base; 11433 // Pb = Pb_base + i; 11434 // Pm = Pm_base; 11435 // Pn = Pn_base + i; 11436 11437 // Ra = *Pa; 11438 // Rb = *Pb; 11439 // Rm = *Pm; 11440 // Rn = *Pn; 11441 11442 // int iters = i; 11443 // for (j = 0; iters--; j++) { 11444 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11445 // MACC(Ra, Rb, t0, t1, t2); 11446 // Ra = *++Pa; 11447 // Rb = *--Pb; 11448 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11449 // MACC(Rm, Rn, t0, t1, t2); 11450 // Rm = *++Pm; 11451 // Rn = *--Pn; 11452 // } 11453 11454 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 11455 // MACC(Ra, Rb, t0, t1, t2); 11456 // *Pm = Rm = t0 * inv; 11457 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11458 // MACC(Rm, Rn, t0, t1, t2); 11459 11460 // assert(t0 == 0, "broken Montgomery multiply"); 11461 11462 // t0 = t1; t1 = t2; t2 = 0; 11463 // } 11464 11465 // for (i = len; i < 2*len; i++) { 11466 // int j; 11467 11468 // Pa = Pa_base + i-len; 11469 // Pb = Pb_base + len; 11470 // Pm = Pm_base + i-len; 11471 // Pn = Pn_base + len; 11472 11473 // Ra = *++Pa; 11474 // Rb = 
*--Pb; 11475 // Rm = *++Pm; 11476 // Rn = *--Pn; 11477 11478 // int iters = len*2-i-1; 11479 // for (j = i-len+1; iters--; j++) { 11480 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11481 // MACC(Ra, Rb, t0, t1, t2); 11482 // Ra = *++Pa; 11483 // Rb = *--Pb; 11484 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11485 // MACC(Rm, Rn, t0, t1, t2); 11486 // Rm = *++Pm; 11487 // Rn = *--Pn; 11488 // } 11489 11490 // Pm_base[i-len] = t0; 11491 // t0 = t1; t1 = t2; t2 = 0; 11492 // } 11493 11494 // while (t0) 11495 // t0 = sub(Pm_base, Pn_base, t0, len); 11496 // } 11497 11498 /** 11499 * Fast Montgomery squaring. This uses asymptotically 25% fewer 11500 * multiplies than Montgomery multiplication so it should be up to 11501 * 25% faster. However, its loop control is more complex and it 11502 * may actually run slower on some machines. 11503 * 11504 * Arguments: 11505 * 11506 * Inputs: 11507 * c_rarg0 - int array elements a 11508 * c_rarg1 - int array elements n (the modulus) 11509 * c_rarg2 - int length 11510 * c_rarg3 - int inv 11511 * c_rarg4 - int array elements m (the result) 11512 * 11513 */ 11514 address generate_square() { 11515 Label argh; 11516 bind(argh); 11517 stop("MontgomeryMultiply total_allocation must be <= 8192"); 11518 11519 align(CodeEntryAlignment); 11520 address entry = pc(); 11521 11522 enter(); 11523 11524 // Make room. 11525 cmpw(Rlen, 512); 11526 br(Assembler::HI, argh); 11527 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 11528 andr(sp, Ra, -2 * wordSize); 11529 11530 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 11531 11532 { 11533 // Copy input args, reversing as we go. We use Ra as a 11534 // temporary variable. 11535 reverse(Ra, Pa_base, Rlen, t0, t1); 11536 reverse(Ra, Pn_base, Rlen, t0, t1); 11537 } 11538 11539 // Push all call-saved registers and also Pm_base which we'll need 11540 // at the end. 
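// (Where the multiply savings advertised above come from: when
// squaring, the partial products a[j]*a[i-j] and a[i-j]*a[j] are equal,
// so each off-diagonal pair is computed once and doubled -- the MACC2
// of the pseudocode below -- rather than twice. The m*n half of the
// work is unchanged, which is why the saving is roughly 25% overall.)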
11541 save_regs(); 11542 11543 mov(Pm_base, Ra); 11544 11545 mov(t0, zr); 11546 mov(t1, zr); 11547 mov(t2, zr); 11548 11549 block_comment("for (int i = 0; i < len; i++) {"); 11550 mov(Ri, zr); { 11551 Label loop, end; 11552 bind(loop); 11553 cmp(Ri, Rlen); 11554 br(Assembler::GE, end); 11555 11556 pre1(Ri); 11557 11558 block_comment("for (j = (i+1)/2; j; j--) {"); { 11559 add(Rj, Ri, 1); 11560 lsr(Rj, Rj, 1); 11561 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11562 } block_comment(" } // j"); 11563 11564 last_squaring(Ri); 11565 11566 block_comment(" for (j = i/2; j; j--) {"); { 11567 lsr(Rj, Ri, 1); 11568 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11569 } block_comment(" } // j"); 11570 11571 post1_squaring(); 11572 add(Ri, Ri, 1); 11573 cmp(Ri, Rlen); 11574 br(Assembler::LT, loop); 11575 11576 bind(end); 11577 block_comment("} // i"); 11578 } 11579 11580 block_comment("for (int i = len; i < 2*len; i++) {"); 11581 mov(Ri, Rlen); { 11582 Label loop, end; 11583 bind(loop); 11584 cmp(Ri, Rlen, Assembler::LSL, 1); 11585 br(Assembler::GE, end); 11586 11587 pre2(Ri, Rlen); 11588 11589 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 11590 lsl(Rj, Rlen, 1); 11591 sub(Rj, Rj, Ri); 11592 sub(Rj, Rj, 1); 11593 lsr(Rj, Rj, 1); 11594 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11595 } block_comment(" } // j"); 11596 11597 last_squaring(Ri); 11598 11599 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 11600 lsl(Rj, Rlen, 1); 11601 sub(Rj, Rj, Ri); 11602 lsr(Rj, Rj, 1); 11603 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11604 } block_comment(" } // j"); 11605 11606 post2(Ri, Rlen); 11607 add(Ri, Ri, 1); 11608 cmp(Ri, Rlen, Assembler::LSL, 1); 11609 11610 br(Assembler::LT, loop); 11611 bind(end); 11612 block_comment("} // i"); 11613 } 11614 11615 normalize(Rlen); 11616 11617 mov(Ra, Pm_base); // Save Pm_base in Ra 11618 restore_regs(); // Restore caller's Pm_base 11619 11620 // Copy our result into caller's Pm_base 11621 reverse(Pm_base, Ra, Rlen, t0, t1); 11622 11623 leave(); 11624 ret(lr); 11625 11626 return entry; 11627 } 11628 // In C, approximately: 11629 11630 // void 11631 // montgomery_square(julong Pa_base[], julong Pn_base[], 11632 // julong Pm_base[], julong inv, int len) { 11633 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11634 // julong *Pa, *Pb, *Pn, *Pm; 11635 // julong Ra, Rb, Rn, Rm; 11636 11637 // int i; 11638 11639 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11640 11641 // for (i = 0; i < len; i++) { 11642 // int j; 11643 11644 // Pa = Pa_base; 11645 // Pb = Pa_base + i; 11646 // Pm = Pm_base; 11647 // Pn = Pn_base + i; 11648 11649 // Ra = *Pa; 11650 // Rb = *Pb; 11651 // Rm = *Pm; 11652 // Rn = *Pn; 11653 11654 // int iters = (i+1)/2; 11655 // for (j = 0; iters--; j++) { 11656 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11657 // MACC2(Ra, Rb, t0, t1, t2); 11658 // Ra = *++Pa; 11659 // Rb = *--Pb; 11660 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11661 // MACC(Rm, Rn, t0, t1, t2); 11662 // Rm = *++Pm; 11663 // Rn = *--Pn; 11664 // } 11665 // if ((i & 1) == 0) { 11666 // assert(Ra == Pa_base[j], "must be"); 11667 // MACC(Ra, Ra, t0, t1, t2); 11668 // } 11669 // iters = i/2; 11670 // assert(iters == i-j, "must be"); 11671 // for (; iters--; j++) { 11672 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11673 // MACC(Rm, Rn, t0, t1, t2); 11674 // Rm = *++Pm; 11675 // Rn = *--Pn; 11676 // } 11677 11678 // 
*Pm = Rm = t0 * inv;
11679 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11680 // MACC(Rm, Rn, t0, t1, t2);
11681
11682 // assert(t0 == 0, "broken Montgomery multiply");
11683
11684 // t0 = t1; t1 = t2; t2 = 0;
11685 // }
11686
11687 // for (i = len; i < 2*len; i++) {
11688 // int start = i-len+1;
11689 // int end = start + (len - start)/2;
11690 // int j;
11691
11692 // Pa = Pa_base + i-len;
11693 // Pb = Pa_base + len;
11694 // Pm = Pm_base + i-len;
11695 // Pn = Pn_base + len;
11696
11697 // Ra = *++Pa;
11698 // Rb = *--Pb;
11699 // Rm = *++Pm;
11700 // Rn = *--Pn;
11701
11702 // int iters = (2*len-i-1)/2;
11703 // assert(iters == end-start, "must be");
11704 // for (j = start; iters--; j++) {
11705 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11706 // MACC2(Ra, Rb, t0, t1, t2);
11707 // Ra = *++Pa;
11708 // Rb = *--Pb;
11709 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11710 // MACC(Rm, Rn, t0, t1, t2);
11711 // Rm = *++Pm;
11712 // Rn = *--Pn;
11713 // }
11714 // if ((i & 1) == 0) {
11715 // assert(Ra == Pa_base[j], "must be");
11716 // MACC(Ra, Ra, t0, t1, t2);
11717 // }
11718 // iters = (2*len-i)/2;
11719 // assert(iters == len-j, "must be");
11720 // for (; iters--; j++) {
11721 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11722 // MACC(Rm, Rn, t0, t1, t2);
11723 // Rm = *++Pm;
11724 // Rn = *--Pn;
11725 // }
11726 // Pm_base[i-len] = t0;
11727 // t0 = t1; t1 = t2; t2 = 0;
11728 // }
11729
11730 // while (t0)
11731 // t0 = sub(Pm_base, Pn_base, t0, len);
11732 // }
11733 };
11734
11735 // Initialization
11736 void generate_preuniverse_stubs() {
11737 // preuniverse stubs are not needed for aarch64
11738 }
11739
11740 void generate_initial_stubs() {
11741 // Generate initial stubs and initialize the entry points.
11742
11743 // These are entry points that exist on all platforms. Note: This is code
11744 // that could be shared among different platforms - however the
11745 // benefit seems to be smaller than the disadvantage of having a
11746 // much more complicated generator structure. See also comment in
11747 // stubRoutines.hpp.
11748
11749 StubRoutines::_forward_exception_entry = generate_forward_exception();
11750
11751 StubRoutines::_call_stub_entry =
11752 generate_call_stub(StubRoutines::_call_stub_return_address);
11753
11754 // is referenced by megamorphic call
11755 StubRoutines::_catch_exception_entry = generate_catch_exception();
11756
11757 // Initialize table for copy memory (arraycopy) check.
11758 if (UnsafeMemoryAccess::_table == nullptr) {
11759 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11760 }
11761
11762 if (UseCRC32Intrinsics) {
11763 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11764 }
11765
11766 if (UseCRC32CIntrinsics) {
11767 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11768 }
11769
11770 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11771 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11772 }
11773
11774 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11775 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11776 }
11777
11778 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11779 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11780 StubRoutines::_hf2f = generate_float16ToFloat();
11781 StubRoutines::_f2hf = generate_floatToFloat16();
11782 }
11783 }
11784
11785 void generate_continuation_stubs() {
11786 // Continuation stubs:
11787 StubRoutines::_cont_thaw = generate_cont_thaw();
11788 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11789 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11790 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11791 }
11792
11793 void generate_final_stubs() {
11794 // support for verify_oop (must happen after universe_init)
11795 if (VerifyOops) {
11796 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
11797 }
11798
11799 // arraycopy stubs used by compilers
11800 generate_arraycopy_stubs();
11801
11802 StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11803
11804 StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11805
11806 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11807 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11808
11809 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11810
11811 generate_atomic_entry_points();
11812
11813 #endif // LINUX
11814
11815 #ifdef COMPILER2
11816 if (UseSecondarySupersTable) {
11817 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11818 if (! InlineSecondarySupersTest) {
11819 generate_lookup_secondary_supers_table_stub();
11820 }
11821 }
11822 #endif
11823
11824 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11825
11826 StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11827 }
11828
11829 void generate_compiler_stubs() {
11830 #if COMPILER2_OR_JVMCI
11831
11832 if (UseSVE == 0) {
11833 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11834 }
11835
11836 // array equals stub for large arrays.
11837 if (!UseSimpleArrayEquals) {
11838 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11839 }
11840
11841 // arrays_hashcode stub for large arrays.
11842 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN); 11843 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE); 11844 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR); 11845 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT); 11846 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT); 11847 11848 // byte_array_inflate stub for large arrays. 11849 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 11850 11851 // countPositives stub for large arrays. 11852 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long); 11853 11854 generate_compare_long_strings(); 11855 11856 generate_string_indexof_stubs(); 11857 11858 #ifdef COMPILER2 11859 if (UseMultiplyToLenIntrinsic) { 11860 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 11861 } 11862 11863 if (UseSquareToLenIntrinsic) { 11864 StubRoutines::_squareToLen = generate_squareToLen(); 11865 } 11866 11867 if (UseMulAddIntrinsic) { 11868 StubRoutines::_mulAdd = generate_mulAdd(); 11869 } 11870 11871 if (UseSIMDForBigIntegerShiftIntrinsics) { 11872 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 11873 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 11874 } 11875 11876 if (UseMontgomeryMultiplyIntrinsic) { 11877 StubId stub_id = StubId::stubgen_montgomeryMultiply_id; 11878 StubCodeMark mark(this, stub_id); 11879 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 11880 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 11881 } 11882 11883 if (UseMontgomerySquareIntrinsic) { 11884 StubId stub_id = StubId::stubgen_montgomerySquare_id; 11885 StubCodeMark mark(this, stub_id); 11886 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 11887 // We use generate_multiply() rather than generate_square() 11888 // because it's faster for the sizes of modulus we care about. 
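// (A note on reusing generate_multiply() for squaring: this generator
// was constructed with /*squaring*/true, so the b operand is not an
// independent input -- see the !_squaring guard around the second
// reverse() call in generate_multiply() -- and the multiply code
// therefore ends up squaring the single input.)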
11889 StubRoutines::_montgomerySquare = g.generate_multiply(); 11890 } 11891 11892 #endif // COMPILER2 11893 11894 if (UseChaCha20Intrinsics) { 11895 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar(); 11896 } 11897 11898 if (UseKyberIntrinsics) { 11899 StubRoutines::_kyberNtt = generate_kyberNtt(); 11900 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt(); 11901 StubRoutines::_kyberNttMult = generate_kyberNttMult(); 11902 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2(); 11903 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3(); 11904 StubRoutines::_kyber12To16 = generate_kyber12To16(); 11905 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce(); 11906 } 11907 11908 if (UseDilithiumIntrinsics) { 11909 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt(); 11910 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt(); 11911 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult(); 11912 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant(); 11913 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly(); 11914 } 11915 11916 if (UseBASE64Intrinsics) { 11917 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 11918 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 11919 } 11920 11921 // data cache line writeback 11922 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 11923 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 11924 11925 if (UseAESIntrinsics) { 11926 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 11927 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 11928 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 11929 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 11930 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 11931 } 11932 if (UseGHASHIntrinsics) { 11933 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 11934 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 11935 } 11936 if (UseAESIntrinsics && UseGHASHIntrinsics) { 11937 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 11938 } 11939 11940 if (UseMD5Intrinsics) { 11941 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id); 11942 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id); 11943 } 11944 if (UseSHA1Intrinsics) { 11945 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id); 11946 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id); 11947 } 11948 if (UseSHA256Intrinsics) { 11949 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id); 11950 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id); 11951 } 11952 if (UseSHA512Intrinsics) { 11953 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id); 11954 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id); 11955 } 11956 if (UseSHA3Intrinsics) { 11957 11958 StubRoutines::_double_keccak = generate_double_keccak(); 
11959 if (UseSIMDForSHA3Intrinsic) { 11960 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id); 11961 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id); 11962 } else { 11963 StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id); 11964 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id); 11965 } 11966 } 11967 11968 if (UsePoly1305Intrinsics) { 11969 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 11970 } 11971 11972 // generate Adler32 intrinsics code 11973 if (UseAdler32Intrinsics) { 11974 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 11975 } 11976 11977 #endif // COMPILER2_OR_JVMCI 11978 } 11979 11980 public: 11981 StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) { 11982 switch(blob_id) { 11983 case BlobId::stubgen_preuniverse_id: 11984 generate_preuniverse_stubs(); 11985 break; 11986 case BlobId::stubgen_initial_id: 11987 generate_initial_stubs(); 11988 break; 11989 case BlobId::stubgen_continuation_id: 11990 generate_continuation_stubs(); 11991 break; 11992 case BlobId::stubgen_compiler_id: 11993 generate_compiler_stubs(); 11994 break; 11995 case BlobId::stubgen_final_id: 11996 generate_final_stubs(); 11997 break; 11998 default: 11999 fatal("unexpected blob id: %s", StubInfo::name(blob_id)); 12000 break; 12001 }; 12002 } 12003 }; // end class declaration 12004 12005 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) { 12006 StubGenerator g(code, blob_id); 12007 } 12008 12009 12010 #if defined (LINUX) 12011 12012 // Define pointers to atomic stubs and initialize them to point to the 12013 // code in atomic_aarch64.S. 12014 12015 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 12016 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 12017 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 12018 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 12019 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 12020 12021 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 12022 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 12023 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 12024 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 12025 DEFAULT_ATOMIC_OP(xchg, 4, ) 12026 DEFAULT_ATOMIC_OP(xchg, 8, ) 12027 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 12028 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 12029 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 12030 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 12031 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 12032 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 12033 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 12034 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 12035 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 12036 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 12037 12038 #undef DEFAULT_ATOMIC_OP 12039 12040 #endif // LINUX
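// For reference, a single expansion of the (now #undef'd) macro above,
// e.g. DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed), produces approximately:
//
//   extern "C" uint64_t aarch64_atomic_cmpxchg_8_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_8_relaxed_impl
//     = aarch64_atomic_cmpxchg_8_relaxed_default_impl;
//
// i.e. each stub pointer starts out aimed at the default implementation
// in atomic_aarch64.S and is expected to be retargeted once
// generate_atomic_entry_points() (called from generate_final_stubs())
// has produced the generated stubs.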