1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "asm/register.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "compiler/oopMap.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "gc/shared/gc_globals.hpp" 34 #include "gc/shared/tlab_globals.hpp" 35 #include "interpreter/interpreter.hpp" 36 #include "memory/universe.hpp" 37 #include "nativeInst_aarch64.hpp" 38 #include "oops/instanceOop.hpp" 39 #include "oops/method.hpp" 40 #include "oops/objArrayKlass.hpp" 41 #include "oops/oop.inline.hpp" 42 #include "prims/methodHandles.hpp" 43 #include "prims/upcallLinker.hpp" 44 #include "runtime/arguments.hpp" 45 #include "runtime/atomicAccess.hpp" 46 #include "runtime/continuation.hpp" 47 #include "runtime/continuationEntry.inline.hpp" 48 #include "runtime/frame.inline.hpp" 49 #include "runtime/handles.inline.hpp" 50 #include "runtime/javaThread.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/stubCodeGenerator.hpp" 53 #include "runtime/stubRoutines.hpp" 54 #include "utilities/align.hpp" 55 #include "utilities/checkedCast.hpp" 56 #include "utilities/debug.hpp" 57 #include "utilities/globalDefinitions.hpp" 58 #include "utilities/intpow.hpp" 59 #include "utilities/powerOfTwo.hpp" 60 #ifdef COMPILER2 61 #include "opto/runtime.hpp" 62 #endif 63 #if INCLUDE_ZGC 64 #include "gc/z/zThreadLocalData.hpp" 65 #endif 66 67 // Declaration and definition of StubGenerator (no .hpp file). 
68 // For a more detailed description of the stub routine structure 69 // see the comment in stubRoutines.hpp 70 71 #undef __ 72 #define __ _masm-> 73 74 #ifdef PRODUCT 75 #define BLOCK_COMMENT(str) /* nothing */ 76 #else 77 #define BLOCK_COMMENT(str) __ block_comment(str) 78 #endif 79 80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 81 82 // Stub Code definitions 83 84 class StubGenerator: public StubCodeGenerator { 85 private: 86 87 #ifdef PRODUCT 88 #define inc_counter_np(counter) ((void)0) 89 #else 90 void inc_counter_np_(uint& counter) { 91 __ incrementw(ExternalAddress((address)&counter)); 92 } 93 #define inc_counter_np(counter) \ 94 BLOCK_COMMENT("inc_counter " #counter); \ 95 inc_counter_np_(counter); 96 #endif 97 98 // Call stubs are used to call Java from C 99 // 100 // Arguments: 101 // c_rarg0: call wrapper address address 102 // c_rarg1: result address 103 // c_rarg2: result type BasicType 104 // c_rarg3: method Method* 105 // c_rarg4: (interpreter) entry point address 106 // c_rarg5: parameters intptr_t* 107 // c_rarg6: parameter size (in words) int 108 // c_rarg7: thread Thread* 109 // 110 // There is no return from the stub itself as any Java result 111 // is written to result 112 // 113 // we save r30 (lr) as the return PC at the base of the frame and 114 // link r29 (fp) below it as the frame pointer installing sp (r31) 115 // into fp. 116 // 117 // we save r0-r7, which accounts for all the c arguments. 118 // 119 // TODO: strictly do we need to save them all? they are treated as 120 // volatile by C so could we omit saving the ones we are going to 121 // place in global registers (thread? method?) or those we only use 122 // during setup of the Java call? 123 // 124 // we don't need to save r8 which C uses as an indirect result location 125 // return register. 126 // 127 // we don't need to save r9-r15 which both C and Java treat as 128 // volatile 129 // 130 // we don't need to save r16-18 because Java does not use them 131 // 132 // we save r19-r28 which Java uses as scratch registers and C 133 // expects to be callee-save 134 // 135 // we save the bottom 64 bits of each value stored in v8-v15; it is 136 // the responsibility of the caller to preserve larger values. 137 // 138 // so the stub frame looks like this when we enter Java code 139 // 140 // [ return_from_Java ] <--- sp 141 // [ argument word n ] 142 // ... 
143 // -29 [ argument word 1 ] 144 // -28 [ saved Floating-point Control Register ] 145 // -26 [ saved v15 ] <--- sp_after_call 146 // -25 [ saved v14 ] 147 // -24 [ saved v13 ] 148 // -23 [ saved v12 ] 149 // -22 [ saved v11 ] 150 // -21 [ saved v10 ] 151 // -20 [ saved v9 ] 152 // -19 [ saved v8 ] 153 // -18 [ saved r28 ] 154 // -17 [ saved r27 ] 155 // -16 [ saved r26 ] 156 // -15 [ saved r25 ] 157 // -14 [ saved r24 ] 158 // -13 [ saved r23 ] 159 // -12 [ saved r22 ] 160 // -11 [ saved r21 ] 161 // -10 [ saved r20 ] 162 // -9 [ saved r19 ] 163 // -8 [ call wrapper (r0) ] 164 // -7 [ result (r1) ] 165 // -6 [ result type (r2) ] 166 // -5 [ method (r3) ] 167 // -4 [ entry point (r4) ] 168 // -3 [ parameters (r5) ] 169 // -2 [ parameter size (r6) ] 170 // -1 [ thread (r7) ] 171 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 172 // 1 [ saved lr (r30) ] 173 174 // Call stub stack layout word offsets from fp 175 enum call_stub_layout { 176 sp_after_call_off = -28, 177 178 fpcr_off = sp_after_call_off, 179 d15_off = -26, 180 d13_off = -24, 181 d11_off = -22, 182 d9_off = -20, 183 184 r28_off = -18, 185 r26_off = -16, 186 r24_off = -14, 187 r22_off = -12, 188 r20_off = -10, 189 call_wrapper_off = -8, 190 result_off = -7, 191 result_type_off = -6, 192 method_off = -5, 193 entry_point_off = -4, 194 parameter_size_off = -2, 195 thread_off = -1, 196 fp_f = 0, 197 retaddr_off = 1, 198 }; 199 200 address generate_call_stub(address& return_address) { 201 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 202 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 203 "adjust this code"); 204 205 StubId stub_id = StubId::stubgen_call_stub_id; 206 StubCodeMark mark(this, stub_id); 207 address start = __ pc(); 208 209 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 210 211 const Address fpcr_save (rfp, fpcr_off * wordSize); 212 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 213 const Address result (rfp, result_off * wordSize); 214 const Address result_type (rfp, result_type_off * wordSize); 215 const Address method (rfp, method_off * wordSize); 216 const Address entry_point (rfp, entry_point_off * wordSize); 217 const Address parameter_size(rfp, parameter_size_off * wordSize); 218 219 const Address thread (rfp, thread_off * wordSize); 220 221 const Address d15_save (rfp, d15_off * wordSize); 222 const Address d13_save (rfp, d13_off * wordSize); 223 const Address d11_save (rfp, d11_off * wordSize); 224 const Address d9_save (rfp, d9_off * wordSize); 225 226 const Address r28_save (rfp, r28_off * wordSize); 227 const Address r26_save (rfp, r26_off * wordSize); 228 const Address r24_save (rfp, r24_off * wordSize); 229 const Address r22_save (rfp, r22_off * wordSize); 230 const Address r20_save (rfp, r20_off * wordSize); 231 232 // stub code 233 234 address aarch64_entry = __ pc(); 235 236 // set up frame and move sp to end of save area 237 __ enter(); 238 __ sub(sp, rfp, -sp_after_call_off * wordSize); 239 240 // save register parameters and Java scratch/global registers 241 // n.b. 
we save thread even though it gets installed in 242 // rthread because we want to sanity check rthread later 243 __ str(c_rarg7, thread); 244 __ strw(c_rarg6, parameter_size); 245 __ stp(c_rarg4, c_rarg5, entry_point); 246 __ stp(c_rarg2, c_rarg3, result_type); 247 __ stp(c_rarg0, c_rarg1, call_wrapper); 248 249 __ stp(r20, r19, r20_save); 250 __ stp(r22, r21, r22_save); 251 __ stp(r24, r23, r24_save); 252 __ stp(r26, r25, r26_save); 253 __ stp(r28, r27, r28_save); 254 255 __ stpd(v9, v8, d9_save); 256 __ stpd(v11, v10, d11_save); 257 __ stpd(v13, v12, d13_save); 258 __ stpd(v15, v14, d15_save); 259 260 __ get_fpcr(rscratch1); 261 __ str(rscratch1, fpcr_save); 262 // Set FPCR to the state we need. We do want Round to Nearest. We 263 // don't want non-IEEE rounding modes or floating-point traps. 264 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode 265 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12) 266 __ set_fpcr(rscratch1); 267 268 // install Java thread in global register now we have saved 269 // whatever value it held 270 __ mov(rthread, c_rarg7); 271 // And method 272 __ mov(rmethod, c_rarg3); 273 274 // set up the heapbase register 275 __ reinit_heapbase(); 276 277 #ifdef ASSERT 278 // make sure we have no pending exceptions 279 { 280 Label L; 281 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 282 __ cmp(rscratch1, (u1)NULL_WORD); 283 __ br(Assembler::EQ, L); 284 __ stop("StubRoutines::call_stub: entered with pending exception"); 285 __ BIND(L); 286 } 287 #endif 288 // pass parameters if any 289 __ mov(esp, sp); 290 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 291 __ andr(sp, rscratch1, -2 * wordSize); 292 293 BLOCK_COMMENT("pass parameters if any"); 294 Label parameters_done; 295 // parameter count is still in c_rarg6 296 // and parameter pointer identifying param 1 is in c_rarg5 297 __ cbzw(c_rarg6, parameters_done); 298 299 address loop = __ pc(); 300 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 301 __ subsw(c_rarg6, c_rarg6, 1); 302 __ push(rscratch1); 303 __ br(Assembler::GT, loop); 304 305 __ BIND(parameters_done); 306 307 // call Java entry -- passing methdoOop, and current sp 308 // rmethod: Method* 309 // r19_sender_sp: sender sp 310 BLOCK_COMMENT("call Java function"); 311 __ mov(r19_sender_sp, sp); 312 __ blr(c_rarg4); 313 314 // we do this here because the notify will already have been done 315 // if we get to the next instruction via an exception 316 // 317 // n.b. adding this instruction here affects the calculation of 318 // whether or not a routine returns to the call stub (used when 319 // doing stack walks) since the normal test is to check the return 320 // pc against the address saved below. so we may need to allow for 321 // this extra instruction in the check. 322 323 // save current address for use by exception handling code 324 325 return_address = __ pc(); 326 327 // store result depending on type (everything that is not 328 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 329 // n.b. 
this assumes Java returns an integral result in r0 330 // and a floating result in j_farg0 331 __ ldr(j_rarg2, result); 332 Label is_long, is_float, is_double, exit; 333 __ ldr(j_rarg1, result_type); 334 __ cmp(j_rarg1, (u1)T_OBJECT); 335 __ br(Assembler::EQ, is_long); 336 __ cmp(j_rarg1, (u1)T_LONG); 337 __ br(Assembler::EQ, is_long); 338 __ cmp(j_rarg1, (u1)T_FLOAT); 339 __ br(Assembler::EQ, is_float); 340 __ cmp(j_rarg1, (u1)T_DOUBLE); 341 __ br(Assembler::EQ, is_double); 342 343 // handle T_INT case 344 __ strw(r0, Address(j_rarg2)); 345 346 __ BIND(exit); 347 348 // pop parameters 349 __ sub(esp, rfp, -sp_after_call_off * wordSize); 350 351 #ifdef ASSERT 352 // verify that threads correspond 353 { 354 Label L, S; 355 __ ldr(rscratch1, thread); 356 __ cmp(rthread, rscratch1); 357 __ br(Assembler::NE, S); 358 __ get_thread(rscratch1); 359 __ cmp(rthread, rscratch1); 360 __ br(Assembler::EQ, L); 361 __ BIND(S); 362 __ stop("StubRoutines::call_stub: threads must correspond"); 363 __ BIND(L); 364 } 365 #endif 366 367 __ pop_cont_fastpath(rthread); 368 369 // restore callee-save registers 370 __ ldpd(v15, v14, d15_save); 371 __ ldpd(v13, v12, d13_save); 372 __ ldpd(v11, v10, d11_save); 373 __ ldpd(v9, v8, d9_save); 374 375 __ ldp(r28, r27, r28_save); 376 __ ldp(r26, r25, r26_save); 377 __ ldp(r24, r23, r24_save); 378 __ ldp(r22, r21, r22_save); 379 __ ldp(r20, r19, r20_save); 380 381 // restore fpcr 382 __ ldr(rscratch1, fpcr_save); 383 __ set_fpcr(rscratch1); 384 385 __ ldp(c_rarg0, c_rarg1, call_wrapper); 386 __ ldrw(c_rarg2, result_type); 387 __ ldr(c_rarg3, method); 388 __ ldp(c_rarg4, c_rarg5, entry_point); 389 __ ldp(c_rarg6, c_rarg7, parameter_size); 390 391 // leave frame and return to caller 392 __ leave(); 393 __ ret(lr); 394 395 // handle return types different from T_INT 396 397 __ BIND(is_long); 398 __ str(r0, Address(j_rarg2, 0)); 399 __ br(Assembler::AL, exit); 400 401 __ BIND(is_float); 402 __ strs(j_farg0, Address(j_rarg2, 0)); 403 __ br(Assembler::AL, exit); 404 405 __ BIND(is_double); 406 __ strd(j_farg0, Address(j_rarg2, 0)); 407 __ br(Assembler::AL, exit); 408 409 return start; 410 } 411 412 // Return point for a Java call if there's an exception thrown in 413 // Java code. The exception is caught and transformed into a 414 // pending exception stored in JavaThread that can be tested from 415 // within the VM. 416 // 417 // Note: Usually the parameters are removed by the callee. In case 418 // of an exception crossing an activation frame boundary, that is 419 // not the case if the callee is compiled code => need to setup the 420 // rsp. 
421 // 422 // r0: exception oop 423 424 address generate_catch_exception() { 425 StubId stub_id = StubId::stubgen_catch_exception_id; 426 StubCodeMark mark(this, stub_id); 427 address start = __ pc(); 428 429 // same as in generate_call_stub(): 430 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 431 const Address thread (rfp, thread_off * wordSize); 432 433 #ifdef ASSERT 434 // verify that threads correspond 435 { 436 Label L, S; 437 __ ldr(rscratch1, thread); 438 __ cmp(rthread, rscratch1); 439 __ br(Assembler::NE, S); 440 __ get_thread(rscratch1); 441 __ cmp(rthread, rscratch1); 442 __ br(Assembler::EQ, L); 443 __ bind(S); 444 __ stop("StubRoutines::catch_exception: threads must correspond"); 445 __ bind(L); 446 } 447 #endif 448 449 // set pending exception 450 __ verify_oop(r0); 451 452 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 453 __ mov(rscratch1, (address)__FILE__); 454 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 455 __ movw(rscratch1, (int)__LINE__); 456 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 457 458 // complete return to VM 459 assert(StubRoutines::_call_stub_return_address != nullptr, 460 "_call_stub_return_address must have been generated before"); 461 __ b(StubRoutines::_call_stub_return_address); 462 463 return start; 464 } 465 466 // Continuation point for runtime calls returning with a pending 467 // exception. The pending exception check happened in the runtime 468 // or native call stub. The pending exception in Thread is 469 // converted into a Java-level exception. 470 // 471 // Contract with Java-level exception handlers: 472 // r0: exception 473 // r3: throwing pc 474 // 475 // NOTE: At entry of this stub, exception-pc must be in LR !! 476 477 // NOTE: this is always used as a jump target within generated code 478 // so it just needs to be generated code with no x86 prolog 479 480 address generate_forward_exception() { 481 StubId stub_id = StubId::stubgen_forward_exception_id; 482 StubCodeMark mark(this, stub_id); 483 address start = __ pc(); 484 485 // Upon entry, LR points to the return address returning into 486 // Java (interpreted or compiled) code; i.e., the return address 487 // becomes the throwing pc. 488 // 489 // Arguments pushed before the runtime call are still on the stack 490 // but the exception handler will reset the stack pointer -> 491 // ignore them. A potential result in registers can be ignored as 492 // well. 493 494 #ifdef ASSERT 495 // make sure this code is only executed if there is a pending exception 496 { 497 Label L; 498 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 499 __ cbnz(rscratch1, L); 500 __ stop("StubRoutines::forward exception: no pending exception (1)"); 501 __ bind(L); 502 } 503 #endif 504 505 // compute exception handler into r19 506 507 // call the VM to find the handler address associated with the 508 // caller address. pass thread in r0 and caller pc (ret address) 509 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 510 // the stack. 511 __ mov(c_rarg1, lr); 512 // lr will be trashed by the VM call so we move it to R19 513 // (callee-saved) because we also need to pass it to the handler 514 // returned by this call. 
515 __ mov(r19, lr); 516 BLOCK_COMMENT("call exception_handler_for_return_address"); 517 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 518 SharedRuntime::exception_handler_for_return_address), 519 rthread, c_rarg1); 520 // Reinitialize the ptrue predicate register, in case the external runtime 521 // call clobbers ptrue reg, as we may return to SVE compiled code. 522 __ reinitialize_ptrue(); 523 524 // we should not really care that lr is no longer the callee 525 // address. we saved the value the handler needs in r19 so we can 526 // just copy it to r3. however, the C2 handler will push its own 527 // frame and then calls into the VM and the VM code asserts that 528 // the PC for the frame above the handler belongs to a compiled 529 // Java method. So, we restore lr here to satisfy that assert. 530 __ mov(lr, r19); 531 // setup r0 & r3 & clear pending exception 532 __ mov(r3, r19); 533 __ mov(r19, r0); 534 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 535 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 536 537 #ifdef ASSERT 538 // make sure exception is set 539 { 540 Label L; 541 __ cbnz(r0, L); 542 __ stop("StubRoutines::forward exception: no pending exception (2)"); 543 __ bind(L); 544 } 545 #endif 546 547 // continue at exception handler 548 // r0: exception 549 // r3: throwing pc 550 // r19: exception handler 551 __ verify_oop(r0); 552 __ br(r19); 553 554 return start; 555 } 556 557 // Non-destructive plausibility checks for oops 558 // 559 // Arguments: 560 // r0: oop to verify 561 // rscratch1: error message 562 // 563 // Stack after saving c_rarg3: 564 // [tos + 0]: saved c_rarg3 565 // [tos + 1]: saved c_rarg2 566 // [tos + 2]: saved lr 567 // [tos + 3]: saved rscratch2 568 // [tos + 4]: saved r0 569 // [tos + 5]: saved rscratch1 570 address generate_verify_oop() { 571 StubId stub_id = StubId::stubgen_verify_oop_id; 572 StubCodeMark mark(this, stub_id); 573 address start = __ pc(); 574 575 Label exit, error; 576 577 // save c_rarg2 and c_rarg3 578 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 579 580 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 581 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 582 __ ldr(c_rarg3, Address(c_rarg2)); 583 __ add(c_rarg3, c_rarg3, 1); 584 __ str(c_rarg3, Address(c_rarg2)); 585 586 // object is in r0 587 // make sure object is 'reasonable' 588 __ cbz(r0, exit); // if obj is null it is OK 589 590 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 591 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 592 593 // return if everything seems ok 594 __ bind(exit); 595 596 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 597 __ ret(lr); 598 599 // handle errors 600 __ bind(error); 601 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 602 603 __ push(RegSet::range(r0, r29), sp); 604 // debug(char* msg, int64_t pc, int64_t regs[]) 605 __ mov(c_rarg0, rscratch1); // pass address of error message 606 __ mov(c_rarg1, lr); // pass return address 607 __ mov(c_rarg2, sp); // pass address of regs on stack 608 #ifndef PRODUCT 609 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 610 #endif 611 BLOCK_COMMENT("call MacroAssembler::debug"); 612 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 613 __ blr(rscratch1); 614 __ hlt(0); 615 616 return start; 617 } 618 619 // Generate indices for iota vector. 
620 address generate_iota_indices(StubId stub_id) { 621 __ align(CodeEntryAlignment); 622 StubCodeMark mark(this, stub_id); 623 address start = __ pc(); 624 // B 625 __ emit_data64(0x0706050403020100, relocInfo::none); 626 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 627 // H 628 __ emit_data64(0x0003000200010000, relocInfo::none); 629 __ emit_data64(0x0007000600050004, relocInfo::none); 630 // S 631 __ emit_data64(0x0000000100000000, relocInfo::none); 632 __ emit_data64(0x0000000300000002, relocInfo::none); 633 // D 634 __ emit_data64(0x0000000000000000, relocInfo::none); 635 __ emit_data64(0x0000000000000001, relocInfo::none); 636 // S - FP 637 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 638 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 639 // D - FP 640 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 641 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 642 return start; 643 } 644 645 // The inner part of zero_words(). This is the bulk operation, 646 // zeroing words in blocks, possibly using DC ZVA to do it. The 647 // caller is responsible for zeroing the last few words. 648 // 649 // Inputs: 650 // r10: the HeapWord-aligned base address of an array to zero. 651 // r11: the count in HeapWords, r11 > 0. 652 // 653 // Returns r10 and r11, adjusted for the caller to clear. 654 // r10: the base address of the tail of words left to clear. 655 // r11: the number of words in the tail. 656 // r11 < MacroAssembler::zero_words_block_size. 657 658 address generate_zero_blocks() { 659 Label done; 660 Label base_aligned; 661 662 Register base = r10, cnt = r11; 663 664 __ align(CodeEntryAlignment); 665 StubId stub_id = StubId::stubgen_zero_blocks_id; 666 StubCodeMark mark(this, stub_id); 667 address start = __ pc(); 668 669 if (UseBlockZeroing) { 670 int zva_length = VM_Version::zva_length(); 671 672 // Ensure ZVA length can be divided by 16. This is required by 673 // the subsequent operations. 674 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 675 676 __ tbz(base, 3, base_aligned); 677 __ str(zr, Address(__ post(base, 8))); 678 __ sub(cnt, cnt, 1); 679 __ bind(base_aligned); 680 681 // Ensure count >= zva_length * 2 so that it still deserves a zva after 682 // alignment. 683 Label small; 684 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 685 __ subs(rscratch1, cnt, low_limit >> 3); 686 __ br(Assembler::LT, small); 687 __ zero_dcache_blocks(base, cnt); 688 __ bind(small); 689 } 690 691 { 692 // Number of stp instructions we'll unroll 693 const int unroll = 694 MacroAssembler::zero_words_block_size / 2; 695 // Clear the remaining blocks. 696 Label loop; 697 __ subs(cnt, cnt, unroll * 2); 698 __ br(Assembler::LT, done); 699 __ bind(loop); 700 for (int i = 0; i < unroll; i++) 701 __ stp(zr, zr, __ post(base, 16)); 702 __ subs(cnt, cnt, unroll * 2); 703 __ br(Assembler::GE, loop); 704 __ bind(done); 705 __ add(cnt, cnt, unroll * 2); 706 } 707 708 __ ret(lr); 709 710 return start; 711 } 712 713 714 typedef enum { 715 copy_forwards = 1, 716 copy_backwards = -1 717 } copy_direction; 718 719 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 720 // for arraycopy stubs. 
721 class ArrayCopyBarrierSetHelper : StackObj { 722 BarrierSetAssembler* _bs_asm; 723 MacroAssembler* _masm; 724 DecoratorSet _decorators; 725 BasicType _type; 726 Register _gct1; 727 Register _gct2; 728 Register _gct3; 729 FloatRegister _gcvt1; 730 FloatRegister _gcvt2; 731 FloatRegister _gcvt3; 732 733 public: 734 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 735 DecoratorSet decorators, 736 BasicType type, 737 Register gct1, 738 Register gct2, 739 Register gct3, 740 FloatRegister gcvt1, 741 FloatRegister gcvt2, 742 FloatRegister gcvt3) 743 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 744 _masm(masm), 745 _decorators(decorators), 746 _type(type), 747 _gct1(gct1), 748 _gct2(gct2), 749 _gct3(gct3), 750 _gcvt1(gcvt1), 751 _gcvt2(gcvt2), 752 _gcvt3(gcvt3) { 753 } 754 755 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 756 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 757 dst1, dst2, src, 758 _gct1, _gct2, _gcvt1); 759 } 760 761 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 762 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 763 dst, src1, src2, 764 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 765 } 766 767 void copy_load_at_16(Register dst1, Register dst2, Address src) { 768 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 769 dst1, dst2, src, 770 _gct1); 771 } 772 773 void copy_store_at_16(Address dst, Register src1, Register src2) { 774 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 775 dst, src1, src2, 776 _gct1, _gct2, _gct3); 777 } 778 779 void copy_load_at_8(Register dst, Address src) { 780 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 781 dst, noreg, src, 782 _gct1); 783 } 784 785 void copy_store_at_8(Address dst, Register src) { 786 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 787 dst, src, noreg, 788 _gct1, _gct2, _gct3); 789 } 790 }; 791 792 // Bulk copy of blocks of 8 words. 793 // 794 // count is a count of words. 795 // 796 // Precondition: count >= 8 797 // 798 // Postconditions: 799 // 800 // The least significant bit of count contains the remaining count 801 // of words to copy. The rest of count is trash. 802 // 803 // s and d are adjusted to point to the remaining words to copy 804 // 805 address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) { 806 BasicType type; 807 copy_direction direction; 808 809 switch (stub_id) { 810 case StubId::stubgen_copy_byte_f_id: 811 direction = copy_forwards; 812 type = T_BYTE; 813 break; 814 case StubId::stubgen_copy_byte_b_id: 815 direction = copy_backwards; 816 type = T_BYTE; 817 break; 818 case StubId::stubgen_copy_oop_f_id: 819 direction = copy_forwards; 820 type = T_OBJECT; 821 break; 822 case StubId::stubgen_copy_oop_b_id: 823 direction = copy_backwards; 824 type = T_OBJECT; 825 break; 826 case StubId::stubgen_copy_oop_uninit_f_id: 827 direction = copy_forwards; 828 type = T_OBJECT; 829 break; 830 case StubId::stubgen_copy_oop_uninit_b_id: 831 direction = copy_backwards; 832 type = T_OBJECT; 833 break; 834 default: 835 ShouldNotReachHere(); 836 } 837 838 int unit = wordSize * direction; 839 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize; 840 841 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 842 t4 = r7, t5 = r11, t6 = r12, t7 = r13; 843 const Register stride = r14; 844 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 845 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 846 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 847 848 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7); 849 assert_different_registers(s, d, count, rscratch1, rscratch2); 850 851 Label again, drain; 852 853 __ align(CodeEntryAlignment); 854 855 StubCodeMark mark(this, stub_id); 856 857 address start = __ pc(); 858 859 Label unaligned_copy_long; 860 if (AvoidUnalignedAccesses) { 861 __ tbnz(d, 3, unaligned_copy_long); 862 } 863 864 if (direction == copy_forwards) { 865 __ sub(s, s, bias); 866 __ sub(d, d, bias); 867 } 868 869 #ifdef ASSERT 870 // Make sure we are never given < 8 words 871 { 872 Label L; 873 __ cmp(count, (u1)8); 874 __ br(Assembler::GE, L); 875 __ stop("genrate_copy_longs called with < 8 words"); 876 __ bind(L); 877 } 878 #endif 879 880 // Fill 8 registers 881 if (UseSIMDForMemoryOps) { 882 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 883 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 884 } else { 885 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 886 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 887 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 888 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 889 } 890 891 __ subs(count, count, 16); 892 __ br(Assembler::LO, drain); 893 894 int prefetch = PrefetchCopyIntervalInBytes; 895 bool use_stride = false; 896 if (direction == copy_backwards) { 897 use_stride = prefetch > 256; 898 prefetch = -prefetch; 899 if (use_stride) __ mov(stride, prefetch); 900 } 901 902 __ bind(again); 903 904 if (PrefetchCopyIntervalInBytes > 0) 905 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 906 907 if (UseSIMDForMemoryOps) { 908 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 909 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 910 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 911 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 912 } else { 913 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 914 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 915 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 916 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 917 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 918 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 919 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 920 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 921 } 922 923 __ subs(count, count, 8); 924 __ br(Assembler::HS, again); 925 926 // Drain 927 __ bind(drain); 928 if (UseSIMDForMemoryOps) { 929 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 930 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 931 } else { 932 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 933 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 934 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 935 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 936 } 937 938 { 939 Label L1, L2; 940 __ tbz(count, exact_log2(4), L1); 941 if (UseSIMDForMemoryOps) { 942 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit))); 943 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1); 944 } else { 945 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 946 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 947 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 948 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3); 949 } 950 __ bind(L1); 951 952 if (direction == copy_forwards) { 953 __ add(s, s, bias); 954 __ add(d, d, bias); 955 } 956 957 __ tbz(count, 1, L2); 958 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 959 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1); 960 __ bind(L2); 961 } 962 963 __ ret(lr); 964 965 if (AvoidUnalignedAccesses) { 966 Label drain, again; 967 // Register order for storing. Order is different for backward copy. 968 969 __ bind(unaligned_copy_long); 970 971 // source address is even aligned, target odd aligned 972 // 973 // when forward copying word pairs we read long pairs at offsets 974 // {0, 2, 4, 6} (in long words). when backwards copying we read 975 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 976 // address by -2 in the forwards case so we can compute the 977 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 978 // or -1. 979 // 980 // when forward copying we need to store 1 word, 3 pairs and 981 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 982 // zero offset We adjust the destination by -1 which means we 983 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 984 // 985 // When backwards copyng we need to store 1 word, 3 pairs and 986 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 987 // offsets {1, 3, 5, 7, 8} * unit. 
988 989 if (direction == copy_forwards) { 990 __ sub(s, s, 16); 991 __ sub(d, d, 8); 992 } 993 994 // Fill 8 registers 995 // 996 // for forwards copy s was offset by -16 from the original input 997 // value of s so the register contents are at these offsets 998 // relative to the 64 bit block addressed by that original input 999 // and so on for each successive 64 byte block when s is updated 1000 // 1001 // t0 at offset 0, t1 at offset 8 1002 // t2 at offset 16, t3 at offset 24 1003 // t4 at offset 32, t5 at offset 40 1004 // t6 at offset 48, t7 at offset 56 1005 1006 // for backwards copy s was not offset so the register contents 1007 // are at these offsets into the preceding 64 byte block 1008 // relative to that original input and so on for each successive 1009 // preceding 64 byte block when s is updated. this explains the 1010 // slightly counter-intuitive looking pattern of register usage 1011 // in the stp instructions for backwards copy. 1012 // 1013 // t0 at offset -16, t1 at offset -8 1014 // t2 at offset -32, t3 at offset -24 1015 // t4 at offset -48, t5 at offset -40 1016 // t6 at offset -64, t7 at offset -56 1017 1018 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1019 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1020 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1021 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1022 1023 __ subs(count, count, 16); 1024 __ br(Assembler::LO, drain); 1025 1026 int prefetch = PrefetchCopyIntervalInBytes; 1027 bool use_stride = false; 1028 if (direction == copy_backwards) { 1029 use_stride = prefetch > 256; 1030 prefetch = -prefetch; 1031 if (use_stride) __ mov(stride, prefetch); 1032 } 1033 1034 __ bind(again); 1035 1036 if (PrefetchCopyIntervalInBytes > 0) 1037 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1038 1039 if (direction == copy_forwards) { 1040 // allowing for the offset of -8 the store instructions place 1041 // registers into the target 64 bit block at the following 1042 // offsets 1043 // 1044 // t0 at offset 0 1045 // t1 at offset 8, t2 at offset 16 1046 // t3 at offset 24, t4 at offset 32 1047 // t5 at offset 40, t6 at offset 48 1048 // t7 at offset 56 1049 1050 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1051 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1052 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1053 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1054 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1055 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1056 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1057 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1058 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1059 } else { 1060 // d was not offset when we started so the registers are 1061 // written into the 64 bit block preceding d with the following 1062 // offsets 1063 // 1064 // t1 at offset -8 1065 // t3 at offset -24, t0 at offset -16 1066 // t5 at offset -48, t2 at offset -32 1067 // t7 at offset -56, t4 at offset -48 1068 // t6 at offset -64 1069 // 1070 // note that this matches the offsets previously noted for the 1071 // loads 1072 1073 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1074 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1075 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1076 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1077 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1078 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1079 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1080 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1081 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1082 } 1083 1084 __ subs(count, count, 8); 1085 __ br(Assembler::HS, again); 1086 1087 // Drain 1088 // 1089 // this uses the same pattern of offsets and register arguments 1090 // as above 1091 __ bind(drain); 1092 if (direction == copy_forwards) { 1093 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1094 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1095 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1096 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1097 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1098 } else { 1099 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1100 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1101 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1102 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1103 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1104 } 1105 // now we need to copy any remaining part block which may 1106 // include a 4 word block subblock and/or a 2 word subblock. 
1107 // bits 2 and 1 in the count are the tell-tale for whether we 1108 // have each such subblock 1109 { 1110 Label L1, L2; 1111 __ tbz(count, exact_log2(4), L1); 1112 // this is the same as above but copying only 4 longs hence 1113 // with only one intervening stp between the str instructions 1114 // but note that the offsets and registers still follow the 1115 // same pattern 1116 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1117 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 1118 if (direction == copy_forwards) { 1119 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1120 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1121 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3); 1122 } else { 1123 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1124 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1125 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2); 1126 } 1127 __ bind(L1); 1128 1129 __ tbz(count, 1, L2); 1130 // this is the same as above but copying only 2 longs hence 1131 // there is no intervening stp between the str instructions 1132 // but note that the offset and register patterns are still 1133 // the same 1134 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit))); 1135 if (direction == copy_forwards) { 1136 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1137 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1); 1138 } else { 1139 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1140 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0); 1141 } 1142 __ bind(L2); 1143 1144 // for forwards copy we need to re-adjust the offsets we 1145 // applied so that s and d are follow the last words written 1146 1147 if (direction == copy_forwards) { 1148 __ add(s, s, 16); 1149 __ add(d, d, 8); 1150 } 1151 1152 } 1153 1154 __ ret(lr); 1155 } 1156 1157 return start; 1158 } 1159 1160 // Small copy: less than 16 bytes. 1161 // 1162 // NB: Ignores all of the bits of count which represent more than 15 1163 // bytes, so a caller doesn't have to mask them. 1164 1165 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) { 1166 bool is_backwards = step < 0; 1167 size_t granularity = g_uabs(step); 1168 int direction = is_backwards ? -1 : 1; 1169 1170 Label Lword, Lint, Lshort, Lbyte; 1171 1172 assert(granularity 1173 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1174 1175 const Register t0 = r3; 1176 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1177 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg); 1178 1179 // ??? I don't know if this bit-test-and-branch is the right thing 1180 // to do. It does a lot of jumping, resulting in several 1181 // mispredicted branches. It might make more sense to do this 1182 // with something like Duff's device with a single computed branch. 
1183 1184 __ tbz(count, 3 - exact_log2(granularity), Lword); 1185 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1186 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1187 __ bind(Lword); 1188 1189 if (granularity <= sizeof (jint)) { 1190 __ tbz(count, 2 - exact_log2(granularity), Lint); 1191 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1192 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1193 __ bind(Lint); 1194 } 1195 1196 if (granularity <= sizeof (jshort)) { 1197 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1198 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1199 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1200 __ bind(Lshort); 1201 } 1202 1203 if (granularity <= sizeof (jbyte)) { 1204 __ tbz(count, 0, Lbyte); 1205 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1206 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1207 __ bind(Lbyte); 1208 } 1209 } 1210 1211 // All-singing all-dancing memory copy. 1212 // 1213 // Copy count units of memory from s to d. The size of a unit is 1214 // step, which can be positive or negative depending on the direction 1215 // of copy. If is_aligned is false, we align the source address. 1216 // 1217 1218 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1219 Register s, Register d, Register count, int step) { 1220 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1221 bool is_backwards = step < 0; 1222 unsigned int granularity = g_uabs(step); 1223 const Register t0 = r3, t1 = r4; 1224 1225 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1226 // load all the data before writing anything 1227 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1228 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1229 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1230 const Register send = r17, dend = r16; 1231 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1232 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1233 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1234 1235 if (PrefetchCopyIntervalInBytes > 0) 1236 __ prfm(Address(s, 0), PLDL1KEEP); 1237 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity)); 1238 __ br(Assembler::HI, copy_big); 1239 1240 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity)))); 1241 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity)))); 1242 1243 __ cmp(count, u1(16/granularity)); 1244 __ br(Assembler::LS, copy16); 1245 1246 __ cmp(count, u1(64/granularity)); 1247 __ br(Assembler::HI, copy80); 1248 1249 __ cmp(count, u1(32/granularity)); 1250 __ br(Assembler::LS, copy32); 1251 1252 // 33..64 bytes 1253 if (UseSIMDForMemoryOps) { 1254 bs.copy_load_at_32(v0, v1, Address(s, 0)); 1255 bs.copy_load_at_32(v2, v3, Address(send, -32)); 1256 bs.copy_store_at_32(Address(d, 0), v0, v1); 1257 bs.copy_store_at_32(Address(dend, -32), v2, v3); 1258 } else { 1259 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1260 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1261 bs.copy_load_at_16(t4, t5, Address(send, -32)); 1262 bs.copy_load_at_16(t6, t7, Address(send, -16)); 1263 1264 bs.copy_store_at_16(Address(d, 0), t0, t1); 1265 bs.copy_store_at_16(Address(d, 16), t2, t3); 1266 bs.copy_store_at_16(Address(dend, -32), t4, t5); 1267 bs.copy_store_at_16(Address(dend, -16), t6, t7); 1268 } 1269 __ b(finish); 1270 1271 // 17..32 bytes 1272 __ bind(copy32); 1273 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1274 bs.copy_load_at_16(t6, t7, Address(send, -16)); 1275 1276 bs.copy_store_at_16(Address(d, 0), t0, t1); 1277 bs.copy_store_at_16(Address(dend, -16), t6, t7); 1278 __ b(finish); 1279 1280 // 65..80/96 bytes 1281 // (96 bytes if SIMD because we do 32 byes per instruction) 1282 __ bind(copy80); 1283 if (UseSIMDForMemoryOps) { 1284 bs.copy_load_at_32(v0, v1, Address(s, 0)); 1285 bs.copy_load_at_32(v2, v3, Address(s, 32)); 1286 // Unaligned pointers can be an issue for copying. 1287 // The issue has more chances to happen when granularity of data is 1288 // less than 4(sizeof(jint)). Pointers for arrays of jint are at least 1289 // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned. 1290 // The most performance drop has been seen for the range 65-80 bytes. 1291 // For such cases using the pair of ldp/stp instead of the third pair of 1292 // ldpq/stpq fixes the performance issue. 
1293 if (granularity < sizeof (jint)) { 1294 Label copy96; 1295 __ cmp(count, u1(80/granularity)); 1296 __ br(Assembler::HI, copy96); 1297 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1298 1299 bs.copy_store_at_32(Address(d, 0), v0, v1); 1300 bs.copy_store_at_32(Address(d, 32), v2, v3); 1301 1302 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1303 __ b(finish); 1304 1305 __ bind(copy96); 1306 } 1307 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1308 1309 bs.copy_store_at_32(Address(d, 0), v0, v1); 1310 bs.copy_store_at_32(Address(d, 32), v2, v3); 1311 1312 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1313 } else { 1314 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1315 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1316 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1317 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1318 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1319 1320 bs.copy_store_at_16(Address(d, 0), t0, t1); 1321 bs.copy_store_at_16(Address(d, 16), t2, t3); 1322 bs.copy_store_at_16(Address(d, 32), t4, t5); 1323 bs.copy_store_at_16(Address(d, 48), t6, t7); 1324 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1325 } 1326 __ b(finish); 1327 1328 // 0..16 bytes 1329 __ bind(copy16); 1330 __ cmp(count, u1(8/granularity)); 1331 __ br(Assembler::LO, copy8); 1332 1333 // 8..16 bytes 1334 bs.copy_load_at_8(t0, Address(s, 0)); 1335 bs.copy_load_at_8(t1, Address(send, -8)); 1336 bs.copy_store_at_8(Address(d, 0), t0); 1337 bs.copy_store_at_8(Address(dend, -8), t1); 1338 __ b(finish); 1339 1340 if (granularity < 8) { 1341 // 4..7 bytes 1342 __ bind(copy8); 1343 __ tbz(count, 2 - exact_log2(granularity), copy4); 1344 __ ldrw(t0, Address(s, 0)); 1345 __ ldrw(t1, Address(send, -4)); 1346 __ strw(t0, Address(d, 0)); 1347 __ strw(t1, Address(dend, -4)); 1348 __ b(finish); 1349 if (granularity < 4) { 1350 // 0..3 bytes 1351 __ bind(copy4); 1352 __ cbz(count, finish); // get rid of 0 case 1353 if (granularity == 2) { 1354 __ ldrh(t0, Address(s, 0)); 1355 __ strh(t0, Address(d, 0)); 1356 } else { // granularity == 1 1357 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1358 // the first and last byte. 1359 // Handle the 3 byte case by loading and storing base + count/2 1360 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1361 // This does means in the 1 byte case we load/store the same 1362 // byte 3 times. 1363 __ lsr(count, count, 1); 1364 __ ldrb(t0, Address(s, 0)); 1365 __ ldrb(t1, Address(send, -1)); 1366 __ ldrb(t2, Address(s, count)); 1367 __ strb(t0, Address(d, 0)); 1368 __ strb(t1, Address(dend, -1)); 1369 __ strb(t2, Address(d, count)); 1370 } 1371 __ b(finish); 1372 } 1373 } 1374 1375 __ bind(copy_big); 1376 if (is_backwards) { 1377 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1378 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1379 } 1380 1381 // Now we've got the small case out of the way we can align the 1382 // source address on a 2-word boundary. 1383 1384 // Here we will materialize a count in r15, which is used by copy_memory_small 1385 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1386 // Up until here, we have used t9, which aliases r15, but from here on, that register 1387 // can not be used as a temp register, as it contains the count. 1388 1389 Label aligned; 1390 1391 if (is_aligned) { 1392 // We may have to adjust by 1 word to get s 2-word-aligned. 
1393 __ tbz(s, exact_log2(wordSize), aligned); 1394 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1395 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1396 __ sub(count, count, wordSize/granularity); 1397 } else { 1398 if (is_backwards) { 1399 __ andr(r15, s, 2 * wordSize - 1); 1400 } else { 1401 __ neg(r15, s); 1402 __ andr(r15, r15, 2 * wordSize - 1); 1403 } 1404 // r15 is the byte adjustment needed to align s. 1405 __ cbz(r15, aligned); 1406 int shift = exact_log2(granularity); 1407 if (shift > 0) { 1408 __ lsr(r15, r15, shift); 1409 } 1410 __ sub(count, count, r15); 1411 1412 #if 0 1413 // ?? This code is only correct for a disjoint copy. It may or 1414 // may not make sense to use it in that case. 1415 1416 // Copy the first pair; s and d may not be aligned. 1417 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1418 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1419 1420 // Align s and d, adjust count 1421 if (is_backwards) { 1422 __ sub(s, s, r15); 1423 __ sub(d, d, r15); 1424 } else { 1425 __ add(s, s, r15); 1426 __ add(d, d, r15); 1427 } 1428 #else 1429 copy_memory_small(decorators, type, s, d, r15, step); 1430 #endif 1431 } 1432 1433 __ bind(aligned); 1434 1435 // s is now 2-word-aligned. 1436 1437 // We have a count of units and some trailing bytes. Adjust the 1438 // count and do a bulk copy of words. If the shift is zero 1439 // perform a move instead to benefit from zero latency moves. 1440 int shift = exact_log2(wordSize/granularity); 1441 if (shift > 0) { 1442 __ lsr(r15, count, shift); 1443 } else { 1444 __ mov(r15, count); 1445 } 1446 if (direction == copy_forwards) { 1447 if (type != T_OBJECT) { 1448 __ bl(StubRoutines::aarch64::copy_byte_f()); 1449 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1450 __ bl(StubRoutines::aarch64::copy_oop_uninit_f()); 1451 } else { 1452 __ bl(StubRoutines::aarch64::copy_oop_f()); 1453 } 1454 } else { 1455 if (type != T_OBJECT) { 1456 __ bl(StubRoutines::aarch64::copy_byte_b()); 1457 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1458 __ bl(StubRoutines::aarch64::copy_oop_uninit_b()); 1459 } else { 1460 __ bl(StubRoutines::aarch64::copy_oop_b()); 1461 } 1462 } 1463 1464 // And the tail. 1465 copy_memory_small(decorators, type, s, d, count, step); 1466 1467 if (granularity >= 8) __ bind(copy8); 1468 if (granularity >= 4) __ bind(copy4); 1469 __ bind(finish); 1470 } 1471 1472 1473 void clobber_registers() { 1474 #ifdef ASSERT 1475 RegSet clobbered 1476 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1477 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1478 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1479 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1480 __ mov(*it, rscratch1); 1481 } 1482 #endif 1483 1484 } 1485 1486 // Scan over array at a for count oops, verifying each one. 1487 // Preserves a and count, clobbers rscratch1 and rscratch2. 
1488 void verify_oop_array (int size, Register a, Register count, Register temp) { 1489 Label loop, end; 1490 __ mov(rscratch1, a); 1491 __ mov(rscratch2, zr); 1492 __ bind(loop); 1493 __ cmp(rscratch2, count); 1494 __ br(Assembler::HS, end); 1495 if (size == wordSize) { 1496 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1497 __ verify_oop(temp); 1498 } else { 1499 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1500 __ decode_heap_oop(temp); // calls verify_oop 1501 } 1502 __ add(rscratch2, rscratch2, 1); 1503 __ b(loop); 1504 __ bind(end); 1505 } 1506 1507 // Arguments: 1508 // stub_id - is used to name the stub and identify all details of 1509 // how to perform the copy. 1510 // 1511 // entry - is assigned to the stub's post push entry point unless 1512 // it is null 1513 // 1514 // Inputs: 1515 // c_rarg0 - source array address 1516 // c_rarg1 - destination array address 1517 // c_rarg2 - element count, treated as ssize_t, can be zero 1518 // 1519 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1520 // the hardware handle it. The two dwords within qwords that span 1521 // cache line boundaries will still be loaded and stored atomically. 1522 // 1523 // Side Effects: nopush_entry is set to the (post push) entry point 1524 // so it can be used by the corresponding conjoint 1525 // copy method 1526 // 1527 address generate_disjoint_copy(StubId stub_id, address *nopush_entry) { 1528 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1529 RegSet saved_reg = RegSet::of(s, d, count); 1530 int size; 1531 bool aligned; 1532 bool is_oop; 1533 bool dest_uninitialized; 1534 switch (stub_id) { 1535 case StubId::stubgen_jbyte_disjoint_arraycopy_id: 1536 size = sizeof(jbyte); 1537 aligned = false; 1538 is_oop = false; 1539 dest_uninitialized = false; 1540 break; 1541 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id: 1542 size = sizeof(jbyte); 1543 aligned = true; 1544 is_oop = false; 1545 dest_uninitialized = false; 1546 break; 1547 case StubId::stubgen_jshort_disjoint_arraycopy_id: 1548 size = sizeof(jshort); 1549 aligned = false; 1550 is_oop = false; 1551 dest_uninitialized = false; 1552 break; 1553 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id: 1554 size = sizeof(jshort); 1555 aligned = true; 1556 is_oop = false; 1557 dest_uninitialized = false; 1558 break; 1559 case StubId::stubgen_jint_disjoint_arraycopy_id: 1560 size = sizeof(jint); 1561 aligned = false; 1562 is_oop = false; 1563 dest_uninitialized = false; 1564 break; 1565 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id: 1566 size = sizeof(jint); 1567 aligned = true; 1568 is_oop = false; 1569 dest_uninitialized = false; 1570 break; 1571 case StubId::stubgen_jlong_disjoint_arraycopy_id: 1572 // since this is always aligned we can (should!) use the same 1573 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy 1574 ShouldNotReachHere(); 1575 break; 1576 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id: 1577 size = sizeof(jlong); 1578 aligned = true; 1579 is_oop = false; 1580 dest_uninitialized = false; 1581 break; 1582 case StubId::stubgen_oop_disjoint_arraycopy_id: 1583 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1584 aligned = !UseCompressedOops; 1585 is_oop = true; 1586 dest_uninitialized = false; 1587 break; 1588 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id: 1589 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1590 aligned = !UseCompressedOops; 1591 is_oop = true; 1592 dest_uninitialized = false; 1593 break; 1594 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id: 1595 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1596 aligned = !UseCompressedOops; 1597 is_oop = true; 1598 dest_uninitialized = true; 1599 break; 1600 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id: 1601 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1602 aligned = !UseCompressedOops; 1603 is_oop = true; 1604 dest_uninitialized = true; 1605 break; 1606 default: 1607 ShouldNotReachHere(); 1608 break; 1609 } 1610 1611 __ align(CodeEntryAlignment); 1612 StubCodeMark mark(this, stub_id); 1613 address start = __ pc(); 1614 __ enter(); 1615 1616 if (nopush_entry != nullptr) { 1617 *nopush_entry = __ pc(); 1618 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1619 BLOCK_COMMENT("Entry:"); 1620 } 1621 1622 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1623 if (dest_uninitialized) { 1624 decorators |= IS_DEST_UNINITIALIZED; 1625 } 1626 if (aligned) { 1627 decorators |= ARRAYCOPY_ALIGNED; 1628 } 1629 1630 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1631 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1632 1633 if (is_oop) { 1634 // save regs before copy_memory 1635 __ push(RegSet::of(d, count), sp); 1636 } 1637 { 1638 // UnsafeMemoryAccess page error: continue after unsafe access 1639 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1640 UnsafeMemoryAccessMark umam(this, add_entry, true); 1641 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1642 } 1643 1644 if (is_oop) { 1645 __ pop(RegSet::of(d, count), sp); 1646 if (VerifyOops) 1647 verify_oop_array(size, d, count, r16); 1648 } 1649 1650 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1651 1652 __ leave(); 1653 __ mov(r0, zr); // return 0 1654 __ ret(lr); 1655 return start; 1656 } 1657 1658 // Arguments: 1659 // stub_id - is used to name the stub and identify all details of 1660 // how to perform the copy. 1661 // 1662 // nooverlap_target - identifes the (post push) entry for the 1663 // corresponding disjoint copy routine which can be 1664 // jumped to if the ranges do not actually overlap 1665 // 1666 // entry - is assigned to the stub's post push entry point unless 1667 // it is null 1668 // 1669 // 1670 // Inputs: 1671 // c_rarg0 - source array address 1672 // c_rarg1 - destination array address 1673 // c_rarg2 - element count, treated as ssize_t, can be zero 1674 // 1675 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1676 // the hardware handle it. The two dwords within qwords that span 1677 // cache line boundaries will still be loaded and stored atomically. 
1678 // 1679 // Side Effects: 1680 // nopush_entry is set to the no-overlap entry point so it can be 1681 // used by some other conjoint copy method 1682 // 1683 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) { 1684 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1685 RegSet saved_regs = RegSet::of(s, d, count); 1686 int size; 1687 bool aligned; 1688 bool is_oop; 1689 bool dest_uninitialized; 1690 switch (stub_id) { 1691 case StubId::stubgen_jbyte_arraycopy_id: 1692 size = sizeof(jbyte); 1693 aligned = false; 1694 is_oop = false; 1695 dest_uninitialized = false; 1696 break; 1697 case StubId::stubgen_arrayof_jbyte_arraycopy_id: 1698 size = sizeof(jbyte); 1699 aligned = true; 1700 is_oop = false; 1701 dest_uninitialized = false; 1702 break; 1703 case StubId::stubgen_jshort_arraycopy_id: 1704 size = sizeof(jshort); 1705 aligned = false; 1706 is_oop = false; 1707 dest_uninitialized = false; 1708 break; 1709 case StubId::stubgen_arrayof_jshort_arraycopy_id: 1710 size = sizeof(jshort); 1711 aligned = true; 1712 is_oop = false; 1713 dest_uninitialized = false; 1714 break; 1715 case StubId::stubgen_jint_arraycopy_id: 1716 size = sizeof(jint); 1717 aligned = false; 1718 is_oop = false; 1719 dest_uninitialized = false; 1720 break; 1721 case StubId::stubgen_arrayof_jint_arraycopy_id: 1722 size = sizeof(jint); 1723 aligned = true; 1724 is_oop = false; 1725 dest_uninitialized = false; 1726 break; 1727 case StubId::stubgen_jlong_arraycopy_id: 1728 // since this is always aligned we can (should!) use the same 1729 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy 1730 ShouldNotReachHere(); 1731 break; 1732 case StubId::stubgen_arrayof_jlong_arraycopy_id: 1733 size = sizeof(jlong); 1734 aligned = true; 1735 is_oop = false; 1736 dest_uninitialized = false; 1737 break; 1738 case StubId::stubgen_oop_arraycopy_id: 1739 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1740 aligned = !UseCompressedOops; 1741 is_oop = true; 1742 dest_uninitialized = false; 1743 break; 1744 case StubId::stubgen_arrayof_oop_arraycopy_id: 1745 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1746 aligned = !UseCompressedOops; 1747 is_oop = true; 1748 dest_uninitialized = false; 1749 break; 1750 case StubId::stubgen_oop_arraycopy_uninit_id: 1751 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1752 aligned = !UseCompressedOops; 1753 is_oop = true; 1754 dest_uninitialized = true; 1755 break; 1756 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id: 1757 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1758 aligned = !UseCompressedOops; 1759 is_oop = true; 1760 dest_uninitialized = true; 1761 break; 1762 default: 1763 ShouldNotReachHere(); 1764 } 1765 1766 StubCodeMark mark(this, stub_id); 1767 address start = __ pc(); 1768 __ enter(); 1769 1770 if (nopush_entry != nullptr) { 1771 *nopush_entry = __ pc(); 1772 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1773 BLOCK_COMMENT("Entry:"); 1774 } 1775 1776 // use fwd copy when (d-s) above_equal (count*size) 1777 Label L_overlapping; 1778 __ sub(rscratch1, d, s); 1779 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1780 __ br(Assembler::LO, L_overlapping); 1781 __ b(RuntimeAddress(nooverlap_target)); 1782 __ bind(L_overlapping); 1783 1784 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1785 if (dest_uninitialized) { 1786 decorators |= IS_DEST_UNINITIALIZED; 1787 } 1788 if (aligned) { 1789 decorators |= ARRAYCOPY_ALIGNED; 1790 } 1791 1792 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1793 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1794 1795 if (is_oop) { 1796 // save regs before copy_memory 1797 __ push(RegSet::of(d, count), sp); 1798 } 1799 { 1800 // UnsafeMemoryAccess page error: continue after unsafe access 1801 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1802 UnsafeMemoryAccessMark umam(this, add_entry, true); 1803 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1804 } 1805 if (is_oop) { 1806 __ pop(RegSet::of(d, count), sp); 1807 if (VerifyOops) 1808 verify_oop_array(size, d, count, r16); 1809 } 1810 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1811 __ leave(); 1812 __ mov(r0, zr); // return 0 1813 __ ret(lr); 1814 return start; 1815 } 1816 1817 // Helper for generating a dynamic type check. 1818 // Smashes rscratch1, rscratch2. 1819 void generate_type_check(Register sub_klass, 1820 Register super_check_offset, 1821 Register super_klass, 1822 Register temp1, 1823 Register temp2, 1824 Register result, 1825 Label& L_success) { 1826 assert_different_registers(sub_klass, super_check_offset, super_klass); 1827 1828 BLOCK_COMMENT("type_check:"); 1829 1830 Label L_miss; 1831 1832 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1833 super_check_offset); 1834 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1835 1836 // Fall through on failure! 
1837 __ BIND(L_miss); 1838 } 1839 1840 // 1841 // Generate checkcasting array copy stub 1842 // 1843 // Input: 1844 // c_rarg0 - source array address 1845 // c_rarg1 - destination array address 1846 // c_rarg2 - element count, treated as ssize_t, can be zero 1847 // c_rarg3 - size_t ckoff (super_check_offset) 1848 // c_rarg4 - oop ckval (super_klass) 1849 // 1850 // Output: 1851 // r0 == 0 - success 1852 // r0 == -1^K - failure, where K is partial transfer count 1853 // 1854 address generate_checkcast_copy(StubId stub_id, address *nopush_entry) { 1855 bool dest_uninitialized; 1856 switch (stub_id) { 1857 case StubId::stubgen_checkcast_arraycopy_id: 1858 dest_uninitialized = false; 1859 break; 1860 case StubId::stubgen_checkcast_arraycopy_uninit_id: 1861 dest_uninitialized = true; 1862 break; 1863 default: 1864 ShouldNotReachHere(); 1865 } 1866 1867 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1868 1869 // Input registers (after setup_arg_regs) 1870 const Register from = c_rarg0; // source array address 1871 const Register to = c_rarg1; // destination array address 1872 const Register count = c_rarg2; // elementscount 1873 const Register ckoff = c_rarg3; // super_check_offset 1874 const Register ckval = c_rarg4; // super_klass 1875 1876 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1877 RegSet wb_post_saved_regs = RegSet::of(count); 1878 1879 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1880 const Register copied_oop = r22; // actual oop copied 1881 const Register count_save = r21; // orig elementscount 1882 const Register start_to = r20; // destination array start address 1883 const Register r19_klass = r19; // oop._klass 1884 1885 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1886 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1887 1888 //--------------------------------------------------------------- 1889 // Assembler stub will be used for this call to arraycopy 1890 // if the two arrays are subtypes of Object[] but the 1891 // destination array type is not equal to or a supertype 1892 // of the source type. Each element must be separately 1893 // checked. 1894 1895 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1896 copied_oop, r19_klass, count_save); 1897 1898 __ align(CodeEntryAlignment); 1899 StubCodeMark mark(this, stub_id); 1900 address start = __ pc(); 1901 1902 __ enter(); // required for proper stackwalking of RuntimeStub frame 1903 1904 #ifdef ASSERT 1905 // caller guarantees that the arrays really are different 1906 // otherwise, we would have to make conjoint checks 1907 { Label L; 1908 __ b(L); // conjoint check not yet implemented 1909 __ stop("checkcast_copy within a single array"); 1910 __ bind(L); 1911 } 1912 #endif //ASSERT 1913 1914 // Caller of this entry point must set up the argument registers. 1915 if (nopush_entry != nullptr) { 1916 *nopush_entry = __ pc(); 1917 BLOCK_COMMENT("Entry:"); 1918 } 1919 1920 // Empty array: Nothing to do. 1921 __ cbz(count, L_done); 1922 __ push(RegSet::of(r19, r20, r21, r22), sp); 1923 1924 #ifdef ASSERT 1925 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1926 // The ckoff and ckval must be mutually consistent, 1927 // even though caller generates both. 
1928 { Label L; 1929 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1930 __ ldrw(start_to, Address(ckval, sco_offset)); 1931 __ cmpw(ckoff, start_to); 1932 __ br(Assembler::EQ, L); 1933 __ stop("super_check_offset inconsistent"); 1934 __ bind(L); 1935 } 1936 #endif //ASSERT 1937 1938 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1939 bool is_oop = true; 1940 int element_size = UseCompressedOops ? 4 : 8; 1941 if (dest_uninitialized) { 1942 decorators |= IS_DEST_UNINITIALIZED; 1943 } 1944 1945 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1946 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1947 1948 // save the original count 1949 __ mov(count_save, count); 1950 1951 // Copy from low to high addresses 1952 __ mov(start_to, to); // Save destination array start address 1953 __ b(L_load_element); 1954 1955 // ======== begin loop ======== 1956 // (Loop is rotated; its entry is L_load_element.) 1957 // Loop control: 1958 // for (; count != 0; count--) { 1959 // copied_oop = load_heap_oop(from++); 1960 // ... generate_type_check ...; 1961 // store_heap_oop(to++, copied_oop); 1962 // } 1963 __ align(OptoLoopAlignment); 1964 1965 __ BIND(L_store_element); 1966 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1967 __ post(to, element_size), copied_oop, noreg, 1968 gct1, gct2, gct3); 1969 __ sub(count, count, 1); 1970 __ cbz(count, L_do_card_marks); 1971 1972 // ======== loop entry is here ======== 1973 __ BIND(L_load_element); 1974 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1975 copied_oop, noreg, __ post(from, element_size), 1976 gct1); 1977 __ cbz(copied_oop, L_store_element); 1978 1979 __ load_klass(r19_klass, copied_oop);// query the object klass 1980 1981 BLOCK_COMMENT("type_check:"); 1982 generate_type_check(/*sub_klass*/r19_klass, 1983 /*super_check_offset*/ckoff, 1984 /*super_klass*/ckval, 1985 /*r_array_base*/gct1, 1986 /*temp2*/gct2, 1987 /*result*/r10, L_store_element); 1988 1989 // Fall through on failure! 1990 1991 // ======== end loop ======== 1992 1993 // It was a real error; we must depend on the caller to finish the job. 1994 // Register count = remaining oops, count_orig = total oops. 1995 // Emit GC store barriers for the oops we have copied and report 1996 // their number to the caller. 1997 1998 __ subs(count, count_save, count); // K = partially copied oop count 1999 __ eon(count, count, zr); // report (-1^K) to caller 2000 __ br(Assembler::EQ, L_done_pop); 2001 2002 __ BIND(L_do_card_marks); 2003 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2004 2005 __ bind(L_done_pop); 2006 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2007 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2008 2009 __ bind(L_done); 2010 __ mov(r0, count); 2011 __ leave(); 2012 __ ret(lr); 2013 2014 return start; 2015 } 2016 2017 // Perform range checks on the proposed arraycopy. 2018 // Kills temp, but nothing else. 2019 // Also, clean the sign bits of src_pos and dst_pos. 
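  // Roughly, in C-like pseudocode rather than the emitted instructions
  // (the comparisons are unsigned 32-bit):
  //
  //   if ((uint32_t)(src_pos + length) > (uint32_t)arrayOop(src)->length()) goto L_failed;
  //   if ((uint32_t)(dst_pos + length) > (uint32_t)arrayOop(dst)->length()) goto L_failed;
  //   src_pos = (uint32_t)src_pos;   // clear the high 32 bits
  //   dst_pos = (uint32_t)dst_pos;
  //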
2020 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2021 Register src_pos, // source position (c_rarg1) 2022 Register dst, // destination array oop (c_rarg2) 2023 Register dst_pos, // destination position (c_rarg3) 2024 Register length, 2025 Register temp, 2026 Label& L_failed) { 2027 BLOCK_COMMENT("arraycopy_range_checks:"); 2028 2029 assert_different_registers(rscratch1, temp); 2030 2031 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2032 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 2033 __ addw(temp, length, src_pos); 2034 __ cmpw(temp, rscratch1); 2035 __ br(Assembler::HI, L_failed); 2036 2037 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2038 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2039 __ addw(temp, length, dst_pos); 2040 __ cmpw(temp, rscratch1); 2041 __ br(Assembler::HI, L_failed); 2042 2043 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 2044 __ movw(src_pos, src_pos); 2045 __ movw(dst_pos, dst_pos); 2046 2047 BLOCK_COMMENT("arraycopy_range_checks done"); 2048 } 2049 2050 // These stubs get called from some dumb test routine. 2051 // I'll write them properly when they're called from 2052 // something that's actually doing something. 2053 static void fake_arraycopy_stub(address src, address dst, int count) { 2054 assert(count == 0, "huh?"); 2055 } 2056 2057 2058 // 2059 // Generate 'unsafe' array copy stub 2060 // Though just as safe as the other stubs, it takes an unscaled 2061 // size_t argument instead of an element count. 2062 // 2063 // Input: 2064 // c_rarg0 - source array address 2065 // c_rarg1 - destination array address 2066 // c_rarg2 - byte count, treated as ssize_t, can be zero 2067 // 2068 // Examines the alignment of the operands and dispatches 2069 // to a long, int, short, or byte copy loop.
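  // Roughly (a C-like sketch of the dispatch, not the emitted code):
  //
  //   bits = (src | dst | byte_count) & (BytesPerLong - 1);
  //   if (bits == 0)        goto long_copy;   // count = bytes >> 3
  //   if ((bits & 3) == 0)  goto int_copy;    // count = bytes >> 2
  //   if ((bits & 1) == 0)  goto short_copy;  // count = bytes >> 1
  //   goto byte_copy;                         // count = bytes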
2070 // 2071 address generate_unsafe_copy(address byte_copy_entry, 2072 address short_copy_entry, 2073 address int_copy_entry, 2074 address long_copy_entry) { 2075 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id; 2076 2077 Label L_long_aligned, L_int_aligned, L_short_aligned; 2078 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2079 2080 __ align(CodeEntryAlignment); 2081 StubCodeMark mark(this, stub_id); 2082 address start = __ pc(); 2083 __ enter(); // required for proper stackwalking of RuntimeStub frame 2084 2085 // bump this on entry, not on exit: 2086 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2087 2088 __ orr(rscratch1, s, d); 2089 __ orr(rscratch1, rscratch1, count); 2090 2091 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2092 __ cbz(rscratch1, L_long_aligned); 2093 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2094 __ cbz(rscratch1, L_int_aligned); 2095 __ tbz(rscratch1, 0, L_short_aligned); 2096 __ b(RuntimeAddress(byte_copy_entry)); 2097 2098 __ BIND(L_short_aligned); 2099 __ lsr(count, count, LogBytesPerShort); // size => short_count 2100 __ b(RuntimeAddress(short_copy_entry)); 2101 __ BIND(L_int_aligned); 2102 __ lsr(count, count, LogBytesPerInt); // size => int_count 2103 __ b(RuntimeAddress(int_copy_entry)); 2104 __ BIND(L_long_aligned); 2105 __ lsr(count, count, LogBytesPerLong); // size => long_count 2106 __ b(RuntimeAddress(long_copy_entry)); 2107 2108 return start; 2109 } 2110 2111 // 2112 // Generate generic array copy stubs 2113 // 2114 // Input: 2115 // c_rarg0 - src oop 2116 // c_rarg1 - src_pos (32-bits) 2117 // c_rarg2 - dst oop 2118 // c_rarg3 - dst_pos (32-bits) 2119 // c_rarg4 - element count (32-bits) 2120 // 2121 // Output: 2122 // r0 == 0 - success 2123 // r0 == -1^K - failure, where K is partial transfer count 2124 // 2125 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2126 address int_copy_entry, address oop_copy_entry, 2127 address long_copy_entry, address checkcast_copy_entry) { 2128 StubId stub_id = StubId::stubgen_generic_arraycopy_id; 2129 2130 Label L_failed, L_objArray; 2131 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2132 2133 // Input registers 2134 const Register src = c_rarg0; // source array oop 2135 const Register src_pos = c_rarg1; // source position 2136 const Register dst = c_rarg2; // destination array oop 2137 const Register dst_pos = c_rarg3; // destination position 2138 const Register length = c_rarg4; 2139 2140 2141 // Registers used as temps 2142 const Register dst_klass = c_rarg5; 2143 2144 __ align(CodeEntryAlignment); 2145 2146 StubCodeMark mark(this, stub_id); 2147 2148 address start = __ pc(); 2149 2150 __ enter(); // required for proper stackwalking of RuntimeStub frame 2151 2152 // bump this on entry, not on exit: 2153 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2154 2155 //----------------------------------------------------------------------- 2156 // Assembler stub will be used for this call to arraycopy 2157 // if the following conditions are met: 2158 // 2159 // (1) src and dst must not be null. 2160 // (2) src_pos must not be negative. 2161 // (3) dst_pos must not be negative. 2162 // (4) length must not be negative. 2163 // (5) src klass and dst klass should be the same and not null. 2164 // (6) src and dst should be arrays. 2165 // (7) src_pos + length must not exceed length of src. 2166 // (8) dst_pos + length must not exceed length of dst. 
2167 // 2168 2169 // if (src == nullptr) return -1; 2170 __ cbz(src, L_failed); 2171 2172 // if (src_pos < 0) return -1; 2173 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2174 2175 // if (dst == nullptr) return -1; 2176 __ cbz(dst, L_failed); 2177 2178 // if (dst_pos < 0) return -1; 2179 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2180 2181 // registers used as temp 2182 const Register scratch_length = r16; // elements count to copy 2183 const Register scratch_src_klass = r17; // array klass 2184 const Register lh = r15; // layout helper 2185 2186 // if (length < 0) return -1; 2187 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2188 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2189 2190 __ load_klass(scratch_src_klass, src); 2191 #ifdef ASSERT 2192 // assert(src->klass() != nullptr); 2193 { 2194 BLOCK_COMMENT("assert klasses not null {"); 2195 Label L1, L2; 2196 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2197 __ bind(L1); 2198 __ stop("broken null klass"); 2199 __ bind(L2); 2200 __ load_klass(rscratch1, dst); 2201 __ cbz(rscratch1, L1); // this would be broken also 2202 BLOCK_COMMENT("} assert klasses not null done"); 2203 } 2204 #endif 2205 2206 // Load layout helper (32-bits) 2207 // 2208 // |array_tag| | header_size | element_type | |log2_element_size| 2209 // 32 30 24 16 8 2 0 2210 // 2211 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2212 // 2213 2214 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2215 2216 // Handle objArrays completely differently... 2217 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2218 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2219 __ movw(rscratch1, objArray_lh); 2220 __ eorw(rscratch2, lh, rscratch1); 2221 __ cbzw(rscratch2, L_objArray); 2222 2223 // if (src->klass() != dst->klass()) return -1; 2224 __ load_klass(rscratch2, dst); 2225 __ eor(rscratch2, rscratch2, scratch_src_klass); 2226 __ cbnz(rscratch2, L_failed); 2227 2228 // if (!src->is_Array()) return -1; 2229 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2230 2231 // At this point, it is known to be a typeArray (array_tag 0x3). 
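  // What follows range-checks the copy and then decodes the layout
  // helper: the array header size and log2(element size) are extracted
  // and the two low bits of the latter select one of the four copy
  // loops. Roughly (a C-like sketch, not the emitted code):
  //
  //   header = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
  //   elsize = lh & _lh_log2_element_size_mask;       // 0..3
  //   from   = src + header + (src_pos << elsize);
  //   to     = dst + header + (dst_pos << elsize);
  //   count  = length;
  //   goto (byte|short|int|long)_copy_entry;          // chosen by elsize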
2232 #ifdef ASSERT 2233 { 2234 BLOCK_COMMENT("assert primitive array {"); 2235 Label L; 2236 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2237 __ cmpw(lh, rscratch2); 2238 __ br(Assembler::GE, L); 2239 __ stop("must be a primitive array"); 2240 __ bind(L); 2241 BLOCK_COMMENT("} assert primitive array done"); 2242 } 2243 #endif 2244 2245 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2246 rscratch2, L_failed); 2247 2248 // TypeArrayKlass 2249 // 2250 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2251 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2252 // 2253 2254 const Register rscratch1_offset = rscratch1; // array offset 2255 const Register r15_elsize = lh; // element size 2256 2257 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2258 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2259 __ add(src, src, rscratch1_offset); // src array offset 2260 __ add(dst, dst, rscratch1_offset); // dst array offset 2261 BLOCK_COMMENT("choose copy loop based on element size"); 2262 2263 // next registers should be set before the jump to corresponding stub 2264 const Register from = c_rarg0; // source array address 2265 const Register to = c_rarg1; // destination array address 2266 const Register count = c_rarg2; // elements count 2267 2268 // 'from', 'to', 'count' registers should be set in such order 2269 // since they are the same as 'src', 'src_pos', 'dst'. 2270 2271 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2272 2273 // The possible values of elsize are 0-3, i.e. exact_log2(element 2274 // size in bytes). We do a simple bitwise binary search. 2275 __ BIND(L_copy_bytes); 2276 __ tbnz(r15_elsize, 1, L_copy_ints); 2277 __ tbnz(r15_elsize, 0, L_copy_shorts); 2278 __ lea(from, Address(src, src_pos));// src_addr 2279 __ lea(to, Address(dst, dst_pos));// dst_addr 2280 __ movw(count, scratch_length); // length 2281 __ b(RuntimeAddress(byte_copy_entry)); 2282 2283 __ BIND(L_copy_shorts); 2284 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2285 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2286 __ movw(count, scratch_length); // length 2287 __ b(RuntimeAddress(short_copy_entry)); 2288 2289 __ BIND(L_copy_ints); 2290 __ tbnz(r15_elsize, 0, L_copy_longs); 2291 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2292 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2293 __ movw(count, scratch_length); // length 2294 __ b(RuntimeAddress(int_copy_entry)); 2295 2296 __ BIND(L_copy_longs); 2297 #ifdef ASSERT 2298 { 2299 BLOCK_COMMENT("assert long copy {"); 2300 Label L; 2301 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2302 __ cmpw(r15_elsize, LogBytesPerLong); 2303 __ br(Assembler::EQ, L); 2304 __ stop("must be long copy, but elsize is wrong"); 2305 __ bind(L); 2306 BLOCK_COMMENT("} assert long copy done"); 2307 } 2308 #endif 2309 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2310 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2311 __ movw(count, scratch_length); // length 2312 __ b(RuntimeAddress(long_copy_entry)); 2313 2314 // ObjArrayKlass 2315 __ BIND(L_objArray); 2316 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2317 2318 Label L_plain_copy, L_checkcast_copy; 2319 // test array classes for subtyping 2320 __ load_klass(r15, dst); 2321 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2322 __ br(Assembler::NE, L_checkcast_copy); 2323 2324 // Identically typed arrays can be copied without element-wise checks. 2325 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2326 rscratch2, L_failed); 2327 2328 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2329 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2330 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2331 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2332 __ movw(count, scratch_length); // length 2333 __ BIND(L_plain_copy); 2334 __ b(RuntimeAddress(oop_copy_entry)); 2335 2336 __ BIND(L_checkcast_copy); 2337 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2338 { 2339 // Before looking at dst.length, make sure dst is also an objArray. 2340 __ ldrw(rscratch1, Address(r15, lh_offset)); 2341 __ movw(rscratch2, objArray_lh); 2342 __ eorw(rscratch1, rscratch1, rscratch2); 2343 __ cbnzw(rscratch1, L_failed); 2344 2345 // It is safe to examine both src.length and dst.length. 2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2347 r15, L_failed); 2348 2349 __ load_klass(dst_klass, dst); // reload 2350 2351 // Marshal the base address arguments now, freeing registers. 2352 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2353 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2354 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2355 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2356 __ movw(count, length); // length (reloaded) 2357 Register sco_temp = c_rarg3; // this register is free now 2358 assert_different_registers(from, to, count, sco_temp, 2359 dst_klass, scratch_src_klass); 2360 // assert_clean_int(count, sco_temp); 2361 2362 // Generate the type check. 2363 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2364 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2365 2366 // Smashes rscratch1, rscratch2 2367 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2368 L_plain_copy); 2369 2370 // Fetch destination element klass from the ObjArrayKlass header. 2371 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2372 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2373 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2374 2375 // the checkcast_copy loop needs two extra arguments: 2376 assert(c_rarg3 == sco_temp, "#3 already in place"); 2377 // Set up arguments for checkcast_copy_entry. 2378 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2379 __ b(RuntimeAddress(checkcast_copy_entry)); 2380 } 2381 2382 __ BIND(L_failed); 2383 __ mov(r0, -1); 2384 __ leave(); // required for proper stackwalking of RuntimeStub frame 2385 __ ret(lr); 2386 2387 return start; 2388 } 2389 2390 // 2391 // Generate stub for array fill. If "aligned" is true, the 2392 // "to" address is assumed to be heapword aligned. 
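  //
  // The fill value is widened to a 64-bit pattern before the word
  // loop, roughly (a sketch for an 8-bit value v; shorts and ints skip
  // the earlier steps):
  //
  //   v |= v << 8;    // byte  -> 16 bits
  //   v |= v << 16;   // 16    -> 32 bits
  //   v |= v << 32;   // 32    -> 64 bits, just before fill_words
  //
  // so each 64-bit store writes eight bytes, four shorts or two ints.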
2393 // 2394 // Arguments for generated stub: 2395 // to: c_rarg0 2396 // value: c_rarg1 2397 // count: c_rarg2 treated as signed 2398 // 2399 address generate_fill(StubId stub_id) { 2400 BasicType t; 2401 bool aligned; 2402 2403 switch (stub_id) { 2404 case StubId::stubgen_jbyte_fill_id: 2405 t = T_BYTE; 2406 aligned = false; 2407 break; 2408 case StubId::stubgen_jshort_fill_id: 2409 t = T_SHORT; 2410 aligned = false; 2411 break; 2412 case StubId::stubgen_jint_fill_id: 2413 t = T_INT; 2414 aligned = false; 2415 break; 2416 case StubId::stubgen_arrayof_jbyte_fill_id: 2417 t = T_BYTE; 2418 aligned = true; 2419 break; 2420 case StubId::stubgen_arrayof_jshort_fill_id: 2421 t = T_SHORT; 2422 aligned = true; 2423 break; 2424 case StubId::stubgen_arrayof_jint_fill_id: 2425 t = T_INT; 2426 aligned = true; 2427 break; 2428 default: 2429 ShouldNotReachHere(); 2430 }; 2431 2432 __ align(CodeEntryAlignment); 2433 StubCodeMark mark(this, stub_id); 2434 address start = __ pc(); 2435 2436 BLOCK_COMMENT("Entry:"); 2437 2438 const Register to = c_rarg0; // source array address 2439 const Register value = c_rarg1; // value 2440 const Register count = c_rarg2; // elements count 2441 2442 const Register bz_base = r10; // base for block_zero routine 2443 const Register cnt_words = r11; // temp register 2444 2445 __ enter(); 2446 2447 Label L_fill_elements, L_exit1; 2448 2449 int shift = -1; 2450 switch (t) { 2451 case T_BYTE: 2452 shift = 0; 2453 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2454 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2455 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2456 __ br(Assembler::LO, L_fill_elements); 2457 break; 2458 case T_SHORT: 2459 shift = 1; 2460 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2461 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2462 __ br(Assembler::LO, L_fill_elements); 2463 break; 2464 case T_INT: 2465 shift = 2; 2466 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2467 __ br(Assembler::LO, L_fill_elements); 2468 break; 2469 default: ShouldNotReachHere(); 2470 } 2471 2472 // Align source address at 8 bytes address boundary. 2473 Label L_skip_align1, L_skip_align2, L_skip_align4; 2474 if (!aligned) { 2475 switch (t) { 2476 case T_BYTE: 2477 // One byte misalignment happens only for byte arrays. 2478 __ tbz(to, 0, L_skip_align1); 2479 __ strb(value, Address(__ post(to, 1))); 2480 __ subw(count, count, 1); 2481 __ bind(L_skip_align1); 2482 // Fallthrough 2483 case T_SHORT: 2484 // Two bytes misalignment happens only for byte and short (char) arrays. 2485 __ tbz(to, 1, L_skip_align2); 2486 __ strh(value, Address(__ post(to, 2))); 2487 __ subw(count, count, 2 >> shift); 2488 __ bind(L_skip_align2); 2489 // Fallthrough 2490 case T_INT: 2491 // Align to 8 bytes, we know we are 4 byte aligned to start. 2492 __ tbz(to, 2, L_skip_align4); 2493 __ strw(value, Address(__ post(to, 4))); 2494 __ subw(count, count, 4 >> shift); 2495 __ bind(L_skip_align4); 2496 break; 2497 default: ShouldNotReachHere(); 2498 } 2499 } 2500 2501 // 2502 // Fill large chunks 2503 // 2504 __ lsrw(cnt_words, count, 3 - shift); // number of words 2505 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2506 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2507 if (UseBlockZeroing) { 2508 Label non_block_zeroing, rest; 2509 // If the fill value is zero we can use the fast zero_words(). 
2510 __ cbnz(value, non_block_zeroing); 2511 __ mov(bz_base, to); 2512 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2513 address tpc = __ zero_words(bz_base, cnt_words); 2514 if (tpc == nullptr) { 2515 fatal("CodeCache is full at generate_fill"); 2516 } 2517 __ b(rest); 2518 __ bind(non_block_zeroing); 2519 __ fill_words(to, cnt_words, value); 2520 __ bind(rest); 2521 } else { 2522 __ fill_words(to, cnt_words, value); 2523 } 2524 2525 // Remaining count is less than 8 bytes. Fill it by a single store. 2526 // Note that the total length is no less than 8 bytes. 2527 if (t == T_BYTE || t == T_SHORT) { 2528 Label L_exit1; 2529 __ cbzw(count, L_exit1); 2530 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2531 __ str(value, Address(to, -8)); // overwrite some elements 2532 __ bind(L_exit1); 2533 __ leave(); 2534 __ ret(lr); 2535 } 2536 2537 // Handle copies less than 8 bytes. 2538 Label L_fill_2, L_fill_4, L_exit2; 2539 __ bind(L_fill_elements); 2540 switch (t) { 2541 case T_BYTE: 2542 __ tbz(count, 0, L_fill_2); 2543 __ strb(value, Address(__ post(to, 1))); 2544 __ bind(L_fill_2); 2545 __ tbz(count, 1, L_fill_4); 2546 __ strh(value, Address(__ post(to, 2))); 2547 __ bind(L_fill_4); 2548 __ tbz(count, 2, L_exit2); 2549 __ strw(value, Address(to)); 2550 break; 2551 case T_SHORT: 2552 __ tbz(count, 0, L_fill_4); 2553 __ strh(value, Address(__ post(to, 2))); 2554 __ bind(L_fill_4); 2555 __ tbz(count, 1, L_exit2); 2556 __ strw(value, Address(to)); 2557 break; 2558 case T_INT: 2559 __ cbzw(count, L_exit2); 2560 __ strw(value, Address(to)); 2561 break; 2562 default: ShouldNotReachHere(); 2563 } 2564 __ bind(L_exit2); 2565 __ leave(); 2566 __ ret(lr); 2567 return start; 2568 } 2569 2570 address generate_unsafecopy_common_error_exit() { 2571 address start_pc = __ pc(); 2572 __ leave(); 2573 __ mov(r0, 0); 2574 __ ret(lr); 2575 return start_pc; 2576 } 2577 2578 // 2579 // Generate 'unsafe' set memory stub 2580 // Though just as safe as the other stubs, it takes an unscaled 2581 // size_t (# bytes) argument instead of an element count. 2582 // 2583 // This fill operation is atomicity preserving: as long as the 2584 // address supplied is sufficiently aligned, all writes of up to 64 2585 // bits in size are single-copy atomic. 
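  //
  // Roughly (a C-like sketch of the strategy, not the emitted code;
  // store16/store64 are just notation for 16- and 64-byte vector stores):
  //
  //   v = broadcast_byte_to_16B(value);               // dup v0.16b
  //   if (AvoidUnalignedAccesses && count >= 16) {
  //     store16(dest, v);                             // may overlap the next store
  //     skip = 16 - (dest & 15);
  //     dest += skip; count -= skip;                  // dest now 16-byte aligned
  //   }
  //   while (count >= 64) { store64(dest, v); dest += 64; count -= 64; }
  //   // tail: 32/16/8/4/2/1 byte stores keyed off the low bits of count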
2586 // 2587 // Input: 2588 // c_rarg0 - destination array address 2589 // c_rarg1 - byte count (size_t) 2590 // c_rarg2 - byte value 2591 // 2592 address generate_unsafe_setmemory() { 2593 __ align(CodeEntryAlignment); 2594 StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id); 2595 address start = __ pc(); 2596 2597 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2; 2598 Label tail; 2599 2600 UnsafeMemoryAccessMark umam(this, true, false); 2601 2602 __ enter(); // required for proper stackwalking of RuntimeStub frame 2603 2604 __ dup(v0, __ T16B, value); 2605 2606 if (AvoidUnalignedAccesses) { 2607 __ cmp(count, (u1)16); 2608 __ br(__ LO, tail); 2609 2610 __ mov(rscratch1, 16); 2611 __ andr(rscratch2, dest, 15); 2612 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest 2613 __ strq(v0, Address(dest)); 2614 __ sub(count, count, rscratch1); 2615 __ add(dest, dest, rscratch1); 2616 } 2617 2618 __ subs(count, count, (u1)64); 2619 __ br(__ LO, tail); 2620 { 2621 Label again; 2622 __ bind(again); 2623 __ stpq(v0, v0, Address(dest)); 2624 __ stpq(v0, v0, Address(dest, 32)); 2625 2626 __ subs(count, count, 64); 2627 __ add(dest, dest, 64); 2628 __ br(__ HS, again); 2629 } 2630 2631 __ bind(tail); 2632 // The count of bytes is off by 64, but we don't need to correct 2633 // it because we're only going to use the least-significant few 2634 // count bits from here on. 2635 // __ add(count, count, 64); 2636 2637 { 2638 Label dont; 2639 __ tbz(count, exact_log2(32), dont); 2640 __ stpq(v0, v0, __ post(dest, 32)); 2641 __ bind(dont); 2642 } 2643 { 2644 Label dont; 2645 __ tbz(count, exact_log2(16), dont); 2646 __ strq(v0, __ post(dest, 16)); 2647 __ bind(dont); 2648 } 2649 { 2650 Label dont; 2651 __ tbz(count, exact_log2(8), dont); 2652 __ strd(v0, __ post(dest, 8)); 2653 __ bind(dont); 2654 } 2655 2656 Label finished; 2657 __ tst(count, 7); 2658 __ br(__ EQ, finished); 2659 2660 { 2661 Label dont; 2662 __ tbz(count, exact_log2(4), dont); 2663 __ strs(v0, __ post(dest, 4)); 2664 __ bind(dont); 2665 } 2666 { 2667 Label dont; 2668 __ tbz(count, exact_log2(2), dont); 2669 __ bfi(value, value, 8, 8); 2670 __ strh(value, __ post(dest, 2)); 2671 __ bind(dont); 2672 } 2673 { 2674 Label dont; 2675 __ tbz(count, exact_log2(1), dont); 2676 __ strb(value, Address(dest)); 2677 __ bind(dont); 2678 } 2679 2680 __ bind(finished); 2681 __ leave(); 2682 __ ret(lr); 2683 2684 return start; 2685 } 2686 2687 address generate_data_cache_writeback() { 2688 const Register line = c_rarg0; // address of line to write back 2689 2690 __ align(CodeEntryAlignment); 2691 2692 StubId stub_id = StubId::stubgen_data_cache_writeback_id; 2693 StubCodeMark mark(this, stub_id); 2694 2695 address start = __ pc(); 2696 __ enter(); 2697 __ cache_wb(Address(line, 0)); 2698 __ leave(); 2699 __ ret(lr); 2700 2701 return start; 2702 } 2703 2704 address generate_data_cache_writeback_sync() { 2705 const Register is_pre = c_rarg0; // pre or post sync 2706 2707 __ align(CodeEntryAlignment); 2708 2709 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id; 2710 StubCodeMark mark(this, stub_id); 2711 2712 // pre wbsync is a no-op 2713 // post wbsync translates to an sfence 2714 2715 Label skip; 2716 address start = __ pc(); 2717 __ enter(); 2718 __ cbnz(is_pre, skip); 2719 __ cache_wbsync(false); 2720 __ bind(skip); 2721 __ leave(); 2722 __ ret(lr); 2723 2724 return start; 2725 } 2726 2727 void generate_arraycopy_stubs() { 2728 // Some copy stubs publish a normal entry and then a 2nd 'fallback' 2729 // entry 
immediately following their stack push. This can be used 2730 as a post-push branch target for compatible stubs when they 2731 identify a special case that can be handled by the fallback 2732 stub, e.g. a disjoint copy stub may be used as a special case 2733 fallback for its compatible conjoint copy stub. 2734 // 2735 // A no push entry is always returned in the following local and 2736 // then published by assigning to the appropriate entry field in 2737 // class StubRoutines. The entry value is then passed to the 2738 // generator for the compatible stub. That means the entry must be 2739 // listed when saving to/restoring from the AOT cache, ensuring 2740 // that the inter-stub jumps are noted at AOT-cache save and 2741 // relocated at AOT cache load. 2742 address nopush_entry; 2743 2744 // generate the common exit first so later stubs can rely on it if 2745 // they want an UnsafeMemoryAccess exit non-local to the stub 2746 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit(); 2747 // register the stub as the default exit with class UnsafeMemoryAccess 2748 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit); 2749 2750 // generate and publish aarch64-specific bulk copy routines first 2751 // so we can call them from other copy stubs 2752 StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15); 2753 StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15); 2754 2755 StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15); 2756 StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15); 2757 2758 StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15); 2759 StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15); 2760 2761 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2762 2763 //*** jbyte 2764 // Always need aligned and unaligned versions 2765 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry); 2766 // disjoint nopush entry is needed by conjoint copy 2767 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry; 2768 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry); 2769 // conjoint nopush entry is needed by generic/unsafe copy 2770 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry; 2771 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry); 2772 // disjoint arrayof nopush entry is needed by conjoint copy 2773 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry; 2774 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr); 2775 2776 //*** jshort 2777 // Always need aligned and unaligned versions 2778 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry); 2779 // disjoint nopush
entry is needed by conjoint copy 2780 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry; 2781 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry); 2782 // conjoint nopush entry is used by generic/unsafe copy 2783 StubRoutines::_jshort_arraycopy_nopush = nopush_entry; 2784 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry); 2785 // disjoint arrayof nopush entry is needed by conjoint copy 2786 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry; 2787 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr); 2788 2789 //*** jint 2790 // Aligned versions 2791 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry); 2792 // disjoint arrayof nopush entry is needed by conjoint copy 2793 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry; 2794 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr); 2795 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2796 // jint_arraycopy_nopush always points to the unaligned version 2797 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry); 2798 // disjoint nopush entry is needed by conjoint copy 2799 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry; 2800 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry); 2801 // conjoint nopush entry is needed by generic/unsafe copy 2802 StubRoutines::_jint_arraycopy_nopush = nopush_entry; 2803 2804 //*** jlong 2805 // It is always aligned 2806 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry); 2807 // disjoint arrayof nopush entry is needed by conjoint copy 2808 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry; 2809 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry); 2810 // conjoint nopush entry is needed by generic/unsafe copy 2811 StubRoutines::_jlong_arraycopy_nopush = nopush_entry; 2812 // disjoint normal/nopush and conjoint normal entries are not 2813 // generated since the arrayof versions are the same 2814 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2815 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush; 2816 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2817 2818 //*** oops 2819 { 2820 StubRoutines::_arrayof_oop_disjoint_arraycopy 2821 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry); 2822 // disjoint arrayof nopush entry is needed by conjoint copy 2823 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry; 2824 StubRoutines::_arrayof_oop_arraycopy 2825 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, 
StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry); 2826 // conjoint arrayof nopush entry is needed by generic/unsafe copy 2827 StubRoutines::_oop_arraycopy_nopush = nopush_entry; 2828 // Aligned versions without pre-barriers 2829 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2830 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry); 2831 // disjoint arrayof+uninit nopush entry is needed by conjoint copy 2832 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry; 2833 // note that we don't need a returned nopush entry because the 2834 // generic/unsafe copy does not cater for uninit arrays. 2835 StubRoutines::_arrayof_oop_arraycopy_uninit 2836 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr); 2837 } 2838 2839 // for oop copies reuse arrayof entries for non-arrayof cases 2840 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2841 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush; 2842 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2843 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2844 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush; 2845 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2846 2847 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry); 2848 // checkcast nopush entry is needed by generic copy 2849 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry; 2850 // note that we don't need a returned nopush entry because the 2851 // generic copy does not cater for uninit arrays. 
2852 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr); 2853 2854 // unsafe arraycopy may fallback on conjoint stubs 2855 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush, 2856 StubRoutines::_jshort_arraycopy_nopush, 2857 StubRoutines::_jint_arraycopy_nopush, 2858 StubRoutines::_jlong_arraycopy_nopush); 2859 2860 // generic arraycopy may fallback on conjoint stubs 2861 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush, 2862 StubRoutines::_jshort_arraycopy_nopush, 2863 StubRoutines::_jint_arraycopy_nopush, 2864 StubRoutines::_oop_arraycopy_nopush, 2865 StubRoutines::_jlong_arraycopy_nopush, 2866 StubRoutines::_checkcast_arraycopy_nopush); 2867 2868 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id); 2869 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id); 2870 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id); 2871 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id); 2872 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id); 2873 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id); 2874 } 2875 2876 void generate_math_stubs() { Unimplemented(); } 2877 2878 // Arguments: 2879 // 2880 // Inputs: 2881 // c_rarg0 - source byte array address 2882 // c_rarg1 - destination byte array address 2883 // c_rarg2 - K (key) in little endian int array 2884 // 2885 address generate_aescrypt_encryptBlock() { 2886 __ align(CodeEntryAlignment); 2887 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id; 2888 StubCodeMark mark(this, stub_id); 2889 2890 const Register from = c_rarg0; // source array address 2891 const Register to = c_rarg1; // destination array address 2892 const Register key = c_rarg2; // key array address 2893 const Register keylen = rscratch1; 2894 2895 address start = __ pc(); 2896 __ enter(); 2897 2898 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2899 2900 __ aesenc_loadkeys(key, keylen); 2901 __ aesecb_encrypt(from, to, keylen); 2902 2903 __ mov(r0, 0); 2904 2905 __ leave(); 2906 __ ret(lr); 2907 2908 return start; 2909 } 2910 2911 // Arguments: 2912 // 2913 // Inputs: 2914 // c_rarg0 - source byte array address 2915 // c_rarg1 - destination byte array address 2916 // c_rarg2 - K (key) in little endian int array 2917 // 2918 address generate_aescrypt_decryptBlock() { 2919 assert(UseAES, "need AES cryptographic extension support"); 2920 __ align(CodeEntryAlignment); 2921 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id; 2922 StubCodeMark mark(this, stub_id); 2923 Label L_doLast; 2924 2925 const Register from = c_rarg0; // source array address 2926 const Register to = c_rarg1; // destination array address 2927 const Register key = c_rarg2; // key array address 2928 const Register keylen = rscratch1; 2929 2930 address start = __ pc(); 2931 __ enter(); // required for proper stackwalking of RuntimeStub frame 2932 2933 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2934 2935 __ aesecb_decrypt(from, to, key, keylen); 2936 2937 __ mov(r0, 0); 2938 2939 __ leave(); 2940 __ ret(lr); 2941 2942 return start; 2943 } 2944 2945 // Arguments: 2946 // 2947 // Inputs: 2948 // c_rarg0 - source byte array address 2949 // 
c_rarg1 - destination byte array address 2950 // c_rarg2 - K (key) in little endian int array 2951 // c_rarg3 - r vector byte array address 2952 // c_rarg4 - input length 2953 // 2954 // Output: 2955 // x0 - input length 2956 // 2957 address generate_cipherBlockChaining_encryptAESCrypt() { 2958 assert(UseAES, "need AES cryptographic extension support"); 2959 __ align(CodeEntryAlignment); 2960 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id; 2961 StubCodeMark mark(this, stub_id); 2962 2963 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2964 2965 const Register from = c_rarg0; // source array address 2966 const Register to = c_rarg1; // destination array address 2967 const Register key = c_rarg2; // key array address 2968 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2969 // and left with the results of the last encryption block 2970 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2971 const Register keylen = rscratch1; 2972 2973 address start = __ pc(); 2974 2975 __ enter(); 2976 2977 __ movw(rscratch2, len_reg); 2978 2979 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2980 2981 __ ld1(v0, __ T16B, rvec); 2982 2983 __ cmpw(keylen, 52); 2984 __ br(Assembler::CC, L_loadkeys_44); 2985 __ br(Assembler::EQ, L_loadkeys_52); 2986 2987 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2988 __ rev32(v17, __ T16B, v17); 2989 __ rev32(v18, __ T16B, v18); 2990 __ BIND(L_loadkeys_52); 2991 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2992 __ rev32(v19, __ T16B, v19); 2993 __ rev32(v20, __ T16B, v20); 2994 __ BIND(L_loadkeys_44); 2995 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2996 __ rev32(v21, __ T16B, v21); 2997 __ rev32(v22, __ T16B, v22); 2998 __ rev32(v23, __ T16B, v23); 2999 __ rev32(v24, __ T16B, v24); 3000 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 3001 __ rev32(v25, __ T16B, v25); 3002 __ rev32(v26, __ T16B, v26); 3003 __ rev32(v27, __ T16B, v27); 3004 __ rev32(v28, __ T16B, v28); 3005 __ ld1(v29, v30, v31, __ T16B, key); 3006 __ rev32(v29, __ T16B, v29); 3007 __ rev32(v30, __ T16B, v30); 3008 __ rev32(v31, __ T16B, v31); 3009 3010 __ BIND(L_aes_loop); 3011 __ ld1(v1, __ T16B, __ post(from, 16)); 3012 __ eor(v0, __ T16B, v0, v1); 3013 3014 __ br(Assembler::CC, L_rounds_44); 3015 __ br(Assembler::EQ, L_rounds_52); 3016 3017 __ aese(v0, v17); __ aesmc(v0, v0); 3018 __ aese(v0, v18); __ aesmc(v0, v0); 3019 __ BIND(L_rounds_52); 3020 __ aese(v0, v19); __ aesmc(v0, v0); 3021 __ aese(v0, v20); __ aesmc(v0, v0); 3022 __ BIND(L_rounds_44); 3023 __ aese(v0, v21); __ aesmc(v0, v0); 3024 __ aese(v0, v22); __ aesmc(v0, v0); 3025 __ aese(v0, v23); __ aesmc(v0, v0); 3026 __ aese(v0, v24); __ aesmc(v0, v0); 3027 __ aese(v0, v25); __ aesmc(v0, v0); 3028 __ aese(v0, v26); __ aesmc(v0, v0); 3029 __ aese(v0, v27); __ aesmc(v0, v0); 3030 __ aese(v0, v28); __ aesmc(v0, v0); 3031 __ aese(v0, v29); __ aesmc(v0, v0); 3032 __ aese(v0, v30); 3033 __ eor(v0, __ T16B, v0, v31); 3034 3035 __ st1(v0, __ T16B, __ post(to, 16)); 3036 3037 __ subw(len_reg, len_reg, 16); 3038 __ cbnzw(len_reg, L_aes_loop); 3039 3040 __ st1(v0, __ T16B, rvec); 3041 3042 __ mov(r0, rscratch2); 3043 3044 __ leave(); 3045 __ ret(lr); 3046 3047 return start; 3048 } 3049 3050 // Arguments: 3051 // 3052 // Inputs: 3053 // c_rarg0 - source byte array address 3054 // c_rarg1 - destination byte array address 3055 // c_rarg2 - K (key) in little 
endian int array 3056 // c_rarg3 - r vector byte array address 3057 // c_rarg4 - input length 3058 // 3059 // Output: 3060 // r0 - input length 3061 // 3062 address generate_cipherBlockChaining_decryptAESCrypt() { 3063 assert(UseAES, "need AES cryptographic extension support"); 3064 __ align(CodeEntryAlignment); 3065 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id; 3066 StubCodeMark mark(this, stub_id); 3067 3068 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 3069 3070 const Register from = c_rarg0; // source array address 3071 const Register to = c_rarg1; // destination array address 3072 const Register key = c_rarg2; // key array address 3073 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3074 // and left with the results of the last encryption block 3075 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3076 const Register keylen = rscratch1; 3077 3078 address start = __ pc(); 3079 3080 __ enter(); 3081 3082 __ movw(rscratch2, len_reg); 3083 3084 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3085 3086 __ ld1(v2, __ T16B, rvec); 3087 3088 __ ld1(v31, __ T16B, __ post(key, 16)); 3089 __ rev32(v31, __ T16B, v31); 3090 3091 __ cmpw(keylen, 52); 3092 __ br(Assembler::CC, L_loadkeys_44); 3093 __ br(Assembler::EQ, L_loadkeys_52); 3094 3095 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 3096 __ rev32(v17, __ T16B, v17); 3097 __ rev32(v18, __ T16B, v18); 3098 __ BIND(L_loadkeys_52); 3099 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 3100 __ rev32(v19, __ T16B, v19); 3101 __ rev32(v20, __ T16B, v20); 3102 __ BIND(L_loadkeys_44); 3103 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 3104 __ rev32(v21, __ T16B, v21); 3105 __ rev32(v22, __ T16B, v22); 3106 __ rev32(v23, __ T16B, v23); 3107 __ rev32(v24, __ T16B, v24); 3108 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 3109 __ rev32(v25, __ T16B, v25); 3110 __ rev32(v26, __ T16B, v26); 3111 __ rev32(v27, __ T16B, v27); 3112 __ rev32(v28, __ T16B, v28); 3113 __ ld1(v29, v30, __ T16B, key); 3114 __ rev32(v29, __ T16B, v29); 3115 __ rev32(v30, __ T16B, v30); 3116 3117 __ BIND(L_aes_loop); 3118 __ ld1(v0, __ T16B, __ post(from, 16)); 3119 __ orr(v1, __ T16B, v0, v0); 3120 3121 __ br(Assembler::CC, L_rounds_44); 3122 __ br(Assembler::EQ, L_rounds_52); 3123 3124 __ aesd(v0, v17); __ aesimc(v0, v0); 3125 __ aesd(v0, v18); __ aesimc(v0, v0); 3126 __ BIND(L_rounds_52); 3127 __ aesd(v0, v19); __ aesimc(v0, v0); 3128 __ aesd(v0, v20); __ aesimc(v0, v0); 3129 __ BIND(L_rounds_44); 3130 __ aesd(v0, v21); __ aesimc(v0, v0); 3131 __ aesd(v0, v22); __ aesimc(v0, v0); 3132 __ aesd(v0, v23); __ aesimc(v0, v0); 3133 __ aesd(v0, v24); __ aesimc(v0, v0); 3134 __ aesd(v0, v25); __ aesimc(v0, v0); 3135 __ aesd(v0, v26); __ aesimc(v0, v0); 3136 __ aesd(v0, v27); __ aesimc(v0, v0); 3137 __ aesd(v0, v28); __ aesimc(v0, v0); 3138 __ aesd(v0, v29); __ aesimc(v0, v0); 3139 __ aesd(v0, v30); 3140 __ eor(v0, __ T16B, v0, v31); 3141 __ eor(v0, __ T16B, v0, v2); 3142 3143 __ st1(v0, __ T16B, __ post(to, 16)); 3144 __ orr(v2, __ T16B, v1, v1); 3145 3146 __ subw(len_reg, len_reg, 16); 3147 __ cbnzw(len_reg, L_aes_loop); 3148 3149 __ st1(v2, __ T16B, rvec); 3150 3151 __ mov(r0, rscratch2); 3152 3153 __ leave(); 3154 __ ret(lr); 3155 3156 return start; 3157 } 3158 3159 // Big-endian 128-bit + 64-bit -> 128-bit addition. 3160 // Inputs: 128-bits. in is preserved. 
3161 // The least-significant 64-bit word is in the upper dword of each vector. 3162 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 3163 // Output: result 3164 void be_add_128_64(FloatRegister result, FloatRegister in, 3165 FloatRegister inc, FloatRegister tmp) { 3166 assert_different_registers(result, tmp, inc); 3167 3168 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 3169 // input 3170 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3171 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3172 // MSD == 0 (must be!) to LSD 3173 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3174 } 3175 3176 // CTR AES crypt. 3177 // Arguments: 3178 // 3179 // Inputs: 3180 // c_rarg0 - source byte array address 3181 // c_rarg1 - destination byte array address 3182 // c_rarg2 - K (key) in little endian int array 3183 // c_rarg3 - counter vector byte array address 3184 // c_rarg4 - input length 3185 // c_rarg5 - saved encryptedCounter start 3186 // c_rarg6 - saved used length 3187 // 3188 // Output: 3189 // r0 - input length 3190 // 3191 address generate_counterMode_AESCrypt() { 3192 const Register in = c_rarg0; 3193 const Register out = c_rarg1; 3194 const Register key = c_rarg2; 3195 const Register counter = c_rarg3; 3196 const Register saved_len = c_rarg4, len = r10; 3197 const Register saved_encrypted_ctr = c_rarg5; 3198 const Register used_ptr = c_rarg6, used = r12; 3199 3200 const Register offset = r7; 3201 const Register keylen = r11; 3202 3203 const unsigned char block_size = 16; 3204 const int bulk_width = 4; 3205 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3206 // performance with larger data sizes, but it also means that the 3207 // fast path isn't used until you have at least 8 blocks, and up 3208 // to 127 bytes of data will be executed on the slow path. For 3209 // that reason, and also so as not to blow away too much icache, 4 3210 // blocks seems like a sensible compromise. 3211 3212 // Algorithm: 3213 // 3214 // if (len == 0) { 3215 // goto DONE; 3216 // } 3217 // int result = len; 3218 // do { 3219 // if (used >= blockSize) { 3220 // if (len >= bulk_width * blockSize) { 3221 // CTR_large_block(); 3222 // if (len == 0) 3223 // goto DONE; 3224 // } 3225 // for (;;) { 3226 // 16ByteVector v0 = counter; 3227 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3228 // used = 0; 3229 // if (len < blockSize) 3230 // break; /* goto NEXT */ 3231 // 16ByteVector v1 = load16Bytes(in, offset); 3232 // v1 = v1 ^ encryptedCounter; 3233 // store16Bytes(out, offset); 3234 // used = blockSize; 3235 // offset += blockSize; 3236 // len -= blockSize; 3237 // if (len == 0) 3238 // goto DONE; 3239 // } 3240 // } 3241 // NEXT: 3242 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3243 // len--; 3244 // } while (len != 0); 3245 // DONE: 3246 // return result; 3247 // 3248 // CTR_large_block() 3249 // Wide bulk encryption of whole blocks. 
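  // The counter arithmetic relies on be_add_128_64 above: the counter
  // is kept byte-reversed in a vector register so the big-endian
  // 128-bit increment can be done with vector adds. Conceptually
  // (a sketch, not the emitted code):
  //
  //   lo' = lo + inc;                    // 64-bit add, may wrap
  //   carry = (lo' < inc) ? 1 : 0;       // unsigned compare
  //   hi' = hi + carry;
  //
  // where inc is 1 per block, both on the single-block path and on
  // each step of the bulk path.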
3250 3251 __ align(CodeEntryAlignment); 3252 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id; 3253 StubCodeMark mark(this, stub_id); 3254 const address start = __ pc(); 3255 __ enter(); 3256 3257 Label DONE, CTR_large_block, large_block_return; 3258 __ ldrw(used, Address(used_ptr)); 3259 __ cbzw(saved_len, DONE); 3260 3261 __ mov(len, saved_len); 3262 __ mov(offset, 0); 3263 3264 // Compute #rounds for AES based on the length of the key array 3265 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3266 3267 __ aesenc_loadkeys(key, keylen); 3268 3269 { 3270 Label L_CTR_loop, NEXT; 3271 3272 __ bind(L_CTR_loop); 3273 3274 __ cmp(used, block_size); 3275 __ br(__ LO, NEXT); 3276 3277 // Maybe we have a lot of data 3278 __ subsw(rscratch1, len, bulk_width * block_size); 3279 __ br(__ HS, CTR_large_block); 3280 __ BIND(large_block_return); 3281 __ cbzw(len, DONE); 3282 3283 // Setup the counter 3284 __ movi(v4, __ T4S, 0); 3285 __ movi(v5, __ T4S, 1); 3286 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3287 3288 // 128-bit big-endian increment 3289 __ ld1(v0, __ T16B, counter); 3290 __ rev64(v16, __ T16B, v0); 3291 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3292 __ rev64(v16, __ T16B, v16); 3293 __ st1(v16, __ T16B, counter); 3294 // Previous counter value is in v0 3295 // v4 contains { 0, 1 } 3296 3297 { 3298 // We have fewer than bulk_width blocks of data left. Encrypt 3299 // them one by one until there is less than a full block 3300 // remaining, being careful to save both the encrypted counter 3301 // and the counter. 3302 3303 Label inner_loop; 3304 __ bind(inner_loop); 3305 // Counter to encrypt is in v0 3306 __ aesecb_encrypt(noreg, noreg, keylen); 3307 __ st1(v0, __ T16B, saved_encrypted_ctr); 3308 3309 // Do we have a remaining full block? 3310 3311 __ mov(used, 0); 3312 __ cmp(len, block_size); 3313 __ br(__ LO, NEXT); 3314 3315 // Yes, we have a full block 3316 __ ldrq(v1, Address(in, offset)); 3317 __ eor(v1, __ T16B, v1, v0); 3318 __ strq(v1, Address(out, offset)); 3319 __ mov(used, block_size); 3320 __ add(offset, offset, block_size); 3321 3322 __ subw(len, len, block_size); 3323 __ cbzw(len, DONE); 3324 3325 // Increment the counter, store it back 3326 __ orr(v0, __ T16B, v16, v16); 3327 __ rev64(v16, __ T16B, v16); 3328 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3329 __ rev64(v16, __ T16B, v16); 3330 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3331 3332 __ b(inner_loop); 3333 } 3334 3335 __ BIND(NEXT); 3336 3337 // Encrypt a single byte, and loop. 3338 // We expect this to be a rare event. 
3339 __ ldrb(rscratch1, Address(in, offset)); 3340 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3341 __ eor(rscratch1, rscratch1, rscratch2); 3342 __ strb(rscratch1, Address(out, offset)); 3343 __ add(offset, offset, 1); 3344 __ add(used, used, 1); 3345 __ subw(len, len,1); 3346 __ cbnzw(len, L_CTR_loop); 3347 } 3348 3349 __ bind(DONE); 3350 __ strw(used, Address(used_ptr)); 3351 __ mov(r0, saved_len); 3352 3353 __ leave(); // required for proper stackwalking of RuntimeStub frame 3354 __ ret(lr); 3355 3356 // Bulk encryption 3357 3358 __ BIND (CTR_large_block); 3359 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3360 3361 if (bulk_width == 8) { 3362 __ sub(sp, sp, 4 * 16); 3363 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3364 } 3365 __ sub(sp, sp, 4 * 16); 3366 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3367 RegSet saved_regs = (RegSet::of(in, out, offset) 3368 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3369 __ push(saved_regs, sp); 3370 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3371 __ add(in, in, offset); 3372 __ add(out, out, offset); 3373 3374 // Keys should already be loaded into the correct registers 3375 3376 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3377 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3378 3379 // AES/CTR loop 3380 { 3381 Label L_CTR_loop; 3382 __ BIND(L_CTR_loop); 3383 3384 // Setup the counters 3385 __ movi(v8, __ T4S, 0); 3386 __ movi(v9, __ T4S, 1); 3387 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3388 3389 for (int i = 0; i < bulk_width; i++) { 3390 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3391 __ rev64(v0_ofs, __ T16B, v16); 3392 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3393 } 3394 3395 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3396 3397 // Encrypt the counters 3398 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3399 3400 if (bulk_width == 8) { 3401 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3402 } 3403 3404 // XOR the encrypted counters with the inputs 3405 for (int i = 0; i < bulk_width; i++) { 3406 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3407 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3408 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3409 } 3410 3411 // Write the encrypted data 3412 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3413 if (bulk_width == 8) { 3414 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3415 } 3416 3417 __ subw(len, len, 16 * bulk_width); 3418 __ cbnzw(len, L_CTR_loop); 3419 } 3420 3421 // Save the counter back where it goes 3422 __ rev64(v16, __ T16B, v16); 3423 __ st1(v16, __ T16B, counter); 3424 3425 __ pop(saved_regs, sp); 3426 3427 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3428 if (bulk_width == 8) { 3429 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3430 } 3431 3432 __ andr(rscratch1, len, -16 * bulk_width); 3433 __ sub(len, len, rscratch1); 3434 __ add(offset, offset, rscratch1); 3435 __ mov(used, 16); 3436 __ strw(used, Address(used_ptr)); 3437 __ b(large_block_return); 3438 3439 return start; 3440 } 3441 3442 // Vector AES Galois Counter Mode implementation. 
Parameters: 3443 // 3444 // in = c_rarg0 3445 // len = c_rarg1 3446 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3447 // out = c_rarg3 3448 // key = c_rarg4 3449 // state = c_rarg5 - GHASH.state 3450 // subkeyHtbl = c_rarg6 - powers of H 3451 // counter = c_rarg7 - 16 bytes of CTR 3452 // return - number of processed bytes 3453 address generate_galoisCounterMode_AESCrypt() { 3454 Label ghash_polynomial; // local data generated after code 3455 3456 __ align(CodeEntryAlignment); 3457 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id; 3458 StubCodeMark mark(this, stub_id); 3459 address start = __ pc(); 3460 __ enter(); 3461 3462 const Register in = c_rarg0; 3463 const Register len = c_rarg1; 3464 const Register ct = c_rarg2; 3465 const Register out = c_rarg3; 3466 // and updated with the incremented counter in the end 3467 3468 const Register key = c_rarg4; 3469 const Register state = c_rarg5; 3470 3471 const Register subkeyHtbl = c_rarg6; 3472 3473 const Register counter = c_rarg7; 3474 3475 const Register keylen = r10; 3476 // Save state before entering routine 3477 __ sub(sp, sp, 4 * 16); 3478 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3479 __ sub(sp, sp, 4 * 16); 3480 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3481 3482 // __ andr(len, len, -512); 3483 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3484 __ str(len, __ pre(sp, -2 * wordSize)); 3485 3486 Label DONE; 3487 __ cbz(len, DONE); 3488 3489 // Compute #rounds for AES based on the length of the key array 3490 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3491 3492 __ aesenc_loadkeys(key, keylen); 3493 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3494 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3495 3496 // AES/CTR loop 3497 { 3498 Label L_CTR_loop; 3499 __ BIND(L_CTR_loop); 3500 3501 // Setup the counters 3502 __ movi(v8, __ T4S, 0); 3503 __ movi(v9, __ T4S, 1); 3504 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3505 3506 assert(v0->encoding() < v8->encoding(), ""); 3507 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3508 FloatRegister f = as_FloatRegister(i); 3509 __ rev32(f, __ T16B, v16); 3510 __ addv(v16, __ T4S, v16, v8); 3511 } 3512 3513 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3514 3515 // Encrypt the counters 3516 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3517 3518 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3519 3520 // XOR the encrypted counters with the inputs 3521 for (int i = 0; i < 8; i++) { 3522 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3523 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3524 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3525 } 3526 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3527 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3528 3529 __ subw(len, len, 16 * 8); 3530 __ cbnzw(len, L_CTR_loop); 3531 } 3532 3533 __ rev32(v16, __ T16B, v16); 3534 __ st1(v16, __ T16B, counter); 3535 3536 __ ldr(len, Address(sp)); 3537 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3538 3539 // GHASH/CTR loop 3540 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3541 len, /*unrolls*/4); 3542 3543 #ifdef ASSERT 3544 { Label L; 3545 __ cmp(len, (unsigned char)0); 3546 __ br(Assembler::EQ, L); 3547 __ stop("stubGenerator: abort"); 3548 __ bind(L); 3549 } 3550 #endif 3551 3552 __ 
bind(DONE); 3553 // Return the number of bytes processed 3554 __ ldr(r0, __ post(sp, 2 * wordSize)); 3555 3556 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3557 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3558 3559 __ leave(); // required for proper stackwalking of RuntimeStub frame 3560 __ ret(lr); 3561 3562 // bind label and generate polynomial data 3563 __ align(wordSize * 2); 3564 __ bind(ghash_polynomial); 3565 __ emit_int64(0x87); // The low-order bits of the field 3566 // polynomial (i.e. p = z^7+z^2+z+1) 3567 // repeated in the low and high parts of a 3568 // 128-bit vector 3569 __ emit_int64(0x87); 3570 3571 return start; 3572 } 3573 3574 class Cached64Bytes { 3575 private: 3576 MacroAssembler *_masm; 3577 Register _regs[8]; 3578 3579 public: 3580 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3581 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3582 auto it = rs.begin(); 3583 for (auto &r: _regs) { 3584 r = *it; 3585 ++it; 3586 } 3587 } 3588 3589 void gen_loads(Register base) { 3590 for (int i = 0; i < 8; i += 2) { 3591 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3592 } 3593 } 3594 3595 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3596 void extract_u32(Register dest, int i) { 3597 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3598 } 3599 }; 3600 3601 // Utility routines for md5. 3602 // Clobbers r10 and r11. 3603 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3604 int k, int s, int t) { 3605 Register rscratch3 = r10; 3606 Register rscratch4 = r11; 3607 3608 __ eorw(rscratch3, r3, r4); 3609 __ movw(rscratch2, t); 3610 __ andw(rscratch3, rscratch3, r2); 3611 __ addw(rscratch4, r1, rscratch2); 3612 reg_cache.extract_u32(rscratch1, k); 3613 __ eorw(rscratch3, rscratch3, r4); 3614 __ addw(rscratch4, rscratch4, rscratch1); 3615 __ addw(rscratch3, rscratch3, rscratch4); 3616 __ rorw(rscratch2, rscratch3, 32 - s); 3617 __ addw(r1, rscratch2, r2); 3618 } 3619 3620 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3621 int k, int s, int t) { 3622 Register rscratch3 = r10; 3623 Register rscratch4 = r11; 3624 3625 reg_cache.extract_u32(rscratch1, k); 3626 __ movw(rscratch2, t); 3627 __ addw(rscratch4, r1, rscratch2); 3628 __ addw(rscratch4, rscratch4, rscratch1); 3629 __ bicw(rscratch2, r3, r4); 3630 __ andw(rscratch3, r2, r4); 3631 __ addw(rscratch2, rscratch2, rscratch4); 3632 __ addw(rscratch2, rscratch2, rscratch3); 3633 __ rorw(rscratch2, rscratch2, 32 - s); 3634 __ addw(r1, rscratch2, r2); 3635 } 3636 3637 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3638 int k, int s, int t) { 3639 Register rscratch3 = r10; 3640 Register rscratch4 = r11; 3641 3642 __ eorw(rscratch3, r3, r4); 3643 __ movw(rscratch2, t); 3644 __ addw(rscratch4, r1, rscratch2); 3645 reg_cache.extract_u32(rscratch1, k); 3646 __ eorw(rscratch3, rscratch3, r2); 3647 __ addw(rscratch4, rscratch4, rscratch1); 3648 __ addw(rscratch3, rscratch3, rscratch4); 3649 __ rorw(rscratch2, rscratch3, 32 - s); 3650 __ addw(r1, rscratch2, r2); 3651 } 3652 3653 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3654 int k, int s, int t) { 3655 Register rscratch3 = r10; 3656 Register rscratch4 = r11; 3657 3658 __ movw(rscratch3, t); 3659 __ ornw(rscratch2, r2, r4); 3660 __ addw(rscratch4, r1, rscratch3); 3661 reg_cache.extract_u32(rscratch1, k); 3662 __ 
eorw(rscratch3, rscratch2, r3); 3663 __ addw(rscratch4, rscratch4, rscratch1); 3664 __ addw(rscratch3, rscratch3, rscratch4); 3665 __ rorw(rscratch2, rscratch3, 32 - s); 3666 __ addw(r1, rscratch2, r2); 3667 } 3668 3669 // Arguments: 3670 // 3671 // Inputs: 3672 // c_rarg0 - byte[] source+offset 3673 // c_rarg1 - int[] SHA.state 3674 // c_rarg2 - int offset 3675 // c_rarg3 - int limit 3676 // 3677 address generate_md5_implCompress(StubId stub_id) { 3678 bool multi_block; 3679 switch (stub_id) { 3680 case StubId::stubgen_md5_implCompress_id: 3681 multi_block = false; 3682 break; 3683 case StubId::stubgen_md5_implCompressMB_id: 3684 multi_block = true; 3685 break; 3686 default: 3687 ShouldNotReachHere(); 3688 } 3689 __ align(CodeEntryAlignment); 3690 3691 StubCodeMark mark(this, stub_id); 3692 address start = __ pc(); 3693 3694 Register buf = c_rarg0; 3695 Register state = c_rarg1; 3696 Register ofs = c_rarg2; 3697 Register limit = c_rarg3; 3698 Register a = r4; 3699 Register b = r5; 3700 Register c = r6; 3701 Register d = r7; 3702 Register rscratch3 = r10; 3703 Register rscratch4 = r11; 3704 3705 Register state_regs[2] = { r12, r13 }; 3706 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3707 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3708 3709 __ push(saved_regs, sp); 3710 3711 __ ldp(state_regs[0], state_regs[1], Address(state)); 3712 __ ubfx(a, state_regs[0], 0, 32); 3713 __ ubfx(b, state_regs[0], 32, 32); 3714 __ ubfx(c, state_regs[1], 0, 32); 3715 __ ubfx(d, state_regs[1], 32, 32); 3716 3717 Label md5_loop; 3718 __ BIND(md5_loop); 3719 3720 reg_cache.gen_loads(buf); 3721 3722 // Round 1 3723 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3724 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3725 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3726 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3727 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3728 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3729 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3730 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3731 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3732 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3733 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3734 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3735 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3736 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3737 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3738 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3739 3740 // Round 2 3741 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3742 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3743 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3744 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3745 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3746 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3747 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3748 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3749 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3750 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3751 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3752 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3753 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3754 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3755 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3756 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3757 3758 // Round 3 3759 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3760 md5_HH(reg_cache, d, a, 
b, c, 8, 11, 0x8771f681); 3761 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3762 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3763 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3764 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3765 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3766 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3767 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3768 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3769 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3770 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3771 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3772 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3773 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3774 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3775 3776 // Round 4 3777 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3778 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3779 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3780 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3781 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3782 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3783 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3784 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3785 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3786 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3787 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3788 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3789 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3790 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3791 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3792 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3793 3794 __ addw(a, state_regs[0], a); 3795 __ ubfx(rscratch2, state_regs[0], 32, 32); 3796 __ addw(b, rscratch2, b); 3797 __ addw(c, state_regs[1], c); 3798 __ ubfx(rscratch4, state_regs[1], 32, 32); 3799 __ addw(d, rscratch4, d); 3800 3801 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3802 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3803 3804 if (multi_block) { 3805 __ add(buf, buf, 64); 3806 __ add(ofs, ofs, 64); 3807 __ cmp(ofs, limit); 3808 __ br(Assembler::LE, md5_loop); 3809 __ mov(c_rarg0, ofs); // return ofs 3810 } 3811 3812 // write hash values back in the correct order 3813 __ stp(state_regs[0], state_regs[1], Address(state)); 3814 3815 __ pop(saved_regs, sp); 3816 3817 __ ret(lr); 3818 3819 return start; 3820 } 3821 3822 // Arguments: 3823 // 3824 // Inputs: 3825 // c_rarg0 - byte[] source+offset 3826 // c_rarg1 - int[] SHA.state 3827 // c_rarg2 - int offset 3828 // c_rarg3 - int limit 3829 // 3830 address generate_sha1_implCompress(StubId stub_id) { 3831 bool multi_block; 3832 switch (stub_id) { 3833 case StubId::stubgen_sha1_implCompress_id: 3834 multi_block = false; 3835 break; 3836 case StubId::stubgen_sha1_implCompressMB_id: 3837 multi_block = true; 3838 break; 3839 default: 3840 ShouldNotReachHere(); 3841 } 3842 3843 __ align(CodeEntryAlignment); 3844 3845 StubCodeMark mark(this, stub_id); 3846 address start = __ pc(); 3847 3848 Register buf = c_rarg0; 3849 Register state = c_rarg1; 3850 Register ofs = c_rarg2; 3851 Register limit = c_rarg3; 3852 3853 Label keys; 3854 Label sha1_loop; 3855 3856 // load the keys into v0..v3 3857 __ adr(rscratch1, keys); 3858 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3859 // load 5 words state into v6, v7 3860 __ ldrq(v6, Address(state, 0)); 3861 __ ldrs(v7, Address(state, 16)); 3862 3863 3864 __ BIND(sha1_loop); 3865 // load 64 bytes of data into 
v16..v19 3866 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3867 __ rev32(v16, __ T16B, v16); 3868 __ rev32(v17, __ T16B, v17); 3869 __ rev32(v18, __ T16B, v18); 3870 __ rev32(v19, __ T16B, v19); 3871 3872 // do the sha1 3873 __ addv(v4, __ T4S, v16, v0); 3874 __ orr(v20, __ T16B, v6, v6); 3875 3876 FloatRegister d0 = v16; 3877 FloatRegister d1 = v17; 3878 FloatRegister d2 = v18; 3879 FloatRegister d3 = v19; 3880 3881 for (int round = 0; round < 20; round++) { 3882 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3883 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3884 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3885 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3886 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3887 3888 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3889 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3890 __ sha1h(tmp2, __ T4S, v20); 3891 if (round < 5) 3892 __ sha1c(v20, __ T4S, tmp3, tmp4); 3893 else if (round < 10 || round >= 15) 3894 __ sha1p(v20, __ T4S, tmp3, tmp4); 3895 else 3896 __ sha1m(v20, __ T4S, tmp3, tmp4); 3897 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3898 3899 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3900 } 3901 3902 __ addv(v7, __ T2S, v7, v21); 3903 __ addv(v6, __ T4S, v6, v20); 3904 3905 if (multi_block) { 3906 __ add(ofs, ofs, 64); 3907 __ cmp(ofs, limit); 3908 __ br(Assembler::LE, sha1_loop); 3909 __ mov(c_rarg0, ofs); // return ofs 3910 } 3911 3912 __ strq(v6, Address(state, 0)); 3913 __ strs(v7, Address(state, 16)); 3914 3915 __ ret(lr); 3916 3917 __ bind(keys); 3918 __ emit_int32(0x5a827999); 3919 __ emit_int32(0x6ed9eba1); 3920 __ emit_int32(0x8f1bbcdc); 3921 __ emit_int32(0xca62c1d6); 3922 3923 return start; 3924 } 3925 3926 3927 // Arguments: 3928 // 3929 // Inputs: 3930 // c_rarg0 - byte[] source+offset 3931 // c_rarg1 - int[] SHA.state 3932 // c_rarg2 - int offset 3933 // c_rarg3 - int limit 3934 // 3935 address generate_sha256_implCompress(StubId stub_id) { 3936 bool multi_block; 3937 switch (stub_id) { 3938 case StubId::stubgen_sha256_implCompress_id: 3939 multi_block = false; 3940 break; 3941 case StubId::stubgen_sha256_implCompressMB_id: 3942 multi_block = true; 3943 break; 3944 default: 3945 ShouldNotReachHere(); 3946 } 3947 3948 static const uint32_t round_consts[64] = { 3949 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3950 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3951 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3952 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3953 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3954 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3955 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3956 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3957 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3958 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3959 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3960 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3961 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3962 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3963 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3964 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3965 }; 3966 3967 __ align(CodeEntryAlignment); 3968 3969 StubCodeMark mark(this, stub_id); 3970 address start = __ pc(); 3971 3972 Register buf = c_rarg0; 3973 Register state = c_rarg1; 3974 Register ofs = c_rarg2; 3975 Register limit = c_rarg3; 3976 3977 Label sha1_loop; 3978 3979 __ stpd(v8, v9, __ pre(sp, -32)); 3980 __ stpd(v10, v11, Address(sp, 
16)); 3981 3982 // dga == v0 3983 // dgb == v1 3984 // dg0 == v2 3985 // dg1 == v3 3986 // dg2 == v4 3987 // t0 == v6 3988 // t1 == v7 3989 3990 // load 16 keys to v16..v31 3991 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3992 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3993 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3994 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3995 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3996 3997 // load 8 words (256 bits) state 3998 __ ldpq(v0, v1, state); 3999 4000 __ BIND(sha1_loop); 4001 // load 64 bytes of data into v8..v11 4002 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 4003 __ rev32(v8, __ T16B, v8); 4004 __ rev32(v9, __ T16B, v9); 4005 __ rev32(v10, __ T16B, v10); 4006 __ rev32(v11, __ T16B, v11); 4007 4008 __ addv(v6, __ T4S, v8, v16); 4009 __ orr(v2, __ T16B, v0, v0); 4010 __ orr(v3, __ T16B, v1, v1); 4011 4012 FloatRegister d0 = v8; 4013 FloatRegister d1 = v9; 4014 FloatRegister d2 = v10; 4015 FloatRegister d3 = v11; 4016 4017 4018 for (int round = 0; round < 16; round++) { 4019 FloatRegister tmp1 = (round & 1) ? v6 : v7; 4020 FloatRegister tmp2 = (round & 1) ? v7 : v6; 4021 FloatRegister tmp3 = (round & 1) ? v2 : v4; 4022 FloatRegister tmp4 = (round & 1) ? v4 : v2; 4023 4024 if (round < 12) __ sha256su0(d0, __ T4S, d1); 4025 __ orr(v4, __ T16B, v2, v2); 4026 if (round < 15) 4027 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 4028 __ sha256h(v2, __ T4S, v3, tmp2); 4029 __ sha256h2(v3, __ T4S, v4, tmp2); 4030 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 4031 4032 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 4033 } 4034 4035 __ addv(v0, __ T4S, v0, v2); 4036 __ addv(v1, __ T4S, v1, v3); 4037 4038 if (multi_block) { 4039 __ add(ofs, ofs, 64); 4040 __ cmp(ofs, limit); 4041 __ br(Assembler::LE, sha1_loop); 4042 __ mov(c_rarg0, ofs); // return ofs 4043 } 4044 4045 __ ldpd(v10, v11, Address(sp, 16)); 4046 __ ldpd(v8, v9, __ post(sp, 32)); 4047 4048 __ stpq(v0, v1, state); 4049 4050 __ ret(lr); 4051 4052 return start; 4053 } 4054 4055 // Double rounds for sha512. 
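  // For reference only (not generated or compiled code): the SHA-512
  // message-schedule update that the sha512su0/sha512su1 pair below helps
  // compute, written at the C level using FIPS 180-4 notation. The helper
  // names rotr64 and sha512_schedule are illustrative.
  //
  //   static inline uint64_t rotr64(uint64_t x, int n) {
  //     return (x >> n) | (x << (64 - n));
  //   }
  //   // W[t] for t >= 16, given W[t-2], W[t-7], W[t-15] and W[t-16]
  //   uint64_t sha512_schedule(uint64_t w2, uint64_t w7, uint64_t w15, uint64_t w16) {
  //     uint64_t s0 = rotr64(w15, 1) ^ rotr64(w15, 8) ^ (w15 >> 7);  // sigma0
  //     uint64_t s1 = rotr64(w2, 19) ^ rotr64(w2, 61) ^ (w2 >> 6);   // sigma1
  //     return w16 + s0 + w7 + s1;
  //   }
  //
  // Each call to sha512_dround() advances the hash state by two of the 80
  // rounds (the vectors are processed as 2x64-bit lanes) and, for dr < 32,
  // also extends the message schedule for use by a later round.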
4056 void sha512_dround(int dr, 4057 FloatRegister vi0, FloatRegister vi1, 4058 FloatRegister vi2, FloatRegister vi3, 4059 FloatRegister vi4, FloatRegister vrc0, 4060 FloatRegister vrc1, FloatRegister vin0, 4061 FloatRegister vin1, FloatRegister vin2, 4062 FloatRegister vin3, FloatRegister vin4) { 4063 if (dr < 36) { 4064 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 4065 } 4066 __ addv(v5, __ T2D, vrc0, vin0); 4067 __ ext(v6, __ T16B, vi2, vi3, 8); 4068 __ ext(v5, __ T16B, v5, v5, 8); 4069 __ ext(v7, __ T16B, vi1, vi2, 8); 4070 __ addv(vi3, __ T2D, vi3, v5); 4071 if (dr < 32) { 4072 __ ext(v5, __ T16B, vin3, vin4, 8); 4073 __ sha512su0(vin0, __ T2D, vin1); 4074 } 4075 __ sha512h(vi3, __ T2D, v6, v7); 4076 if (dr < 32) { 4077 __ sha512su1(vin0, __ T2D, vin2, v5); 4078 } 4079 __ addv(vi4, __ T2D, vi1, vi3); 4080 __ sha512h2(vi3, __ T2D, vi1, vi0); 4081 } 4082 4083 // Arguments: 4084 // 4085 // Inputs: 4086 // c_rarg0 - byte[] source+offset 4087 // c_rarg1 - int[] SHA.state 4088 // c_rarg2 - int offset 4089 // c_rarg3 - int limit 4090 // 4091 address generate_sha512_implCompress(StubId stub_id) { 4092 bool multi_block; 4093 switch (stub_id) { 4094 case StubId::stubgen_sha512_implCompress_id: 4095 multi_block = false; 4096 break; 4097 case StubId::stubgen_sha512_implCompressMB_id: 4098 multi_block = true; 4099 break; 4100 default: 4101 ShouldNotReachHere(); 4102 } 4103 4104 static const uint64_t round_consts[80] = { 4105 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 4106 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 4107 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 4108 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 4109 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 4110 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 4111 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 4112 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 4113 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 4114 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 4115 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 4116 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 4117 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 4118 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 4119 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 4120 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 4121 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 4122 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 4123 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 4124 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 4125 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 4126 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 4127 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 4128 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 4129 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 4130 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 4131 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 4132 }; 4133 4134 __ align(CodeEntryAlignment); 4135 4136 StubCodeMark mark(this, stub_id); 4137 address start = __ pc(); 4138 4139 Register buf = c_rarg0; 4140 Register state = c_rarg1; 4141 Register ofs = c_rarg2; 4142 Register limit = c_rarg3; 4143 4144 __ stpd(v8, v9, __ pre(sp, -64)); 4145 __ 
stpd(v10, v11, Address(sp, 16)); 4146 __ stpd(v12, v13, Address(sp, 32)); 4147 __ stpd(v14, v15, Address(sp, 48)); 4148 4149 Label sha512_loop; 4150 4151 // load state 4152 __ ld1(v8, v9, v10, v11, __ T2D, state); 4153 4154 // load first 4 round constants 4155 __ lea(rscratch1, ExternalAddress((address)round_consts)); 4156 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 4157 4158 __ BIND(sha512_loop); 4159 // load 128B of data into v12..v19 4160 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 4161 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 4162 __ rev64(v12, __ T16B, v12); 4163 __ rev64(v13, __ T16B, v13); 4164 __ rev64(v14, __ T16B, v14); 4165 __ rev64(v15, __ T16B, v15); 4166 __ rev64(v16, __ T16B, v16); 4167 __ rev64(v17, __ T16B, v17); 4168 __ rev64(v18, __ T16B, v18); 4169 __ rev64(v19, __ T16B, v19); 4170 4171 __ mov(rscratch2, rscratch1); 4172 4173 __ mov(v0, __ T16B, v8); 4174 __ mov(v1, __ T16B, v9); 4175 __ mov(v2, __ T16B, v10); 4176 __ mov(v3, __ T16B, v11); 4177 4178 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4179 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4180 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4181 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4182 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4183 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4184 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4185 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4186 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4187 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4188 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4189 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4190 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4191 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 4192 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4193 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4194 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4195 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4196 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4197 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4198 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4199 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4200 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4201 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4202 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4203 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4204 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4205 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4206 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4207 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4208 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4209 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4210 
sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 4211 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4212 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4213 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4214 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4215 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4216 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4217 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4218 4219 __ addv(v8, __ T2D, v8, v0); 4220 __ addv(v9, __ T2D, v9, v1); 4221 __ addv(v10, __ T2D, v10, v2); 4222 __ addv(v11, __ T2D, v11, v3); 4223 4224 if (multi_block) { 4225 __ add(ofs, ofs, 128); 4226 __ cmp(ofs, limit); 4227 __ br(Assembler::LE, sha512_loop); 4228 __ mov(c_rarg0, ofs); // return ofs 4229 } 4230 4231 __ st1(v8, v9, v10, v11, __ T2D, state); 4232 4233 __ ldpd(v14, v15, Address(sp, 48)); 4234 __ ldpd(v12, v13, Address(sp, 32)); 4235 __ ldpd(v10, v11, Address(sp, 16)); 4236 __ ldpd(v8, v9, __ post(sp, 64)); 4237 4238 __ ret(lr); 4239 4240 return start; 4241 } 4242 4243 // Execute one round of keccak of two computations in parallel. 4244 // One of the states should be loaded into the lower halves of 4245 // the vector registers v0-v24, the other should be loaded into 4246 // the upper halves of those registers. The ld1r instruction loads 4247 // the round constant into both halves of register v31. 4248 // Intermediate results c0...c5 and d0...d5 are computed 4249 // in registers v25...v30. 4250 // All vector instructions that are used operate on both register 4251 // halves in parallel. 4252 // If only a single computation is needed, one can only load the lower halves. 
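  // For reference only (not generated or compiled code): a C-level model of
  // the column-parity (theta) step that the eor3/rax1 block at the top of
  // keccak_round() computes. The names a/c/d and rotl64 are illustrative;
  // a[] is the 5x5 lane array indexed as a[x + 5*y].
  //
  //   static inline uint64_t rotl64(uint64_t v, int n) {
  //     return (v << n) | (v >> (64 - n));
  //   }
  //   void keccak_theta(uint64_t a[25]) {
  //     uint64_t c[5], d[5];
  //     for (int x = 0; x < 5; x++) {
  //       c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
  //     }
  //     for (int x = 0; x < 5; x++) {
  //       d[x] = c[(x + 4) % 5] ^ rotl64(c[(x + 1) % 5], 1);
  //       for (int y = 0; y < 25; y += 5) {
  //         a[x + y] ^= d[x];
  //       }
  //     }
  //   }
  //
  // In the generated code the a ^= d xor is folded into the xar rotations
  // (rho/pi), and the bcax instructions then apply the chi step, all on two
  // independent states held in the low and high vector halves.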
4253 void keccak_round(Register rscratch1) {
4254 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
4255 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
4256 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4257 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4258 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4259 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4260 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4261 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4262 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4263 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4264
4265 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4266 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4267 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4268 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4269 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4270
4271 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4272 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
4273 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4274 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4275 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4276 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4277 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4278 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4279 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4280 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4281 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4282 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4283 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4284 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4285 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4286 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4287 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4288 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4289 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4290 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4291 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4292 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4293 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4294 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4295 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4296
4297 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22')
4298 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4299 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4300 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4301 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4302
4303 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4304
4305 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4306 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15')
4307 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4308 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4309 __ bcax(v16, __ T16B, v16, v3, v25);
// a16 = a16 ^ (~a17' & a18') 4310 4311 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4312 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4313 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4314 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4315 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4316 4317 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4318 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4319 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4320 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4321 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4322 4323 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4324 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4325 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4326 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4327 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4328 4329 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4330 } 4331 4332 // Arguments: 4333 // 4334 // Inputs: 4335 // c_rarg0 - byte[] source+offset 4336 // c_rarg1 - byte[] SHA.state 4337 // c_rarg2 - int block_size 4338 // c_rarg3 - int offset 4339 // c_rarg4 - int limit 4340 // 4341 address generate_sha3_implCompress(StubId stub_id) { 4342 bool multi_block; 4343 switch (stub_id) { 4344 case StubId::stubgen_sha3_implCompress_id: 4345 multi_block = false; 4346 break; 4347 case StubId::stubgen_sha3_implCompressMB_id: 4348 multi_block = true; 4349 break; 4350 default: 4351 ShouldNotReachHere(); 4352 } 4353 4354 static const uint64_t round_consts[24] = { 4355 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4356 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4357 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4358 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4359 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4360 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4361 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4362 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4363 }; 4364 4365 __ align(CodeEntryAlignment); 4366 4367 StubCodeMark mark(this, stub_id); 4368 address start = __ pc(); 4369 4370 Register buf = c_rarg0; 4371 Register state = c_rarg1; 4372 Register block_size = c_rarg2; 4373 Register ofs = c_rarg3; 4374 Register limit = c_rarg4; 4375 4376 Label sha3_loop, rounds24_loop; 4377 Label sha3_512_or_sha3_384, shake128; 4378 4379 __ stpd(v8, v9, __ pre(sp, -64)); 4380 __ stpd(v10, v11, Address(sp, 16)); 4381 __ stpd(v12, v13, Address(sp, 32)); 4382 __ stpd(v14, v15, Address(sp, 48)); 4383 4384 // load state 4385 __ add(rscratch1, state, 32); 4386 __ ld1(v0, v1, v2, v3, __ T1D, state); 4387 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4388 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4389 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4390 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4391 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4392 __ ld1(v24, __ T1D, rscratch1); 4393 4394 __ BIND(sha3_loop); 4395 4396 // 24 keccak rounds 4397 __ movw(rscratch2, 24); 4398 4399 // load round_constants base 4400 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4401 4402 // load input 4403 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4404 __ ld1(v29, v30, v31, 
__ T8B, __ post(buf, 24)); 4405 __ eor(v0, __ T8B, v0, v25); 4406 __ eor(v1, __ T8B, v1, v26); 4407 __ eor(v2, __ T8B, v2, v27); 4408 __ eor(v3, __ T8B, v3, v28); 4409 __ eor(v4, __ T8B, v4, v29); 4410 __ eor(v5, __ T8B, v5, v30); 4411 __ eor(v6, __ T8B, v6, v31); 4412 4413 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4414 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4415 4416 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4417 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4418 __ eor(v7, __ T8B, v7, v25); 4419 __ eor(v8, __ T8B, v8, v26); 4420 __ eor(v9, __ T8B, v9, v27); 4421 __ eor(v10, __ T8B, v10, v28); 4422 __ eor(v11, __ T8B, v11, v29); 4423 __ eor(v12, __ T8B, v12, v30); 4424 __ eor(v13, __ T8B, v13, v31); 4425 4426 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4427 __ eor(v14, __ T8B, v14, v25); 4428 __ eor(v15, __ T8B, v15, v26); 4429 __ eor(v16, __ T8B, v16, v27); 4430 4431 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4432 __ andw(c_rarg5, block_size, 48); 4433 __ cbzw(c_rarg5, rounds24_loop); 4434 4435 __ tbnz(block_size, 5, shake128); 4436 // block_size == 144, bit5 == 0, SHA3-224 4437 __ ldrd(v28, __ post(buf, 8)); 4438 __ eor(v17, __ T8B, v17, v28); 4439 __ b(rounds24_loop); 4440 4441 __ BIND(shake128); 4442 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4443 __ eor(v17, __ T8B, v17, v28); 4444 __ eor(v18, __ T8B, v18, v29); 4445 __ eor(v19, __ T8B, v19, v30); 4446 __ eor(v20, __ T8B, v20, v31); 4447 __ b(rounds24_loop); // block_size == 168, SHAKE128 4448 4449 __ BIND(sha3_512_or_sha3_384); 4450 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4451 __ eor(v7, __ T8B, v7, v25); 4452 __ eor(v8, __ T8B, v8, v26); 4453 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4454 4455 // SHA3-384 4456 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4457 __ eor(v9, __ T8B, v9, v27); 4458 __ eor(v10, __ T8B, v10, v28); 4459 __ eor(v11, __ T8B, v11, v29); 4460 __ eor(v12, __ T8B, v12, v30); 4461 4462 __ BIND(rounds24_loop); 4463 __ subw(rscratch2, rscratch2, 1); 4464 4465 keccak_round(rscratch1); 4466 4467 __ cbnzw(rscratch2, rounds24_loop); 4468 4469 if (multi_block) { 4470 __ add(ofs, ofs, block_size); 4471 __ cmp(ofs, limit); 4472 __ br(Assembler::LE, sha3_loop); 4473 __ mov(c_rarg0, ofs); // return ofs 4474 } 4475 4476 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4477 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4478 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4479 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4480 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4481 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4482 __ st1(v24, __ T1D, state); 4483 4484 // restore callee-saved registers 4485 __ ldpd(v14, v15, Address(sp, 48)); 4486 __ ldpd(v12, v13, Address(sp, 32)); 4487 __ ldpd(v10, v11, Address(sp, 16)); 4488 __ ldpd(v8, v9, __ post(sp, 64)); 4489 4490 __ ret(lr); 4491 4492 return start; 4493 } 4494 4495 // Inputs: 4496 // c_rarg0 - long[] state0 4497 // c_rarg1 - long[] state1 4498 address generate_double_keccak() { 4499 static const uint64_t round_consts[24] = { 4500 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4501 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4502 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4503 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4504 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4505 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4506 
0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4507 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4508 };
4509
4510 // Implements the double_keccak() method of the
4511 // sun.security.provider.SHA3Parallel class
4512 __ align(CodeEntryAlignment);
4513 StubCodeMark mark(this, "StubRoutines", "double_keccak");
4514 address start = __ pc();
4515 __ enter();
4516
4517 Register state0 = c_rarg0;
4518 Register state1 = c_rarg1;
4519
4520 Label rounds24_loop;
4521
4522 // save callee-saved registers
4523 __ stpd(v8, v9, __ pre(sp, -64));
4524 __ stpd(v10, v11, Address(sp, 16));
4525 __ stpd(v12, v13, Address(sp, 32));
4526 __ stpd(v14, v15, Address(sp, 48));
4527
4528 // load states
4529 __ add(rscratch1, state0, 32);
4530 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
4531 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
4532 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
4533 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
4534 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
4535 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
4536 __ ld1(v24, __ D, 0, rscratch1);
4537 __ add(rscratch1, state1, 32);
4538 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
4539 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
4540 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
4541 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
4542 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
4543 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
4544 __ ld1(v24, __ D, 1, rscratch1);
4545
4546 // 24 keccak rounds
4547 __ movw(rscratch2, 24);
4548
4549 // load round_constants base
4550 __ lea(rscratch1, ExternalAddress((address) round_consts));
4551
4552 __ BIND(rounds24_loop);
4553 __ subw(rscratch2, rscratch2, 1);
4554 keccak_round(rscratch1);
4555 __ cbnzw(rscratch2, rounds24_loop);
4556
4557 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
4558 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
4559 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
4560 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
4561 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
4562 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
4563 __ st1(v24, __ D, 0, state0);
4564 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
4565 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
4566 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
4567 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
4568 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
4569 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
4570 __ st1(v24, __ D, 1, state1);
4571
4572 // restore callee-saved vector registers
4573 __ ldpd(v14, v15, Address(sp, 48));
4574 __ ldpd(v12, v13, Address(sp, 32));
4575 __ ldpd(v10, v11, Address(sp, 16));
4576 __ ldpd(v8, v9, __ post(sp, 64));
4577
4578 __ leave(); // required for proper stackwalking of RuntimeStub frame
4579 __ mov(r0, zr); // return 0
4580 __ ret(lr);
4581
4582 return start;
4583 }
4584
4585 // ChaCha20 block function. This version parallelizes the 32-bit
4586 // state elements on each of 16 vectors, producing 4 blocks of
4587 // keystream at a time.
4588 // 4589 // state (int[16]) = c_rarg0 4590 // keystream (byte[256]) = c_rarg1 4591 // return - number of bytes of produced keystream (always 256) 4592 // 4593 // This implementation takes each 32-bit integer from the state 4594 // array and broadcasts it across all 4 32-bit lanes of a vector register 4595 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes 4596 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors, 4597 // the quarter round schedule is implemented as outlined in RFC 7539 section 4598 // 2.3. However, instead of sequentially processing the 3 quarter round 4599 // operations represented by one QUARTERROUND function, we instead stack all 4600 // the adds, xors and left-rotations from the first 4 quarter rounds together 4601 // and then do the same for the second set of 4 quarter rounds. This removes 4602 // some latency that would otherwise be incurred by waiting for an add to 4603 // complete before performing an xor (which depends on the result of the 4604 // add), etc. An adjustment happens between the first and second groups of 4 4605 // quarter rounds, but this is done only in the inputs to the macro functions 4606 // that generate the assembly instructions - these adjustments themselves are 4607 // not part of the resulting assembly. 4608 // The 4 registers v0-v3 are used during the quarter round operations as 4609 // scratch registers. Once the 20 rounds are complete, these 4 scratch 4610 // registers become the vectors involved in adding the start state back onto 4611 // the post-QR working state. After the adds are complete, each of the 16 4612 // vectors write their first lane back to the keystream buffer, followed 4613 // by the second lane from all vectors and so on. 4614 address generate_chacha20Block_blockpar() { 4615 Label L_twoRounds, L_cc20_const; 4616 __ align(CodeEntryAlignment); 4617 StubId stub_id = StubId::stubgen_chacha20Block_id; 4618 StubCodeMark mark(this, stub_id); 4619 address start = __ pc(); 4620 __ enter(); 4621 4622 int i, j; 4623 const Register state = c_rarg0; 4624 const Register keystream = c_rarg1; 4625 const Register loopCtr = r10; 4626 const Register tmpAddr = r11; 4627 const FloatRegister ctrAddOverlay = v28; 4628 const FloatRegister lrot8Tbl = v29; 4629 4630 // Organize SIMD registers in an array that facilitates 4631 // putting repetitive opcodes into loop structures. It is 4632 // important that each grouping of 4 registers is monotonically 4633 // increasing to support the requirements of multi-register 4634 // instructions (e.g. ld4r, st4, etc.) 4635 const FloatRegister workSt[16] = { 4636 v4, v5, v6, v7, v16, v17, v18, v19, 4637 v20, v21, v22, v23, v24, v25, v26, v27 4638 }; 4639 4640 // Pull in constant data. The first 16 bytes are the add overlay 4641 // which is applied to the vector holding the counter (state[12]). 4642 // The second 16 bytes is the index register for the 8-bit left 4643 // rotation tbl instruction. 4644 __ adr(tmpAddr, L_cc20_const); 4645 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr)); 4646 4647 // Load from memory and interlace across 16 SIMD registers, 4648 // With each word from memory being broadcast to all lanes of 4649 // each successive SIMD register. 4650 // Addr(0) -> All lanes in workSt[i] 4651 // Addr(4) -> All lanes workSt[i + 1], etc. 
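    // For reference only (not generated or compiled code): the scalar
    // ChaCha20 quarter round from RFC 7539 that the cc20_qr_* bundles below
    // evaluate on four columns (or diagonals) at once, one 32-bit lane per
    // block of keystream. The names a/b/c/d and rotl32 are illustrative.
    //
    //   static inline uint32_t rotl32(uint32_t v, int n) {
    //     return (v << n) | (v >> (32 - n));
    //   }
    //   void cc20_quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
    //     *a += *b; *d ^= *a; *d = rotl32(*d, 16);
    //     *c += *d; *b ^= *c; *b = rotl32(*b, 12);
    //     *a += *b; *d ^= *a; *d = rotl32(*d, 8);
    //     *c += *d; *b ^= *c; *b = rotl32(*b, 7);
    //   }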
4652 __ mov(tmpAddr, state);
4653 for (i = 0; i < 16; i += 4) {
4654 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4655 __ post(tmpAddr, 16));
4656 }
4657 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4658
4659 // Before entering the loop, create 5 4-register arrays. These
4660 // will hold the 4 registers that represent the a/b/c/d fields
4661 // in the quarter round operation. For instance the "b" field
4662 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
4663 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
4664 // since it is part of a diagonal organization. The aSet and scratch
4665 // register sets are defined at declaration time because they do not change
4666 // organization at any point during the 20-round processing.
4667 FloatRegister aSet[4] = { v4, v5, v6, v7 };
4668 FloatRegister bSet[4];
4669 FloatRegister cSet[4];
4670 FloatRegister dSet[4];
4671 FloatRegister scratch[4] = { v0, v1, v2, v3 };
4672
4673 // Set up the 10 iteration loop and perform all 8 quarter round ops
4674 __ mov(loopCtr, 10);
4675 __ BIND(L_twoRounds);
4676
4677 // Set to columnar organization and do the following 4 quarter-rounds:
4678 // QUARTERROUND(0, 4, 8, 12)
4679 // QUARTERROUND(1, 5, 9, 13)
4680 // QUARTERROUND(2, 6, 10, 14)
4681 // QUARTERROUND(3, 7, 11, 15)
4682 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
4683 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
4684 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
4685
4686 __ cc20_qr_add4(aSet, bSet); // a += b
4687 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4688 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4689
4690 __ cc20_qr_add4(cSet, dSet); // c += d
4691 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4692 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4693
4694 __ cc20_qr_add4(aSet, bSet); // a += b
4695 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4696 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4697
4698 __ cc20_qr_add4(cSet, dSet); // c += d
4699 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4700 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
4701
4702 // Set to diagonal organization and do the next 4 quarter-rounds:
4703 // QUARTERROUND(0, 5, 10, 15)
4704 // QUARTERROUND(1, 6, 11, 12)
4705 // QUARTERROUND(2, 7, 8, 13)
4706 // QUARTERROUND(3, 4, 9, 14)
4707 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
4708 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
4709 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
4710
4711 __ cc20_qr_add4(aSet, bSet); // a += b
4712 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4713 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4714
4715 __ cc20_qr_add4(cSet, dSet); // c += d
4716 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4717 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4718
4719 __ cc20_qr_add4(aSet, bSet); // a += b
4720 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4721 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4722
4723 __ cc20_qr_add4(cSet, dSet); // c += d
4724 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4725 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
4726
4727 // Decrement and iterate
4728 __ sub(loopCtr, loopCtr, 1);
4729 __ cbnz(loopCtr, L_twoRounds);
4730
4731 __ mov(tmpAddr, state);
4732
4733 // Add the starting state back to the post-loop keystream
4734 // state.
We read/interlace the state array from memory into 4735 // 4 registers similar to what we did in the beginning. Then 4736 // add the counter overlay onto workSt[12] at the end. 4737 for (i = 0; i < 16; i += 4) { 4738 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16)); 4739 __ addv(workSt[i], __ T4S, workSt[i], v0); 4740 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1); 4741 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2); 4742 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3); 4743 } 4744 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay 4745 4746 // Write working state into the keystream buffer. This is accomplished 4747 // by taking the lane "i" from each of the four vectors and writing 4748 // it to consecutive 4-byte offsets, then post-incrementing by 16 and 4749 // repeating with the next 4 vectors until all 16 vectors have been used. 4750 // Then move to the next lane and repeat the process until all lanes have 4751 // been written. 4752 for (i = 0; i < 4; i++) { 4753 for (j = 0; j < 16; j += 4) { 4754 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4755 __ post(keystream, 16)); 4756 } 4757 } 4758 4759 __ mov(r0, 256); // Return length of output keystream 4760 __ leave(); 4761 __ ret(lr); 4762 4763 // bind label and generate local constant data used by this stub 4764 // The constant data is broken into two 128-bit segments to be loaded 4765 // onto FloatRegisters. The first 128 bits are a counter add overlay 4766 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4767 // The second 128-bits is a table constant used for 8-bit left rotations. 4768 __ BIND(L_cc20_const); 4769 __ emit_int64(0x0000000100000000UL); 4770 __ emit_int64(0x0000000300000002UL); 4771 __ emit_int64(0x0605040702010003UL); 4772 __ emit_int64(0x0E0D0C0F0A09080BUL); 4773 4774 return start; 4775 } 4776 4777 // Helpers to schedule parallel operation bundles across vector 4778 // register sequences of size 2, 4 or 8. 
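  // For example (illustrative only; va, vb and vc stand for VSeq<4>
  // sequences, and the exact VSeq constructors are defined elsewhere), a
  // single bundled call such as
  //
  //   vs_addv(va, __ T4S, vb, vc);
  //
  // simply unrolls to four element-wise SIMD instructions:
  //
  //   addv(va[0], T4S, vb[0], vc[0]);
  //   addv(va[1], T4S, vb[1], vc[1]);
  //   addv(va[2], T4S, vb[2], vc[2]);
  //   addv(va[3], T4S, vb[3], vc[3]);
  //
  // plus asserts checking that the destination sequence is not a broadcast
  // constant and that no destination element is written before a pending
  // input element has been read.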
4779 4780 // Implement various primitive computations across vector sequences 4781 4782 template<int N> 4783 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4784 const VSeq<N>& v1, const VSeq<N>& v2) { 4785 // output must not be constant 4786 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4787 // output cannot overwrite pending inputs 4788 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4789 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4790 for (int i = 0; i < N; i++) { 4791 __ addv(v[i], T, v1[i], v2[i]); 4792 } 4793 } 4794 4795 template<int N> 4796 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4797 const VSeq<N>& v1, const VSeq<N>& v2) { 4798 // output must not be constant 4799 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4800 // output cannot overwrite pending inputs 4801 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4802 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4803 for (int i = 0; i < N; i++) { 4804 __ subv(v[i], T, v1[i], v2[i]); 4805 } 4806 } 4807 4808 template<int N> 4809 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4810 const VSeq<N>& v1, const VSeq<N>& v2) { 4811 // output must not be constant 4812 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4813 // output cannot overwrite pending inputs 4814 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4815 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4816 for (int i = 0; i < N; i++) { 4817 __ mulv(v[i], T, v1[i], v2[i]); 4818 } 4819 } 4820 4821 template<int N> 4822 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) { 4823 // output must not be constant 4824 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4825 // output cannot overwrite pending inputs 4826 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4827 for (int i = 0; i < N; i++) { 4828 __ negr(v[i], T, v1[i]); 4829 } 4830 } 4831 4832 template<int N> 4833 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4834 const VSeq<N>& v1, int shift) { 4835 // output must not be constant 4836 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4837 // output cannot overwrite pending inputs 4838 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4839 for (int i = 0; i < N; i++) { 4840 __ sshr(v[i], T, v1[i], shift); 4841 } 4842 } 4843 4844 template<int N> 4845 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4846 // output must not be constant 4847 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4848 // output cannot overwrite pending inputs 4849 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4850 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4851 for (int i = 0; i < N; i++) { 4852 __ andr(v[i], __ T16B, v1[i], v2[i]); 4853 } 4854 } 4855 4856 template<int N> 4857 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4858 // output must not be constant 4859 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4860 // output cannot overwrite pending inputs 4861 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4862 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4863 for (int 
i = 0; i < N; i++) { 4864 __ orr(v[i], __ T16B, v1[i], v2[i]); 4865 } 4866 } 4867 4868 template<int N> 4869 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) { 4870 // output must not be constant 4871 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4872 // output cannot overwrite pending inputs 4873 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4874 for (int i = 0; i < N; i++) { 4875 __ notr(v[i], __ T16B, v1[i]); 4876 } 4877 } 4878 4879 template<int N> 4880 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) { 4881 // output must not be constant 4882 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4883 // output cannot overwrite pending inputs 4884 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4885 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4886 for (int i = 0; i < N; i++) { 4887 __ sqdmulh(v[i], T, v1[i], v2[i]); 4888 } 4889 } 4890 4891 template<int N> 4892 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) { 4893 // output must not be constant 4894 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector"); 4895 // output cannot overwrite pending inputs 4896 assert(!vs_write_before_read(v, v1), "output overwrites input"); 4897 assert(!vs_write_before_read(v, v2), "output overwrites input"); 4898 for (int i = 0; i < N; i++) { 4899 __ mlsv(v[i], T, v1[i], v2[i]); 4900 } 4901 } 4902 4903 // load N/2 successive pairs of quadword values from memory in order 4904 // into N successive vector registers of the sequence via the 4905 // address supplied in base. 4906 template<int N> 4907 void vs_ldpq(const VSeq<N>& v, Register base) { 4908 for (int i = 0; i < N; i += 2) { 4909 __ ldpq(v[i], v[i+1], Address(base, 32 * i)); 4910 } 4911 } 4912 4913 // load N/2 successive pairs of quadword values from memory in order 4914 // into N vector registers of the sequence via the address supplied 4915 // in base using post-increment addressing 4916 template<int N> 4917 void vs_ldpq_post(const VSeq<N>& v, Register base) { 4918 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4919 for (int i = 0; i < N; i += 2) { 4920 __ ldpq(v[i], v[i+1], __ post(base, 32)); 4921 } 4922 } 4923 4924 // store N successive vector registers of the sequence into N/2 4925 // successive pairs of quadword memory locations via the address 4926 // supplied in base using post-increment addressing 4927 template<int N> 4928 void vs_stpq_post(const VSeq<N>& v, Register base) { 4929 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4930 for (int i = 0; i < N; i += 2) { 4931 __ stpq(v[i], v[i+1], __ post(base, 32)); 4932 } 4933 } 4934 4935 // load N/2 pairs of quadword values from memory de-interleaved into 4936 // N vector registers 2 at a time via the address supplied in base 4937 // using post-increment addressing. 4938 template<int N> 4939 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4940 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4941 for (int i = 0; i < N; i += 2) { 4942 __ ld2(v[i], v[i+1], T, __ post(base, 32)); 4943 } 4944 } 4945 4946 // store N vector registers interleaved into N/2 pairs of quadword 4947 // memory locations via the address supplied in base using 4948 // post-increment addressing. 
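  // For example (a sketch of the intended element ordering, assuming
  // the usual AArch64 st2 semantics): storing a VSeq<2> {v0, v1} with
  // arrangement T8H writes the 16 halfwords to memory as
  //
  //   v0[0], v1[0], v0[1], v1[1], ..., v0[7], v1[7]
  //
  // i.e. element i of the first register lands immediately before
  // element i of the second. vs_ld2_post performs the matching
  // de-interleave when loading.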
4949 template<int N> 4950 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4951 static_assert((N & (N - 1)) == 0, "sequence length must be even"); 4952 for (int i = 0; i < N; i += 2) { 4953 __ st2(v[i], v[i+1], T, __ post(base, 32)); 4954 } 4955 } 4956 4957 // load N quadword values from memory de-interleaved into N vector 4958 // registers 3 elements at a time via the address supplied in base. 4959 template<int N> 4960 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4961 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4962 for (int i = 0; i < N; i += 3) { 4963 __ ld3(v[i], v[i+1], v[i+2], T, base); 4964 } 4965 } 4966 4967 // load N quadword values from memory de-interleaved into N vector 4968 // registers 3 elements at a time via the address supplied in base 4969 // using post-increment addressing. 4970 template<int N> 4971 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) { 4972 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3"); 4973 for (int i = 0; i < N; i += 3) { 4974 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48)); 4975 } 4976 } 4977 4978 // load N/2 pairs of quadword values from memory into N vector 4979 // registers via the address supplied in base with each pair indexed 4980 // using the the start offset plus the corresponding entry in the 4981 // offsets array 4982 template<int N> 4983 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) { 4984 for (int i = 0; i < N/2; i++) { 4985 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i])); 4986 } 4987 } 4988 4989 // store N vector registers into N/2 pairs of quadword memory 4990 // locations via the address supplied in base with each pair indexed 4991 // using the the start offset plus the corresponding entry in the 4992 // offsets array 4993 template<int N> 4994 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) { 4995 for (int i = 0; i < N/2; i++) { 4996 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i])); 4997 } 4998 } 4999 5000 // load N single quadword values from memory into N vector registers 5001 // via the address supplied in base with each value indexed using 5002 // the the start offset plus the corresponding entry in the offsets 5003 // array 5004 template<int N> 5005 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base, 5006 int start, int (&offsets)[N]) { 5007 for (int i = 0; i < N; i++) { 5008 __ ldr(v[i], T, Address(base, start + offsets[i])); 5009 } 5010 } 5011 5012 // store N vector registers into N single quadword memory locations 5013 // via the address supplied in base with each value indexed using 5014 // the the start offset plus the corresponding entry in the offsets 5015 // array 5016 template<int N> 5017 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base, 5018 int start, int (&offsets)[N]) { 5019 for (int i = 0; i < N; i++) { 5020 __ str(v[i], T, Address(base, start + offsets[i])); 5021 } 5022 } 5023 5024 // load N/2 pairs of quadword values from memory de-interleaved into 5025 // N vector registers 2 at a time via the address supplied in base 5026 // with each pair indexed using the the start offset plus the 5027 // corresponding entry in the offsets array 5028 template<int N> 5029 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base, 5030 Register tmp, int start, int (&offsets)[N/2]) { 5031 
    for (int i = 0; i < N/2; i++) {
      __ add(tmp, base, start + offsets[i]);
      __ ld2(v[2*i], v[2*i+1], T, tmp);
    }
  }

  // store N vector registers 2 at a time interleaved into N/2 pairs
  // of quadword memory locations via the address supplied in base
  // with each pair indexed using the start offset plus the
  // corresponding entry in the offsets array
  template<int N>
  void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
                      Register tmp, int start, int (&offsets)[N/2]) {
    for (int i = 0; i < N/2; i++) {
      __ add(tmp, base, start + offsets[i]);
      __ st2(v[2*i], v[2*i+1], T, tmp);
    }
  }

  // Helper routines for various flavours of Montgomery multiply

  // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
  // multiplications in parallel
  //

  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
  //
  // Computes 4x4S results or 4x8H results
  //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
  // Outputs: va - 4x4S or 4x8H vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // va must have a non-zero delta i.e. it must not be a constant vseq.
  // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
  void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
                   Assembler::SIMD_Arrangement T,
                   const VSeq<4>& vtmp, const VSeq<2>& vq) {
    assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
    assert(!va.is_constant(), "output vector must identify 4 different registers");

    // schedule 4 streams of instructions across the vector sequences
    for (int i = 0; i < 4; i++) {
      __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
      __ mulv(va[i], T, vb[i], vc[i]);      // aLow = lo32(b * c)
    }

    for (int i = 0; i < 4; i++) {
      __ mulv(va[i], T, va[i], vq[0]);      // m = aLow * qinv
    }

    for (int i = 0; i < 4; i++) {
      __ sqdmulh(va[i], T, va[i], vq[1]);   // n = hi32(2 * m * q)
    }

    for (int i = 0; i < 4; i++) {
      __ shsubv(va[i], T, vtmp[i], va[i]);  // a = (aHigh - n) / 2
    }
  }

  // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
  // multiplications in parallel
  //

  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
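  // As an illustrative scalar model of what one lane computes (a
  // sketch restating the per-instruction comments in the loops below,
  // not a transcription of the Java code):
  //
  //   aHigh = hi(2 * b * c);      // sqdmulh
  //   aLow  = lo(b * c);          // mulv
  //   m     = lo(aLow * q_inv);   // mulv with MONT_Q_INV_MOD_R
  //   n     = hi(2 * m * q);      // sqdmulh with MONT_Q
  //   a     = (aHigh - n) / 2;    // shsubv
  //
  // where hi/lo select the upper/lower half of the double-width
  // product for the chosen arrangement (32-bit halves for T4S,
  // 16-bit halves for T8H).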
  //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // va must have a non-zero delta i.e. it must not be a constant vseq.
  // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
  void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
                   Assembler::SIMD_Arrangement T,
                   const VSeq<2>& vtmp, const VSeq<2>& vq) {
    assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
    assert(!va.is_constant(), "output vector must identify 2 different registers");

    // schedule 2 streams of instructions across the vector sequences
    for (int i = 0; i < 2; i++) {
      __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
      __ mulv(va[i], T, vb[i], vc[i]);      // aLow = lo32(b * c)
    }

    for (int i = 0; i < 2; i++) {
      __ mulv(va[i], T, va[i], vq[0]);      // m = aLow * qinv
    }

    for (int i = 0; i < 2; i++) {
      __ sqdmulh(va[i], T, va[i], vq[1]);   // n = hi32(2 * m * q)
    }

    for (int i = 0; i < 2; i++) {
      __ shsubv(va[i], T, vtmp[i], va[i]);  // a = (aHigh - n) / 2
    }
  }

  // Perform 16 16-bit Montgomery multiplications in parallel.
  void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
                       const VSeq<2>& vtmp, const VSeq<2>& vq) {
    // Use the helper routine to schedule a 2x8H Montgomery multiply.
    // It will assert that the register use is valid
    vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
  }

  // Perform 32 16-bit Montgomery multiplications in parallel.
  void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
                       const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // Use the helper routine to schedule a 4x8H Montgomery multiply.
    // It will assert that the register use is valid
    vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
  }

  // Perform 64 16-bit Montgomery multiplications in parallel.
  void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
                       const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // Schedule two successive 4x8H multiplies via the montmul helper
    // on the front and back halves of va, vb and vc. The helper will
    // assert that the register use has no overlap conflicts on each
    // individual call but we also need to ensure that the necessary
    // disjoint/equality constraints are met across both calls.

    // vb, vc, vtmp and vq must be disjoint.
va must either be 5188 // disjoint from all other registers or equal vc 5189 5190 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 5191 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 5192 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 5193 5194 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 5195 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 5196 5197 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 5198 5199 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 5200 assert(vs_disjoint(va, vb), "va and vb overlap"); 5201 assert(vs_disjoint(va, vq), "va and vq overlap"); 5202 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 5203 5204 // we multiply the front and back halves of each sequence 4 at a 5205 // time because 5206 // 5207 // 1) we are currently only able to get 4-way instruction 5208 // parallelism at best 5209 // 5210 // 2) we need registers for the constants in vq and temporary 5211 // scratch registers to hold intermediate results so vtmp can only 5212 // be a VSeq<4> which means we only have 4 scratch slots 5213 5214 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq); 5215 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq); 5216 } 5217 5218 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1, 5219 const VSeq<4>& vc, 5220 const VSeq<4>& vtmp, 5221 const VSeq<2>& vq) { 5222 // compute a = montmul(a1, c) 5223 kyber_montmul32(vc, va1, vc, vtmp, vq); 5224 // ouptut a1 = a0 - a 5225 vs_subv(va1, __ T8H, va0, vc); 5226 // and a0 = a0 + a 5227 vs_addv(va0, __ T8H, va0, vc); 5228 } 5229 5230 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1, 5231 const VSeq<4>& vb, 5232 const VSeq<4>& vtmp1, 5233 const VSeq<4>& vtmp2, 5234 const VSeq<2>& vq) { 5235 // compute c = a0 - a1 5236 vs_subv(vtmp1, __ T8H, va0, va1); 5237 // output a0 = a0 + a1 5238 vs_addv(va0, __ T8H, va0, va1); 5239 // output a1 = b montmul c 5240 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq); 5241 } 5242 5243 void load64shorts(const VSeq<8>& v, Register shorts) { 5244 vs_ldpq_post(v, shorts); 5245 } 5246 5247 void load32shorts(const VSeq<4>& v, Register shorts) { 5248 vs_ldpq_post(v, shorts); 5249 } 5250 5251 void store64shorts(VSeq<8> v, Register tmpAddr) { 5252 vs_stpq_post(v, tmpAddr); 5253 } 5254 5255 // Kyber NTT function. 5256 // Implements 5257 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {} 5258 // 5259 // coeffs (short[256]) = c_rarg0 5260 // ntt_zetas (short[256]) = c_rarg1 5261 address generate_kyberNtt() { 5262 5263 __ align(CodeEntryAlignment); 5264 StubId stub_id = StubId::stubgen_kyberNtt_id; 5265 StubCodeMark mark(this, stub_id); 5266 address start = __ pc(); 5267 __ enter(); 5268 5269 const Register coeffs = c_rarg0; 5270 const Register zetas = c_rarg1; 5271 5272 const Register kyberConsts = r10; 5273 const Register tmpAddr = r11; 5274 5275 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5276 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5277 VSeq<2> vq(30); // n.b. constants overlap vs3 5278 5279 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5280 // load the montmul constants 5281 vs_ldpq(vq, kyberConsts); 5282 5283 // Each level corresponds to an iteration of the outermost loop of the 5284 // Java method seilerNTT(int[] coeffs). There are some differences 5285 // from what is done in the seilerNTT() method, though: 5286 // 1. 
The computation is using 16-bit signed values, we do not convert them 5287 // to ints here. 5288 // 2. The zetas are delivered in a bigger array, 128 zetas are stored in 5289 // this array for each level, it is easier that way to fill up the vector 5290 // registers. 5291 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery 5292 // multiplications (this is because that way there should not be any 5293 // overflow during the inverse NTT computation), here we usr R = 2^16 so 5294 // that we can use the 16-bit arithmetic in the vector unit. 5295 // 5296 // On each level, we fill up the vector registers in such a way that the 5297 // array elements that need to be multiplied by the zetas go into one 5298 // set of vector registers while the corresponding ones that don't need to 5299 // be multiplied, go into another set. 5300 // We can do 32 Montgomery multiplications in parallel, using 12 vector 5301 // registers interleaving the steps of 4 identical computations, 5302 // each done on 8 16-bit values per register. 5303 5304 // At levels 0-3 the coefficients multiplied by or added/subtracted 5305 // to the zetas occur in discrete blocks whose size is some multiple 5306 // of 32. 5307 5308 // level 0 5309 __ add(tmpAddr, coeffs, 256); 5310 load64shorts(vs1, tmpAddr); 5311 load64shorts(vs2, zetas); 5312 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5313 __ add(tmpAddr, coeffs, 0); 5314 load64shorts(vs1, tmpAddr); 5315 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5316 vs_addv(vs1, __ T8H, vs1, vs2); 5317 __ add(tmpAddr, coeffs, 0); 5318 vs_stpq_post(vs1, tmpAddr); 5319 __ add(tmpAddr, coeffs, 256); 5320 vs_stpq_post(vs3, tmpAddr); 5321 // restore montmul constants 5322 vs_ldpq(vq, kyberConsts); 5323 load64shorts(vs1, tmpAddr); 5324 load64shorts(vs2, zetas); 5325 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5326 __ add(tmpAddr, coeffs, 128); 5327 load64shorts(vs1, tmpAddr); 5328 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5329 vs_addv(vs1, __ T8H, vs1, vs2); 5330 __ add(tmpAddr, coeffs, 128); 5331 store64shorts(vs1, tmpAddr); 5332 __ add(tmpAddr, coeffs, 384); 5333 store64shorts(vs3, tmpAddr); 5334 5335 // level 1 5336 // restore montmul constants 5337 vs_ldpq(vq, kyberConsts); 5338 __ add(tmpAddr, coeffs, 128); 5339 load64shorts(vs1, tmpAddr); 5340 load64shorts(vs2, zetas); 5341 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5342 __ add(tmpAddr, coeffs, 0); 5343 load64shorts(vs1, tmpAddr); 5344 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5345 vs_addv(vs1, __ T8H, vs1, vs2); 5346 __ add(tmpAddr, coeffs, 0); 5347 store64shorts(vs1, tmpAddr); 5348 store64shorts(vs3, tmpAddr); 5349 vs_ldpq(vq, kyberConsts); 5350 __ add(tmpAddr, coeffs, 384); 5351 load64shorts(vs1, tmpAddr); 5352 load64shorts(vs2, zetas); 5353 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5354 __ add(tmpAddr, coeffs, 256); 5355 load64shorts(vs1, tmpAddr); 5356 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5357 vs_addv(vs1, __ T8H, vs1, vs2); 5358 __ add(tmpAddr, coeffs, 256); 5359 store64shorts(vs1, tmpAddr); 5360 store64shorts(vs3, tmpAddr); 5361 5362 // level 2 5363 vs_ldpq(vq, kyberConsts); 5364 int offsets1[4] = { 0, 32, 128, 160 }; 5365 vs_ldpq_indexed(vs1, coeffs, 64, offsets1); 5366 load64shorts(vs2, zetas); 5367 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5368 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5369 // kyber_subv_addv64(); 5370 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5371 vs_addv(vs1, __ T8H, vs1, vs2); 5372 __ add(tmpAddr, coeffs, 0); 5373 vs_stpq_post(vs_front(vs1), tmpAddr); 5374 vs_stpq_post(vs_front(vs3), tmpAddr); 5375 vs_stpq_post(vs_back(vs1), tmpAddr); 5376 vs_stpq_post(vs_back(vs3), tmpAddr); 5377 vs_ldpq(vq, kyberConsts); 5378 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1); 5379 load64shorts(vs2, zetas); 5380 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5381 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5382 // kyber_subv_addv64(); 5383 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5384 vs_addv(vs1, __ T8H, vs1, vs2); 5385 __ add(tmpAddr, coeffs, 256); 5386 vs_stpq_post(vs_front(vs1), tmpAddr); 5387 vs_stpq_post(vs_front(vs3), tmpAddr); 5388 vs_stpq_post(vs_back(vs1), tmpAddr); 5389 vs_stpq_post(vs_back(vs3), tmpAddr); 5390 5391 // level 3 5392 vs_ldpq(vq, kyberConsts); 5393 int offsets2[4] = { 0, 64, 128, 192 }; 5394 vs_ldpq_indexed(vs1, coeffs, 32, offsets2); 5395 load64shorts(vs2, zetas); 5396 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5397 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5398 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5399 vs_addv(vs1, __ T8H, vs1, vs2); 5400 vs_stpq_indexed(vs1, coeffs, 0, offsets2); 5401 vs_stpq_indexed(vs3, coeffs, 32, offsets2); 5402 5403 vs_ldpq(vq, kyberConsts); 5404 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2); 5405 load64shorts(vs2, zetas); 5406 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5407 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5408 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5409 vs_addv(vs1, __ T8H, vs1, vs2); 5410 vs_stpq_indexed(vs1, coeffs, 256, offsets2); 5411 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2); 5412 5413 // level 4 5414 // At level 4 coefficients occur in 8 discrete blocks of size 16 5415 // so they are loaded using employing an ldr at 8 distinct offsets. 5416 5417 vs_ldpq(vq, kyberConsts); 5418 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5419 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3); 5420 load64shorts(vs2, zetas); 5421 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5422 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5423 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5424 vs_addv(vs1, __ T8H, vs1, vs2); 5425 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5426 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3); 5427 5428 vs_ldpq(vq, kyberConsts); 5429 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3); 5430 load64shorts(vs2, zetas); 5431 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5432 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5433 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5434 vs_addv(vs1, __ T8H, vs1, vs2); 5435 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5436 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3); 5437 5438 // level 5 5439 // At level 5 related coefficients occur in discrete blocks of size 8 so 5440 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 
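    // Sketch of the de-interleave this relies on (an inference from
    // the 2D arrangement, not additional generated code): for 16
    // consecutive coefficients c0..c15, each ld2 is expected to yield
    //
    //   even register: c0  c1  c2  c3  c8  c9  c10 c11
    //   odd register:  c4  c5  c6  c7  c12 c13 c14 c15
    //
    // so lane i of the even register sits opposite the coefficient 4
    // positions later in the odd register, which is the lane-by-lane
    // pairing consumed by the sub/add and montmul steps below.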
5441 5442 vs_ldpq(vq, kyberConsts); 5443 int offsets4[4] = { 0, 32, 64, 96 }; 5444 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5445 load32shorts(vs_front(vs2), zetas); 5446 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5447 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5448 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5449 load32shorts(vs_front(vs2), zetas); 5450 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5451 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5452 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5453 load32shorts(vs_front(vs2), zetas); 5454 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5455 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5456 5457 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5458 load32shorts(vs_front(vs2), zetas); 5459 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5460 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5461 5462 // level 6 5463 // At level 6 related coefficients occur in discrete blocks of size 4 so 5464 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5465 5466 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5467 load32shorts(vs_front(vs2), zetas); 5468 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5469 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5470 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5471 // __ ldpq(v18, v19, __ post(zetas, 32)); 5472 load32shorts(vs_front(vs2), zetas); 5473 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5474 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5475 5476 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5477 load32shorts(vs_front(vs2), zetas); 5478 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5479 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5480 5481 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5482 load32shorts(vs_front(vs2), zetas); 5483 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq); 5484 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5485 5486 __ leave(); // required for proper stackwalking of RuntimeStub frame 5487 __ mov(r0, zr); // return 0 5488 __ ret(lr); 5489 5490 return start; 5491 } 5492 5493 // Kyber Inverse NTT function 5494 // Implements 5495 // static int implKyberInverseNtt(short[] poly, short[] zetas) {} 5496 // 5497 // coeffs (short[256]) = c_rarg0 5498 // ntt_zetas (short[256]) = c_rarg1 5499 address generate_kyberInverseNtt() { 5500 5501 __ align(CodeEntryAlignment); 5502 StubId stub_id = StubId::stubgen_kyberInverseNtt_id; 5503 StubCodeMark mark(this, stub_id); 5504 address start = __ pc(); 5505 __ enter(); 5506 5507 const Register coeffs = c_rarg0; 5508 const Register zetas = c_rarg1; 5509 5510 const Register kyberConsts = r10; 5511 const Register tmpAddr = r11; 5512 const Register tmpAddr2 = c_rarg2; 5513 5514 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs 5515 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5516 VSeq<2> vq(30); // n.b. 
constants overlap vs3 5517 5518 __ lea(kyberConsts, 5519 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5520 5521 // level 0 5522 // At level 0 related coefficients occur in discrete blocks of size 4 so 5523 // need to be loaded interleaved using an ld2 operation with arrangement 4S. 5524 5525 vs_ldpq(vq, kyberConsts); 5526 int offsets4[4] = { 0, 32, 64, 96 }; 5527 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5528 load32shorts(vs_front(vs2), zetas); 5529 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5530 vs_front(vs2), vs_back(vs2), vtmp, vq); 5531 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4); 5532 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5533 load32shorts(vs_front(vs2), zetas); 5534 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5535 vs_front(vs2), vs_back(vs2), vtmp, vq); 5536 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4); 5537 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5538 load32shorts(vs_front(vs2), zetas); 5539 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5540 vs_front(vs2), vs_back(vs2), vtmp, vq); 5541 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4); 5542 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5543 load32shorts(vs_front(vs2), zetas); 5544 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5545 vs_front(vs2), vs_back(vs2), vtmp, vq); 5546 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4); 5547 5548 // level 1 5549 // At level 1 related coefficients occur in discrete blocks of size 8 so 5550 // need to be loaded interleaved using an ld2 operation with arrangement 2D. 5551 5552 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5553 load32shorts(vs_front(vs2), zetas); 5554 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5555 vs_front(vs2), vs_back(vs2), vtmp, vq); 5556 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4); 5557 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5558 load32shorts(vs_front(vs2), zetas); 5559 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5560 vs_front(vs2), vs_back(vs2), vtmp, vq); 5561 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4); 5562 5563 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5564 load32shorts(vs_front(vs2), zetas); 5565 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5566 vs_front(vs2), vs_back(vs2), vtmp, vq); 5567 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4); 5568 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5569 load32shorts(vs_front(vs2), zetas); 5570 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1), 5571 vs_front(vs2), vs_back(vs2), vtmp, vq); 5572 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4); 5573 5574 // level 2 5575 // At level 2 coefficients occur in 8 discrete blocks of size 16 5576 // so they are loaded using employing an ldr at 8 distinct offsets. 5577 5578 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5579 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5580 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3); 5581 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5582 vs_subv(vs1, __ T8H, vs1, vs2); 5583 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3); 5584 load64shorts(vs2, zetas); 5585 vs_ldpq(vq, kyberConsts); 5586 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5587 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3); 5588 5589 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5590 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5591 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5592 vs_subv(vs1, __ T8H, vs1, vs2); 5593 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3); 5594 load64shorts(vs2, zetas); 5595 vs_ldpq(vq, kyberConsts); 5596 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5597 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3); 5598 5599 // Barrett reduction at indexes where overflow may happen 5600 5601 // load q and the multiplier for the Barrett reduction 5602 __ add(tmpAddr, kyberConsts, 16); 5603 vs_ldpq(vq, tmpAddr); 5604 5605 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences 5606 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants 5607 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul 5608 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3); 5609 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5610 vs_sshr(vs2, __ T8H, vs2, 11); 5611 vs_mlsv(vs1, __ T8H, vs2, vq1); 5612 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3); 5613 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3); 5614 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5615 vs_sshr(vs2, __ T8H, vs2, 11); 5616 vs_mlsv(vs1, __ T8H, vs2, vq1); 5617 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3); 5618 5619 // level 3 5620 // From level 3 upwards coefficients occur in discrete blocks whose size is 5621 // some multiple of 32 so can be loaded using ldpq and suitable indexes. 5622 5623 int offsets2[4] = { 0, 64, 128, 192 }; 5624 vs_ldpq_indexed(vs1, coeffs, 0, offsets2); 5625 vs_ldpq_indexed(vs2, coeffs, 32, offsets2); 5626 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5627 vs_subv(vs1, __ T8H, vs1, vs2); 5628 vs_stpq_indexed(vs3, coeffs, 0, offsets2); 5629 load64shorts(vs2, zetas); 5630 vs_ldpq(vq, kyberConsts); 5631 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5632 vs_stpq_indexed(vs2, coeffs, 32, offsets2); 5633 5634 vs_ldpq_indexed(vs1, coeffs, 256, offsets2); 5635 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5636 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5637 vs_subv(vs1, __ T8H, vs1, vs2); 5638 vs_stpq_indexed(vs3, coeffs, 256, offsets2); 5639 load64shorts(vs2, zetas); 5640 vs_ldpq(vq, kyberConsts); 5641 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5642 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2); 5643 5644 // level 4 5645 5646 int offsets1[4] = { 0, 32, 128, 160 }; 5647 vs_ldpq_indexed(vs1, coeffs, 0, offsets1); 5648 vs_ldpq_indexed(vs2, coeffs, 64, offsets1); 5649 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5650 vs_subv(vs1, __ T8H, vs1, vs2); 5651 vs_stpq_indexed(vs3, coeffs, 0, offsets1); 5652 load64shorts(vs2, zetas); 5653 vs_ldpq(vq, kyberConsts); 5654 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5655 vs_stpq_indexed(vs2, coeffs, 64, offsets1); 5656 5657 vs_ldpq_indexed(vs1, coeffs, 256, offsets1); 5658 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5659 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5660 vs_subv(vs1, __ T8H, vs1, vs2); 5661 vs_stpq_indexed(vs3, coeffs, 256, offsets1); 5662 load64shorts(vs2, zetas); 5663 vs_ldpq(vq, kyberConsts); 5664 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5665 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1); 5666 5667 // level 5 5668 5669 __ add(tmpAddr, coeffs, 0); 5670 load64shorts(vs1, tmpAddr); 5671 __ add(tmpAddr, coeffs, 128); 5672 load64shorts(vs2, tmpAddr); 5673 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5674 vs_subv(vs1, __ T8H, vs1, vs2); 5675 __ add(tmpAddr, coeffs, 0); 5676 store64shorts(vs3, tmpAddr); 5677 load64shorts(vs2, zetas); 5678 vs_ldpq(vq, kyberConsts); 5679 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5680 __ add(tmpAddr, coeffs, 128); 5681 store64shorts(vs2, tmpAddr); 5682 5683 load64shorts(vs1, tmpAddr); 5684 __ add(tmpAddr, coeffs, 384); 5685 load64shorts(vs2, tmpAddr); 5686 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5687 vs_subv(vs1, __ T8H, vs1, vs2); 5688 __ add(tmpAddr, coeffs, 256); 5689 store64shorts(vs3, tmpAddr); 5690 load64shorts(vs2, zetas); 5691 vs_ldpq(vq, kyberConsts); 5692 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5693 __ add(tmpAddr, coeffs, 384); 5694 store64shorts(vs2, tmpAddr); 5695 5696 // Barrett reduction at indexes where overflow may happen 5697 5698 // load q and the multiplier for the Barrett reduction 5699 __ add(tmpAddr, kyberConsts, 16); 5700 vs_ldpq(vq, tmpAddr); 5701 5702 int offsets0[2] = { 0, 256 }; 5703 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5704 vs_sqdmulh(vs2, __ T8H, vs1, vq2); 5705 vs_sshr(vs2, __ T8H, vs2, 11); 5706 vs_mlsv(vs1, __ T8H, vs2, vq1); 5707 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0); 5708 5709 // level 6 5710 5711 __ add(tmpAddr, coeffs, 0); 5712 load64shorts(vs1, tmpAddr); 5713 __ add(tmpAddr, coeffs, 256); 5714 load64shorts(vs2, tmpAddr); 5715 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq 5716 vs_subv(vs1, __ T8H, vs1, vs2); 5717 __ add(tmpAddr, coeffs, 0); 5718 store64shorts(vs3, tmpAddr); 5719 load64shorts(vs2, zetas); 5720 vs_ldpq(vq, kyberConsts); 5721 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5722 __ add(tmpAddr, coeffs, 256); 5723 store64shorts(vs2, tmpAddr); 5724 5725 __ add(tmpAddr, coeffs, 128); 5726 load64shorts(vs1, tmpAddr); 5727 __ add(tmpAddr, coeffs, 384); 5728 load64shorts(vs2, tmpAddr); 5729 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. 
trashes vq 5730 vs_subv(vs1, __ T8H, vs1, vs2); 5731 __ add(tmpAddr, coeffs, 128); 5732 store64shorts(vs3, tmpAddr); 5733 load64shorts(vs2, zetas); 5734 vs_ldpq(vq, kyberConsts); 5735 kyber_montmul64(vs2, vs1, vs2, vtmp, vq); 5736 __ add(tmpAddr, coeffs, 384); 5737 store64shorts(vs2, tmpAddr); 5738 5739 // multiply by 2^-n 5740 5741 // load toMont(2^-n mod q) 5742 __ add(tmpAddr, kyberConsts, 48); 5743 __ ldr(v29, __ Q, tmpAddr); 5744 5745 vs_ldpq(vq, kyberConsts); 5746 __ add(tmpAddr, coeffs, 0); 5747 load64shorts(vs1, tmpAddr); 5748 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5749 __ add(tmpAddr, coeffs, 0); 5750 store64shorts(vs2, tmpAddr); 5751 5752 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so 5753 load64shorts(vs1, tmpAddr); 5754 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5755 __ add(tmpAddr, coeffs, 128); 5756 store64shorts(vs2, tmpAddr); 5757 5758 // now tmpAddr contains coeffs + 256 5759 load64shorts(vs1, tmpAddr); 5760 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5761 __ add(tmpAddr, coeffs, 256); 5762 store64shorts(vs2, tmpAddr); 5763 5764 // now tmpAddr contains coeffs + 384 5765 load64shorts(vs1, tmpAddr); 5766 kyber_montmul64(vs2, vs1, vq3, vtmp, vq); 5767 __ add(tmpAddr, coeffs, 384); 5768 store64shorts(vs2, tmpAddr); 5769 5770 __ leave(); // required for proper stackwalking of RuntimeStub frame 5771 __ mov(r0, zr); // return 0 5772 __ ret(lr); 5773 5774 return start; 5775 } 5776 5777 // Kyber multiply polynomials in the NTT domain. 5778 // Implements 5779 // static int implKyberNttMult( 5780 // short[] result, short[] ntta, short[] nttb, short[] zetas) {} 5781 // 5782 // result (short[256]) = c_rarg0 5783 // ntta (short[256]) = c_rarg1 5784 // nttb (short[256]) = c_rarg2 5785 // zetas (short[128]) = c_rarg3 5786 address generate_kyberNttMult() { 5787 5788 __ align(CodeEntryAlignment); 5789 StubId stub_id = StubId::stubgen_kyberNttMult_id; 5790 StubCodeMark mark(this, stub_id); 5791 address start = __ pc(); 5792 __ enter(); 5793 5794 const Register result = c_rarg0; 5795 const Register ntta = c_rarg1; 5796 const Register nttb = c_rarg2; 5797 const Register zetas = c_rarg3; 5798 5799 const Register kyberConsts = r10; 5800 const Register limit = r11; 5801 5802 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps 5803 VSeq<4> vs3(16), vs4(20); 5804 VSeq<2> vq(30); // pair of constants for montmul: q, qinv 5805 VSeq<2> vz(28); // pair of zetas 5806 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ 5807 5808 __ lea(kyberConsts, 5809 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5810 5811 Label kyberNttMult_loop; 5812 5813 __ add(limit, result, 512); 5814 5815 // load q and qinv 5816 vs_ldpq(vq, kyberConsts); 5817 5818 // load R^2 mod q (to convert back from Montgomery representation) 5819 __ add(kyberConsts, kyberConsts, 64); 5820 __ ldr(v27, __ Q, kyberConsts); 5821 5822 __ BIND(kyberNttMult_loop); 5823 5824 // load 16 zetas 5825 vs_ldpq_post(vz, zetas); 5826 5827 // load 2 sets of 32 coefficients from the two input arrays 5828 // interleaved as shorts. i.e. pairs of shorts adjacent in memory 5829 // are striped across pairs of vector registers 5830 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H 5831 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H 5832 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H 5833 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H 5834 5835 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1) 5836 // i.e. 
montmul the first and second halves of vs1 in order and 5837 // then with one sequence reversed storing the two results in vs3 5838 // 5839 // vs3[0] <- montmul(a0, b0) 5840 // vs3[1] <- montmul(a1, b1) 5841 // vs3[2] <- montmul(a0, b1) 5842 // vs3[3] <- montmul(a1, b0) 5843 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq); 5844 kyber_montmul16(vs_back(vs3), 5845 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq); 5846 5847 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3) 5848 // i.e. montmul the first and second halves of vs4 in order and 5849 // then with one sequence reversed storing the two results in vs1 5850 // 5851 // vs1[0] <- montmul(a2, b2) 5852 // vs1[1] <- montmul(a3, b3) 5853 // vs1[2] <- montmul(a2, b3) 5854 // vs1[3] <- montmul(a3, b2) 5855 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq); 5856 kyber_montmul16(vs_back(vs1), 5857 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq); 5858 5859 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta. 5860 // We can schedule two montmuls at a time if we use a suitable vector 5861 // sequence <vs3[1], vs1[1]>. 5862 int delta = vs1[1]->encoding() - vs3[1]->encoding(); 5863 VSeq<2> vs5(vs3[1], delta); 5864 5865 // vs3[1] <- montmul(montmul(a1, b1), z0) 5866 // vs1[1] <- montmul(montmul(a3, b3), z1) 5867 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq); 5868 5869 // add results in pairs storing in vs3 5870 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0); 5871 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0); 5872 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3)); 5873 5874 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1); 5875 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2); 5876 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1)); 5877 5878 // vs1 <- montmul(vs3, montRSquareModQ) 5879 kyber_montmul32(vs1, vs3, vc, vs2, vq); 5880 5881 // store back the two pairs of result vectors de-interleaved as 8H elements 5882 // i.e. storing each pairs of shorts striped across a register pair adjacent 5883 // in memory 5884 vs_st2_post(vs1, __ T8H, result); 5885 5886 __ cmp(result, limit); 5887 __ br(Assembler::NE, kyberNttMult_loop); 5888 5889 __ leave(); // required for proper stackwalking of RuntimeStub frame 5890 __ mov(r0, zr); // return 0 5891 __ ret(lr); 5892 5893 return start; 5894 } 5895 5896 // Kyber add 2 polynomials. 5897 // Implements 5898 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {} 5899 // 5900 // result (short[256]) = c_rarg0 5901 // a (short[256]) = c_rarg1 5902 // b (short[256]) = c_rarg2 5903 address generate_kyberAddPoly_2() { 5904 5905 __ align(CodeEntryAlignment); 5906 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id; 5907 StubCodeMark mark(this, stub_id); 5908 address start = __ pc(); 5909 __ enter(); 5910 5911 const Register result = c_rarg0; 5912 const Register a = c_rarg1; 5913 const Register b = c_rarg2; 5914 5915 const Register kyberConsts = r11; 5916 5917 // We sum 256 sets of values in total i.e. 32 x 8H quadwords. 5918 // So, we can load, add and store the data in 3 groups of 11, 5919 // 11 and 10 at a time i.e. we need to map sets of 10 or 11 5920 // registers. A further constraint is that the mapping needs 5921 // to skip callee saves. So, we allocate the register 5922 // sequences using two 8 sequences, two 2 sequences and two 5923 // single registers. 
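    // A sketch of the resulting schedule (inferred from the
    // declarations that follow): each pass of the loop handles
    // 8 + 2 + 1 = 11 quadwords (10 on the final pass, which skips the
    // single register), and 11 + 11 + 10 = 32 quadwords covers all
    // 256 shorts. The chosen bases keep v8-v15, whose low halves are
    // callee-saved, out of the working sets.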
5924 VSeq<8> vs1_1(0); 5925 VSeq<2> vs1_2(16); 5926 FloatRegister vs1_3 = v28; 5927 VSeq<8> vs2_1(18); 5928 VSeq<2> vs2_2(26); 5929 FloatRegister vs2_3 = v29; 5930 5931 // two constant vector sequences 5932 VSeq<8> vc_1(31, 0); 5933 VSeq<2> vc_2(31, 0); 5934 5935 FloatRegister vc_3 = v31; 5936 __ lea(kyberConsts, 5937 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 5938 5939 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 5940 for (int i = 0; i < 3; i++) { 5941 // load 80 or 88 values from a into vs1_1/2/3 5942 vs_ldpq_post(vs1_1, a); 5943 vs_ldpq_post(vs1_2, a); 5944 if (i < 2) { 5945 __ ldr(vs1_3, __ Q, __ post(a, 16)); 5946 } 5947 // load 80 or 88 values from b into vs2_1/2/3 5948 vs_ldpq_post(vs2_1, b); 5949 vs_ldpq_post(vs2_2, b); 5950 if (i < 2) { 5951 __ ldr(vs2_3, __ Q, __ post(b, 16)); 5952 } 5953 // sum 80 or 88 values across vs1 and vs2 into vs1 5954 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 5955 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 5956 if (i < 2) { 5957 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 5958 } 5959 // add constant to all 80 or 88 results 5960 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 5961 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 5962 if (i < 2) { 5963 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 5964 } 5965 // store 80 or 88 values 5966 vs_stpq_post(vs1_1, result); 5967 vs_stpq_post(vs1_2, result); 5968 if (i < 2) { 5969 __ str(vs1_3, __ Q, __ post(result, 16)); 5970 } 5971 } 5972 5973 __ leave(); // required for proper stackwalking of RuntimeStub frame 5974 __ mov(r0, zr); // return 0 5975 __ ret(lr); 5976 5977 return start; 5978 } 5979 5980 // Kyber add 3 polynomials. 5981 // Implements 5982 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {} 5983 // 5984 // result (short[256]) = c_rarg0 5985 // a (short[256]) = c_rarg1 5986 // b (short[256]) = c_rarg2 5987 // c (short[256]) = c_rarg3 5988 address generate_kyberAddPoly_3() { 5989 5990 __ align(CodeEntryAlignment); 5991 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id; 5992 StubCodeMark mark(this, stub_id); 5993 address start = __ pc(); 5994 __ enter(); 5995 5996 const Register result = c_rarg0; 5997 const Register a = c_rarg1; 5998 const Register b = c_rarg2; 5999 const Register c = c_rarg3; 6000 6001 const Register kyberConsts = r11; 6002 6003 // As above we sum 256 sets of values in total i.e. 32 x 8H 6004 // quadwords. So, we can load, add and store the data in 3 6005 // groups of 11, 11 and 10 at a time i.e. we need to map sets 6006 // of 10 or 11 registers. A further constraint is that the 6007 // mapping needs to skip callee saves. So, we allocate the 6008 // register sequences using two 8 sequences, two 2 sequences 6009 // and two single registers. 
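    // Illustrative scalar equivalent of one element of the loop below
    // (a sketch only; C denotes the replicated constant loaded from
    // kyberConsts + 16, which the load comment identifies as q):
    //
    //   result[i] = (short)(a[i] + b[i] + c[i] + C);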
6010 VSeq<8> vs1_1(0); 6011 VSeq<2> vs1_2(16); 6012 FloatRegister vs1_3 = v28; 6013 VSeq<8> vs2_1(18); 6014 VSeq<2> vs2_2(26); 6015 FloatRegister vs2_3 = v29; 6016 6017 // two constant vector sequences 6018 VSeq<8> vc_1(31, 0); 6019 VSeq<2> vc_2(31, 0); 6020 6021 FloatRegister vc_3 = v31; 6022 6023 __ lea(kyberConsts, 6024 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 6025 6026 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q 6027 for (int i = 0; i < 3; i++) { 6028 // load 80 or 88 values from a into vs1_1/2/3 6029 vs_ldpq_post(vs1_1, a); 6030 vs_ldpq_post(vs1_2, a); 6031 if (i < 2) { 6032 __ ldr(vs1_3, __ Q, __ post(a, 16)); 6033 } 6034 // load 80 or 88 values from b into vs2_1/2/3 6035 vs_ldpq_post(vs2_1, b); 6036 vs_ldpq_post(vs2_2, b); 6037 if (i < 2) { 6038 __ ldr(vs2_3, __ Q, __ post(b, 16)); 6039 } 6040 // sum 80 or 88 values across vs1 and vs2 into vs1 6041 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 6042 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 6043 if (i < 2) { 6044 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 6045 } 6046 // load 80 or 88 values from c into vs2_1/2/3 6047 vs_ldpq_post(vs2_1, c); 6048 vs_ldpq_post(vs2_2, c); 6049 if (i < 2) { 6050 __ ldr(vs2_3, __ Q, __ post(c, 16)); 6051 } 6052 // sum 80 or 88 values across vs1 and vs2 into vs1 6053 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1); 6054 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2); 6055 if (i < 2) { 6056 __ addv(vs1_3, __ T8H, vs1_3, vs2_3); 6057 } 6058 // add constant to all 80 or 88 results 6059 vs_addv(vs1_1, __ T8H, vs1_1, vc_1); 6060 vs_addv(vs1_2, __ T8H, vs1_2, vc_2); 6061 if (i < 2) { 6062 __ addv(vs1_3, __ T8H, vs1_3, vc_3); 6063 } 6064 // store 80 or 88 values 6065 vs_stpq_post(vs1_1, result); 6066 vs_stpq_post(vs1_2, result); 6067 if (i < 2) { 6068 __ str(vs1_3, __ Q, __ post(result, 16)); 6069 } 6070 } 6071 6072 __ leave(); // required for proper stackwalking of RuntimeStub frame 6073 __ mov(r0, zr); // return 0 6074 __ ret(lr); 6075 6076 return start; 6077 } 6078 6079 // Kyber parse XOF output to polynomial coefficient candidates 6080 // or decodePoly(12, ...). 6081 // Implements 6082 // static int implKyber12To16( 6083 // byte[] condensed, int index, short[] parsed, int parsedLength) {} 6084 // 6085 // (parsedLength or (parsedLength - 48) must be divisible by 64.) 6086 // 6087 // condensed (byte[]) = c_rarg0 6088 // condensedIndex = c_rarg1 6089 // parsed (short[112 or 256]) = c_rarg2 6090 // parsedLength (112 or 256) = c_rarg3 6091 address generate_kyber12To16() { 6092 Label L_F00, L_loop, L_end; 6093 6094 __ align(CodeEntryAlignment); 6095 StubId stub_id = StubId::stubgen_kyber12To16_id; 6096 StubCodeMark mark(this, stub_id); 6097 address start = __ pc(); 6098 __ enter(); 6099 6100 const Register condensed = c_rarg0; 6101 const Register condensedOffs = c_rarg1; 6102 const Register parsed = c_rarg2; 6103 const Register parsedLength = c_rarg3; 6104 6105 const Register tmpAddr = r11; 6106 6107 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B 6108 // quadwords so we need a 6 vector sequence for the inputs. 6109 // Parsing produces 64 shorts, employing two 8 vector 6110 // sequences to store and combine the intermediate data. 
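    // Illustrative scalar model of the parse performed below (a
    // sketch, not code emitted by the stub): each byte triple
    // b0, b1, b2 packs two 12-bit values which are recovered as
    //
    //   s0 = b0 | ((b1 & 0x0f) << 8);
    //   s1 = (b1 >> 4) | (b2 << 4);
    //
    // and the masks, shifts and adds below reproduce this arithmetic
    // across whole vectors of byte triples at a time, using the
    // replicated 0x0f00 mask held in v31.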
    VSeq<6> vin(24);
    VSeq<8> va(0), vb(16);

    __ adr(tmpAddr, L_F00);
    __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
    __ add(condensed, condensed, condensedOffs);

    __ BIND(L_loop);
    // load 96 (6 x 16B) byte values
    vs_ld3_post(vin, __ T16B, condensed);

    // The front half of sequence vin (vin[0], vin[1] and vin[2])
    // holds 48 (16x3) contiguous bytes from memory striped
    // horizontally across each of the 16 byte lanes. Equivalently,
    // that is 16 pairs of 12-bit integers. Likewise the back half
    // holds the next 48 bytes in the same arrangement.

    // Each vector in the front half can also be viewed as a vertical
    // strip across the 16 pairs of 12 bit integers. Each byte in
    // vin[0] stores the low 8 bits of the first int in a pair. Each
    // byte in vin[1] stores the high 4 bits of the first int and the
    // low 4 bits of the second int. Each byte in vin[2] stores the
    // high 8 bits of the second int. Likewise for the vectors in the
    // second half.

    // Converting the data to 16-bit shorts requires first of all
    // expanding each of the 6 x 16B vectors into 6 corresponding
    // pairs of 8H vectors. Mask, shift and add operations on the
    // resulting vector pairs can be used to combine 4 and 8 bit
    // parts of related 8H vector elements.
    //
    // The middle vectors (vin[1] and vin[4]) are actually expanded
    // twice, one copy manipulated to provide the high 4 bits
    // belonging to the first short in a pair and another copy
    // manipulated to provide the low 4 bits belonging to the
    // second short in a pair. This is why the vector sequences va
    // and vb used to hold the expanded 8H elements are of length 8.

    // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
    // n.b. target elements 2 and 3 duplicate elements 4 and 5
    __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
    __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
    __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
    __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
    __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
    __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);

    // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
    // and vb[4:5]
    __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
    __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
    __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
    __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
    __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
    __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);

    // shift lo byte of copy 1 of the middle stripe into the high byte
    __ shl(va[2], __ T8H, va[2], 8);
    __ shl(va[3], __ T8H, va[3], 8);
    __ shl(vb[2], __ T8H, vb[2], 8);
    __ shl(vb[3], __ T8H, vb[3], 8);

    // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
    // time pre-shifted by 4 to ensure top bits of input 12-bit int
    // are in bit positions [4..11].
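    // (Passing 4 as the ushll/ushll2 shift immediate folds that << 4
    // into the widening expansion itself, so these vectors need no
    // separate shl step.)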
6176 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6177 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6178 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6179 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4); 6180 6181 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and 6182 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of 6183 // copy2 6184 __ andr(va[2], __ T16B, va[2], v31); 6185 __ andr(va[3], __ T16B, va[3], v31); 6186 __ ushr(va[4], __ T8H, va[4], 4); 6187 __ ushr(va[5], __ T8H, va[5], 4); 6188 __ andr(vb[2], __ T16B, vb[2], v31); 6189 __ andr(vb[3], __ T16B, vb[3], v31); 6190 __ ushr(vb[4], __ T8H, vb[4], 4); 6191 __ ushr(vb[5], __ T8H, vb[5], 4); 6192 6193 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and 6194 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair 6195 // n.b. the ordering ensures: i) inputs are consumed before they 6196 // are overwritten ii) the order of 16-bit results across successive 6197 // pairs of vectors in va and then vb reflects the order of the 6198 // corresponding 12-bit inputs 6199 __ addv(va[0], __ T8H, va[0], va[2]); 6200 __ addv(va[2], __ T8H, va[1], va[3]); 6201 __ addv(va[1], __ T8H, va[4], va[6]); 6202 __ addv(va[3], __ T8H, va[5], va[7]); 6203 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6204 __ addv(vb[2], __ T8H, vb[1], vb[3]); 6205 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6206 __ addv(vb[3], __ T8H, vb[5], vb[7]); 6207 6208 // store 64 results interleaved as shorts 6209 vs_st2_post(vs_front(va), __ T8H, parsed); 6210 vs_st2_post(vs_front(vb), __ T8H, parsed); 6211 6212 __ sub(parsedLength, parsedLength, 64); 6213 __ cmp(parsedLength, (u1)64); 6214 __ br(Assembler::GE, L_loop); 6215 __ cbz(parsedLength, L_end); 6216 6217 // if anything is left it should be a final 72 bytes of input 6218 // i.e. a final 48 12-bit values. so we handle this by loading 6219 // 48 bytes into all 16B lanes of front(vin) and only 24 6220 // bytes into the lower 8B lane of back(vin) 6221 vs_ld3_post(vs_front(vin), __ T16B, condensed); 6222 vs_ld3(vs_back(vin), __ T8B, condensed); 6223 6224 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5] 6225 // n.b. target elements 2 and 3 of va duplicate elements 4 and 6226 // 5 and target element 2 of vb duplicates element 4. 6227 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0); 6228 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0); 6229 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0); 6230 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0); 6231 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0); 6232 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0); 6233 6234 // This time expand just the lower 8 lanes 6235 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0); 6236 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0); 6237 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0); 6238 6239 // shift lo byte of copy 1 of the middle stripe into the high byte 6240 __ shl(va[2], __ T8H, va[2], 8); 6241 __ shl(va[3], __ T8H, va[3], 8); 6242 __ shl(vb[2], __ T8H, vb[2], 8); 6243 6244 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into 6245 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit 6246 // int are in bit positions [4..11]. 
6247 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4); 6248 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4); 6249 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4); 6250 6251 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and 6252 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of 6253 // copy2 6254 __ andr(va[2], __ T16B, va[2], v31); 6255 __ andr(va[3], __ T16B, va[3], v31); 6256 __ ushr(va[4], __ T8H, va[4], 4); 6257 __ ushr(va[5], __ T8H, va[5], 4); 6258 __ andr(vb[2], __ T16B, vb[2], v31); 6259 __ ushr(vb[4], __ T8H, vb[4], 4); 6260 6261 6262 6263 // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and 6264 // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair 6265 6266 // n.b. ordering ensures: i) inputs are consumed before they are 6267 // overwritten ii) order of 16-bit results across succsessive 6268 // pairs of vectors in va and then lower half of vb reflects order 6269 // of corresponding 12-bit inputs 6270 __ addv(va[0], __ T8H, va[0], va[2]); 6271 __ addv(va[2], __ T8H, va[1], va[3]); 6272 __ addv(va[1], __ T8H, va[4], va[6]); 6273 __ addv(va[3], __ T8H, va[5], va[7]); 6274 __ addv(vb[0], __ T8H, vb[0], vb[2]); 6275 __ addv(vb[1], __ T8H, vb[4], vb[6]); 6276 6277 // store 48 results interleaved as shorts 6278 vs_st2_post(vs_front(va), __ T8H, parsed); 6279 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed); 6280 6281 __ BIND(L_end); 6282 6283 __ leave(); // required for proper stackwalking of RuntimeStub frame 6284 __ mov(r0, zr); // return 0 6285 __ ret(lr); 6286 6287 // bind label and generate constant data used by this stub 6288 __ BIND(L_F00); 6289 __ emit_int64(0x0f000f000f000f00); 6290 __ emit_int64(0x0f000f000f000f00); 6291 6292 return start; 6293 } 6294 6295 // Kyber Barrett reduce function. 6296 // Implements 6297 // static int implKyberBarrettReduce(short[] coeffs) {} 6298 // 6299 // coeffs (short[256]) = c_rarg0 6300 address generate_kyberBarrettReduce() { 6301 6302 __ align(CodeEntryAlignment); 6303 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id; 6304 StubCodeMark mark(this, stub_id); 6305 address start = __ pc(); 6306 __ enter(); 6307 6308 const Register coeffs = c_rarg0; 6309 6310 const Register kyberConsts = r10; 6311 const Register result = r11; 6312 6313 // As above we process 256 sets of values in total i.e. 32 x 6314 // 8H quadwords. So, we can load, add and store the data in 3 6315 // groups of 11, 11 and 10 at a time i.e. we need to map sets 6316 // of 10 or 11 registers. A further constraint is that the 6317 // mapping needs to skip callee saves. So, we allocate the 6318 // register sequences using two 8 sequences, two 2 sequences 6319 // and two single registers. 
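// For reference, a scalar sketch of what each pass of the loop below
// computes for every 16-bit coefficient c, where barrettMultiplier and
// kyber_q denote the constants loaded from the kyberConsts table:
//
//   int quotient = (c * barrettMultiplier) >> 26;  // sqdmulh gives (2*c*m) >> 16, sshr #11 the rest
//   c = c - quotient * kyber_q;                    // mlsv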
6320 VSeq<8> vs1_1(0); 6321 VSeq<2> vs1_2(16); 6322 FloatRegister vs1_3 = v28; 6323 VSeq<8> vs2_1(18); 6324 VSeq<2> vs2_2(26); 6325 FloatRegister vs2_3 = v29; 6326 6327 // we also need a pair of corresponding constant sequences 6328 6329 VSeq<8> vc1_1(30, 0); 6330 VSeq<2> vc1_2(30, 0); 6331 FloatRegister vc1_3 = v30; // for kyber_q 6332 6333 VSeq<8> vc2_1(31, 0); 6334 VSeq<2> vc2_2(31, 0); 6335 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier 6336 6337 __ add(result, coeffs, 0); 6338 __ lea(kyberConsts, 6339 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts)); 6340 6341 // load q and the multiplier for the Barrett reduction 6342 __ add(kyberConsts, kyberConsts, 16); 6343 __ ldpq(vc1_3, vc2_3, kyberConsts); 6344 6345 for (int i = 0; i < 3; i++) { 6346 // load 80 or 88 coefficients 6347 vs_ldpq_post(vs1_1, coeffs); 6348 vs_ldpq_post(vs1_2, coeffs); 6349 if (i < 2) { 6350 __ ldr(vs1_3, __ Q, __ post(coeffs, 16)); 6351 } 6352 6353 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16 6354 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1); 6355 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2); 6356 if (i < 2) { 6357 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3); 6358 } 6359 6360 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26 6361 vs_sshr(vs2_1, __ T8H, vs2_1, 11); 6362 vs_sshr(vs2_2, __ T8H, vs2_2, 11); 6363 if (i < 2) { 6364 __ sshr(vs2_3, __ T8H, vs2_3, 11); 6365 } 6366 6367 // vs1 <- vs1 - vs2 * kyber_q 6368 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1); 6369 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2); 6370 if (i < 2) { 6371 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3); 6372 } 6373 6374 vs_stpq_post(vs1_1, result); 6375 vs_stpq_post(vs1_2, result); 6376 if (i < 2) { 6377 __ str(vs1_3, __ Q, __ post(result, 16)); 6378 } 6379 } 6380 6381 __ leave(); // required for proper stackwalking of RuntimeStub frame 6382 __ mov(r0, zr); // return 0 6383 __ ret(lr); 6384 6385 return start; 6386 } 6387 6388 6389 // Dilithium-specific montmul helper routines that generate parallel 6390 // code for, respectively, a single 4x4s vector sequence montmul or 6391 // two such multiplies in a row. 6392 6393 // Perform 16 32-bit Montgomery multiplications in parallel 6394 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 6395 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6396 // Use the helper routine to schedule a 4x4S Montgomery multiply. 6397 // It will assert that the register use is valid 6398 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq); 6399 } 6400 6401 // Perform 2x16 32-bit Montgomery multiplications in parallel 6402 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 6403 const VSeq<4>& vtmp, const VSeq<2>& vq) { 6404 // Schedule two successive 4x4S multiplies via the montmul helper 6405 // on the front and back halves of va, vb and vc. The helper will 6406 // assert that the register use has no overlap conflicts on each 6407 // individual call but we also need to ensure that the necessary 6408 // disjoint/equality constraints are met across both calls. 6409 6410 // vb, vc, vtmp and vq must be disjoint. 
// va must either be
6411 // disjoint from all other registers or equal vc
6412
6413 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6414 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6415 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6416
6417 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6418 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6419
6420 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6421
6422 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6423 assert(vs_disjoint(va, vb), "va and vb overlap");
6424 assert(vs_disjoint(va, vq), "va and vq overlap");
6425 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6426
6427 // We multiply the front and back halves of each sequence 4 at a
6428 // time because
6429 //
6430 // 1) we are currently only able to get 4-way instruction
6431 // parallelism at best
6432 //
6433 // 2) we need registers for the constants in vq and temporary
6434 // scratch registers to hold intermediate results so vtmp can only
6435 // be a VSeq<4> which means we only have 4 scratch slots.
6436
6437 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6438 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6439 }
6440
6441 // Perform combined montmul then add/sub on 4x4S vectors.
6442 void dilithium_montmul16_sub_add(
6443 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6444 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6445 // compute a = montmul(a1, c)
6446 dilithium_montmul16(vc, va1, vc, vtmp, vq);
6447 // output a1 = a0 - a
6448 vs_subv(va1, __ T4S, va0, vc);
6449 // and a0 = a0 + a
6450 vs_addv(va0, __ T4S, va0, vc);
6451 }
6452
6453 // Perform combined add/sub then montmul on 4x4S vectors.
6454 void dilithium_sub_add_montmul16(
6455 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6456 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6457 // compute c = a0 - a1
6458 vs_subv(vtmp1, __ T4S, va0, va1);
6459 // output a0 = a0 + a1
6460 vs_addv(va0, __ T4S, va0, va1);
6461 // output a1 = b montmul c
6462 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6463 }
6464
6465 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6466 // in the Java implementation come in sequences of at least 8, so we
6467 // can use ldpq to collect the corresponding data into pairs of vector
6468 // registers.
6469 // We collect the coefficients corresponding to the 'j+l' indexes into
6470 // the vector registers v0-v7, the zetas into the vector registers v16-v23
6471 // then we do the (Montgomery) multiplications by the zetas in parallel
6472 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6473 // v0-v7, then do the additions into v24-v31 and the subtractions into
6474 // v0-v7 and finally save the results back to the coeffs array.
6475 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6476 const Register coeffs, const Register zetas) {
6477 int c1 = 0;
6478 int c2 = 512;
6479 int startIncr;
6480 // don't use callee save registers v8 - v15
6481 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6482 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6483 VSeq<2> vq(30); // n.b.
constants overlap vs3 6484 int offsets[4] = { 0, 32, 64, 96 }; 6485 6486 for (int level = 0; level < 5; level++) { 6487 int c1Start = c1; 6488 int c2Start = c2; 6489 if (level == 3) { 6490 offsets[1] = 32; 6491 offsets[2] = 128; 6492 offsets[3] = 160; 6493 } else if (level == 4) { 6494 offsets[1] = 64; 6495 offsets[2] = 128; 6496 offsets[3] = 192; 6497 } 6498 6499 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a 6500 // time at 4 different offsets and multiply them in order by the 6501 // next set of input values. So we employ indexed load and store 6502 // pair instructions with arrangement 4S. 6503 for (int i = 0; i < 4; i++) { 6504 // reload q and qinv 6505 vs_ldpq(vq, dilithiumConsts); // qInv, q 6506 // load 8x4S coefficients via second start pos == c2 6507 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets); 6508 // load next 8x4S inputs == b 6509 vs_ldpq_post(vs2, zetas); 6510 // compute a == c2 * b mod MONT_Q 6511 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6512 // load 8x4s coefficients via first start pos == c1 6513 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6514 // compute a1 = c1 + a 6515 vs_addv(vs3, __ T4S, vs1, vs2); 6516 // compute a2 = c1 - a 6517 vs_subv(vs1, __ T4S, vs1, vs2); 6518 // output a1 and a2 6519 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6520 vs_stpq_indexed(vs1, coeffs, c2Start, offsets); 6521 6522 int k = 4 * level + i; 6523 6524 if (k > 7) { 6525 startIncr = 256; 6526 } else if (k == 5) { 6527 startIncr = 384; 6528 } else { 6529 startIncr = 128; 6530 } 6531 6532 c1Start += startIncr; 6533 c2Start += startIncr; 6534 } 6535 6536 c2 /= 2; 6537 } 6538 } 6539 6540 // Dilithium NTT function except for the final "normalization" to |coeff| < Q. 6541 // Implements the method 6542 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {} 6543 // of the Java class sun.security.provider 6544 // 6545 // coeffs (int[256]) = c_rarg0 6546 // zetas (int[256]) = c_rarg1 6547 address generate_dilithiumAlmostNtt() { 6548 6549 __ align(CodeEntryAlignment); 6550 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id; 6551 StubCodeMark mark(this, stub_id); 6552 address start = __ pc(); 6553 __ enter(); 6554 6555 const Register coeffs = c_rarg0; 6556 const Register zetas = c_rarg1; 6557 6558 const Register tmpAddr = r9; 6559 const Register dilithiumConsts = r10; 6560 const Register result = r11; 6561 // don't use callee save registers v8 - v15 6562 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6563 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6564 VSeq<2> vq(30); // n.b. constants overlap vs3 6565 int offsets[4] = { 0, 32, 64, 96}; 6566 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6567 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6568 __ add(result, coeffs, 0); 6569 __ lea(dilithiumConsts, 6570 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6571 6572 // Each level represents one iteration of the outer for loop of the Java version. 6573 6574 // level 0-4 6575 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas); 6576 6577 // level 5 6578 6579 // At level 5 the coefficients we need to combine with the zetas 6580 // are grouped in memory in blocks of size 4. So, for both sets of 6581 // coefficients we load 4 adjacent values at 8 different offsets 6582 // using an indexed ldr with register variant Q and multiply them 6583 // in sequence order by the next set of inputs. Likewise we store 6584 // the resuls using an indexed str with register variant Q. 
6585 for (int i = 0; i < 1024; i += 256) {
6586 // reload constants q, qinv each iteration as they get clobbered later
6587 vs_ldpq(vq, dilithiumConsts); // qInv, q
6588 // load 32 (8x4S) coefficients via first offsets = c1
6589 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6590 // load next 32 (8x4S) inputs = b
6591 vs_ldpq_post(vs2, zetas);
6592 // a = b montmul c1
6593 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6594 // load 32 (8x4S) coefficients via second offsets = c2
6595 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
6596 // add/sub with result of multiply
6597 vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a
6598 vs_subv(vs1, __ T4S, vs1, vs2); // a2 = c2 - a
6599 // write back new coefficients using same offsets
6600 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
6601 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
6602 }
6603
6604 // level 6
6605 // At level 6 the coefficients we need to combine with the zetas
6606 // are grouped in memory in pairs, the first two being add/sub
6607 // inputs and the second two montmul inputs. We can still implement
6608 // the montmul+sub+add using 4-way parallelism but only if we
6609 // combine the coefficients with the zetas 16 at a time. We load 8
6610 // adjacent values at 4 different offsets using an ld2 load with
6611 // arrangement 2D. That interleaves the lower and upper halves of
6612 // each pair of quadwords into successive vector registers. We
6613 // then need to montmul the 4 even elements of the coefficients
6614 // register sequence by the zetas in order and then add/sub the 4
6615 // odd elements of the coefficients register sequence. We use an
6616 // equivalent st2 operation to store the results back into memory
6617 // de-interleaved.
6618 for (int i = 0; i < 1024; i += 128) {
6619 // reload constants q, qinv each iteration as they get clobbered later
6620 vs_ldpq(vq, dilithiumConsts); // qInv, q
6621 // load interleaved 16 (4x2D) coefficients via offsets
6622 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6623 // load next 16 (4x4S) inputs
6624 vs_ldpq_post(vs_front(vs2), zetas);
6625 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6626 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6627 vs_front(vs2), vtmp, vq);
6628 // store interleaved 16 (4x2D) coefficients via offsets
6629 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6630 }
6631
6632 // level 7
6633 // At level 7 the coefficients we need to combine with the zetas
6634 // occur singly with montmul inputs alternating with add/sub
6635 // inputs. Once again we can use 4-way parallelism to combine 16
6636 // zetas at a time. However, we have to load 8 adjacent values at
6637 // 4 different offsets using an ld2 load with arrangement 4S. That
6638 // interleaves the odd words of each pair into one
6639 // coefficients vector register and the even words of the pair
6640 // into the next register. We then need to montmul the 4 even
6641 // elements of the coefficients register sequence by the zetas in
6642 // order and then add/sub the 4 odd elements of the coefficients
6643 // register sequence. We use an equivalent st2 operation to store
6644 // the results back into memory de-interleaved.
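// n.b. for reference, an ld2 with arrangement 4S de-interleaves 8
// adjacent words w0..w7 so that w0,w2,w4,w6 land in the first register
// of each pair (the add/sub inputs at this level) and w1,w3,w5,w7 in
// the second (the montmul inputs); the matching st2 restores the
// interleaved memory order on the way back out.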
6645 6646 for (int i = 0; i < 1024; i += 128) { 6647 // reload constants q, qinv each iteration as they get clobbered later 6648 vs_ldpq(vq, dilithiumConsts); // qInv, q 6649 // load interleaved 16 (4x4S) coefficients via offsets 6650 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6651 // load next 16 (4x4S) inputs 6652 vs_ldpq_post(vs_front(vs2), zetas); 6653 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 6654 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 6655 vs_front(vs2), vtmp, vq); 6656 // store interleaved 16 (4x4S) coefficients via offsets 6657 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6658 } 6659 __ leave(); // required for proper stackwalking of RuntimeStub frame 6660 __ mov(r0, zr); // return 0 6661 __ ret(lr); 6662 6663 return start; 6664 } 6665 6666 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 6667 // in the Java implementation come in sequences of at least 8, so we 6668 // can use ldpq to collect the corresponding data into pairs of vector 6669 // registers 6670 // We collect the coefficients that correspond to the 'j's into vs1 6671 // the coefficiets that correspond to the 'j+l's into vs2 then 6672 // do the additions into vs3 and the subtractions into vs1 then 6673 // save the result of the additions, load the zetas into vs2 6674 // do the (Montgomery) multiplications by zeta in parallel into vs2 6675 // finally save the results back to the coeffs array 6676 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts, 6677 const Register coeffs, const Register zetas) { 6678 int c1 = 0; 6679 int c2 = 32; 6680 int startIncr; 6681 int offsets[4]; 6682 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6683 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6684 VSeq<2> vq(30); // n.b. constants overlap vs3 6685 6686 offsets[0] = 0; 6687 6688 for (int level = 3; level < 8; level++) { 6689 int c1Start = c1; 6690 int c2Start = c2; 6691 if (level == 3) { 6692 offsets[1] = 64; 6693 offsets[2] = 128; 6694 offsets[3] = 192; 6695 } else if (level == 4) { 6696 offsets[1] = 32; 6697 offsets[2] = 128; 6698 offsets[3] = 160; 6699 } else { 6700 offsets[1] = 32; 6701 offsets[2] = 64; 6702 offsets[3] = 96; 6703 } 6704 6705 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a 6706 // time at 4 different offsets and multiply them in order by the 6707 // next set of input values. So we employ indexed load and store 6708 // pair instructions with arrangement 4S. 6709 for (int i = 0; i < 4; i++) { 6710 // load v1 32 (8x4S) coefficients relative to first start index 6711 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 6712 // load v2 32 (8x4S) coefficients relative to second start index 6713 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets); 6714 // a0 = v1 + v2 -- n.b. 
clobbers vqs 6715 vs_addv(vs3, __ T4S, vs1, vs2); 6716 // a1 = v1 - v2 6717 vs_subv(vs1, __ T4S, vs1, vs2); 6718 // save a1 relative to first start index 6719 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 6720 // load constants q, qinv each iteration as they get clobbered above 6721 vs_ldpq(vq, dilithiumConsts); // qInv, q 6722 // load b next 32 (8x4S) inputs 6723 vs_ldpq_post(vs2, zetas); 6724 // a = a1 montmul b 6725 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6726 // save a relative to second start index 6727 vs_stpq_indexed(vs2, coeffs, c2Start, offsets); 6728 6729 int k = 4 * level + i; 6730 6731 if (k < 24) { 6732 startIncr = 256; 6733 } else if (k == 25) { 6734 startIncr = 384; 6735 } else { 6736 startIncr = 128; 6737 } 6738 6739 c1Start += startIncr; 6740 c2Start += startIncr; 6741 } 6742 6743 c2 *= 2; 6744 } 6745 } 6746 6747 // Dilithium Inverse NTT function except the final mod Q division by 2^256. 6748 // Implements the method 6749 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of 6750 // the sun.security.provider.ML_DSA class. 6751 // 6752 // coeffs (int[256]) = c_rarg0 6753 // zetas (int[256]) = c_rarg1 6754 address generate_dilithiumAlmostInverseNtt() { 6755 6756 __ align(CodeEntryAlignment); 6757 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id; 6758 StubCodeMark mark(this, stub_id); 6759 address start = __ pc(); 6760 __ enter(); 6761 6762 const Register coeffs = c_rarg0; 6763 const Register zetas = c_rarg1; 6764 6765 const Register tmpAddr = r9; 6766 const Register dilithiumConsts = r10; 6767 const Register result = r11; 6768 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6769 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6770 VSeq<2> vq(30); // n.b. constants overlap vs3 6771 int offsets[4] = { 0, 32, 64, 96 }; 6772 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 6773 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 6774 6775 __ add(result, coeffs, 0); 6776 __ lea(dilithiumConsts, 6777 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6778 6779 // Each level represents one iteration of the outer for loop of the Java version 6780 6781 // level 0 6782 // At level 0 we need to interleave adjacent quartets of 6783 // coefficients before we multiply and add/sub by the next 16 6784 // zetas just as we did for level 7 in the multiply code. So we 6785 // load and store the values using an ld2/st2 with arrangement 4S. 6786 for (int i = 0; i < 1024; i += 128) { 6787 // load constants q, qinv 6788 // n.b. this can be moved out of the loop as they do not get 6789 // clobbered by first two loops 6790 vs_ldpq(vq, dilithiumConsts); // qInv, q 6791 // a0/a1 load interleaved 32 (8x4S) coefficients 6792 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6793 // b load next 32 (8x4S) inputs 6794 vs_ldpq_post(vs_front(vs2), zetas); 6795 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6796 // n.b. second half of vs2 provides temporary register storage 6797 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6798 vs_front(vs2), vs_back(vs2), vtmp, vq); 6799 // a0/a1 store interleaved 32 (8x4S) coefficients 6800 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 6801 } 6802 6803 // level 1 6804 // At level 1 we need to interleave pairs of adjacent pairs of 6805 // coefficients before we multiply by the next 16 zetas just as we 6806 // did for level 6 in the multiply code. So we load and store the 6807 // values an ld2/st2 with arrangement 2D. 
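// For reference, each of the inverse levels below applies the scalar
// Gentleman-Sande style butterfly to every (coeffs[j], coeffs[j + l])
// pair; montMul is a stand-in name for the Montgomery multiplication
// performed by dilithium_sub_add_montmul16/dilithium_montmul32:
//
//   int t = coeffs[j];
//   coeffs[j]     = t + coeffs[j + l];
//   coeffs[j + l] = montMul(t - coeffs[j + l], zetas[k]);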
6808 for (int i = 0; i < 1024; i += 128) { 6809 // a0/a1 load interleaved 32 (8x2D) coefficients 6810 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6811 // b load next 16 (4x4S) inputs 6812 vs_ldpq_post(vs_front(vs2), zetas); 6813 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 6814 // n.b. second half of vs2 provides temporary register storage 6815 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 6816 vs_front(vs2), vs_back(vs2), vtmp, vq); 6817 // a0/a1 store interleaved 32 (8x2D) coefficients 6818 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 6819 } 6820 6821 // level 2 6822 // At level 2 coefficients come in blocks of 4. So, we load 4 6823 // adjacent coefficients at 8 distinct offsets for both the first 6824 // and second coefficient sequences, using an ldr with register 6825 // variant Q then combine them with next set of 32 zetas. Likewise 6826 // we store the results using an str with register variant Q. 6827 for (int i = 0; i < 1024; i += 256) { 6828 // c0 load 32 (8x4S) coefficients via first offsets 6829 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 6830 // c1 load 32 (8x4S) coefficients via second offsets 6831 vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2); 6832 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3 6833 vs_addv(vs3, __ T4S, vs1, vs2); 6834 // c = c0 - c1 6835 vs_subv(vs1, __ T4S, vs1, vs2); 6836 // store a0 32 (8x4S) coefficients via first offsets 6837 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1); 6838 // b load 32 (8x4S) next inputs 6839 vs_ldpq_post(vs2, zetas); 6840 // reload constants q, qinv -- they were clobbered earlier 6841 vs_ldpq(vq, dilithiumConsts); // qInv, q 6842 // compute a1 = b montmul c 6843 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6844 // store a1 32 (8x4S) coefficients via second offsets 6845 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2); 6846 } 6847 6848 // level 3-7 6849 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas); 6850 6851 __ leave(); // required for proper stackwalking of RuntimeStub frame 6852 __ mov(r0, zr); // return 0 6853 __ ret(lr); 6854 6855 return start; 6856 } 6857 6858 // Dilithium multiply polynomials in the NTT domain. 6859 // Straightforward implementation of the method 6860 // static int implDilithiumNttMult( 6861 // int[] result, int[] ntta, int[] nttb {} of 6862 // the sun.security.provider.ML_DSA class. 6863 // 6864 // result (int[256]) = c_rarg0 6865 // poly1 (int[256]) = c_rarg1 6866 // poly2 (int[256]) = c_rarg2 6867 address generate_dilithiumNttMult() { 6868 6869 __ align(CodeEntryAlignment); 6870 StubId stub_id = StubId::stubgen_dilithiumNttMult_id; 6871 StubCodeMark mark(this, stub_id); 6872 address start = __ pc(); 6873 __ enter(); 6874 6875 Label L_loop; 6876 6877 const Register result = c_rarg0; 6878 const Register poly1 = c_rarg1; 6879 const Register poly2 = c_rarg2; 6880 6881 const Register dilithiumConsts = r10; 6882 const Register len = r11; 6883 6884 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6885 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6886 VSeq<2> vq(30); // n.b. 
constants overlap vs3 6887 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE 6888 6889 __ lea(dilithiumConsts, 6890 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6891 6892 // load constants q, qinv 6893 vs_ldpq(vq, dilithiumConsts); // qInv, q 6894 // load constant rSquare into v29 6895 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare 6896 6897 __ mov(len, zr); 6898 __ add(len, len, 1024); 6899 6900 __ BIND(L_loop); 6901 6902 // b load 32 (8x4S) next inputs from poly1 6903 vs_ldpq_post(vs1, poly1); 6904 // c load 32 (8x4S) next inputs from poly2 6905 vs_ldpq_post(vs2, poly2); 6906 // compute a = b montmul c 6907 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq); 6908 // compute a = rsquare montmul a 6909 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq); 6910 // save a 32 (8x4S) results 6911 vs_stpq_post(vs2, result); 6912 6913 __ sub(len, len, 128); 6914 __ cmp(len, (u1)128); 6915 __ br(Assembler::GE, L_loop); 6916 6917 __ leave(); // required for proper stackwalking of RuntimeStub frame 6918 __ mov(r0, zr); // return 0 6919 __ ret(lr); 6920 6921 return start; 6922 } 6923 6924 // Dilithium Motgomery multiply an array by a constant. 6925 // A straightforward implementation of the method 6926 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {} 6927 // of the sun.security.provider.MLDSA class 6928 // 6929 // coeffs (int[256]) = c_rarg0 6930 // constant (int) = c_rarg1 6931 address generate_dilithiumMontMulByConstant() { 6932 6933 __ align(CodeEntryAlignment); 6934 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id; 6935 StubCodeMark mark(this, stub_id); 6936 address start = __ pc(); 6937 __ enter(); 6938 6939 Label L_loop; 6940 6941 const Register coeffs = c_rarg0; 6942 const Register constant = c_rarg1; 6943 6944 const Register dilithiumConsts = r10; 6945 const Register result = r11; 6946 const Register len = r12; 6947 6948 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 6949 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 6950 VSeq<2> vq(30); // n.b. constants overlap vs3 6951 VSeq<8> vconst(29, 0); // for montmul by constant 6952 6953 // results track inputs 6954 __ add(result, coeffs, 0); 6955 __ lea(dilithiumConsts, 6956 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 6957 6958 // load constants q, qinv -- they do not get clobbered by first two loops 6959 vs_ldpq(vq, dilithiumConsts); // qInv, q 6960 // copy caller supplied constant across vconst 6961 __ dup(vconst[0], __ T4S, constant); 6962 __ mov(len, zr); 6963 __ add(len, len, 1024); 6964 6965 __ BIND(L_loop); 6966 6967 // load next 32 inputs 6968 vs_ldpq_post(vs2, coeffs); 6969 // mont mul by constant 6970 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq); 6971 // write next 32 results 6972 vs_stpq_post(vs2, result); 6973 6974 __ sub(len, len, 128); 6975 __ cmp(len, (u1)128); 6976 __ br(Assembler::GE, L_loop); 6977 6978 __ leave(); // required for proper stackwalking of RuntimeStub frame 6979 __ mov(r0, zr); // return 0 6980 __ ret(lr); 6981 6982 return start; 6983 } 6984 6985 // Dilithium decompose poly. 
6986 // Implements the method 6987 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {} 6988 // of the sun.security.provider.ML_DSA class 6989 // 6990 // input (int[256]) = c_rarg0 6991 // lowPart (int[256]) = c_rarg1 6992 // highPart (int[256]) = c_rarg2 6993 // twoGamma2 (int) = c_rarg3 6994 // multiplier (int) = c_rarg4 6995 address generate_dilithiumDecomposePoly() { 6996 6997 __ align(CodeEntryAlignment); 6998 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id; 6999 StubCodeMark mark(this, stub_id); 7000 address start = __ pc(); 7001 Label L_loop; 7002 7003 const Register input = c_rarg0; 7004 const Register lowPart = c_rarg1; 7005 const Register highPart = c_rarg2; 7006 const Register twoGamma2 = c_rarg3; 7007 const Register multiplier = c_rarg4; 7008 7009 const Register len = r9; 7010 const Register dilithiumConsts = r10; 7011 const Register tmp = r11; 7012 7013 // 6 independent sets of 4x4s values 7014 VSeq<4> vs1(0), vs2(4), vs3(8); 7015 VSeq<4> vs4(12), vs5(16), vtmp(20); 7016 7017 // 7 constants for cross-multiplying 7018 VSeq<4> one(25, 0); 7019 VSeq<4> qminus1(26, 0); 7020 VSeq<4> g2(27, 0); 7021 VSeq<4> twog2(28, 0); 7022 VSeq<4> mult(29, 0); 7023 VSeq<4> q(30, 0); 7024 VSeq<4> qadd(31, 0); 7025 7026 __ enter(); 7027 7028 __ lea(dilithiumConsts, 7029 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 7030 7031 // save callee-saved registers 7032 __ stpd(v8, v9, __ pre(sp, -64)); 7033 __ stpd(v10, v11, Address(sp, 16)); 7034 __ stpd(v12, v13, Address(sp, 32)); 7035 __ stpd(v14, v15, Address(sp, 48)); 7036 7037 // populate constant registers 7038 __ mov(tmp, zr); 7039 __ add(tmp, tmp, 1); 7040 __ dup(one[0], __ T4S, tmp); // 1 7041 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q 7042 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce 7043 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2 7044 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce 7045 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1 7046 __ sshr(g2[0], __ T4S, v28, 1); // gamma2 7047 7048 __ mov(len, zr); 7049 __ add(len, len, 1024); 7050 7051 __ BIND(L_loop); 7052 7053 // load next 4x4S inputs interleaved: rplus --> vs1 7054 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64)); 7055 7056 // rplus = rplus - ((rplus + qadd) >> 23) * q 7057 vs_addv(vtmp, __ T4S, vs1, qadd); 7058 vs_sshr(vtmp, __ T4S, vtmp, 23); 7059 vs_mulv(vtmp, __ T4S, vtmp, q); 7060 vs_subv(vs1, __ T4S, vs1, vtmp); 7061 7062 // rplus = rplus + ((rplus >> 31) & dilithium_q); 7063 vs_sshr(vtmp, __ T4S, vs1, 31); 7064 vs_andr(vtmp, vtmp, q); 7065 vs_addv(vs1, __ T4S, vs1, vtmp); 7066 7067 // quotient --> vs2 7068 // int quotient = (rplus * multiplier) >> 22; 7069 vs_mulv(vtmp, __ T4S, vs1, mult); 7070 vs_sshr(vs2, __ T4S, vtmp, 22); 7071 7072 // r0 --> vs3 7073 // int r0 = rplus - quotient * twoGamma2; 7074 vs_mulv(vtmp, __ T4S, vs2, twog2); 7075 vs_subv(vs3, __ T4S, vs1, vtmp); 7076 7077 // mask --> vs4 7078 // int mask = (twoGamma2 - r0) >> 22; 7079 vs_subv(vtmp, __ T4S, twog2, vs3); 7080 vs_sshr(vs4, __ T4S, vtmp, 22); 7081 7082 // r0 -= (mask & twoGamma2); 7083 vs_andr(vtmp, vs4, twog2); 7084 vs_subv(vs3, __ T4S, vs3, vtmp); 7085 7086 // quotient += (mask & 1); 7087 vs_andr(vtmp, vs4, one); 7088 vs_addv(vs2, __ T4S, vs2, vtmp); 7089 7090 // mask = (twoGamma2 / 2 - r0) >> 31; 7091 vs_subv(vtmp, __ T4S, g2, vs3); 7092 vs_sshr(vs4, __ T4S, vtmp, 31); 7093 7094 // r0 -= (mask & twoGamma2); 7095 vs_andr(vtmp, vs4, twog2); 7096 
vs_subv(vs3, __ T4S, vs3, vtmp); 7097 7098 // quotient += (mask & 1); 7099 vs_andr(vtmp, vs4, one); 7100 vs_addv(vs2, __ T4S, vs2, vtmp); 7101 7102 // r1 --> vs5 7103 // int r1 = rplus - r0 - (dilithium_q - 1); 7104 vs_subv(vtmp, __ T4S, vs1, vs3); 7105 vs_subv(vs5, __ T4S, vtmp, qminus1); 7106 7107 // r1 --> vs1 (overwriting rplus) 7108 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 7109 vs_negr(vtmp, __ T4S, vs5); 7110 vs_orr(vtmp, vs5, vtmp); 7111 vs_sshr(vs1, __ T4S, vtmp, 31); 7112 7113 // r0 += ~r1; 7114 vs_notr(vtmp, vs1); 7115 vs_addv(vs3, __ T4S, vs3, vtmp); 7116 7117 // r1 = r1 & quotient; 7118 vs_andr(vs1, vs2, vs1); 7119 7120 // store results inteleaved 7121 // lowPart[m] = r0; 7122 // highPart[m] = r1; 7123 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64)); 7124 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64)); 7125 7126 __ sub(len, len, 64); 7127 __ cmp(len, (u1)64); 7128 __ br(Assembler::GE, L_loop); 7129 7130 // restore callee-saved vector registers 7131 __ ldpd(v14, v15, Address(sp, 48)); 7132 __ ldpd(v12, v13, Address(sp, 32)); 7133 __ ldpd(v10, v11, Address(sp, 16)); 7134 __ ldpd(v8, v9, __ post(sp, 64)); 7135 7136 __ leave(); // required for proper stackwalking of RuntimeStub frame 7137 __ mov(r0, zr); // return 0 7138 __ ret(lr); 7139 7140 return start; 7141 } 7142 7143 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4, 7144 Register tmp0, Register tmp1, Register tmp2) { 7145 __ bic(tmp0, a2, a1); // for a0 7146 __ bic(tmp1, a3, a2); // for a1 7147 __ bic(tmp2, a4, a3); // for a2 7148 __ eor(a2, a2, tmp2); 7149 __ bic(tmp2, a0, a4); // for a3 7150 __ eor(a3, a3, tmp2); 7151 __ bic(tmp2, a1, a0); // for a4 7152 __ eor(a0, a0, tmp0); 7153 __ eor(a1, a1, tmp1); 7154 __ eor(a4, a4, tmp2); 7155 } 7156 7157 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc, 7158 Register a0, Register a1, Register a2, Register a3, Register a4, 7159 Register a5, Register a6, Register a7, Register a8, Register a9, 7160 Register a10, Register a11, Register a12, Register a13, Register a14, 7161 Register a15, Register a16, Register a17, Register a18, Register a19, 7162 Register a20, Register a21, Register a22, Register a23, Register a24, 7163 Register tmp0, Register tmp1, Register tmp2) { 7164 __ eor3(tmp1, a4, a9, a14); 7165 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4 7166 __ eor3(tmp2, a1, a6, a11); 7167 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1 7168 __ rax1(tmp2, tmp0, tmp1); // d0 7169 { 7170 7171 Register tmp3, tmp4; 7172 if (can_use_fp && can_use_r18) { 7173 tmp3 = rfp; 7174 tmp4 = r18_tls; 7175 } else { 7176 tmp3 = a4; 7177 tmp4 = a9; 7178 __ stp(tmp3, tmp4, __ pre(sp, -16)); 7179 } 7180 7181 __ eor3(tmp3, a0, a5, a10); 7182 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0 7183 __ eor(a0, a0, tmp2); 7184 __ eor(a5, a5, tmp2); 7185 __ eor(a10, a10, tmp2); 7186 __ eor(a15, a15, tmp2); 7187 __ eor(a20, a20, tmp2); // d0(tmp2) 7188 __ eor3(tmp3, a2, a7, a12); 7189 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2 7190 __ rax1(tmp3, tmp4, tmp2); // d1 7191 __ eor(a1, a1, tmp3); 7192 __ eor(a6, a6, tmp3); 7193 __ eor(a11, a11, tmp3); 7194 __ eor(a16, a16, tmp3); 7195 __ eor(a21, a21, tmp3); // d1(tmp3) 7196 __ rax1(tmp3, tmp2, tmp0); // d3 7197 __ eor3(tmp2, a3, a8, a13); 7198 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3 7199 __ eor(a3, a3, tmp3); 7200 __ eor(a8, a8, tmp3); 7201 __ eor(a13, 
a13, tmp3); 7202 __ eor(a18, a18, tmp3); 7203 __ eor(a23, a23, tmp3); 7204 __ rax1(tmp2, tmp1, tmp0); // d2 7205 __ eor(a2, a2, tmp2); 7206 __ eor(a7, a7, tmp2); 7207 __ eor(a12, a12, tmp2); 7208 __ rax1(tmp0, tmp0, tmp4); // d4 7209 if (!can_use_fp || !can_use_r18) { 7210 __ ldp(tmp3, tmp4, __ post(sp, 16)); 7211 } 7212 __ eor(a17, a17, tmp2); 7213 __ eor(a22, a22, tmp2); 7214 __ eor(a4, a4, tmp0); 7215 __ eor(a9, a9, tmp0); 7216 __ eor(a14, a14, tmp0); 7217 __ eor(a19, a19, tmp0); 7218 __ eor(a24, a24, tmp0); 7219 } 7220 7221 __ rol(tmp0, a10, 3); 7222 __ rol(a10, a1, 1); 7223 __ rol(a1, a6, 44); 7224 __ rol(a6, a9, 20); 7225 __ rol(a9, a22, 61); 7226 __ rol(a22, a14, 39); 7227 __ rol(a14, a20, 18); 7228 __ rol(a20, a2, 62); 7229 __ rol(a2, a12, 43); 7230 __ rol(a12, a13, 25); 7231 __ rol(a13, a19, 8) ; 7232 __ rol(a19, a23, 56); 7233 __ rol(a23, a15, 41); 7234 __ rol(a15, a4, 27); 7235 __ rol(a4, a24, 14); 7236 __ rol(a24, a21, 2); 7237 __ rol(a21, a8, 55); 7238 __ rol(a8, a16, 45); 7239 __ rol(a16, a5, 36); 7240 __ rol(a5, a3, 28); 7241 __ rol(a3, a18, 21); 7242 __ rol(a18, a17, 15); 7243 __ rol(a17, a11, 10); 7244 __ rol(a11, a7, 6); 7245 __ mov(a7, tmp0); 7246 7247 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2); 7248 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2); 7249 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2); 7250 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2); 7251 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2); 7252 7253 __ ldr(tmp1, __ post(rc, 8)); 7254 __ eor(a0, a0, tmp1); 7255 7256 } 7257 7258 // Arguments: 7259 // 7260 // Inputs: 7261 // c_rarg0 - byte[] source+offset 7262 // c_rarg1 - byte[] SHA.state 7263 // c_rarg2 - int block_size 7264 // c_rarg3 - int offset 7265 // c_rarg4 - int limit 7266 // 7267 address generate_sha3_implCompress_gpr(StubId stub_id) { 7268 bool multi_block; 7269 switch (stub_id) { 7270 case StubId::stubgen_sha3_implCompress_id: 7271 multi_block = false; 7272 break; 7273 case StubId::stubgen_sha3_implCompressMB_id: 7274 multi_block = true; 7275 break; 7276 default: 7277 ShouldNotReachHere(); 7278 } 7279 7280 static const uint64_t round_consts[24] = { 7281 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 7282 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 7283 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 7284 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 7285 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 7286 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 7287 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 7288 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 7289 }; 7290 7291 __ align(CodeEntryAlignment); 7292 StubCodeMark mark(this, stub_id); 7293 address start = __ pc(); 7294 7295 Register buf = c_rarg0; 7296 Register state = c_rarg1; 7297 Register block_size = c_rarg2; 7298 Register ofs = c_rarg3; 7299 Register limit = c_rarg4; 7300 7301 // use r3.r17,r19..r28 to keep a0..a24. 
7302 // a0..a24 are respective locals from SHA3.java
7303 Register a0 = r25,
7304 a1 = r26,
7305 a2 = r27,
7306 a3 = r3,
7307 a4 = r4,
7308 a5 = r5,
7309 a6 = r6,
7310 a7 = r7,
7311 a8 = rscratch1, // r8
7312 a9 = rscratch2, // r9
7313 a10 = r10,
7314 a11 = r11,
7315 a12 = r12,
7316 a13 = r13,
7317 a14 = r14,
7318 a15 = r15,
7319 a16 = r16,
7320 a17 = r17,
7321 a18 = r28,
7322 a19 = r19,
7323 a20 = r20,
7324 a21 = r21,
7325 a22 = r22,
7326 a23 = r23,
7327 a24 = r24;
7328
7329 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
7330
7331 Label sha3_loop, rounds24_preloop, loop_body;
7332 Label sha3_512_or_sha3_384, shake128;
7333
7334 bool can_use_r18 = false;
7335 #ifndef R18_RESERVED
7336 can_use_r18 = true;
7337 #endif
7338 bool can_use_fp = !PreserveFramePointer;
7339
7340 __ enter();
7341
7342 // save almost all yet unsaved gpr registers on stack
7343 __ str(block_size, __ pre(sp, -128));
7344 if (multi_block) {
7345 __ stpw(ofs, limit, Address(sp, 8));
7346 }
7347 // 8 bytes at sp+16 will be used to keep buf
7348 __ stp(r19, r20, Address(sp, 32));
7349 __ stp(r21, r22, Address(sp, 48));
7350 __ stp(r23, r24, Address(sp, 64));
7351 __ stp(r25, r26, Address(sp, 80));
7352 __ stp(r27, r28, Address(sp, 96));
7353 if (can_use_r18 && can_use_fp) {
7354 __ stp(r18_tls, state, Address(sp, 112));
7355 } else {
7356 __ str(state, Address(sp, 112));
7357 }
7358
7359 // begin sha3 calculations: loading a0..a24 from state array
7360 __ ldp(a0, a1, state);
7361 __ ldp(a2, a3, Address(state, 16));
7362 __ ldp(a4, a5, Address(state, 32));
7363 __ ldp(a6, a7, Address(state, 48));
7364 __ ldp(a8, a9, Address(state, 64));
7365 __ ldp(a10, a11, Address(state, 80));
7366 __ ldp(a12, a13, Address(state, 96));
7367 __ ldp(a14, a15, Address(state, 112));
7368 __ ldp(a16, a17, Address(state, 128));
7369 __ ldp(a18, a19, Address(state, 144));
7370 __ ldp(a20, a21, Address(state, 160));
7371 __ ldp(a22, a23, Address(state, 176));
7372 __ ldr(a24, Address(state, 192));
7373
7374 __ BIND(sha3_loop);
7375
7376 // load input
7377 __ ldp(tmp3, tmp2, __ post(buf, 16));
7378 __ eor(a0, a0, tmp3);
7379 __ eor(a1, a1, tmp2);
7380 __ ldp(tmp3, tmp2, __ post(buf, 16));
7381 __ eor(a2, a2, tmp3);
7382 __ eor(a3, a3, tmp2);
7383 __ ldp(tmp3, tmp2, __ post(buf, 16));
7384 __ eor(a4, a4, tmp3);
7385 __ eor(a5, a5, tmp2);
7386 __ ldr(tmp3, __ post(buf, 8));
7387 __ eor(a6, a6, tmp3);
7388
7389 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
7390 __ tbz(block_size, 7, sha3_512_or_sha3_384);
7391
7392 __ ldp(tmp3, tmp2, __ post(buf, 16));
7393 __ eor(a7, a7, tmp3);
7394 __ eor(a8, a8, tmp2);
7395 __ ldp(tmp3, tmp2, __ post(buf, 16));
7396 __ eor(a9, a9, tmp3);
7397 __ eor(a10, a10, tmp2);
7398 __ ldp(tmp3, tmp2, __ post(buf, 16));
7399 __ eor(a11, a11, tmp3);
7400 __ eor(a12, a12, tmp2);
7401 __ ldp(tmp3, tmp2, __ post(buf, 16));
7402 __ eor(a13, a13, tmp3);
7403 __ eor(a14, a14, tmp2);
7404 __ ldp(tmp3, tmp2, __ post(buf, 16));
7405 __ eor(a15, a15, tmp3);
7406 __ eor(a16, a16, tmp2);
7407
7408 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
7409 __ andw(tmp2, block_size, 48);
7410 __ cbzw(tmp2, rounds24_preloop);
7411 __ tbnz(block_size, 5, shake128);
7412 // block_size == 144, bit5 == 0, SHA3-224
7413 __ ldr(tmp3, __ post(buf, 8));
7414 __ eor(a17, a17, tmp3);
7415 __ b(rounds24_preloop);
7416
7417 __ BIND(shake128);
7418 __ ldp(tmp3, tmp2, __ post(buf, 16));
7419 __ eor(a17, a17, tmp3);
7420 __ eor(a18, a18, tmp2);
7421 __ ldp(tmp3, tmp2, __ post(buf, 16));
7422 __ eor(a19, a19,
tmp3); 7423 __ eor(a20, a20, tmp2); 7424 __ b(rounds24_preloop); // block_size == 168, SHAKE128 7425 7426 __ BIND(sha3_512_or_sha3_384); 7427 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7428 __ eor(a7, a7, tmp3); 7429 __ eor(a8, a8, tmp2); 7430 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512 7431 7432 // SHA3-384 7433 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7434 __ eor(a9, a9, tmp3); 7435 __ eor(a10, a10, tmp2); 7436 __ ldp(tmp3, tmp2, __ post(buf, 16)); 7437 __ eor(a11, a11, tmp3); 7438 __ eor(a12, a12, tmp2); 7439 7440 __ BIND(rounds24_preloop); 7441 __ fmovs(v0, 24.0); // float loop counter, 7442 __ fmovs(v1, 1.0); // exact representation 7443 7444 __ str(buf, Address(sp, 16)); 7445 __ lea(tmp3, ExternalAddress((address) round_consts)); 7446 7447 __ BIND(loop_body); 7448 keccak_round_gpr(can_use_fp, can_use_r18, tmp3, 7449 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, 7450 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, 7451 tmp0, tmp1, tmp2); 7452 __ fsubs(v0, v0, v1); 7453 __ fcmps(v0, 0.0); 7454 __ br(__ NE, loop_body); 7455 7456 if (multi_block) { 7457 __ ldrw(block_size, sp); // block_size 7458 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit 7459 __ addw(tmp2, tmp2, block_size); 7460 __ cmpw(tmp2, tmp1); 7461 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping 7462 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping 7463 __ br(Assembler::LE, sha3_loop); 7464 __ movw(c_rarg0, tmp2); // return offset 7465 } 7466 if (can_use_fp && can_use_r18) { 7467 __ ldp(r18_tls, state, Address(sp, 112)); 7468 } else { 7469 __ ldr(state, Address(sp, 112)); 7470 } 7471 // save calculated sha3 state 7472 __ stp(a0, a1, Address(state)); 7473 __ stp(a2, a3, Address(state, 16)); 7474 __ stp(a4, a5, Address(state, 32)); 7475 __ stp(a6, a7, Address(state, 48)); 7476 __ stp(a8, a9, Address(state, 64)); 7477 __ stp(a10, a11, Address(state, 80)); 7478 __ stp(a12, a13, Address(state, 96)); 7479 __ stp(a14, a15, Address(state, 112)); 7480 __ stp(a16, a17, Address(state, 128)); 7481 __ stp(a18, a19, Address(state, 144)); 7482 __ stp(a20, a21, Address(state, 160)); 7483 __ stp(a22, a23, Address(state, 176)); 7484 __ str(a24, Address(state, 192)); 7485 7486 // restore required registers from stack 7487 __ ldp(r19, r20, Address(sp, 32)); 7488 __ ldp(r21, r22, Address(sp, 48)); 7489 __ ldp(r23, r24, Address(sp, 64)); 7490 __ ldp(r25, r26, Address(sp, 80)); 7491 __ ldp(r27, r28, Address(sp, 96)); 7492 if (can_use_fp && can_use_r18) { 7493 __ add(rfp, sp, 128); // leave() will copy rfp to sp below 7494 } // else no need to recalculate rfp, since it wasn't changed 7495 7496 __ leave(); 7497 7498 __ ret(lr); 7499 7500 return start; 7501 } 7502 7503 /** 7504 * Arguments: 7505 * 7506 * Inputs: 7507 * c_rarg0 - int crc 7508 * c_rarg1 - byte* buf 7509 * c_rarg2 - int length 7510 * 7511 * Output: 7512 * rax - int crc result 7513 */ 7514 address generate_updateBytesCRC32() { 7515 assert(UseCRC32Intrinsics, "what are we doing here?"); 7516 7517 __ align(CodeEntryAlignment); 7518 StubId stub_id = StubId::stubgen_updateBytesCRC32_id; 7519 StubCodeMark mark(this, stub_id); 7520 7521 address start = __ pc(); 7522 7523 const Register crc = c_rarg0; // crc 7524 const Register buf = c_rarg1; // source java byte array address 7525 const Register len = c_rarg2; // length 7526 const Register table0 = c_rarg3; // crc_table address 7527 const Register table1 = c_rarg4; 7528 const Register table2 = c_rarg5; 7529 const Register table3 = c_rarg6; 7530 const Register tmp3 = c_rarg7; 
7531 7532 BLOCK_COMMENT("Entry:"); 7533 __ enter(); // required for proper stackwalking of RuntimeStub frame 7534 7535 __ kernel_crc32(crc, buf, len, 7536 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7537 7538 __ leave(); // required for proper stackwalking of RuntimeStub frame 7539 __ ret(lr); 7540 7541 return start; 7542 } 7543 7544 /** 7545 * Arguments: 7546 * 7547 * Inputs: 7548 * c_rarg0 - int crc 7549 * c_rarg1 - byte* buf 7550 * c_rarg2 - int length 7551 * c_rarg3 - int* table 7552 * 7553 * Output: 7554 * r0 - int crc result 7555 */ 7556 address generate_updateBytesCRC32C() { 7557 assert(UseCRC32CIntrinsics, "what are we doing here?"); 7558 7559 __ align(CodeEntryAlignment); 7560 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id; 7561 StubCodeMark mark(this, stub_id); 7562 7563 address start = __ pc(); 7564 7565 const Register crc = c_rarg0; // crc 7566 const Register buf = c_rarg1; // source java byte array address 7567 const Register len = c_rarg2; // length 7568 const Register table0 = c_rarg3; // crc_table address 7569 const Register table1 = c_rarg4; 7570 const Register table2 = c_rarg5; 7571 const Register table3 = c_rarg6; 7572 const Register tmp3 = c_rarg7; 7573 7574 BLOCK_COMMENT("Entry:"); 7575 __ enter(); // required for proper stackwalking of RuntimeStub frame 7576 7577 __ kernel_crc32c(crc, buf, len, 7578 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 7579 7580 __ leave(); // required for proper stackwalking of RuntimeStub frame 7581 __ ret(lr); 7582 7583 return start; 7584 } 7585 7586 /*** 7587 * Arguments: 7588 * 7589 * Inputs: 7590 * c_rarg0 - int adler 7591 * c_rarg1 - byte* buff 7592 * c_rarg2 - int len 7593 * 7594 * Output: 7595 * c_rarg0 - int adler result 7596 */ 7597 address generate_updateBytesAdler32() { 7598 __ align(CodeEntryAlignment); 7599 StubId stub_id = StubId::stubgen_updateBytesAdler32_id; 7600 StubCodeMark mark(this, stub_id); 7601 address start = __ pc(); 7602 7603 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 7604 7605 // Aliases 7606 Register adler = c_rarg0; 7607 Register s1 = c_rarg0; 7608 Register s2 = c_rarg3; 7609 Register buff = c_rarg1; 7610 Register len = c_rarg2; 7611 Register nmax = r4; 7612 Register base = r5; 7613 Register count = r6; 7614 Register temp0 = rscratch1; 7615 Register temp1 = rscratch2; 7616 FloatRegister vbytes = v0; 7617 FloatRegister vs1acc = v1; 7618 FloatRegister vs2acc = v2; 7619 FloatRegister vtable = v3; 7620 7621 // Max number of bytes we can process before having to take the mod 7622 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 7623 uint64_t BASE = 0xfff1; 7624 uint64_t NMAX = 0x15B0; 7625 7626 __ mov(base, BASE); 7627 __ mov(nmax, NMAX); 7628 7629 // Load accumulation coefficients for the upper 16 bits 7630 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 7631 __ ld1(vtable, __ T16B, Address(temp0)); 7632 7633 // s1 is initialized to the lower 16 bits of adler 7634 // s2 is initialized to the upper 16 bits of adler 7635 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 7636 __ uxth(s1, adler); // s1 = (adler & 0xffff) 7637 7638 // The pipelined loop needs at least 16 elements for 1 iteration 7639 // It does check this, but it is more effective to skip to the cleanup loop 7640 __ cmp(len, (u1)16); 7641 __ br(Assembler::HS, L_nmax); 7642 __ cbz(len, L_combine); 7643 7644 __ bind(L_simple_by1_loop); 7645 __ ldrb(temp0, Address(__ 
post(buff, 1))); 7646 __ add(s1, s1, temp0); 7647 __ add(s2, s2, s1); 7648 __ subs(len, len, 1); 7649 __ br(Assembler::HI, L_simple_by1_loop); 7650 7651 // s1 = s1 % BASE 7652 __ subs(temp0, s1, base); 7653 __ csel(s1, temp0, s1, Assembler::HS); 7654 7655 // s2 = s2 % BASE 7656 __ lsr(temp0, s2, 16); 7657 __ lsl(temp1, temp0, 4); 7658 __ sub(temp1, temp1, temp0); 7659 __ add(s2, temp1, s2, ext::uxth); 7660 7661 __ subs(temp0, s2, base); 7662 __ csel(s2, temp0, s2, Assembler::HS); 7663 7664 __ b(L_combine); 7665 7666 __ bind(L_nmax); 7667 __ subs(len, len, nmax); 7668 __ sub(count, nmax, 16); 7669 __ br(Assembler::LO, L_by16); 7670 7671 __ bind(L_nmax_loop); 7672 7673 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7674 vbytes, vs1acc, vs2acc, vtable); 7675 7676 __ subs(count, count, 16); 7677 __ br(Assembler::HS, L_nmax_loop); 7678 7679 // s1 = s1 % BASE 7680 __ lsr(temp0, s1, 16); 7681 __ lsl(temp1, temp0, 4); 7682 __ sub(temp1, temp1, temp0); 7683 __ add(temp1, temp1, s1, ext::uxth); 7684 7685 __ lsr(temp0, temp1, 16); 7686 __ lsl(s1, temp0, 4); 7687 __ sub(s1, s1, temp0); 7688 __ add(s1, s1, temp1, ext:: uxth); 7689 7690 __ subs(temp0, s1, base); 7691 __ csel(s1, temp0, s1, Assembler::HS); 7692 7693 // s2 = s2 % BASE 7694 __ lsr(temp0, s2, 16); 7695 __ lsl(temp1, temp0, 4); 7696 __ sub(temp1, temp1, temp0); 7697 __ add(temp1, temp1, s2, ext::uxth); 7698 7699 __ lsr(temp0, temp1, 16); 7700 __ lsl(s2, temp0, 4); 7701 __ sub(s2, s2, temp0); 7702 __ add(s2, s2, temp1, ext:: uxth); 7703 7704 __ subs(temp0, s2, base); 7705 __ csel(s2, temp0, s2, Assembler::HS); 7706 7707 __ subs(len, len, nmax); 7708 __ sub(count, nmax, 16); 7709 __ br(Assembler::HS, L_nmax_loop); 7710 7711 __ bind(L_by16); 7712 __ adds(len, len, count); 7713 __ br(Assembler::LO, L_by1); 7714 7715 __ bind(L_by16_loop); 7716 7717 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 7718 vbytes, vs1acc, vs2acc, vtable); 7719 7720 __ subs(len, len, 16); 7721 __ br(Assembler::HS, L_by16_loop); 7722 7723 __ bind(L_by1); 7724 __ adds(len, len, 15); 7725 __ br(Assembler::LO, L_do_mod); 7726 7727 __ bind(L_by1_loop); 7728 __ ldrb(temp0, Address(__ post(buff, 1))); 7729 __ add(s1, temp0, s1); 7730 __ add(s2, s2, s1); 7731 __ subs(len, len, 1); 7732 __ br(Assembler::HS, L_by1_loop); 7733 7734 __ bind(L_do_mod); 7735 // s1 = s1 % BASE 7736 __ lsr(temp0, s1, 16); 7737 __ lsl(temp1, temp0, 4); 7738 __ sub(temp1, temp1, temp0); 7739 __ add(temp1, temp1, s1, ext::uxth); 7740 7741 __ lsr(temp0, temp1, 16); 7742 __ lsl(s1, temp0, 4); 7743 __ sub(s1, s1, temp0); 7744 __ add(s1, s1, temp1, ext:: uxth); 7745 7746 __ subs(temp0, s1, base); 7747 __ csel(s1, temp0, s1, Assembler::HS); 7748 7749 // s2 = s2 % BASE 7750 __ lsr(temp0, s2, 16); 7751 __ lsl(temp1, temp0, 4); 7752 __ sub(temp1, temp1, temp0); 7753 __ add(temp1, temp1, s2, ext::uxth); 7754 7755 __ lsr(temp0, temp1, 16); 7756 __ lsl(s2, temp0, 4); 7757 __ sub(s2, s2, temp0); 7758 __ add(s2, s2, temp1, ext:: uxth); 7759 7760 __ subs(temp0, s2, base); 7761 __ csel(s2, temp0, s2, Assembler::HS); 7762 7763 // Combine lower bits and higher bits 7764 __ bind(L_combine); 7765 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 7766 7767 __ ret(lr); 7768 7769 return start; 7770 } 7771 7772 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 7773 Register temp0, Register temp1, FloatRegister vbytes, 7774 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 7775 // Below is a vectorized implementation of updating s1 and 
s2 for 16 bytes. 7776 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 7777 // In non-vectorized code, we update s1 and s2 as: 7778 // s1 <- s1 + b1 7779 // s2 <- s2 + s1 7780 // s1 <- s1 + b2 7781 // s2 <- s2 + b1 7782 // ... 7783 // s1 <- s1 + b16 7784 // s2 <- s2 + s1 7785 // Putting above assignments together, we have: 7786 // s1_new = s1 + b1 + b2 + ... + b16 7787 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 7788 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 7789 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 7790 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 7791 7792 // s2 = s2 + s1 * 16 7793 __ add(s2, s2, s1, Assembler::LSL, 4); 7794 7795 // vs1acc = b1 + b2 + b3 + ... + b16 7796 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1) 7797 __ umullv(vs2acc, __ T8B, vtable, vbytes); 7798 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 7799 __ uaddlv(vs1acc, __ T16B, vbytes); 7800 __ uaddlv(vs2acc, __ T8H, vs2acc); 7801 7802 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 7803 __ fmovd(temp0, vs1acc); 7804 __ fmovd(temp1, vs2acc); 7805 __ add(s1, s1, temp0); 7806 __ add(s2, s2, temp1); 7807 } 7808 7809 /** 7810 * Arguments: 7811 * 7812 * Input: 7813 * c_rarg0 - x address 7814 * c_rarg1 - x length 7815 * c_rarg2 - y address 7816 * c_rarg3 - y length 7817 * c_rarg4 - z address 7818 */ 7819 address generate_multiplyToLen() { 7820 __ align(CodeEntryAlignment); 7821 StubId stub_id = StubId::stubgen_multiplyToLen_id; 7822 StubCodeMark mark(this, stub_id); 7823 7824 address start = __ pc(); 7825 const Register x = r0; 7826 const Register xlen = r1; 7827 const Register y = r2; 7828 const Register ylen = r3; 7829 const Register z = r4; 7830 7831 const Register tmp0 = r5; 7832 const Register tmp1 = r10; 7833 const Register tmp2 = r11; 7834 const Register tmp3 = r12; 7835 const Register tmp4 = r13; 7836 const Register tmp5 = r14; 7837 const Register tmp6 = r15; 7838 const Register tmp7 = r16; 7839 7840 BLOCK_COMMENT("Entry:"); 7841 __ enter(); // required for proper stackwalking of RuntimeStub frame 7842 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7843 __ leave(); // required for proper stackwalking of RuntimeStub frame 7844 __ ret(lr); 7845 7846 return start; 7847 } 7848 7849 address generate_squareToLen() { 7850 // squareToLen algorithm for sizes 1..127 described in java code works 7851 // faster than multiply_to_len on some CPUs and slower on others, but 7852 // multiply_to_len shows a bit better overall results 7853 __ align(CodeEntryAlignment); 7854 StubId stub_id = StubId::stubgen_squareToLen_id; 7855 StubCodeMark mark(this, stub_id); 7856 address start = __ pc(); 7857 7858 const Register x = r0; 7859 const Register xlen = r1; 7860 const Register z = r2; 7861 const Register y = r4; // == x 7862 const Register ylen = r5; // == xlen 7863 7864 const Register tmp0 = r3; 7865 const Register tmp1 = r10; 7866 const Register tmp2 = r11; 7867 const Register tmp3 = r12; 7868 const Register tmp4 = r13; 7869 const Register tmp5 = r14; 7870 const Register tmp6 = r15; 7871 const Register tmp7 = r16; 7872 7873 RegSet spilled_regs = RegSet::of(y, ylen); 7874 BLOCK_COMMENT("Entry:"); 7875 __ enter(); 7876 __ push(spilled_regs, sp); 7877 __ mov(y, x); 7878 __ mov(ylen, xlen); 7879 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 7880 __ pop(spilled_regs, sp); 7881 __ leave(); 7882 __ ret(lr); 7883 return start; 7884 } 7885 7886 address 
generate_mulAdd() { 7887 __ align(CodeEntryAlignment); 7888 StubId stub_id = StubId::stubgen_mulAdd_id; 7889 StubCodeMark mark(this, stub_id); 7890 7891 address start = __ pc(); 7892 7893 const Register out = r0; 7894 const Register in = r1; 7895 const Register offset = r2; 7896 const Register len = r3; 7897 const Register k = r4; 7898 7899 BLOCK_COMMENT("Entry:"); 7900 __ enter(); 7901 __ mul_add(out, in, offset, len, k); 7902 __ leave(); 7903 __ ret(lr); 7904 7905 return start; 7906 } 7907 7908 // Arguments: 7909 // 7910 // Input: 7911 // c_rarg0 - newArr address 7912 // c_rarg1 - oldArr address 7913 // c_rarg2 - newIdx 7914 // c_rarg3 - shiftCount 7915 // c_rarg4 - numIter 7916 // 7917 address generate_bigIntegerRightShift() { 7918 __ align(CodeEntryAlignment); 7919 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id; 7920 StubCodeMark mark(this, stub_id); 7921 address start = __ pc(); 7922 7923 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 7924 7925 Register newArr = c_rarg0; 7926 Register oldArr = c_rarg1; 7927 Register newIdx = c_rarg2; 7928 Register shiftCount = c_rarg3; 7929 Register numIter = c_rarg4; 7930 Register idx = numIter; 7931 7932 Register newArrCur = rscratch1; 7933 Register shiftRevCount = rscratch2; 7934 Register oldArrCur = r13; 7935 Register oldArrNext = r14; 7936 7937 FloatRegister oldElem0 = v0; 7938 FloatRegister oldElem1 = v1; 7939 FloatRegister newElem = v2; 7940 FloatRegister shiftVCount = v3; 7941 FloatRegister shiftVRevCount = v4; 7942 7943 __ cbz(idx, Exit); 7944 7945 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 7946 7947 // left shift count 7948 __ movw(shiftRevCount, 32); 7949 __ subw(shiftRevCount, shiftRevCount, shiftCount); 7950 7951 // numIter too small to allow a 4-words SIMD loop, rolling back 7952 __ cmp(numIter, (u1)4); 7953 __ br(Assembler::LT, ShiftThree); 7954 7955 __ dup(shiftVCount, __ T4S, shiftCount); 7956 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 7957 __ negr(shiftVCount, __ T4S, shiftVCount); 7958 7959 __ BIND(ShiftSIMDLoop); 7960 7961 // Calculate the load addresses 7962 __ sub(idx, idx, 4); 7963 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7964 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7965 __ add(oldArrCur, oldArrNext, 4); 7966 7967 // Load 4 words and process 7968 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 7969 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 7970 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 7971 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 7972 __ orr(newElem, __ T16B, oldElem0, oldElem1); 7973 __ st1(newElem, __ T4S, Address(newArrCur)); 7974 7975 __ cmp(idx, (u1)4); 7976 __ br(Assembler::LT, ShiftTwoLoop); 7977 __ b(ShiftSIMDLoop); 7978 7979 __ BIND(ShiftTwoLoop); 7980 __ cbz(idx, Exit); 7981 __ cmp(idx, (u1)1); 7982 __ br(Assembler::EQ, ShiftOne); 7983 7984 // Calculate the load addresses 7985 __ sub(idx, idx, 2); 7986 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 7987 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 7988 __ add(oldArrCur, oldArrNext, 4); 7989 7990 // Load 2 words and process 7991 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 7992 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 7993 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 7994 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 7995 __ orr(newElem, __ T8B, oldElem0, oldElem1); 7996 __ st1(newElem, __ T2S, Address(newArrCur)); 7997 __ b(ShiftTwoLoop); 7998 7999 __ BIND(ShiftThree); 8000 __ tbz(idx, 1, ShiftOne); 8001 __ tbz(idx, 0, ShiftTwo); 8002 __ 
ldrw(r10, Address(oldArr, 12)); 8003 __ ldrw(r11, Address(oldArr, 8)); 8004 __ lsrvw(r10, r10, shiftCount); 8005 __ lslvw(r11, r11, shiftRevCount); 8006 __ orrw(r12, r10, r11); 8007 __ strw(r12, Address(newArr, 8)); 8008 8009 __ BIND(ShiftTwo); 8010 __ ldrw(r10, Address(oldArr, 8)); 8011 __ ldrw(r11, Address(oldArr, 4)); 8012 __ lsrvw(r10, r10, shiftCount); 8013 __ lslvw(r11, r11, shiftRevCount); 8014 __ orrw(r12, r10, r11); 8015 __ strw(r12, Address(newArr, 4)); 8016 8017 __ BIND(ShiftOne); 8018 __ ldrw(r10, Address(oldArr, 4)); 8019 __ ldrw(r11, Address(oldArr)); 8020 __ lsrvw(r10, r10, shiftCount); 8021 __ lslvw(r11, r11, shiftRevCount); 8022 __ orrw(r12, r10, r11); 8023 __ strw(r12, Address(newArr)); 8024 8025 __ BIND(Exit); 8026 __ ret(lr); 8027 8028 return start; 8029 } 8030 8031 // Arguments: 8032 // 8033 // Input: 8034 // c_rarg0 - newArr address 8035 // c_rarg1 - oldArr address 8036 // c_rarg2 - newIdx 8037 // c_rarg3 - shiftCount 8038 // c_rarg4 - numIter 8039 // 8040 address generate_bigIntegerLeftShift() { 8041 __ align(CodeEntryAlignment); 8042 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id; 8043 StubCodeMark mark(this, stub_id); 8044 address start = __ pc(); 8045 8046 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 8047 8048 Register newArr = c_rarg0; 8049 Register oldArr = c_rarg1; 8050 Register newIdx = c_rarg2; 8051 Register shiftCount = c_rarg3; 8052 Register numIter = c_rarg4; 8053 8054 Register shiftRevCount = rscratch1; 8055 Register oldArrNext = rscratch2; 8056 8057 FloatRegister oldElem0 = v0; 8058 FloatRegister oldElem1 = v1; 8059 FloatRegister newElem = v2; 8060 FloatRegister shiftVCount = v3; 8061 FloatRegister shiftVRevCount = v4; 8062 8063 __ cbz(numIter, Exit); 8064 8065 __ add(oldArrNext, oldArr, 4); 8066 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 8067 8068 // right shift count 8069 __ movw(shiftRevCount, 32); 8070 __ subw(shiftRevCount, shiftRevCount, shiftCount); 8071 8072 // numIter too small to allow a 4-words SIMD loop, rolling back 8073 __ cmp(numIter, (u1)4); 8074 __ br(Assembler::LT, ShiftThree); 8075 8076 __ dup(shiftVCount, __ T4S, shiftCount); 8077 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 8078 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 8079 8080 __ BIND(ShiftSIMDLoop); 8081 8082 // load 4 words and process 8083 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 8084 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 8085 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 8086 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 8087 __ orr(newElem, __ T16B, oldElem0, oldElem1); 8088 __ st1(newElem, __ T4S, __ post(newArr, 16)); 8089 __ sub(numIter, numIter, 4); 8090 8091 __ cmp(numIter, (u1)4); 8092 __ br(Assembler::LT, ShiftTwoLoop); 8093 __ b(ShiftSIMDLoop); 8094 8095 __ BIND(ShiftTwoLoop); 8096 __ cbz(numIter, Exit); 8097 __ cmp(numIter, (u1)1); 8098 __ br(Assembler::EQ, ShiftOne); 8099 8100 // load 2 words and process 8101 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 8102 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 8103 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 8104 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 8105 __ orr(newElem, __ T8B, oldElem0, oldElem1); 8106 __ st1(newElem, __ T2S, __ post(newArr, 8)); 8107 __ sub(numIter, numIter, 2); 8108 __ b(ShiftTwoLoop); 8109 8110 __ BIND(ShiftThree); 8111 __ ldrw(r10, __ post(oldArr, 4)); 8112 __ ldrw(r11, __ post(oldArrNext, 4)); 8113 __ lslvw(r10, r10, shiftCount); 8114 __ lsrvw(r11, r11, shiftRevCount); 8115 __ 
orrw(r12, r10, r11); 8116 __ strw(r12, __ post(newArr, 4)); 8117 __ tbz(numIter, 1, Exit); 8118 __ tbz(numIter, 0, ShiftOne); 8119 8120 __ BIND(ShiftTwo); 8121 __ ldrw(r10, __ post(oldArr, 4)); 8122 __ ldrw(r11, __ post(oldArrNext, 4)); 8123 __ lslvw(r10, r10, shiftCount); 8124 __ lsrvw(r11, r11, shiftRevCount); 8125 __ orrw(r12, r10, r11); 8126 __ strw(r12, __ post(newArr, 4)); 8127 8128 __ BIND(ShiftOne); 8129 __ ldrw(r10, Address(oldArr)); 8130 __ ldrw(r11, Address(oldArrNext)); 8131 __ lslvw(r10, r10, shiftCount); 8132 __ lsrvw(r11, r11, shiftRevCount); 8133 __ orrw(r12, r10, r11); 8134 __ strw(r12, Address(newArr)); 8135 8136 __ BIND(Exit); 8137 __ ret(lr); 8138 8139 return start; 8140 } 8141 8142 address generate_count_positives(address &count_positives_long) { 8143 const u1 large_loop_size = 64; 8144 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 8145 int dcache_line = VM_Version::dcache_line_size(); 8146 8147 Register ary1 = r1, len = r2, result = r0; 8148 8149 __ align(CodeEntryAlignment); 8150 8151 StubId stub_id = StubId::stubgen_count_positives_id; 8152 StubCodeMark mark(this, stub_id); 8153 8154 address entry = __ pc(); 8155 8156 __ enter(); 8157 // precondition: a copy of len is already in result 8158 // __ mov(result, len); 8159 8160 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 8161 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 8162 8163 __ cmp(len, (u1)15); 8164 __ br(Assembler::GT, LEN_OVER_15); 8165 // The only case when execution falls into this code is when pointer is near 8166 // the end of memory page and we have to avoid reading next page 8167 __ add(ary1, ary1, len); 8168 __ subs(len, len, 8); 8169 __ br(Assembler::GT, LEN_OVER_8); 8170 __ ldr(rscratch2, Address(ary1, -8)); 8171 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
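    // Note on the shift that follows: the ldr above read the 8 bytes ending exactly at the
    // array limit, so when the original len is less than 8 the low-order (lowest-addressed)
    // bytes of rscratch2 lie before the array and must be discarded. len currently holds
    // len - 8, so rscratch1 = -(len - 8) * 8 = (8 - len) * 8 is the required right-shift in
    // bits. A scalar sketch of the same check (illustrative only; 'limit' and 'n' are
    // hypothetical names, not part of this stub):
    //   static bool tail_has_negative(const uint8_t* limit, size_t n) {  // n = original len, n <= 8
    //     uint64_t v;
    //     memcpy(&v, limit - 8, 8);           // the same 8-byte read as the ldr above
    //     v >>= (8 - n) * 8;                  // drop the bytes that precede the array
    //     return (v & UPPER_BIT_MASK) != 0;   // a set sign bit means a negative byte
    //   }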
8172 __ lsrv(rscratch2, rscratch2, rscratch1); 8173 __ tst(rscratch2, UPPER_BIT_MASK); 8174 __ csel(result, zr, result, Assembler::NE); 8175 __ leave(); 8176 __ ret(lr); 8177 __ bind(LEN_OVER_8); 8178 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 8179 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 8180 __ tst(rscratch2, UPPER_BIT_MASK); 8181 __ br(Assembler::NE, RET_NO_POP); 8182 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 8183 __ lsrv(rscratch1, rscratch1, rscratch2); 8184 __ tst(rscratch1, UPPER_BIT_MASK); 8185 __ bind(RET_NO_POP); 8186 __ csel(result, zr, result, Assembler::NE); 8187 __ leave(); 8188 __ ret(lr); 8189 8190 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 8191 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 8192 8193 count_positives_long = __ pc(); // 2nd entry point 8194 8195 __ enter(); 8196 8197 __ bind(LEN_OVER_15); 8198 __ push(spilled_regs, sp); 8199 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 8200 __ cbz(rscratch2, ALIGNED); 8201 __ ldp(tmp6, tmp1, Address(ary1)); 8202 __ mov(tmp5, 16); 8203 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 8204 __ add(ary1, ary1, rscratch1); 8205 __ orr(tmp6, tmp6, tmp1); 8206 __ tst(tmp6, UPPER_BIT_MASK); 8207 __ br(Assembler::NE, RET_ADJUST); 8208 __ sub(len, len, rscratch1); 8209 8210 __ bind(ALIGNED); 8211 __ cmp(len, large_loop_size); 8212 __ br(Assembler::LT, CHECK_16); 8213 // Perform 16-byte load as early return in pre-loop to handle situation 8214 // when initially aligned large array has negative values at starting bytes, 8215 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 8216 // slower. Cases with negative bytes further ahead won't be affected that 8217 // much. In fact, it'll be faster due to early loads, less instructions and 8218 // less branches in LARGE_LOOP. 8219 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 8220 __ sub(len, len, 16); 8221 __ orr(tmp6, tmp6, tmp1); 8222 __ tst(tmp6, UPPER_BIT_MASK); 8223 __ br(Assembler::NE, RET_ADJUST_16); 8224 __ cmp(len, large_loop_size); 8225 __ br(Assembler::LT, CHECK_16); 8226 8227 if (SoftwarePrefetchHintDistance >= 0 8228 && SoftwarePrefetchHintDistance >= dcache_line) { 8229 // initial prefetch 8230 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 8231 } 8232 __ bind(LARGE_LOOP); 8233 if (SoftwarePrefetchHintDistance >= 0) { 8234 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 8235 } 8236 // Issue load instructions first, since it can save few CPU/MEM cycles, also 8237 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 8238 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 8239 // instructions per cycle and have less branches, but this approach disables 8240 // early return, thus, all 64 bytes are loaded and checked every time. 
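    // As an informal scalar sketch, one LARGE_LOOP iteration below reduces the eight loaded
    // words w0..w7 with seven ORs and a single test:
    //   uint64_t acc = ((w0 | w1) | (w2 | w3)) | ((w4 | w5) | (w6 | w7));
    //   if (acc & UPPER_BIT_MASK) goto RET_ADJUST_LONG;   // some byte has its sign bit set
    // which corresponds to the ldp/orr/tst/br sequence emitted next.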
8241 __ ldp(tmp2, tmp3, Address(ary1)); 8242 __ ldp(tmp4, tmp5, Address(ary1, 16)); 8243 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 8244 __ ldp(tmp6, tmp1, Address(ary1, 48)); 8245 __ add(ary1, ary1, large_loop_size); 8246 __ sub(len, len, large_loop_size); 8247 __ orr(tmp2, tmp2, tmp3); 8248 __ orr(tmp4, tmp4, tmp5); 8249 __ orr(rscratch1, rscratch1, rscratch2); 8250 __ orr(tmp6, tmp6, tmp1); 8251 __ orr(tmp2, tmp2, tmp4); 8252 __ orr(rscratch1, rscratch1, tmp6); 8253 __ orr(tmp2, tmp2, rscratch1); 8254 __ tst(tmp2, UPPER_BIT_MASK); 8255 __ br(Assembler::NE, RET_ADJUST_LONG); 8256 __ cmp(len, large_loop_size); 8257 __ br(Assembler::GE, LARGE_LOOP); 8258 8259 __ bind(CHECK_16); // small 16-byte load pre-loop 8260 __ cmp(len, (u1)16); 8261 __ br(Assembler::LT, POST_LOOP16); 8262 8263 __ bind(LOOP16); // small 16-byte load loop 8264 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 8265 __ sub(len, len, 16); 8266 __ orr(tmp2, tmp2, tmp3); 8267 __ tst(tmp2, UPPER_BIT_MASK); 8268 __ br(Assembler::NE, RET_ADJUST_16); 8269 __ cmp(len, (u1)16); 8270 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 8271 8272 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 8273 __ cmp(len, (u1)8); 8274 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 8275 __ ldr(tmp3, Address(__ post(ary1, 8))); 8276 __ tst(tmp3, UPPER_BIT_MASK); 8277 __ br(Assembler::NE, RET_ADJUST); 8278 __ sub(len, len, 8); 8279 8280 __ bind(POST_LOOP16_LOAD_TAIL); 8281 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 8282 __ ldr(tmp1, Address(ary1)); 8283 __ mov(tmp2, 64); 8284 __ sub(tmp4, tmp2, len, __ LSL, 3); 8285 __ lslv(tmp1, tmp1, tmp4); 8286 __ tst(tmp1, UPPER_BIT_MASK); 8287 __ br(Assembler::NE, RET_ADJUST); 8288 // Fallthrough 8289 8290 __ bind(RET_LEN); 8291 __ pop(spilled_regs, sp); 8292 __ leave(); 8293 __ ret(lr); 8294 8295 // difference result - len is the count of guaranteed to be 8296 // positive bytes 8297 8298 __ bind(RET_ADJUST_LONG); 8299 __ add(len, len, (u1)(large_loop_size - 16)); 8300 __ bind(RET_ADJUST_16); 8301 __ add(len, len, 16); 8302 __ bind(RET_ADJUST); 8303 __ pop(spilled_regs, sp); 8304 __ leave(); 8305 __ sub(result, result, len); 8306 __ ret(lr); 8307 8308 return entry; 8309 } 8310 8311 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 8312 bool usePrefetch, Label &NOT_EQUAL) { 8313 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8314 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 8315 tmp7 = r12, tmp8 = r13; 8316 Label LOOP; 8317 8318 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8319 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8320 __ bind(LOOP); 8321 if (usePrefetch) { 8322 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 8323 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 8324 } 8325 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 8326 __ eor(tmp1, tmp1, tmp2); 8327 __ eor(tmp3, tmp3, tmp4); 8328 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 8329 __ orr(tmp1, tmp1, tmp3); 8330 __ cbnz(tmp1, NOT_EQUAL); 8331 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8332 __ eor(tmp5, tmp5, tmp6); 8333 __ eor(tmp7, tmp7, tmp8); 8334 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8335 __ orr(tmp5, tmp5, tmp7); 8336 __ cbnz(tmp5, NOT_EQUAL); 8337 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 8338 __ eor(tmp1, tmp1, tmp2); 8339 __ eor(tmp3, tmp3, tmp4); 8340 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 8341 __ orr(tmp1, tmp1, tmp3); 8342 __ 
cbnz(tmp1, NOT_EQUAL); 8343 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 8344 __ eor(tmp5, tmp5, tmp6); 8345 __ sub(cnt1, cnt1, 8 * wordSize); 8346 __ eor(tmp7, tmp7, tmp8); 8347 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 8348 // tmp6 is not used. MacroAssembler::subs is used here (rather than 8349 // cmp) because subs allows an unlimited range of immediate operand. 8350 __ subs(tmp6, cnt1, loopThreshold); 8351 __ orr(tmp5, tmp5, tmp7); 8352 __ cbnz(tmp5, NOT_EQUAL); 8353 __ br(__ GE, LOOP); 8354 // post-loop 8355 __ eor(tmp1, tmp1, tmp2); 8356 __ eor(tmp3, tmp3, tmp4); 8357 __ orr(tmp1, tmp1, tmp3); 8358 __ sub(cnt1, cnt1, 2 * wordSize); 8359 __ cbnz(tmp1, NOT_EQUAL); 8360 } 8361 8362 void generate_large_array_equals_loop_simd(int loopThreshold, 8363 bool usePrefetch, Label &NOT_EQUAL) { 8364 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8365 tmp2 = rscratch2; 8366 Label LOOP; 8367 8368 __ bind(LOOP); 8369 if (usePrefetch) { 8370 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 8371 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 8372 } 8373 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 8374 __ sub(cnt1, cnt1, 8 * wordSize); 8375 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 8376 __ subs(tmp1, cnt1, loopThreshold); 8377 __ eor(v0, __ T16B, v0, v4); 8378 __ eor(v1, __ T16B, v1, v5); 8379 __ eor(v2, __ T16B, v2, v6); 8380 __ eor(v3, __ T16B, v3, v7); 8381 __ orr(v0, __ T16B, v0, v1); 8382 __ orr(v1, __ T16B, v2, v3); 8383 __ orr(v0, __ T16B, v0, v1); 8384 __ umov(tmp1, v0, __ D, 0); 8385 __ umov(tmp2, v0, __ D, 1); 8386 __ orr(tmp1, tmp1, tmp2); 8387 __ cbnz(tmp1, NOT_EQUAL); 8388 __ br(__ GE, LOOP); 8389 } 8390 8391 // a1 = r1 - array1 address 8392 // a2 = r2 - array2 address 8393 // result = r0 - return value. Already contains "false" 8394 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 8395 // r3-r5 are reserved temporary registers 8396 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 8397 address generate_large_array_equals() { 8398 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 8399 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 8400 tmp7 = r12, tmp8 = r13; 8401 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 8402 SMALL_LOOP, POST_LOOP; 8403 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 8404 // calculate if at least 32 prefetched bytes are used 8405 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 8406 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 8407 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 8408 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 8409 tmp5, tmp6, tmp7, tmp8); 8410 8411 __ align(CodeEntryAlignment); 8412 8413 StubId stub_id = StubId::stubgen_large_array_equals_id; 8414 StubCodeMark mark(this, stub_id); 8415 8416 address entry = __ pc(); 8417 __ enter(); 8418 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 8419 // also advance pointers to use post-increment instead of pre-increment 8420 __ add(a1, a1, wordSize); 8421 __ add(a2, a2, wordSize); 8422 if (AvoidUnalignedAccesses) { 8423 // both implementations (SIMD/nonSIMD) are using relatively large load 8424 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 8425 // on some CPUs in case of address is not at least 16-byte aligned. 
8426 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 8427 // load if needed at least for 1st address and make if 16-byte aligned. 8428 Label ALIGNED16; 8429 __ tbz(a1, 3, ALIGNED16); 8430 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8431 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8432 __ sub(cnt1, cnt1, wordSize); 8433 __ eor(tmp1, tmp1, tmp2); 8434 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 8435 __ bind(ALIGNED16); 8436 } 8437 if (UseSIMDForArrayEquals) { 8438 if (SoftwarePrefetchHintDistance >= 0) { 8439 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8440 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8441 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 8442 /* prfm = */ true, NOT_EQUAL); 8443 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8444 __ br(__ LT, TAIL); 8445 } 8446 __ bind(NO_PREFETCH_LARGE_LOOP); 8447 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 8448 /* prfm = */ false, NOT_EQUAL); 8449 } else { 8450 __ push(spilled_regs, sp); 8451 if (SoftwarePrefetchHintDistance >= 0) { 8452 __ subs(tmp1, cnt1, prefetchLoopThreshold); 8453 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 8454 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 8455 /* prfm = */ true, NOT_EQUAL); 8456 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 8457 __ br(__ LT, TAIL); 8458 } 8459 __ bind(NO_PREFETCH_LARGE_LOOP); 8460 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 8461 /* prfm = */ false, NOT_EQUAL); 8462 } 8463 __ bind(TAIL); 8464 __ cbz(cnt1, EQUAL); 8465 __ subs(cnt1, cnt1, wordSize); 8466 __ br(__ LE, POST_LOOP); 8467 __ bind(SMALL_LOOP); 8468 __ ldr(tmp1, Address(__ post(a1, wordSize))); 8469 __ ldr(tmp2, Address(__ post(a2, wordSize))); 8470 __ subs(cnt1, cnt1, wordSize); 8471 __ eor(tmp1, tmp1, tmp2); 8472 __ cbnz(tmp1, NOT_EQUAL); 8473 __ br(__ GT, SMALL_LOOP); 8474 __ bind(POST_LOOP); 8475 __ ldr(tmp1, Address(a1, cnt1)); 8476 __ ldr(tmp2, Address(a2, cnt1)); 8477 __ eor(tmp1, tmp1, tmp2); 8478 __ cbnz(tmp1, NOT_EQUAL); 8479 __ bind(EQUAL); 8480 __ mov(result, true); 8481 __ bind(NOT_EQUAL); 8482 if (!UseSIMDForArrayEquals) { 8483 __ pop(spilled_regs, sp); 8484 } 8485 __ bind(NOT_EQUAL_NO_POP); 8486 __ leave(); 8487 __ ret(lr); 8488 return entry; 8489 } 8490 8491 // result = r0 - return value. Contains initial hashcode value on entry. 
8492 // ary = r1 - array address 8493 // cnt = r2 - elements count 8494 // Clobbers: v0-v13, rscratch1, rscratch2 8495 address generate_large_arrays_hashcode(BasicType eltype) { 8496 const Register result = r0, ary = r1, cnt = r2; 8497 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 8498 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 8499 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 8500 const FloatRegister vpowm = v13; 8501 8502 ARRAYS_HASHCODE_REGISTERS; 8503 8504 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 8505 8506 unsigned int vf; // vectorization factor 8507 bool multiply_by_halves; 8508 Assembler::SIMD_Arrangement load_arrangement; 8509 switch (eltype) { 8510 case T_BOOLEAN: 8511 case T_BYTE: 8512 load_arrangement = Assembler::T8B; 8513 multiply_by_halves = true; 8514 vf = 8; 8515 break; 8516 case T_CHAR: 8517 case T_SHORT: 8518 load_arrangement = Assembler::T8H; 8519 multiply_by_halves = true; 8520 vf = 8; 8521 break; 8522 case T_INT: 8523 load_arrangement = Assembler::T4S; 8524 multiply_by_halves = false; 8525 vf = 4; 8526 break; 8527 default: 8528 ShouldNotReachHere(); 8529 } 8530 8531 // Unroll factor 8532 const unsigned uf = 4; 8533 8534 // Effective vectorization factor 8535 const unsigned evf = vf * uf; 8536 8537 __ align(CodeEntryAlignment); 8538 8539 StubId stub_id; 8540 switch (eltype) { 8541 case T_BOOLEAN: 8542 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id; 8543 break; 8544 case T_BYTE: 8545 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id; 8546 break; 8547 case T_CHAR: 8548 stub_id = StubId::stubgen_large_arrays_hashcode_char_id; 8549 break; 8550 case T_SHORT: 8551 stub_id = StubId::stubgen_large_arrays_hashcode_short_id; 8552 break; 8553 case T_INT: 8554 stub_id = StubId::stubgen_large_arrays_hashcode_int_id; 8555 break; 8556 default: 8557 stub_id = StubId::NO_STUBID; 8558 ShouldNotReachHere(); 8559 }; 8560 8561 StubCodeMark mark(this, stub_id); 8562 8563 address entry = __ pc(); 8564 __ enter(); 8565 8566 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 8567 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 8568 // value shouldn't change throughout both loops. 8569 __ movw(rscratch1, intpow(31U, 3)); 8570 __ mov(vpow, Assembler::S, 0, rscratch1); 8571 __ movw(rscratch1, intpow(31U, 2)); 8572 __ mov(vpow, Assembler::S, 1, rscratch1); 8573 __ movw(rscratch1, intpow(31U, 1)); 8574 __ mov(vpow, Assembler::S, 2, rscratch1); 8575 __ movw(rscratch1, intpow(31U, 0)); 8576 __ mov(vpow, Assembler::S, 3, rscratch1); 8577 8578 __ mov(vmul0, Assembler::T16B, 0); 8579 __ mov(vmul0, Assembler::S, 3, result); 8580 8581 __ andr(rscratch2, cnt, (uf - 1) * vf); 8582 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 8583 8584 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf)); 8585 __ mov(vpowm, Assembler::S, 0, rscratch1); 8586 8587 // SMALL LOOP 8588 __ bind(SMALL_LOOP); 8589 8590 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype)))); 8591 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8592 __ subsw(rscratch2, rscratch2, vf); 8593 8594 if (load_arrangement == Assembler::T8B) { 8595 // Extend 8B to 8H to be able to use vector multiply 8596 // instructions 8597 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8598 if (is_signed_subword_type(eltype)) { 8599 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8600 } else { 8601 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8602 } 8603 } 8604 8605 switch (load_arrangement) { 8606 case Assembler::T4S: 8607 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8608 break; 8609 case Assembler::T8B: 8610 case Assembler::T8H: 8611 assert(is_subword_type(eltype), "subword type expected"); 8612 if (is_signed_subword_type(eltype)) { 8613 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8614 } else { 8615 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8616 } 8617 break; 8618 default: 8619 __ should_not_reach_here(); 8620 } 8621 8622 // Process the upper half of a vector 8623 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8624 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8625 if (is_signed_subword_type(eltype)) { 8626 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8627 } else { 8628 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8629 } 8630 } 8631 8632 __ br(Assembler::HI, SMALL_LOOP); 8633 8634 // SMALL LOOP'S EPILOQUE 8635 __ lsr(rscratch2, cnt, exact_log2(evf)); 8636 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER); 8637 8638 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8639 __ addv(vmul0, Assembler::T4S, vmul0); 8640 __ umov(result, vmul0, Assembler::S, 0); 8641 8642 // TAIL 8643 __ bind(TAIL); 8644 8645 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs 8646 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs. 8647 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 8648 __ andr(rscratch2, cnt, vf - 1); 8649 __ bind(TAIL_SHORTCUT); 8650 __ adr(rscratch1, BR_BASE); 8651 // For Cortex-A53 offset is 4 because 2 nops are generated. 8652 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3); 8653 __ movw(rscratch2, 0x1f); 8654 __ br(rscratch1); 8655 8656 for (size_t i = 0; i < vf - 1; ++i) { 8657 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 8658 eltype); 8659 __ maddw(result, result, rscratch2, rscratch1); 8660 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler). 8661 // Generate 2nd nop to have 4 instructions per iteration. 
8662 if (VM_Version::supports_a53mac()) { 8663 __ nop(); 8664 } 8665 } 8666 __ bind(BR_BASE); 8667 8668 __ leave(); 8669 __ ret(lr); 8670 8671 // LARGE LOOP 8672 __ bind(LARGE_LOOP_PREHEADER); 8673 8674 __ lsr(rscratch2, cnt, exact_log2(evf)); 8675 8676 if (multiply_by_halves) { 8677 // 31^4 - multiplier between lower and upper parts of a register 8678 __ movw(rscratch1, intpow(31U, vf / 2)); 8679 __ mov(vpowm, Assembler::S, 1, rscratch1); 8680 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 8681 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 8682 __ mov(vpowm, Assembler::S, 0, rscratch1); 8683 } else { 8684 // 31^16 8685 __ movw(rscratch1, intpow(31U, evf)); 8686 __ mov(vpowm, Assembler::S, 0, rscratch1); 8687 } 8688 8689 __ mov(vmul3, Assembler::T16B, 0); 8690 __ mov(vmul2, Assembler::T16B, 0); 8691 __ mov(vmul1, Assembler::T16B, 0); 8692 8693 __ bind(LARGE_LOOP); 8694 8695 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 8696 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 8697 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 8698 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 8699 8700 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 8701 Address(__ post(ary, evf * type2aelembytes(eltype)))); 8702 8703 if (load_arrangement == Assembler::T8B) { 8704 // Extend 8B to 8H to be able to use vector multiply 8705 // instructions 8706 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 8707 if (is_signed_subword_type(eltype)) { 8708 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8709 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8710 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8711 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8712 } else { 8713 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 8714 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 8715 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 8716 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 8717 } 8718 } 8719 8720 switch (load_arrangement) { 8721 case Assembler::T4S: 8722 __ addv(vmul3, load_arrangement, vmul3, vdata3); 8723 __ addv(vmul2, load_arrangement, vmul2, vdata2); 8724 __ addv(vmul1, load_arrangement, vmul1, vdata1); 8725 __ addv(vmul0, load_arrangement, vmul0, vdata0); 8726 break; 8727 case Assembler::T8B: 8728 case Assembler::T8H: 8729 assert(is_subword_type(eltype), "subword type expected"); 8730 if (is_signed_subword_type(eltype)) { 8731 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8732 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8733 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8734 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8735 } else { 8736 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 8737 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 8738 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 8739 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 8740 } 8741 break; 8742 default: 8743 __ should_not_reach_here(); 8744 } 8745 8746 // Process the upper half of a vector 8747 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 8748 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 8749 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 8750 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 8751 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 8752 if (is_signed_subword_type(eltype)) { 8753 __ saddwv2(vmul3, 
vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8754 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8755 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8756 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8757 } else { 8758 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 8759 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 8760 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 8761 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 8762 } 8763 } 8764 8765 __ subsw(rscratch2, rscratch2, 1); 8766 __ br(Assembler::HI, LARGE_LOOP); 8767 8768 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 8769 __ addv(vmul3, Assembler::T4S, vmul3); 8770 __ umov(result, vmul3, Assembler::S, 0); 8771 8772 __ mov(rscratch2, intpow(31U, vf)); 8773 8774 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 8775 __ addv(vmul2, Assembler::T4S, vmul2); 8776 __ umov(rscratch1, vmul2, Assembler::S, 0); 8777 __ maddw(result, result, rscratch2, rscratch1); 8778 8779 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 8780 __ addv(vmul1, Assembler::T4S, vmul1); 8781 __ umov(rscratch1, vmul1, Assembler::S, 0); 8782 __ maddw(result, result, rscratch2, rscratch1); 8783 8784 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 8785 __ addv(vmul0, Assembler::T4S, vmul0); 8786 __ umov(rscratch1, vmul0, Assembler::S, 0); 8787 __ maddw(result, result, rscratch2, rscratch1); 8788 8789 __ andr(rscratch2, cnt, vf - 1); 8790 __ cbnz(rscratch2, TAIL_SHORTCUT); 8791 8792 __ leave(); 8793 __ ret(lr); 8794 8795 return entry; 8796 } 8797 8798 address generate_dsin_dcos(bool isCos) { 8799 __ align(CodeEntryAlignment); 8800 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id); 8801 StubCodeMark mark(this, stub_id); 8802 address start = __ pc(); 8803 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 8804 (address)StubRoutines::aarch64::_two_over_pi, 8805 (address)StubRoutines::aarch64::_pio2, 8806 (address)StubRoutines::aarch64::_dsin_coef, 8807 (address)StubRoutines::aarch64::_dcos_coef); 8808 return start; 8809 } 8810 8811 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 8812 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 8813 Label &DIFF2) { 8814 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 8815 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 8816 8817 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 8818 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8819 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 8820 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 8821 8822 __ fmovd(tmpL, vtmp3); 8823 __ eor(rscratch2, tmp3, tmpL); 8824 __ cbnz(rscratch2, DIFF2); 8825 8826 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8827 __ umov(tmpL, vtmp3, __ D, 1); 8828 __ eor(rscratch2, tmpU, tmpL); 8829 __ cbnz(rscratch2, DIFF1); 8830 8831 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 8832 __ ldr(tmpU, Address(__ post(cnt1, 8))); 8833 __ fmovd(tmpL, vtmp); 8834 __ eor(rscratch2, tmp3, tmpL); 8835 __ cbnz(rscratch2, DIFF2); 8836 8837 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8838 __ umov(tmpL, vtmp, __ D, 1); 8839 __ eor(rscratch2, tmpU, tmpL); 8840 __ cbnz(rscratch2, DIFF1); 8841 } 8842 8843 // r0 = result 8844 // r1 = str1 8845 // r2 = cnt1 8846 // r3 = str2 8847 // r4 = cnt2 8848 // r10 = tmp1 8849 // r11 = tmp2 8850 address generate_compare_long_string_different_encoding(bool isLU) { 8851 __ align(CodeEntryAlignment); 8852 StubId stub_id = (isLU ? 
StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id); 8853 StubCodeMark mark(this, stub_id); 8854 address entry = __ pc(); 8855 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 8856 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 8857 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 8858 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 8859 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 8860 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 8861 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 8862 8863 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 8864 8865 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 8866 // cnt2 == amount of characters left to compare 8867 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 8868 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8869 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 8870 __ add(str2, str2, isLU ? wordSize : wordSize/2); 8871 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 8872 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 8873 __ eor(rscratch2, tmp1, tmp2); 8874 __ mov(rscratch1, tmp2); 8875 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 8876 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 8877 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 8878 __ push(spilled_regs, sp); 8879 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 8880 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 8881 8882 __ ldr(tmp3, Address(__ post(cnt1, 8))); 8883 8884 if (SoftwarePrefetchHintDistance >= 0) { 8885 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8886 __ br(__ LT, NO_PREFETCH); 8887 __ bind(LARGE_LOOP_PREFETCH); 8888 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 8889 __ mov(tmp4, 2); 8890 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8891 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 8892 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8893 __ subs(tmp4, tmp4, 1); 8894 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 8895 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 8896 __ mov(tmp4, 2); 8897 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 8898 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8899 __ subs(tmp4, tmp4, 1); 8900 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 8901 __ sub(cnt2, cnt2, 64); 8902 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 8903 __ br(__ GE, LARGE_LOOP_PREFETCH); 8904 } 8905 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 8906 __ bind(NO_PREFETCH); 8907 __ subs(cnt2, cnt2, 16); 8908 __ br(__ LT, TAIL); 8909 __ align(OptoLoopAlignment); 8910 __ bind(SMALL_LOOP); // smaller loop 8911 __ subs(cnt2, cnt2, 16); 8912 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 8913 __ br(__ GE, SMALL_LOOP); 8914 __ cmn(cnt2, (u1)16); 8915 __ br(__ EQ, LOAD_LAST); 8916 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 8917 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 8918 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 8919 __ ldr(tmp3, Address(cnt1, -8)); 8920 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 8921 __ b(LOAD_LAST); 8922 __ bind(DIFF2); 8923 __ mov(tmpU, tmp3); 8924 __ bind(DIFF1); 8925 __ pop(spilled_regs, sp); 8926 __ b(CALCULATE_DIFFERENCE); 8927 __ bind(LOAD_LAST); 8928 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by 
compare_string_16_x_LU. 8929 // No need to load it again 8930 __ mov(tmpU, tmp3); 8931 __ pop(spilled_regs, sp); 8932 8933 // tmp2 points to the address of the last 4 Latin1 characters right now 8934 __ ldrs(vtmp, Address(tmp2)); 8935 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 8936 __ fmovd(tmpL, vtmp); 8937 8938 __ eor(rscratch2, tmpU, tmpL); 8939 __ cbz(rscratch2, DONE); 8940 8941 // Find the first different characters in the longwords and 8942 // compute their difference. 8943 __ bind(CALCULATE_DIFFERENCE); 8944 __ rev(rscratch2, rscratch2); 8945 __ clz(rscratch2, rscratch2); 8946 __ andr(rscratch2, rscratch2, -16); 8947 __ lsrv(tmp1, tmp1, rscratch2); 8948 __ uxthw(tmp1, tmp1); 8949 __ lsrv(rscratch1, rscratch1, rscratch2); 8950 __ uxthw(rscratch1, rscratch1); 8951 __ subw(result, tmp1, rscratch1); 8952 __ bind(DONE); 8953 __ ret(lr); 8954 return entry; 8955 } 8956 8957 // r0 = input (float16) 8958 // v0 = result (float) 8959 // v1 = temporary float register 8960 address generate_float16ToFloat() { 8961 __ align(CodeEntryAlignment); 8962 StubId stub_id = StubId::stubgen_hf2f_id; 8963 StubCodeMark mark(this, stub_id); 8964 address entry = __ pc(); 8965 BLOCK_COMMENT("Entry:"); 8966 __ flt16_to_flt(v0, r0, v1); 8967 __ ret(lr); 8968 return entry; 8969 } 8970 8971 // v0 = input (float) 8972 // r0 = result (float16) 8973 // v1 = temporary float register 8974 address generate_floatToFloat16() { 8975 __ align(CodeEntryAlignment); 8976 StubId stub_id = StubId::stubgen_f2hf_id; 8977 StubCodeMark mark(this, stub_id); 8978 address entry = __ pc(); 8979 BLOCK_COMMENT("Entry:"); 8980 __ flt_to_flt16(r0, v0, v1); 8981 __ ret(lr); 8982 return entry; 8983 } 8984 8985 address generate_method_entry_barrier() { 8986 __ align(CodeEntryAlignment); 8987 StubId stub_id = StubId::stubgen_method_entry_barrier_id; 8988 StubCodeMark mark(this, stub_id); 8989 8990 Label deoptimize_label; 8991 8992 address start = __ pc(); 8993 8994 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 8995 8996 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 8997 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8998 // We can get here despite the nmethod being good, if we have not 8999 // yet applied our cross modification fence (or data fence). 
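      // In outline, the instructions emitted below do (informal sketch, not extra code):
      //   thread epoch slot <- *patching_epoch_addr;   // record the patching epoch we observed
      //   isb();                                       // the cross-modification fence
      //   membar(LoadLoad);                            // the data fence
      // where the thread's epoch slot is the word just past its disarmed guard value.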
9000 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 9001 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 9002 __ ldrw(rscratch2, rscratch2); 9003 __ strw(rscratch2, thread_epoch_addr); 9004 __ isb(); 9005 __ membar(__ LoadLoad); 9006 } 9007 9008 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 9009 9010 __ enter(); 9011 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 9012 9013 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 9014 9015 __ push_call_clobbered_registers(); 9016 9017 __ mov(c_rarg0, rscratch2); 9018 __ call_VM_leaf 9019 (CAST_FROM_FN_PTR 9020 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 9021 9022 __ reset_last_Java_frame(true); 9023 9024 __ mov(rscratch1, r0); 9025 9026 __ pop_call_clobbered_registers(); 9027 9028 __ cbnz(rscratch1, deoptimize_label); 9029 9030 __ leave(); 9031 __ ret(lr); 9032 9033 __ BIND(deoptimize_label); 9034 9035 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 9036 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 9037 9038 __ mov(sp, rscratch1); 9039 __ br(rscratch2); 9040 9041 return start; 9042 } 9043 9044 // r0 = result 9045 // r1 = str1 9046 // r2 = cnt1 9047 // r3 = str2 9048 // r4 = cnt2 9049 // r10 = tmp1 9050 // r11 = tmp2 9051 address generate_compare_long_string_same_encoding(bool isLL) { 9052 __ align(CodeEntryAlignment); 9053 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id); 9054 StubCodeMark mark(this, stub_id); 9055 address entry = __ pc(); 9056 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 9057 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 9058 9059 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 9060 9061 // exit from large loop when less than 64 bytes left to read or we're about 9062 // to prefetch memory behind array border 9063 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 9064 9065 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 9066 __ eor(rscratch2, tmp1, tmp2); 9067 __ cbnz(rscratch2, CAL_DIFFERENCE); 9068 9069 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 9070 // update pointers, because of previous read 9071 __ add(str1, str1, wordSize); 9072 __ add(str2, str2, wordSize); 9073 if (SoftwarePrefetchHintDistance >= 0) { 9074 __ align(OptoLoopAlignment); 9075 __ bind(LARGE_LOOP_PREFETCH); 9076 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 9077 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 9078 9079 for (int i = 0; i < 4; i++) { 9080 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 9081 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 9082 __ cmp(tmp1, tmp2); 9083 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9084 __ br(Assembler::NE, DIFF); 9085 } 9086 __ sub(cnt2, cnt2, isLL ? 64 : 32); 9087 __ add(str1, str1, 64); 9088 __ add(str2, str2, 64); 9089 __ subs(rscratch2, cnt2, largeLoopExitCondition); 9090 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 9091 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 9092 } 9093 9094 __ subs(rscratch1, cnt2, isLL ? 
16 : 8); 9095 __ br(Assembler::LE, LESS16); 9096 __ align(OptoLoopAlignment); 9097 __ bind(LOOP_COMPARE16); 9098 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 9099 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 9100 __ cmp(tmp1, tmp2); 9101 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9102 __ br(Assembler::NE, DIFF); 9103 __ sub(cnt2, cnt2, isLL ? 16 : 8); 9104 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 9105 __ br(Assembler::LT, LESS16); 9106 9107 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 9108 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 9109 __ cmp(tmp1, tmp2); 9110 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 9111 __ br(Assembler::NE, DIFF); 9112 __ sub(cnt2, cnt2, isLL ? 16 : 8); 9113 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 9114 __ br(Assembler::GE, LOOP_COMPARE16); 9115 __ cbz(cnt2, LENGTH_DIFF); 9116 9117 __ bind(LESS16); 9118 // each 8 compare 9119 __ subs(cnt2, cnt2, isLL ? 8 : 4); 9120 __ br(Assembler::LE, LESS8); 9121 __ ldr(tmp1, Address(__ post(str1, 8))); 9122 __ ldr(tmp2, Address(__ post(str2, 8))); 9123 __ eor(rscratch2, tmp1, tmp2); 9124 __ cbnz(rscratch2, CAL_DIFFERENCE); 9125 __ sub(cnt2, cnt2, isLL ? 8 : 4); 9126 9127 __ bind(LESS8); // directly load last 8 bytes 9128 if (!isLL) { 9129 __ add(cnt2, cnt2, cnt2); 9130 } 9131 __ ldr(tmp1, Address(str1, cnt2)); 9132 __ ldr(tmp2, Address(str2, cnt2)); 9133 __ eor(rscratch2, tmp1, tmp2); 9134 __ cbz(rscratch2, LENGTH_DIFF); 9135 __ b(CAL_DIFFERENCE); 9136 9137 __ bind(DIFF); 9138 __ cmp(tmp1, tmp2); 9139 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 9140 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 9141 // reuse rscratch2 register for the result of eor instruction 9142 __ eor(rscratch2, tmp1, tmp2); 9143 9144 __ bind(CAL_DIFFERENCE); 9145 __ rev(rscratch2, rscratch2); 9146 __ clz(rscratch2, rscratch2); 9147 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 9148 __ lsrv(tmp1, tmp1, rscratch2); 9149 __ lsrv(tmp2, tmp2, rscratch2); 9150 if (isLL) { 9151 __ uxtbw(tmp1, tmp1); 9152 __ uxtbw(tmp2, tmp2); 9153 } else { 9154 __ uxthw(tmp1, tmp1); 9155 __ uxthw(tmp2, tmp2); 9156 } 9157 __ subw(result, tmp1, tmp2); 9158 9159 __ bind(LENGTH_DIFF); 9160 __ ret(lr); 9161 return entry; 9162 } 9163 9164 enum string_compare_mode { 9165 LL, 9166 LU, 9167 UL, 9168 UU, 9169 }; 9170 9171 // The following registers are declared in aarch64.ad 9172 // r0 = result 9173 // r1 = str1 9174 // r2 = cnt1 9175 // r3 = str2 9176 // r4 = cnt2 9177 // r10 = tmp1 9178 // r11 = tmp2 9179 // z0 = ztmp1 9180 // z1 = ztmp2 9181 // p0 = pgtmp1 9182 // p1 = pgtmp2 9183 address generate_compare_long_string_sve(string_compare_mode mode) { 9184 StubId stub_id; 9185 switch (mode) { 9186 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break; 9187 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break; 9188 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break; 9189 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break; 9190 default: ShouldNotReachHere(); 9191 } 9192 9193 __ align(CodeEntryAlignment); 9194 address entry = __ pc(); 9195 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 9196 tmp1 = r10, tmp2 = r11; 9197 9198 Label LOOP, DONE, MISMATCH; 9199 Register vec_len = tmp1; 9200 Register idx = tmp2; 9201 // The minimum of the string lengths has been stored in cnt2. 
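    // In outline, the predicated loop below handles vec_len characters per iteration while
    // idx < cnt - vec_len, and then performs one final iteration under a whilelt(idx, cnt)
    // predicate so that lanes beyond the end of the strings are masked off
    // (an informal sketch of the control flow, not extra emitted code).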
9202 Register cnt = cnt2; 9203 FloatRegister ztmp1 = z0, ztmp2 = z1; 9204 PRegister pgtmp1 = p0, pgtmp2 = p1; 9205 9206 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 9207 switch (mode) { \ 9208 case LL: \ 9209 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 9210 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 9211 break; \ 9212 case LU: \ 9213 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 9214 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 9215 break; \ 9216 case UL: \ 9217 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 9218 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 9219 break; \ 9220 case UU: \ 9221 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 9222 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 9223 break; \ 9224 default: \ 9225 ShouldNotReachHere(); \ 9226 } 9227 9228 StubCodeMark mark(this, stub_id); 9229 9230 __ mov(idx, 0); 9231 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 9232 9233 if (mode == LL) { 9234 __ sve_cntb(vec_len); 9235 } else { 9236 __ sve_cnth(vec_len); 9237 } 9238 9239 __ sub(rscratch1, cnt, vec_len); 9240 9241 __ bind(LOOP); 9242 9243 // main loop 9244 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 9245 __ add(idx, idx, vec_len); 9246 // Compare strings. 9247 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 9248 __ br(__ NE, MISMATCH); 9249 __ cmp(idx, rscratch1); 9250 __ br(__ LT, LOOP); 9251 9252 // post loop, last iteration 9253 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 9254 9255 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 9256 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 9257 __ br(__ EQ, DONE); 9258 9259 __ bind(MISMATCH); 9260 9261 // Crop the vector to find its location. 9262 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 9263 // Extract the first different characters of each string. 9264 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 9265 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 9266 9267 // Compute the difference of the first different characters. 
    __ sub(result, rscratch1, rscratch2);

    __ bind(DONE);
    __ ret(lr);
#undef LOAD_PAIR
    return entry;
  }

  void generate_compare_long_strings() {
    if (UseSVE == 0) {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_same_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_same_encoding(false);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_different_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_different_encoding(false);
    } else {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_sve(LL);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_sve(UU);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_sve(LU);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_sve(UL);
    }
  }

  // R0 = result
  // R1 = str2
  // R2 = cnt1
  // R3 = str1
  // R4 = cnt2
  // Clobbers: rscratch1, rscratch2, v0, v1, rflags
  //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) since the pattern length is >= 8, we can safely keep at least the first
  //    register of the pattern loaded, skipping its initial load (this helps on
  //    systems with a single load pipeline)
  // 2) we use a "fast" algorithm for finding the first pattern character that
  //    needs fewer branches (one branch per loaded register instead of one per
  //    character); this is where constants like 0x0101...01, 0x00010001...0001,
  //    0x7f7f...7f and 0x7fff7fff...7fff come from
  // 3) once the first register of the source string has been loaded and
  //    analyzed, it can be reused to search for every occurrence of the first
  //    character, saving a few loads compared with a simpler-but-slower
  //    implementation
  // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
  //    re-initializes and compresses register values, which makes the code
  //    larger and a bit less readable; however, most of the extra operations
  //    are issued during loads or branches, so the penalty is minimal
  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
    StubId stub_id;
    if (str1_isL) {
      if (str2_isL) {
        stub_id = StubId::stubgen_string_indexof_linear_ll_id;
      } else {
        stub_id = StubId::stubgen_string_indexof_linear_ul_id;
      }
    } else {
      if (str2_isL) {
        ShouldNotReachHere();
      } else {
        stub_id = StubId::stubgen_string_indexof_linear_uu_id;
      }
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();

    int str1_chr_size = str1_isL ? 1 : 2;
    int str2_chr_size = str2_isL ? 1 : 2;
    int str1_chr_shift = str1_isL ? 0 : 1;
    int str2_chr_shift = str2_isL ?
0 : 1; 9342 bool isL = str1_isL && str2_isL; 9343 // parameters 9344 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 9345 // temporary registers 9346 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 9347 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 9348 // redefinitions 9349 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 9350 9351 __ push(spilled_regs, sp); 9352 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 9353 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 9354 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 9355 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 9356 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 9357 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 9358 // Read whole register from str1. It is safe, because length >=8 here 9359 __ ldr(ch1, Address(str1)); 9360 // Read whole register from str2. It is safe, because length >=8 here 9361 __ ldr(ch2, Address(str2)); 9362 __ sub(cnt2, cnt2, cnt1); 9363 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 9364 if (str1_isL != str2_isL) { 9365 __ eor(v0, __ T16B, v0, v0); 9366 } 9367 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 9368 __ mul(first, first, tmp1); 9369 // check if we have less than 1 register to check 9370 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 9371 if (str1_isL != str2_isL) { 9372 __ fmovd(v1, ch1); 9373 } 9374 __ br(__ LE, L_SMALL); 9375 __ eor(ch2, first, ch2); 9376 if (str1_isL != str2_isL) { 9377 __ zip1(v1, __ T16B, v1, v0); 9378 } 9379 __ sub(tmp2, ch2, tmp1); 9380 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9381 __ bics(tmp2, tmp2, ch2); 9382 if (str1_isL != str2_isL) { 9383 __ fmovd(ch1, v1); 9384 } 9385 __ br(__ NE, L_HAS_ZERO); 9386 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 9387 __ add(result, result, wordSize/str2_chr_size); 9388 __ add(str2, str2, wordSize); 9389 __ br(__ LT, L_POST_LOOP); 9390 __ BIND(L_LOOP); 9391 __ ldr(ch2, Address(str2)); 9392 __ eor(ch2, first, ch2); 9393 __ sub(tmp2, ch2, tmp1); 9394 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9395 __ bics(tmp2, tmp2, ch2); 9396 __ br(__ NE, L_HAS_ZERO); 9397 __ BIND(L_LOOP_PROCEED); 9398 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 9399 __ add(str2, str2, wordSize); 9400 __ add(result, result, wordSize/str2_chr_size); 9401 __ br(__ GE, L_LOOP); 9402 __ BIND(L_POST_LOOP); 9403 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 9404 __ br(__ LE, NOMATCH); 9405 __ ldr(ch2, Address(str2)); 9406 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9407 __ eor(ch2, first, ch2); 9408 __ sub(tmp2, ch2, tmp1); 9409 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9410 __ mov(tmp4, -1); // all bits set 9411 __ b(L_SMALL_PROCEED); 9412 __ align(OptoLoopAlignment); 9413 __ BIND(L_SMALL); 9414 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 9415 __ eor(ch2, first, ch2); 9416 if (str1_isL != str2_isL) { 9417 __ zip1(v1, __ T16B, v1, v0); 9418 } 9419 __ sub(tmp2, ch2, tmp1); 9420 __ mov(tmp4, -1); // all bits set 9421 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 9422 if (str1_isL != str2_isL) { 9423 __ fmovd(ch1, v1); // move converted 4 symbols 9424 } 9425 __ BIND(L_SMALL_PROCEED); 9426 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
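    // The sub/orr above and the bic/ands below implement the usual SWAR zero-byte test
    // mentioned in the comment before this stub: with x = the loaded str2 word XORed with the
    // broadcast first pattern character,
    //   (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f)
    // is non-zero iff some byte of x is zero, i.e. iff the first pattern character occurs in
    // the loaded word (0x0001.../0x7fff... play the same role for 16-bit characters). Each
    // resulting 0x80 marker is only a candidate position and is verified by the comparison
    // loops that follow. A scalar sketch of the byte case (illustrative only):
    //   static uint64_t zero_byte_markers(uint64_t x) {
    //     return (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
    //   }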
9427 __ bic(tmp2, tmp2, ch2); 9428 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 9429 __ rbit(tmp2, tmp2); 9430 __ br(__ EQ, NOMATCH); 9431 __ BIND(L_SMALL_HAS_ZERO_LOOP); 9432 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 9433 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 9434 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 9435 if (str2_isL) { // LL 9436 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9437 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9438 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9439 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9440 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9441 } else { 9442 __ mov(ch2, 0xE); // all bits in byte set except last one 9443 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9444 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9445 __ lslv(tmp2, tmp2, tmp4); 9446 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9447 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9448 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9449 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9450 } 9451 __ cmp(ch1, ch2); 9452 __ mov(tmp4, wordSize/str2_chr_size); 9453 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9454 __ BIND(L_SMALL_CMP_LOOP); 9455 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9456 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9457 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9458 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9459 __ add(tmp4, tmp4, 1); 9460 __ cmp(tmp4, cnt1); 9461 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 9462 __ cmp(first, ch2); 9463 __ br(__ EQ, L_SMALL_CMP_LOOP); 9464 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 9465 __ cbz(tmp2, NOMATCH); // no more matches. exit 9466 __ clz(tmp4, tmp2); 9467 __ add(result, result, 1); // advance index 9468 __ add(str2, str2, str2_chr_size); // advance pointer 9469 __ b(L_SMALL_HAS_ZERO_LOOP); 9470 __ align(OptoLoopAlignment); 9471 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 9472 __ cmp(first, ch2); 9473 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9474 __ b(DONE); 9475 __ align(OptoLoopAlignment); 9476 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 9477 if (str2_isL) { // LL 9478 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 9479 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 9480 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 9481 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 9482 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9483 } else { 9484 __ mov(ch2, 0xE); // all bits in byte set except last one 9485 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9486 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9487 __ lslv(tmp2, tmp2, tmp4); 9488 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9489 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9490 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 9491 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9492 } 9493 __ cmp(ch1, ch2); 9494 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 9495 __ b(DONE); 9496 __ align(OptoLoopAlignment); 9497 __ BIND(L_HAS_ZERO); 9498 __ rbit(tmp2, tmp2); 9499 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 9500 // Now, perform compression of counters(cnt2 and cnt1) into one register. 9501 // It's fine because both counters are 32bit and are not changed in this 9502 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 9503 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 9504 __ sub(result, result, 1); 9505 __ BIND(L_HAS_ZERO_LOOP); 9506 __ mov(cnt1, wordSize/str2_chr_size); 9507 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9508 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 9509 if (str2_isL) { 9510 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9511 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9512 __ lslv(tmp2, tmp2, tmp4); 9513 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9514 __ add(tmp4, tmp4, 1); 9515 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9516 __ lsl(tmp2, tmp2, 1); 9517 __ mov(tmp4, wordSize/str2_chr_size); 9518 } else { 9519 __ mov(ch2, 0xE); 9520 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9521 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9522 __ lslv(tmp2, tmp2, tmp4); 9523 __ add(tmp4, tmp4, 1); 9524 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9525 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9526 __ lsl(tmp2, tmp2, 1); 9527 __ mov(tmp4, wordSize/str2_chr_size); 9528 __ sub(str2, str2, str2_chr_size); 9529 } 9530 __ cmp(ch1, ch2); 9531 __ mov(tmp4, wordSize/str2_chr_size); 9532 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9533 __ BIND(L_CMP_LOOP); 9534 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 9535 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 9536 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 9537 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 9538 __ add(tmp4, tmp4, 1); 9539 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 9540 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 9541 __ cmp(cnt1, ch2); 9542 __ br(__ EQ, L_CMP_LOOP); 9543 __ BIND(L_CMP_LOOP_NOMATCH); 9544 // here we're not matched 9545 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 9546 __ clz(tmp4, tmp2); 9547 __ add(str2, str2, str2_chr_size); // advance pointer 9548 __ b(L_HAS_ZERO_LOOP); 9549 __ align(OptoLoopAlignment); 9550 __ BIND(L_CMP_LOOP_LAST_CMP); 9551 __ cmp(cnt1, ch2); 9552 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9553 __ b(DONE); 9554 __ align(OptoLoopAlignment); 9555 __ BIND(L_CMP_LOOP_LAST_CMP2); 9556 if (str2_isL) { 9557 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 9558 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 9559 __ lslv(tmp2, tmp2, tmp4); 9560 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9561 __ add(tmp4, tmp4, 1); 9562 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9563 __ lsl(tmp2, tmp2, 1); 9564 } else { 9565 __ mov(ch2, 0xE); 9566 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 9567 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
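      // (In the address computed above, tmp4 is the bit index of the candidate match within the
      // loaded word; tmp4 >> LogBitsPerByte is its byte offset, and masking with 0xE rounds that
      // offset down to an even value so the 8-byte read starts on a whole UTF-16 character.)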
9568 __ lslv(tmp2, tmp2, tmp4); 9569 __ add(tmp4, tmp4, 1); 9570 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 9571 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 9572 __ lsl(tmp2, tmp2, 1); 9573 __ sub(str2, str2, str2_chr_size); 9574 } 9575 __ cmp(ch1, ch2); 9576 __ br(__ NE, L_CMP_LOOP_NOMATCH); 9577 __ b(DONE); 9578 __ align(OptoLoopAlignment); 9579 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 9580 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 9581 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 9582 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 9583 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 9584 // result by analyzed characters value, so, we can just reset lower bits 9585 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 9586 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 9587 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 9588 // index of last analyzed substring inside current octet. So, str2 in at 9589 // respective start address. We need to advance it to next octet 9590 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 9591 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 9592 __ bfm(result, zr, 0, 2 - str2_chr_shift); 9593 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 9594 __ movw(cnt2, cnt2); 9595 __ b(L_LOOP_PROCEED); 9596 __ align(OptoLoopAlignment); 9597 __ BIND(NOMATCH); 9598 __ mov(result, -1); 9599 __ BIND(DONE); 9600 __ pop(spilled_regs, sp); 9601 __ ret(lr); 9602 return entry; 9603 } 9604 9605 void generate_string_indexof_stubs() { 9606 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 9607 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 9608 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 9609 } 9610 9611 void inflate_and_store_2_fp_registers(bool generatePrfm, 9612 FloatRegister src1, FloatRegister src2) { 9613 Register dst = r1; 9614 __ zip1(v1, __ T16B, src1, v0); 9615 __ zip2(v2, __ T16B, src1, v0); 9616 if (generatePrfm) { 9617 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 9618 } 9619 __ zip1(v3, __ T16B, src2, v0); 9620 __ zip2(v4, __ T16B, src2, v0); 9621 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 9622 } 9623 9624 // R0 = src 9625 // R1 = dst 9626 // R2 = len 9627 // R3 = len >> 3 9628 // V0 = 0 9629 // v1 = loaded 8 bytes 9630 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 9631 address generate_large_byte_array_inflate() { 9632 __ align(CodeEntryAlignment); 9633 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id; 9634 StubCodeMark mark(this, stub_id); 9635 address entry = __ pc(); 9636 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 9637 Register src = r0, dst = r1, len = r2, octetCounter = r3; 9638 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 9639 9640 // do one more 8-byte read to have address 16-byte aligned in most cases 9641 // also use single store instruction 9642 __ ldrd(v2, __ post(src, 8)); 9643 __ sub(octetCounter, octetCounter, 2); 9644 __ zip1(v1, __ T16B, v1, v0); 9645 __ zip1(v2, __ T16B, v2, v0); 9646 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 9647 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9648 __ subs(rscratch1, octetCounter, large_loop_threshold); 9649 __ br(__ LE, LOOP_START); 9650 __ 
b(LOOP_PRFM_START); 9651 __ bind(LOOP_PRFM); 9652 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9653 __ bind(LOOP_PRFM_START); 9654 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 9655 __ sub(octetCounter, octetCounter, 8); 9656 __ subs(rscratch1, octetCounter, large_loop_threshold); 9657 inflate_and_store_2_fp_registers(true, v3, v4); 9658 inflate_and_store_2_fp_registers(true, v5, v6); 9659 __ br(__ GT, LOOP_PRFM); 9660 __ cmp(octetCounter, (u1)8); 9661 __ br(__ LT, DONE); 9662 __ bind(LOOP); 9663 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 9664 __ bind(LOOP_START); 9665 __ sub(octetCounter, octetCounter, 8); 9666 __ cmp(octetCounter, (u1)8); 9667 inflate_and_store_2_fp_registers(false, v3, v4); 9668 inflate_and_store_2_fp_registers(false, v5, v6); 9669 __ br(__ GE, LOOP); 9670 __ bind(DONE); 9671 __ ret(lr); 9672 return entry; 9673 } 9674 9675 /** 9676 * Arguments: 9677 * 9678 * Input: 9679 * c_rarg0 - current state address 9680 * c_rarg1 - H key address 9681 * c_rarg2 - data address 9682 * c_rarg3 - number of blocks 9683 * 9684 * Output: 9685 * Updated state at c_rarg0 9686 */ 9687 address generate_ghash_processBlocks() { 9688 // Bafflingly, GCM uses little-endian for the byte order, but 9689 // big-endian for the bit order. For example, the polynomial 1 is 9690 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 9691 // 9692 // So, we must either reverse the bytes in each word and do 9693 // everything big-endian or reverse the bits in each byte and do 9694 // it little-endian. On AArch64 it's more idiomatic to reverse 9695 // the bits in each byte (we have an instruction, RBIT, to do 9696 // that) and keep the data in little-endian bit order through the 9697 // calculation, bit-reversing the inputs and outputs. 
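    // A rough scalar sketch of the bit-reflection step, for illustration
    // only (the stub itself uses RBIT); the helper name is hypothetical.
    // Reflecting every byte turns GCM's reversed bit order (polynomial 1 is
    // the byte string 80 00 ... 00, as noted above) into the natural
    // little-endian bit order used through the multiply loop:
    //
    //   static inline uint8_t reflect8(uint8_t b) {
    //     uint8_t r = 0;
    //     for (int i = 0; i < 8; i++) {
    //       r = (uint8_t)((r << 1) | ((b >> i) & 1)); // bit i -> bit 7-i
    //     }
    //     return r;
    //   }
    //
    //   // Applied to every byte of state, subkeyH and each data block on
    //   // the way in, and to the result on the way out.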
9698 9699 StubId stub_id = StubId::stubgen_ghash_processBlocks_id; 9700 StubCodeMark mark(this, stub_id); 9701 Label polynomial; // local data generated at end of stub 9702 __ align(CodeEntryAlignment); 9703 address start = __ pc(); 9704 9705 Register state = c_rarg0; 9706 Register subkeyH = c_rarg1; 9707 Register data = c_rarg2; 9708 Register blocks = c_rarg3; 9709 9710 FloatRegister vzr = v30; 9711 __ eor(vzr, __ T16B, vzr, vzr); // zero register 9712 9713 __ adr(rscratch1, polynomial); 9714 __ ldrq(v24, rscratch1); // The field polynomial 9715 9716 __ ldrq(v0, Address(state)); 9717 __ ldrq(v1, Address(subkeyH)); 9718 9719 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 9720 __ rbit(v0, __ T16B, v0); 9721 __ rev64(v1, __ T16B, v1); 9722 __ rbit(v1, __ T16B, v1); 9723 9724 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 9725 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 9726 9727 { 9728 Label L_ghash_loop; 9729 __ bind(L_ghash_loop); 9730 9731 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 9732 // reversing each byte 9733 __ rbit(v2, __ T16B, v2); 9734 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 9735 9736 // Multiply state in v2 by subkey in v1 9737 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 9738 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 9739 /*temps*/v6, v3, /*reuse/clobber b*/v2); 9740 // Reduce v7:v5 by the field polynomial 9741 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 9742 9743 __ sub(blocks, blocks, 1); 9744 __ cbnz(blocks, L_ghash_loop); 9745 } 9746 9747 // The bit-reversed result is at this point in v0 9748 __ rev64(v0, __ T16B, v0); 9749 __ rbit(v0, __ T16B, v0); 9750 9751 __ st1(v0, __ T16B, state); 9752 __ ret(lr); 9753 9754 // bind label and generate local polynomial data 9755 __ align(wordSize * 2); 9756 __ bind(polynomial); 9757 __ emit_int64(0x87); // The low-order bits of the field 9758 // polynomial (i.e. p = z^7+z^2+z+1) 9759 // repeated in the low and high parts of a 9760 // 128-bit vector 9761 __ emit_int64(0x87); 9762 9763 return start; 9764 } 9765 9766 address generate_ghash_processBlocks_wide() { 9767 address small = generate_ghash_processBlocks(); 9768 9769 StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id; 9770 StubCodeMark mark(this, stub_id); 9771 Label polynomial; // local data generated after stub 9772 __ align(CodeEntryAlignment); 9773 address start = __ pc(); 9774 9775 Register state = c_rarg0; 9776 Register subkeyH = c_rarg1; 9777 Register data = c_rarg2; 9778 Register blocks = c_rarg3; 9779 9780 const int unroll = 4; 9781 9782 __ cmp(blocks, (unsigned char)(unroll * 2)); 9783 __ br(__ LT, small); 9784 9785 if (unroll > 1) { 9786 // Save state before entering routine 9787 __ sub(sp, sp, 4 * 16); 9788 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 9789 __ sub(sp, sp, 4 * 16); 9790 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 9791 } 9792 9793 __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll); 9794 9795 if (unroll > 1) { 9796 // And restore state 9797 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 9798 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 9799 } 9800 9801 __ cmp(blocks, (unsigned char)0); 9802 __ br(__ GT, small); 9803 9804 __ ret(lr); 9805 9806 // bind label and generate polynomial data 9807 __ align(wordSize * 2); 9808 __ bind(polynomial); 9809 __ emit_int64(0x87); // The low-order bits of the field 9810 // polynomial (i.e. 
p = z^7+z^2+z+1) 9811 // repeated in the low and high parts of a 9812 // 128-bit vector 9813 __ emit_int64(0x87); 9814 9815 return start; 9816 9817 } 9818 9819 void generate_base64_encode_simdround(Register src, Register dst, 9820 FloatRegister codec, u8 size) { 9821 9822 FloatRegister in0 = v4, in1 = v5, in2 = v6; 9823 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 9824 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 9825 9826 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 9827 9828 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 9829 9830 __ ushr(ind0, arrangement, in0, 2); 9831 9832 __ ushr(ind1, arrangement, in1, 2); 9833 __ shl(in0, arrangement, in0, 6); 9834 __ orr(ind1, arrangement, ind1, in0); 9835 __ ushr(ind1, arrangement, ind1, 2); 9836 9837 __ ushr(ind2, arrangement, in2, 4); 9838 __ shl(in1, arrangement, in1, 4); 9839 __ orr(ind2, arrangement, in1, ind2); 9840 __ ushr(ind2, arrangement, ind2, 2); 9841 9842 __ shl(ind3, arrangement, in2, 2); 9843 __ ushr(ind3, arrangement, ind3, 2); 9844 9845 __ tbl(out0, arrangement, codec, 4, ind0); 9846 __ tbl(out1, arrangement, codec, 4, ind1); 9847 __ tbl(out2, arrangement, codec, 4, ind2); 9848 __ tbl(out3, arrangement, codec, 4, ind3); 9849 9850 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 9851 } 9852 9853 /** 9854 * Arguments: 9855 * 9856 * Input: 9857 * c_rarg0 - src_start 9858 * c_rarg1 - src_offset 9859 * c_rarg2 - src_length 9860 * c_rarg3 - dest_start 9861 * c_rarg4 - dest_offset 9862 * c_rarg5 - isURL 9863 * 9864 */ 9865 address generate_base64_encodeBlock() { 9866 9867 static const char toBase64[64] = { 9868 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9869 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9870 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9871 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9872 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 9873 }; 9874 9875 static const char toBase64URL[64] = { 9876 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 9877 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 9878 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 9879 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 9880 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 9881 }; 9882 9883 __ align(CodeEntryAlignment); 9884 StubId stub_id = StubId::stubgen_base64_encodeBlock_id; 9885 StubCodeMark mark(this, stub_id); 9886 address start = __ pc(); 9887 9888 Register src = c_rarg0; // source array 9889 Register soff = c_rarg1; // source start offset 9890 Register send = c_rarg2; // source end offset 9891 Register dst = c_rarg3; // dest array 9892 Register doff = c_rarg4; // position for writing to dest array 9893 Register isURL = c_rarg5; // Base64 or URL character set 9894 9895 // c_rarg6 and c_rarg7 are free to use as temps 9896 Register codec = c_rarg6; 9897 Register length = c_rarg7; 9898 9899 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 9900 9901 __ add(src, src, soff); 9902 __ add(dst, dst, doff); 9903 __ sub(length, send, soff); 9904 9905 // load the codec base address 9906 __ lea(codec, ExternalAddress((address) toBase64)); 9907 __ cbz(isURL, ProcessData); 9908 __ lea(codec, ExternalAddress((address) toBase64URL)); 9909 9910 __ BIND(ProcessData); 9911 9912 // too short to formup a SIMD loop, roll back 9913 __ cmp(length, (u1)24); 9914 __ 
br(Assembler::LT, Process3B); 9915 9916 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 9917 9918 __ BIND(Process48B); 9919 __ cmp(length, (u1)48); 9920 __ br(Assembler::LT, Process24B); 9921 generate_base64_encode_simdround(src, dst, v0, 16); 9922 __ sub(length, length, 48); 9923 __ b(Process48B); 9924 9925 __ BIND(Process24B); 9926 __ cmp(length, (u1)24); 9927 __ br(Assembler::LT, SIMDExit); 9928 generate_base64_encode_simdround(src, dst, v0, 8); 9929 __ sub(length, length, 24); 9930 9931 __ BIND(SIMDExit); 9932 __ cbz(length, Exit); 9933 9934 __ BIND(Process3B); 9935 // 3 src bytes, 24 bits 9936 __ ldrb(r10, __ post(src, 1)); 9937 __ ldrb(r11, __ post(src, 1)); 9938 __ ldrb(r12, __ post(src, 1)); 9939 __ orrw(r11, r11, r10, Assembler::LSL, 8); 9940 __ orrw(r12, r12, r11, Assembler::LSL, 8); 9941 // codec index 9942 __ ubfmw(r15, r12, 18, 23); 9943 __ ubfmw(r14, r12, 12, 17); 9944 __ ubfmw(r13, r12, 6, 11); 9945 __ andw(r12, r12, 63); 9946 // get the code based on the codec 9947 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 9948 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 9949 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 9950 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 9951 __ strb(r15, __ post(dst, 1)); 9952 __ strb(r14, __ post(dst, 1)); 9953 __ strb(r13, __ post(dst, 1)); 9954 __ strb(r12, __ post(dst, 1)); 9955 __ sub(length, length, 3); 9956 __ cbnz(length, Process3B); 9957 9958 __ BIND(Exit); 9959 __ ret(lr); 9960 9961 return start; 9962 } 9963 9964 void generate_base64_decode_simdround(Register src, Register dst, 9965 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 9966 9967 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 9968 FloatRegister out0 = v20, out1 = v21, out2 = v22; 9969 9970 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 9971 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 9972 9973 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 9974 9975 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 9976 9977 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 9978 9979 // we need unsigned saturating subtract, to make sure all input values 9980 // in range [0, 63] will have 0U value in the higher half lookup 9981 __ uqsubv(decH0, __ T16B, in0, v27); 9982 __ uqsubv(decH1, __ T16B, in1, v27); 9983 __ uqsubv(decH2, __ T16B, in2, v27); 9984 __ uqsubv(decH3, __ T16B, in3, v27); 9985 9986 // lower half lookup 9987 __ tbl(decL0, arrangement, codecL, 4, in0); 9988 __ tbl(decL1, arrangement, codecL, 4, in1); 9989 __ tbl(decL2, arrangement, codecL, 4, in2); 9990 __ tbl(decL3, arrangement, codecL, 4, in3); 9991 9992 // higher half lookup 9993 __ tbx(decH0, arrangement, codecH, 4, decH0); 9994 __ tbx(decH1, arrangement, codecH, 4, decH1); 9995 __ tbx(decH2, arrangement, codecH, 4, decH2); 9996 __ tbx(decH3, arrangement, codecH, 4, decH3); 9997 9998 // combine lower and higher 9999 __ orr(decL0, arrangement, decL0, decH0); 10000 __ orr(decL1, arrangement, decL1, decH1); 10001 __ orr(decL2, arrangement, decL2, decH2); 10002 __ orr(decL3, arrangement, decL3, decH3); 10003 10004 // check illegal inputs, value larger than 63 (maximum of 6 bits) 10005 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 10006 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 10007 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 10008 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 10009 __ orr(in0, arrangement, decH0, decH1); 10010 __ orr(in1, arrangement, decH2, decH3); 10011 __ orr(in2, arrangement, in0, in1); 10012 __ umaxv(in3, arrangement, in2); 10013 __ umov(rscratch2, in3, __ B, 0); 10014 10015 // get the data to output 10016 __ shl(out0, arrangement, decL0, 2); 10017 __ ushr(out1, arrangement, decL1, 4); 10018 __ orr(out0, arrangement, out0, out1); 10019 __ shl(out1, arrangement, decL1, 4); 10020 __ ushr(out2, arrangement, decL2, 2); 10021 __ orr(out1, arrangement, out1, out2); 10022 __ shl(out2, arrangement, decL2, 6); 10023 __ orr(out2, arrangement, out2, decL3); 10024 10025 __ cbz(rscratch2, NoIllegalData); 10026 10027 // handle illegal input 10028 __ umov(r10, in2, __ D, 0); 10029 if (size == 16) { 10030 __ cbnz(r10, ErrorInLowerHalf); 10031 10032 // illegal input is in higher half, store the lower half now. 
10033 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 10034 10035 __ umov(r10, in2, __ D, 1); 10036 __ umov(r11, out0, __ D, 1); 10037 __ umov(r12, out1, __ D, 1); 10038 __ umov(r13, out2, __ D, 1); 10039 __ b(StoreLegalData); 10040 10041 __ BIND(ErrorInLowerHalf); 10042 } 10043 __ umov(r11, out0, __ D, 0); 10044 __ umov(r12, out1, __ D, 0); 10045 __ umov(r13, out2, __ D, 0); 10046 10047 __ BIND(StoreLegalData); 10048 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 10049 __ strb(r11, __ post(dst, 1)); 10050 __ strb(r12, __ post(dst, 1)); 10051 __ strb(r13, __ post(dst, 1)); 10052 __ lsr(r10, r10, 8); 10053 __ lsr(r11, r11, 8); 10054 __ lsr(r12, r12, 8); 10055 __ lsr(r13, r13, 8); 10056 __ b(StoreLegalData); 10057 10058 __ BIND(NoIllegalData); 10059 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 10060 } 10061 10062 10063 /** 10064 * Arguments: 10065 * 10066 * Input: 10067 * c_rarg0 - src_start 10068 * c_rarg1 - src_offset 10069 * c_rarg2 - src_length 10070 * c_rarg3 - dest_start 10071 * c_rarg4 - dest_offset 10072 * c_rarg5 - isURL 10073 * c_rarg6 - isMIME 10074 * 10075 */ 10076 address generate_base64_decodeBlock() { 10077 10078 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 10079 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 10080 // titled "Base64 decoding". 10081 10082 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 10083 // except the trailing character '=' is also treated illegal value in this intrinsic. That 10084 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 10085 static const uint8_t fromBase64ForNoSIMD[256] = { 10086 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10087 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10088 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 10089 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10090 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 10091 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 10092 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 10093 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 10094 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10095 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10096 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10097 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10098 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10099 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10100 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10101 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10102 }; 10103 10104 static const uint8_t fromBase64URLForNoSIMD[256] = { 10105 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10106 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 
255u, 255u, 255u, 255u, 10107 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 10108 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10109 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 10110 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 10111 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 10112 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 10113 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10114 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10115 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10116 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10117 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10118 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10119 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10120 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10121 }; 10122 10123 // A legal value of base64 code is in range [0, 127]. We need two lookups 10124 // with tbl/tbx and combine them to get the decode data. The 1st table vector 10125 // lookup use tbl, out of range indices are set to 0 in destination. The 2nd 10126 // table vector lookup use tbx, out of range indices are unchanged in 10127 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 10128 // The value of index 64 is set to 0, so that we know that we already get the 10129 // decoded data with the 1st lookup. 
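    // A scalar sketch of the tbl/tbx double lookup described above, for
    // illustration only; the helper name is hypothetical. codecL stands for
    // the first 64 bytes of the table below, codecH for the last 64 bytes:
    //
    //   uint8_t decode_one(uint8_t b, const uint8_t* codecL, const uint8_t* codecH) {
    //     uint8_t lo  = (b < 64) ? codecL[b] : 0;          // tbl: out of range -> 0
    //     uint8_t idx = (b > 63) ? (uint8_t)(b - 63) : 0;  // uqsub saturates at 0
    //     uint8_t hi  = (idx < 64) ? codecH[idx] : idx;    // tbx: out of range -> unchanged
    //     return lo | hi;                                  // > 63 means illegal input
    //   }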
10130 static const uint8_t fromBase64ForSIMD[128] = { 10131 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10132 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10133 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 10134 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10135 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 10136 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 10137 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 10138 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 10139 }; 10140 10141 static const uint8_t fromBase64URLForSIMD[128] = { 10142 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10143 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 10144 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 10145 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 10146 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 10147 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 10148 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 10149 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 10150 }; 10151 10152 __ align(CodeEntryAlignment); 10153 StubId stub_id = StubId::stubgen_base64_decodeBlock_id; 10154 StubCodeMark mark(this, stub_id); 10155 address start = __ pc(); 10156 10157 Register src = c_rarg0; // source array 10158 Register soff = c_rarg1; // source start offset 10159 Register send = c_rarg2; // source end offset 10160 Register dst = c_rarg3; // dest array 10161 Register doff = c_rarg4; // position for writing to dest array 10162 Register isURL = c_rarg5; // Base64 or URL character set 10163 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 10164 10165 Register length = send; // reuse send as length of source data to process 10166 10167 Register simd_codec = c_rarg6; 10168 Register nosimd_codec = c_rarg7; 10169 10170 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 10171 10172 __ enter(); 10173 10174 __ add(src, src, soff); 10175 __ add(dst, dst, doff); 10176 10177 __ mov(doff, dst); 10178 10179 __ sub(length, send, soff); 10180 __ bfm(length, zr, 0, 1); 10181 10182 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 10183 __ cbz(isURL, ProcessData); 10184 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 10185 10186 __ BIND(ProcessData); 10187 __ mov(rscratch1, length); 10188 __ cmp(length, (u1)144); // 144 = 80 + 64 10189 __ br(Assembler::LT, Process4B); 10190 10191 // In the MIME case, the line length cannot be more than 76 10192 // bytes (see RFC 2045). This is too short a block for SIMD 10193 // to be worthwhile, so we use non-SIMD here. 
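    // Note: 79 = 80 - 1, so the Process4B loop below runs exactly 20 times
    // (80 bytes) and falls out with rscratch1 == -1, which is what
    // distinguishes this pre-process path from the pure non-SIMD path
    // (rscratch1 == 0) at the cbzw check further down.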
10194 __ movw(rscratch1, 79); 10195 10196 __ BIND(Process4B); 10197 __ ldrw(r14, __ post(src, 4)); 10198 __ ubfxw(r10, r14, 0, 8); 10199 __ ubfxw(r11, r14, 8, 8); 10200 __ ubfxw(r12, r14, 16, 8); 10201 __ ubfxw(r13, r14, 24, 8); 10202 // get the de-code 10203 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 10204 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 10205 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 10206 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 10207 // error detection, 255u indicates an illegal input 10208 __ orrw(r14, r10, r11); 10209 __ orrw(r15, r12, r13); 10210 __ orrw(r14, r14, r15); 10211 __ tbnz(r14, 7, Exit); 10212 // recover the data 10213 __ lslw(r14, r10, 10); 10214 __ bfiw(r14, r11, 4, 6); 10215 __ bfmw(r14, r12, 2, 5); 10216 __ rev16w(r14, r14); 10217 __ bfiw(r13, r12, 6, 2); 10218 __ strh(r14, __ post(dst, 2)); 10219 __ strb(r13, __ post(dst, 1)); 10220 // non-simd loop 10221 __ subsw(rscratch1, rscratch1, 4); 10222 __ br(Assembler::GT, Process4B); 10223 10224 // if exiting from PreProcess80B, rscratch1 == -1; 10225 // otherwise, rscratch1 == 0. 10226 __ cbzw(rscratch1, Exit); 10227 __ sub(length, length, 80); 10228 10229 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 10230 __ cbz(isURL, SIMDEnter); 10231 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 10232 10233 __ BIND(SIMDEnter); 10234 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 10235 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 10236 __ mov(rscratch1, 63); 10237 __ dup(v27, __ T16B, rscratch1); 10238 10239 __ BIND(Process64B); 10240 __ cmp(length, (u1)64); 10241 __ br(Assembler::LT, Process32B); 10242 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 10243 __ sub(length, length, 64); 10244 __ b(Process64B); 10245 10246 __ BIND(Process32B); 10247 __ cmp(length, (u1)32); 10248 __ br(Assembler::LT, SIMDExit); 10249 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 10250 __ sub(length, length, 32); 10251 __ b(Process32B); 10252 10253 __ BIND(SIMDExit); 10254 __ cbz(length, Exit); 10255 __ movw(rscratch1, length); 10256 __ b(Process4B); 10257 10258 __ BIND(Exit); 10259 __ sub(c_rarg0, dst, doff); 10260 10261 __ leave(); 10262 __ ret(lr); 10263 10264 return start; 10265 } 10266 10267 // Support for spin waits. 
10268 address generate_spin_wait() { 10269 __ align(CodeEntryAlignment); 10270 StubId stub_id = StubId::stubgen_spin_wait_id; 10271 StubCodeMark mark(this, stub_id); 10272 address start = __ pc(); 10273 10274 __ spin_wait(); 10275 __ ret(lr); 10276 10277 return start; 10278 } 10279 10280 void generate_lookup_secondary_supers_table_stub() { 10281 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id; 10282 StubCodeMark mark(this, stub_id); 10283 10284 const Register 10285 r_super_klass = r0, 10286 r_array_base = r1, 10287 r_array_length = r2, 10288 r_array_index = r3, 10289 r_sub_klass = r4, 10290 r_bitmap = rscratch2, 10291 result = r5; 10292 const FloatRegister 10293 vtemp = v0; 10294 10295 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 10296 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 10297 Label L_success; 10298 __ enter(); 10299 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 10300 r_array_base, r_array_length, r_array_index, 10301 vtemp, result, slot, 10302 /*stub_is_near*/true); 10303 __ leave(); 10304 __ ret(lr); 10305 } 10306 } 10307 10308 // Slow path implementation for UseSecondarySupersTable. 10309 address generate_lookup_secondary_supers_table_slow_path_stub() { 10310 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id; 10311 StubCodeMark mark(this, stub_id); 10312 10313 address start = __ pc(); 10314 const Register 10315 r_super_klass = r0, // argument 10316 r_array_base = r1, // argument 10317 temp1 = r2, // temp 10318 r_array_index = r3, // argument 10319 r_bitmap = rscratch2, // argument 10320 result = r5; // argument 10321 10322 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 10323 __ ret(lr); 10324 10325 return start; 10326 } 10327 10328 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 10329 10330 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX. 10331 // 10332 // If LSE is in use, generate LSE versions of all the stubs. The 10333 // non-LSE versions are in atomic_aarch64.S. 10334 10335 // class AtomicStubMark records the entry point of a stub and the 10336 // stub pointer which will point to it. The stub pointer is set to 10337 // the entry point when ~AtomicStubMark() is called, which must be 10338 // after ICache::invalidate_range. This ensures safe publication of 10339 // the generated code. 10340 class AtomicStubMark { 10341 address _entry_point; 10342 aarch64_atomic_stub_t *_stub; 10343 MacroAssembler *_masm; 10344 public: 10345 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 10346 _masm = masm; 10347 __ align(32); 10348 _entry_point = __ pc(); 10349 _stub = stub; 10350 } 10351 ~AtomicStubMark() { 10352 *_stub = (aarch64_atomic_stub_t)_entry_point; 10353 } 10354 }; 10355 10356 // NB: For memory_order_conservative we need a trailing membar after 10357 // LSE atomic operations but not a leading membar. 10358 // 10359 // We don't need a leading membar because a clause in the Arm ARM 10360 // says: 10361 // 10362 // Barrier-ordered-before 10363 // 10364 // Barrier instructions order prior Memory effects before subsequent 10365 // Memory effects generated by the same Observer. A read or a write 10366 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 10367 // Observer if and only if RW1 appears in program order before RW 2 10368 // and [ ... 
] at least one of RW 1 and RW 2 is generated by an atomic 10369 // instruction with both Acquire and Release semantics. 10370 // 10371 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 10372 // and Release semantics, therefore we don't need a leading 10373 // barrier. However, there is no corresponding Barrier-ordered-after 10374 // relationship, therefore we need a trailing membar to prevent a 10375 // later store or load from being reordered with the store in an 10376 // atomic instruction. 10377 // 10378 // This was checked by using the herd7 consistency model simulator 10379 // (http://diy.inria.fr/) with this test case: 10380 // 10381 // AArch64 LseCas 10382 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 10383 // P0 | P1; 10384 // LDR W4, [X2] | MOV W3, #0; 10385 // DMB LD | MOV W4, #1; 10386 // LDR W3, [X1] | CASAL W3, W4, [X1]; 10387 // | DMB ISH; 10388 // | STR W4, [X2]; 10389 // exists 10390 // (0:X3=0 /\ 0:X4=1) 10391 // 10392 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 10393 // with the store to x in P1. Without the DMB in P1 this may happen. 10394 // 10395 // At the time of writing we don't know of any AArch64 hardware that 10396 // reorders stores in this way, but the Reference Manual permits it. 10397 10398 void gen_cas_entry(Assembler::operand_size size, 10399 atomic_memory_order order) { 10400 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 10401 exchange_val = c_rarg2; 10402 bool acquire, release; 10403 switch (order) { 10404 case memory_order_relaxed: 10405 acquire = false; 10406 release = false; 10407 break; 10408 case memory_order_release: 10409 acquire = false; 10410 release = true; 10411 break; 10412 default: 10413 acquire = true; 10414 release = true; 10415 break; 10416 } 10417 __ mov(prev, compare_val); 10418 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 10419 if (order == memory_order_conservative) { 10420 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10421 } 10422 if (size == Assembler::xword) { 10423 __ mov(r0, prev); 10424 } else { 10425 __ movw(r0, prev); 10426 } 10427 __ ret(lr); 10428 } 10429 10430 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 10431 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10432 // If not relaxed, then default to conservative. Relaxed is the only 10433 // case we use enough to be worth specializing. 10434 if (order == memory_order_relaxed) { 10435 __ ldadd(size, incr, prev, addr); 10436 } else { 10437 __ ldaddal(size, incr, prev, addr); 10438 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10439 } 10440 if (size == Assembler::xword) { 10441 __ mov(r0, prev); 10442 } else { 10443 __ movw(r0, prev); 10444 } 10445 __ ret(lr); 10446 } 10447 10448 void gen_swpal_entry(Assembler::operand_size size) { 10449 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 10450 __ swpal(size, incr, prev, addr); 10451 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 10452 if (size == Assembler::xword) { 10453 __ mov(r0, prev); 10454 } else { 10455 __ movw(r0, prev); 10456 } 10457 __ ret(lr); 10458 } 10459 10460 void generate_atomic_entry_points() { 10461 if (! 
UseLSE) { 10462 return; 10463 } 10464 __ align(CodeEntryAlignment); 10465 StubId stub_id = StubId::stubgen_atomic_entry_points_id; 10466 StubCodeMark mark(this, stub_id); 10467 address first_entry = __ pc(); 10468 10469 // ADD, memory_order_conservative 10470 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 10471 gen_ldadd_entry(Assembler::word, memory_order_conservative); 10472 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 10473 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 10474 10475 // ADD, memory_order_relaxed 10476 AtomicStubMark mark_fetch_add_4_relaxed 10477 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 10478 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 10479 AtomicStubMark mark_fetch_add_8_relaxed 10480 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 10481 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 10482 10483 // XCHG, memory_order_conservative 10484 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 10485 gen_swpal_entry(Assembler::word); 10486 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 10487 gen_swpal_entry(Assembler::xword); 10488 10489 // CAS, memory_order_conservative 10490 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 10491 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 10492 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 10493 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 10494 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 10495 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 10496 10497 // CAS, memory_order_relaxed 10498 AtomicStubMark mark_cmpxchg_1_relaxed 10499 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 10500 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 10501 AtomicStubMark mark_cmpxchg_4_relaxed 10502 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 10503 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 10504 AtomicStubMark mark_cmpxchg_8_relaxed 10505 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 10506 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 10507 10508 AtomicStubMark mark_cmpxchg_4_release 10509 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 10510 gen_cas_entry(MacroAssembler::word, memory_order_release); 10511 AtomicStubMark mark_cmpxchg_8_release 10512 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 10513 gen_cas_entry(MacroAssembler::xword, memory_order_release); 10514 10515 AtomicStubMark mark_cmpxchg_4_seq_cst 10516 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 10517 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 10518 AtomicStubMark mark_cmpxchg_8_seq_cst 10519 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 10520 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 10521 10522 ICache::invalidate_range(first_entry, __ pc() - first_entry); 10523 } 10524 #endif // LINUX 10525 10526 address generate_cont_thaw(Continuation::thaw_kind kind) { 10527 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 10528 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 10529 10530 address start = __ pc(); 10531 10532 if (return_barrier) { 10533 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 10534 __ mov(sp, rscratch1); 10535 } 10536 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, 
"incorrect sp"); 10537 10538 if (return_barrier) { 10539 // preserve possible return value from a method returning to the return barrier 10540 __ fmovd(rscratch1, v0); 10541 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10542 } 10543 10544 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 10545 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 10546 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 10547 10548 if (return_barrier) { 10549 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10550 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10551 __ fmovd(v0, rscratch1); 10552 } 10553 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 10554 10555 10556 Label thaw_success; 10557 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 10558 __ cbnz(rscratch2, thaw_success); 10559 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 10560 __ br(rscratch1); 10561 __ bind(thaw_success); 10562 10563 // make room for the thawed frames 10564 __ sub(rscratch1, sp, rscratch2); 10565 __ andr(rscratch1, rscratch1, -16); // align 10566 __ mov(sp, rscratch1); 10567 10568 if (return_barrier) { 10569 // save original return value -- again 10570 __ fmovd(rscratch1, v0); 10571 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 10572 } 10573 10574 // If we want, we can templatize thaw by kind, and have three different entries 10575 __ movw(c_rarg1, (uint32_t)kind); 10576 10577 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 10578 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 10579 10580 if (return_barrier) { 10581 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 10582 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 10583 __ fmovd(v0, rscratch1); 10584 } else { 10585 __ mov(r0, zr); // return 0 (success) from doYield 10586 } 10587 10588 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 10589 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 10590 __ mov(rfp, sp); 10591 10592 if (return_barrier_exception) { 10593 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 10594 __ authenticate_return_address(c_rarg1); 10595 __ verify_oop(r0); 10596 // save return value containing the exception oop in callee-saved R19 10597 __ mov(r19, r0); 10598 10599 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 10600 10601 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
10602 // __ reinitialize_ptrue(); 10603 10604 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 10605 10606 __ mov(r1, r0); // the exception handler 10607 __ mov(r0, r19); // restore return value containing the exception oop 10608 __ verify_oop(r0); 10609 10610 __ leave(); 10611 __ mov(r3, lr); 10612 __ br(r1); // the exception handler 10613 } else { 10614 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 10615 __ leave(); 10616 __ ret(lr); 10617 } 10618 10619 return start; 10620 } 10621 10622 address generate_cont_thaw() { 10623 if (!Continuations::enabled()) return nullptr; 10624 10625 StubId stub_id = StubId::stubgen_cont_thaw_id; 10626 StubCodeMark mark(this, stub_id); 10627 address start = __ pc(); 10628 generate_cont_thaw(Continuation::thaw_top); 10629 return start; 10630 } 10631 10632 address generate_cont_returnBarrier() { 10633 if (!Continuations::enabled()) return nullptr; 10634 10635 // TODO: will probably need multiple return barriers depending on return type 10636 StubId stub_id = StubId::stubgen_cont_returnBarrier_id; 10637 StubCodeMark mark(this, stub_id); 10638 address start = __ pc(); 10639 10640 generate_cont_thaw(Continuation::thaw_return_barrier); 10641 10642 return start; 10643 } 10644 10645 address generate_cont_returnBarrier_exception() { 10646 if (!Continuations::enabled()) return nullptr; 10647 10648 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id; 10649 StubCodeMark mark(this, stub_id); 10650 address start = __ pc(); 10651 10652 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 10653 10654 return start; 10655 } 10656 10657 address generate_cont_preempt_stub() { 10658 if (!Continuations::enabled()) return nullptr; 10659 StubId stub_id = StubId::stubgen_cont_preempt_id; 10660 StubCodeMark mark(this, stub_id); 10661 address start = __ pc(); 10662 10663 __ reset_last_Java_frame(true); 10664 10665 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 10666 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 10667 __ mov(sp, rscratch2); 10668 10669 Label preemption_cancelled; 10670 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 10671 __ cbnz(rscratch1, preemption_cancelled); 10672 10673 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 10674 SharedRuntime::continuation_enter_cleanup(_masm); 10675 __ leave(); 10676 __ ret(lr); 10677 10678 // We acquired the monitor after freezing the frames so call thaw to continue execution. 10679 __ bind(preemption_cancelled); 10680 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 10681 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 10682 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 10683 __ ldr(rscratch1, Address(rscratch1)); 10684 __ br(rscratch1); 10685 10686 return start; 10687 } 10688 10689 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 10690 // are represented as long[5], with BITS_PER_LIMB = 26. 10691 // Pack five 26-bit limbs into three 64-bit registers. 
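  // A plain-C sketch of the packing done by pack_26, for illustration only;
  // the helper name is hypothetical. The limbs l[0..4] represent the value
  //   l[0] + (l[1] << 26) + (l[2] << 52) + (l[3] << 78) + (l[4] << 104):
  //
  //   void pack_26_ref(uint64_t* d0, uint64_t* d1, uint64_t* d2,
  //                    const uint64_t l[5]) {
  //     *d0 = l[0] + (l[1] << 26) + (l[2] << 52);         // low 12 bits of l[2]
  //     *d1 = (l[2] >> 12) + (l[3] << 14) + (l[4] << 40); // low 24 bits of l[4]
  //     *d2 = l[4] >> 24;                                 // top bits of l[4]
  //   }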
10692 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 10693 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 10694 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 10695 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 10696 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 10697 10698 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 10699 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 10700 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 10701 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 10702 10703 if (dest2->is_valid()) { 10704 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 10705 } else { 10706 #ifdef ASSERT 10707 Label OK; 10708 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 10709 __ br(__ EQ, OK); 10710 __ stop("high bits of Poly1305 integer should be zero"); 10711 __ should_not_reach_here(); 10712 __ bind(OK); 10713 #endif 10714 } 10715 } 10716 10717 // As above, but return only a 128-bit integer, packed into two 10718 // 64-bit registers. 10719 void pack_26(Register dest0, Register dest1, Register src) { 10720 pack_26(dest0, dest1, noreg, src); 10721 } 10722 10723 // Multiply and multiply-accumulate unsigned 64-bit registers. 10724 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 10725 __ mul(prod_lo, n, m); 10726 __ umulh(prod_hi, n, m); 10727 } 10728 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 10729 wide_mul(rscratch1, rscratch2, n, m); 10730 __ adds(sum_lo, sum_lo, rscratch1); 10731 __ adc(sum_hi, sum_hi, rscratch2); 10732 } 10733 10734 // Poly1305, RFC 7539 10735 10736 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 10737 // description of the tricks used to simplify and accelerate this 10738 // computation. 10739 10740 address generate_poly1305_processBlocks() { 10741 __ align(CodeEntryAlignment); 10742 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id; 10743 StubCodeMark mark(this, stub_id); 10744 address start = __ pc(); 10745 Label here; 10746 __ enter(); 10747 RegSet callee_saved = RegSet::range(r19, r28); 10748 __ push(callee_saved, sp); 10749 10750 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 10751 10752 // Arguments 10753 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 10754 10755 // R_n is the 128-bit randomly-generated key, packed into two 10756 // registers. The caller passes this key to us as long[5], with 10757 // BITS_PER_LIMB = 26. 
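    // Sketch of how the modular multiply in the loop below folds in the
    // reduction: with S = S_0 + 2^64*S_1 + 2^128*S_2 (accumulator plus next
    // block plus the 2^128 pad bit) and R = R_0 + 2^64*R_1, we use
    // 2^130 == 5 (mod 2^130 - 5). Key clamping (RFC 7539) makes R_1 a
    // multiple of 4, so with RR_n = (R_n >> 2) * 5 (defined just below):
    //   2^128 * S_1*R_1 == S_1 * RR_1
    //   2^128 * S_2*R_0 == S_2 * RR_0 + 2^128 * S_2*(R_0 & 3)
    //   2^192 * S_2*R_1 == 2^64 * S_2 * RR_1
    // Together with the ordinary partial products S_0*R_0, S_0*R_1 and
    // S_1*R_0, these are the three columns accumulated into U_0, U_1, U_2.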
10758 const Register R_0 = *++regs, R_1 = *++regs; 10759 pack_26(R_0, R_1, r_start); 10760 10761 // RR_n is (R_n >> 2) * 5 10762 const Register RR_0 = *++regs, RR_1 = *++regs; 10763 __ lsr(RR_0, R_0, 2); 10764 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 10765 __ lsr(RR_1, R_1, 2); 10766 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 10767 10768 // U_n is the current checksum 10769 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 10770 pack_26(U_0, U_1, U_2, acc_start); 10771 10772 static constexpr int BLOCK_LENGTH = 16; 10773 Label DONE, LOOP; 10774 10775 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10776 __ br(Assembler::LT, DONE); { 10777 __ bind(LOOP); 10778 10779 // S_n is to be the sum of U_n and the next block of data 10780 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 10781 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 10782 __ adds(S_0, U_0, S_0); 10783 __ adcs(S_1, U_1, S_1); 10784 __ adc(S_2, U_2, zr); 10785 __ add(S_2, S_2, 1); 10786 10787 const Register U_0HI = *++regs, U_1HI = *++regs; 10788 10789 // NB: this logic depends on some of the special properties of 10790 // Poly1305 keys. In particular, because we know that the top 10791 // four bits of R_0 and R_1 are zero, we can add together 10792 // partial products without any risk of needing to propagate a 10793 // carry out. 10794 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 10795 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 10796 __ andr(U_2, R_0, 3); 10797 __ mul(U_2, S_2, U_2); 10798 10799 // Recycle registers S_0, S_1, S_2 10800 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 10801 10802 // Partial reduction mod 2**130 - 5 10803 __ adds(U_1, U_0HI, U_1); 10804 __ adc(U_2, U_1HI, U_2); 10805 // Sum now in U_2:U_1:U_0. 10806 // Dead: U_0HI, U_1HI. 10807 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 10808 10809 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 10810 10811 // First, U_2:U_1:U_0 += (U_2 >> 2) 10812 __ lsr(rscratch1, U_2, 2); 10813 __ andr(U_2, U_2, (u8)3); 10814 __ adds(U_0, U_0, rscratch1); 10815 __ adcs(U_1, U_1, zr); 10816 __ adc(U_2, U_2, zr); 10817 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 10818 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 10819 __ adcs(U_1, U_1, zr); 10820 __ adc(U_2, U_2, zr); 10821 10822 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 10823 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 10824 __ br(~ Assembler::LT, LOOP); 10825 } 10826 10827 // Further reduce modulo 2^130 - 5 10828 __ lsr(rscratch1, U_2, 2); 10829 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 10830 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 10831 __ adcs(U_1, U_1, zr); 10832 __ andr(U_2, U_2, (u1)3); 10833 __ adc(U_2, U_2, zr); 10834 10835 // Unpack the sum into five 26-bit limbs and write to memory. 
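    // Resulting limb layout (illustrative):
    //   acc[0] = U_0 bits 0..25
    //   acc[1] = U_0 bits 26..51
    //   acc[2] = U_0 bits 52..63 | (U_1 bits 0..13) << 12
    //   acc[3] = U_1 bits 14..39
    //   acc[4] = U_1 bits 40..63 | (U_2 bits 0..2) << 24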
10836 __ ubfiz(rscratch1, U_0, 0, 26); 10837 __ ubfx(rscratch2, U_0, 26, 26); 10838 __ stp(rscratch1, rscratch2, Address(acc_start)); 10839 __ ubfx(rscratch1, U_0, 52, 12); 10840 __ bfi(rscratch1, U_1, 12, 14); 10841 __ ubfx(rscratch2, U_1, 14, 26); 10842 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 10843 __ ubfx(rscratch1, U_1, 40, 24); 10844 __ bfi(rscratch1, U_2, 24, 3); 10845 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 10846 10847 __ bind(DONE); 10848 __ pop(callee_saved, sp); 10849 __ leave(); 10850 __ ret(lr); 10851 10852 return start; 10853 } 10854 10855 // exception handler for upcall stubs 10856 address generate_upcall_stub_exception_handler() { 10857 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id; 10858 StubCodeMark mark(this, stub_id); 10859 address start = __ pc(); 10860 10861 // Native caller has no idea how to handle exceptions, 10862 // so we just crash here. Up to callee to catch exceptions. 10863 __ verify_oop(r0); 10864 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 10865 __ blr(rscratch1); 10866 __ should_not_reach_here(); 10867 10868 return start; 10869 } 10870 10871 // load Method* target of MethodHandle 10872 // j_rarg0 = jobject receiver 10873 // rmethod = result 10874 address generate_upcall_stub_load_target() { 10875 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id; 10876 StubCodeMark mark(this, stub_id); 10877 address start = __ pc(); 10878 10879 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 10880 // Load target method from receiver 10881 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 10882 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 10883 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 10884 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 10885 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 10886 noreg, noreg); 10887 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 10888 10889 __ ret(lr); 10890 10891 return start; 10892 } 10893 10894 #undef __ 10895 #define __ masm-> 10896 10897 class MontgomeryMultiplyGenerator : public MacroAssembler { 10898 10899 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 10900 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 10901 10902 RegSet _toSave; 10903 bool _squaring; 10904 10905 public: 10906 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 10907 : MacroAssembler(as->code()), _squaring(squaring) { 10908 10909 // Register allocation 10910 10911 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 10912 Pa_base = *regs; // Argument registers 10913 if (squaring) 10914 Pb_base = Pa_base; 10915 else 10916 Pb_base = *++regs; 10917 Pn_base = *++regs; 10918 Rlen= *++regs; 10919 inv = *++regs; 10920 Pm_base = *++regs; 10921 10922 // Working registers: 10923 Ra = *++regs; // The current digit of a, b, n, and m. 10924 Rb = *++regs; 10925 Rm = *++regs; 10926 Rn = *++regs; 10927 10928 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 10929 Pb = *++regs; 10930 Pm = *++regs; 10931 Pn = *++regs; 10932 10933 t0 = *++regs; // Three registers which form a 10934 t1 = *++regs; // triple-precision accumuator. 
10935 t2 = *++regs; 10936 10937 Ri = *++regs; // Inner and outer loop indexes. 10938 Rj = *++regs; 10939 10940 Rhi_ab = *++regs; // Product registers: low and high parts 10941 Rlo_ab = *++regs; // of a*b and m*n. 10942 Rhi_mn = *++regs; 10943 Rlo_mn = *++regs; 10944 10945 // r19 and up are callee-saved. 10946 _toSave = RegSet::range(r19, *regs) + Pm_base; 10947 } 10948 10949 private: 10950 void save_regs() { 10951 push(_toSave, sp); 10952 } 10953 10954 void restore_regs() { 10955 pop(_toSave, sp); 10956 } 10957 10958 template <typename T> 10959 void unroll_2(Register count, T block) { 10960 Label loop, end, odd; 10961 tbnz(count, 0, odd); 10962 cbz(count, end); 10963 align(16); 10964 bind(loop); 10965 (this->*block)(); 10966 bind(odd); 10967 (this->*block)(); 10968 subs(count, count, 2); 10969 br(Assembler::GT, loop); 10970 bind(end); 10971 } 10972 10973 template <typename T> 10974 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 10975 Label loop, end, odd; 10976 tbnz(count, 0, odd); 10977 cbz(count, end); 10978 align(16); 10979 bind(loop); 10980 (this->*block)(d, s, tmp); 10981 bind(odd); 10982 (this->*block)(d, s, tmp); 10983 subs(count, count, 2); 10984 br(Assembler::GT, loop); 10985 bind(end); 10986 } 10987 10988 void pre1(RegisterOrConstant i) { 10989 block_comment("pre1"); 10990 // Pa = Pa_base; 10991 // Pb = Pb_base + i; 10992 // Pm = Pm_base; 10993 // Pn = Pn_base + i; 10994 // Ra = *Pa; 10995 // Rb = *Pb; 10996 // Rm = *Pm; 10997 // Rn = *Pn; 10998 ldr(Ra, Address(Pa_base)); 10999 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 11000 ldr(Rm, Address(Pm_base)); 11001 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11002 lea(Pa, Address(Pa_base)); 11003 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 11004 lea(Pm, Address(Pm_base)); 11005 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11006 11007 // Zero the m*n result. 11008 mov(Rhi_mn, zr); 11009 mov(Rlo_mn, zr); 11010 } 11011 11012 // The core multiply-accumulate step of a Montgomery 11013 // multiplication. The idea is to schedule operations as a 11014 // pipeline so that instructions with long latencies (loads and 11015 // multiplies) have time to complete before their results are 11016 // used. This most benefits in-order implementations of the 11017 // architecture but out-of-order ones also benefit. 11018 void step() { 11019 block_comment("step"); 11020 // MACC(Ra, Rb, t0, t1, t2); 11021 // Ra = *++Pa; 11022 // Rb = *--Pb; 11023 umulh(Rhi_ab, Ra, Rb); 11024 mul(Rlo_ab, Ra, Rb); 11025 ldr(Ra, pre(Pa, wordSize)); 11026 ldr(Rb, pre(Pb, -wordSize)); 11027 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 11028 // previous iteration. 
11029 // MACC(Rm, Rn, t0, t1, t2); 11030 // Rm = *++Pm; 11031 // Rn = *--Pn; 11032 umulh(Rhi_mn, Rm, Rn); 11033 mul(Rlo_mn, Rm, Rn); 11034 ldr(Rm, pre(Pm, wordSize)); 11035 ldr(Rn, pre(Pn, -wordSize)); 11036 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11037 } 11038 11039 void post1() { 11040 block_comment("post1"); 11041 11042 // MACC(Ra, Rb, t0, t1, t2); 11043 // Ra = *++Pa; 11044 // Rb = *--Pb; 11045 umulh(Rhi_ab, Ra, Rb); 11046 mul(Rlo_ab, Ra, Rb); 11047 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11048 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11049 11050 // *Pm = Rm = t0 * inv; 11051 mul(Rm, t0, inv); 11052 str(Rm, Address(Pm)); 11053 11054 // MACC(Rm, Rn, t0, t1, t2); 11055 // t0 = t1; t1 = t2; t2 = 0; 11056 umulh(Rhi_mn, Rm, Rn); 11057 11058 #ifndef PRODUCT 11059 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 11060 { 11061 mul(Rlo_mn, Rm, Rn); 11062 add(Rlo_mn, t0, Rlo_mn); 11063 Label ok; 11064 cbz(Rlo_mn, ok); { 11065 stop("broken Montgomery multiply"); 11066 } bind(ok); 11067 } 11068 #endif 11069 // We have very carefully set things up so that 11070 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 11071 // the lower half of Rm * Rn because we know the result already: 11072 // it must be -t0. t0 + (-t0) must generate a carry iff 11073 // t0 != 0. So, rather than do a mul and an adds we just set 11074 // the carry flag iff t0 is nonzero. 11075 // 11076 // mul(Rlo_mn, Rm, Rn); 11077 // adds(zr, t0, Rlo_mn); 11078 subs(zr, t0, 1); // Set carry iff t0 is nonzero 11079 adcs(t0, t1, Rhi_mn); 11080 adc(t1, t2, zr); 11081 mov(t2, zr); 11082 } 11083 11084 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 11085 block_comment("pre2"); 11086 // Pa = Pa_base + i-len; 11087 // Pb = Pb_base + len; 11088 // Pm = Pm_base + i-len; 11089 // Pn = Pn_base + len; 11090 11091 if (i.is_register()) { 11092 sub(Rj, i.as_register(), len); 11093 } else { 11094 mov(Rj, i.as_constant()); 11095 sub(Rj, Rj, len); 11096 } 11097 // Rj == i-len 11098 11099 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 11100 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 11101 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 11102 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 11103 11104 // Ra = *++Pa; 11105 // Rb = *--Pb; 11106 // Rm = *++Pm; 11107 // Rn = *--Pn; 11108 ldr(Ra, pre(Pa, wordSize)); 11109 ldr(Rb, pre(Pb, -wordSize)); 11110 ldr(Rm, pre(Pm, wordSize)); 11111 ldr(Rn, pre(Pn, -wordSize)); 11112 11113 mov(Rhi_mn, zr); 11114 mov(Rlo_mn, zr); 11115 } 11116 11117 void post2(RegisterOrConstant i, RegisterOrConstant len) { 11118 block_comment("post2"); 11119 if (i.is_constant()) { 11120 mov(Rj, i.as_constant()-len.as_constant()); 11121 } else { 11122 sub(Rj, i.as_register(), len); 11123 } 11124 11125 adds(t0, t0, Rlo_mn); // The pending m*n, low part 11126 11127 // As soon as we know the least significant digit of our result, 11128 // store it. 11129 // Pm_base[i-len] = t0; 11130 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 11131 11132 // t0 = t1; t1 = t2; t2 = 0; 11133 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 11134 adc(t1, t2, zr); 11135 mov(t2, zr); 11136 } 11137 11138 // A carry in t0 after Montgomery multiplication means that we 11139 // should subtract multiples of n from our result in m. We'll 11140 // keep doing that until there is no carry. 
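  // A rough C equivalent of normalize(), for illustration only (the stub
  // chains the borrow with SBCS); m and n stand for Pm_base[] and Pn_base[]:
  //
  //   while (t0 != 0) {
  //     uint64_t borrow = 0;
  //     for (size_t i = 0; i < len; i++) {            // m -= n, word by word
  //       uint64_t ni = n[i] + borrow;
  //       uint64_t bo = (m[i] < ni) || (ni < borrow); // did the subtraction wrap?
  //       m[i] -= ni;
  //       borrow = bo;
  //     }
  //     t0 -= borrow;                                 // burn off the carry
  //   }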
11141 void normalize(RegisterOrConstant len) { 11142 block_comment("normalize"); 11143 // while (t0) 11144 // t0 = sub(Pm_base, Pn_base, t0, len); 11145 Label loop, post, again; 11146 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 11147 cbz(t0, post); { 11148 bind(again); { 11149 mov(i, zr); 11150 mov(cnt, len); 11151 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11152 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11153 subs(zr, zr, zr); // set carry flag, i.e. no borrow 11154 align(16); 11155 bind(loop); { 11156 sbcs(Rm, Rm, Rn); 11157 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11158 add(i, i, 1); 11159 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 11160 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 11161 sub(cnt, cnt, 1); 11162 } cbnz(cnt, loop); 11163 sbc(t0, t0, zr); 11164 } cbnz(t0, again); 11165 } bind(post); 11166 } 11167 11168 // Move memory at s to d, reversing words. 11169 // Increments d to end of copied memory 11170 // Destroys tmp1, tmp2 11171 // Preserves len 11172 // Leaves s pointing to the address which was in d at start 11173 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 11174 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 11175 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 11176 11177 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 11178 mov(tmp1, len); 11179 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 11180 sub(s, d, len, ext::uxtw, LogBytesPerWord); 11181 } 11182 // where 11183 void reverse1(Register d, Register s, Register tmp) { 11184 ldr(tmp, pre(s, -wordSize)); 11185 ror(tmp, tmp, 32); 11186 str(tmp, post(d, wordSize)); 11187 } 11188 11189 void step_squaring() { 11190 // An extra ACC 11191 step(); 11192 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11193 } 11194 11195 void last_squaring(RegisterOrConstant i) { 11196 Label dont; 11197 // if ((i & 1) == 0) { 11198 tbnz(i.as_register(), 0, dont); { 11199 // MACC(Ra, Rb, t0, t1, t2); 11200 // Ra = *++Pa; 11201 // Rb = *--Pb; 11202 umulh(Rhi_ab, Ra, Rb); 11203 mul(Rlo_ab, Ra, Rb); 11204 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 11205 } bind(dont); 11206 } 11207 11208 void extra_step_squaring() { 11209 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11210 11211 // MACC(Rm, Rn, t0, t1, t2); 11212 // Rm = *++Pm; 11213 // Rn = *--Pn; 11214 umulh(Rhi_mn, Rm, Rn); 11215 mul(Rlo_mn, Rm, Rn); 11216 ldr(Rm, pre(Pm, wordSize)); 11217 ldr(Rn, pre(Pn, -wordSize)); 11218 } 11219 11220 void post1_squaring() { 11221 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 11222 11223 // *Pm = Rm = t0 * inv; 11224 mul(Rm, t0, inv); 11225 str(Rm, Address(Pm)); 11226 11227 // MACC(Rm, Rn, t0, t1, t2); 11228 // t0 = t1; t1 = t2; t2 = 0; 11229 umulh(Rhi_mn, Rm, Rn); 11230 11231 #ifndef PRODUCT 11232 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 11233 { 11234 mul(Rlo_mn, Rm, Rn); 11235 add(Rlo_mn, t0, Rlo_mn); 11236 Label ok; 11237 cbz(Rlo_mn, ok); { 11238 stop("broken Montgomery multiply"); 11239 } bind(ok); 11240 } 11241 #endif 11242 // We have very carefully set things up so that 11243 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 11244 // the lower half of Rm * Rn because we know the result already: 11245 // it must be -t0. t0 + (-t0) must generate a carry iff 11246 // t0 != 0. So, rather than do a mul and an adds we just set 11247 // the carry flag iff t0 is nonzero. 
11248 // 11249 // mul(Rlo_mn, Rm, Rn); 11250 // adds(zr, t0, Rlo_mn); 11251 subs(zr, t0, 1); // Set carry iff t0 is nonzero 11252 adcs(t0, t1, Rhi_mn); 11253 adc(t1, t2, zr); 11254 mov(t2, zr); 11255 } 11256 11257 void acc(Register Rhi, Register Rlo, 11258 Register t0, Register t1, Register t2) { 11259 adds(t0, t0, Rlo); 11260 adcs(t1, t1, Rhi); 11261 adc(t2, t2, zr); 11262 } 11263 11264 public: 11265 /** 11266 * Fast Montgomery multiplication. The derivation of the 11267 * algorithm is in A Cryptographic Library for the Motorola 11268 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 11269 * 11270 * Arguments: 11271 * 11272 * Inputs for multiplication: 11273 * c_rarg0 - int array elements a 11274 * c_rarg1 - int array elements b 11275 * c_rarg2 - int array elements n (the modulus) 11276 * c_rarg3 - int length 11277 * c_rarg4 - int inv 11278 * c_rarg5 - int array elements m (the result) 11279 * 11280 * Inputs for squaring: 11281 * c_rarg0 - int array elements a 11282 * c_rarg1 - int array elements n (the modulus) 11283 * c_rarg2 - int length 11284 * c_rarg3 - int inv 11285 * c_rarg4 - int array elements m (the result) 11286 * 11287 */ 11288 address generate_multiply() { 11289 Label argh, nothing; 11290 bind(argh); 11291 stop("MontgomeryMultiply total_allocation must be <= 8192"); 11292 11293 align(CodeEntryAlignment); 11294 address entry = pc(); 11295 11296 cbzw(Rlen, nothing); 11297 11298 enter(); 11299 11300 // Make room. 11301 cmpw(Rlen, 512); 11302 br(Assembler::HI, argh); 11303 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 11304 andr(sp, Ra, -2 * wordSize); 11305 11306 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 11307 11308 { 11309 // Copy input args, reversing as we go. We use Ra as a 11310 // temporary variable. 11311 reverse(Ra, Pa_base, Rlen, t0, t1); 11312 if (!_squaring) 11313 reverse(Ra, Pb_base, Rlen, t0, t1); 11314 reverse(Ra, Pn_base, Rlen, t0, t1); 11315 } 11316 11317 // Push all call-saved registers and also Pm_base which we'll need 11318 // at the end. 
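// (Pm_base is saved because it is about to be repointed at the scratch area
// carved out of the stack above; restore_regs() brings the caller's value
// back so the result can be copied out at the end.)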
11319 save_regs(); 11320 11321 #ifndef PRODUCT 11322 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 11323 { 11324 ldr(Rn, Address(Pn_base, 0)); 11325 mul(Rlo_mn, Rn, inv); 11326 subs(zr, Rlo_mn, -1); 11327 Label ok; 11328 br(EQ, ok); { 11329 stop("broken inverse in Montgomery multiply"); 11330 } bind(ok); 11331 } 11332 #endif 11333 11334 mov(Pm_base, Ra); 11335 11336 mov(t0, zr); 11337 mov(t1, zr); 11338 mov(t2, zr); 11339 11340 block_comment("for (int i = 0; i < len; i++) {"); 11341 mov(Ri, zr); { 11342 Label loop, end; 11343 cmpw(Ri, Rlen); 11344 br(Assembler::GE, end); 11345 11346 bind(loop); 11347 pre1(Ri); 11348 11349 block_comment(" for (j = i; j; j--) {"); { 11350 movw(Rj, Ri); 11351 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 11352 } block_comment(" } // j"); 11353 11354 post1(); 11355 addw(Ri, Ri, 1); 11356 cmpw(Ri, Rlen); 11357 br(Assembler::LT, loop); 11358 bind(end); 11359 block_comment("} // i"); 11360 } 11361 11362 block_comment("for (int i = len; i < 2*len; i++) {"); 11363 mov(Ri, Rlen); { 11364 Label loop, end; 11365 cmpw(Ri, Rlen, Assembler::LSL, 1); 11366 br(Assembler::GE, end); 11367 11368 bind(loop); 11369 pre2(Ri, Rlen); 11370 11371 block_comment(" for (j = len*2-i-1; j; j--) {"); { 11372 lslw(Rj, Rlen, 1); 11373 subw(Rj, Rj, Ri); 11374 subw(Rj, Rj, 1); 11375 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 11376 } block_comment(" } // j"); 11377 11378 post2(Ri, Rlen); 11379 addw(Ri, Ri, 1); 11380 cmpw(Ri, Rlen, Assembler::LSL, 1); 11381 br(Assembler::LT, loop); 11382 bind(end); 11383 } 11384 block_comment("} // i"); 11385 11386 normalize(Rlen); 11387 11388 mov(Ra, Pm_base); // Save Pm_base in Ra 11389 restore_regs(); // Restore caller's Pm_base 11390 11391 // Copy our result into caller's Pm_base 11392 reverse(Pm_base, Ra, Rlen, t0, t1); 11393 11394 leave(); 11395 bind(nothing); 11396 ret(lr); 11397 11398 return entry; 11399 } 11400 // In C, approximately: 11401 11402 // void 11403 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 11404 // julong Pn_base[], julong Pm_base[], 11405 // julong inv, int len) { 11406 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11407 // julong *Pa, *Pb, *Pn, *Pm; 11408 // julong Ra, Rb, Rn, Rm; 11409 11410 // int i; 11411 11412 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11413 11414 // for (i = 0; i < len; i++) { 11415 // int j; 11416 11417 // Pa = Pa_base; 11418 // Pb = Pb_base + i; 11419 // Pm = Pm_base; 11420 // Pn = Pn_base + i; 11421 11422 // Ra = *Pa; 11423 // Rb = *Pb; 11424 // Rm = *Pm; 11425 // Rn = *Pn; 11426 11427 // int iters = i; 11428 // for (j = 0; iters--; j++) { 11429 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11430 // MACC(Ra, Rb, t0, t1, t2); 11431 // Ra = *++Pa; 11432 // Rb = *--Pb; 11433 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11434 // MACC(Rm, Rn, t0, t1, t2); 11435 // Rm = *++Pm; 11436 // Rn = *--Pn; 11437 // } 11438 11439 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 11440 // MACC(Ra, Rb, t0, t1, t2); 11441 // *Pm = Rm = t0 * inv; 11442 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11443 // MACC(Rm, Rn, t0, t1, t2); 11444 11445 // assert(t0 == 0, "broken Montgomery multiply"); 11446 11447 // t0 = t1; t1 = t2; t2 = 0; 11448 // } 11449 11450 // for (i = len; i < 2*len; i++) { 11451 // int j; 11452 11453 // Pa = Pa_base + i-len; 11454 // Pb = Pb_base + len; 11455 // Pm = Pm_base + i-len; 11456 // Pn = Pn_base + len; 11457 11458 // Ra = *++Pa; 11459 // Rb = 
*--Pb; 11460 // Rm = *++Pm; 11461 // Rn = *--Pn; 11462 11463 // int iters = len*2-i-1; 11464 // for (j = i-len+1; iters--; j++) { 11465 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 11466 // MACC(Ra, Rb, t0, t1, t2); 11467 // Ra = *++Pa; 11468 // Rb = *--Pb; 11469 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11470 // MACC(Rm, Rn, t0, t1, t2); 11471 // Rm = *++Pm; 11472 // Rn = *--Pn; 11473 // } 11474 11475 // Pm_base[i-len] = t0; 11476 // t0 = t1; t1 = t2; t2 = 0; 11477 // } 11478 11479 // while (t0) 11480 // t0 = sub(Pm_base, Pn_base, t0, len); 11481 // } 11482 11483 /** 11484 * Fast Montgomery squaring. This uses asymptotically 25% fewer 11485 * multiplies than Montgomery multiplication so it should be up to 11486 * 25% faster. However, its loop control is more complex and it 11487 * may actually run slower on some machines. 11488 * 11489 * Arguments: 11490 * 11491 * Inputs: 11492 * c_rarg0 - int array elements a 11493 * c_rarg1 - int array elements n (the modulus) 11494 * c_rarg2 - int length 11495 * c_rarg3 - int inv 11496 * c_rarg4 - int array elements m (the result) 11497 * 11498 */ 11499 address generate_square() { 11500 Label argh; 11501 bind(argh); 11502 stop("MontgomeryMultiply total_allocation must be <= 8192"); 11503 11504 align(CodeEntryAlignment); 11505 address entry = pc(); 11506 11507 enter(); 11508 11509 // Make room. 11510 cmpw(Rlen, 512); 11511 br(Assembler::HI, argh); 11512 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 11513 andr(sp, Ra, -2 * wordSize); 11514 11515 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 11516 11517 { 11518 // Copy input args, reversing as we go. We use Ra as a 11519 // temporary variable. 11520 reverse(Ra, Pa_base, Rlen, t0, t1); 11521 reverse(Ra, Pn_base, Rlen, t0, t1); 11522 } 11523 11524 // Push all call-saved registers and also Pm_base which we'll need 11525 // at the end. 
11526 save_regs(); 11527 11528 mov(Pm_base, Ra); 11529 11530 mov(t0, zr); 11531 mov(t1, zr); 11532 mov(t2, zr); 11533 11534 block_comment("for (int i = 0; i < len; i++) {"); 11535 mov(Ri, zr); { 11536 Label loop, end; 11537 bind(loop); 11538 cmp(Ri, Rlen); 11539 br(Assembler::GE, end); 11540 11541 pre1(Ri); 11542 11543 block_comment("for (j = (i+1)/2; j; j--) {"); { 11544 add(Rj, Ri, 1); 11545 lsr(Rj, Rj, 1); 11546 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11547 } block_comment(" } // j"); 11548 11549 last_squaring(Ri); 11550 11551 block_comment(" for (j = i/2; j; j--) {"); { 11552 lsr(Rj, Ri, 1); 11553 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11554 } block_comment(" } // j"); 11555 11556 post1_squaring(); 11557 add(Ri, Ri, 1); 11558 cmp(Ri, Rlen); 11559 br(Assembler::LT, loop); 11560 11561 bind(end); 11562 block_comment("} // i"); 11563 } 11564 11565 block_comment("for (int i = len; i < 2*len; i++) {"); 11566 mov(Ri, Rlen); { 11567 Label loop, end; 11568 bind(loop); 11569 cmp(Ri, Rlen, Assembler::LSL, 1); 11570 br(Assembler::GE, end); 11571 11572 pre2(Ri, Rlen); 11573 11574 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 11575 lsl(Rj, Rlen, 1); 11576 sub(Rj, Rj, Ri); 11577 sub(Rj, Rj, 1); 11578 lsr(Rj, Rj, 1); 11579 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 11580 } block_comment(" } // j"); 11581 11582 last_squaring(Ri); 11583 11584 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 11585 lsl(Rj, Rlen, 1); 11586 sub(Rj, Rj, Ri); 11587 lsr(Rj, Rj, 1); 11588 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 11589 } block_comment(" } // j"); 11590 11591 post2(Ri, Rlen); 11592 add(Ri, Ri, 1); 11593 cmp(Ri, Rlen, Assembler::LSL, 1); 11594 11595 br(Assembler::LT, loop); 11596 bind(end); 11597 block_comment("} // i"); 11598 } 11599 11600 normalize(Rlen); 11601 11602 mov(Ra, Pm_base); // Save Pm_base in Ra 11603 restore_regs(); // Restore caller's Pm_base 11604 11605 // Copy our result into caller's Pm_base 11606 reverse(Pm_base, Ra, Rlen, t0, t1); 11607 11608 leave(); 11609 ret(lr); 11610 11611 return entry; 11612 } 11613 // In C, approximately: 11614 11615 // void 11616 // montgomery_square(julong Pa_base[], julong Pn_base[], 11617 // julong Pm_base[], julong inv, int len) { 11618 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 11619 // julong *Pa, *Pb, *Pn, *Pm; 11620 // julong Ra, Rb, Rn, Rm; 11621 11622 // int i; 11623 11624 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 11625 11626 // for (i = 0; i < len; i++) { 11627 // int j; 11628 11629 // Pa = Pa_base; 11630 // Pb = Pa_base + i; 11631 // Pm = Pm_base; 11632 // Pn = Pn_base + i; 11633 11634 // Ra = *Pa; 11635 // Rb = *Pb; 11636 // Rm = *Pm; 11637 // Rn = *Pn; 11638 11639 // int iters = (i+1)/2; 11640 // for (j = 0; iters--; j++) { 11641 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11642 // MACC2(Ra, Rb, t0, t1, t2); 11643 // Ra = *++Pa; 11644 // Rb = *--Pb; 11645 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11646 // MACC(Rm, Rn, t0, t1, t2); 11647 // Rm = *++Pm; 11648 // Rn = *--Pn; 11649 // } 11650 // if ((i & 1) == 0) { 11651 // assert(Ra == Pa_base[j], "must be"); 11652 // MACC(Ra, Ra, t0, t1, t2); 11653 // } 11654 // iters = i/2; 11655 // assert(iters == i-j, "must be"); 11656 // for (; iters--; j++) { 11657 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11658 // MACC(Rm, Rn, t0, t1, t2); 11659 // Rm = *++Pm; 11660 // Rn = *--Pn; 11661 // } 11662 11663 // 
*Pm = Rm = t0 * inv; 11664 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 11665 // MACC(Rm, Rn, t0, t1, t2); 11666 11667 // assert(t0 == 0, "broken Montgomery multiply"); 11668 11669 // t0 = t1; t1 = t2; t2 = 0; 11670 // } 11671 11672 // for (i = len; i < 2*len; i++) { 11673 // int start = i-len+1; 11674 // int end = start + (len - start)/2; 11675 // int j; 11676 11677 // Pa = Pa_base + i-len; 11678 // Pb = Pa_base + len; 11679 // Pm = Pm_base + i-len; 11680 // Pn = Pn_base + len; 11681 11682 // Ra = *++Pa; 11683 // Rb = *--Pb; 11684 // Rm = *++Pm; 11685 // Rn = *--Pn; 11686 11687 // int iters = (2*len-i-1)/2; 11688 // assert(iters == end-start, "must be"); 11689 // for (j = start; iters--; j++) { 11690 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 11691 // MACC2(Ra, Rb, t0, t1, t2); 11692 // Ra = *++Pa; 11693 // Rb = *--Pb; 11694 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11695 // MACC(Rm, Rn, t0, t1, t2); 11696 // Rm = *++Pm; 11697 // Rn = *--Pn; 11698 // } 11699 // if ((i & 1) == 0) { 11700 // assert(Ra == Pa_base[j], "must be"); 11701 // MACC(Ra, Ra, t0, t1, t2); 11702 // } 11703 // iters = (2*len-i)/2; 11704 // assert(iters == len-j, "must be"); 11705 // for (; iters--; j++) { 11706 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 11707 // MACC(Rm, Rn, t0, t1, t2); 11708 // Rm = *++Pm; 11709 // Rn = *--Pn; 11710 // } 11711 // Pm_base[i-len] = t0; 11712 // t0 = t1; t1 = t2; t2 = 0; 11713 // } 11714 11715 // while (t0) 11716 // t0 = sub(Pm_base, Pn_base, t0, len); 11717 // } 11718 }; 11719 11720 // Initialization 11721 void generate_preuniverse_stubs() { 11722 // preuniverse stubs are not needed for aarch64 11723 } 11724 11725 void generate_initial_stubs() { 11726 // Generate initial stubs and initialize the entry points 11727 11728 // entry points that exist on all platforms. Note: This is code 11729 // that could be shared among different platforms; however, the 11730 // benefit seems to be smaller than the disadvantage of having a 11731 // much more complicated generator structure. See also comment in 11732 // stubRoutines.hpp. 11733 11734 StubRoutines::_forward_exception_entry = generate_forward_exception(); 11735 11736 StubRoutines::_call_stub_entry = 11737 generate_call_stub(StubRoutines::_call_stub_return_address); 11738 11739 // This entry is referenced by megamorphic calls 11740 StubRoutines::_catch_exception_entry = generate_catch_exception(); 11741 11742 // Initialize table for copy memory (arraycopy) check.
if (UnsafeMemoryAccess::_table == nullptr) { 11744 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory 11745 } 11746 11747 if (UseCRC32Intrinsics) { 11748 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 11749 } 11750 11751 if (UseCRC32CIntrinsics) { 11752 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 11753 } 11754 11755 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 11756 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 11757 } 11758 11759 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 11760 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 11761 } 11762 11763 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) && 11764 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) { 11765 StubRoutines::_hf2f = generate_float16ToFloat(); 11766 StubRoutines::_f2hf = generate_floatToFloat16(); 11767 } 11768 } 11769 11770 void generate_continuation_stubs() { 11771 // Continuation stubs: 11772 StubRoutines::_cont_thaw = generate_cont_thaw(); 11773 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 11774 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 11775 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub(); 11776 } 11777 11778 void generate_final_stubs() { 11779 // support for verify_oop (must happen after universe_init) 11780 if (VerifyOops) { 11781 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 11782 } 11783 11784 // arraycopy stubs used by compilers 11785 generate_arraycopy_stubs(); 11786 11787 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 11788 11789 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 11790 11791 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 11792 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 11793 11794 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 11795 11796 generate_atomic_entry_points(); 11797 11798 #endif // LINUX 11799 11800 #ifdef COMPILER2 11801 if (UseSecondarySupersTable) { 11802 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub(); 11803 if (! InlineSecondarySupersTest) { 11804 generate_lookup_secondary_supers_table_stub(); 11805 } 11806 } 11807 #endif 11808 11809 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(); 11810 11811 StubRoutines::aarch64::set_completed(); // Indicate that the arraycopy and zero_blocks stubs are generated 11812 } 11813 11814 void generate_compiler_stubs() { 11815 #if COMPILER2_OR_JVMCI 11816 11817 if (UseSVE == 0) { 11818 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id); 11819 } 11820 11821 // array equals stub for large arrays. 11822 if (!UseSimpleArrayEquals) { 11823 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 11824 } 11825 11826 // arrays_hashcode stubs for large arrays.
11827 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN); 11828 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE); 11829 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR); 11830 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT); 11831 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT); 11832 11833 // byte_array_inflate stub for large arrays. 11834 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 11835 11836 // countPositives stub for large arrays. 11837 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long); 11838 11839 generate_compare_long_strings(); 11840 11841 generate_string_indexof_stubs(); 11842 11843 #ifdef COMPILER2 11844 if (UseMultiplyToLenIntrinsic) { 11845 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 11846 } 11847 11848 if (UseSquareToLenIntrinsic) { 11849 StubRoutines::_squareToLen = generate_squareToLen(); 11850 } 11851 11852 if (UseMulAddIntrinsic) { 11853 StubRoutines::_mulAdd = generate_mulAdd(); 11854 } 11855 11856 if (UseSIMDForBigIntegerShiftIntrinsics) { 11857 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 11858 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 11859 } 11860 11861 if (UseMontgomeryMultiplyIntrinsic) { 11862 StubId stub_id = StubId::stubgen_montgomeryMultiply_id; 11863 StubCodeMark mark(this, stub_id); 11864 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 11865 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 11866 } 11867 11868 if (UseMontgomerySquareIntrinsic) { 11869 StubId stub_id = StubId::stubgen_montgomerySquare_id; 11870 StubCodeMark mark(this, stub_id); 11871 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 11872 // We use generate_multiply() rather than generate_square() 11873 // because it's faster for the sizes of modulus we care about. 
11874 StubRoutines::_montgomerySquare = g.generate_multiply(); 11875 } 11876 11877 #endif // COMPILER2 11878 11879 if (UseChaCha20Intrinsics) { 11880 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar(); 11881 } 11882 11883 if (UseKyberIntrinsics) { 11884 StubRoutines::_kyberNtt = generate_kyberNtt(); 11885 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt(); 11886 StubRoutines::_kyberNttMult = generate_kyberNttMult(); 11887 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2(); 11888 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3(); 11889 StubRoutines::_kyber12To16 = generate_kyber12To16(); 11890 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce(); 11891 } 11892 11893 if (UseDilithiumIntrinsics) { 11894 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt(); 11895 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt(); 11896 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult(); 11897 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant(); 11898 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly(); 11899 } 11900 11901 if (UseBASE64Intrinsics) { 11902 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 11903 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 11904 } 11905 11906 // data cache line writeback 11907 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 11908 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 11909 11910 if (UseAESIntrinsics) { 11911 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 11912 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 11913 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 11914 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 11915 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 11916 } 11917 if (UseGHASHIntrinsics) { 11918 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 11919 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 11920 } 11921 if (UseAESIntrinsics && UseGHASHIntrinsics) { 11922 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 11923 } 11924 11925 if (UseMD5Intrinsics) { 11926 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id); 11927 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id); 11928 } 11929 if (UseSHA1Intrinsics) { 11930 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id); 11931 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id); 11932 } 11933 if (UseSHA256Intrinsics) { 11934 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id); 11935 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id); 11936 } 11937 if (UseSHA512Intrinsics) { 11938 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id); 11939 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id); 11940 } 11941 if (UseSHA3Intrinsics) { 11942 11943 StubRoutines::_double_keccak = generate_double_keccak(); 
11944 if (UseSIMDForSHA3Intrinsic) { 11945 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id); 11946 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id); 11947 } else { 11948 StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id); 11949 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id); 11950 } 11951 } 11952 11953 if (UsePoly1305Intrinsics) { 11954 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 11955 } 11956 11957 // generate Adler32 intrinsics code 11958 if (UseAdler32Intrinsics) { 11959 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 11960 } 11961 11962 #endif // COMPILER2_OR_JVMCI 11963 } 11964 11965 public: 11966 StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) { 11967 switch(blob_id) { 11968 case BlobId::stubgen_preuniverse_id: 11969 generate_preuniverse_stubs(); 11970 break; 11971 case BlobId::stubgen_initial_id: 11972 generate_initial_stubs(); 11973 break; 11974 case BlobId::stubgen_continuation_id: 11975 generate_continuation_stubs(); 11976 break; 11977 case BlobId::stubgen_compiler_id: 11978 generate_compiler_stubs(); 11979 break; 11980 case BlobId::stubgen_final_id: 11981 generate_final_stubs(); 11982 break; 11983 default: 11984 fatal("unexpected blob id: %s", StubInfo::name(blob_id)); 11985 break; 11986 }; 11987 } 11988 }; // end class declaration 11989 11990 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) { 11991 StubGenerator g(code, blob_id); 11992 } 11993 11994 11995 #if defined (LINUX) 11996 11997 // Define pointers to atomic stubs and initialize them to point to the 11998 // code in atomic_aarch64.S. 11999 12000 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 12001 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 12002 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 12003 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 12004 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 12005 12006 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 12007 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 12008 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 12009 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 12010 DEFAULT_ATOMIC_OP(xchg, 4, ) 12011 DEFAULT_ATOMIC_OP(xchg, 8, ) 12012 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 12013 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 12014 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 12015 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 12016 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 12017 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 12018 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 12019 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 12020 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 12021 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 12022 12023 #undef DEFAULT_ATOMIC_OP 12024 12025 #endif // LINUX
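// For reference, the first instantiation above, DEFAULT_ATOMIC_OP(fetch_add, 4, ),
// expands to roughly:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// i.e. each function pointer starts out targeting the generic implementation
// in atomic_aarch64.S; generate_atomic_entry_points() (called from
// generate_final_stubs() above on this configuration) can later repoint the
// pointers at generated stubs.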