1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "asm/register.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "compiler/oopMap.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "gc/shared/gc_globals.hpp" 34 #include "gc/shared/tlab_globals.hpp" 35 #include "interpreter/interpreter.hpp" 36 #include "memory/universe.hpp" 37 #include "nativeInst_aarch64.hpp" 38 #include "oops/instanceOop.hpp" 39 #include "oops/method.hpp" 40 #include "oops/objArrayKlass.hpp" 41 #include "oops/oop.inline.hpp" 42 #include "prims/methodHandles.hpp" 43 #include "prims/upcallLinker.hpp" 44 #include "runtime/arguments.hpp" 45 #include "runtime/atomic.hpp" 46 #include "runtime/continuation.hpp" 47 #include "runtime/continuationEntry.inline.hpp" 48 #include "runtime/frame.inline.hpp" 49 #include "runtime/handles.inline.hpp" 50 #include "runtime/javaThread.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/stubCodeGenerator.hpp" 53 #include "runtime/stubRoutines.hpp" 54 #include "utilities/align.hpp" 55 #include "utilities/checkedCast.hpp" 56 #include "utilities/debug.hpp" 57 #include "utilities/globalDefinitions.hpp" 58 #include "utilities/intpow.hpp" 59 #include "utilities/powerOfTwo.hpp" 60 #ifdef COMPILER2 61 #include "opto/runtime.hpp" 62 #endif 63 #if INCLUDE_ZGC 64 #include "gc/z/zThreadLocalData.hpp" 65 #endif 66 67 // Declaration and definition of StubGenerator (no .hpp file). 
68 // For a more detailed description of the stub routine structure 69 // see the comment in stubRoutines.hpp 70 71 #undef __ 72 #define __ _masm-> 73 74 #ifdef PRODUCT 75 #define BLOCK_COMMENT(str) /* nothing */ 76 #else 77 #define BLOCK_COMMENT(str) __ block_comment(str) 78 #endif 79 80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 81 82 // Stub Code definitions 83 84 class StubGenerator: public StubCodeGenerator { 85 private: 86 87 #ifdef PRODUCT 88 #define inc_counter_np(counter) ((void)0) 89 #else 90 void inc_counter_np_(uint& counter) { 91 __ incrementw(ExternalAddress((address)&counter)); 92 } 93 #define inc_counter_np(counter) \ 94 BLOCK_COMMENT("inc_counter " #counter); \ 95 inc_counter_np_(counter); 96 #endif 97 98 // Call stubs are used to call Java from C 99 // 100 // Arguments: 101 // c_rarg0: call wrapper address address 102 // c_rarg1: result address 103 // c_rarg2: result type BasicType 104 // c_rarg3: method Method* 105 // c_rarg4: (interpreter) entry point address 106 // c_rarg5: parameters intptr_t* 107 // c_rarg6: parameter size (in words) int 108 // c_rarg7: thread Thread* 109 // 110 // There is no return from the stub itself as any Java result 111 // is written to result 112 // 113 // we save r30 (lr) as the return PC at the base of the frame and 114 // link r29 (fp) below it as the frame pointer installing sp (r31) 115 // into fp. 116 // 117 // we save r0-r7, which accounts for all the c arguments. 118 // 119 // TODO: strictly do we need to save them all? they are treated as 120 // volatile by C so could we omit saving the ones we are going to 121 // place in global registers (thread? method?) or those we only use 122 // during setup of the Java call? 123 // 124 // we don't need to save r8 which C uses as an indirect result location 125 // return register. 126 // 127 // we don't need to save r9-r15 which both C and Java treat as 128 // volatile 129 // 130 // we don't need to save r16-18 because Java does not use them 131 // 132 // we save r19-r28 which Java uses as scratch registers and C 133 // expects to be callee-save 134 // 135 // we save the bottom 64 bits of each value stored in v8-v15; it is 136 // the responsibility of the caller to preserve larger values. 137 // 138 // so the stub frame looks like this when we enter Java code 139 // 140 // [ return_from_Java ] <--- sp 141 // [ argument word n ] 142 // ... 
143 // -29 [ argument word 1 ] 144 // -28 [ saved Floating-point Control Register ] 145 // -26 [ saved v15 ] <--- sp_after_call 146 // -25 [ saved v14 ] 147 // -24 [ saved v13 ] 148 // -23 [ saved v12 ] 149 // -22 [ saved v11 ] 150 // -21 [ saved v10 ] 151 // -20 [ saved v9 ] 152 // -19 [ saved v8 ] 153 // -18 [ saved r28 ] 154 // -17 [ saved r27 ] 155 // -16 [ saved r26 ] 156 // -15 [ saved r25 ] 157 // -14 [ saved r24 ] 158 // -13 [ saved r23 ] 159 // -12 [ saved r22 ] 160 // -11 [ saved r21 ] 161 // -10 [ saved r20 ] 162 // -9 [ saved r19 ] 163 // -8 [ call wrapper (r0) ] 164 // -7 [ result (r1) ] 165 // -6 [ result type (r2) ] 166 // -5 [ method (r3) ] 167 // -4 [ entry point (r4) ] 168 // -3 [ parameters (r5) ] 169 // -2 [ parameter size (r6) ] 170 // -1 [ thread (r7) ] 171 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 172 // 1 [ saved lr (r30) ] 173 174 // Call stub stack layout word offsets from fp 175 enum call_stub_layout { 176 sp_after_call_off = -28, 177 178 fpcr_off = sp_after_call_off, 179 d15_off = -26, 180 d13_off = -24, 181 d11_off = -22, 182 d9_off = -20, 183 184 r28_off = -18, 185 r26_off = -16, 186 r24_off = -14, 187 r22_off = -12, 188 r20_off = -10, 189 call_wrapper_off = -8, 190 result_off = -7, 191 result_type_off = -6, 192 method_off = -5, 193 entry_point_off = -4, 194 parameter_size_off = -2, 195 thread_off = -1, 196 fp_f = 0, 197 retaddr_off = 1, 198 }; 199 200 address generate_call_stub(address& return_address) { 201 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 202 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 203 "adjust this code"); 204 205 StubGenStubId stub_id = StubGenStubId::call_stub_id; 206 StubCodeMark mark(this, stub_id); 207 address start = __ pc(); 208 209 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 210 211 const Address fpcr_save (rfp, fpcr_off * wordSize); 212 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 213 const Address result (rfp, result_off * wordSize); 214 const Address result_type (rfp, result_type_off * wordSize); 215 const Address method (rfp, method_off * wordSize); 216 const Address entry_point (rfp, entry_point_off * wordSize); 217 const Address parameter_size(rfp, parameter_size_off * wordSize); 218 219 const Address thread (rfp, thread_off * wordSize); 220 221 const Address d15_save (rfp, d15_off * wordSize); 222 const Address d13_save (rfp, d13_off * wordSize); 223 const Address d11_save (rfp, d11_off * wordSize); 224 const Address d9_save (rfp, d9_off * wordSize); 225 226 const Address r28_save (rfp, r28_off * wordSize); 227 const Address r26_save (rfp, r26_off * wordSize); 228 const Address r24_save (rfp, r24_off * wordSize); 229 const Address r22_save (rfp, r22_off * wordSize); 230 const Address r20_save (rfp, r20_off * wordSize); 231 232 // stub code 233 234 address aarch64_entry = __ pc(); 235 236 // set up frame and move sp to end of save area 237 __ enter(); 238 __ sub(sp, rfp, -sp_after_call_off * wordSize); 239 240 // save register parameters and Java scratch/global registers 241 // n.b. 
we save thread even though it gets installed in 242 // rthread because we want to sanity check rthread later 243 __ str(c_rarg7, thread); 244 __ strw(c_rarg6, parameter_size); 245 __ stp(c_rarg4, c_rarg5, entry_point); 246 __ stp(c_rarg2, c_rarg3, result_type); 247 __ stp(c_rarg0, c_rarg1, call_wrapper); 248 249 __ stp(r20, r19, r20_save); 250 __ stp(r22, r21, r22_save); 251 __ stp(r24, r23, r24_save); 252 __ stp(r26, r25, r26_save); 253 __ stp(r28, r27, r28_save); 254 255 __ stpd(v9, v8, d9_save); 256 __ stpd(v11, v10, d11_save); 257 __ stpd(v13, v12, d13_save); 258 __ stpd(v15, v14, d15_save); 259 260 __ get_fpcr(rscratch1); 261 __ str(rscratch1, fpcr_save); 262 // Set FPCR to the state we need. We do want Round to Nearest. We 263 // don't want non-IEEE rounding modes or floating-point traps. 264 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode 265 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12) 266 __ set_fpcr(rscratch1); 267 268 // install Java thread in global register now we have saved 269 // whatever value it held 270 __ mov(rthread, c_rarg7); 271 // And method 272 __ mov(rmethod, c_rarg3); 273 274 // set up the heapbase register 275 __ reinit_heapbase(); 276 277 #ifdef ASSERT 278 // make sure we have no pending exceptions 279 { 280 Label L; 281 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 282 __ cmp(rscratch1, (u1)NULL_WORD); 283 __ br(Assembler::EQ, L); 284 __ stop("StubRoutines::call_stub: entered with pending exception"); 285 __ BIND(L); 286 } 287 #endif 288 // pass parameters if any 289 __ mov(esp, sp); 290 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 291 __ andr(sp, rscratch1, -2 * wordSize); 292 293 BLOCK_COMMENT("pass parameters if any"); 294 Label parameters_done; 295 // parameter count is still in c_rarg6 296 // and parameter pointer identifying param 1 is in c_rarg5 297 __ cbzw(c_rarg6, parameters_done); 298 299 address loop = __ pc(); 300 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 301 __ subsw(c_rarg6, c_rarg6, 1); 302 __ push(rscratch1); 303 __ br(Assembler::GT, loop); 304 305 __ BIND(parameters_done); 306 307 // call Java entry -- passing methdoOop, and current sp 308 // rmethod: Method* 309 // r19_sender_sp: sender sp 310 BLOCK_COMMENT("call Java function"); 311 __ mov(r19_sender_sp, sp); 312 __ blr(c_rarg4); 313 314 // we do this here because the notify will already have been done 315 // if we get to the next instruction via an exception 316 // 317 // n.b. adding this instruction here affects the calculation of 318 // whether or not a routine returns to the call stub (used when 319 // doing stack walks) since the normal test is to check the return 320 // pc against the address saved below. so we may need to allow for 321 // this extra instruction in the check. 322 323 // save current address for use by exception handling code 324 325 return_address = __ pc(); 326 327 // store result depending on type (everything that is not 328 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 329 // n.b. 
this assumes Java returns an integral result in r0 330 // and a floating result in j_farg0 331 __ ldr(j_rarg2, result); 332 Label is_long, is_float, is_double, exit; 333 __ ldr(j_rarg1, result_type); 334 __ cmp(j_rarg1, (u1)T_OBJECT); 335 __ br(Assembler::EQ, is_long); 336 __ cmp(j_rarg1, (u1)T_LONG); 337 __ br(Assembler::EQ, is_long); 338 __ cmp(j_rarg1, (u1)T_FLOAT); 339 __ br(Assembler::EQ, is_float); 340 __ cmp(j_rarg1, (u1)T_DOUBLE); 341 __ br(Assembler::EQ, is_double); 342 343 // handle T_INT case 344 __ strw(r0, Address(j_rarg2)); 345 346 __ BIND(exit); 347 348 // pop parameters 349 __ sub(esp, rfp, -sp_after_call_off * wordSize); 350 351 #ifdef ASSERT 352 // verify that threads correspond 353 { 354 Label L, S; 355 __ ldr(rscratch1, thread); 356 __ cmp(rthread, rscratch1); 357 __ br(Assembler::NE, S); 358 __ get_thread(rscratch1); 359 __ cmp(rthread, rscratch1); 360 __ br(Assembler::EQ, L); 361 __ BIND(S); 362 __ stop("StubRoutines::call_stub: threads must correspond"); 363 __ BIND(L); 364 } 365 #endif 366 367 __ pop_cont_fastpath(rthread); 368 369 // restore callee-save registers 370 __ ldpd(v15, v14, d15_save); 371 __ ldpd(v13, v12, d13_save); 372 __ ldpd(v11, v10, d11_save); 373 __ ldpd(v9, v8, d9_save); 374 375 __ ldp(r28, r27, r28_save); 376 __ ldp(r26, r25, r26_save); 377 __ ldp(r24, r23, r24_save); 378 __ ldp(r22, r21, r22_save); 379 __ ldp(r20, r19, r20_save); 380 381 // restore fpcr 382 __ ldr(rscratch1, fpcr_save); 383 __ set_fpcr(rscratch1); 384 385 __ ldp(c_rarg0, c_rarg1, call_wrapper); 386 __ ldrw(c_rarg2, result_type); 387 __ ldr(c_rarg3, method); 388 __ ldp(c_rarg4, c_rarg5, entry_point); 389 __ ldp(c_rarg6, c_rarg7, parameter_size); 390 391 // leave frame and return to caller 392 __ leave(); 393 __ ret(lr); 394 395 // handle return types different from T_INT 396 397 __ BIND(is_long); 398 __ str(r0, Address(j_rarg2, 0)); 399 __ br(Assembler::AL, exit); 400 401 __ BIND(is_float); 402 __ strs(j_farg0, Address(j_rarg2, 0)); 403 __ br(Assembler::AL, exit); 404 405 __ BIND(is_double); 406 __ strd(j_farg0, Address(j_rarg2, 0)); 407 __ br(Assembler::AL, exit); 408 409 return start; 410 } 411 412 // Return point for a Java call if there's an exception thrown in 413 // Java code. The exception is caught and transformed into a 414 // pending exception stored in JavaThread that can be tested from 415 // within the VM. 416 // 417 // Note: Usually the parameters are removed by the callee. In case 418 // of an exception crossing an activation frame boundary, that is 419 // not the case if the callee is compiled code => need to setup the 420 // rsp. 
421 // 422 // r0: exception oop 423 424 address generate_catch_exception() { 425 StubGenStubId stub_id = StubGenStubId::catch_exception_id; 426 StubCodeMark mark(this, stub_id); 427 address start = __ pc(); 428 429 // same as in generate_call_stub(): 430 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 431 const Address thread (rfp, thread_off * wordSize); 432 433 #ifdef ASSERT 434 // verify that threads correspond 435 { 436 Label L, S; 437 __ ldr(rscratch1, thread); 438 __ cmp(rthread, rscratch1); 439 __ br(Assembler::NE, S); 440 __ get_thread(rscratch1); 441 __ cmp(rthread, rscratch1); 442 __ br(Assembler::EQ, L); 443 __ bind(S); 444 __ stop("StubRoutines::catch_exception: threads must correspond"); 445 __ bind(L); 446 } 447 #endif 448 449 // set pending exception 450 __ verify_oop(r0); 451 452 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 453 __ mov(rscratch1, (address)__FILE__); 454 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 455 __ movw(rscratch1, (int)__LINE__); 456 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 457 458 // complete return to VM 459 assert(StubRoutines::_call_stub_return_address != nullptr, 460 "_call_stub_return_address must have been generated before"); 461 __ b(StubRoutines::_call_stub_return_address); 462 463 return start; 464 } 465 466 // Continuation point for runtime calls returning with a pending 467 // exception. The pending exception check happened in the runtime 468 // or native call stub. The pending exception in Thread is 469 // converted into a Java-level exception. 470 // 471 // Contract with Java-level exception handlers: 472 // r0: exception 473 // r3: throwing pc 474 // 475 // NOTE: At entry of this stub, exception-pc must be in LR !! 476 477 // NOTE: this is always used as a jump target within generated code 478 // so it just needs to be generated code with no x86 prolog 479 480 address generate_forward_exception() { 481 StubGenStubId stub_id = StubGenStubId::forward_exception_id; 482 StubCodeMark mark(this, stub_id); 483 address start = __ pc(); 484 485 // Upon entry, LR points to the return address returning into 486 // Java (interpreted or compiled) code; i.e., the return address 487 // becomes the throwing pc. 488 // 489 // Arguments pushed before the runtime call are still on the stack 490 // but the exception handler will reset the stack pointer -> 491 // ignore them. A potential result in registers can be ignored as 492 // well. 493 494 #ifdef ASSERT 495 // make sure this code is only executed if there is a pending exception 496 { 497 Label L; 498 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 499 __ cbnz(rscratch1, L); 500 __ stop("StubRoutines::forward exception: no pending exception (1)"); 501 __ bind(L); 502 } 503 #endif 504 505 // compute exception handler into r19 506 507 // call the VM to find the handler address associated with the 508 // caller address. pass thread in r0 and caller pc (ret address) 509 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 510 // the stack. 511 __ mov(c_rarg1, lr); 512 // lr will be trashed by the VM call so we move it to R19 513 // (callee-saved) because we also need to pass it to the handler 514 // returned by this call. 
515 __ mov(r19, lr); 516 BLOCK_COMMENT("call exception_handler_for_return_address"); 517 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 518 SharedRuntime::exception_handler_for_return_address), 519 rthread, c_rarg1); 520 // Reinitialize the ptrue predicate register, in case the external runtime 521 // call clobbers ptrue reg, as we may return to SVE compiled code. 522 __ reinitialize_ptrue(); 523 524 // we should not really care that lr is no longer the callee 525 // address. we saved the value the handler needs in r19 so we can 526 // just copy it to r3. however, the C2 handler will push its own 527 // frame and then calls into the VM and the VM code asserts that 528 // the PC for the frame above the handler belongs to a compiled 529 // Java method. So, we restore lr here to satisfy that assert. 530 __ mov(lr, r19); 531 // setup r0 & r3 & clear pending exception 532 __ mov(r3, r19); 533 __ mov(r19, r0); 534 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 535 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 536 537 #ifdef ASSERT 538 // make sure exception is set 539 { 540 Label L; 541 __ cbnz(r0, L); 542 __ stop("StubRoutines::forward exception: no pending exception (2)"); 543 __ bind(L); 544 } 545 #endif 546 547 // continue at exception handler 548 // r0: exception 549 // r3: throwing pc 550 // r19: exception handler 551 __ verify_oop(r0); 552 __ br(r19); 553 554 return start; 555 } 556 557 // Non-destructive plausibility checks for oops 558 // 559 // Arguments: 560 // r0: oop to verify 561 // rscratch1: error message 562 // 563 // Stack after saving c_rarg3: 564 // [tos + 0]: saved c_rarg3 565 // [tos + 1]: saved c_rarg2 566 // [tos + 2]: saved lr 567 // [tos + 3]: saved rscratch2 568 // [tos + 4]: saved r0 569 // [tos + 5]: saved rscratch1 570 address generate_verify_oop() { 571 StubGenStubId stub_id = StubGenStubId::verify_oop_id; 572 StubCodeMark mark(this, stub_id); 573 address start = __ pc(); 574 575 Label exit, error; 576 577 // save c_rarg2 and c_rarg3 578 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 579 580 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 581 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 582 __ ldr(c_rarg3, Address(c_rarg2)); 583 __ add(c_rarg3, c_rarg3, 1); 584 __ str(c_rarg3, Address(c_rarg2)); 585 586 // object is in r0 587 // make sure object is 'reasonable' 588 __ cbz(r0, exit); // if obj is null it is OK 589 590 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 591 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 592 593 // return if everything seems ok 594 __ bind(exit); 595 596 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 597 __ ret(lr); 598 599 // handle errors 600 __ bind(error); 601 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 602 603 __ push(RegSet::range(r0, r29), sp); 604 // debug(char* msg, int64_t pc, int64_t regs[]) 605 __ mov(c_rarg0, rscratch1); // pass address of error message 606 __ mov(c_rarg1, lr); // pass return address 607 __ mov(c_rarg2, sp); // pass address of regs on stack 608 #ifndef PRODUCT 609 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 610 #endif 611 BLOCK_COMMENT("call MacroAssembler::debug"); 612 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 613 __ blr(rscratch1); 614 __ hlt(0); 615 616 return start; 617 } 618 619 // Generate indices for iota vector. 
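  // The table emitted below holds one 128-bit row of ascending lane
  // indices per element size: bytes 0..15, halfwords 0..7, words 0..3 and
  // doublewords 0..1, followed by the same ascending sequences encoded as
  // single-precision (0.0f..3.0f) and double-precision (0.0d, 1.0d)
  // floating-point constants.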
620 address generate_iota_indices(StubGenStubId stub_id) { 621 __ align(CodeEntryAlignment); 622 StubCodeMark mark(this, stub_id); 623 address start = __ pc(); 624 // B 625 __ emit_data64(0x0706050403020100, relocInfo::none); 626 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 627 // H 628 __ emit_data64(0x0003000200010000, relocInfo::none); 629 __ emit_data64(0x0007000600050004, relocInfo::none); 630 // S 631 __ emit_data64(0x0000000100000000, relocInfo::none); 632 __ emit_data64(0x0000000300000002, relocInfo::none); 633 // D 634 __ emit_data64(0x0000000000000000, relocInfo::none); 635 __ emit_data64(0x0000000000000001, relocInfo::none); 636 // S - FP 637 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 638 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 639 // D - FP 640 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 641 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 642 return start; 643 } 644 645 // The inner part of zero_words(). This is the bulk operation, 646 // zeroing words in blocks, possibly using DC ZVA to do it. The 647 // caller is responsible for zeroing the last few words. 648 // 649 // Inputs: 650 // r10: the HeapWord-aligned base address of an array to zero. 651 // r11: the count in HeapWords, r11 > 0. 652 // 653 // Returns r10 and r11, adjusted for the caller to clear. 654 // r10: the base address of the tail of words left to clear. 655 // r11: the number of words in the tail. 656 // r11 < MacroAssembler::zero_words_block_size. 657 658 address generate_zero_blocks() { 659 Label done; 660 Label base_aligned; 661 662 Register base = r10, cnt = r11; 663 664 __ align(CodeEntryAlignment); 665 StubGenStubId stub_id = StubGenStubId::zero_blocks_id; 666 StubCodeMark mark(this, stub_id); 667 address start = __ pc(); 668 669 if (UseBlockZeroing) { 670 int zva_length = VM_Version::zva_length(); 671 672 // Ensure ZVA length can be divided by 16. This is required by 673 // the subsequent operations. 674 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 675 676 __ tbz(base, 3, base_aligned); 677 __ str(zr, Address(__ post(base, 8))); 678 __ sub(cnt, cnt, 1); 679 __ bind(base_aligned); 680 681 // Ensure count >= zva_length * 2 so that it still deserves a zva after 682 // alignment. 683 Label small; 684 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 685 __ subs(rscratch1, cnt, low_limit >> 3); 686 __ br(Assembler::LT, small); 687 __ zero_dcache_blocks(base, cnt); 688 __ bind(small); 689 } 690 691 { 692 // Number of stp instructions we'll unroll 693 const int unroll = 694 MacroAssembler::zero_words_block_size / 2; 695 // Clear the remaining blocks. 696 Label loop; 697 __ subs(cnt, cnt, unroll * 2); 698 __ br(Assembler::LT, done); 699 __ bind(loop); 700 for (int i = 0; i < unroll; i++) 701 __ stp(zr, zr, __ post(base, 16)); 702 __ subs(cnt, cnt, unroll * 2); 703 __ br(Assembler::GE, loop); 704 __ bind(done); 705 __ add(cnt, cnt, unroll * 2); 706 } 707 708 __ ret(lr); 709 710 return start; 711 } 712 713 714 typedef enum { 715 copy_forwards = 1, 716 copy_backwards = -1 717 } copy_direction; 718 719 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 720 // for arraycopy stubs. 
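  //
  // Each copy_load_at_*/copy_store_at_* call forwards to the active
  // BarrierSetAssembler, so a collector that needs load or store barriers
  // can interpose on every 8/16/32-byte transfer. A minimal usage sketch
  // (the register and offset choices here are illustrative only):
  //
  //   ArrayCopyBarrierSetHelper bs(_masm, decorators, T_OBJECT,
  //                                rscratch1, rscratch2, r10, v6, v7, v16);
  //   bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));  // barrier-aware ldp
  //   bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); // barrier-aware stp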
721 class ArrayCopyBarrierSetHelper : StackObj { 722 BarrierSetAssembler* _bs_asm; 723 MacroAssembler* _masm; 724 DecoratorSet _decorators; 725 BasicType _type; 726 Register _gct1; 727 Register _gct2; 728 Register _gct3; 729 FloatRegister _gcvt1; 730 FloatRegister _gcvt2; 731 FloatRegister _gcvt3; 732 733 public: 734 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 735 DecoratorSet decorators, 736 BasicType type, 737 Register gct1, 738 Register gct2, 739 Register gct3, 740 FloatRegister gcvt1, 741 FloatRegister gcvt2, 742 FloatRegister gcvt3) 743 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 744 _masm(masm), 745 _decorators(decorators), 746 _type(type), 747 _gct1(gct1), 748 _gct2(gct2), 749 _gct3(gct3), 750 _gcvt1(gcvt1), 751 _gcvt2(gcvt2), 752 _gcvt3(gcvt3) { 753 } 754 755 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 756 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 757 dst1, dst2, src, 758 _gct1, _gct2, _gcvt1); 759 } 760 761 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 762 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 763 dst, src1, src2, 764 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 765 } 766 767 void copy_load_at_16(Register dst1, Register dst2, Address src) { 768 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 769 dst1, dst2, src, 770 _gct1); 771 } 772 773 void copy_store_at_16(Address dst, Register src1, Register src2) { 774 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 775 dst, src1, src2, 776 _gct1, _gct2, _gct3); 777 } 778 779 void copy_load_at_8(Register dst, Address src) { 780 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 781 dst, noreg, src, 782 _gct1); 783 } 784 785 void copy_store_at_8(Address dst, Register src) { 786 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 787 dst, src, noreg, 788 _gct1, _gct2, _gct3); 789 } 790 }; 791 792 // Bulk copy of blocks of 8 words. 793 // 794 // count is a count of words. 795 // 796 // Precondition: count >= 8 797 // 798 // Postconditions: 799 // 800 // The least significant bit of count contains the remaining count 801 // of words to copy. The rest of count is trash. 802 // 803 // s and d are adjusted to point to the remaining words to copy 804 // 805 void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) { 806 BasicType type; 807 copy_direction direction; 808 809 switch (stub_id) { 810 case copy_byte_f_id: 811 direction = copy_forwards; 812 type = T_BYTE; 813 break; 814 case copy_byte_b_id: 815 direction = copy_backwards; 816 type = T_BYTE; 817 break; 818 case copy_oop_f_id: 819 direction = copy_forwards; 820 type = T_OBJECT; 821 break; 822 case copy_oop_b_id: 823 direction = copy_backwards; 824 type = T_OBJECT; 825 break; 826 case copy_oop_uninit_f_id: 827 direction = copy_forwards; 828 type = T_OBJECT; 829 break; 830 case copy_oop_uninit_b_id: 831 direction = copy_backwards; 832 type = T_OBJECT; 833 break; 834 default: 835 ShouldNotReachHere(); 836 } 837 838 int unit = wordSize * direction; 839 int bias = (UseSIMDForMemoryOps ? 
        4 : 2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ?
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 906 907 if (UseSIMDForMemoryOps) { 908 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 909 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 910 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 911 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 912 } else { 913 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 914 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 915 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 916 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 917 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 918 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 919 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 920 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 921 } 922 923 __ subs(count, count, 8); 924 __ br(Assembler::HS, again); 925 926 // Drain 927 __ bind(drain); 928 if (UseSIMDForMemoryOps) { 929 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 930 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 931 } else { 932 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 933 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 934 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 935 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 936 } 937 938 { 939 Label L1, L2; 940 __ tbz(count, exact_log2(4), L1); 941 if (UseSIMDForMemoryOps) { 942 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit))); 943 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1); 944 } else { 945 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 946 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 947 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 948 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3); 949 } 950 __ bind(L1); 951 952 if (direction == copy_forwards) { 953 __ add(s, s, bias); 954 __ add(d, d, bias); 955 } 956 957 __ tbz(count, 1, L2); 958 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 959 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1); 960 __ bind(L2); 961 } 962 963 __ ret(lr); 964 965 if (AvoidUnalignedAccesses) { 966 Label drain, again; 967 // Register order for storing. Order is different for backward copy. 968 969 __ bind(unaligned_copy_long); 970 971 // source address is even aligned, target odd aligned 972 // 973 // when forward copying word pairs we read long pairs at offsets 974 // {0, 2, 4, 6} (in long words). when backwards copying we read 975 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 976 // address by -2 in the forwards case so we can compute the 977 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 978 // or -1. 979 // 980 // when forward copying we need to store 1 word, 3 pairs and 981 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 982 // zero offset We adjust the destination by -1 which means we 983 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 984 // 985 // When backwards copyng we need to store 1 word, 3 pairs and 986 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 987 // offsets {1, 3, 5, 7, 8} * unit. 
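      // For example, in the forwards case (unit == 8, after the -16/-8
      // adjustments below) each iteration loads from s+16, s+32, s+48 and,
      // with pre-increment, s+64, and stores one word at d+8, three pairs
      // at d+16, d+32 and d+48, and a final word at d+64 with pre-increment.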
988 989 if (direction == copy_forwards) { 990 __ sub(s, s, 16); 991 __ sub(d, d, 8); 992 } 993 994 // Fill 8 registers 995 // 996 // for forwards copy s was offset by -16 from the original input 997 // value of s so the register contents are at these offsets 998 // relative to the 64 bit block addressed by that original input 999 // and so on for each successive 64 byte block when s is updated 1000 // 1001 // t0 at offset 0, t1 at offset 8 1002 // t2 at offset 16, t3 at offset 24 1003 // t4 at offset 32, t5 at offset 40 1004 // t6 at offset 48, t7 at offset 56 1005 1006 // for backwards copy s was not offset so the register contents 1007 // are at these offsets into the preceding 64 byte block 1008 // relative to that original input and so on for each successive 1009 // preceding 64 byte block when s is updated. this explains the 1010 // slightly counter-intuitive looking pattern of register usage 1011 // in the stp instructions for backwards copy. 1012 // 1013 // t0 at offset -16, t1 at offset -8 1014 // t2 at offset -32, t3 at offset -24 1015 // t4 at offset -48, t5 at offset -40 1016 // t6 at offset -64, t7 at offset -56 1017 1018 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1019 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1020 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1021 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1022 1023 __ subs(count, count, 16); 1024 __ br(Assembler::LO, drain); 1025 1026 int prefetch = PrefetchCopyIntervalInBytes; 1027 bool use_stride = false; 1028 if (direction == copy_backwards) { 1029 use_stride = prefetch > 256; 1030 prefetch = -prefetch; 1031 if (use_stride) __ mov(stride, prefetch); 1032 } 1033 1034 __ bind(again); 1035 1036 if (PrefetchCopyIntervalInBytes > 0) 1037 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1038 1039 if (direction == copy_forwards) { 1040 // allowing for the offset of -8 the store instructions place 1041 // registers into the target 64 bit block at the following 1042 // offsets 1043 // 1044 // t0 at offset 0 1045 // t1 at offset 8, t2 at offset 16 1046 // t3 at offset 24, t4 at offset 32 1047 // t5 at offset 40, t6 at offset 48 1048 // t7 at offset 56 1049 1050 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1051 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1052 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1053 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1054 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1055 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1056 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1057 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1058 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1059 } else { 1060 // d was not offset when we started so the registers are 1061 // written into the 64 bit block preceding d with the following 1062 // offsets 1063 // 1064 // t1 at offset -8 1065 // t3 at offset -24, t0 at offset -16 1066 // t5 at offset -48, t2 at offset -32 1067 // t7 at offset -56, t4 at offset -48 1068 // t6 at offset -64 1069 // 1070 // note that this matches the offsets previously noted for the 1071 // loads 1072 1073 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1074 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1075 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1076 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1077 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1078 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1079 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1080 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1081 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1082 } 1083 1084 __ subs(count, count, 8); 1085 __ br(Assembler::HS, again); 1086 1087 // Drain 1088 // 1089 // this uses the same pattern of offsets and register arguments 1090 // as above 1091 __ bind(drain); 1092 if (direction == copy_forwards) { 1093 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1094 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1095 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1096 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1097 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1098 } else { 1099 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1100 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1101 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1102 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1103 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1104 } 1105 // now we need to copy any remaining part block which may 1106 // include a 4 word block subblock and/or a 2 word subblock. 
1107 // bits 2 and 1 in the count are the tell-tale for whether we 1108 // have each such subblock 1109 { 1110 Label L1, L2; 1111 __ tbz(count, exact_log2(4), L1); 1112 // this is the same as above but copying only 4 longs hence 1113 // with only one intervening stp between the str instructions 1114 // but note that the offsets and registers still follow the 1115 // same pattern 1116 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1117 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 1118 if (direction == copy_forwards) { 1119 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1120 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1121 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3); 1122 } else { 1123 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1124 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1125 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2); 1126 } 1127 __ bind(L1); 1128 1129 __ tbz(count, 1, L2); 1130 // this is the same as above but copying only 2 longs hence 1131 // there is no intervening stp between the str instructions 1132 // but note that the offset and register patterns are still 1133 // the same 1134 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit))); 1135 if (direction == copy_forwards) { 1136 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1137 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1); 1138 } else { 1139 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1140 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0); 1141 } 1142 __ bind(L2); 1143 1144 // for forwards copy we need to re-adjust the offsets we 1145 // applied so that s and d are follow the last words written 1146 1147 if (direction == copy_forwards) { 1148 __ add(s, s, 16); 1149 __ add(d, d, 8); 1150 } 1151 1152 } 1153 1154 __ ret(lr); 1155 } 1156 } 1157 1158 // Small copy: less than 16 bytes. 1159 // 1160 // NB: Ignores all of the bits of count which represent more than 15 1161 // bytes, so a caller doesn't have to mask them. 1162 1163 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) { 1164 bool is_backwards = step < 0; 1165 size_t granularity = uabs(step); 1166 int direction = is_backwards ? -1 : 1; 1167 1168 Label Lword, Lint, Lshort, Lbyte; 1169 1170 assert(granularity 1171 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1172 1173 const Register t0 = r3; 1174 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1175 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg); 1176 1177 // ??? I don't know if this bit-test-and-branch is the right thing 1178 // to do. It does a lot of jumping, resulting in several 1179 // mispredicted branches. It might make more sense to do this 1180 // with something like Duff's device with a single computed branch. 
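    // In effect, for a byte copy (granularity == 1) the tests below do:
    //
    //   if (count & 8) copy one word;
    //   if (count & 4) copy one int;
    //   if (count & 2) copy one short;
    //   if (count & 1) copy one byte;
    //
    // with the tested bit shifted down and the smaller cases omitted as the
    // element size grows.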
1181 1182 __ tbz(count, 3 - exact_log2(granularity), Lword); 1183 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1184 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1185 __ bind(Lword); 1186 1187 if (granularity <= sizeof (jint)) { 1188 __ tbz(count, 2 - exact_log2(granularity), Lint); 1189 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1190 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1191 __ bind(Lint); 1192 } 1193 1194 if (granularity <= sizeof (jshort)) { 1195 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1196 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1197 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1198 __ bind(Lshort); 1199 } 1200 1201 if (granularity <= sizeof (jbyte)) { 1202 __ tbz(count, 0, Lbyte); 1203 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1204 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1205 __ bind(Lbyte); 1206 } 1207 } 1208 1209 Label copy_f, copy_b; 1210 Label copy_obj_f, copy_obj_b; 1211 Label copy_obj_uninit_f, copy_obj_uninit_b; 1212 1213 // All-singing all-dancing memory copy. 1214 // 1215 // Copy count units of memory from s to d. The size of a unit is 1216 // step, which can be positive or negative depending on the direction 1217 // of copy. If is_aligned is false, we align the source address. 1218 // 1219 1220 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1221 Register s, Register d, Register count, int step) { 1222 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1223 bool is_backwards = step < 0; 1224 unsigned int granularity = uabs(step); 1225 const Register t0 = r3, t1 = r4; 1226 1227 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1228 // load all the data before writing anything 1229 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1230 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1231 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1232 const Register send = r17, dend = r16; 1233 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1234 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1235 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1236 1237 if (PrefetchCopyIntervalInBytes > 0) 1238 __ prfm(Address(s, 0), PLDL1KEEP); 1239 __ cmp(count, u1((UseSIMDForMemoryOps ? 
        96 : 80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
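      // For example, with byte elements a 72-byte copy takes the ldp/stp
      // tail below, while an 81..96-byte copy branches to copy96 and uses
      // a third ldpq/stpq pair instead.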
1295 if (granularity < sizeof (jint)) { 1296 Label copy96; 1297 __ cmp(count, u1(80/granularity)); 1298 __ br(Assembler::HI, copy96); 1299 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1300 1301 bs.copy_store_at_32(Address(d, 0), v0, v1); 1302 bs.copy_store_at_32(Address(d, 32), v2, v3); 1303 1304 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1305 __ b(finish); 1306 1307 __ bind(copy96); 1308 } 1309 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1310 1311 bs.copy_store_at_32(Address(d, 0), v0, v1); 1312 bs.copy_store_at_32(Address(d, 32), v2, v3); 1313 1314 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1315 } else { 1316 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1317 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1318 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1319 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1320 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1321 1322 bs.copy_store_at_16(Address(d, 0), t0, t1); 1323 bs.copy_store_at_16(Address(d, 16), t2, t3); 1324 bs.copy_store_at_16(Address(d, 32), t4, t5); 1325 bs.copy_store_at_16(Address(d, 48), t6, t7); 1326 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1327 } 1328 __ b(finish); 1329 1330 // 0..16 bytes 1331 __ bind(copy16); 1332 __ cmp(count, u1(8/granularity)); 1333 __ br(Assembler::LO, copy8); 1334 1335 // 8..16 bytes 1336 bs.copy_load_at_8(t0, Address(s, 0)); 1337 bs.copy_load_at_8(t1, Address(send, -8)); 1338 bs.copy_store_at_8(Address(d, 0), t0); 1339 bs.copy_store_at_8(Address(dend, -8), t1); 1340 __ b(finish); 1341 1342 if (granularity < 8) { 1343 // 4..7 bytes 1344 __ bind(copy8); 1345 __ tbz(count, 2 - exact_log2(granularity), copy4); 1346 __ ldrw(t0, Address(s, 0)); 1347 __ ldrw(t1, Address(send, -4)); 1348 __ strw(t0, Address(d, 0)); 1349 __ strw(t1, Address(dend, -4)); 1350 __ b(finish); 1351 if (granularity < 4) { 1352 // 0..3 bytes 1353 __ bind(copy4); 1354 __ cbz(count, finish); // get rid of 0 case 1355 if (granularity == 2) { 1356 __ ldrh(t0, Address(s, 0)); 1357 __ strh(t0, Address(d, 0)); 1358 } else { // granularity == 1 1359 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1360 // the first and last byte. 1361 // Handle the 3 byte case by loading and storing base + count/2 1362 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1363 // This does means in the 1 byte case we load/store the same 1364 // byte 3 times. 1365 __ lsr(count, count, 1); 1366 __ ldrb(t0, Address(s, 0)); 1367 __ ldrb(t1, Address(send, -1)); 1368 __ ldrb(t2, Address(s, count)); 1369 __ strb(t0, Address(d, 0)); 1370 __ strb(t1, Address(dend, -1)); 1371 __ strb(t2, Address(d, count)); 1372 } 1373 __ b(finish); 1374 } 1375 } 1376 1377 __ bind(copy_big); 1378 if (is_backwards) { 1379 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1380 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1381 } 1382 1383 // Now we've got the small case out of the way we can align the 1384 // source address on a 2-word boundary. 1385 1386 // Here we will materialize a count in r15, which is used by copy_memory_small 1387 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1388 // Up until here, we have used t9, which aliases r15, but from here on, that register 1389 // can not be used as a temp register, as it contains the count. 1390 1391 Label aligned; 1392 1393 if (is_aligned) { 1394 // We may have to adjust by 1 word to get s 2-word-aligned. 
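      // That is, if bit 3 of s is set we copy a single word first and step
      // s and d by one word in the copy direction, leaving s 16-byte
      // aligned for the bulk loop.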
1395 __ tbz(s, exact_log2(wordSize), aligned); 1396 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1397 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1398 __ sub(count, count, wordSize/granularity); 1399 } else { 1400 if (is_backwards) { 1401 __ andr(r15, s, 2 * wordSize - 1); 1402 } else { 1403 __ neg(r15, s); 1404 __ andr(r15, r15, 2 * wordSize - 1); 1405 } 1406 // r15 is the byte adjustment needed to align s. 1407 __ cbz(r15, aligned); 1408 int shift = exact_log2(granularity); 1409 if (shift > 0) { 1410 __ lsr(r15, r15, shift); 1411 } 1412 __ sub(count, count, r15); 1413 1414 #if 0 1415 // ?? This code is only correct for a disjoint copy. It may or 1416 // may not make sense to use it in that case. 1417 1418 // Copy the first pair; s and d may not be aligned. 1419 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1420 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1421 1422 // Align s and d, adjust count 1423 if (is_backwards) { 1424 __ sub(s, s, r15); 1425 __ sub(d, d, r15); 1426 } else { 1427 __ add(s, s, r15); 1428 __ add(d, d, r15); 1429 } 1430 #else 1431 copy_memory_small(decorators, type, s, d, r15, step); 1432 #endif 1433 } 1434 1435 __ bind(aligned); 1436 1437 // s is now 2-word-aligned. 1438 1439 // We have a count of units and some trailing bytes. Adjust the 1440 // count and do a bulk copy of words. If the shift is zero 1441 // perform a move instead to benefit from zero latency moves. 1442 int shift = exact_log2(wordSize/granularity); 1443 if (shift > 0) { 1444 __ lsr(r15, count, shift); 1445 } else { 1446 __ mov(r15, count); 1447 } 1448 if (direction == copy_forwards) { 1449 if (type != T_OBJECT) { 1450 __ bl(copy_f); 1451 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1452 __ bl(copy_obj_uninit_f); 1453 } else { 1454 __ bl(copy_obj_f); 1455 } 1456 } else { 1457 if (type != T_OBJECT) { 1458 __ bl(copy_b); 1459 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1460 __ bl(copy_obj_uninit_b); 1461 } else { 1462 __ bl(copy_obj_b); 1463 } 1464 } 1465 1466 // And the tail. 1467 copy_memory_small(decorators, type, s, d, count, step); 1468 1469 if (granularity >= 8) __ bind(copy8); 1470 if (granularity >= 4) __ bind(copy4); 1471 __ bind(finish); 1472 } 1473 1474 1475 void clobber_registers() { 1476 #ifdef ASSERT 1477 RegSet clobbered 1478 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1479 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1480 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1481 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1482 __ mov(*it, rscratch1); 1483 } 1484 #endif 1485 1486 } 1487 1488 // Scan over array at a for count oops, verifying each one. 1489 // Preserves a and count, clobbers rscratch1 and rscratch2. 
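  // With compressed oops the elements are narrow (4 bytes), so each one is
  // loaded with ldrw and decoded, which verifies it as a side effect;
  // otherwise the full 8-byte oop is loaded and verified directly.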
1490 void verify_oop_array (int size, Register a, Register count, Register temp) { 1491 Label loop, end; 1492 __ mov(rscratch1, a); 1493 __ mov(rscratch2, zr); 1494 __ bind(loop); 1495 __ cmp(rscratch2, count); 1496 __ br(Assembler::HS, end); 1497 if (size == wordSize) { 1498 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1499 __ verify_oop(temp); 1500 } else { 1501 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1502 __ decode_heap_oop(temp); // calls verify_oop 1503 } 1504 __ add(rscratch2, rscratch2, 1); 1505 __ b(loop); 1506 __ bind(end); 1507 } 1508 1509 // Arguments: 1510 // stub_id - is used to name the stub and identify all details of 1511 // how to perform the copy. 1512 // 1513 // entry - is assigned to the stub's post push entry point unless 1514 // it is null 1515 // 1516 // Inputs: 1517 // c_rarg0 - source array address 1518 // c_rarg1 - destination array address 1519 // c_rarg2 - element count, treated as ssize_t, can be zero 1520 // 1521 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1522 // the hardware handle it. The two dwords within qwords that span 1523 // cache line boundaries will still be loaded and stored atomically. 1524 // 1525 // Side Effects: entry is set to the (post push) entry point so it 1526 // can be used by the corresponding conjoint copy 1527 // method 1528 // 1529 address generate_disjoint_copy(StubGenStubId stub_id, address *entry) { 1530 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1531 RegSet saved_reg = RegSet::of(s, d, count); 1532 int size; 1533 bool aligned; 1534 bool is_oop; 1535 bool dest_uninitialized; 1536 switch (stub_id) { 1537 case jbyte_disjoint_arraycopy_id: 1538 size = sizeof(jbyte); 1539 aligned = false; 1540 is_oop = false; 1541 dest_uninitialized = false; 1542 break; 1543 case arrayof_jbyte_disjoint_arraycopy_id: 1544 size = sizeof(jbyte); 1545 aligned = true; 1546 is_oop = false; 1547 dest_uninitialized = false; 1548 break; 1549 case jshort_disjoint_arraycopy_id: 1550 size = sizeof(jshort); 1551 aligned = false; 1552 is_oop = false; 1553 dest_uninitialized = false; 1554 break; 1555 case arrayof_jshort_disjoint_arraycopy_id: 1556 size = sizeof(jshort); 1557 aligned = true; 1558 is_oop = false; 1559 dest_uninitialized = false; 1560 break; 1561 case jint_disjoint_arraycopy_id: 1562 size = sizeof(jint); 1563 aligned = false; 1564 is_oop = false; 1565 dest_uninitialized = false; 1566 break; 1567 case arrayof_jint_disjoint_arraycopy_id: 1568 size = sizeof(jint); 1569 aligned = true; 1570 is_oop = false; 1571 dest_uninitialized = false; 1572 break; 1573 case jlong_disjoint_arraycopy_id: 1574 // since this is always aligned we can (should!) use the same 1575 // stub as for case arrayof_jlong_disjoint_arraycopy 1576 ShouldNotReachHere(); 1577 break; 1578 case arrayof_jlong_disjoint_arraycopy_id: 1579 size = sizeof(jlong); 1580 aligned = true; 1581 is_oop = false; 1582 dest_uninitialized = false; 1583 break; 1584 case oop_disjoint_arraycopy_id: 1585 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1586 aligned = !UseCompressedOops; 1587 is_oop = true; 1588 dest_uninitialized = false; 1589 break; 1590 case arrayof_oop_disjoint_arraycopy_id: 1591 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1592 aligned = !UseCompressedOops; 1593 is_oop = true; 1594 dest_uninitialized = false; 1595 break; 1596 case oop_disjoint_arraycopy_uninit_id: 1597 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1598 aligned = !UseCompressedOops; 1599 is_oop = true; 1600 dest_uninitialized = true; 1601 break; 1602 case arrayof_oop_disjoint_arraycopy_uninit_id: 1603 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1604 aligned = !UseCompressedOops; 1605 is_oop = true; 1606 dest_uninitialized = true; 1607 break; 1608 default: 1609 ShouldNotReachHere(); 1610 break; 1611 } 1612 1613 __ align(CodeEntryAlignment); 1614 StubCodeMark mark(this, stub_id); 1615 address start = __ pc(); 1616 __ enter(); 1617 1618 if (entry != nullptr) { 1619 *entry = __ pc(); 1620 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1621 BLOCK_COMMENT("Entry:"); 1622 } 1623 1624 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1625 if (dest_uninitialized) { 1626 decorators |= IS_DEST_UNINITIALIZED; 1627 } 1628 if (aligned) { 1629 decorators |= ARRAYCOPY_ALIGNED; 1630 } 1631 1632 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1633 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1634 1635 if (is_oop) { 1636 // save regs before copy_memory 1637 __ push(RegSet::of(d, count), sp); 1638 } 1639 { 1640 // UnsafeMemoryAccess page error: continue after unsafe access 1641 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1642 UnsafeMemoryAccessMark umam(this, add_entry, true); 1643 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1644 } 1645 1646 if (is_oop) { 1647 __ pop(RegSet::of(d, count), sp); 1648 if (VerifyOops) 1649 verify_oop_array(size, d, count, r16); 1650 } 1651 1652 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1653 1654 __ leave(); 1655 __ mov(r0, zr); // return 0 1656 __ ret(lr); 1657 return start; 1658 } 1659 1660 // Arguments: 1661 // stub_id - is used to name the stub and identify all details of 1662 // how to perform the copy. 1663 // 1664 // nooverlap_target - identifes the (post push) entry for the 1665 // corresponding disjoint copy routine which can be 1666 // jumped to if the ranges do not actually overlap 1667 // 1668 // entry - is assigned to the stub's post push entry point unless 1669 // it is null 1670 // 1671 // 1672 // Inputs: 1673 // c_rarg0 - source array address 1674 // c_rarg1 - destination array address 1675 // c_rarg2 - element count, treated as ssize_t, can be zero 1676 // 1677 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1678 // the hardware handle it. The two dwords within qwords that span 1679 // cache line boundaries will still be loaded and stored atomically. 
1680 // 1681 // Side Effects: 1682 // entry is set to the no-overlap entry point so it can be used by 1683 // some other conjoint copy method 1684 // 1685 address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 1686 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1687 RegSet saved_regs = RegSet::of(s, d, count); 1688 int size; 1689 bool aligned; 1690 bool is_oop; 1691 bool dest_uninitialized; 1692 switch (stub_id) { 1693 case jbyte_arraycopy_id: 1694 size = sizeof(jbyte); 1695 aligned = false; 1696 is_oop = false; 1697 dest_uninitialized = false; 1698 break; 1699 case arrayof_jbyte_arraycopy_id: 1700 size = sizeof(jbyte); 1701 aligned = true; 1702 is_oop = false; 1703 dest_uninitialized = false; 1704 break; 1705 case jshort_arraycopy_id: 1706 size = sizeof(jshort); 1707 aligned = false; 1708 is_oop = false; 1709 dest_uninitialized = false; 1710 break; 1711 case arrayof_jshort_arraycopy_id: 1712 size = sizeof(jshort); 1713 aligned = true; 1714 is_oop = false; 1715 dest_uninitialized = false; 1716 break; 1717 case jint_arraycopy_id: 1718 size = sizeof(jint); 1719 aligned = false; 1720 is_oop = false; 1721 dest_uninitialized = false; 1722 break; 1723 case arrayof_jint_arraycopy_id: 1724 size = sizeof(jint); 1725 aligned = true; 1726 is_oop = false; 1727 dest_uninitialized = false; 1728 break; 1729 case jlong_arraycopy_id: 1730 // since this is always aligned we can (should!) use the same 1731 // stub as for case arrayof_jlong_disjoint_arraycopy 1732 ShouldNotReachHere(); 1733 break; 1734 case arrayof_jlong_arraycopy_id: 1735 size = sizeof(jlong); 1736 aligned = true; 1737 is_oop = false; 1738 dest_uninitialized = false; 1739 break; 1740 case oop_arraycopy_id: 1741 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1742 aligned = !UseCompressedOops; 1743 is_oop = true; 1744 dest_uninitialized = false; 1745 break; 1746 case arrayof_oop_arraycopy_id: 1747 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1748 aligned = !UseCompressedOops; 1749 is_oop = true; 1750 dest_uninitialized = false; 1751 break; 1752 case oop_arraycopy_uninit_id: 1753 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1754 aligned = !UseCompressedOops; 1755 is_oop = true; 1756 dest_uninitialized = true; 1757 break; 1758 case arrayof_oop_arraycopy_uninit_id: 1759 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1760 aligned = !UseCompressedOops; 1761 is_oop = true; 1762 dest_uninitialized = true; 1763 break; 1764 default: 1765 ShouldNotReachHere(); 1766 } 1767 1768 StubCodeMark mark(this, stub_id); 1769 address start = __ pc(); 1770 __ enter(); 1771 1772 if (entry != nullptr) { 1773 *entry = __ pc(); 1774 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1775 BLOCK_COMMENT("Entry:"); 1776 } 1777 1778 // use fwd copy when (d-s) above_equal (count*size) 1779 __ sub(rscratch1, d, s); 1780 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1781 __ br(Assembler::HS, nooverlap_target); 1782 1783 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1784 if (dest_uninitialized) { 1785 decorators |= IS_DEST_UNINITIALIZED; 1786 } 1787 if (aligned) { 1788 decorators |= ARRAYCOPY_ALIGNED; 1789 } 1790 1791 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1792 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1793 1794 if (is_oop) { 1795 // save regs before copy_memory 1796 __ push(RegSet::of(d, count), sp); 1797 } 1798 { 1799 // UnsafeMemoryAccess page error: continue after unsafe access 1800 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1801 UnsafeMemoryAccessMark umam(this, add_entry, true); 1802 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1803 } 1804 if (is_oop) { 1805 __ pop(RegSet::of(d, count), sp); 1806 if (VerifyOops) 1807 verify_oop_array(size, d, count, r16); 1808 } 1809 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1810 __ leave(); 1811 __ mov(r0, zr); // return 0 1812 __ ret(lr); 1813 return start; 1814 } 1815 1816 // Helper for generating a dynamic type check. 1817 // Smashes rscratch1, rscratch2. 1818 void generate_type_check(Register sub_klass, 1819 Register super_check_offset, 1820 Register super_klass, 1821 Register temp1, 1822 Register temp2, 1823 Register result, 1824 Label& L_success) { 1825 assert_different_registers(sub_klass, super_check_offset, super_klass); 1826 1827 BLOCK_COMMENT("type_check:"); 1828 1829 Label L_miss; 1830 1831 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1832 super_check_offset); 1833 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1834 1835 // Fall through on failure! 
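    // (Roughly: the fast path above handles the trivially-equal case and the
    //  check against the slot at super_check_offset; the slow path searches
    //  the secondary supers. Control reaches L_miss only when both have
    //  failed, i.e. sub_klass is not a subtype of super_klass.)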
1836 __ BIND(L_miss); 1837 } 1838 1839 // 1840 // Generate checkcasting array copy stub 1841 // 1842 // Input: 1843 // c_rarg0 - source array address 1844 // c_rarg1 - destination array address 1845 // c_rarg2 - element count, treated as ssize_t, can be zero 1846 // c_rarg3 - size_t ckoff (super_check_offset) 1847 // c_rarg4 - oop ckval (super_klass) 1848 // 1849 // Output: 1850 // r0 == 0 - success 1851 // r0 == -1^K - failure, where K is partial transfer count 1852 // 1853 address generate_checkcast_copy(StubGenStubId stub_id, address *entry) { 1854 bool dest_uninitialized; 1855 switch (stub_id) { 1856 case checkcast_arraycopy_id: 1857 dest_uninitialized = false; 1858 break; 1859 case checkcast_arraycopy_uninit_id: 1860 dest_uninitialized = true; 1861 break; 1862 default: 1863 ShouldNotReachHere(); 1864 } 1865 1866 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1867 1868 // Input registers (after setup_arg_regs) 1869 const Register from = c_rarg0; // source array address 1870 const Register to = c_rarg1; // destination array address 1871 const Register count = c_rarg2; // elementscount 1872 const Register ckoff = c_rarg3; // super_check_offset 1873 const Register ckval = c_rarg4; // super_klass 1874 1875 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1876 RegSet wb_post_saved_regs = RegSet::of(count); 1877 1878 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1879 const Register copied_oop = r22; // actual oop copied 1880 const Register count_save = r21; // orig elementscount 1881 const Register start_to = r20; // destination array start address 1882 const Register r19_klass = r19; // oop._klass 1883 1884 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1885 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1886 1887 //--------------------------------------------------------------- 1888 // Assembler stub will be used for this call to arraycopy 1889 // if the two arrays are subtypes of Object[] but the 1890 // destination array type is not equal to or a supertype 1891 // of the source type. Each element must be separately 1892 // checked. 1893 1894 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1895 copied_oop, r19_klass, count_save); 1896 1897 __ align(CodeEntryAlignment); 1898 StubCodeMark mark(this, stub_id); 1899 address start = __ pc(); 1900 1901 __ enter(); // required for proper stackwalking of RuntimeStub frame 1902 1903 #ifdef ASSERT 1904 // caller guarantees that the arrays really are different 1905 // otherwise, we would have to make conjoint checks 1906 { Label L; 1907 __ b(L); // conjoint check not yet implemented 1908 __ stop("checkcast_copy within a single array"); 1909 __ bind(L); 1910 } 1911 #endif //ASSERT 1912 1913 // Caller of this entry point must set up the argument registers. 1914 if (entry != nullptr) { 1915 *entry = __ pc(); 1916 BLOCK_COMMENT("Entry:"); 1917 } 1918 1919 // Empty array: Nothing to do. 1920 __ cbz(count, L_done); 1921 __ push(RegSet::of(r19, r20, r21, r22), sp); 1922 1923 #ifdef ASSERT 1924 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1925 // The ckoff and ckval must be mutually consistent, 1926 // even though caller generates both. 
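    // ("Consistent" here means ckoff == ckval->super_check_offset(); the
    //  assert below just reloads that field from ckval and compares it with
    //  the ckoff the caller passed in.)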
1927 { Label L; 1928 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1929 __ ldrw(start_to, Address(ckval, sco_offset)); 1930 __ cmpw(ckoff, start_to); 1931 __ br(Assembler::EQ, L); 1932 __ stop("super_check_offset inconsistent"); 1933 __ bind(L); 1934 } 1935 #endif //ASSERT 1936 1937 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1938 bool is_oop = true; 1939 int element_size = UseCompressedOops ? 4 : 8; 1940 if (dest_uninitialized) { 1941 decorators |= IS_DEST_UNINITIALIZED; 1942 } 1943 1944 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1945 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1946 1947 // save the original count 1948 __ mov(count_save, count); 1949 1950 // Copy from low to high addresses 1951 __ mov(start_to, to); // Save destination array start address 1952 __ b(L_load_element); 1953 1954 // ======== begin loop ======== 1955 // (Loop is rotated; its entry is L_load_element.) 1956 // Loop control: 1957 // for (; count != 0; count--) { 1958 // copied_oop = load_heap_oop(from++); 1959 // ... generate_type_check ...; 1960 // store_heap_oop(to++, copied_oop); 1961 // } 1962 __ align(OptoLoopAlignment); 1963 1964 __ BIND(L_store_element); 1965 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1966 __ post(to, element_size), copied_oop, noreg, 1967 gct1, gct2, gct3); 1968 __ sub(count, count, 1); 1969 __ cbz(count, L_do_card_marks); 1970 1971 // ======== loop entry is here ======== 1972 __ BIND(L_load_element); 1973 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1974 copied_oop, noreg, __ post(from, element_size), 1975 gct1); 1976 __ cbz(copied_oop, L_store_element); 1977 1978 __ load_klass(r19_klass, copied_oop);// query the object klass 1979 1980 BLOCK_COMMENT("type_check:"); 1981 generate_type_check(/*sub_klass*/r19_klass, 1982 /*super_check_offset*/ckoff, 1983 /*super_klass*/ckval, 1984 /*r_array_base*/gct1, 1985 /*temp2*/gct2, 1986 /*result*/r10, L_store_element); 1987 1988 // Fall through on failure! 1989 1990 // ======== end loop ======== 1991 1992 // It was a real error; we must depend on the caller to finish the job. 1993 // Register count = remaining oops, count_orig = total oops. 1994 // Emit GC store barriers for the oops we have copied and report 1995 // their number to the caller. 1996 1997 __ subs(count, count_save, count); // K = partially copied oop count 1998 __ eon(count, count, zr); // report (-1^K) to caller 1999 __ br(Assembler::EQ, L_done_pop); 2000 2001 __ BIND(L_do_card_marks); 2002 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2003 2004 __ bind(L_done_pop); 2005 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2006 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2007 2008 __ bind(L_done); 2009 __ mov(r0, count); 2010 __ leave(); 2011 __ ret(lr); 2012 2013 return start; 2014 } 2015 2016 // Perform range checks on the proposed arraycopy. 2017 // Kills temp, but nothing else. 2018 // Also, clean the sign bits of src_pos and dst_pos. 
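  //
  // In C-like pseudocode the checks amount to (sketch):
  //
  //   if ((uint32_t)(src_pos + length) > (uint32_t)src->length())  goto L_failed;
  //   if ((uint32_t)(dst_pos + length) > (uint32_t)dst->length())  goto L_failed;
  //   src_pos = (uint32_t)src_pos;   // zero-extend, clearing any stale high bits
  //   dst_pos = (uint32_t)dst_pos;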
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length()) FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
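  //
  // Dispatch sketch: with m = (s | d | count),
  //   (m & 7) == 0  --> long copy,  count >>= 3
  //   (m & 3) == 0  --> int copy,   count >>= 2
  //   (m & 1) == 0  --> short copy, count >>= 1
  //   otherwise     --> byte copy,  count used as-is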
2069 // 2070 address generate_unsafe_copy(address byte_copy_entry, 2071 address short_copy_entry, 2072 address int_copy_entry, 2073 address long_copy_entry) { 2074 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2075 2076 Label L_long_aligned, L_int_aligned, L_short_aligned; 2077 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2078 2079 __ align(CodeEntryAlignment); 2080 StubCodeMark mark(this, stub_id); 2081 address start = __ pc(); 2082 __ enter(); // required for proper stackwalking of RuntimeStub frame 2083 2084 // bump this on entry, not on exit: 2085 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2086 2087 __ orr(rscratch1, s, d); 2088 __ orr(rscratch1, rscratch1, count); 2089 2090 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2091 __ cbz(rscratch1, L_long_aligned); 2092 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2093 __ cbz(rscratch1, L_int_aligned); 2094 __ tbz(rscratch1, 0, L_short_aligned); 2095 __ b(RuntimeAddress(byte_copy_entry)); 2096 2097 __ BIND(L_short_aligned); 2098 __ lsr(count, count, LogBytesPerShort); // size => short_count 2099 __ b(RuntimeAddress(short_copy_entry)); 2100 __ BIND(L_int_aligned); 2101 __ lsr(count, count, LogBytesPerInt); // size => int_count 2102 __ b(RuntimeAddress(int_copy_entry)); 2103 __ BIND(L_long_aligned); 2104 __ lsr(count, count, LogBytesPerLong); // size => long_count 2105 __ b(RuntimeAddress(long_copy_entry)); 2106 2107 return start; 2108 } 2109 2110 // 2111 // Generate generic array copy stubs 2112 // 2113 // Input: 2114 // c_rarg0 - src oop 2115 // c_rarg1 - src_pos (32-bits) 2116 // c_rarg2 - dst oop 2117 // c_rarg3 - dst_pos (32-bits) 2118 // c_rarg4 - element count (32-bits) 2119 // 2120 // Output: 2121 // r0 == 0 - success 2122 // r0 == -1^K - failure, where K is partial transfer count 2123 // 2124 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2125 address int_copy_entry, address oop_copy_entry, 2126 address long_copy_entry, address checkcast_copy_entry) { 2127 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2128 2129 Label L_failed, L_objArray; 2130 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2131 2132 // Input registers 2133 const Register src = c_rarg0; // source array oop 2134 const Register src_pos = c_rarg1; // source position 2135 const Register dst = c_rarg2; // destination array oop 2136 const Register dst_pos = c_rarg3; // destination position 2137 const Register length = c_rarg4; 2138 2139 2140 // Registers used as temps 2141 const Register dst_klass = c_rarg5; 2142 2143 __ align(CodeEntryAlignment); 2144 2145 StubCodeMark mark(this, stub_id); 2146 2147 address start = __ pc(); 2148 2149 __ enter(); // required for proper stackwalking of RuntimeStub frame 2150 2151 // bump this on entry, not on exit: 2152 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2153 2154 //----------------------------------------------------------------------- 2155 // Assembler stub will be used for this call to arraycopy 2156 // if the following conditions are met: 2157 // 2158 // (1) src and dst must not be null. 2159 // (2) src_pos must not be negative. 2160 // (3) dst_pos must not be negative. 2161 // (4) length must not be negative. 2162 // (5) src klass and dst klass should be the same and not null. 2163 // (6) src and dst should be arrays. 2164 // (7) src_pos + length must not exceed length of src. 2165 // (8) dst_pos + length must not exceed length of dst. 
2166 // 2167 2168 // if (src == nullptr) return -1; 2169 __ cbz(src, L_failed); 2170 2171 // if (src_pos < 0) return -1; 2172 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2173 2174 // if (dst == nullptr) return -1; 2175 __ cbz(dst, L_failed); 2176 2177 // if (dst_pos < 0) return -1; 2178 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2179 2180 // registers used as temp 2181 const Register scratch_length = r16; // elements count to copy 2182 const Register scratch_src_klass = r17; // array klass 2183 const Register lh = r15; // layout helper 2184 2185 // if (length < 0) return -1; 2186 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2187 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2188 2189 __ load_klass(scratch_src_klass, src); 2190 #ifdef ASSERT 2191 // assert(src->klass() != nullptr); 2192 { 2193 BLOCK_COMMENT("assert klasses not null {"); 2194 Label L1, L2; 2195 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2196 __ bind(L1); 2197 __ stop("broken null klass"); 2198 __ bind(L2); 2199 __ load_klass(rscratch1, dst); 2200 __ cbz(rscratch1, L1); // this would be broken also 2201 BLOCK_COMMENT("} assert klasses not null done"); 2202 } 2203 #endif 2204 2205 // Load layout helper (32-bits) 2206 // 2207 // |array_tag| | header_size | element_type | |log2_element_size| 2208 // 32 30 24 16 8 2 0 2209 // 2210 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2211 // 2212 2213 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2214 2215 // Handle objArrays completely differently... 2216 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2217 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2218 __ movw(rscratch1, objArray_lh); 2219 __ eorw(rscratch2, lh, rscratch1); 2220 __ cbzw(rscratch2, L_objArray); 2221 2222 // if (src->klass() != dst->klass()) return -1; 2223 __ load_klass(rscratch2, dst); 2224 __ eor(rscratch2, rscratch2, scratch_src_klass); 2225 __ cbnz(rscratch2, L_failed); 2226 2227 // if (!src->is_Array()) return -1; 2228 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2229 2230 // At this point, it is known to be a typeArray (array_tag 0x3). 
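    // The layout helper is decoded below roughly as follows:
    //   header_size       = (lh >> _lh_header_size_shift) & _lh_header_size_mask;  // offset of element 0
    //   log2_element_size = lh & _lh_log2_element_size_mask;                       // 0..3 (byte..long)
    // giving src_addr = src + header_size + (src_pos << log2_element_size),
    // and likewise for dst_addr.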
2231 #ifdef ASSERT 2232 { 2233 BLOCK_COMMENT("assert primitive array {"); 2234 Label L; 2235 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2236 __ cmpw(lh, rscratch2); 2237 __ br(Assembler::GE, L); 2238 __ stop("must be a primitive array"); 2239 __ bind(L); 2240 BLOCK_COMMENT("} assert primitive array done"); 2241 } 2242 #endif 2243 2244 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2245 rscratch2, L_failed); 2246 2247 // TypeArrayKlass 2248 // 2249 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2250 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2251 // 2252 2253 const Register rscratch1_offset = rscratch1; // array offset 2254 const Register r15_elsize = lh; // element size 2255 2256 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2257 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2258 __ add(src, src, rscratch1_offset); // src array offset 2259 __ add(dst, dst, rscratch1_offset); // dst array offset 2260 BLOCK_COMMENT("choose copy loop based on element size"); 2261 2262 // next registers should be set before the jump to corresponding stub 2263 const Register from = c_rarg0; // source array address 2264 const Register to = c_rarg1; // destination array address 2265 const Register count = c_rarg2; // elements count 2266 2267 // 'from', 'to', 'count' registers should be set in such order 2268 // since they are the same as 'src', 'src_pos', 'dst'. 2269 2270 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2271 2272 // The possible values of elsize are 0-3, i.e. exact_log2(element 2273 // size in bytes). We do a simple bitwise binary search. 2274 __ BIND(L_copy_bytes); 2275 __ tbnz(r15_elsize, 1, L_copy_ints); 2276 __ tbnz(r15_elsize, 0, L_copy_shorts); 2277 __ lea(from, Address(src, src_pos));// src_addr 2278 __ lea(to, Address(dst, dst_pos));// dst_addr 2279 __ movw(count, scratch_length); // length 2280 __ b(RuntimeAddress(byte_copy_entry)); 2281 2282 __ BIND(L_copy_shorts); 2283 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2284 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2285 __ movw(count, scratch_length); // length 2286 __ b(RuntimeAddress(short_copy_entry)); 2287 2288 __ BIND(L_copy_ints); 2289 __ tbnz(r15_elsize, 0, L_copy_longs); 2290 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2291 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2292 __ movw(count, scratch_length); // length 2293 __ b(RuntimeAddress(int_copy_entry)); 2294 2295 __ BIND(L_copy_longs); 2296 #ifdef ASSERT 2297 { 2298 BLOCK_COMMENT("assert long copy {"); 2299 Label L; 2300 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2301 __ cmpw(r15_elsize, LogBytesPerLong); 2302 __ br(Assembler::EQ, L); 2303 __ stop("must be long copy, but elsize is wrong"); 2304 __ bind(L); 2305 BLOCK_COMMENT("} assert long copy done"); 2306 } 2307 #endif 2308 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2309 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2310 __ movw(count, scratch_length); // length 2311 __ b(RuntimeAddress(long_copy_entry)); 2312 2313 // ObjArrayKlass 2314 __ BIND(L_objArray); 2315 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2316 2317 Label L_plain_copy, L_checkcast_copy; 2318 // test array classes for subtyping 2319 __ load_klass(r15, dst); 2320 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2321 __ br(Assembler::NE, L_checkcast_copy); 2322 2323 // Identically typed arrays can be copied without element-wise checks. 2324 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2325 rscratch2, L_failed); 2326 2327 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2328 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2329 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2330 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2331 __ movw(count, scratch_length); // length 2332 __ BIND(L_plain_copy); 2333 __ b(RuntimeAddress(oop_copy_entry)); 2334 2335 __ BIND(L_checkcast_copy); 2336 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2337 { 2338 // Before looking at dst.length, make sure dst is also an objArray. 2339 __ ldrw(rscratch1, Address(r15, lh_offset)); 2340 __ movw(rscratch2, objArray_lh); 2341 __ eorw(rscratch1, rscratch1, rscratch2); 2342 __ cbnzw(rscratch1, L_failed); 2343 2344 // It is safe to examine both src.length and dst.length. 2345 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2346 r15, L_failed); 2347 2348 __ load_klass(dst_klass, dst); // reload 2349 2350 // Marshal the base address arguments now, freeing registers. 2351 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2352 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2353 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2354 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2355 __ movw(count, length); // length (reloaded) 2356 Register sco_temp = c_rarg3; // this register is free now 2357 assert_different_registers(from, to, count, sco_temp, 2358 dst_klass, scratch_src_klass); 2359 // assert_clean_int(count, sco_temp); 2360 2361 // Generate the type check. 2362 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2363 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2364 2365 // Smashes rscratch1, rscratch2 2366 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2367 L_plain_copy); 2368 2369 // Fetch destination element klass from the ObjArrayKlass header. 2370 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2371 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2372 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2373 2374 // the checkcast_copy loop needs two extra arguments: 2375 assert(c_rarg3 == sco_temp, "#3 already in place"); 2376 // Set up arguments for checkcast_copy_entry. 2377 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2378 __ b(RuntimeAddress(checkcast_copy_entry)); 2379 } 2380 2381 __ BIND(L_failed); 2382 __ mov(r0, -1); 2383 __ leave(); // required for proper stackwalking of RuntimeStub frame 2384 __ ret(lr); 2385 2386 return start; 2387 } 2388 2389 // 2390 // Generate stub for array fill. If "aligned" is true, the 2391 // "to" address is assumed to be heapword aligned. 
  //
  // Arguments for generated stub:
  //   to:    c_rarg0
  //   value: c_rarg1
  //   count: c_rarg2 treated as signed
  //
  address generate_fill(StubGenStubId stub_id) {
    BasicType t;
    bool aligned;

    switch (stub_id) {
    case jbyte_fill_id:
      t = T_BYTE;
      aligned = false;
      break;
    case jshort_fill_id:
      t = T_SHORT;
      aligned = false;
      break;
    case jint_fill_id:
      t = T_INT;
      aligned = false;
      break;
    case arrayof_jbyte_fill_id:
      t = T_BYTE;
      aligned = true;
      break;
    case arrayof_jshort_fill_id:
      t = T_SHORT;
      aligned = true;
      break;
    case arrayof_jint_fill_id:
      t = T_INT;
      aligned = true;
      break;
    default:
      ShouldNotReachHere();
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    const Register to        = c_rarg0;  // destination array address
    const Register value     = c_rarg1;  // fill value
    const Register count     = c_rarg2;  // elements count

    const Register bz_base   = r10;      // base for block_zero routine
    const Register cnt_words = r11;      // temp register

    __ enter();

    Label L_fill_elements, L_exit1;

    int shift = -1;
    switch (t) {
    case T_BYTE:
      shift = 0;
      __ cmpw(count, 8 >> shift);   // Short arrays (< 8 bytes) fill by element
      __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
      __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
      __ br(Assembler::LO, L_fill_elements);
      break;
    case T_SHORT:
      shift = 1;
      __ cmpw(count, 8 >> shift);   // Short arrays (< 8 bytes) fill by element
      __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
      __ br(Assembler::LO, L_fill_elements);
      break;
    case T_INT:
      shift = 2;
      __ cmpw(count, 8 >> shift);   // Short arrays (< 8 bytes) fill by element
      __ br(Assembler::LO, L_fill_elements);
      break;
    default: ShouldNotReachHere();
    }

    // Align the destination address at an 8-byte boundary.
    Label L_skip_align1, L_skip_align2, L_skip_align4;
    if (!aligned) {
      switch (t) {
      case T_BYTE:
        // One byte misalignment happens only for byte arrays.
        __ tbz(to, 0, L_skip_align1);
        __ strb(value, Address(__ post(to, 1)));
        __ subw(count, count, 1);
        __ bind(L_skip_align1);
        // Fallthrough
      case T_SHORT:
        // Two bytes misalignment happens only for byte and short (char) arrays.
        __ tbz(to, 1, L_skip_align2);
        __ strh(value, Address(__ post(to, 2)));
        __ subw(count, count, 2 >> shift);
        __ bind(L_skip_align2);
        // Fallthrough
      case T_INT:
        // Align to 8 bytes, we know we are 4 byte aligned to start.
        __ tbz(to, 2, L_skip_align4);
        __ strw(value, Address(__ post(to, 4)));
        __ subw(count, count, 4 >> shift);
        __ bind(L_skip_align4);
        break;
      default: ShouldNotReachHere();
      }
    }

    //
    //  Fill large chunks
    //
    __ lsrw(cnt_words, count, 3 - shift); // number of words
    __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
    __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
    if (UseBlockZeroing) {
      Label non_block_zeroing, rest;
      // If the fill value is zero we can use the fast zero_words().
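      // (zero_words may in turn hand off to the shared zero_blocks stub,
      //  which can clear whole cache lines with DC ZVA where the CPU allows
      //  it; that is why only a fill value of zero takes this path, while any
      //  other value falls through to fill_words.)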
2509 __ cbnz(value, non_block_zeroing); 2510 __ mov(bz_base, to); 2511 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2512 address tpc = __ zero_words(bz_base, cnt_words); 2513 if (tpc == nullptr) { 2514 fatal("CodeCache is full at generate_fill"); 2515 } 2516 __ b(rest); 2517 __ bind(non_block_zeroing); 2518 __ fill_words(to, cnt_words, value); 2519 __ bind(rest); 2520 } else { 2521 __ fill_words(to, cnt_words, value); 2522 } 2523 2524 // Remaining count is less than 8 bytes. Fill it by a single store. 2525 // Note that the total length is no less than 8 bytes. 2526 if (t == T_BYTE || t == T_SHORT) { 2527 Label L_exit1; 2528 __ cbzw(count, L_exit1); 2529 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2530 __ str(value, Address(to, -8)); // overwrite some elements 2531 __ bind(L_exit1); 2532 __ leave(); 2533 __ ret(lr); 2534 } 2535 2536 // Handle copies less than 8 bytes. 2537 Label L_fill_2, L_fill_4, L_exit2; 2538 __ bind(L_fill_elements); 2539 switch (t) { 2540 case T_BYTE: 2541 __ tbz(count, 0, L_fill_2); 2542 __ strb(value, Address(__ post(to, 1))); 2543 __ bind(L_fill_2); 2544 __ tbz(count, 1, L_fill_4); 2545 __ strh(value, Address(__ post(to, 2))); 2546 __ bind(L_fill_4); 2547 __ tbz(count, 2, L_exit2); 2548 __ strw(value, Address(to)); 2549 break; 2550 case T_SHORT: 2551 __ tbz(count, 0, L_fill_4); 2552 __ strh(value, Address(__ post(to, 2))); 2553 __ bind(L_fill_4); 2554 __ tbz(count, 1, L_exit2); 2555 __ strw(value, Address(to)); 2556 break; 2557 case T_INT: 2558 __ cbzw(count, L_exit2); 2559 __ strw(value, Address(to)); 2560 break; 2561 default: ShouldNotReachHere(); 2562 } 2563 __ bind(L_exit2); 2564 __ leave(); 2565 __ ret(lr); 2566 return start; 2567 } 2568 2569 address generate_data_cache_writeback() { 2570 const Register line = c_rarg0; // address of line to write back 2571 2572 __ align(CodeEntryAlignment); 2573 2574 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id; 2575 StubCodeMark mark(this, stub_id); 2576 2577 address start = __ pc(); 2578 __ enter(); 2579 __ cache_wb(Address(line, 0)); 2580 __ leave(); 2581 __ ret(lr); 2582 2583 return start; 2584 } 2585 2586 address generate_data_cache_writeback_sync() { 2587 const Register is_pre = c_rarg0; // pre or post sync 2588 2589 __ align(CodeEntryAlignment); 2590 2591 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id; 2592 StubCodeMark mark(this, stub_id); 2593 2594 // pre wbsync is a no-op 2595 // post wbsync translates to an sfence 2596 2597 Label skip; 2598 address start = __ pc(); 2599 __ enter(); 2600 __ cbnz(is_pre, skip); 2601 __ cache_wbsync(false); 2602 __ bind(skip); 2603 __ leave(); 2604 __ ret(lr); 2605 2606 return start; 2607 } 2608 2609 void generate_arraycopy_stubs() { 2610 address entry; 2611 address entry_jbyte_arraycopy; 2612 address entry_jshort_arraycopy; 2613 address entry_jint_arraycopy; 2614 address entry_oop_arraycopy; 2615 address entry_jlong_arraycopy; 2616 address entry_checkcast_arraycopy; 2617 2618 generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2619 generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2620 2621 generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2622 generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2623 2624 generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 
2625 generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2626 2627 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2628 2629 //*** jbyte 2630 // Always need aligned and unaligned versions 2631 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry); 2632 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2633 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry); 2634 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr); 2635 2636 //*** jshort 2637 // Always need aligned and unaligned versions 2638 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry); 2639 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2640 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry); 2641 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr); 2642 2643 //*** jint 2644 // Aligned versions 2645 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry); 2646 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2647 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2648 // entry_jint_arraycopy always points to the unaligned version 2649 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry); 2650 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy); 2651 2652 //*** jlong 2653 // It is always aligned 2654 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry); 2655 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2656 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2657 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2658 2659 //*** oops 2660 { 2661 // With compressed oops we need unaligned versions; notice that 2662 // we overwrite entry_oop_arraycopy. 
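      // (With compressed oops each element is a 4-byte narrowOop, so elements
      //  are only guaranteed 4-byte alignment and the 8-byte "aligned"
      //  assumption would not hold; hence aligned = !UseCompressedOops below.)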
2663 bool aligned = !UseCompressedOops; 2664 2665 StubRoutines::_arrayof_oop_disjoint_arraycopy 2666 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry); 2667 StubRoutines::_arrayof_oop_arraycopy 2668 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2669 // Aligned versions without pre-barriers 2670 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2671 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2672 StubRoutines::_arrayof_oop_arraycopy_uninit 2673 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2674 } 2675 2676 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2677 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2678 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2679 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2680 2681 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2682 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr); 2683 2684 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2685 entry_jshort_arraycopy, 2686 entry_jint_arraycopy, 2687 entry_jlong_arraycopy); 2688 2689 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2690 entry_jshort_arraycopy, 2691 entry_jint_arraycopy, 2692 entry_oop_arraycopy, 2693 entry_jlong_arraycopy, 2694 entry_checkcast_arraycopy); 2695 2696 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 2697 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 2698 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 2699 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 2700 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 2701 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 2702 } 2703 2704 void generate_math_stubs() { Unimplemented(); } 2705 2706 // Arguments: 2707 // 2708 // Inputs: 2709 // c_rarg0 - source byte array address 2710 // c_rarg1 - destination byte array address 2711 // c_rarg2 - K (key) in little endian int array 2712 // 2713 address generate_aescrypt_encryptBlock() { 2714 __ align(CodeEntryAlignment); 2715 StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2716 StubCodeMark mark(this, stub_id); 2717 2718 const Register from = c_rarg0; // source array address 2719 const Register to = c_rarg1; // destination array address 2720 const Register key = c_rarg2; // key array address 2721 const Register keylen = rscratch1; 2722 2723 address start = __ pc(); 2724 __ enter(); 2725 2726 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2727 2728 __ aesenc_loadkeys(key, keylen); 2729 __ aesecb_encrypt(from, to, keylen); 2730 2731 __ mov(r0, 0); 2732 2733 __ leave(); 2734 __ ret(lr); 2735 2736 return start; 2737 } 2738 2739 // Arguments: 2740 // 2741 // Inputs: 2742 // c_rarg0 - source byte array address 2743 // c_rarg1 - destination byte array address 2744 // c_rarg2 - K (key) in little endian int array 2745 // 2746 address generate_aescrypt_decryptBlock() { 2747 assert(UseAES, "need 
AES cryptographic extension support"); 2748 __ align(CodeEntryAlignment); 2749 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2750 StubCodeMark mark(this, stub_id); 2751 Label L_doLast; 2752 2753 const Register from = c_rarg0; // source array address 2754 const Register to = c_rarg1; // destination array address 2755 const Register key = c_rarg2; // key array address 2756 const Register keylen = rscratch1; 2757 2758 address start = __ pc(); 2759 __ enter(); // required for proper stackwalking of RuntimeStub frame 2760 2761 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2762 2763 __ aesecb_decrypt(from, to, key, keylen); 2764 2765 __ mov(r0, 0); 2766 2767 __ leave(); 2768 __ ret(lr); 2769 2770 return start; 2771 } 2772 2773 // Arguments: 2774 // 2775 // Inputs: 2776 // c_rarg0 - source byte array address 2777 // c_rarg1 - destination byte array address 2778 // c_rarg2 - K (key) in little endian int array 2779 // c_rarg3 - r vector byte array address 2780 // c_rarg4 - input length 2781 // 2782 // Output: 2783 // x0 - input length 2784 // 2785 address generate_cipherBlockChaining_encryptAESCrypt() { 2786 assert(UseAES, "need AES cryptographic extension support"); 2787 __ align(CodeEntryAlignment); 2788 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id; 2789 StubCodeMark mark(this, stub_id); 2790 2791 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2792 2793 const Register from = c_rarg0; // source array address 2794 const Register to = c_rarg1; // destination array address 2795 const Register key = c_rarg2; // key array address 2796 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2797 // and left with the results of the last encryption block 2798 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2799 const Register keylen = rscratch1; 2800 2801 address start = __ pc(); 2802 2803 __ enter(); 2804 2805 __ movw(rscratch2, len_reg); 2806 2807 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2808 2809 __ ld1(v0, __ T16B, rvec); 2810 2811 __ cmpw(keylen, 52); 2812 __ br(Assembler::CC, L_loadkeys_44); 2813 __ br(Assembler::EQ, L_loadkeys_52); 2814 2815 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2816 __ rev32(v17, __ T16B, v17); 2817 __ rev32(v18, __ T16B, v18); 2818 __ BIND(L_loadkeys_52); 2819 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2820 __ rev32(v19, __ T16B, v19); 2821 __ rev32(v20, __ T16B, v20); 2822 __ BIND(L_loadkeys_44); 2823 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2824 __ rev32(v21, __ T16B, v21); 2825 __ rev32(v22, __ T16B, v22); 2826 __ rev32(v23, __ T16B, v23); 2827 __ rev32(v24, __ T16B, v24); 2828 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2829 __ rev32(v25, __ T16B, v25); 2830 __ rev32(v26, __ T16B, v26); 2831 __ rev32(v27, __ T16B, v27); 2832 __ rev32(v28, __ T16B, v28); 2833 __ ld1(v29, v30, v31, __ T16B, key); 2834 __ rev32(v29, __ T16B, v29); 2835 __ rev32(v30, __ T16B, v30); 2836 __ rev32(v31, __ T16B, v31); 2837 2838 __ BIND(L_aes_loop); 2839 __ ld1(v1, __ T16B, __ post(from, 16)); 2840 __ eor(v0, __ T16B, v0, v1); 2841 2842 __ br(Assembler::CC, L_rounds_44); 2843 __ br(Assembler::EQ, L_rounds_52); 2844 2845 __ aese(v0, v17); __ aesmc(v0, v0); 2846 __ aese(v0, v18); __ aesmc(v0, v0); 2847 __ BIND(L_rounds_52); 2848 __ aese(v0, v19); __ aesmc(v0, v0); 2849 __ aese(v0, v20); 
__ aesmc(v0, v0); 2850 __ BIND(L_rounds_44); 2851 __ aese(v0, v21); __ aesmc(v0, v0); 2852 __ aese(v0, v22); __ aesmc(v0, v0); 2853 __ aese(v0, v23); __ aesmc(v0, v0); 2854 __ aese(v0, v24); __ aesmc(v0, v0); 2855 __ aese(v0, v25); __ aesmc(v0, v0); 2856 __ aese(v0, v26); __ aesmc(v0, v0); 2857 __ aese(v0, v27); __ aesmc(v0, v0); 2858 __ aese(v0, v28); __ aesmc(v0, v0); 2859 __ aese(v0, v29); __ aesmc(v0, v0); 2860 __ aese(v0, v30); 2861 __ eor(v0, __ T16B, v0, v31); 2862 2863 __ st1(v0, __ T16B, __ post(to, 16)); 2864 2865 __ subw(len_reg, len_reg, 16); 2866 __ cbnzw(len_reg, L_aes_loop); 2867 2868 __ st1(v0, __ T16B, rvec); 2869 2870 __ mov(r0, rscratch2); 2871 2872 __ leave(); 2873 __ ret(lr); 2874 2875 return start; 2876 } 2877 2878 // Arguments: 2879 // 2880 // Inputs: 2881 // c_rarg0 - source byte array address 2882 // c_rarg1 - destination byte array address 2883 // c_rarg2 - K (key) in little endian int array 2884 // c_rarg3 - r vector byte array address 2885 // c_rarg4 - input length 2886 // 2887 // Output: 2888 // r0 - input length 2889 // 2890 address generate_cipherBlockChaining_decryptAESCrypt() { 2891 assert(UseAES, "need AES cryptographic extension support"); 2892 __ align(CodeEntryAlignment); 2893 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id; 2894 StubCodeMark mark(this, stub_id); 2895 2896 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2897 2898 const Register from = c_rarg0; // source array address 2899 const Register to = c_rarg1; // destination array address 2900 const Register key = c_rarg2; // key array address 2901 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2902 // and left with the results of the last encryption block 2903 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2904 const Register keylen = rscratch1; 2905 2906 address start = __ pc(); 2907 2908 __ enter(); 2909 2910 __ movw(rscratch2, len_reg); 2911 2912 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2913 2914 __ ld1(v2, __ T16B, rvec); 2915 2916 __ ld1(v31, __ T16B, __ post(key, 16)); 2917 __ rev32(v31, __ T16B, v31); 2918 2919 __ cmpw(keylen, 52); 2920 __ br(Assembler::CC, L_loadkeys_44); 2921 __ br(Assembler::EQ, L_loadkeys_52); 2922 2923 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2924 __ rev32(v17, __ T16B, v17); 2925 __ rev32(v18, __ T16B, v18); 2926 __ BIND(L_loadkeys_52); 2927 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2928 __ rev32(v19, __ T16B, v19); 2929 __ rev32(v20, __ T16B, v20); 2930 __ BIND(L_loadkeys_44); 2931 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2932 __ rev32(v21, __ T16B, v21); 2933 __ rev32(v22, __ T16B, v22); 2934 __ rev32(v23, __ T16B, v23); 2935 __ rev32(v24, __ T16B, v24); 2936 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2937 __ rev32(v25, __ T16B, v25); 2938 __ rev32(v26, __ T16B, v26); 2939 __ rev32(v27, __ T16B, v27); 2940 __ rev32(v28, __ T16B, v28); 2941 __ ld1(v29, v30, __ T16B, key); 2942 __ rev32(v29, __ T16B, v29); 2943 __ rev32(v30, __ T16B, v30); 2944 2945 __ BIND(L_aes_loop); 2946 __ ld1(v0, __ T16B, __ post(from, 16)); 2947 __ orr(v1, __ T16B, v0, v0); 2948 2949 __ br(Assembler::CC, L_rounds_44); 2950 __ br(Assembler::EQ, L_rounds_52); 2951 2952 __ aesd(v0, v17); __ aesimc(v0, v0); 2953 __ aesd(v0, v18); __ aesimc(v0, v0); 2954 __ BIND(L_rounds_52); 2955 __ aesd(v0, v19); __ aesimc(v0, v0); 2956 __ aesd(v0, v20); __ aesimc(v0, v0); 2957 
__ BIND(L_rounds_44); 2958 __ aesd(v0, v21); __ aesimc(v0, v0); 2959 __ aesd(v0, v22); __ aesimc(v0, v0); 2960 __ aesd(v0, v23); __ aesimc(v0, v0); 2961 __ aesd(v0, v24); __ aesimc(v0, v0); 2962 __ aesd(v0, v25); __ aesimc(v0, v0); 2963 __ aesd(v0, v26); __ aesimc(v0, v0); 2964 __ aesd(v0, v27); __ aesimc(v0, v0); 2965 __ aesd(v0, v28); __ aesimc(v0, v0); 2966 __ aesd(v0, v29); __ aesimc(v0, v0); 2967 __ aesd(v0, v30); 2968 __ eor(v0, __ T16B, v0, v31); 2969 __ eor(v0, __ T16B, v0, v2); 2970 2971 __ st1(v0, __ T16B, __ post(to, 16)); 2972 __ orr(v2, __ T16B, v1, v1); 2973 2974 __ subw(len_reg, len_reg, 16); 2975 __ cbnzw(len_reg, L_aes_loop); 2976 2977 __ st1(v2, __ T16B, rvec); 2978 2979 __ mov(r0, rscratch2); 2980 2981 __ leave(); 2982 __ ret(lr); 2983 2984 return start; 2985 } 2986 2987 // Big-endian 128-bit + 64-bit -> 128-bit addition. 2988 // Inputs: 128-bits. in is preserved. 2989 // The least-significant 64-bit word is in the upper dword of each vector. 2990 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 2991 // Output: result 2992 void be_add_128_64(FloatRegister result, FloatRegister in, 2993 FloatRegister inc, FloatRegister tmp) { 2994 assert_different_registers(result, tmp, inc); 2995 2996 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 2997 // input 2998 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 2999 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3000 // MSD == 0 (must be!) to LSD 3001 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3002 } 3003 3004 // CTR AES crypt. 3005 // Arguments: 3006 // 3007 // Inputs: 3008 // c_rarg0 - source byte array address 3009 // c_rarg1 - destination byte array address 3010 // c_rarg2 - K (key) in little endian int array 3011 // c_rarg3 - counter vector byte array address 3012 // c_rarg4 - input length 3013 // c_rarg5 - saved encryptedCounter start 3014 // c_rarg6 - saved used length 3015 // 3016 // Output: 3017 // r0 - input length 3018 // 3019 address generate_counterMode_AESCrypt() { 3020 const Register in = c_rarg0; 3021 const Register out = c_rarg1; 3022 const Register key = c_rarg2; 3023 const Register counter = c_rarg3; 3024 const Register saved_len = c_rarg4, len = r10; 3025 const Register saved_encrypted_ctr = c_rarg5; 3026 const Register used_ptr = c_rarg6, used = r12; 3027 3028 const Register offset = r7; 3029 const Register keylen = r11; 3030 3031 const unsigned char block_size = 16; 3032 const int bulk_width = 4; 3033 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3034 // performance with larger data sizes, but it also means that the 3035 // fast path isn't used until you have at least 8 blocks, and up 3036 // to 127 bytes of data will be executed on the slow path. For 3037 // that reason, and also so as not to blow away too much icache, 4 3038 // blocks seems like a sensible compromise. 
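    // Concretely: with bulk_width == 4 and 16-byte blocks, the wide path
    // engages once at least 4 * 16 = 64 bytes remain; shorter inputs are
    // encrypted a block at a time, and any trailing partial block is handled
    // byte by byte.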
3039 3040 // Algorithm: 3041 // 3042 // if (len == 0) { 3043 // goto DONE; 3044 // } 3045 // int result = len; 3046 // do { 3047 // if (used >= blockSize) { 3048 // if (len >= bulk_width * blockSize) { 3049 // CTR_large_block(); 3050 // if (len == 0) 3051 // goto DONE; 3052 // } 3053 // for (;;) { 3054 // 16ByteVector v0 = counter; 3055 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3056 // used = 0; 3057 // if (len < blockSize) 3058 // break; /* goto NEXT */ 3059 // 16ByteVector v1 = load16Bytes(in, offset); 3060 // v1 = v1 ^ encryptedCounter; 3061 // store16Bytes(out, offset); 3062 // used = blockSize; 3063 // offset += blockSize; 3064 // len -= blockSize; 3065 // if (len == 0) 3066 // goto DONE; 3067 // } 3068 // } 3069 // NEXT: 3070 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3071 // len--; 3072 // } while (len != 0); 3073 // DONE: 3074 // return result; 3075 // 3076 // CTR_large_block() 3077 // Wide bulk encryption of whole blocks. 3078 3079 __ align(CodeEntryAlignment); 3080 StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id; 3081 StubCodeMark mark(this, stub_id); 3082 const address start = __ pc(); 3083 __ enter(); 3084 3085 Label DONE, CTR_large_block, large_block_return; 3086 __ ldrw(used, Address(used_ptr)); 3087 __ cbzw(saved_len, DONE); 3088 3089 __ mov(len, saved_len); 3090 __ mov(offset, 0); 3091 3092 // Compute #rounds for AES based on the length of the key array 3093 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3094 3095 __ aesenc_loadkeys(key, keylen); 3096 3097 { 3098 Label L_CTR_loop, NEXT; 3099 3100 __ bind(L_CTR_loop); 3101 3102 __ cmp(used, block_size); 3103 __ br(__ LO, NEXT); 3104 3105 // Maybe we have a lot of data 3106 __ subsw(rscratch1, len, bulk_width * block_size); 3107 __ br(__ HS, CTR_large_block); 3108 __ BIND(large_block_return); 3109 __ cbzw(len, DONE); 3110 3111 // Setup the counter 3112 __ movi(v4, __ T4S, 0); 3113 __ movi(v5, __ T4S, 1); 3114 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3115 3116 // 128-bit big-endian increment 3117 __ ld1(v0, __ T16B, counter); 3118 __ rev64(v16, __ T16B, v0); 3119 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3120 __ rev64(v16, __ T16B, v16); 3121 __ st1(v16, __ T16B, counter); 3122 // Previous counter value is in v0 3123 // v4 contains { 0, 1 } 3124 3125 { 3126 // We have fewer than bulk_width blocks of data left. Encrypt 3127 // them one by one until there is less than a full block 3128 // remaining, being careful to save both the encrypted counter 3129 // and the counter. 3130 3131 Label inner_loop; 3132 __ bind(inner_loop); 3133 // Counter to encrypt is in v0 3134 __ aesecb_encrypt(noreg, noreg, keylen); 3135 __ st1(v0, __ T16B, saved_encrypted_ctr); 3136 3137 // Do we have a remaining full block? 
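      // (It does iff len >= block_size; if not, fall through to NEXT and
      //  consume the freshly encrypted counter one byte at a time.)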
3138 3139 __ mov(used, 0); 3140 __ cmp(len, block_size); 3141 __ br(__ LO, NEXT); 3142 3143 // Yes, we have a full block 3144 __ ldrq(v1, Address(in, offset)); 3145 __ eor(v1, __ T16B, v1, v0); 3146 __ strq(v1, Address(out, offset)); 3147 __ mov(used, block_size); 3148 __ add(offset, offset, block_size); 3149 3150 __ subw(len, len, block_size); 3151 __ cbzw(len, DONE); 3152 3153 // Increment the counter, store it back 3154 __ orr(v0, __ T16B, v16, v16); 3155 __ rev64(v16, __ T16B, v16); 3156 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3157 __ rev64(v16, __ T16B, v16); 3158 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3159 3160 __ b(inner_loop); 3161 } 3162 3163 __ BIND(NEXT); 3164 3165 // Encrypt a single byte, and loop. 3166 // We expect this to be a rare event. 3167 __ ldrb(rscratch1, Address(in, offset)); 3168 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3169 __ eor(rscratch1, rscratch1, rscratch2); 3170 __ strb(rscratch1, Address(out, offset)); 3171 __ add(offset, offset, 1); 3172 __ add(used, used, 1); 3173 __ subw(len, len,1); 3174 __ cbnzw(len, L_CTR_loop); 3175 } 3176 3177 __ bind(DONE); 3178 __ strw(used, Address(used_ptr)); 3179 __ mov(r0, saved_len); 3180 3181 __ leave(); // required for proper stackwalking of RuntimeStub frame 3182 __ ret(lr); 3183 3184 // Bulk encryption 3185 3186 __ BIND (CTR_large_block); 3187 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3188 3189 if (bulk_width == 8) { 3190 __ sub(sp, sp, 4 * 16); 3191 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3192 } 3193 __ sub(sp, sp, 4 * 16); 3194 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3195 RegSet saved_regs = (RegSet::of(in, out, offset) 3196 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3197 __ push(saved_regs, sp); 3198 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3199 __ add(in, in, offset); 3200 __ add(out, out, offset); 3201 3202 // Keys should already be loaded into the correct registers 3203 3204 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3205 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3206 3207 // AES/CTR loop 3208 { 3209 Label L_CTR_loop; 3210 __ BIND(L_CTR_loop); 3211 3212 // Setup the counters 3213 __ movi(v8, __ T4S, 0); 3214 __ movi(v9, __ T4S, 1); 3215 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3216 3217 for (int i = 0; i < bulk_width; i++) { 3218 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3219 __ rev64(v0_ofs, __ T16B, v16); 3220 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3221 } 3222 3223 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3224 3225 // Encrypt the counters 3226 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3227 3228 if (bulk_width == 8) { 3229 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3230 } 3231 3232 // XOR the encrypted counters with the inputs 3233 for (int i = 0; i < bulk_width; i++) { 3234 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3235 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3236 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3237 } 3238 3239 // Write the encrypted data 3240 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3241 if (bulk_width == 8) { 3242 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3243 } 3244 3245 __ subw(len, len, 16 * bulk_width); 3246 __ cbnzw(len, L_CTR_loop); 3247 } 3248 3249 // Save the counter back where it goes 3250 __ rev64(v16, __ T16B, v16); 3251 __ st1(v16, __ T16B, counter); 3252 3253 __ pop(saved_regs, sp); 
3254 3255 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3256 if (bulk_width == 8) { 3257 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3258 } 3259 3260 __ andr(rscratch1, len, -16 * bulk_width); 3261 __ sub(len, len, rscratch1); 3262 __ add(offset, offset, rscratch1); 3263 __ mov(used, 16); 3264 __ strw(used, Address(used_ptr)); 3265 __ b(large_block_return); 3266 3267 return start; 3268 } 3269 3270 // Vector AES Galois Counter Mode implementation. Parameters: 3271 // 3272 // in = c_rarg0 3273 // len = c_rarg1 3274 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3275 // out = c_rarg3 3276 // key = c_rarg4 3277 // state = c_rarg5 - GHASH.state 3278 // subkeyHtbl = c_rarg6 - powers of H 3279 // counter = c_rarg7 - 16 bytes of CTR 3280 // return - number of processed bytes 3281 address generate_galoisCounterMode_AESCrypt() { 3282 address ghash_polynomial = __ pc(); 3283 __ emit_int64(0x87); // The low-order bits of the field 3284 // polynomial (i.e. p = z^7+z^2+z+1) 3285 // repeated in the low and high parts of a 3286 // 128-bit vector 3287 __ emit_int64(0x87); 3288 3289 __ align(CodeEntryAlignment); 3290 StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id; 3291 StubCodeMark mark(this, stub_id); 3292 address start = __ pc(); 3293 __ enter(); 3294 3295 const Register in = c_rarg0; 3296 const Register len = c_rarg1; 3297 const Register ct = c_rarg2; 3298 const Register out = c_rarg3; 3299 // and updated with the incremented counter in the end 3300 3301 const Register key = c_rarg4; 3302 const Register state = c_rarg5; 3303 3304 const Register subkeyHtbl = c_rarg6; 3305 3306 const Register counter = c_rarg7; 3307 3308 const Register keylen = r10; 3309 // Save state before entering routine 3310 __ sub(sp, sp, 4 * 16); 3311 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3312 __ sub(sp, sp, 4 * 16); 3313 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3314 3315 // __ andr(len, len, -512); 3316 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3317 __ str(len, __ pre(sp, -2 * wordSize)); 3318 3319 Label DONE; 3320 __ cbz(len, DONE); 3321 3322 // Compute #rounds for AES based on the length of the key array 3323 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3324 3325 __ aesenc_loadkeys(key, keylen); 3326 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3327 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3328 3329 // AES/CTR loop 3330 { 3331 Label L_CTR_loop; 3332 __ BIND(L_CTR_loop); 3333 3334 // Setup the counters 3335 __ movi(v8, __ T4S, 0); 3336 __ movi(v9, __ T4S, 1); 3337 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3338 3339 assert(v0->encoding() < v8->encoding(), ""); 3340 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3341 FloatRegister f = as_FloatRegister(i); 3342 __ rev32(f, __ T16B, v16); 3343 __ addv(v16, __ T4S, v16, v8); 3344 } 3345 3346 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3347 3348 // Encrypt the counters 3349 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3350 3351 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3352 3353 // XOR the encrypted counters with the inputs 3354 for (int i = 0; i < 8; i++) { 3355 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3356 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3357 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3358 } 3359 __ st1(v0, v1, v2, v3, __ T16B, __ 
post(out, 4 * 16)); 3360 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3361 3362 __ subw(len, len, 16 * 8); 3363 __ cbnzw(len, L_CTR_loop); 3364 } 3365 3366 __ rev32(v16, __ T16B, v16); 3367 __ st1(v16, __ T16B, counter); 3368 3369 __ ldr(len, Address(sp)); 3370 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3371 3372 // GHASH/CTR loop 3373 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3374 len, /*unrolls*/4); 3375 3376 #ifdef ASSERT 3377 { Label L; 3378 __ cmp(len, (unsigned char)0); 3379 __ br(Assembler::EQ, L); 3380 __ stop("stubGenerator: abort"); 3381 __ bind(L); 3382 } 3383 #endif 3384 3385 __ bind(DONE); 3386 // Return the number of bytes processed 3387 __ ldr(r0, __ post(sp, 2 * wordSize)); 3388 3389 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3390 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3391 3392 __ leave(); // required for proper stackwalking of RuntimeStub frame 3393 __ ret(lr); 3394 return start; 3395 } 3396 3397 class Cached64Bytes { 3398 private: 3399 MacroAssembler *_masm; 3400 Register _regs[8]; 3401 3402 public: 3403 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3404 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3405 auto it = rs.begin(); 3406 for (auto &r: _regs) { 3407 r = *it; 3408 ++it; 3409 } 3410 } 3411 3412 void gen_loads(Register base) { 3413 for (int i = 0; i < 8; i += 2) { 3414 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3415 } 3416 } 3417 3418 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3419 void extract_u32(Register dest, int i) { 3420 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3421 } 3422 }; 3423 3424 // Utility routines for md5. 3425 // Clobbers r10 and r11. 
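  // Each helper performs one MD5 round (in the style of RFC 1321); with f one of
  //   F(b,c,d) = (b & c) | (~b & d)
  //   G(b,c,d) = (b & d) | (c & ~d)
  //   H(b,c,d) = b ^ c ^ d
  //   I(b,c,d) = c ^ (b | ~d)
  // the update computed is, in C-like form:
  //   r1 = r2 + rotate_left(r1 + f(r2, r3, r4) + x[k] + t, s);
  // where x[k] is the k-th 32-bit word of the cached 64-byte input block.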
3426 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3427 int k, int s, int t) { 3428 Register rscratch3 = r10; 3429 Register rscratch4 = r11; 3430 3431 __ eorw(rscratch3, r3, r4); 3432 __ movw(rscratch2, t); 3433 __ andw(rscratch3, rscratch3, r2); 3434 __ addw(rscratch4, r1, rscratch2); 3435 reg_cache.extract_u32(rscratch1, k); 3436 __ eorw(rscratch3, rscratch3, r4); 3437 __ addw(rscratch4, rscratch4, rscratch1); 3438 __ addw(rscratch3, rscratch3, rscratch4); 3439 __ rorw(rscratch2, rscratch3, 32 - s); 3440 __ addw(r1, rscratch2, r2); 3441 } 3442 3443 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3444 int k, int s, int t) { 3445 Register rscratch3 = r10; 3446 Register rscratch4 = r11; 3447 3448 reg_cache.extract_u32(rscratch1, k); 3449 __ movw(rscratch2, t); 3450 __ addw(rscratch4, r1, rscratch2); 3451 __ addw(rscratch4, rscratch4, rscratch1); 3452 __ bicw(rscratch2, r3, r4); 3453 __ andw(rscratch3, r2, r4); 3454 __ addw(rscratch2, rscratch2, rscratch4); 3455 __ addw(rscratch2, rscratch2, rscratch3); 3456 __ rorw(rscratch2, rscratch2, 32 - s); 3457 __ addw(r1, rscratch2, r2); 3458 } 3459 3460 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3461 int k, int s, int t) { 3462 Register rscratch3 = r10; 3463 Register rscratch4 = r11; 3464 3465 __ eorw(rscratch3, r3, r4); 3466 __ movw(rscratch2, t); 3467 __ addw(rscratch4, r1, rscratch2); 3468 reg_cache.extract_u32(rscratch1, k); 3469 __ eorw(rscratch3, rscratch3, r2); 3470 __ addw(rscratch4, rscratch4, rscratch1); 3471 __ addw(rscratch3, rscratch3, rscratch4); 3472 __ rorw(rscratch2, rscratch3, 32 - s); 3473 __ addw(r1, rscratch2, r2); 3474 } 3475 3476 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3477 int k, int s, int t) { 3478 Register rscratch3 = r10; 3479 Register rscratch4 = r11; 3480 3481 __ movw(rscratch3, t); 3482 __ ornw(rscratch2, r2, r4); 3483 __ addw(rscratch4, r1, rscratch3); 3484 reg_cache.extract_u32(rscratch1, k); 3485 __ eorw(rscratch3, rscratch2, r3); 3486 __ addw(rscratch4, rscratch4, rscratch1); 3487 __ addw(rscratch3, rscratch3, rscratch4); 3488 __ rorw(rscratch2, rscratch3, 32 - s); 3489 __ addw(r1, rscratch2, r2); 3490 } 3491 3492 // Arguments: 3493 // 3494 // Inputs: 3495 // c_rarg0 - byte[] source+offset 3496 // c_rarg1 - int[] SHA.state 3497 // c_rarg2 - int offset 3498 // c_rarg3 - int limit 3499 // 3500 address generate_md5_implCompress(StubGenStubId stub_id) { 3501 bool multi_block; 3502 switch (stub_id) { 3503 case md5_implCompress_id: 3504 multi_block = false; 3505 break; 3506 case md5_implCompressMB_id: 3507 multi_block = true; 3508 break; 3509 default: 3510 ShouldNotReachHere(); 3511 } 3512 __ align(CodeEntryAlignment); 3513 3514 StubCodeMark mark(this, stub_id); 3515 address start = __ pc(); 3516 3517 Register buf = c_rarg0; 3518 Register state = c_rarg1; 3519 Register ofs = c_rarg2; 3520 Register limit = c_rarg3; 3521 Register a = r4; 3522 Register b = r5; 3523 Register c = r6; 3524 Register d = r7; 3525 Register rscratch3 = r10; 3526 Register rscratch4 = r11; 3527 3528 Register state_regs[2] = { r12, r13 }; 3529 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3530 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3531 3532 __ push(saved_regs, sp); 3533 3534 __ ldp(state_regs[0], state_regs[1], Address(state)); 3535 __ ubfx(a, state_regs[0], 0, 32); 3536 __ ubfx(b, state_regs[0], 32, 32); 3537 __ 
ubfx(c, state_regs[1], 0, 32); 3538 __ ubfx(d, state_regs[1], 32, 32); 3539 3540 Label md5_loop; 3541 __ BIND(md5_loop); 3542 3543 reg_cache.gen_loads(buf); 3544 3545 // Round 1 3546 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3547 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3548 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3549 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3550 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3551 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3552 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3553 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3554 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3555 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3556 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3557 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3558 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3559 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3560 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3561 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3562 3563 // Round 2 3564 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3565 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3566 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3567 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3568 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3569 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3570 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3571 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3572 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3573 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3574 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3575 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3576 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3577 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3578 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3579 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3580 3581 // Round 3 3582 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3583 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3584 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3585 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3586 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3587 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3588 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3589 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3590 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3591 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3592 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3593 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3594 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3595 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3596 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3597 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3598 3599 // Round 4 3600 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3601 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3602 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3603 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3604 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3605 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3606 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3607 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3608 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3609 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3610 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3611 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3612 
md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3613 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3614 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3615 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3616 3617 __ addw(a, state_regs[0], a); 3618 __ ubfx(rscratch2, state_regs[0], 32, 32); 3619 __ addw(b, rscratch2, b); 3620 __ addw(c, state_regs[1], c); 3621 __ ubfx(rscratch4, state_regs[1], 32, 32); 3622 __ addw(d, rscratch4, d); 3623 3624 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3625 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3626 3627 if (multi_block) { 3628 __ add(buf, buf, 64); 3629 __ add(ofs, ofs, 64); 3630 __ cmp(ofs, limit); 3631 __ br(Assembler::LE, md5_loop); 3632 __ mov(c_rarg0, ofs); // return ofs 3633 } 3634 3635 // write hash values back in the correct order 3636 __ stp(state_regs[0], state_regs[1], Address(state)); 3637 3638 __ pop(saved_regs, sp); 3639 3640 __ ret(lr); 3641 3642 return start; 3643 } 3644 3645 // Arguments: 3646 // 3647 // Inputs: 3648 // c_rarg0 - byte[] source+offset 3649 // c_rarg1 - int[] SHA.state 3650 // c_rarg2 - int offset 3651 // c_rarg3 - int limit 3652 // 3653 address generate_sha1_implCompress(StubGenStubId stub_id) { 3654 bool multi_block; 3655 switch (stub_id) { 3656 case sha1_implCompress_id: 3657 multi_block = false; 3658 break; 3659 case sha1_implCompressMB_id: 3660 multi_block = true; 3661 break; 3662 default: 3663 ShouldNotReachHere(); 3664 } 3665 3666 __ align(CodeEntryAlignment); 3667 3668 StubCodeMark mark(this, stub_id); 3669 address start = __ pc(); 3670 3671 Register buf = c_rarg0; 3672 Register state = c_rarg1; 3673 Register ofs = c_rarg2; 3674 Register limit = c_rarg3; 3675 3676 Label keys; 3677 Label sha1_loop; 3678 3679 // load the keys into v0..v3 3680 __ adr(rscratch1, keys); 3681 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3682 // load 5 words state into v6, v7 3683 __ ldrq(v6, Address(state, 0)); 3684 __ ldrs(v7, Address(state, 16)); 3685 3686 3687 __ BIND(sha1_loop); 3688 // load 64 bytes of data into v16..v19 3689 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3690 __ rev32(v16, __ T16B, v16); 3691 __ rev32(v17, __ T16B, v17); 3692 __ rev32(v18, __ T16B, v18); 3693 __ rev32(v19, __ T16B, v19); 3694 3695 // do the sha1 3696 __ addv(v4, __ T4S, v16, v0); 3697 __ orr(v20, __ T16B, v6, v6); 3698 3699 FloatRegister d0 = v16; 3700 FloatRegister d1 = v17; 3701 FloatRegister d2 = v18; 3702 FloatRegister d3 = v19; 3703 3704 for (int round = 0; round < 20; round++) { 3705 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3706 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3707 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3708 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3709 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3710 3711 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3712 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3713 __ sha1h(tmp2, __ T4S, v20); 3714 if (round < 5) 3715 __ sha1c(v20, __ T4S, tmp3, tmp4); 3716 else if (round < 10 || round >= 15) 3717 __ sha1p(v20, __ T4S, tmp3, tmp4); 3718 else 3719 __ sha1m(v20, __ T4S, tmp3, tmp4); 3720 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3721 3722 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3723 } 3724 3725 __ addv(v7, __ T2S, v7, v21); 3726 __ addv(v6, __ T4S, v6, v20); 3727 3728 if (multi_block) { 3729 __ add(ofs, ofs, 64); 3730 __ cmp(ofs, limit); 3731 __ br(Assembler::LE, sha1_loop); 3732 __ mov(c_rarg0, ofs); // return ofs 3733 } 3734 3735 __ strq(v6, Address(state, 0)); 3736 __ strs(v7, Address(state, 16)); 3737 3738 __ ret(lr); 3739 3740 __ bind(keys); 3741 __ emit_int32(0x5a827999); 3742 __ emit_int32(0x6ed9eba1); 3743 __ emit_int32(0x8f1bbcdc); 3744 __ emit_int32(0xca62c1d6); 3745 3746 return start; 3747 } 3748 3749 3750 // Arguments: 3751 // 3752 // Inputs: 3753 // c_rarg0 - byte[] source+offset 3754 // c_rarg1 - int[] SHA.state 3755 // c_rarg2 - int offset 3756 // c_rarg3 - int limit 3757 // 3758 address generate_sha256_implCompress(StubGenStubId stub_id) { 3759 bool multi_block; 3760 switch (stub_id) { 3761 case sha256_implCompress_id: 3762 multi_block = false; 3763 break; 3764 case sha256_implCompressMB_id: 3765 multi_block = true; 3766 break; 3767 default: 3768 ShouldNotReachHere(); 3769 } 3770 3771 static const uint32_t round_consts[64] = { 3772 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3773 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3774 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3775 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3776 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3777 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3778 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3779 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3780 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3781 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3782 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3783 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3784 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3785 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3786 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3787 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3788 }; 3789 3790 __ align(CodeEntryAlignment); 3791 3792 StubCodeMark mark(this, stub_id); 3793 address start = __ pc(); 3794 3795 Register buf = c_rarg0; 3796 Register state = c_rarg1; 3797 Register ofs = c_rarg2; 3798 Register limit = c_rarg3; 3799 3800 Label sha1_loop; 3801 3802 __ stpd(v8, v9, __ pre(sp, -32)); 3803 __ stpd(v10, v11, Address(sp, 16)); 3804 3805 // dga == v0 3806 // dgb == v1 3807 // dg0 == v2 3808 // dg1 == v3 3809 // dg2 == v4 3810 // t0 == v6 3811 // t1 == v7 3812 3813 // load 16 keys to v16..v31 3814 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3815 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3816 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3817 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3818 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3819 3820 // load 8 words (256 bits) state 3821 __ ldpq(v0, v1, state); 3822 3823 __ BIND(sha1_loop); 3824 // load 64 bytes of data into v8..v11 3825 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3826 __ rev32(v8, __ T16B, v8); 3827 __ rev32(v9, __ T16B, v9); 3828 __ rev32(v10, __ T16B, v10); 3829 __ rev32(v11, __ T16B, v11); 3830 3831 __ addv(v6, __ T4S, v8, v16); 3832 __ orr(v2, __ T16B, v0, v0); 3833 __ orr(v3, __ T16B, v1, v1); 3834 3835 FloatRegister d0 = v8; 3836 FloatRegister d1 = v9; 3837 FloatRegister d2 = v10; 3838 FloatRegister d3 = v11; 3839 3840 3841 for (int round = 0; round < 16; round++) { 3842 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3843 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3844 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3845 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3846 3847 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3848 __ orr(v4, __ T16B, v2, v2); 3849 if (round < 15) 3850 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3851 __ sha256h(v2, __ T4S, v3, tmp2); 3852 __ sha256h2(v3, __ T4S, v4, tmp2); 3853 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3854 3855 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3856 } 3857 3858 __ addv(v0, __ T4S, v0, v2); 3859 __ addv(v1, __ T4S, v1, v3); 3860 3861 if (multi_block) { 3862 __ add(ofs, ofs, 64); 3863 __ cmp(ofs, limit); 3864 __ br(Assembler::LE, sha1_loop); 3865 __ mov(c_rarg0, ofs); // return ofs 3866 } 3867 3868 __ ldpd(v10, v11, Address(sp, 16)); 3869 __ ldpd(v8, v9, __ post(sp, 32)); 3870 3871 __ stpq(v0, v1, state); 3872 3873 __ ret(lr); 3874 3875 return start; 3876 } 3877 3878 // Double rounds for sha512. 3879 void sha512_dround(int dr, 3880 FloatRegister vi0, FloatRegister vi1, 3881 FloatRegister vi2, FloatRegister vi3, 3882 FloatRegister vi4, FloatRegister vrc0, 3883 FloatRegister vrc1, FloatRegister vin0, 3884 FloatRegister vin1, FloatRegister vin2, 3885 FloatRegister vin3, FloatRegister vin4) { 3886 if (dr < 36) { 3887 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3888 } 3889 __ addv(v5, __ T2D, vrc0, vin0); 3890 __ ext(v6, __ T16B, vi2, vi3, 8); 3891 __ ext(v5, __ T16B, v5, v5, 8); 3892 __ ext(v7, __ T16B, vi1, vi2, 8); 3893 __ addv(vi3, __ T2D, vi3, v5); 3894 if (dr < 32) { 3895 __ ext(v5, __ T16B, vin3, vin4, 8); 3896 __ sha512su0(vin0, __ T2D, vin1); 3897 } 3898 __ sha512h(vi3, __ T2D, v6, v7); 3899 if (dr < 32) { 3900 __ sha512su1(vin0, __ T2D, vin2, v5); 3901 } 3902 __ addv(vi4, __ T2D, vi1, vi3); 3903 __ sha512h2(vi3, __ T2D, vi1, vi0); 3904 } 3905 3906 // Arguments: 3907 // 3908 // Inputs: 3909 // c_rarg0 - byte[] source+offset 3910 // c_rarg1 - int[] SHA.state 3911 // c_rarg2 - int offset 3912 // c_rarg3 - int limit 3913 // 3914 address generate_sha512_implCompress(StubGenStubId stub_id) { 3915 bool multi_block; 3916 switch (stub_id) { 3917 case sha512_implCompress_id: 3918 multi_block = false; 3919 break; 3920 case sha512_implCompressMB_id: 3921 multi_block = true; 3922 break; 3923 default: 3924 ShouldNotReachHere(); 3925 } 3926 3927 static const uint64_t round_consts[80] = { 3928 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3929 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3930 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3931 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3932 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3933 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3934 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3935 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3936 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3937 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 
3938 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3939 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3940 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3941 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3942 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3943 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3944 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3945 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3946 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3947 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3948 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3949 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3950 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3951 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3952 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3953 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3954 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3955 }; 3956 3957 __ align(CodeEntryAlignment); 3958 3959 StubCodeMark mark(this, stub_id); 3960 address start = __ pc(); 3961 3962 Register buf = c_rarg0; 3963 Register state = c_rarg1; 3964 Register ofs = c_rarg2; 3965 Register limit = c_rarg3; 3966 3967 __ stpd(v8, v9, __ pre(sp, -64)); 3968 __ stpd(v10, v11, Address(sp, 16)); 3969 __ stpd(v12, v13, Address(sp, 32)); 3970 __ stpd(v14, v15, Address(sp, 48)); 3971 3972 Label sha512_loop; 3973 3974 // load state 3975 __ ld1(v8, v9, v10, v11, __ T2D, state); 3976 3977 // load first 4 round constants 3978 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3979 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3980 3981 __ BIND(sha512_loop); 3982 // load 128B of data into v12..v19 3983 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3984 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3985 __ rev64(v12, __ T16B, v12); 3986 __ rev64(v13, __ T16B, v13); 3987 __ rev64(v14, __ T16B, v14); 3988 __ rev64(v15, __ T16B, v15); 3989 __ rev64(v16, __ T16B, v16); 3990 __ rev64(v17, __ T16B, v17); 3991 __ rev64(v18, __ T16B, v18); 3992 __ rev64(v19, __ T16B, v19); 3993 3994 __ mov(rscratch2, rscratch1); 3995 3996 __ mov(v0, __ T16B, v8); 3997 __ mov(v1, __ T16B, v9); 3998 __ mov(v2, __ T16B, v10); 3999 __ mov(v3, __ T16B, v11); 4000 4001 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4002 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4003 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4004 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4005 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4006 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4007 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4008 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4009 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4010 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4011 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4012 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4013 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4014 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, 
v14); 4015 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4016 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4017 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4018 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4019 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4020 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4021 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4022 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4023 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4024 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4025 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4026 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4027 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4028 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4029 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4030 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4031 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4032 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4033 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 4034 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4035 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4036 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4037 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4038 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4039 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4040 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4041 4042 __ addv(v8, __ T2D, v8, v0); 4043 __ addv(v9, __ T2D, v9, v1); 4044 __ addv(v10, __ T2D, v10, v2); 4045 __ addv(v11, __ T2D, v11, v3); 4046 4047 if (multi_block) { 4048 __ add(ofs, ofs, 128); 4049 __ cmp(ofs, limit); 4050 __ br(Assembler::LE, sha512_loop); 4051 __ mov(c_rarg0, ofs); // return ofs 4052 } 4053 4054 __ st1(v8, v9, v10, v11, __ T2D, state); 4055 4056 __ ldpd(v14, v15, Address(sp, 48)); 4057 __ ldpd(v12, v13, Address(sp, 32)); 4058 __ ldpd(v10, v11, Address(sp, 16)); 4059 __ ldpd(v8, v9, __ post(sp, 64)); 4060 4061 __ ret(lr); 4062 4063 return start; 4064 } 4065 4066 // Execute one round of keccak of two computations in parallel. 4067 // One of the states should be loaded into the lower halves of 4068 // the vector registers v0-v24, the other should be loaded into 4069 // the upper halves of those registers. The ld1r instruction loads 4070 // the round constant into both halves of register v31. 4071 // Intermediate results c0...c5 and d0...d5 are computed 4072 // in registers v25...v30. 4073 // All vector instructions that are used operate on both register 4074 // halves in parallel. 4075 // If only a single computation is needed, one can only load the lower halves. 
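  // Reference-only scalar sketch of one Keccak-f[1600] round on a single
  // uint64_t a[25] state with lane A[x][y] stored at a[x + 5*y] (illustrative,
  // not used by the generated stubs). keccak_round() below performs the same
  // theta, rho+pi, chi and iota steps, but on two interleaved states at once.
  static uint64_t keccak_rotl_ref(uint64_t v, int n) {
    return n == 0 ? v : (v << n) | (v >> (64 - n));
  }
  static void keccak_round_ref(uint64_t a[25], uint64_t rc) {
    // rotation offsets, indexed [x + 5*y]
    static const int r[25] = { 0,  1, 62, 28, 27,
                              36, 44,  6, 55, 20,
                               3, 10, 43, 25, 39,
                              41, 45, 15, 21,  8,
                              18,  2, 61, 56, 14 };
    uint64_t c[5], d[5], b[25];
    for (int x = 0; x < 5; x++) {                       // theta
      c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
    }
    for (int x = 0; x < 5; x++) {
      d[x] = c[(x + 4) % 5] ^ keccak_rotl_ref(c[(x + 1) % 5], 1);
    }
    for (int i = 0; i < 25; i++) {
      a[i] ^= d[i % 5];
    }
    for (int x = 0; x < 5; x++) {                       // rho and pi
      for (int y = 0; y < 5; y++) {
        b[y + 5 * ((2 * x + 3 * y) % 5)] = keccak_rotl_ref(a[x + 5 * y], r[x + 5 * y]);
      }
    }
    for (int x = 0; x < 5; x++) {                       // chi
      for (int y = 0; y < 5; y++) {
        a[x + 5 * y] = b[x + 5 * y] ^ (~b[(x + 1) % 5 + 5 * y] & b[(x + 2) % 5 + 5 * y]);
      }
    }
    a[0] ^= rc;                                         // iota
  }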
4076 void keccak_round(Register rscratch1) { 4077 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14 4078 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a16 ^ a11 4079 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^a13 4080 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10 4081 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12 4082 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24 4083 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21 4084 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23 4085 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20 4086 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22 4087 4088 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1) 4089 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1) 4090 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1) 4091 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1) 4092 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1) 4093 4094 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0 4095 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1) 4096 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol(a6^d1), 44) 4097 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20) 4098 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61) 4099 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39) 4100 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18) 4101 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62) 4102 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43) 4103 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25) 4104 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8) 4105 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56) 4106 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41) 4107 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27) 4108 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14) 4109 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2) 4110 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55) 4111 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45) 4112 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36) 4113 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28) 4114 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21) 4115 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15) 4116 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10) 4117 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6) 4118 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3) 4119 4120 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22') 4121 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23) 4122 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24) 4123 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20') 4124 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21') 4125 4126 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i] 4127 4128 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19) 4129 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15') 4130 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16) 4131 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17') 4132 __ bcax(v16, __ T16B, v16, v3, v25); 
// a16 = a16 ^ (~a17' & a18') 4133 4134 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4135 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4136 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4137 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4138 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4139 4140 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4141 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4142 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4143 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4144 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4145 4146 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4147 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4148 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4149 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4150 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4151 4152 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4153 } 4154 4155 // Arguments: 4156 // 4157 // Inputs: 4158 // c_rarg0 - byte[] source+offset 4159 // c_rarg1 - byte[] SHA.state 4160 // c_rarg2 - int block_size 4161 // c_rarg3 - int offset 4162 // c_rarg4 - int limit 4163 // 4164 address generate_sha3_implCompress(StubGenStubId stub_id) { 4165 bool multi_block; 4166 switch (stub_id) { 4167 case sha3_implCompress_id: 4168 multi_block = false; 4169 break; 4170 case sha3_implCompressMB_id: 4171 multi_block = true; 4172 break; 4173 default: 4174 ShouldNotReachHere(); 4175 } 4176 4177 static const uint64_t round_consts[24] = { 4178 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4179 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4180 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4181 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4182 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4183 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4184 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4185 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4186 }; 4187 4188 __ align(CodeEntryAlignment); 4189 4190 StubCodeMark mark(this, stub_id); 4191 address start = __ pc(); 4192 4193 Register buf = c_rarg0; 4194 Register state = c_rarg1; 4195 Register block_size = c_rarg2; 4196 Register ofs = c_rarg3; 4197 Register limit = c_rarg4; 4198 4199 Label sha3_loop, rounds24_loop; 4200 Label sha3_512_or_sha3_384, shake128; 4201 4202 __ stpd(v8, v9, __ pre(sp, -64)); 4203 __ stpd(v10, v11, Address(sp, 16)); 4204 __ stpd(v12, v13, Address(sp, 32)); 4205 __ stpd(v14, v15, Address(sp, 48)); 4206 4207 // load state 4208 __ add(rscratch1, state, 32); 4209 __ ld1(v0, v1, v2, v3, __ T1D, state); 4210 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4211 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4212 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4213 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4214 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4215 __ ld1(v24, __ T1D, rscratch1); 4216 4217 __ BIND(sha3_loop); 4218 4219 // 24 keccak rounds 4220 __ movw(rscratch2, 24); 4221 4222 // load round_constants base 4223 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4224 4225 // load input 4226 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4227 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 
24)); 4228 __ eor(v0, __ T8B, v0, v25); 4229 __ eor(v1, __ T8B, v1, v26); 4230 __ eor(v2, __ T8B, v2, v27); 4231 __ eor(v3, __ T8B, v3, v28); 4232 __ eor(v4, __ T8B, v4, v29); 4233 __ eor(v5, __ T8B, v5, v30); 4234 __ eor(v6, __ T8B, v6, v31); 4235 4236 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4237 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4238 4239 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4240 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4241 __ eor(v7, __ T8B, v7, v25); 4242 __ eor(v8, __ T8B, v8, v26); 4243 __ eor(v9, __ T8B, v9, v27); 4244 __ eor(v10, __ T8B, v10, v28); 4245 __ eor(v11, __ T8B, v11, v29); 4246 __ eor(v12, __ T8B, v12, v30); 4247 __ eor(v13, __ T8B, v13, v31); 4248 4249 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4250 __ eor(v14, __ T8B, v14, v25); 4251 __ eor(v15, __ T8B, v15, v26); 4252 __ eor(v16, __ T8B, v16, v27); 4253 4254 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4255 __ andw(c_rarg5, block_size, 48); 4256 __ cbzw(c_rarg5, rounds24_loop); 4257 4258 __ tbnz(block_size, 5, shake128); 4259 // block_size == 144, bit5 == 0, SHA3-224 4260 __ ldrd(v28, __ post(buf, 8)); 4261 __ eor(v17, __ T8B, v17, v28); 4262 __ b(rounds24_loop); 4263 4264 __ BIND(shake128); 4265 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4266 __ eor(v17, __ T8B, v17, v28); 4267 __ eor(v18, __ T8B, v18, v29); 4268 __ eor(v19, __ T8B, v19, v30); 4269 __ eor(v20, __ T8B, v20, v31); 4270 __ b(rounds24_loop); // block_size == 168, SHAKE128 4271 4272 __ BIND(sha3_512_or_sha3_384); 4273 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4274 __ eor(v7, __ T8B, v7, v25); 4275 __ eor(v8, __ T8B, v8, v26); 4276 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4277 4278 // SHA3-384 4279 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4280 __ eor(v9, __ T8B, v9, v27); 4281 __ eor(v10, __ T8B, v10, v28); 4282 __ eor(v11, __ T8B, v11, v29); 4283 __ eor(v12, __ T8B, v12, v30); 4284 4285 __ BIND(rounds24_loop); 4286 __ subw(rscratch2, rscratch2, 1); 4287 4288 keccak_round(rscratch1); 4289 4290 __ cbnzw(rscratch2, rounds24_loop); 4291 4292 if (multi_block) { 4293 __ add(ofs, ofs, block_size); 4294 __ cmp(ofs, limit); 4295 __ br(Assembler::LE, sha3_loop); 4296 __ mov(c_rarg0, ofs); // return ofs 4297 } 4298 4299 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4300 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4301 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4302 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4303 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4304 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4305 __ st1(v24, __ T1D, state); 4306 4307 // restore callee-saved registers 4308 __ ldpd(v14, v15, Address(sp, 48)); 4309 __ ldpd(v12, v13, Address(sp, 32)); 4310 __ ldpd(v10, v11, Address(sp, 16)); 4311 __ ldpd(v8, v9, __ post(sp, 64)); 4312 4313 __ ret(lr); 4314 4315 return start; 4316 } 4317 4318 // Inputs: 4319 // c_rarg0 - long[] state0 4320 // c_rarg1 - long[] state1 4321 address generate_double_keccak() { 4322 static const uint64_t round_consts[24] = { 4323 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4324 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4325 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4326 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4327 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4328 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4329 0x000000000000800AL, 
0x800000008000000AL, 0x8000000080008081L, 4330 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4331 }; 4332 4333 // Implements the double_keccak() method of the 4334 // sun.secyrity.provider.SHA3Parallel class 4335 __ align(CodeEntryAlignment); 4336 StubCodeMark mark(this, "StubRoutines", "double_keccak"); 4337 address start = __ pc(); 4338 __ enter(); 4339 4340 Register state0 = c_rarg0; 4341 Register state1 = c_rarg1; 4342 4343 Label rounds24_loop; 4344 4345 // save callee-saved registers 4346 __ stpd(v8, v9, __ pre(sp, -64)); 4347 __ stpd(v10, v11, Address(sp, 16)); 4348 __ stpd(v12, v13, Address(sp, 32)); 4349 __ stpd(v14, v15, Address(sp, 48)); 4350 4351 // load states 4352 __ add(rscratch1, state0, 32); 4353 __ ld4(v0, v1, v2, v3, __ D, 0, state0); 4354 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32)); 4355 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32)); 4356 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32)); 4357 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32)); 4358 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32)); 4359 __ ld1(v24, __ D, 0, rscratch1); 4360 __ add(rscratch1, state1, 32); 4361 __ ld4(v0, v1, v2, v3, __ D, 1, state1); 4362 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32)); 4363 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32)); 4364 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32)); 4365 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32)); 4366 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32)); 4367 __ ld1(v24, __ D, 1, rscratch1); 4368 4369 // 24 keccak rounds 4370 __ movw(rscratch2, 24); 4371 4372 // load round_constants base 4373 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4374 4375 __ BIND(rounds24_loop); 4376 __ subw(rscratch2, rscratch2, 1); 4377 keccak_round(rscratch1); 4378 __ cbnzw(rscratch2, rounds24_loop); 4379 4380 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32)); 4381 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32)); 4382 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32)); 4383 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32)); 4384 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32)); 4385 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32)); 4386 __ st1(v24, __ D, 0, state0); 4387 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32)); 4388 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32)); 4389 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32)); 4390 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32)); 4391 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32)); 4392 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32)); 4393 __ st1(v24, __ D, 1, state1); 4394 4395 // restore callee-saved vector registers 4396 __ ldpd(v14, v15, Address(sp, 48)); 4397 __ ldpd(v12, v13, Address(sp, 32)); 4398 __ ldpd(v10, v11, Address(sp, 16)); 4399 __ ldpd(v8, v9, __ post(sp, 64)); 4400 4401 __ leave(); // required for proper stackwalking of RuntimeStub frame 4402 __ mov(r0, zr); // return 0 4403 __ ret(lr); 4404 4405 return start; 4406 } 4407 4408 /** 4409 * Arguments: 4410 * 4411 * Inputs: 4412 * c_rarg0 - int crc 4413 * c_rarg1 - byte* buf 4414 * c_rarg2 - int length 4415 * 4416 * Output: 4417 * rax - int crc result 4418 */ 4419 address generate_updateBytesCRC32() { 4420 assert(UseCRC32Intrinsics, "what are we doing here?"); 4421 4422 __ align(CodeEntryAlignment); 4423 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id; 4424 StubCodeMark mark(this, stub_id); 4425 4426 address start = __ pc(); 4427 
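    // For reference (illustrative only, not the code kernel_crc32 emits):
    // java.util.zip.CRC32 is the standard reflected CRC-32 with polynomial
    // 0xEDB88320, i.e. for each input byte b
    //   crc ^= b;
    //   for (int k = 0; k < 8; k++) crc = (crc >> 1) ^ (0xEDB88320 & -(crc & 1));
    // with crc treated as a uint32_t. kernel_crc32 computes the same function,
    // just many bytes at a time.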
4428 const Register crc = c_rarg0; // crc 4429 const Register buf = c_rarg1; // source java byte array address 4430 const Register len = c_rarg2; // length 4431 const Register table0 = c_rarg3; // crc_table address 4432 const Register table1 = c_rarg4; 4433 const Register table2 = c_rarg5; 4434 const Register table3 = c_rarg6; 4435 const Register tmp3 = c_rarg7; 4436 4437 BLOCK_COMMENT("Entry:"); 4438 __ enter(); // required for proper stackwalking of RuntimeStub frame 4439 4440 __ kernel_crc32(crc, buf, len, 4441 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4442 4443 __ leave(); // required for proper stackwalking of RuntimeStub frame 4444 __ ret(lr); 4445 4446 return start; 4447 } 4448 4449 // ChaCha20 block function. This version parallelizes 4 quarter 4450 // round operations at a time. It uses 16 SIMD registers to 4451 // produce 4 blocks of key stream. 4452 // 4453 // state (int[16]) = c_rarg0 4454 // keystream (byte[256]) = c_rarg1 4455 // return - number of bytes of keystream (always 256) 4456 // 4457 // In this approach, we load the 512-bit start state sequentially into 4458 // 4 128-bit vectors. We then make 4 4-vector copies of that starting 4459 // state, with each successive set of 4 vectors having a +1 added into 4460 // the first 32-bit lane of the 4th vector in that group (the counter). 4461 // By doing this, we can perform the block function on 4 512-bit blocks 4462 // within one run of this intrinsic. 4463 // The alignment of the data across the 4-vector group is such that at 4464 // the start it is already aligned for the first round of each two-round 4465 // loop iteration. In other words, the corresponding lanes of each vector 4466 // will contain the values needed for that quarter round operation (e.g. 4467 // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.). 4468 // In between each full round, a lane shift must occur. Within a loop 4469 // iteration, between the first and second rounds, the 2nd, 3rd, and 4th 4470 // vectors are rotated left 32, 64 and 96 bits, respectively. The result 4471 // is effectively a diagonal orientation in columnar form. After the 4472 // second full round, those registers are left-rotated again, this time 4473 // 96, 64, and 32 bits - returning the vectors to their columnar organization. 4474 // After all 10 iterations, the original state is added to each 4-vector 4475 // working state along with the add mask, and the 4 vector groups are 4476 // sequentially written to the memory dedicated for the output key stream. 4477 // 4478 // For a more detailed explanation, see Goll and Gueron, "Vectorization of 4479 // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology: 4480 // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33 4481 address generate_chacha20Block_qrpar() { 4482 Label L_Q_twoRounds, L_Q_cc20_const; 4483 // The constant data is broken into two 128-bit segments to be loaded 4484 // onto SIMD registers. The first 128 bits are a counter add overlay 4485 // that adds +1/+0/+0/+0 to the vectors holding replicated state[12]. 4486 // The second 128-bits is a table constant used for 8-bit left rotations. 4487 // on 32-bit lanes within a SIMD register. 
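    // For reference, each quarter round generated by cc20_quarter_round is the
    // standard ChaCha20 quarter round (RFC 7539), applied lane-wise to 32-bit
    // words:
    //   a += b; d ^= a; d <<<= 16;
    //   c += d; b ^= c; b <<<= 12;
    //   a += b; d ^= a; d <<<=  8;
    //   c += d; b ^= c; b <<<=  7;
    // The 8-bit rotation moves whole bytes, so it can be done with a tbl byte
    // shuffle driven by the second 128-bit constant emitted below.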
4488 __ BIND(L_Q_cc20_const); 4489 __ emit_int64(0x0000000000000001UL); 4490 __ emit_int64(0x0000000000000000UL); 4491 __ emit_int64(0x0605040702010003UL); 4492 __ emit_int64(0x0E0D0C0F0A09080BUL); 4493 4494 __ align(CodeEntryAlignment); 4495 StubGenStubId stub_id = StubGenStubId::chacha20Block_id; 4496 StubCodeMark mark(this, stub_id); 4497 address start = __ pc(); 4498 __ enter(); 4499 4500 const Register state = c_rarg0; 4501 const Register keystream = c_rarg1; 4502 const Register loopCtr = r10; 4503 const Register tmpAddr = r11; 4504 4505 const FloatRegister aState = v0; 4506 const FloatRegister bState = v1; 4507 const FloatRegister cState = v2; 4508 const FloatRegister dState = v3; 4509 const FloatRegister a1Vec = v4; 4510 const FloatRegister b1Vec = v5; 4511 const FloatRegister c1Vec = v6; 4512 const FloatRegister d1Vec = v7; 4513 // Skip the callee-saved registers v8 - v15 4514 const FloatRegister a2Vec = v16; 4515 const FloatRegister b2Vec = v17; 4516 const FloatRegister c2Vec = v18; 4517 const FloatRegister d2Vec = v19; 4518 const FloatRegister a3Vec = v20; 4519 const FloatRegister b3Vec = v21; 4520 const FloatRegister c3Vec = v22; 4521 const FloatRegister d3Vec = v23; 4522 const FloatRegister a4Vec = v24; 4523 const FloatRegister b4Vec = v25; 4524 const FloatRegister c4Vec = v26; 4525 const FloatRegister d4Vec = v27; 4526 const FloatRegister scratch = v28; 4527 const FloatRegister addMask = v29; 4528 const FloatRegister lrot8Tbl = v30; 4529 4530 // Load the initial state in the first 4 quadword registers, 4531 // then copy the initial state into the next 4 quadword registers 4532 // that will be used for the working state. 4533 __ ld1(aState, bState, cState, dState, __ T16B, Address(state)); 4534 4535 // Load the index register for 2 constant 128-bit data fields. 4536 // The first represents the +1/+0/+0/+0 add mask. The second is 4537 // the 8-bit left rotation. 4538 __ adr(tmpAddr, L_Q_cc20_const); 4539 __ ldpq(addMask, lrot8Tbl, Address(tmpAddr)); 4540 4541 __ mov(a1Vec, __ T16B, aState); 4542 __ mov(b1Vec, __ T16B, bState); 4543 __ mov(c1Vec, __ T16B, cState); 4544 __ mov(d1Vec, __ T16B, dState); 4545 4546 __ mov(a2Vec, __ T16B, aState); 4547 __ mov(b2Vec, __ T16B, bState); 4548 __ mov(c2Vec, __ T16B, cState); 4549 __ addv(d2Vec, __ T4S, d1Vec, addMask); 4550 4551 __ mov(a3Vec, __ T16B, aState); 4552 __ mov(b3Vec, __ T16B, bState); 4553 __ mov(c3Vec, __ T16B, cState); 4554 __ addv(d3Vec, __ T4S, d2Vec, addMask); 4555 4556 __ mov(a4Vec, __ T16B, aState); 4557 __ mov(b4Vec, __ T16B, bState); 4558 __ mov(c4Vec, __ T16B, cState); 4559 __ addv(d4Vec, __ T4S, d3Vec, addMask); 4560 4561 // Set up the 10 iteration loop 4562 __ mov(loopCtr, 10); 4563 __ BIND(L_Q_twoRounds); 4564 4565 // The first set of operations on the vectors covers the first 4 quarter 4566 // round operations: 4567 // Qround(state, 0, 4, 8,12) 4568 // Qround(state, 1, 5, 9,13) 4569 // Qround(state, 2, 6,10,14) 4570 // Qround(state, 3, 7,11,15) 4571 __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl); 4572 __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl); 4573 __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl); 4574 __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl); 4575 4576 // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to 4577 // diagonals. The a1Vec does not need to change orientation. 
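    // Concretely (lane values shown as ChaCha state indices), the shuffles
    // rotate the b/c/d vectors left by one, two and three 32-bit lanes:
    //   b: { 4,  5,  6,  7} -> { 5,  6,  7,  4}
    //   c: { 8,  9, 10, 11} -> {10, 11,  8,  9}
    //   d: {12, 13, 14, 15} -> {15, 12, 13, 14}
    // so that lanes 0..3 now hold the diagonals 0/5/10/15, 1/6/11/12, 2/7/8/13
    // and 3/4/9/14 used by the second set of quarter rounds below.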
4578 __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true); 4579 __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true); 4580 __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true); 4581 __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true); 4582 4583 // The second set of operations on the vectors covers the second 4 quarter 4584 // round operations, now acting on the diagonals: 4585 // Qround(state, 0, 5,10,15) 4586 // Qround(state, 1, 6,11,12) 4587 // Qround(state, 2, 7, 8,13) 4588 // Qround(state, 3, 4, 9,14) 4589 __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl); 4590 __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl); 4591 __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl); 4592 __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl); 4593 4594 // Before we start the next iteration, we need to perform shuffles 4595 // on the b/c/d vectors to move them back to columnar organizations 4596 // from their current diagonal orientation. 4597 __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false); 4598 __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false); 4599 __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false); 4600 __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false); 4601 4602 // Decrement and iterate 4603 __ sub(loopCtr, loopCtr, 1); 4604 __ cbnz(loopCtr, L_Q_twoRounds); 4605 4606 // Once the counter reaches zero, we fall out of the loop 4607 // and need to add the initial state back into the working state 4608 // represented by the a/b/c/d1Vec registers. This is destructive 4609 // on the dState register but we no longer will need it. 4610 __ addv(a1Vec, __ T4S, a1Vec, aState); 4611 __ addv(b1Vec, __ T4S, b1Vec, bState); 4612 __ addv(c1Vec, __ T4S, c1Vec, cState); 4613 __ addv(d1Vec, __ T4S, d1Vec, dState); 4614 4615 __ addv(a2Vec, __ T4S, a2Vec, aState); 4616 __ addv(b2Vec, __ T4S, b2Vec, bState); 4617 __ addv(c2Vec, __ T4S, c2Vec, cState); 4618 __ addv(dState, __ T4S, dState, addMask); 4619 __ addv(d2Vec, __ T4S, d2Vec, dState); 4620 4621 __ addv(a3Vec, __ T4S, a3Vec, aState); 4622 __ addv(b3Vec, __ T4S, b3Vec, bState); 4623 __ addv(c3Vec, __ T4S, c3Vec, cState); 4624 __ addv(dState, __ T4S, dState, addMask); 4625 __ addv(d3Vec, __ T4S, d3Vec, dState); 4626 4627 __ addv(a4Vec, __ T4S, a4Vec, aState); 4628 __ addv(b4Vec, __ T4S, b4Vec, bState); 4629 __ addv(c4Vec, __ T4S, c4Vec, cState); 4630 __ addv(dState, __ T4S, dState, addMask); 4631 __ addv(d4Vec, __ T4S, d4Vec, dState); 4632 4633 // Write the final state back to the result buffer 4634 __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64)); 4635 __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64)); 4636 __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64)); 4637 __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64)); 4638 4639 __ mov(r0, 256); // Return length of output keystream 4640 __ leave(); 4641 __ ret(lr); 4642 4643 return start; 4644 } 4645 4646 void dilithium_load16zetas(int o0, Register zetas) { 4647 __ ldpq(as_FloatRegister(o0), as_FloatRegister(o0 + 1), __ post (zetas, 32)); 4648 __ ldpq(as_FloatRegister(o0 + 2), as_FloatRegister(o0 + 3), __ post (zetas, 32)); 4649 4650 } 4651 4652 void dilithium_load32zetas(Register zetas) { 4653 dilithium_load16zetas(16, zetas); 4654 dilithium_load16zetas(20, zetas); 4655 } 4656 4657 // 2x16 32-bit Montgomery multiplications in parallel 4658 // See the montMul() method of the sun.security.provider.ML_DSA class. 4659 // Here MONT_R_BITS is 32, so the right shift by it is implicit. 
4660 // The constants qInv = MONT_Q_INV_MOD_R and q = MONT_Q are loaded in 4661 // (all 32-bit chunks of) vector registers v30 and v31, resp. 4662 // The inputs are b[i]s in v0-v7 and c[i]s v16-v23 and 4663 // the results are a[i]s in v16-v23, four 32-bit values in each register 4664 // and we do a_i = b_i * c_i * 2^-32 mod MONT_Q for all 4665 void dilithium_montmul32(bool by_constant) { 4666 FloatRegister vr0 = by_constant ? v29 : v0; 4667 FloatRegister vr1 = by_constant ? v29 : v1; 4668 FloatRegister vr2 = by_constant ? v29 : v2; 4669 FloatRegister vr3 = by_constant ? v29 : v3; 4670 FloatRegister vr4 = by_constant ? v29 : v4; 4671 FloatRegister vr5 = by_constant ? v29 : v5; 4672 FloatRegister vr6 = by_constant ? v29 : v6; 4673 FloatRegister vr7 = by_constant ? v29 : v7; 4674 4675 __ sqdmulh(v24, __ T4S, vr0, v16); // aHigh = hi32(2 * b * c) 4676 __ mulv(v16, __ T4S, vr0, v16); // aLow = lo32(b * c) 4677 __ sqdmulh(v25, __ T4S, vr1, v17); 4678 __ mulv(v17, __ T4S, vr1, v17); 4679 __ sqdmulh(v26, __ T4S, vr2, v18); 4680 __ mulv(v18, __ T4S, vr2, v18); 4681 __ sqdmulh(v27, __ T4S, vr3, v19); 4682 __ mulv(v19, __ T4S, vr3, v19); 4683 4684 __ mulv(v16, __ T4S, v16, v30); // m = aLow * qinv 4685 __ mulv(v17, __ T4S, v17, v30); 4686 __ mulv(v18, __ T4S, v18, v30); 4687 __ mulv(v19, __ T4S, v19, v30); 4688 4689 __ sqdmulh(v16, __ T4S, v16, v31); // n = hi32(2 * m * q) 4690 __ sqdmulh(v17, __ T4S, v17, v31); 4691 __ sqdmulh(v18, __ T4S, v18, v31); 4692 __ sqdmulh(v19, __ T4S, v19, v31); 4693 4694 __ shsubv(v16, __ T4S, v24, v16); // a = (aHigh - n) / 2 4695 __ shsubv(v17, __ T4S, v25, v17); 4696 __ shsubv(v18, __ T4S, v26, v18); 4697 __ shsubv(v19, __ T4S, v27, v19); 4698 4699 __ sqdmulh(v24, __ T4S, vr4, v20); 4700 __ mulv(v20, __ T4S, vr4, v20); 4701 __ sqdmulh(v25, __ T4S, vr5, v21); 4702 __ mulv(v21, __ T4S, vr5, v21); 4703 __ sqdmulh(v26, __ T4S, vr6, v22); 4704 __ mulv(v22, __ T4S, vr6, v22); 4705 __ sqdmulh(v27, __ T4S, vr7, v23); 4706 __ mulv(v23, __ T4S, vr7, v23); 4707 4708 __ mulv(v20, __ T4S, v20, v30); 4709 __ mulv(v21, __ T4S, v21, v30); 4710 __ mulv(v22, __ T4S, v22, v30); 4711 __ mulv(v23, __ T4S, v23, v30); 4712 4713 __ sqdmulh(v20, __ T4S, v20, v31); 4714 __ sqdmulh(v21, __ T4S, v21, v31); 4715 __ sqdmulh(v22, __ T4S, v22, v31); 4716 __ sqdmulh(v23, __ T4S, v23, v31); 4717 4718 __ shsubv(v20, __ T4S, v24, v20); 4719 __ shsubv(v21, __ T4S, v25, v21); 4720 __ shsubv(v22, __ T4S, v26, v22); 4721 __ shsubv(v23, __ T4S, v27, v23); 4722 } 4723 4724 // Do the addition and subtraction done in the ntt algorithm. 4725 // See sun.security.provider.ML_DSA.implDilithiumAlmostNttJava() 4726 void dilithium_add_sub32() { 4727 __ addv(v24, __ T4S, v0, v16); // coeffs[j] = coeffs[j] + tmp; 4728 __ addv(v25, __ T4S, v1, v17); 4729 __ addv(v26, __ T4S, v2, v18); 4730 __ addv(v27, __ T4S, v3, v19); 4731 __ addv(v28, __ T4S, v4, v20); 4732 __ addv(v29, __ T4S, v5, v21); 4733 __ addv(v30, __ T4S, v6, v22); 4734 __ addv(v31, __ T4S, v7, v23); 4735 4736 __ subv(v0, __ T4S, v0, v16); // coeffs[j + l] = coeffs[j] - tmp; 4737 __ subv(v1, __ T4S, v1, v17); 4738 __ subv(v2, __ T4S, v2, v18); 4739 __ subv(v3, __ T4S, v3, v19); 4740 __ subv(v4, __ T4S, v4, v20); 4741 __ subv(v5, __ T4S, v5, v21); 4742 __ subv(v6, __ T4S, v6, v22); 4743 __ subv(v7, __ T4S, v7, v23); 4744 } 4745 4746 // Do the same computation that 4747 // dilithium_montmul32() and dilithium_add_sub32() does, 4748 // except for only 4x4 32-bit vector elements and with 4749 // different register usage. 
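  // Reference-only scalar sketch of the Montgomery multiplication emitted by
  // dilithium_montmul32() above and dilithium_montmul_sub_add16() below
  // (illustrative, not used by the generated stubs; q and qInv stand for
  // MONT_Q and MONT_Q_INV_MOD_R). It returns b * c * 2^-32 mod q. The vector
  // code obtains the same value with sqdmulh (hi32 of the doubled product),
  // mulv (low 32 bits) and shsubv (halving subtract).
  static int32_t dilithium_montmul_ref(int32_t b, int32_t c, int32_t q, int32_t qInv) {
    int64_t prod  = (int64_t)b * c;
    int32_t aHigh = (int32_t)(prod >> 32);                        // hi32(b * c)
    int32_t m     = (int32_t)((uint32_t)prod * (uint32_t)qInv);   // lo32(lo32(b * c) * qInv)
    int32_t n     = (int32_t)(((int64_t)m * q) >> 32);            // hi32(m * q)
    return aHigh - n;                                             // == b * c * 2^-32 (mod q)
  }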
4750 void dilithium_montmul_sub_add16() { 4751 __ sqdmulh(v24, __ T4S, v1, v16); 4752 __ mulv(v16, __ T4S, v1, v16); 4753 __ sqdmulh(v25, __ T4S, v3, v17); 4754 __ mulv(v17, __ T4S, v3, v17); 4755 __ sqdmulh(v26, __ T4S, v5, v18); 4756 __ mulv(v18, __ T4S, v5, v18); 4757 __ sqdmulh(v27, __ T4S, v7, v19); 4758 __ mulv(v19, __ T4S, v7, v19); 4759 4760 __ mulv(v16, __ T4S, v16, v30); 4761 __ mulv(v17, __ T4S, v17, v30); 4762 __ mulv(v18, __ T4S, v18, v30); 4763 __ mulv(v19, __ T4S, v19, v30); 4764 4765 __ sqdmulh(v16, __ T4S, v16, v31); 4766 __ sqdmulh(v17, __ T4S, v17, v31); 4767 __ sqdmulh(v18, __ T4S, v18, v31); 4768 __ sqdmulh(v19, __ T4S, v19, v31); 4769 4770 __ shsubv(v16, __ T4S, v24, v16); 4771 __ shsubv(v17, __ T4S, v25, v17); 4772 __ shsubv(v18, __ T4S, v26, v18); 4773 __ shsubv(v19, __ T4S, v27, v19); 4774 4775 __ subv(v1, __ T4S, v0, v16); 4776 __ subv(v3, __ T4S, v2, v17); 4777 __ subv(v5, __ T4S, v4, v18); 4778 __ subv(v7, __ T4S, v6, v19); 4779 4780 __ addv(v0, __ T4S, v0, v16); 4781 __ addv(v2, __ T4S, v2, v17); 4782 __ addv(v4, __ T4S, v4, v18); 4783 __ addv(v6, __ T4S, v6, v19); 4784 } 4785 4786 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 4787 // in the Java implementation come in sequences of at least 8, so we 4788 // can use ldpq to collect the corresponding data into pairs of vector 4789 // registers. 4790 // We collect the coefficients corresponding to the 'j+l' indexes into 4791 // the vector registers v0-v7, the zetas into the vector registers v16-v23 4792 // then we do the (Montgomery) multiplications by the zetas in parallel 4793 // into v16-v23, load the coeffs corresponding to the 'j' indexes into 4794 // v0-v7, then do the additions into v24-v31 and the subtractions into 4795 // v0-v7 and finally save the results back to the coeffs array. 
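  // For orientation, the Java loop being vectorized here looks roughly like
  // (a sketch of sun.security.provider.ML_DSA.implDilithiumAlmostNttJava();
  // the zeta bookkeeping is elided):
  //   for (int l = 128; l > 0; l /= 2)            // levels 0-4 cover l = 128 down to 8
  //     for (int s = 0; s < 256; s += 2 * l)
  //       for (int j = s; j < s + l; j++) {
  //         int tmp = montMul(MONT_ZETAS_FOR_NTT[m++], coeffs[j + l]);
  //         coeffs[j + l] = coeffs[j] - tmp;
  //         coeffs[j]     = coeffs[j] + tmp;
  //       }
  // Each pass through the 'i' loop below performs 32 of these butterflies,
  // so the four passes of a level cover all 128 butterflies of that level.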
4796 void dilithiumNttLevel0_4(const Register dilithiumConsts, 4797 const Register coeffs, const Register zetas) { 4798 int c1 = 0; 4799 int c2 = 512; 4800 int startIncr; 4801 int incr1 = 32; 4802 int incr2 = 64; 4803 int incr3 = 96; 4804 4805 for (int level = 0; level < 5; level++) { 4806 int c1Start = c1; 4807 int c2Start = c2; 4808 if (level == 3) { 4809 incr1 = 32; 4810 incr2 = 128; 4811 incr3 = 160; 4812 } else if (level == 4) { 4813 incr1 = 64; 4814 incr2 = 128; 4815 incr3 = 192; 4816 } 4817 4818 for (int i = 0; i < 4; i++) { 4819 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 4820 __ ldpq(v0, v1, Address(coeffs, c2Start)); 4821 __ ldpq(v2, v3, Address(coeffs, c2Start + incr1)); 4822 __ ldpq(v4, v5, Address(coeffs, c2Start + incr2)); 4823 __ ldpq(v6, v7, Address(coeffs, c2Start + incr3)); 4824 dilithium_load32zetas(zetas); 4825 dilithium_montmul32(false); 4826 __ ldpq(v0, v1, Address(coeffs, c1Start)); 4827 __ ldpq(v2, v3, Address(coeffs, c1Start + incr1)); 4828 __ ldpq(v4, v5, Address(coeffs, c1Start + incr2)); 4829 __ ldpq(v6, v7, Address(coeffs, c1Start + incr3)); 4830 dilithium_add_sub32(); 4831 __ stpq(v24, v25, Address(coeffs, c1Start)); 4832 __ stpq(v26, v27, Address(coeffs, c1Start + incr1)); 4833 __ stpq(v28, v29, Address(coeffs, c1Start + incr2)); 4834 __ stpq(v30, v31, Address(coeffs, c1Start + incr3)); 4835 __ stpq(v0, v1, Address(coeffs, c2Start)); 4836 __ stpq(v2, v3, Address(coeffs, c2Start + incr1)); 4837 __ stpq(v4, v5, Address(coeffs, c2Start + incr2)); 4838 __ stpq(v6, v7, Address(coeffs, c2Start + incr3)); 4839 4840 int k = 4 * level + i; 4841 4842 if (k > 7) { 4843 startIncr = 256; 4844 } else if (k == 5) { 4845 startIncr = 384; 4846 } else { 4847 startIncr = 128; 4848 } 4849 4850 c1Start += startIncr; 4851 c2Start += startIncr; 4852 } 4853 4854 c2 /= 2; 4855 } 4856 } 4857 4858 // Dilithium NTT function except for the final "normalization" to |coeff| < Q. 
4859 // Implements the method 4860 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {} 4861 // of the Java class sun.security.provider 4862 // 4863 // coeffs (int[256]) = c_rarg0 4864 // zetas (int[256]) = c_rarg1 4865 address generate_dilithiumAlmostNtt() { 4866 4867 __ align(CodeEntryAlignment); 4868 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id; 4869 StubCodeMark mark(this, stub_id); 4870 address start = __ pc(); 4871 __ enter(); 4872 4873 const Register coeffs = c_rarg0; 4874 const Register zetas = c_rarg1; 4875 4876 const Register tmpAddr = r9; 4877 const Register dilithiumConsts = r10; 4878 const Register result = r11; 4879 4880 __ add(result, coeffs, 0); 4881 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 4882 4883 // Each level represents one iteration of the outer for loop of the Java version 4884 4885 // level 0-4 4886 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas); 4887 4888 // level 5 4889 for (int i = 0; i < 1024; i += 256) { 4890 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 4891 __ ldr(v0, __ Q, Address(coeffs, i + 16)); 4892 __ ldr(v1, __ Q, Address(coeffs, i + 48)); 4893 __ ldr(v2, __ Q, Address(coeffs, i + 80)); 4894 __ ldr(v3, __ Q, Address(coeffs, i + 112)); 4895 __ ldr(v4, __ Q, Address(coeffs, i + 144)); 4896 __ ldr(v5, __ Q, Address(coeffs, i + 176)); 4897 __ ldr(v6, __ Q, Address(coeffs, i + 208)); 4898 __ ldr(v7, __ Q, Address(coeffs, i + 240)); 4899 dilithium_load32zetas(zetas); 4900 dilithium_montmul32(false); 4901 __ ldr(v0, __ Q, Address(coeffs, i)); 4902 __ ldr(v1, __ Q, Address(coeffs, i + 32)); 4903 __ ldr(v2, __ Q, Address(coeffs, i + 64)); 4904 __ ldr(v3, __ Q, Address(coeffs, i + 96)); 4905 __ ldr(v4, __ Q, Address(coeffs, i + 128)); 4906 __ ldr(v5, __ Q, Address(coeffs, i + 160)); 4907 __ ldr(v6, __ Q, Address(coeffs, i + 192)); 4908 __ ldr(v7, __ Q, Address(coeffs, i + 224)); 4909 dilithium_add_sub32(); 4910 __ str(v24, __ Q, Address(coeffs, i)); 4911 __ str(v25, __ Q, Address(coeffs, i + 32)); 4912 __ str(v26, __ Q, Address(coeffs, i + 64)); 4913 __ str(v27, __ Q, Address(coeffs, i + 96)); 4914 __ str(v28, __ Q, Address(coeffs, i + 128)); 4915 __ str(v29, __ Q, Address(coeffs, i + 160)); 4916 __ str(v30, __ Q, Address(coeffs, i + 192)); 4917 __ str(v31, __ Q, Address(coeffs, i + 224)); 4918 __ str(v0, __ Q, Address(coeffs, i + 16)); 4919 __ str(v1, __ Q, Address(coeffs, i + 48)); 4920 __ str(v2, __ Q, Address(coeffs, i + 80)); 4921 __ str(v3, __ Q, Address(coeffs, i + 112)); 4922 __ str(v4, __ Q, Address(coeffs, i + 144)); 4923 __ str(v5, __ Q, Address(coeffs, i + 176)); 4924 __ str(v6, __ Q, Address(coeffs, i + 208)); 4925 __ str(v7, __ Q, Address(coeffs, i + 240)); 4926 } 4927 4928 // level 6 4929 for (int i = 0; i < 1024; i += 128) { 4930 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 4931 __ add(tmpAddr, coeffs, i); 4932 __ ld2(v0, v1, __ T2D, tmpAddr); 4933 __ add(tmpAddr, coeffs, i + 32); 4934 __ ld2(v2, v3, __ T2D, tmpAddr); 4935 __ add(tmpAddr, coeffs, i + 64); 4936 __ ld2(v4, v5, __ T2D, tmpAddr); 4937 __ add(tmpAddr, coeffs, i + 96); 4938 __ ld2(v6, v7, __ T2D, tmpAddr); 4939 dilithium_load16zetas(16, zetas); 4940 dilithium_montmul_sub_add16(); 4941 __ add(tmpAddr, coeffs, i); 4942 __ st2(v0, v1, __ T2D, tmpAddr); 4943 __ add(tmpAddr, coeffs, i + 32); 4944 __ st2(v2, v3, __ T2D, tmpAddr); 4945 __ add(tmpAddr, coeffs, i + 64); 4946 __ st2(v4, v5, __ T2D, tmpAddr); 4947 __ add(tmpAddr, coeffs, i + 96); 4948 __ st2(v6, v7, __ T2D, tmpAddr); 
4949 } 4950 4951 // level 7 4952 for (int i = 0; i < 1024; i += 128) { 4953 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 4954 __ add(tmpAddr, coeffs, i); 4955 __ ld2(v0, v1, __ T4S, tmpAddr); 4956 __ add(tmpAddr, coeffs, i + 32); 4957 __ ld2(v2, v3, __ T4S, tmpAddr); 4958 __ add(tmpAddr, coeffs, i + 64); 4959 __ ld2(v4, v5, __ T4S, tmpAddr); 4960 __ add(tmpAddr, coeffs, i + 96); 4961 __ ld2(v6, v7, __ T4S, tmpAddr); 4962 dilithium_load16zetas(16, zetas); 4963 dilithium_montmul_sub_add16(); 4964 __ add(tmpAddr, coeffs, i); 4965 __ st2(v0, v1, __ T4S, tmpAddr); 4966 __ add(tmpAddr, coeffs, i + 32); 4967 __ st2(v2, v3, __ T4S, tmpAddr); 4968 __ add(tmpAddr, coeffs, i + 64); 4969 __ st2(v4, v5, __ T4S, tmpAddr); 4970 __ add(tmpAddr, coeffs, i + 96); 4971 __ st2(v6, v7, __ T4S, tmpAddr); 4972 } 4973 __ leave(); // required for proper stackwalking of RuntimeStub frame 4974 __ mov(r0, zr); // return 0 4975 __ ret(lr); 4976 4977 return start; 4978 4979 } 4980 4981 // Do the computations that can be found in the body of the loop in 4982 // sun.security.provider.ML_DSA.implDilithiumAlmostInverseNttJava() 4983 // for 16 coefficients in parallel: 4984 // tmp = coeffs[j]; 4985 // coeffs[j] = (tmp + coeffs[j + l]); 4986 // coeffs[j + l] = montMul(tmp - coeffs[j + l], -MONT_ZETAS_FOR_NTT[m]); 4987 // coefss[j]s are loaded in v0, v2, v4 and v6, 4988 // coeffs[j + l]s in v1, v3, v5 and v7, 4989 // the corresponding zetas in v16, v17, v18 and v19. 4990 void dilithium_sub_add_montmul16() { 4991 __ subv(v20, __ T4S, v0, v1); 4992 __ subv(v21, __ T4S, v2, v3); 4993 __ subv(v22, __ T4S, v4, v5); 4994 __ subv(v23, __ T4S, v6, v7); 4995 4996 __ addv(v0, __ T4S, v0, v1); 4997 __ addv(v2, __ T4S, v2, v3); 4998 __ addv(v4, __ T4S, v4, v5); 4999 __ addv(v6, __ T4S, v6, v7); 5000 5001 __ sqdmulh(v24, __ T4S, v20, v16); // aHigh = hi32(2 * b * c) 5002 __ mulv(v1, __ T4S, v20, v16); // aLow = lo32(b * c) 5003 __ sqdmulh(v25, __ T4S, v21, v17); 5004 __ mulv(v3, __ T4S, v21, v17); 5005 __ sqdmulh(v26, __ T4S, v22, v18); 5006 __ mulv(v5, __ T4S, v22, v18); 5007 __ sqdmulh(v27, __ T4S, v23, v19); 5008 __ mulv(v7, __ T4S, v23, v19); 5009 5010 __ mulv(v1, __ T4S, v1, v30); // m = (aLow * q) 5011 __ mulv(v3, __ T4S, v3, v30); 5012 __ mulv(v5, __ T4S, v5, v30); 5013 __ mulv(v7, __ T4S, v7, v30); 5014 5015 __ sqdmulh(v1, __ T4S, v1, v31); // n = hi32(2 * m * q) 5016 __ sqdmulh(v3, __ T4S, v3, v31); 5017 __ sqdmulh(v5, __ T4S, v5, v31); 5018 __ sqdmulh(v7, __ T4S, v7, v31); 5019 5020 __ shsubv(v1, __ T4S, v24, v1); // a = (aHigh - n) / 2 5021 __ shsubv(v3, __ T4S, v25, v3); 5022 __ shsubv(v5, __ T4S, v26, v5); 5023 __ shsubv(v7, __ T4S, v27, v7); 5024 } 5025 5026 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 5027 // in the Java implementation come in sequences of at least 8, so we 5028 // can use ldpq to collect the corresponding data into pairs of vector 5029 // registers 5030 // We collect the coefficients that correspond to the 'j's into v0-v7 5031 // the coefficiets that correspond to the 'j+l's into v16-v23 then 5032 // do the additions into v24-v31 and the subtractions into v0-v7 then 5033 // save the result of the additions, load the zetas into v16-v23 5034 // do the (Montgomery) multiplications by zeta in parallel into v16-v23 5035 // finally save the results back to the coeffs array 5036 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts, 5037 const Register coeffs, const Register zetas) { 5038 int c1 = 0; 5039 int c2 = 32; 5040 int startIncr; 5041 int incr1; 5042 int 
incr2; 5043 int incr3; 5044 5045 for (int level = 3; level < 8; level++) { 5046 int c1Start = c1; 5047 int c2Start = c2; 5048 if (level == 3) { 5049 incr1 = 64; 5050 incr2 = 128; 5051 incr3 = 192; 5052 } else if (level == 4) { 5053 incr1 = 32; 5054 incr2 = 128; 5055 incr3 = 160; 5056 } else { 5057 incr1 = 32; 5058 incr2 = 64; 5059 incr3 = 96; 5060 } 5061 5062 for (int i = 0; i < 4; i++) { 5063 __ ldpq(v0, v1, Address(coeffs, c1Start)); 5064 __ ldpq(v2, v3, Address(coeffs, c1Start + incr1)); 5065 __ ldpq(v4, v5, Address(coeffs, c1Start + incr2)); 5066 __ ldpq(v6, v7, Address(coeffs, c1Start + incr3)); 5067 __ ldpq(v16, v17, Address(coeffs, c2Start)); 5068 __ ldpq(v18, v19, Address(coeffs, c2Start + incr1)); 5069 __ ldpq(v20, v21, Address(coeffs, c2Start + incr2)); 5070 __ ldpq(v22, v23, Address(coeffs, c2Start + incr3)); 5071 dilithium_add_sub32(); 5072 __ stpq(v24, v25, Address(coeffs, c1Start)); 5073 __ stpq(v26, v27, Address(coeffs, c1Start + incr1)); 5074 __ stpq(v28, v29, Address(coeffs, c1Start + incr2)); 5075 __ stpq(v30, v31, Address(coeffs, c1Start + incr3)); 5076 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 5077 dilithium_load32zetas(zetas); 5078 dilithium_montmul32(false); 5079 __ stpq(v16, v17, Address(coeffs, c2Start)); 5080 __ stpq(v18, v19, Address(coeffs, c2Start + incr1)); 5081 __ stpq(v20, v21, Address(coeffs, c2Start + incr2)); 5082 __ stpq(v22, v23, Address(coeffs, c2Start + incr3)); 5083 5084 int k = 4 * level + i; 5085 5086 if (k < 24) { 5087 startIncr = 256; 5088 } else if (k == 25) { 5089 startIncr = 384; 5090 } else { 5091 startIncr = 128; 5092 } 5093 5094 c1Start += startIncr; 5095 c2Start += startIncr; 5096 } 5097 5098 c2 *= 2; 5099 } 5100 } 5101 5102 // Dilithium Inverse NTT function except the final mod Q division by 2^256. 5103 // Implements the method 5104 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of 5105 // the sun.security.provider.ML_DSA class. 
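// For orientation, every level below vectorizes a butterfly of the shape already
// described for dilithium_sub_add_montmul16 above (sketch only, paraphrasing the
// Java code; variable and constant names may not match the Java source exactly):
//
//   for (int j = start; j < start + l; j++) {
//       int tmp = coeffs[j];
//       coeffs[j] = tmp + coeffs[j + l];
//       coeffs[j + l] = montMul(tmp - coeffs[j + l], -MONT_ZETAS_FOR_NTT[m]);
//   }
//
// Levels 0 and 1 use ld2/st2 to de-interleave neighbouring lanes, level 2 uses plain
// 16-byte loads/stores, and levels 3-7 are handled by dilithiumInverseNttLevel3_7 above.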
5106 // 5107 // coeffs (int[256]) = c_rarg0 5108 // zetas (int[256]) = c_rarg1 5109 address generate_dilithiumAlmostInverseNtt() { 5110 5111 __ align(CodeEntryAlignment); 5112 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id; 5113 StubCodeMark mark(this, stub_id); 5114 address start = __ pc(); 5115 __ enter(); 5116 5117 const Register coeffs = c_rarg0; 5118 const Register zetas = c_rarg1; 5119 5120 const Register tmpAddr = r9; 5121 const Register dilithiumConsts = r10; 5122 const Register result = r11; 5123 5124 __ add(result, coeffs, 0); 5125 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 5126 5127 // Each level represents one iteration of the outer for loop of the Java version 5128 // level0 5129 for (int i = 0; i < 1024; i += 128) { 5130 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 5131 __ add(tmpAddr, coeffs, i); 5132 __ ld2(v0, v1, __ T4S, tmpAddr); 5133 __ add(tmpAddr, coeffs, i + 32); 5134 __ ld2(v2, v3, __ T4S, tmpAddr); 5135 __ add(tmpAddr, coeffs, i + 64); 5136 __ ld2(v4, v5, __ T4S, tmpAddr); 5137 __ add(tmpAddr, coeffs, i + 96); 5138 __ ld2(v6, v7, __ T4S, tmpAddr); 5139 dilithium_load16zetas(16, zetas); 5140 dilithium_sub_add_montmul16(); 5141 __ add(tmpAddr, coeffs, i); 5142 __ st2(v0, v1, __ T4S, tmpAddr); 5143 __ add(tmpAddr, coeffs, i + 32); 5144 __ st2(v2, v3, __ T4S, tmpAddr); 5145 __ add(tmpAddr, coeffs, i + 64); 5146 __ st2(v4, v5, __ T4S, tmpAddr); 5147 __ add(tmpAddr, coeffs, i + 96); 5148 __ st2(v6, v7, __ T4S, tmpAddr); 5149 } 5150 5151 // level 1 5152 for (int i = 0; i < 1024; i += 128) { 5153 __ add(tmpAddr, coeffs, i); 5154 __ ld2(v0, v1, __ T2D, tmpAddr); 5155 __ add(tmpAddr, coeffs, i + 32); 5156 __ ld2(v2, v3, __ T2D, tmpAddr); 5157 __ add(tmpAddr, coeffs, i + 64); 5158 __ ld2(v4, v5, __ T2D, tmpAddr); 5159 __ add(tmpAddr, coeffs, i + 96); 5160 __ ld2(v6, v7, __ T2D, tmpAddr); 5161 dilithium_load16zetas(16, zetas); 5162 dilithium_sub_add_montmul16(); 5163 __ add(tmpAddr, coeffs, i); 5164 __ st2(v0, v1, __ T2D, tmpAddr); 5165 __ add(tmpAddr, coeffs, i + 32); 5166 __ st2(v2, v3, __ T2D, tmpAddr); 5167 __ add(tmpAddr, coeffs, i + 64); 5168 __ st2(v4, v5, __ T2D, tmpAddr); 5169 __ add(tmpAddr, coeffs, i + 96); 5170 __ st2(v6, v7, __ T2D, tmpAddr); 5171 } 5172 5173 //level 2 5174 for (int i = 0; i < 1024; i += 256) { 5175 __ ldr(v0, __ Q, Address(coeffs, i)); 5176 __ ldr(v1, __ Q, Address(coeffs, i + 32)); 5177 __ ldr(v2, __ Q, Address(coeffs, i + 64)); 5178 __ ldr(v3, __ Q, Address(coeffs, i + 96)); 5179 __ ldr(v4, __ Q, Address(coeffs, i + 128)); 5180 __ ldr(v5, __ Q, Address(coeffs, i + 160)); 5181 __ ldr(v6, __ Q, Address(coeffs, i + 192)); 5182 __ ldr(v7, __ Q, Address(coeffs, i + 224)); 5183 __ ldr(v16, __ Q, Address(coeffs, i + 16)); 5184 __ ldr(v17, __ Q, Address(coeffs, i + 48)); 5185 __ ldr(v18, __ Q, Address(coeffs, i + 80)); 5186 __ ldr(v19, __ Q, Address(coeffs, i + 112)); 5187 __ ldr(v20, __ Q, Address(coeffs, i + 144)); 5188 __ ldr(v21, __ Q, Address(coeffs, i + 176)); 5189 __ ldr(v22, __ Q, Address(coeffs, i + 208)); 5190 __ ldr(v23, __ Q, Address(coeffs, i + 240)); 5191 dilithium_add_sub32(); 5192 __ str(v24, __ Q, Address(coeffs, i)); 5193 __ str(v25, __ Q, Address(coeffs, i + 32)); 5194 __ str(v26, __ Q, Address(coeffs, i + 64)); 5195 __ str(v27, __ Q, Address(coeffs, i + 96)); 5196 __ str(v28, __ Q, Address(coeffs, i + 128)); 5197 __ str(v29, __ Q, Address(coeffs, i + 160)); 5198 __ str(v30, __ Q, Address(coeffs, i + 192)); 5199 __ str(v31, __ Q, Address(coeffs, i + 
224)); 5200 dilithium_load32zetas(zetas); 5201 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 5202 dilithium_montmul32(false); 5203 __ str(v16, __ Q, Address(coeffs, i + 16)); 5204 __ str(v17, __ Q, Address(coeffs, i + 48)); 5205 __ str(v18, __ Q, Address(coeffs, i + 80)); 5206 __ str(v19, __ Q, Address(coeffs, i + 112)); 5207 __ str(v20, __ Q, Address(coeffs, i + 144)); 5208 __ str(v21, __ Q, Address(coeffs, i + 176)); 5209 __ str(v22, __ Q, Address(coeffs, i + 208)); 5210 __ str(v23, __ Q, Address(coeffs, i + 240)); 5211 } 5212 5213 // level 3-7 5214 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas); 5215 5216 __ leave(); // required for proper stackwalking of RuntimeStub frame 5217 __ mov(r0, zr); // return 0 5218 __ ret(lr); 5219 5220 return start; 5221 5222 } 5223 5224 // Dilithium multiply polynomials in the NTT domain. 5225 // Straightforward implementation of the method 5226 // static int implDilithiumNttMult( 5227 // int[] result, int[] ntta, int[] nttb {} of 5228 // the sun.security.provider.ML_DSA class. 5229 // 5230 // result (int[256]) = c_rarg0 5231 // poly1 (int[256]) = c_rarg1 5232 // poly2 (int[256]) = c_rarg2 5233 address generate_dilithiumNttMult() { 5234 5235 __ align(CodeEntryAlignment); 5236 StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id; 5237 StubCodeMark mark(this, stub_id); 5238 address start = __ pc(); 5239 __ enter(); 5240 5241 Label L_loop; 5242 5243 const Register result = c_rarg0; 5244 const Register poly1 = c_rarg1; 5245 const Register poly2 = c_rarg2; 5246 5247 const Register dilithiumConsts = r10; 5248 const Register len = r11; 5249 5250 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 5251 5252 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 5253 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare 5254 5255 __ mov(len, zr); 5256 __ add(len, len, 1024); 5257 5258 __ BIND(L_loop); 5259 5260 __ ldpq(v0, v1, __ post(poly1, 32)); 5261 __ ldpq(v2, v3, __ post(poly1, 32)); 5262 __ ldpq(v4, v5, __ post(poly1, 32)); 5263 __ ldpq(v6, v7, __ post(poly1, 32)); 5264 __ ldpq(v16, v17, __ post(poly2, 32)); 5265 __ ldpq(v18, v19, __ post(poly2, 32)); 5266 __ ldpq(v20, v21, __ post(poly2, 32)); 5267 __ ldpq(v22, v23, __ post(poly2, 32)); 5268 dilithium_montmul32(false); 5269 dilithium_montmul32(true); 5270 __ stpq(v16, v17, __ post(result, 32)); 5271 __ stpq(v18, v19, __ post(result, 32)); 5272 __ stpq(v20, v21, __ post(result, 32)); 5273 __ stpq(v22, v23, __ post(result, 32)); 5274 5275 __ sub(len, len, 128); 5276 __ cmp(len, (u1)128); 5277 __ br(Assembler::GE, L_loop); 5278 5279 __ leave(); // required for proper stackwalking of RuntimeStub frame 5280 __ mov(r0, zr); // return 0 5281 __ ret(lr); 5282 5283 return start; 5284 5285 } 5286 5287 // Dilithium Motgomery multiply an array by a constant. 
5288 // A straightforward implementation of the method 5289 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {} 5290 // of the sun.security.provider.MLDSA class 5291 // 5292 // coeffs (int[256]) = c_rarg0 5293 // constant (int) = c_rarg1 5294 address generate_dilithiumMontMulByConstant() { 5295 5296 __ align(CodeEntryAlignment); 5297 StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id; 5298 StubCodeMark mark(this, stub_id); 5299 address start = __ pc(); 5300 __ enter(); 5301 5302 Label L_loop; 5303 5304 const Register coeffs = c_rarg0; 5305 const Register constant = c_rarg1; 5306 5307 const Register dilithiumConsts = r10; 5308 const Register result = r11; 5309 const Register len = r12; 5310 5311 __ add(result, coeffs, 0); 5312 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 5313 5314 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 5315 __ dup(v29, __ T4S, constant); 5316 __ mov(len, zr); 5317 __ add(len, len, 1024); 5318 5319 __ BIND(L_loop); 5320 5321 __ ldpq(v16, v17, __ post(coeffs, 32)); 5322 __ ldpq(v18, v19, __ post(coeffs, 32)); 5323 __ ldpq(v20, v21, __ post(coeffs, 32)); 5324 __ ldpq(v22, v23, __ post(coeffs, 32)); 5325 dilithium_montmul32(true); 5326 __ stpq(v16, v17, __ post(result, 32)); 5327 __ stpq(v18, v19, __ post(result, 32)); 5328 __ stpq(v20, v21, __ post(result, 32)); 5329 __ stpq(v22, v23, __ post(result, 32)); 5330 5331 __ sub(len, len, 128); 5332 __ cmp(len, (u1)128); 5333 __ br(Assembler::GE, L_loop); 5334 5335 __ leave(); // required for proper stackwalking of RuntimeStub frame 5336 __ mov(r0, zr); // return 0 5337 __ ret(lr); 5338 5339 return start; 5340 } 5341 5342 // Dilithium decompose poly. 5343 // Implements the method 5344 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {} 5345 // of the sun.security.provider.ML_DSA class 5346 // 5347 // input (int[256]) = c_rarg0 5348 // lowPart (int[256]) = c_rarg1 5349 // highPart (int[256]) = c_rarg2 5350 // twoGamma2 (int) = c_rarg3 5351 // multiplier (int) = c_rarg4 5352 address generate_dilithiumDecomposePoly() { 5353 5354 __ align(CodeEntryAlignment); 5355 StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id; 5356 StubCodeMark mark(this, stub_id); 5357 address start = __ pc(); 5358 __ enter(); 5359 5360 Label L_loop; 5361 5362 const Register input = c_rarg0; 5363 const Register lowPart = c_rarg1; 5364 const Register highPart = c_rarg2; 5365 const Register twoGamma2 = c_rarg3; 5366 const Register multiplier = c_rarg4; 5367 5368 const Register len = r9; 5369 const Register dilithiumConsts = r10; 5370 const Register tmp = r11; 5371 5372 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 5373 5374 // save callee-saved registers 5375 __ stpd(v8, v9, __ pre(sp, -64)); 5376 __ stpd(v10, v11, Address(sp, 16)); 5377 __ stpd(v12, v13, Address(sp, 32)); 5378 __ stpd(v14, v15, Address(sp, 48)); 5379 5380 5381 __ mov(tmp, zr); 5382 __ add(tmp, tmp, 1); 5383 __ dup(v25, __ T4S, tmp); // 1 5384 __ ldr(v30, __ Q, Address(dilithiumConsts, 16)); // q 5385 __ ldr(v31, __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce 5386 __ dup(v28, __ T4S, twoGamma2); // 2 * gamma2 5387 __ dup(v29, __ T4S, multiplier); // multiplier for mod 2 * gamma reduce 5388 __ subv(v26, __ T4S, v30, v25); // q - 1 5389 __ sshr(v27, __ T4S, v28, 1); // gamma2 5390 5391 __ mov(len, zr); 5392 __ add(len, len, 1024); 5393 5394 __ BIND(L_loop); 5395 5396 __ ld4(v0, v1, v2, v3, __ T4S, 
__ post(input, 64)); 5397 5398 // rplus in v0 5399 // rplus = rplus - ((rplus + 5373807) >> 23) * dilithium_q; 5400 __ addv(v4, __ T4S, v0, v31); 5401 __ addv(v5, __ T4S, v1, v31); 5402 __ addv(v6, __ T4S, v2, v31); 5403 __ addv(v7, __ T4S, v3, v31); 5404 5405 __ sshr(v4, __ T4S, v4, 23); 5406 __ sshr(v5, __ T4S, v5, 23); 5407 __ sshr(v6, __ T4S, v6, 23); 5408 __ sshr(v7, __ T4S, v7, 23); 5409 5410 __ mulv(v4, __ T4S, v4, v30); 5411 __ mulv(v5, __ T4S, v5, v30); 5412 __ mulv(v6, __ T4S, v6, v30); 5413 __ mulv(v7, __ T4S, v7, v30); 5414 5415 __ subv(v0, __ T4S, v0, v4); 5416 __ subv(v1, __ T4S, v1, v5); 5417 __ subv(v2, __ T4S, v2, v6); 5418 __ subv(v3, __ T4S, v3, v7); 5419 5420 // rplus in v0 5421 // rplus = rplus + ((rplus >> 31) & dilithium_q); 5422 __ sshr(v4, __ T4S, v0, 31); 5423 __ sshr(v5, __ T4S, v1, 31); 5424 __ sshr(v6, __ T4S, v2, 31); 5425 __ sshr(v7, __ T4S, v3, 31); 5426 5427 __ andr(v4, __ T16B, v4, v30); 5428 __ andr(v5, __ T16B, v5, v30); 5429 __ andr(v6, __ T16B, v6, v30); 5430 __ andr(v7, __ T16B, v7, v30); 5431 5432 __ addv(v0, __ T4S, v0, v4); 5433 __ addv(v1, __ T4S, v1, v5); 5434 __ addv(v2, __ T4S, v2, v6); 5435 __ addv(v3, __ T4S, v3, v7); 5436 5437 // rplus in v0 5438 // int quotient = (rplus * multiplier) >> 22; 5439 __ mulv(v4, __ T4S, v0, v29); 5440 __ mulv(v5, __ T4S, v1, v29); 5441 __ mulv(v6, __ T4S, v2, v29); 5442 __ mulv(v7, __ T4S, v3, v29); 5443 5444 __ sshr(v4, __ T4S, v4, 22); 5445 __ sshr(v5, __ T4S, v5, 22); 5446 __ sshr(v6, __ T4S, v6, 22); 5447 __ sshr(v7, __ T4S, v7, 22); 5448 5449 // quotient in v4 5450 // int r0 = rplus - quotient * twoGamma2; 5451 __ mulv(v8, __ T4S, v4, v28); 5452 __ mulv(v9, __ T4S, v5, v28); 5453 __ mulv(v10, __ T4S, v6, v28); 5454 __ mulv(v11, __ T4S, v7, v28); 5455 5456 __ subv(v8, __ T4S, v0, v8); 5457 __ subv(v9, __ T4S, v1, v9); 5458 __ subv(v10, __ T4S, v2, v10); 5459 __ subv(v11, __ T4S, v3, v11); 5460 5461 // r0 in v8 5462 // int mask = (twoGamma2 - r0) >> 22; 5463 __ subv(v12, __ T4S, v28, v8); 5464 __ subv(v13, __ T4S, v28, v9); 5465 __ subv(v14, __ T4S, v28, v10); 5466 __ subv(v15, __ T4S, v28, v11); 5467 5468 __ sshr(v12, __ T4S, v12, 22); 5469 __ sshr(v13, __ T4S, v13, 22); 5470 __ sshr(v14, __ T4S, v14, 22); 5471 __ sshr(v15, __ T4S, v15, 22); 5472 5473 // mask in v12 5474 // r0 -= (mask & twoGamma2); 5475 __ andr(v16, __ T16B, v12, v28); 5476 __ andr(v17, __ T16B, v13, v28); 5477 __ andr(v18, __ T16B, v14, v28); 5478 __ andr(v19, __ T16B, v15, v28); 5479 5480 __ subv(v8, __ T4S, v8, v16); 5481 __ subv(v9, __ T4S, v9, v17); 5482 __ subv(v10, __ T4S, v10, v18); 5483 __ subv(v11, __ T4S, v11, v19); 5484 5485 // r0 in v8 5486 // quotient += (mask & 1); 5487 __ andr(v16, __ T16B, v12, v25); 5488 __ andr(v17, __ T16B, v13, v25); 5489 __ andr(v18, __ T16B, v14, v25); 5490 __ andr(v19, __ T16B, v15, v25); 5491 5492 __ addv(v4, __ T4S, v4, v16); 5493 __ addv(v5, __ T4S, v5, v17); 5494 __ addv(v6, __ T4S, v6, v18); 5495 __ addv(v7, __ T4S, v7, v19); 5496 5497 // mask = (twoGamma2 / 2 - r0) >> 31; 5498 __ subv(v12, __ T4S, v27, v8); 5499 __ subv(v13, __ T4S, v27, v9); 5500 __ subv(v14, __ T4S, v27, v10); 5501 __ subv(v15, __ T4S, v27, v11); 5502 5503 __ sshr(v12, __ T4S, v12, 31); 5504 __ sshr(v13, __ T4S, v13, 31); 5505 __ sshr(v14, __ T4S, v14, 31); 5506 __ sshr(v15, __ T4S, v15, 31); 5507 5508 // r0 -= (mask & twoGamma2); 5509 __ andr(v16, __ T16B, v12, v28); 5510 __ andr(v17, __ T16B, v13, v28); 5511 __ andr(v18, __ T16B, v14, v28); 5512 __ andr(v19, __ T16B, v15, v28); 5513 5514 __ subv(v8, __ T4S, v8, v16); 5515 
__ subv(v9, __ T4S, v9, v17); 5516 __ subv(v10, __ T4S, v10, v18); 5517 __ subv(v11, __ T4S, v11, v19); 5518 5519 // quotient += (mask & 1); 5520 __ andr(v16, __ T16B, v12, v25); 5521 __ andr(v17, __ T16B, v13, v25); 5522 __ andr(v18, __ T16B, v14, v25); 5523 __ andr(v19, __ T16B, v15, v25); 5524 5525 __ addv(v4, __ T4S, v4, v16); 5526 __ addv(v5, __ T4S, v5, v17); 5527 __ addv(v6, __ T4S, v6, v18); 5528 __ addv(v7, __ T4S, v7, v19); 5529 5530 // int r1 = rplus - r0 - (dilithium_q - 1); 5531 __ subv(v16, __ T4S, v0, v8); 5532 __ subv(v17, __ T4S, v1, v9); 5533 __ subv(v18, __ T4S, v2, v10); 5534 __ subv(v19, __ T4S, v3, v11); 5535 5536 __ subv(v16, __ T4S, v16, v26); 5537 __ subv(v17, __ T4S, v17, v26); 5538 __ subv(v18, __ T4S, v18, v26); 5539 __ subv(v19, __ T4S, v19, v26); 5540 5541 // r1 in v16 5542 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 5543 __ negr(v20, __ T4S, v16); 5544 __ negr(v21, __ T4S, v17); 5545 __ negr(v22, __ T4S, v18); 5546 __ negr(v23, __ T4S, v19); 5547 5548 __ orr(v16, __ T16B, v16, v20); 5549 __ orr(v17, __ T16B, v17, v21); 5550 __ orr(v18, __ T16B, v18, v22); 5551 __ orr(v19, __ T16B, v19, v23); 5552 5553 __ sshr(v0, __ T4S, v16, 31); 5554 __ sshr(v1, __ T4S, v17, 31); 5555 __ sshr(v2, __ T4S, v18, 31); 5556 __ sshr(v3, __ T4S, v19, 31); 5557 5558 // r1 in v0 5559 // r0 += ~r1; 5560 __ notr(v20, __ T16B, v0); 5561 __ notr(v21, __ T16B, v1); 5562 __ notr(v22, __ T16B, v2); 5563 __ notr(v23, __ T16B, v3); 5564 5565 __ addv(v8, __ T4S, v8, v20); 5566 __ addv(v9, __ T4S, v9, v21); 5567 __ addv(v10, __ T4S, v10, v22); 5568 __ addv(v11, __ T4S, v11, v23); 5569 5570 // r0 in v8 5571 // r1 = r1 & quotient; 5572 __ andr(v0, __ T16B, v4, v0); 5573 __ andr(v1, __ T16B, v5, v1); 5574 __ andr(v2, __ T16B, v6, v2); 5575 __ andr(v3, __ T16B, v7, v3); 5576 5577 // r1 in v0 5578 // lowPart[m] = r0; 5579 // highPart[m] = r1; 5580 __ st4(v8, v9, v10, v11, __ T4S, __ post(lowPart, 64)); 5581 __ st4(v0, v1, v2, v3, __ T4S, __ post(highPart, 64)); 5582 5583 5584 __ sub(len, len, 64); 5585 __ cmp(len, (u1)64); 5586 __ br(Assembler::GE, L_loop); 5587 5588 // restore callee-saved vector registers 5589 __ ldpd(v14, v15, Address(sp, 48)); 5590 __ ldpd(v12, v13, Address(sp, 32)); 5591 __ ldpd(v10, v11, Address(sp, 16)); 5592 __ ldpd(v8, v9, __ post(sp, 64)); 5593 5594 __ leave(); // required for proper stackwalking of RuntimeStub frame 5595 __ mov(r0, zr); // return 0 5596 __ ret(lr); 5597 5598 return start; 5599 } 5600 5601 /** 5602 * Arguments: 5603 * 5604 * Inputs: 5605 * c_rarg0 - int crc 5606 * c_rarg1 - byte* buf 5607 * c_rarg2 - int length 5608 * c_rarg3 - int* table 5609 * 5610 * Output: 5611 * r0 - int crc result 5612 */ 5613 address generate_updateBytesCRC32C() { 5614 assert(UseCRC32CIntrinsics, "what are we doing here?"); 5615 5616 __ align(CodeEntryAlignment); 5617 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id; 5618 StubCodeMark mark(this, stub_id); 5619 5620 address start = __ pc(); 5621 5622 const Register crc = c_rarg0; // crc 5623 const Register buf = c_rarg1; // source java byte array address 5624 const Register len = c_rarg2; // length 5625 const Register table0 = c_rarg3; // crc_table address 5626 const Register table1 = c_rarg4; 5627 const Register table2 = c_rarg5; 5628 const Register table3 = c_rarg6; 5629 const Register tmp3 = c_rarg7; 5630 5631 BLOCK_COMMENT("Entry:"); 5632 __ enter(); // required for proper stackwalking of RuntimeStub frame 5633 5634 __ kernel_crc32c(crc, buf, len, 5635 table0, table1, table2, 
table3, rscratch1, rscratch2, tmp3); 5636 5637 __ leave(); // required for proper stackwalking of RuntimeStub frame 5638 __ ret(lr); 5639 5640 return start; 5641 } 5642 5643 /*** 5644 * Arguments: 5645 * 5646 * Inputs: 5647 * c_rarg0 - int adler 5648 * c_rarg1 - byte* buff 5649 * c_rarg2 - int len 5650 * 5651 * Output: 5652 * c_rarg0 - int adler result 5653 */ 5654 address generate_updateBytesAdler32() { 5655 __ align(CodeEntryAlignment); 5656 StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id; 5657 StubCodeMark mark(this, stub_id); 5658 address start = __ pc(); 5659 5660 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 5661 5662 // Aliases 5663 Register adler = c_rarg0; 5664 Register s1 = c_rarg0; 5665 Register s2 = c_rarg3; 5666 Register buff = c_rarg1; 5667 Register len = c_rarg2; 5668 Register nmax = r4; 5669 Register base = r5; 5670 Register count = r6; 5671 Register temp0 = rscratch1; 5672 Register temp1 = rscratch2; 5673 FloatRegister vbytes = v0; 5674 FloatRegister vs1acc = v1; 5675 FloatRegister vs2acc = v2; 5676 FloatRegister vtable = v3; 5677 5678 // Max number of bytes we can process before having to take the mod 5679 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 5680 uint64_t BASE = 0xfff1; 5681 uint64_t NMAX = 0x15B0; 5682 5683 __ mov(base, BASE); 5684 __ mov(nmax, NMAX); 5685 5686 // Load accumulation coefficients for the upper 16 bits 5687 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 5688 __ ld1(vtable, __ T16B, Address(temp0)); 5689 5690 // s1 is initialized to the lower 16 bits of adler 5691 // s2 is initialized to the upper 16 bits of adler 5692 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 5693 __ uxth(s1, adler); // s1 = (adler & 0xffff) 5694 5695 // The pipelined loop needs at least 16 elements for 1 iteration 5696 // It does check this, but it is more effective to skip to the cleanup loop 5697 __ cmp(len, (u1)16); 5698 __ br(Assembler::HS, L_nmax); 5699 __ cbz(len, L_combine); 5700 5701 __ bind(L_simple_by1_loop); 5702 __ ldrb(temp0, Address(__ post(buff, 1))); 5703 __ add(s1, s1, temp0); 5704 __ add(s2, s2, s1); 5705 __ subs(len, len, 1); 5706 __ br(Assembler::HI, L_simple_by1_loop); 5707 5708 // s1 = s1 % BASE 5709 __ subs(temp0, s1, base); 5710 __ csel(s1, temp0, s1, Assembler::HS); 5711 5712 // s2 = s2 % BASE 5713 __ lsr(temp0, s2, 16); 5714 __ lsl(temp1, temp0, 4); 5715 __ sub(temp1, temp1, temp0); 5716 __ add(s2, temp1, s2, ext::uxth); 5717 5718 __ subs(temp0, s2, base); 5719 __ csel(s2, temp0, s2, Assembler::HS); 5720 5721 __ b(L_combine); 5722 5723 __ bind(L_nmax); 5724 __ subs(len, len, nmax); 5725 __ sub(count, nmax, 16); 5726 __ br(Assembler::LO, L_by16); 5727 5728 __ bind(L_nmax_loop); 5729 5730 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 5731 vbytes, vs1acc, vs2acc, vtable); 5732 5733 __ subs(count, count, 16); 5734 __ br(Assembler::HS, L_nmax_loop); 5735 5736 // s1 = s1 % BASE 5737 __ lsr(temp0, s1, 16); 5738 __ lsl(temp1, temp0, 4); 5739 __ sub(temp1, temp1, temp0); 5740 __ add(temp1, temp1, s1, ext::uxth); 5741 5742 __ lsr(temp0, temp1, 16); 5743 __ lsl(s1, temp0, 4); 5744 __ sub(s1, s1, temp0); 5745 __ add(s1, s1, temp1, ext:: uxth); 5746 5747 __ subs(temp0, s1, base); 5748 __ csel(s1, temp0, s1, Assembler::HS); 5749 5750 // s2 = s2 % BASE 5751 __ lsr(temp0, s2, 16); 5752 __ lsl(temp1, temp0, 4); 5753 __ sub(temp1, temp1, temp0); 5754 __ add(temp1, temp1, s2, ext::uxth); 
5755 5756 __ lsr(temp0, temp1, 16); 5757 __ lsl(s2, temp0, 4); 5758 __ sub(s2, s2, temp0); 5759 __ add(s2, s2, temp1, ext:: uxth); 5760 5761 __ subs(temp0, s2, base); 5762 __ csel(s2, temp0, s2, Assembler::HS); 5763 5764 __ subs(len, len, nmax); 5765 __ sub(count, nmax, 16); 5766 __ br(Assembler::HS, L_nmax_loop); 5767 5768 __ bind(L_by16); 5769 __ adds(len, len, count); 5770 __ br(Assembler::LO, L_by1); 5771 5772 __ bind(L_by16_loop); 5773 5774 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 5775 vbytes, vs1acc, vs2acc, vtable); 5776 5777 __ subs(len, len, 16); 5778 __ br(Assembler::HS, L_by16_loop); 5779 5780 __ bind(L_by1); 5781 __ adds(len, len, 15); 5782 __ br(Assembler::LO, L_do_mod); 5783 5784 __ bind(L_by1_loop); 5785 __ ldrb(temp0, Address(__ post(buff, 1))); 5786 __ add(s1, temp0, s1); 5787 __ add(s2, s2, s1); 5788 __ subs(len, len, 1); 5789 __ br(Assembler::HS, L_by1_loop); 5790 5791 __ bind(L_do_mod); 5792 // s1 = s1 % BASE 5793 __ lsr(temp0, s1, 16); 5794 __ lsl(temp1, temp0, 4); 5795 __ sub(temp1, temp1, temp0); 5796 __ add(temp1, temp1, s1, ext::uxth); 5797 5798 __ lsr(temp0, temp1, 16); 5799 __ lsl(s1, temp0, 4); 5800 __ sub(s1, s1, temp0); 5801 __ add(s1, s1, temp1, ext:: uxth); 5802 5803 __ subs(temp0, s1, base); 5804 __ csel(s1, temp0, s1, Assembler::HS); 5805 5806 // s2 = s2 % BASE 5807 __ lsr(temp0, s2, 16); 5808 __ lsl(temp1, temp0, 4); 5809 __ sub(temp1, temp1, temp0); 5810 __ add(temp1, temp1, s2, ext::uxth); 5811 5812 __ lsr(temp0, temp1, 16); 5813 __ lsl(s2, temp0, 4); 5814 __ sub(s2, s2, temp0); 5815 __ add(s2, s2, temp1, ext:: uxth); 5816 5817 __ subs(temp0, s2, base); 5818 __ csel(s2, temp0, s2, Assembler::HS); 5819 5820 // Combine lower bits and higher bits 5821 __ bind(L_combine); 5822 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 5823 5824 __ ret(lr); 5825 5826 return start; 5827 } 5828 5829 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 5830 Register temp0, Register temp1, FloatRegister vbytes, 5831 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 5832 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 5833 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 5834 // In non-vectorized code, we update s1 and s2 as: 5835 // s1 <- s1 + b1 5836 // s2 <- s2 + s1 5837 // s1 <- s1 + b2 5838 // s2 <- s2 + b1 5839 // ... 5840 // s1 <- s1 + b16 5841 // s2 <- s2 + s1 5842 // Putting above assignments together, we have: 5843 // s1_new = s1 + b1 + b2 + ... + b16 5844 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 5845 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 5846 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 5847 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 5848 5849 // s2 = s2 + s1 * 16 5850 __ add(s2, s2, s1, Assembler::LSL, 4); 5851 5852 // vs1acc = b1 + b2 + b3 + ... + b16 5853 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 5854 __ umullv(vs2acc, __ T8B, vtable, vbytes); 5855 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 5856 __ uaddlv(vs1acc, __ T16B, vbytes); 5857 __ uaddlv(vs2acc, __ T8H, vs2acc); 5858 5859 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 5860 __ fmovd(temp0, vs1acc); 5861 __ fmovd(temp1, vs2acc); 5862 __ add(s1, s1, temp0); 5863 __ add(s2, s2, temp1); 5864 } 5865 5866 /** 5867 * Arguments: 5868 * 5869 * Input: 5870 * c_rarg0 - x address 5871 * c_rarg1 - x length 5872 * c_rarg2 - y address 5873 * c_rarg3 - y length 5874 * c_rarg4 - z address 5875 */ 5876 address generate_multiplyToLen() { 5877 __ align(CodeEntryAlignment); 5878 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 5879 StubCodeMark mark(this, stub_id); 5880 5881 address start = __ pc(); 5882 const Register x = r0; 5883 const Register xlen = r1; 5884 const Register y = r2; 5885 const Register ylen = r3; 5886 const Register z = r4; 5887 5888 const Register tmp0 = r5; 5889 const Register tmp1 = r10; 5890 const Register tmp2 = r11; 5891 const Register tmp3 = r12; 5892 const Register tmp4 = r13; 5893 const Register tmp5 = r14; 5894 const Register tmp6 = r15; 5895 const Register tmp7 = r16; 5896 5897 BLOCK_COMMENT("Entry:"); 5898 __ enter(); // required for proper stackwalking of RuntimeStub frame 5899 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 5900 __ leave(); // required for proper stackwalking of RuntimeStub frame 5901 __ ret(lr); 5902 5903 return start; 5904 } 5905 5906 address generate_squareToLen() { 5907 // squareToLen algorithm for sizes 1..127 described in java code works 5908 // faster than multiply_to_len on some CPUs and slower on others, but 5909 // multiply_to_len shows a bit better overall results 5910 __ align(CodeEntryAlignment); 5911 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 5912 StubCodeMark mark(this, stub_id); 5913 address start = __ pc(); 5914 5915 const Register x = r0; 5916 const Register xlen = r1; 5917 const Register z = r2; 5918 const Register y = r4; // == x 5919 const Register ylen = r5; // == xlen 5920 5921 const Register tmp0 = r3; 5922 const Register tmp1 = r10; 5923 const Register tmp2 = r11; 5924 const Register tmp3 = r12; 5925 const Register tmp4 = r13; 5926 const Register tmp5 = r14; 5927 const Register tmp6 = r15; 5928 const Register tmp7 = r16; 5929 5930 RegSet spilled_regs = RegSet::of(y, ylen); 5931 BLOCK_COMMENT("Entry:"); 5932 __ enter(); 5933 __ push(spilled_regs, sp); 5934 __ mov(y, x); 5935 __ mov(ylen, xlen); 5936 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 5937 __ pop(spilled_regs, sp); 5938 __ leave(); 5939 __ ret(lr); 5940 return start; 5941 } 5942 5943 address generate_mulAdd() { 5944 __ align(CodeEntryAlignment); 5945 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 5946 StubCodeMark mark(this, stub_id); 5947 5948 address start = __ pc(); 5949 5950 const Register out = r0; 5951 const Register in = r1; 5952 const Register offset = r2; 5953 const Register len = r3; 5954 const Register k = r4; 5955 5956 BLOCK_COMMENT("Entry:"); 5957 __ enter(); 5958 __ mul_add(out, in, offset, len, k); 5959 __ leave(); 5960 __ ret(lr); 5961 5962 return start; 5963 } 5964 5965 // Arguments: 5966 // 5967 // Input: 5968 // c_rarg0 - newArr address 5969 // c_rarg1 - oldArr address 5970 // c_rarg2 - newIdx 5971 // c_rarg3 - shiftCount 5972 // c_rarg4 - numIter 5973 // 5974 address generate_bigIntegerRightShift() { 5975 __ align(CodeEntryAlignment); 5976 StubGenStubId stub_id = 
StubGenStubId::bigIntegerRightShiftWorker_id; 5977 StubCodeMark mark(this, stub_id); 5978 address start = __ pc(); 5979 5980 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 5981 5982 Register newArr = c_rarg0; 5983 Register oldArr = c_rarg1; 5984 Register newIdx = c_rarg2; 5985 Register shiftCount = c_rarg3; 5986 Register numIter = c_rarg4; 5987 Register idx = numIter; 5988 5989 Register newArrCur = rscratch1; 5990 Register shiftRevCount = rscratch2; 5991 Register oldArrCur = r13; 5992 Register oldArrNext = r14; 5993 5994 FloatRegister oldElem0 = v0; 5995 FloatRegister oldElem1 = v1; 5996 FloatRegister newElem = v2; 5997 FloatRegister shiftVCount = v3; 5998 FloatRegister shiftVRevCount = v4; 5999 6000 __ cbz(idx, Exit); 6001 6002 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 6003 6004 // left shift count 6005 __ movw(shiftRevCount, 32); 6006 __ subw(shiftRevCount, shiftRevCount, shiftCount); 6007 6008 // numIter too small to allow a 4-words SIMD loop, rolling back 6009 __ cmp(numIter, (u1)4); 6010 __ br(Assembler::LT, ShiftThree); 6011 6012 __ dup(shiftVCount, __ T4S, shiftCount); 6013 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 6014 __ negr(shiftVCount, __ T4S, shiftVCount); 6015 6016 __ BIND(ShiftSIMDLoop); 6017 6018 // Calculate the load addresses 6019 __ sub(idx, idx, 4); 6020 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 6021 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 6022 __ add(oldArrCur, oldArrNext, 4); 6023 6024 // Load 4 words and process 6025 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 6026 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 6027 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 6028 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 6029 __ orr(newElem, __ T16B, oldElem0, oldElem1); 6030 __ st1(newElem, __ T4S, Address(newArrCur)); 6031 6032 __ cmp(idx, (u1)4); 6033 __ br(Assembler::LT, ShiftTwoLoop); 6034 __ b(ShiftSIMDLoop); 6035 6036 __ BIND(ShiftTwoLoop); 6037 __ cbz(idx, Exit); 6038 __ cmp(idx, (u1)1); 6039 __ br(Assembler::EQ, ShiftOne); 6040 6041 // Calculate the load addresses 6042 __ sub(idx, idx, 2); 6043 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 6044 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 6045 __ add(oldArrCur, oldArrNext, 4); 6046 6047 // Load 2 words and process 6048 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 6049 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 6050 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 6051 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 6052 __ orr(newElem, __ T8B, oldElem0, oldElem1); 6053 __ st1(newElem, __ T2S, Address(newArrCur)); 6054 __ b(ShiftTwoLoop); 6055 6056 __ BIND(ShiftThree); 6057 __ tbz(idx, 1, ShiftOne); 6058 __ tbz(idx, 0, ShiftTwo); 6059 __ ldrw(r10, Address(oldArr, 12)); 6060 __ ldrw(r11, Address(oldArr, 8)); 6061 __ lsrvw(r10, r10, shiftCount); 6062 __ lslvw(r11, r11, shiftRevCount); 6063 __ orrw(r12, r10, r11); 6064 __ strw(r12, Address(newArr, 8)); 6065 6066 __ BIND(ShiftTwo); 6067 __ ldrw(r10, Address(oldArr, 8)); 6068 __ ldrw(r11, Address(oldArr, 4)); 6069 __ lsrvw(r10, r10, shiftCount); 6070 __ lslvw(r11, r11, shiftRevCount); 6071 __ orrw(r12, r10, r11); 6072 __ strw(r12, Address(newArr, 4)); 6073 6074 __ BIND(ShiftOne); 6075 __ ldrw(r10, Address(oldArr, 4)); 6076 __ ldrw(r11, Address(oldArr)); 6077 __ lsrvw(r10, r10, shiftCount); 6078 __ lslvw(r11, r11, shiftRevCount); 6079 __ orrw(r12, r10, r11); 6080 __ strw(r12, Address(newArr)); 6081 6082 __ BIND(Exit); 6083 __ ret(lr); 6084 6085 return start; 6086 } 6087 6088 // 
Arguments: 6089 // 6090 // Input: 6091 // c_rarg0 - newArr address 6092 // c_rarg1 - oldArr address 6093 // c_rarg2 - newIdx 6094 // c_rarg3 - shiftCount 6095 // c_rarg4 - numIter 6096 // 6097 address generate_bigIntegerLeftShift() { 6098 __ align(CodeEntryAlignment); 6099 StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id; 6100 StubCodeMark mark(this, stub_id); 6101 address start = __ pc(); 6102 6103 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 6104 6105 Register newArr = c_rarg0; 6106 Register oldArr = c_rarg1; 6107 Register newIdx = c_rarg2; 6108 Register shiftCount = c_rarg3; 6109 Register numIter = c_rarg4; 6110 6111 Register shiftRevCount = rscratch1; 6112 Register oldArrNext = rscratch2; 6113 6114 FloatRegister oldElem0 = v0; 6115 FloatRegister oldElem1 = v1; 6116 FloatRegister newElem = v2; 6117 FloatRegister shiftVCount = v3; 6118 FloatRegister shiftVRevCount = v4; 6119 6120 __ cbz(numIter, Exit); 6121 6122 __ add(oldArrNext, oldArr, 4); 6123 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 6124 6125 // right shift count 6126 __ movw(shiftRevCount, 32); 6127 __ subw(shiftRevCount, shiftRevCount, shiftCount); 6128 6129 // numIter too small to allow a 4-words SIMD loop, rolling back 6130 __ cmp(numIter, (u1)4); 6131 __ br(Assembler::LT, ShiftThree); 6132 6133 __ dup(shiftVCount, __ T4S, shiftCount); 6134 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 6135 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 6136 6137 __ BIND(ShiftSIMDLoop); 6138 6139 // load 4 words and process 6140 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 6141 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 6142 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 6143 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 6144 __ orr(newElem, __ T16B, oldElem0, oldElem1); 6145 __ st1(newElem, __ T4S, __ post(newArr, 16)); 6146 __ sub(numIter, numIter, 4); 6147 6148 __ cmp(numIter, (u1)4); 6149 __ br(Assembler::LT, ShiftTwoLoop); 6150 __ b(ShiftSIMDLoop); 6151 6152 __ BIND(ShiftTwoLoop); 6153 __ cbz(numIter, Exit); 6154 __ cmp(numIter, (u1)1); 6155 __ br(Assembler::EQ, ShiftOne); 6156 6157 // load 2 words and process 6158 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 6159 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 6160 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 6161 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 6162 __ orr(newElem, __ T8B, oldElem0, oldElem1); 6163 __ st1(newElem, __ T2S, __ post(newArr, 8)); 6164 __ sub(numIter, numIter, 2); 6165 __ b(ShiftTwoLoop); 6166 6167 __ BIND(ShiftThree); 6168 __ ldrw(r10, __ post(oldArr, 4)); 6169 __ ldrw(r11, __ post(oldArrNext, 4)); 6170 __ lslvw(r10, r10, shiftCount); 6171 __ lsrvw(r11, r11, shiftRevCount); 6172 __ orrw(r12, r10, r11); 6173 __ strw(r12, __ post(newArr, 4)); 6174 __ tbz(numIter, 1, Exit); 6175 __ tbz(numIter, 0, ShiftOne); 6176 6177 __ BIND(ShiftTwo); 6178 __ ldrw(r10, __ post(oldArr, 4)); 6179 __ ldrw(r11, __ post(oldArrNext, 4)); 6180 __ lslvw(r10, r10, shiftCount); 6181 __ lsrvw(r11, r11, shiftRevCount); 6182 __ orrw(r12, r10, r11); 6183 __ strw(r12, __ post(newArr, 4)); 6184 6185 __ BIND(ShiftOne); 6186 __ ldrw(r10, Address(oldArr)); 6187 __ ldrw(r11, Address(oldArrNext)); 6188 __ lslvw(r10, r10, shiftCount); 6189 __ lsrvw(r11, r11, shiftRevCount); 6190 __ orrw(r12, r10, r11); 6191 __ strw(r12, Address(newArr)); 6192 6193 __ BIND(Exit); 6194 __ ret(lr); 6195 6196 return start; 6197 } 6198 6199 address generate_count_positives(address &count_positives_long) { 6200 const u1 
large_loop_size = 64; 6201 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 6202 int dcache_line = VM_Version::dcache_line_size(); 6203 6204 Register ary1 = r1, len = r2, result = r0; 6205 6206 __ align(CodeEntryAlignment); 6207 6208 StubGenStubId stub_id = StubGenStubId::count_positives_id; 6209 StubCodeMark mark(this, stub_id); 6210 6211 address entry = __ pc(); 6212 6213 __ enter(); 6214 // precondition: a copy of len is already in result 6215 // __ mov(result, len); 6216 6217 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 6218 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 6219 6220 __ cmp(len, (u1)15); 6221 __ br(Assembler::GT, LEN_OVER_15); 6222 // The only case when execution falls into this code is when pointer is near 6223 // the end of memory page and we have to avoid reading next page 6224 __ add(ary1, ary1, len); 6225 __ subs(len, len, 8); 6226 __ br(Assembler::GT, LEN_OVER_8); 6227 __ ldr(rscratch2, Address(ary1, -8)); 6228 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 6229 __ lsrv(rscratch2, rscratch2, rscratch1); 6230 __ tst(rscratch2, UPPER_BIT_MASK); 6231 __ csel(result, zr, result, Assembler::NE); 6232 __ leave(); 6233 __ ret(lr); 6234 __ bind(LEN_OVER_8); 6235 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 6236 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 6237 __ tst(rscratch2, UPPER_BIT_MASK); 6238 __ br(Assembler::NE, RET_NO_POP); 6239 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 6240 __ lsrv(rscratch1, rscratch1, rscratch2); 6241 __ tst(rscratch1, UPPER_BIT_MASK); 6242 __ bind(RET_NO_POP); 6243 __ csel(result, zr, result, Assembler::NE); 6244 __ leave(); 6245 __ ret(lr); 6246 6247 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 6248 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 6249 6250 count_positives_long = __ pc(); // 2nd entry point 6251 6252 __ enter(); 6253 6254 __ bind(LEN_OVER_15); 6255 __ push(spilled_regs, sp); 6256 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 6257 __ cbz(rscratch2, ALIGNED); 6258 __ ldp(tmp6, tmp1, Address(ary1)); 6259 __ mov(tmp5, 16); 6260 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 6261 __ add(ary1, ary1, rscratch1); 6262 __ orr(tmp6, tmp6, tmp1); 6263 __ tst(tmp6, UPPER_BIT_MASK); 6264 __ br(Assembler::NE, RET_ADJUST); 6265 __ sub(len, len, rscratch1); 6266 6267 __ bind(ALIGNED); 6268 __ cmp(len, large_loop_size); 6269 __ br(Assembler::LT, CHECK_16); 6270 // Perform 16-byte load as early return in pre-loop to handle situation 6271 // when initially aligned large array has negative values at starting bytes, 6272 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 6273 // slower. Cases with negative bytes further ahead won't be affected that 6274 // much. In fact, it'll be faster due to early loads, less instructions and 6275 // less branches in LARGE_LOOP. 
6276 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 6277 __ sub(len, len, 16); 6278 __ orr(tmp6, tmp6, tmp1); 6279 __ tst(tmp6, UPPER_BIT_MASK); 6280 __ br(Assembler::NE, RET_ADJUST_16); 6281 __ cmp(len, large_loop_size); 6282 __ br(Assembler::LT, CHECK_16); 6283 6284 if (SoftwarePrefetchHintDistance >= 0 6285 && SoftwarePrefetchHintDistance >= dcache_line) { 6286 // initial prefetch 6287 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 6288 } 6289 __ bind(LARGE_LOOP); 6290 if (SoftwarePrefetchHintDistance >= 0) { 6291 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 6292 } 6293 // Issue load instructions first, since it can save few CPU/MEM cycles, also 6294 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 6295 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 6296 // instructions per cycle and have less branches, but this approach disables 6297 // early return, thus, all 64 bytes are loaded and checked every time. 6298 __ ldp(tmp2, tmp3, Address(ary1)); 6299 __ ldp(tmp4, tmp5, Address(ary1, 16)); 6300 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 6301 __ ldp(tmp6, tmp1, Address(ary1, 48)); 6302 __ add(ary1, ary1, large_loop_size); 6303 __ sub(len, len, large_loop_size); 6304 __ orr(tmp2, tmp2, tmp3); 6305 __ orr(tmp4, tmp4, tmp5); 6306 __ orr(rscratch1, rscratch1, rscratch2); 6307 __ orr(tmp6, tmp6, tmp1); 6308 __ orr(tmp2, tmp2, tmp4); 6309 __ orr(rscratch1, rscratch1, tmp6); 6310 __ orr(tmp2, tmp2, rscratch1); 6311 __ tst(tmp2, UPPER_BIT_MASK); 6312 __ br(Assembler::NE, RET_ADJUST_LONG); 6313 __ cmp(len, large_loop_size); 6314 __ br(Assembler::GE, LARGE_LOOP); 6315 6316 __ bind(CHECK_16); // small 16-byte load pre-loop 6317 __ cmp(len, (u1)16); 6318 __ br(Assembler::LT, POST_LOOP16); 6319 6320 __ bind(LOOP16); // small 16-byte load loop 6321 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 6322 __ sub(len, len, 16); 6323 __ orr(tmp2, tmp2, tmp3); 6324 __ tst(tmp2, UPPER_BIT_MASK); 6325 __ br(Assembler::NE, RET_ADJUST_16); 6326 __ cmp(len, (u1)16); 6327 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 6328 6329 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 6330 __ cmp(len, (u1)8); 6331 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 6332 __ ldr(tmp3, Address(__ post(ary1, 8))); 6333 __ tst(tmp3, UPPER_BIT_MASK); 6334 __ br(Assembler::NE, RET_ADJUST); 6335 __ sub(len, len, 8); 6336 6337 __ bind(POST_LOOP16_LOAD_TAIL); 6338 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 6339 __ ldr(tmp1, Address(ary1)); 6340 __ mov(tmp2, 64); 6341 __ sub(tmp4, tmp2, len, __ LSL, 3); 6342 __ lslv(tmp1, tmp1, tmp4); 6343 __ tst(tmp1, UPPER_BIT_MASK); 6344 __ br(Assembler::NE, RET_ADJUST); 6345 // Fallthrough 6346 6347 __ bind(RET_LEN); 6348 __ pop(spilled_regs, sp); 6349 __ leave(); 6350 __ ret(lr); 6351 6352 // difference result - len is the count of guaranteed to be 6353 // positive bytes 6354 6355 __ bind(RET_ADJUST_LONG); 6356 __ add(len, len, (u1)(large_loop_size - 16)); 6357 __ bind(RET_ADJUST_16); 6358 __ add(len, len, 16); 6359 __ bind(RET_ADJUST); 6360 __ pop(spilled_regs, sp); 6361 __ leave(); 6362 __ sub(result, result, len); 6363 __ ret(lr); 6364 6365 return entry; 6366 } 6367 6368 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 6369 bool usePrefetch, Label &NOT_EQUAL) { 6370 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 6371 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 6372 tmp7 = r12, tmp8 = r13; 6373 Label LOOP; 6374 
6375 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 6376 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 6377 __ bind(LOOP); 6378 if (usePrefetch) { 6379 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 6380 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 6381 } 6382 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 6383 __ eor(tmp1, tmp1, tmp2); 6384 __ eor(tmp3, tmp3, tmp4); 6385 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 6386 __ orr(tmp1, tmp1, tmp3); 6387 __ cbnz(tmp1, NOT_EQUAL); 6388 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 6389 __ eor(tmp5, tmp5, tmp6); 6390 __ eor(tmp7, tmp7, tmp8); 6391 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 6392 __ orr(tmp5, tmp5, tmp7); 6393 __ cbnz(tmp5, NOT_EQUAL); 6394 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 6395 __ eor(tmp1, tmp1, tmp2); 6396 __ eor(tmp3, tmp3, tmp4); 6397 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 6398 __ orr(tmp1, tmp1, tmp3); 6399 __ cbnz(tmp1, NOT_EQUAL); 6400 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 6401 __ eor(tmp5, tmp5, tmp6); 6402 __ sub(cnt1, cnt1, 8 * wordSize); 6403 __ eor(tmp7, tmp7, tmp8); 6404 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 6405 // tmp6 is not used. MacroAssembler::subs is used here (rather than 6406 // cmp) because subs allows an unlimited range of immediate operand. 6407 __ subs(tmp6, cnt1, loopThreshold); 6408 __ orr(tmp5, tmp5, tmp7); 6409 __ cbnz(tmp5, NOT_EQUAL); 6410 __ br(__ GE, LOOP); 6411 // post-loop 6412 __ eor(tmp1, tmp1, tmp2); 6413 __ eor(tmp3, tmp3, tmp4); 6414 __ orr(tmp1, tmp1, tmp3); 6415 __ sub(cnt1, cnt1, 2 * wordSize); 6416 __ cbnz(tmp1, NOT_EQUAL); 6417 } 6418 6419 void generate_large_array_equals_loop_simd(int loopThreshold, 6420 bool usePrefetch, Label &NOT_EQUAL) { 6421 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 6422 tmp2 = rscratch2; 6423 Label LOOP; 6424 6425 __ bind(LOOP); 6426 if (usePrefetch) { 6427 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 6428 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 6429 } 6430 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 6431 __ sub(cnt1, cnt1, 8 * wordSize); 6432 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 6433 __ subs(tmp1, cnt1, loopThreshold); 6434 __ eor(v0, __ T16B, v0, v4); 6435 __ eor(v1, __ T16B, v1, v5); 6436 __ eor(v2, __ T16B, v2, v6); 6437 __ eor(v3, __ T16B, v3, v7); 6438 __ orr(v0, __ T16B, v0, v1); 6439 __ orr(v1, __ T16B, v2, v3); 6440 __ orr(v0, __ T16B, v0, v1); 6441 __ umov(tmp1, v0, __ D, 0); 6442 __ umov(tmp2, v0, __ D, 1); 6443 __ orr(tmp1, tmp1, tmp2); 6444 __ cbnz(tmp1, NOT_EQUAL); 6445 __ br(__ GE, LOOP); 6446 } 6447 6448 // a1 = r1 - array1 address 6449 // a2 = r2 - array2 address 6450 // result = r0 - return value. Already contains "false" 6451 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 6452 // r3-r5 are reserved temporary registers 6453 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 6454 address generate_large_array_equals() { 6455 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 6456 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 6457 tmp7 = r12, tmp8 = r13; 6458 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 6459 SMALL_LOOP, POST_LOOP; 6460 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 6461 // calculate if at least 32 prefetched bytes are used 6462 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 6463 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 6464 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 6465 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 6466 tmp5, tmp6, tmp7, tmp8); 6467 6468 __ align(CodeEntryAlignment); 6469 6470 StubGenStubId stub_id = StubGenStubId::large_array_equals_id; 6471 StubCodeMark mark(this, stub_id); 6472 6473 address entry = __ pc(); 6474 __ enter(); 6475 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 6476 // also advance pointers to use post-increment instead of pre-increment 6477 __ add(a1, a1, wordSize); 6478 __ add(a2, a2, wordSize); 6479 if (AvoidUnalignedAccesses) { 6480 // both implementations (SIMD/nonSIMD) are using relatively large load 6481 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 6482 // on some CPUs in case of address is not at least 16-byte aligned. 6483 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 6484 // load if needed at least for 1st address and make if 16-byte aligned. 6485 Label ALIGNED16; 6486 __ tbz(a1, 3, ALIGNED16); 6487 __ ldr(tmp1, Address(__ post(a1, wordSize))); 6488 __ ldr(tmp2, Address(__ post(a2, wordSize))); 6489 __ sub(cnt1, cnt1, wordSize); 6490 __ eor(tmp1, tmp1, tmp2); 6491 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 6492 __ bind(ALIGNED16); 6493 } 6494 if (UseSIMDForArrayEquals) { 6495 if (SoftwarePrefetchHintDistance >= 0) { 6496 __ subs(tmp1, cnt1, prefetchLoopThreshold); 6497 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 6498 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 6499 /* prfm = */ true, NOT_EQUAL); 6500 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 6501 __ br(__ LT, TAIL); 6502 } 6503 __ bind(NO_PREFETCH_LARGE_LOOP); 6504 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 6505 /* prfm = */ false, NOT_EQUAL); 6506 } else { 6507 __ push(spilled_regs, sp); 6508 if (SoftwarePrefetchHintDistance >= 0) { 6509 __ subs(tmp1, cnt1, prefetchLoopThreshold); 6510 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 6511 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 6512 /* prfm = */ true, NOT_EQUAL); 6513 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 6514 __ br(__ LT, TAIL); 6515 } 6516 __ bind(NO_PREFETCH_LARGE_LOOP); 6517 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 6518 /* prfm = */ false, NOT_EQUAL); 6519 } 6520 __ bind(TAIL); 6521 __ cbz(cnt1, EQUAL); 6522 __ subs(cnt1, cnt1, wordSize); 6523 __ br(__ LE, POST_LOOP); 6524 __ bind(SMALL_LOOP); 6525 __ ldr(tmp1, Address(__ post(a1, wordSize))); 6526 __ ldr(tmp2, Address(__ post(a2, wordSize))); 6527 __ subs(cnt1, cnt1, wordSize); 6528 __ eor(tmp1, tmp1, tmp2); 6529 __ cbnz(tmp1, NOT_EQUAL); 6530 __ br(__ GT, SMALL_LOOP); 6531 __ bind(POST_LOOP); 6532 __ ldr(tmp1, Address(a1, cnt1)); 6533 __ ldr(tmp2, Address(a2, cnt1)); 6534 __ eor(tmp1, tmp1, tmp2); 6535 __ cbnz(tmp1, NOT_EQUAL); 6536 __ bind(EQUAL); 6537 __ mov(result, true); 6538 __ bind(NOT_EQUAL); 6539 if (!UseSIMDForArrayEquals) { 6540 __ pop(spilled_regs, sp); 6541 } 6542 __ bind(NOT_EQUAL_NO_POP); 6543 __ leave(); 6544 __ ret(lr); 6545 return entry; 6546 } 6547 6548 // result = r0 - return value. Contains initial hashcode value on entry. 
6549 // ary = r1 - array address 6550 // cnt = r2 - elements count 6551 // Clobbers: v0-v13, rscratch1, rscratch2 6552 address generate_large_arrays_hashcode(BasicType eltype) { 6553 const Register result = r0, ary = r1, cnt = r2; 6554 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 6555 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 6556 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 6557 const FloatRegister vpowm = v13; 6558 6559 ARRAYS_HASHCODE_REGISTERS; 6560 6561 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 6562 6563 unsigned int vf; // vectorization factor 6564 bool multiply_by_halves; 6565 Assembler::SIMD_Arrangement load_arrangement; 6566 switch (eltype) { 6567 case T_BOOLEAN: 6568 case T_BYTE: 6569 load_arrangement = Assembler::T8B; 6570 multiply_by_halves = true; 6571 vf = 8; 6572 break; 6573 case T_CHAR: 6574 case T_SHORT: 6575 load_arrangement = Assembler::T8H; 6576 multiply_by_halves = true; 6577 vf = 8; 6578 break; 6579 case T_INT: 6580 load_arrangement = Assembler::T4S; 6581 multiply_by_halves = false; 6582 vf = 4; 6583 break; 6584 default: 6585 ShouldNotReachHere(); 6586 } 6587 6588 // Unroll factor 6589 const unsigned uf = 4; 6590 6591 // Effective vectorization factor 6592 const unsigned evf = vf * uf; 6593 6594 __ align(CodeEntryAlignment); 6595 6596 StubGenStubId stub_id; 6597 switch (eltype) { 6598 case T_BOOLEAN: 6599 stub_id = StubGenStubId::large_arrays_hashcode_boolean_id; 6600 break; 6601 case T_BYTE: 6602 stub_id = StubGenStubId::large_arrays_hashcode_byte_id; 6603 break; 6604 case T_CHAR: 6605 stub_id = StubGenStubId::large_arrays_hashcode_char_id; 6606 break; 6607 case T_SHORT: 6608 stub_id = StubGenStubId::large_arrays_hashcode_short_id; 6609 break; 6610 case T_INT: 6611 stub_id = StubGenStubId::large_arrays_hashcode_int_id; 6612 break; 6613 default: 6614 stub_id = StubGenStubId::NO_STUBID; 6615 ShouldNotReachHere(); 6616 }; 6617 6618 StubCodeMark mark(this, stub_id); 6619 6620 address entry = __ pc(); 6621 __ enter(); 6622 6623 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 6624 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 6625 // value shouldn't change throughout both loops. 6626 __ movw(rscratch1, intpow(31U, 3)); 6627 __ mov(vpow, Assembler::S, 0, rscratch1); 6628 __ movw(rscratch1, intpow(31U, 2)); 6629 __ mov(vpow, Assembler::S, 1, rscratch1); 6630 __ movw(rscratch1, intpow(31U, 1)); 6631 __ mov(vpow, Assembler::S, 2, rscratch1); 6632 __ movw(rscratch1, intpow(31U, 0)); 6633 __ mov(vpow, Assembler::S, 3, rscratch1); 6634 6635 __ mov(vmul0, Assembler::T16B, 0); 6636 __ mov(vmul0, Assembler::S, 3, result); 6637 6638 __ andr(rscratch2, cnt, (uf - 1) * vf); 6639 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 6640 6641 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf)); 6642 __ mov(vpowm, Assembler::S, 0, rscratch1); 6643 6644 // SMALL LOOP 6645 __ bind(SMALL_LOOP); 6646 6647 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype)))); 6648 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 6649 __ subsw(rscratch2, rscratch2, vf); 6650 6651 if (load_arrangement == Assembler::T8B) { 6652 // Extend 8B to 8H to be able to use vector multiply 6653 // instructions 6654 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 6655 if (is_signed_subword_type(eltype)) { 6656 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6657 } else { 6658 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6659 } 6660 } 6661 6662 switch (load_arrangement) { 6663 case Assembler::T4S: 6664 __ addv(vmul0, load_arrangement, vmul0, vdata0); 6665 break; 6666 case Assembler::T8B: 6667 case Assembler::T8H: 6668 assert(is_subword_type(eltype), "subword type expected"); 6669 if (is_signed_subword_type(eltype)) { 6670 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6671 } else { 6672 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6673 } 6674 break; 6675 default: 6676 __ should_not_reach_here(); 6677 } 6678 6679 // Process the upper half of a vector 6680 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 6681 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 6682 if (is_signed_subword_type(eltype)) { 6683 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6684 } else { 6685 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6686 } 6687 } 6688 6689 __ br(Assembler::HI, SMALL_LOOP); 6690 6691 // SMALL LOOP'S EPILOGUE 6692 __ lsr(rscratch2, cnt, exact_log2(evf)); 6693 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER); 6694 6695 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 6696 __ addv(vmul0, Assembler::T4S, vmul0); 6697 __ umov(result, vmul0, Assembler::S, 0); 6698 6699 // TAIL 6700 __ bind(TAIL); 6701 6702 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs 6703 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
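// In Java terms the computed branch below implements roughly (sketch; names are
// illustrative, 0x1f is the hash multiplier 31):
//
//   int tail = cnt & (vf - 1);
//   for (int i = 0; i < tail; i++) {
//       result = 31 * result + ary[p + i];   // p = current read position
//   }
//
// by jumping into the unrolled load + maddw sequence so that exactly cnt % vf of the
// vf - 1 emitted pairs execute before falling through to BR_BASE.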
6704 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 6705 __ andr(rscratch2, cnt, vf - 1); 6706 __ bind(TAIL_SHORTCUT); 6707 __ adr(rscratch1, BR_BASE); 6708 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3); 6709 __ movw(rscratch2, 0x1f); 6710 __ br(rscratch1); 6711 6712 for (size_t i = 0; i < vf - 1; ++i) { 6713 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 6714 eltype); 6715 __ maddw(result, result, rscratch2, rscratch1); 6716 } 6717 __ bind(BR_BASE); 6718 6719 __ leave(); 6720 __ ret(lr); 6721 6722 // LARGE LOOP 6723 __ bind(LARGE_LOOP_PREHEADER); 6724 6725 __ lsr(rscratch2, cnt, exact_log2(evf)); 6726 6727 if (multiply_by_halves) { 6728 // 31^4 - multiplier between lower and upper parts of a register 6729 __ movw(rscratch1, intpow(31U, vf / 2)); 6730 __ mov(vpowm, Assembler::S, 1, rscratch1); 6731 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 6732 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 6733 __ mov(vpowm, Assembler::S, 0, rscratch1); 6734 } else { 6735 // 31^16 6736 __ movw(rscratch1, intpow(31U, evf)); 6737 __ mov(vpowm, Assembler::S, 0, rscratch1); 6738 } 6739 6740 __ mov(vmul3, Assembler::T16B, 0); 6741 __ mov(vmul2, Assembler::T16B, 0); 6742 __ mov(vmul1, Assembler::T16B, 0); 6743 6744 __ bind(LARGE_LOOP); 6745 6746 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 6747 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 6748 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 6749 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 6750 6751 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 6752 Address(__ post(ary, evf * type2aelembytes(eltype)))); 6753 6754 if (load_arrangement == Assembler::T8B) { 6755 // Extend 8B to 8H to be able to use vector multiply 6756 // instructions 6757 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 6758 if (is_signed_subword_type(eltype)) { 6759 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 6760 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 6761 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 6762 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6763 } else { 6764 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 6765 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 6766 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 6767 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6768 } 6769 } 6770 6771 switch (load_arrangement) { 6772 case Assembler::T4S: 6773 __ addv(vmul3, load_arrangement, vmul3, vdata3); 6774 __ addv(vmul2, load_arrangement, vmul2, vdata2); 6775 __ addv(vmul1, load_arrangement, vmul1, vdata1); 6776 __ addv(vmul0, load_arrangement, vmul0, vdata0); 6777 break; 6778 case Assembler::T8B: 6779 case Assembler::T8H: 6780 assert(is_subword_type(eltype), "subword type expected"); 6781 if (is_signed_subword_type(eltype)) { 6782 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 6783 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 6784 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 6785 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6786 } else { 6787 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 6788 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 6789 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 6790 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6791 } 6792 break; 6793 default: 6794 __ should_not_reach_here(); 
6795 } 6796 6797 // Process the upper half of a vector 6798 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 6799 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 6800 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 6801 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 6802 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 6803 if (is_signed_subword_type(eltype)) { 6804 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 6805 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 6806 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 6807 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6808 } else { 6809 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 6810 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 6811 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 6812 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6813 } 6814 } 6815 6816 __ subsw(rscratch2, rscratch2, 1); 6817 __ br(Assembler::HI, LARGE_LOOP); 6818 6819 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 6820 __ addv(vmul3, Assembler::T4S, vmul3); 6821 __ umov(result, vmul3, Assembler::S, 0); 6822 6823 __ mov(rscratch2, intpow(31U, vf)); 6824 6825 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 6826 __ addv(vmul2, Assembler::T4S, vmul2); 6827 __ umov(rscratch1, vmul2, Assembler::S, 0); 6828 __ maddw(result, result, rscratch2, rscratch1); 6829 6830 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 6831 __ addv(vmul1, Assembler::T4S, vmul1); 6832 __ umov(rscratch1, vmul1, Assembler::S, 0); 6833 __ maddw(result, result, rscratch2, rscratch1); 6834 6835 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 6836 __ addv(vmul0, Assembler::T4S, vmul0); 6837 __ umov(rscratch1, vmul0, Assembler::S, 0); 6838 __ maddw(result, result, rscratch2, rscratch1); 6839 6840 __ andr(rscratch2, cnt, vf - 1); 6841 __ cbnz(rscratch2, TAIL_SHORTCUT); 6842 6843 __ leave(); 6844 __ ret(lr); 6845 6846 return entry; 6847 } 6848 6849 address generate_dsin_dcos(bool isCos) { 6850 __ align(CodeEntryAlignment); 6851 StubGenStubId stub_id = (isCos ? 
StubGenStubId::dcos_id : StubGenStubId::dsin_id); 6852 StubCodeMark mark(this, stub_id); 6853 address start = __ pc(); 6854 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 6855 (address)StubRoutines::aarch64::_two_over_pi, 6856 (address)StubRoutines::aarch64::_pio2, 6857 (address)StubRoutines::aarch64::_dsin_coef, 6858 (address)StubRoutines::aarch64::_dcos_coef); 6859 return start; 6860 } 6861 6862 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 6863 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 6864 Label &DIFF2) { 6865 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 6866 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 6867 6868 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 6869 __ ldr(tmpU, Address(__ post(cnt1, 8))); 6870 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 6871 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 6872 6873 __ fmovd(tmpL, vtmp3); 6874 __ eor(rscratch2, tmp3, tmpL); 6875 __ cbnz(rscratch2, DIFF2); 6876 6877 __ ldr(tmp3, Address(__ post(cnt1, 8))); 6878 __ umov(tmpL, vtmp3, __ D, 1); 6879 __ eor(rscratch2, tmpU, tmpL); 6880 __ cbnz(rscratch2, DIFF1); 6881 6882 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 6883 __ ldr(tmpU, Address(__ post(cnt1, 8))); 6884 __ fmovd(tmpL, vtmp); 6885 __ eor(rscratch2, tmp3, tmpL); 6886 __ cbnz(rscratch2, DIFF2); 6887 6888 __ ldr(tmp3, Address(__ post(cnt1, 8))); 6889 __ umov(tmpL, vtmp, __ D, 1); 6890 __ eor(rscratch2, tmpU, tmpL); 6891 __ cbnz(rscratch2, DIFF1); 6892 } 6893 6894 // r0 = result 6895 // r1 = str1 6896 // r2 = cnt1 6897 // r3 = str2 6898 // r4 = cnt2 6899 // r10 = tmp1 6900 // r11 = tmp2 6901 address generate_compare_long_string_different_encoding(bool isLU) { 6902 __ align(CodeEntryAlignment); 6903 StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id); 6904 StubCodeMark mark(this, stub_id); 6905 address entry = __ pc(); 6906 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 6907 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 6908 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 6909 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 6910 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 6911 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 6912 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 6913 6914 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 6915 6916 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 6917 // cnt2 == amount of characters left to compare 6918 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 6919 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 6920 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 6921 __ add(str2, str2, isLU ? wordSize : wordSize/2); 6922 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 6923 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 6924 __ eor(rscratch2, tmp1, tmp2); 6925 __ mov(rscratch1, tmp2); 6926 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 6927 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 6928 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 6929 __ push(spilled_regs, sp); 6930 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 6931 __ mov(cnt1, isLU ? 
str2 : str1); // init the pointer to U next load 6932 6933 __ ldr(tmp3, Address(__ post(cnt1, 8))); 6934 6935 if (SoftwarePrefetchHintDistance >= 0) { 6936 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 6937 __ br(__ LT, NO_PREFETCH); 6938 __ bind(LARGE_LOOP_PREFETCH); 6939 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 6940 __ mov(tmp4, 2); 6941 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 6942 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 6943 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 6944 __ subs(tmp4, tmp4, 1); 6945 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 6946 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 6947 __ mov(tmp4, 2); 6948 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 6949 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 6950 __ subs(tmp4, tmp4, 1); 6951 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 6952 __ sub(cnt2, cnt2, 64); 6953 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 6954 __ br(__ GE, LARGE_LOOP_PREFETCH); 6955 } 6956 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 6957 __ bind(NO_PREFETCH); 6958 __ subs(cnt2, cnt2, 16); 6959 __ br(__ LT, TAIL); 6960 __ align(OptoLoopAlignment); 6961 __ bind(SMALL_LOOP); // smaller loop 6962 __ subs(cnt2, cnt2, 16); 6963 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 6964 __ br(__ GE, SMALL_LOOP); 6965 __ cmn(cnt2, (u1)16); 6966 __ br(__ EQ, LOAD_LAST); 6967 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 6968 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 6969 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 6970 __ ldr(tmp3, Address(cnt1, -8)); 6971 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 6972 __ b(LOAD_LAST); 6973 __ bind(DIFF2); 6974 __ mov(tmpU, tmp3); 6975 __ bind(DIFF1); 6976 __ pop(spilled_regs, sp); 6977 __ b(CALCULATE_DIFFERENCE); 6978 __ bind(LOAD_LAST); 6979 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 6980 // No need to load it again 6981 __ mov(tmpU, tmp3); 6982 __ pop(spilled_regs, sp); 6983 6984 // tmp2 points to the address of the last 4 Latin1 characters right now 6985 __ ldrs(vtmp, Address(tmp2)); 6986 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 6987 __ fmovd(tmpL, vtmp); 6988 6989 __ eor(rscratch2, tmpU, tmpL); 6990 __ cbz(rscratch2, DONE); 6991 6992 // Find the first different characters in the longwords and 6993 // compute their difference. 
6994 __ bind(CALCULATE_DIFFERENCE); 6995 __ rev(rscratch2, rscratch2); 6996 __ clz(rscratch2, rscratch2); 6997 __ andr(rscratch2, rscratch2, -16); 6998 __ lsrv(tmp1, tmp1, rscratch2); 6999 __ uxthw(tmp1, tmp1); 7000 __ lsrv(rscratch1, rscratch1, rscratch2); 7001 __ uxthw(rscratch1, rscratch1); 7002 __ subw(result, tmp1, rscratch1); 7003 __ bind(DONE); 7004 __ ret(lr); 7005 return entry; 7006 } 7007 7008 // r0 = input (float16) 7009 // v0 = result (float) 7010 // v1 = temporary float register 7011 address generate_float16ToFloat() { 7012 __ align(CodeEntryAlignment); 7013 StubGenStubId stub_id = StubGenStubId::hf2f_id; 7014 StubCodeMark mark(this, stub_id); 7015 address entry = __ pc(); 7016 BLOCK_COMMENT("Entry:"); 7017 __ flt16_to_flt(v0, r0, v1); 7018 __ ret(lr); 7019 return entry; 7020 } 7021 7022 // v0 = input (float) 7023 // r0 = result (float16) 7024 // v1 = temporary float register 7025 address generate_floatToFloat16() { 7026 __ align(CodeEntryAlignment); 7027 StubGenStubId stub_id = StubGenStubId::f2hf_id; 7028 StubCodeMark mark(this, stub_id); 7029 address entry = __ pc(); 7030 BLOCK_COMMENT("Entry:"); 7031 __ flt_to_flt16(r0, v0, v1); 7032 __ ret(lr); 7033 return entry; 7034 } 7035 7036 address generate_method_entry_barrier() { 7037 __ align(CodeEntryAlignment); 7038 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 7039 StubCodeMark mark(this, stub_id); 7040 7041 Label deoptimize_label; 7042 7043 address start = __ pc(); 7044 7045 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 7046 7047 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 7048 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 7049 // We can get here despite the nmethod being good, if we have not 7050 // yet applied our cross modification fence (or data fence). 7051 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 7052 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 7053 __ ldrw(rscratch2, rscratch2); 7054 __ strw(rscratch2, thread_epoch_addr); 7055 __ isb(); 7056 __ membar(__ LoadLoad); 7057 } 7058 7059 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 7060 7061 __ enter(); 7062 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 7063 7064 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 7065 7066 __ push_call_clobbered_registers(); 7067 7068 __ mov(c_rarg0, rscratch2); 7069 __ call_VM_leaf 7070 (CAST_FROM_FN_PTR 7071 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 7072 7073 __ reset_last_Java_frame(true); 7074 7075 __ mov(rscratch1, r0); 7076 7077 __ pop_call_clobbered_registers(); 7078 7079 __ cbnz(rscratch1, deoptimize_label); 7080 7081 __ leave(); 7082 __ ret(lr); 7083 7084 __ BIND(deoptimize_label); 7085 7086 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 7087 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 7088 7089 __ mov(sp, rscratch1); 7090 __ br(rscratch2); 7091 7092 return start; 7093 } 7094 7095 // r0 = result 7096 // r1 = str1 7097 // r2 = cnt1 7098 // r3 = str2 7099 // r4 = cnt2 7100 // r10 = tmp1 7101 // r11 = tmp2 7102 address generate_compare_long_string_same_encoding(bool isLL) { 7103 __ align(CodeEntryAlignment); 7104 StubGenStubId stub_id = (isLL ? 
StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id); 7105 StubCodeMark mark(this, stub_id); 7106 address entry = __ pc(); 7107 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 7108 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 7109 7110 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 7111 7112 // exit from large loop when less than 64 bytes left to read or we're about 7113 // to prefetch memory behind array border 7114 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 7115 7116 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 7117 __ eor(rscratch2, tmp1, tmp2); 7118 __ cbnz(rscratch2, CAL_DIFFERENCE); 7119 7120 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 7121 // update pointers, because of previous read 7122 __ add(str1, str1, wordSize); 7123 __ add(str2, str2, wordSize); 7124 if (SoftwarePrefetchHintDistance >= 0) { 7125 __ align(OptoLoopAlignment); 7126 __ bind(LARGE_LOOP_PREFETCH); 7127 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 7128 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 7129 7130 for (int i = 0; i < 4; i++) { 7131 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 7132 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 7133 __ cmp(tmp1, tmp2); 7134 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 7135 __ br(Assembler::NE, DIFF); 7136 } 7137 __ sub(cnt2, cnt2, isLL ? 64 : 32); 7138 __ add(str1, str1, 64); 7139 __ add(str2, str2, 64); 7140 __ subs(rscratch2, cnt2, largeLoopExitCondition); 7141 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 7142 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 7143 } 7144 7145 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 7146 __ br(Assembler::LE, LESS16); 7147 __ align(OptoLoopAlignment); 7148 __ bind(LOOP_COMPARE16); 7149 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 7150 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 7151 __ cmp(tmp1, tmp2); 7152 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 7153 __ br(Assembler::NE, DIFF); 7154 __ sub(cnt2, cnt2, isLL ? 16 : 8); 7155 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 7156 __ br(Assembler::LT, LESS16); 7157 7158 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 7159 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 7160 __ cmp(tmp1, tmp2); 7161 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 7162 __ br(Assembler::NE, DIFF); 7163 __ sub(cnt2, cnt2, isLL ? 16 : 8); 7164 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 7165 __ br(Assembler::GE, LOOP_COMPARE16); 7166 __ cbz(cnt2, LENGTH_DIFF); 7167 7168 __ bind(LESS16); 7169 // each 8 compare 7170 __ subs(cnt2, cnt2, isLL ? 8 : 4); 7171 __ br(Assembler::LE, LESS8); 7172 __ ldr(tmp1, Address(__ post(str1, 8))); 7173 __ ldr(tmp2, Address(__ post(str2, 8))); 7174 __ eor(rscratch2, tmp1, tmp2); 7175 __ cbnz(rscratch2, CAL_DIFFERENCE); 7176 __ sub(cnt2, cnt2, isLL ? 
8 : 4); 7177 7178 __ bind(LESS8); // directly load last 8 bytes 7179 if (!isLL) { 7180 __ add(cnt2, cnt2, cnt2); 7181 } 7182 __ ldr(tmp1, Address(str1, cnt2)); 7183 __ ldr(tmp2, Address(str2, cnt2)); 7184 __ eor(rscratch2, tmp1, tmp2); 7185 __ cbz(rscratch2, LENGTH_DIFF); 7186 __ b(CAL_DIFFERENCE); 7187 7188 __ bind(DIFF); 7189 __ cmp(tmp1, tmp2); 7190 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 7191 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 7192 // reuse rscratch2 register for the result of eor instruction 7193 __ eor(rscratch2, tmp1, tmp2); 7194 7195 __ bind(CAL_DIFFERENCE); 7196 __ rev(rscratch2, rscratch2); 7197 __ clz(rscratch2, rscratch2); 7198 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 7199 __ lsrv(tmp1, tmp1, rscratch2); 7200 __ lsrv(tmp2, tmp2, rscratch2); 7201 if (isLL) { 7202 __ uxtbw(tmp1, tmp1); 7203 __ uxtbw(tmp2, tmp2); 7204 } else { 7205 __ uxthw(tmp1, tmp1); 7206 __ uxthw(tmp2, tmp2); 7207 } 7208 __ subw(result, tmp1, tmp2); 7209 7210 __ bind(LENGTH_DIFF); 7211 __ ret(lr); 7212 return entry; 7213 } 7214 7215 enum string_compare_mode { 7216 LL, 7217 LU, 7218 UL, 7219 UU, 7220 }; 7221 7222 // The following registers are declared in aarch64.ad 7223 // r0 = result 7224 // r1 = str1 7225 // r2 = cnt1 7226 // r3 = str2 7227 // r4 = cnt2 7228 // r10 = tmp1 7229 // r11 = tmp2 7230 // z0 = ztmp1 7231 // z1 = ztmp2 7232 // p0 = pgtmp1 7233 // p1 = pgtmp2 7234 address generate_compare_long_string_sve(string_compare_mode mode) { 7235 StubGenStubId stub_id; 7236 switch (mode) { 7237 case LL: stub_id = StubGenStubId::compare_long_string_LL_id; break; 7238 case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break; 7239 case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break; 7240 case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break; 7241 default: ShouldNotReachHere(); 7242 } 7243 7244 __ align(CodeEntryAlignment); 7245 address entry = __ pc(); 7246 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 7247 tmp1 = r10, tmp2 = r11; 7248 7249 Label LOOP, DONE, MISMATCH; 7250 Register vec_len = tmp1; 7251 Register idx = tmp2; 7252 // The minimum of the string lengths has been stored in cnt2. 7253 Register cnt = cnt2; 7254 FloatRegister ztmp1 = z0, ztmp2 = z1; 7255 PRegister pgtmp1 = p0, pgtmp2 = p1; 7256 7257 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 7258 switch (mode) { \ 7259 case LL: \ 7260 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 7261 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 7262 break; \ 7263 case LU: \ 7264 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 7265 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 7266 break; \ 7267 case UL: \ 7268 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 7269 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 7270 break; \ 7271 case UU: \ 7272 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 7273 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 7274 break; \ 7275 default: \ 7276 ShouldNotReachHere(); \ 7277 } 7278 7279 StubCodeMark mark(this, stub_id); 7280 7281 __ mov(idx, 0); 7282 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 7283 7284 if (mode == LL) { 7285 __ sve_cntb(vec_len); 7286 } else { 7287 __ sve_cnth(vec_len); 7288 } 7289 7290 __ sub(rscratch1, cnt, vec_len); 7291 7292 __ bind(LOOP); 7293 7294 // main loop 7295 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 7296 __ add(idx, idx, vec_len); 7297 // Compare strings. 
7298 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 7299 __ br(__ NE, MISMATCH); 7300 __ cmp(idx, rscratch1); 7301 __ br(__ LT, LOOP); 7302 7303 // post loop, last iteration 7304 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 7305 7306 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 7307 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 7308 __ br(__ EQ, DONE); 7309 7310 __ bind(MISMATCH); 7311 7312 // Crop the vector to find its location. 7313 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 7314 // Extract the first different characters of each string. 7315 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 7316 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 7317 7318 // Compute the difference of the first different characters. 7319 __ sub(result, rscratch1, rscratch2); 7320 7321 __ bind(DONE); 7322 __ ret(lr); 7323 #undef LOAD_PAIR 7324 return entry; 7325 } 7326 7327 void generate_compare_long_strings() { 7328 if (UseSVE == 0) { 7329 StubRoutines::aarch64::_compare_long_string_LL 7330 = generate_compare_long_string_same_encoding(true); 7331 StubRoutines::aarch64::_compare_long_string_UU 7332 = generate_compare_long_string_same_encoding(false); 7333 StubRoutines::aarch64::_compare_long_string_LU 7334 = generate_compare_long_string_different_encoding(true); 7335 StubRoutines::aarch64::_compare_long_string_UL 7336 = generate_compare_long_string_different_encoding(false); 7337 } else { 7338 StubRoutines::aarch64::_compare_long_string_LL 7339 = generate_compare_long_string_sve(LL); 7340 StubRoutines::aarch64::_compare_long_string_UU 7341 = generate_compare_long_string_sve(UU); 7342 StubRoutines::aarch64::_compare_long_string_LU 7343 = generate_compare_long_string_sve(LU); 7344 StubRoutines::aarch64::_compare_long_string_UL 7345 = generate_compare_long_string_sve(UL); 7346 } 7347 } 7348 7349 // R0 = result 7350 // R1 = str2 7351 // R2 = cnt1 7352 // R3 = str1 7353 // R4 = cnt2 7354 // Clobbers: rscratch1, rscratch2, v0, v1, rflags 7355 // 7356 // This generic linear code use few additional ideas, which makes it faster: 7357 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 7358 // in order to skip initial loading(help in systems with 1 ld pipeline) 7359 // 2) we can use "fast" algorithm of finding single character to search for 7360 // first symbol with less branches(1 branch per each loaded register instead 7361 // of branch for each symbol), so, this is where constants like 7362 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 7363 // 3) after loading and analyzing 1st register of source string, it can be 7364 // used to search for every 1st character entry, saving few loads in 7365 // comparison with "simplier-but-slower" implementation 7366 // 4) in order to avoid lots of push/pop operations, code below is heavily 7367 // re-using/re-initializing/compressing register values, which makes code 7368 // larger and a bit less readable, however, most of extra operations are 7369 // issued during loads or branches, so, penalty is minimal 7370 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 7371 StubGenStubId stub_id; 7372 if (str1_isL) { 7373 if (str2_isL) { 7374 stub_id = StubGenStubId::string_indexof_linear_ll_id; 7375 } else { 7376 stub_id = StubGenStubId::string_indexof_linear_ul_id; 7377 } 7378 } else { 7379 if (str2_isL) { 7380 ShouldNotReachHere(); 7381 } else { 7382 stub_id = 
StubGenStubId::string_indexof_linear_uu_id; 7383 } 7384 } 7385 __ align(CodeEntryAlignment); 7386 StubCodeMark mark(this, stub_id); 7387 address entry = __ pc(); 7388 7389 int str1_chr_size = str1_isL ? 1 : 2; 7390 int str2_chr_size = str2_isL ? 1 : 2; 7391 int str1_chr_shift = str1_isL ? 0 : 1; 7392 int str2_chr_shift = str2_isL ? 0 : 1; 7393 bool isL = str1_isL && str2_isL; 7394 // parameters 7395 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 7396 // temporary registers 7397 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 7398 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 7399 // redefinitions 7400 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 7401 7402 __ push(spilled_regs, sp); 7403 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 7404 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 7405 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 7406 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 7407 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 7408 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 7409 // Read whole register from str1. It is safe, because length >=8 here 7410 __ ldr(ch1, Address(str1)); 7411 // Read whole register from str2. It is safe, because length >=8 here 7412 __ ldr(ch2, Address(str2)); 7413 __ sub(cnt2, cnt2, cnt1); 7414 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 7415 if (str1_isL != str2_isL) { 7416 __ eor(v0, __ T16B, v0, v0); 7417 } 7418 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 7419 __ mul(first, first, tmp1); 7420 // check if we have less than 1 register to check 7421 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 7422 if (str1_isL != str2_isL) { 7423 __ fmovd(v1, ch1); 7424 } 7425 __ br(__ LE, L_SMALL); 7426 __ eor(ch2, first, ch2); 7427 if (str1_isL != str2_isL) { 7428 __ zip1(v1, __ T16B, v1, v0); 7429 } 7430 __ sub(tmp2, ch2, tmp1); 7431 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7432 __ bics(tmp2, tmp2, ch2); 7433 if (str1_isL != str2_isL) { 7434 __ fmovd(ch1, v1); 7435 } 7436 __ br(__ NE, L_HAS_ZERO); 7437 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 7438 __ add(result, result, wordSize/str2_chr_size); 7439 __ add(str2, str2, wordSize); 7440 __ br(__ LT, L_POST_LOOP); 7441 __ BIND(L_LOOP); 7442 __ ldr(ch2, Address(str2)); 7443 __ eor(ch2, first, ch2); 7444 __ sub(tmp2, ch2, tmp1); 7445 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7446 __ bics(tmp2, tmp2, ch2); 7447 __ br(__ NE, L_HAS_ZERO); 7448 __ BIND(L_LOOP_PROCEED); 7449 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 7450 __ add(str2, str2, wordSize); 7451 __ add(result, result, wordSize/str2_chr_size); 7452 __ br(__ GE, L_LOOP); 7453 __ BIND(L_POST_LOOP); 7454 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 7455 __ br(__ LE, NOMATCH); 7456 __ ldr(ch2, Address(str2)); 7457 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 7458 __ eor(ch2, first, ch2); 7459 __ sub(tmp2, ch2, tmp1); 7460 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7461 __ mov(tmp4, -1); // all bits set 7462 __ b(L_SMALL_PROCEED); 7463 __ align(OptoLoopAlignment); 7464 __ BIND(L_SMALL); 7465 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 7466 __ eor(ch2, first, ch2); 7467 if (str1_isL != str2_isL) { 7468 __ zip1(v1, __ T16B, v1, v0); 7469 } 7470 __ sub(tmp2, ch2, tmp1); 7471 __ mov(tmp4, -1); // all bits set 7472 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7473 if (str1_isL != str2_isL) { 7474 __ fmovd(ch1, v1); // move converted 4 symbols 7475 } 7476 __ BIND(L_SMALL_PROCEED); 7477 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 7478 __ bic(tmp2, tmp2, ch2); 7479 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 7480 __ rbit(tmp2, tmp2); 7481 __ br(__ EQ, NOMATCH); 7482 __ BIND(L_SMALL_HAS_ZERO_LOOP); 7483 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 7484 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 7485 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 7486 if (str2_isL) { // LL 7487 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 7488 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 7489 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 7490 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 7491 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7492 } else { 7493 __ mov(ch2, 0xE); // all bits in byte set except last one 7494 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7495 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7496 __ lslv(tmp2, tmp2, tmp4); 7497 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7498 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7499 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7500 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7501 } 7502 __ cmp(ch1, ch2); 7503 __ mov(tmp4, wordSize/str2_chr_size); 7504 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 7505 __ BIND(L_SMALL_CMP_LOOP); 7506 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 7507 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 7508 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 7509 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 7510 __ add(tmp4, tmp4, 1); 7511 __ cmp(tmp4, cnt1); 7512 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 7513 __ cmp(first, ch2); 7514 __ br(__ EQ, L_SMALL_CMP_LOOP); 7515 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 7516 __ cbz(tmp2, NOMATCH); // no more matches. exit 7517 __ clz(tmp4, tmp2); 7518 __ add(result, result, 1); // advance index 7519 __ add(str2, str2, str2_chr_size); // advance pointer 7520 __ b(L_SMALL_HAS_ZERO_LOOP); 7521 __ align(OptoLoopAlignment); 7522 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 7523 __ cmp(first, ch2); 7524 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 7525 __ b(DONE); 7526 __ align(OptoLoopAlignment); 7527 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 7528 if (str2_isL) { // LL 7529 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 7530 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 7531 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 7532 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 7533 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7534 } else { 7535 __ mov(ch2, 0xE); // all bits in byte set except last one 7536 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7537 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
7538 __ lslv(tmp2, tmp2, tmp4); 7539 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7540 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7541 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7542 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7543 } 7544 __ cmp(ch1, ch2); 7545 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 7546 __ b(DONE); 7547 __ align(OptoLoopAlignment); 7548 __ BIND(L_HAS_ZERO); 7549 __ rbit(tmp2, tmp2); 7550 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 7551 // Now, perform compression of counters(cnt2 and cnt1) into one register. 7552 // It's fine because both counters are 32bit and are not changed in this 7553 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 7554 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 7555 __ sub(result, result, 1); 7556 __ BIND(L_HAS_ZERO_LOOP); 7557 __ mov(cnt1, wordSize/str2_chr_size); 7558 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 7559 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 7560 if (str2_isL) { 7561 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 7562 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7563 __ lslv(tmp2, tmp2, tmp4); 7564 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7565 __ add(tmp4, tmp4, 1); 7566 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7567 __ lsl(tmp2, tmp2, 1); 7568 __ mov(tmp4, wordSize/str2_chr_size); 7569 } else { 7570 __ mov(ch2, 0xE); 7571 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7572 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7573 __ lslv(tmp2, tmp2, tmp4); 7574 __ add(tmp4, tmp4, 1); 7575 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7576 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 7577 __ lsl(tmp2, tmp2, 1); 7578 __ mov(tmp4, wordSize/str2_chr_size); 7579 __ sub(str2, str2, str2_chr_size); 7580 } 7581 __ cmp(ch1, ch2); 7582 __ mov(tmp4, wordSize/str2_chr_size); 7583 __ br(__ NE, L_CMP_LOOP_NOMATCH); 7584 __ BIND(L_CMP_LOOP); 7585 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 7586 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 7587 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 7588 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 7589 __ add(tmp4, tmp4, 1); 7590 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 7591 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 7592 __ cmp(cnt1, ch2); 7593 __ br(__ EQ, L_CMP_LOOP); 7594 __ BIND(L_CMP_LOOP_NOMATCH); 7595 // here we're not matched 7596 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 7597 __ clz(tmp4, tmp2); 7598 __ add(str2, str2, str2_chr_size); // advance pointer 7599 __ b(L_HAS_ZERO_LOOP); 7600 __ align(OptoLoopAlignment); 7601 __ BIND(L_CMP_LOOP_LAST_CMP); 7602 __ cmp(cnt1, ch2); 7603 __ br(__ NE, L_CMP_LOOP_NOMATCH); 7604 __ b(DONE); 7605 __ align(OptoLoopAlignment); 7606 __ BIND(L_CMP_LOOP_LAST_CMP2); 7607 if (str2_isL) { 7608 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 7609 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
7610 __ lslv(tmp2, tmp2, tmp4); 7611 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7612 __ add(tmp4, tmp4, 1); 7613 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7614 __ lsl(tmp2, tmp2, 1); 7615 } else { 7616 __ mov(ch2, 0xE); 7617 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7618 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7619 __ lslv(tmp2, tmp2, tmp4); 7620 __ add(tmp4, tmp4, 1); 7621 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7622 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 7623 __ lsl(tmp2, tmp2, 1); 7624 __ sub(str2, str2, str2_chr_size); 7625 } 7626 __ cmp(ch1, ch2); 7627 __ br(__ NE, L_CMP_LOOP_NOMATCH); 7628 __ b(DONE); 7629 __ align(OptoLoopAlignment); 7630 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 7631 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 7632 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 7633 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 7634 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 7635 // result by analyzed characters value, so, we can just reset lower bits 7636 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 7637 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 7638 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 7639 // index of last analyzed substring inside current octet. So, str2 in at 7640 // respective start address. We need to advance it to next octet 7641 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 7642 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 7643 __ bfm(result, zr, 0, 2 - str2_chr_shift); 7644 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 7645 __ movw(cnt2, cnt2); 7646 __ b(L_LOOP_PROCEED); 7647 __ align(OptoLoopAlignment); 7648 __ BIND(NOMATCH); 7649 __ mov(result, -1); 7650 __ BIND(DONE); 7651 __ pop(spilled_regs, sp); 7652 __ ret(lr); 7653 return entry; 7654 } 7655 7656 void generate_string_indexof_stubs() { 7657 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 7658 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 7659 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 7660 } 7661 7662 void inflate_and_store_2_fp_registers(bool generatePrfm, 7663 FloatRegister src1, FloatRegister src2) { 7664 Register dst = r1; 7665 __ zip1(v1, __ T16B, src1, v0); 7666 __ zip2(v2, __ T16B, src1, v0); 7667 if (generatePrfm) { 7668 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 7669 } 7670 __ zip1(v3, __ T16B, src2, v0); 7671 __ zip2(v4, __ T16B, src2, v0); 7672 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 7673 } 7674 7675 // R0 = src 7676 // R1 = dst 7677 // R2 = len 7678 // R3 = len >> 3 7679 // V0 = 0 7680 // v1 = loaded 8 bytes 7681 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 7682 address generate_large_byte_array_inflate() { 7683 __ align(CodeEntryAlignment); 7684 StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id; 7685 StubCodeMark mark(this, stub_id); 7686 address entry = __ pc(); 7687 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 7688 Register src = r0, dst = r1, len = r2, octetCounter = r3; 7689 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 7690 7691 // do one more 8-byte read to have address 16-byte aligned in 
most cases 7692 // also use single store instruction 7693 __ ldrd(v2, __ post(src, 8)); 7694 __ sub(octetCounter, octetCounter, 2); 7695 __ zip1(v1, __ T16B, v1, v0); 7696 __ zip1(v2, __ T16B, v2, v0); 7697 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 7698 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 7699 __ subs(rscratch1, octetCounter, large_loop_threshold); 7700 __ br(__ LE, LOOP_START); 7701 __ b(LOOP_PRFM_START); 7702 __ bind(LOOP_PRFM); 7703 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 7704 __ bind(LOOP_PRFM_START); 7705 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 7706 __ sub(octetCounter, octetCounter, 8); 7707 __ subs(rscratch1, octetCounter, large_loop_threshold); 7708 inflate_and_store_2_fp_registers(true, v3, v4); 7709 inflate_and_store_2_fp_registers(true, v5, v6); 7710 __ br(__ GT, LOOP_PRFM); 7711 __ cmp(octetCounter, (u1)8); 7712 __ br(__ LT, DONE); 7713 __ bind(LOOP); 7714 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 7715 __ bind(LOOP_START); 7716 __ sub(octetCounter, octetCounter, 8); 7717 __ cmp(octetCounter, (u1)8); 7718 inflate_and_store_2_fp_registers(false, v3, v4); 7719 inflate_and_store_2_fp_registers(false, v5, v6); 7720 __ br(__ GE, LOOP); 7721 __ bind(DONE); 7722 __ ret(lr); 7723 return entry; 7724 } 7725 7726 /** 7727 * Arguments: 7728 * 7729 * Input: 7730 * c_rarg0 - current state address 7731 * c_rarg1 - H key address 7732 * c_rarg2 - data address 7733 * c_rarg3 - number of blocks 7734 * 7735 * Output: 7736 * Updated state at c_rarg0 7737 */ 7738 address generate_ghash_processBlocks() { 7739 // Bafflingly, GCM uses little-endian for the byte order, but 7740 // big-endian for the bit order. For example, the polynomial 1 is 7741 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 7742 // 7743 // So, we must either reverse the bytes in each word and do 7744 // everything big-endian or reverse the bits in each byte and do 7745 // it little-endian. On AArch64 it's more idiomatic to reverse 7746 // the bits in each byte (we have an instruction, RBIT, to do 7747 // that) and keep the data in little-endian bit order through the 7748 // calculation, bit-reversing the inputs and outputs. 7749 7750 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id; 7751 StubCodeMark mark(this, stub_id); 7752 __ align(wordSize * 2); 7753 address p = __ pc(); 7754 __ emit_int64(0x87); // The low-order bits of the field 7755 // polynomial (i.e. 
p = z^7+z^2+z+1) 7756 // repeated in the low and high parts of a 7757 // 128-bit vector 7758 __ emit_int64(0x87); 7759 7760 __ align(CodeEntryAlignment); 7761 address start = __ pc(); 7762 7763 Register state = c_rarg0; 7764 Register subkeyH = c_rarg1; 7765 Register data = c_rarg2; 7766 Register blocks = c_rarg3; 7767 7768 FloatRegister vzr = v30; 7769 __ eor(vzr, __ T16B, vzr, vzr); // zero register 7770 7771 __ ldrq(v24, p); // The field polynomial 7772 7773 __ ldrq(v0, Address(state)); 7774 __ ldrq(v1, Address(subkeyH)); 7775 7776 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 7777 __ rbit(v0, __ T16B, v0); 7778 __ rev64(v1, __ T16B, v1); 7779 __ rbit(v1, __ T16B, v1); 7780 7781 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 7782 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 7783 7784 { 7785 Label L_ghash_loop; 7786 __ bind(L_ghash_loop); 7787 7788 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 7789 // reversing each byte 7790 __ rbit(v2, __ T16B, v2); 7791 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 7792 7793 // Multiply state in v2 by subkey in v1 7794 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 7795 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 7796 /*temps*/v6, v3, /*reuse/clobber b*/v2); 7797 // Reduce v7:v5 by the field polynomial 7798 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 7799 7800 __ sub(blocks, blocks, 1); 7801 __ cbnz(blocks, L_ghash_loop); 7802 } 7803 7804 // The bit-reversed result is at this point in v0 7805 __ rev64(v0, __ T16B, v0); 7806 __ rbit(v0, __ T16B, v0); 7807 7808 __ st1(v0, __ T16B, state); 7809 __ ret(lr); 7810 7811 return start; 7812 } 7813 7814 address generate_ghash_processBlocks_wide() { 7815 address small = generate_ghash_processBlocks(); 7816 7817 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id; 7818 StubCodeMark mark(this, stub_id); 7819 __ align(wordSize * 2); 7820 address p = __ pc(); 7821 __ emit_int64(0x87); // The low-order bits of the field 7822 // polynomial (i.e. p = z^7+z^2+z+1) 7823 // repeated in the low and high parts of a 7824 // 128-bit vector 7825 __ emit_int64(0x87); 7826 7827 __ align(CodeEntryAlignment); 7828 address start = __ pc(); 7829 7830 Register state = c_rarg0; 7831 Register subkeyH = c_rarg1; 7832 Register data = c_rarg2; 7833 Register blocks = c_rarg3; 7834 7835 const int unroll = 4; 7836 7837 __ cmp(blocks, (unsigned char)(unroll * 2)); 7838 __ br(__ LT, small); 7839 7840 if (unroll > 1) { 7841 // Save state before entering routine 7842 __ sub(sp, sp, 4 * 16); 7843 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 7844 __ sub(sp, sp, 4 * 16); 7845 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 7846 } 7847 7848 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 7849 7850 if (unroll > 1) { 7851 // And restore state 7852 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 7853 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 7854 } 7855 7856 __ cmp(blocks, (unsigned char)0); 7857 __ br(__ GT, small); 7858 7859 __ ret(lr); 7860 7861 return start; 7862 } 7863 7864 void generate_base64_encode_simdround(Register src, Register dst, 7865 FloatRegister codec, u8 size) { 7866 7867 FloatRegister in0 = v4, in1 = v5, in2 = v6; 7868 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 7869 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 7870 7871 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 7872 7873 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 7874 7875 __ ushr(ind0, arrangement, in0, 2); 7876 7877 __ ushr(ind1, arrangement, in1, 2); 7878 __ shl(in0, arrangement, in0, 6); 7879 __ orr(ind1, arrangement, ind1, in0); 7880 __ ushr(ind1, arrangement, ind1, 2); 7881 7882 __ ushr(ind2, arrangement, in2, 4); 7883 __ shl(in1, arrangement, in1, 4); 7884 __ orr(ind2, arrangement, in1, ind2); 7885 __ ushr(ind2, arrangement, ind2, 2); 7886 7887 __ shl(ind3, arrangement, in2, 2); 7888 __ ushr(ind3, arrangement, ind3, 2); 7889 7890 __ tbl(out0, arrangement, codec, 4, ind0); 7891 __ tbl(out1, arrangement, codec, 4, ind1); 7892 __ tbl(out2, arrangement, codec, 4, ind2); 7893 __ tbl(out3, arrangement, codec, 4, ind3); 7894 7895 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 7896 } 7897 7898 /** 7899 * Arguments: 7900 * 7901 * Input: 7902 * c_rarg0 - src_start 7903 * c_rarg1 - src_offset 7904 * c_rarg2 - src_length 7905 * c_rarg3 - dest_start 7906 * c_rarg4 - dest_offset 7907 * c_rarg5 - isURL 7908 * 7909 */ 7910 address generate_base64_encodeBlock() { 7911 7912 static const char toBase64[64] = { 7913 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 7914 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 7915 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 7916 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 7917 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 7918 }; 7919 7920 static const char toBase64URL[64] = { 7921 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 7922 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 7923 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 7924 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 7925 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 7926 }; 7927 7928 __ align(CodeEntryAlignment); 7929 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 7930 StubCodeMark mark(this, stub_id); 7931 address start = __ pc(); 7932 7933 Register src = c_rarg0; // source array 7934 Register soff = c_rarg1; // source start offset 7935 Register send = c_rarg2; // source end offset 7936 Register dst = c_rarg3; // dest array 7937 Register doff = c_rarg4; // position for writing to dest array 7938 Register isURL = c_rarg5; // Base64 or URL character set 7939 7940 // c_rarg6 and c_rarg7 are free to use as temps 7941 Register codec = c_rarg6; 7942 Register length = c_rarg7; 7943 7944 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 7945 7946 __ add(src, src, soff); 7947 __ add(dst, dst, doff); 7948 __ sub(length, send, soff); 7949 7950 // load the codec base address 7951 __ lea(codec, ExternalAddress((address) toBase64)); 7952 __ cbz(isURL, ProcessData); 7953 __ lea(codec, ExternalAddress((address) toBase64URL)); 7954 7955 __ BIND(ProcessData); 7956 7957 // too short to formup a SIMD loop, roll back 7958 __ cmp(length, (u1)24); 7959 __ br(Assembler::LT, Process3B); 7960 7961 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 7962 7963 __ BIND(Process48B); 7964 __ cmp(length, (u1)48); 7965 __ br(Assembler::LT, Process24B); 7966 generate_base64_encode_simdround(src, dst, v0, 16); 7967 __ sub(length, length, 48); 7968 __ b(Process48B); 7969 7970 __ BIND(Process24B); 7971 __ cmp(length, (u1)24); 7972 __ br(Assembler::LT, SIMDExit); 7973 generate_base64_encode_simdround(src, dst, v0, 8); 7974 __ sub(length, length, 24); 7975 7976 __ BIND(SIMDExit); 7977 
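    // Editor's note: fewer than 24 input bytes remain at this point. Any
    // remainder is handled by the scalar Process3B loop below, which packs
    // 3 source bytes into 24 bits and emits 4 codec characters per round,
    // conceptually (sketch only, not the stub itself):
    //
    //   uint32_t bits = (src[0] << 16) | (src[1] << 8) | src[2];
    //   dst[0] = codec[(bits >> 18) & 63];
    //   dst[1] = codec[(bits >> 12) & 63];
    //   dst[2] = codec[(bits >>  6) & 63];
    //   dst[3] = codec[ bits        & 63];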
__ cbz(length, Exit); 7978 7979 __ BIND(Process3B); 7980 // 3 src bytes, 24 bits 7981 __ ldrb(r10, __ post(src, 1)); 7982 __ ldrb(r11, __ post(src, 1)); 7983 __ ldrb(r12, __ post(src, 1)); 7984 __ orrw(r11, r11, r10, Assembler::LSL, 8); 7985 __ orrw(r12, r12, r11, Assembler::LSL, 8); 7986 // codec index 7987 __ ubfmw(r15, r12, 18, 23); 7988 __ ubfmw(r14, r12, 12, 17); 7989 __ ubfmw(r13, r12, 6, 11); 7990 __ andw(r12, r12, 63); 7991 // get the code based on the codec 7992 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 7993 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 7994 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 7995 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 7996 __ strb(r15, __ post(dst, 1)); 7997 __ strb(r14, __ post(dst, 1)); 7998 __ strb(r13, __ post(dst, 1)); 7999 __ strb(r12, __ post(dst, 1)); 8000 __ sub(length, length, 3); 8001 __ cbnz(length, Process3B); 8002 8003 __ BIND(Exit); 8004 __ ret(lr); 8005 8006 return start; 8007 } 8008 8009 void generate_base64_decode_simdround(Register src, Register dst, 8010 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 8011 8012 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 8013 FloatRegister out0 = v20, out1 = v21, out2 = v22; 8014 8015 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 8016 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 8017 8018 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 8019 8020 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 8021 8022 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 8023 8024 // we need unsigned saturating subtract, to make sure all input values 8025 // in range [0, 63] will have 0U value in the higher half lookup 8026 __ uqsubv(decH0, __ T16B, in0, v27); 8027 __ uqsubv(decH1, __ T16B, in1, v27); 8028 __ uqsubv(decH2, __ T16B, in2, v27); 8029 __ uqsubv(decH3, __ T16B, in3, v27); 8030 8031 // lower half lookup 8032 __ tbl(decL0, arrangement, codecL, 4, in0); 8033 __ tbl(decL1, arrangement, codecL, 4, in1); 8034 __ tbl(decL2, arrangement, codecL, 4, in2); 8035 __ tbl(decL3, arrangement, codecL, 4, in3); 8036 8037 // higher half lookup 8038 __ tbx(decH0, arrangement, codecH, 4, decH0); 8039 __ tbx(decH1, arrangement, codecH, 4, decH1); 8040 __ tbx(decH2, arrangement, codecH, 4, decH2); 8041 __ tbx(decH3, arrangement, codecH, 4, decH3); 8042 8043 // combine lower and higher 8044 __ orr(decL0, arrangement, decL0, decH0); 8045 __ orr(decL1, arrangement, decL1, decH1); 8046 __ orr(decL2, arrangement, decL2, decH2); 8047 __ orr(decL3, arrangement, decL3, decH3); 8048 8049 // check illegal inputs, value larger than 63 (maximum of 6 bits) 8050 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 8051 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 8052 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 8053 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 8054 __ orr(in0, arrangement, decH0, decH1); 8055 __ orr(in1, arrangement, decH2, decH3); 8056 __ orr(in2, arrangement, in0, in1); 8057 __ umaxv(in3, arrangement, in2); 8058 __ umov(rscratch2, in3, __ B, 0); 8059 8060 // get the data to output 8061 __ shl(out0, arrangement, decL0, 2); 8062 __ ushr(out1, arrangement, decL1, 4); 8063 __ orr(out0, arrangement, out0, out1); 8064 __ shl(out1, arrangement, decL1, 4); 8065 __ ushr(out2, arrangement, decL2, 2); 8066 __ orr(out1, arrangement, out1, out2); 8067 __ shl(out2, arrangement, decL2, 6); 8068 __ orr(out2, arrangement, out2, decL3); 8069 8070 __ 
cbz(rscratch2, NoIllegalData); 8071 8072 // handle illegal input 8073 __ umov(r10, in2, __ D, 0); 8074 if (size == 16) { 8075 __ cbnz(r10, ErrorInLowerHalf); 8076 8077 // illegal input is in higher half, store the lower half now. 8078 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 8079 8080 __ umov(r10, in2, __ D, 1); 8081 __ umov(r11, out0, __ D, 1); 8082 __ umov(r12, out1, __ D, 1); 8083 __ umov(r13, out2, __ D, 1); 8084 __ b(StoreLegalData); 8085 8086 __ BIND(ErrorInLowerHalf); 8087 } 8088 __ umov(r11, out0, __ D, 0); 8089 __ umov(r12, out1, __ D, 0); 8090 __ umov(r13, out2, __ D, 0); 8091 8092 __ BIND(StoreLegalData); 8093 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 8094 __ strb(r11, __ post(dst, 1)); 8095 __ strb(r12, __ post(dst, 1)); 8096 __ strb(r13, __ post(dst, 1)); 8097 __ lsr(r10, r10, 8); 8098 __ lsr(r11, r11, 8); 8099 __ lsr(r12, r12, 8); 8100 __ lsr(r13, r13, 8); 8101 __ b(StoreLegalData); 8102 8103 __ BIND(NoIllegalData); 8104 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 8105 } 8106 8107 8108 /** 8109 * Arguments: 8110 * 8111 * Input: 8112 * c_rarg0 - src_start 8113 * c_rarg1 - src_offset 8114 * c_rarg2 - src_length 8115 * c_rarg3 - dest_start 8116 * c_rarg4 - dest_offset 8117 * c_rarg5 - isURL 8118 * c_rarg6 - isMIME 8119 * 8120 */ 8121 address generate_base64_decodeBlock() { 8122 8123 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 8124 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 8125 // titled "Base64 decoding". 8126 8127 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 8128 // except the trailing character '=' is also treated illegal value in this intrinsic. That 8129 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
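    // Editor's note: conceptually each table below is the inverse of the
    // corresponding encode alphabet (toBase64 / toBase64URL above), with every
    // non-alphabet byte mapped to 255u. A sketch of how an equivalent table
    // could be computed (illustrative only; the constants below are what the
    // stub actually uses):
    //
    //   uint8_t tbl[256];
    //   memset(tbl, 255, sizeof(tbl));              // 255u == illegal input
    //   for (int i = 0; i < 64; i++) {
    //     tbl[(uint8_t)toBase64[i]] = (uint8_t)i;   // or toBase64URL
    //   }
    //   // unlike java.util.Base64.fromBase64, '=' stays at 255u (illegal)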
8130 static const uint8_t fromBase64ForNoSIMD[256] = { 8131 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8132 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8133 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 8134 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8135 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 8136 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 8137 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 8138 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 8139 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8140 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8141 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8142 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8143 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8144 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8145 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8146 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8147 }; 8148 8149 static const uint8_t fromBase64URLForNoSIMD[256] = { 8150 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8151 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8152 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 8153 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8154 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 8155 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 8156 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 8157 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 8158 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8159 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8160 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8161 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8162 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8163 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8164 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8165 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8166 }; 8167 8168 // A legal value of base64 code is in range [0, 127]. We need two lookups 8169 // with tbl/tbx and combine them to get the decode data. The 1st table vector 8170 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 8171 // table vector lookup use tbx, out of range indices are unchanged in 8172 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 8173 // The value of index 64 is set to 0, so that we know that we already get the 8174 // decoded data with the 1st lookup. 8175 static const uint8_t fromBase64ForSIMD[128] = { 8176 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8177 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8178 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 8179 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8180 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 8181 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 8182 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 8183 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 8184 }; 8185 8186 static const uint8_t fromBase64URLForSIMD[128] = { 8187 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8188 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8189 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 8190 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8191 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 8192 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 8193 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 8194 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 8195 }; 8196 8197 __ align(CodeEntryAlignment); 8198 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 8199 StubCodeMark mark(this, stub_id); 8200 address start = __ pc(); 8201 8202 Register src = c_rarg0; // source array 8203 Register soff = c_rarg1; // source start offset 8204 Register send = c_rarg2; // source end offset 8205 Register dst = c_rarg3; // dest array 8206 Register doff = c_rarg4; // position for writing to dest array 8207 Register isURL = c_rarg5; // Base64 or URL character set 8208 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 8209 8210 Register length = send; // reuse send as length of source data to process 8211 8212 Register simd_codec = c_rarg6; 8213 Register nosimd_codec = c_rarg7; 8214 8215 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 8216 8217 __ enter(); 8218 8219 __ add(src, src, soff); 8220 __ add(dst, dst, doff); 8221 8222 __ mov(doff, dst); 8223 8224 __ sub(length, send, soff); 8225 __ bfm(length, zr, 0, 1); 8226 8227 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 8228 __ cbz(isURL, ProcessData); 8229 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 8230 8231 __ BIND(ProcessData); 8232 __ mov(rscratch1, length); 8233 __ cmp(length, (u1)144); // 144 = 80 + 64 8234 __ br(Assembler::LT, Process4B); 8235 8236 // In the MIME case, the line length cannot be more than 76 8237 // bytes (see RFC 2045). This is too short a block for SIMD 8238 // to be worthwhile, so we use non-SIMD here. 
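// Note: rscratch1 is primed with 79 rather than 80 so that the 80-byte
// scalar pre-processing pass (20 trips through Process4B, subtracting 4 each
// time) finishes with rscratch1 == -1 instead of 0. The check after the
// Process4B loop relies on that to distinguish this pre-processing pass from
// the plain non-SIMD path, which always finishes with rscratch1 == 0.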
8239 __ movw(rscratch1, 79); 8240 8241 __ BIND(Process4B); 8242 __ ldrw(r14, __ post(src, 4)); 8243 __ ubfxw(r10, r14, 0, 8); 8244 __ ubfxw(r11, r14, 8, 8); 8245 __ ubfxw(r12, r14, 16, 8); 8246 __ ubfxw(r13, r14, 24, 8); 8247 // get the de-code 8248 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 8249 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 8250 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 8251 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 8252 // error detection, 255u indicates an illegal input 8253 __ orrw(r14, r10, r11); 8254 __ orrw(r15, r12, r13); 8255 __ orrw(r14, r14, r15); 8256 __ tbnz(r14, 7, Exit); 8257 // recover the data 8258 __ lslw(r14, r10, 10); 8259 __ bfiw(r14, r11, 4, 6); 8260 __ bfmw(r14, r12, 2, 5); 8261 __ rev16w(r14, r14); 8262 __ bfiw(r13, r12, 6, 2); 8263 __ strh(r14, __ post(dst, 2)); 8264 __ strb(r13, __ post(dst, 1)); 8265 // non-simd loop 8266 __ subsw(rscratch1, rscratch1, 4); 8267 __ br(Assembler::GT, Process4B); 8268 8269 // if exiting from PreProcess80B, rscratch1 == -1; 8270 // otherwise, rscratch1 == 0. 8271 __ cbzw(rscratch1, Exit); 8272 __ sub(length, length, 80); 8273 8274 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 8275 __ cbz(isURL, SIMDEnter); 8276 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 8277 8278 __ BIND(SIMDEnter); 8279 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 8280 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 8281 __ mov(rscratch1, 63); 8282 __ dup(v27, __ T16B, rscratch1); 8283 8284 __ BIND(Process64B); 8285 __ cmp(length, (u1)64); 8286 __ br(Assembler::LT, Process32B); 8287 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 8288 __ sub(length, length, 64); 8289 __ b(Process64B); 8290 8291 __ BIND(Process32B); 8292 __ cmp(length, (u1)32); 8293 __ br(Assembler::LT, SIMDExit); 8294 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 8295 __ sub(length, length, 32); 8296 __ b(Process32B); 8297 8298 __ BIND(SIMDExit); 8299 __ cbz(length, Exit); 8300 __ movw(rscratch1, length); 8301 __ b(Process4B); 8302 8303 __ BIND(Exit); 8304 __ sub(c_rarg0, dst, doff); 8305 8306 __ leave(); 8307 __ ret(lr); 8308 8309 return start; 8310 } 8311 8312 // Support for spin waits. 8313 address generate_spin_wait() { 8314 __ align(CodeEntryAlignment); 8315 StubGenStubId stub_id = StubGenStubId::spin_wait_id; 8316 StubCodeMark mark(this, stub_id); 8317 address start = __ pc(); 8318 8319 __ spin_wait(); 8320 __ ret(lr); 8321 8322 return start; 8323 } 8324 8325 void generate_lookup_secondary_supers_table_stub() { 8326 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 8327 StubCodeMark mark(this, stub_id); 8328 8329 const Register 8330 r_super_klass = r0, 8331 r_array_base = r1, 8332 r_array_length = r2, 8333 r_array_index = r3, 8334 r_sub_klass = r4, 8335 r_bitmap = rscratch2, 8336 result = r5; 8337 const FloatRegister 8338 vtemp = v0; 8339 8340 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 8341 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 8342 Label L_success; 8343 __ enter(); 8344 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 8345 r_array_base, r_array_length, r_array_index, 8346 vtemp, result, slot, 8347 /*stub_is_near*/true); 8348 __ leave(); 8349 __ ret(lr); 8350 } 8351 } 8352 8353 // Slow path implementation for UseSecondarySupersTable. 
8354 address generate_lookup_secondary_supers_table_slow_path_stub() { 8355 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 8356 StubCodeMark mark(this, stub_id); 8357 8358 address start = __ pc(); 8359 const Register 8360 r_super_klass = r0, // argument 8361 r_array_base = r1, // argument 8362 temp1 = r2, // temp 8363 r_array_index = r3, // argument 8364 r_bitmap = rscratch2, // argument 8365 result = r5; // argument 8366 8367 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 8368 __ ret(lr); 8369 8370 return start; 8371 } 8372 8373 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 8374 8375 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 8376 // 8377 // If LSE is in use, generate LSE versions of all the stubs. The 8378 // non-LSE versions are in atomic_aarch64.S. 8379 8380 // class AtomicStubMark records the entry point of a stub and the 8381 // stub pointer which will point to it. The stub pointer is set to 8382 // the entry point when ~AtomicStubMark() is called, which must be 8383 // after ICache::invalidate_range. This ensures safe publication of 8384 // the generated code. 8385 class AtomicStubMark { 8386 address _entry_point; 8387 aarch64_atomic_stub_t *_stub; 8388 MacroAssembler *_masm; 8389 public: 8390 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 8391 _masm = masm; 8392 __ align(32); 8393 _entry_point = __ pc(); 8394 _stub = stub; 8395 } 8396 ~AtomicStubMark() { 8397 *_stub = (aarch64_atomic_stub_t)_entry_point; 8398 } 8399 }; 8400 8401 // NB: For memory_order_conservative we need a trailing membar after 8402 // LSE atomic operations but not a leading membar. 8403 // 8404 // We don't need a leading membar because a clause in the Arm ARM 8405 // says: 8406 // 8407 // Barrier-ordered-before 8408 // 8409 // Barrier instructions order prior Memory effects before subsequent 8410 // Memory effects generated by the same Observer. A read or a write 8411 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 8412 // Observer if and only if RW1 appears in program order before RW 2 8413 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 8414 // instruction with both Acquire and Release semantics. 8415 // 8416 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 8417 // and Release semantics, therefore we don't need a leading 8418 // barrier. However, there is no corresponding Barrier-ordered-after 8419 // relationship, therefore we need a trailing membar to prevent a 8420 // later store or load from being reordered with the store in an 8421 // atomic instruction. 8422 // 8423 // This was checked by using the herd7 consistency model simulator 8424 // (http://diy.inria.fr/) with this test case: 8425 // 8426 // AArch64 LseCas 8427 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 8428 // P0 | P1; 8429 // LDR W4, [X2] | MOV W3, #0; 8430 // DMB LD | MOV W4, #1; 8431 // LDR W3, [X1] | CASAL W3, W4, [X1]; 8432 // | DMB ISH; 8433 // | STR W4, [X2]; 8434 // exists 8435 // (0:X3=0 /\ 0:X4=1) 8436 // 8437 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 8438 // with the store to x in P1. Without the DMB in P1 this may happen. 8439 // 8440 // At the time of writing we don't know of any AArch64 hardware that 8441 // reorders stores in this way, but the Reference Manual permits it. 
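// As a concrete illustration (a sketch, not a verbatim disassembly), the
// memory_order_conservative CAS stub generated below amounts to roughly:
//
//   mov   x3, x1          ; prev = compare_val
//   casal x3, x2, [x0]    ; LSE compare-and-swap with Acquire and Release
//   dmb   ish             ; trailing barrier, as explained above
//   mov   x0, x3          ; return the previous value
//   ret
//
// with w registers (and the byte/halfword CAS forms) for the narrower
// variants, and with plain cas/casl and no trailing dmb for the relaxed and
// release orderings.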
8442 8443 void gen_cas_entry(Assembler::operand_size size, 8444 atomic_memory_order order) { 8445 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 8446 exchange_val = c_rarg2; 8447 bool acquire, release; 8448 switch (order) { 8449 case memory_order_relaxed: 8450 acquire = false; 8451 release = false; 8452 break; 8453 case memory_order_release: 8454 acquire = false; 8455 release = true; 8456 break; 8457 default: 8458 acquire = true; 8459 release = true; 8460 break; 8461 } 8462 __ mov(prev, compare_val); 8463 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 8464 if (order == memory_order_conservative) { 8465 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 8466 } 8467 if (size == Assembler::xword) { 8468 __ mov(r0, prev); 8469 } else { 8470 __ movw(r0, prev); 8471 } 8472 __ ret(lr); 8473 } 8474 8475 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 8476 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 8477 // If not relaxed, then default to conservative. Relaxed is the only 8478 // case we use enough to be worth specializing. 8479 if (order == memory_order_relaxed) { 8480 __ ldadd(size, incr, prev, addr); 8481 } else { 8482 __ ldaddal(size, incr, prev, addr); 8483 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 8484 } 8485 if (size == Assembler::xword) { 8486 __ mov(r0, prev); 8487 } else { 8488 __ movw(r0, prev); 8489 } 8490 __ ret(lr); 8491 } 8492 8493 void gen_swpal_entry(Assembler::operand_size size) { 8494 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 8495 __ swpal(size, incr, prev, addr); 8496 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 8497 if (size == Assembler::xword) { 8498 __ mov(r0, prev); 8499 } else { 8500 __ movw(r0, prev); 8501 } 8502 __ ret(lr); 8503 } 8504 8505 void generate_atomic_entry_points() { 8506 if (! 
UseLSE) { 8507 return; 8508 } 8509 __ align(CodeEntryAlignment); 8510 StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id; 8511 StubCodeMark mark(this, stub_id); 8512 address first_entry = __ pc(); 8513 8514 // ADD, memory_order_conservative 8515 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 8516 gen_ldadd_entry(Assembler::word, memory_order_conservative); 8517 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 8518 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 8519 8520 // ADD, memory_order_relaxed 8521 AtomicStubMark mark_fetch_add_4_relaxed 8522 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 8523 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 8524 AtomicStubMark mark_fetch_add_8_relaxed 8525 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 8526 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 8527 8528 // XCHG, memory_order_conservative 8529 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 8530 gen_swpal_entry(Assembler::word); 8531 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 8532 gen_swpal_entry(Assembler::xword); 8533 8534 // CAS, memory_order_conservative 8535 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 8536 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 8537 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 8538 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 8539 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 8540 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 8541 8542 // CAS, memory_order_relaxed 8543 AtomicStubMark mark_cmpxchg_1_relaxed 8544 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 8545 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 8546 AtomicStubMark mark_cmpxchg_4_relaxed 8547 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 8548 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 8549 AtomicStubMark mark_cmpxchg_8_relaxed 8550 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 8551 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 8552 8553 AtomicStubMark mark_cmpxchg_4_release 8554 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 8555 gen_cas_entry(MacroAssembler::word, memory_order_release); 8556 AtomicStubMark mark_cmpxchg_8_release 8557 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 8558 gen_cas_entry(MacroAssembler::xword, memory_order_release); 8559 8560 AtomicStubMark mark_cmpxchg_4_seq_cst 8561 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 8562 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 8563 AtomicStubMark mark_cmpxchg_8_seq_cst 8564 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 8565 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 8566 8567 ICache::invalidate_range(first_entry, __ pc() - first_entry); 8568 } 8569 #endif // LINUX 8570 8571 address generate_cont_thaw(Continuation::thaw_kind kind) { 8572 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 8573 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 8574 8575 address start = __ pc(); 8576 8577 if (return_barrier) { 8578 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 8579 __ mov(sp, rscratch1); 8580 } 8581 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 8582 8583 if (return_barrier) { 8584 // preserve 
possible return value from a method returning to the return barrier 8585 __ fmovd(rscratch1, v0); 8586 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 8587 } 8588 8589 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 8590 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 8591 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 8592 8593 if (return_barrier) { 8594 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 8595 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 8596 __ fmovd(v0, rscratch1); 8597 } 8598 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 8599 8600 8601 Label thaw_success; 8602 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 8603 __ cbnz(rscratch2, thaw_success); 8604 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 8605 __ br(rscratch1); 8606 __ bind(thaw_success); 8607 8608 // make room for the thawed frames 8609 __ sub(rscratch1, sp, rscratch2); 8610 __ andr(rscratch1, rscratch1, -16); // align 8611 __ mov(sp, rscratch1); 8612 8613 if (return_barrier) { 8614 // save original return value -- again 8615 __ fmovd(rscratch1, v0); 8616 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 8617 } 8618 8619 // If we want, we can templatize thaw by kind, and have three different entries 8620 __ movw(c_rarg1, (uint32_t)kind); 8621 8622 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 8623 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 8624 8625 if (return_barrier) { 8626 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 8627 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 8628 __ fmovd(v0, rscratch1); 8629 } else { 8630 __ mov(r0, zr); // return 0 (success) from doYield 8631 } 8632 8633 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 8634 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 8635 __ mov(rfp, sp); 8636 8637 if (return_barrier_exception) { 8638 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 8639 __ authenticate_return_address(c_rarg1); 8640 __ verify_oop(r0); 8641 // save return value containing the exception oop in callee-saved R19 8642 __ mov(r19, r0); 8643 8644 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 8645 8646 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
8647 // __ reinitialize_ptrue(); 8648 8649 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 8650 8651 __ mov(r1, r0); // the exception handler 8652 __ mov(r0, r19); // restore return value containing the exception oop 8653 __ verify_oop(r0); 8654 8655 __ leave(); 8656 __ mov(r3, lr); 8657 __ br(r1); // the exception handler 8658 } else { 8659 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 8660 __ leave(); 8661 __ ret(lr); 8662 } 8663 8664 return start; 8665 } 8666 8667 address generate_cont_thaw() { 8668 if (!Continuations::enabled()) return nullptr; 8669 8670 StubGenStubId stub_id = StubGenStubId::cont_thaw_id; 8671 StubCodeMark mark(this, stub_id); 8672 address start = __ pc(); 8673 generate_cont_thaw(Continuation::thaw_top); 8674 return start; 8675 } 8676 8677 address generate_cont_returnBarrier() { 8678 if (!Continuations::enabled()) return nullptr; 8679 8680 // TODO: will probably need multiple return barriers depending on return type 8681 StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id; 8682 StubCodeMark mark(this, stub_id); 8683 address start = __ pc(); 8684 8685 generate_cont_thaw(Continuation::thaw_return_barrier); 8686 8687 return start; 8688 } 8689 8690 address generate_cont_returnBarrier_exception() { 8691 if (!Continuations::enabled()) return nullptr; 8692 8693 StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id; 8694 StubCodeMark mark(this, stub_id); 8695 address start = __ pc(); 8696 8697 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 8698 8699 return start; 8700 } 8701 8702 address generate_cont_preempt_stub() { 8703 if (!Continuations::enabled()) return nullptr; 8704 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 8705 StubCodeMark mark(this, stub_id); 8706 address start = __ pc(); 8707 8708 __ reset_last_Java_frame(true); 8709 8710 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 8711 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 8712 __ mov(sp, rscratch2); 8713 8714 Label preemption_cancelled; 8715 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 8716 __ cbnz(rscratch1, preemption_cancelled); 8717 8718 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 8719 SharedRuntime::continuation_enter_cleanup(_masm); 8720 __ leave(); 8721 __ ret(lr); 8722 8723 // We acquired the monitor after freezing the frames so call thaw to continue execution. 8724 __ bind(preemption_cancelled); 8725 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 8726 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 8727 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 8728 __ ldr(rscratch1, Address(rscratch1)); 8729 __ br(rscratch1); 8730 8731 return start; 8732 } 8733 8734 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 8735 // are represented as long[5], with BITS_PER_LIMB = 26. 8736 // Pack five 26-bit limbs into three 64-bit registers. 
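// In C, approximately:
//
//   dest0 =  l[0] | (l[1] << 26) | (l[2] << 52);          // low 64 bits
//   dest1 = (l[2] >> 12) | (l[3] << 14) | (l[4] << 40);   // next 64 bits
//   dest2 =  l[4] >> 24;                                  // top few bits
//
// where l[0..4] are the five 26-bit limbs read from src.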
8737 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 8738 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 8739 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 8740 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 8741 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 8742 8743 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 8744 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 8745 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 8746 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 8747 8748 if (dest2->is_valid()) { 8749 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 8750 } else { 8751 #ifdef ASSERT 8752 Label OK; 8753 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 8754 __ br(__ EQ, OK); 8755 __ stop("high bits of Poly1305 integer should be zero"); 8756 __ should_not_reach_here(); 8757 __ bind(OK); 8758 #endif 8759 } 8760 } 8761 8762 // As above, but return only a 128-bit integer, packed into two 8763 // 64-bit registers. 8764 void pack_26(Register dest0, Register dest1, Register src) { 8765 pack_26(dest0, dest1, noreg, src); 8766 } 8767 8768 // Multiply and multiply-accumulate unsigned 64-bit registers. 8769 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 8770 __ mul(prod_lo, n, m); 8771 __ umulh(prod_hi, n, m); 8772 } 8773 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 8774 wide_mul(rscratch1, rscratch2, n, m); 8775 __ adds(sum_lo, sum_lo, rscratch1); 8776 __ adc(sum_hi, sum_hi, rscratch2); 8777 } 8778 8779 // Poly1305, RFC 7539 8780 8781 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 8782 // description of the tricks used to simplify and accelerate this 8783 // computation. 8784 8785 address generate_poly1305_processBlocks() { 8786 __ align(CodeEntryAlignment); 8787 StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id; 8788 StubCodeMark mark(this, stub_id); 8789 address start = __ pc(); 8790 Label here; 8791 __ enter(); 8792 RegSet callee_saved = RegSet::range(r19, r28); 8793 __ push(callee_saved, sp); 8794 8795 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 8796 8797 // Arguments 8798 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 8799 8800 // R_n is the 128-bit randomly-generated key, packed into two 8801 // registers. The caller passes this key to us as long[5], with 8802 // BITS_PER_LIMB = 26. 
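// For reference, each 16-byte block updates the accumulator roughly as
//
//   U = ((U + block + 2^128) * R) mod (2^130 - 5)
//
// (the "+ 2^128" is the S_2 increment in the loop below). The RR_n values,
// RR_n = (R_n >> 2) * 5, work because 2^130 == 5 (mod 2^130 - 5), so the
// high-order partial products can be folded back in by a 2-bit shift and a
// multiply by 5. This relies on the standard Poly1305 clamping of the key
// (top bits of R_0 and R_1 zero, low two bits of R_1 zero), which this stub
// assumes has already been applied.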
8803 const Register R_0 = *++regs, R_1 = *++regs; 8804 pack_26(R_0, R_1, r_start); 8805 8806 // RR_n is (R_n >> 2) * 5 8807 const Register RR_0 = *++regs, RR_1 = *++regs; 8808 __ lsr(RR_0, R_0, 2); 8809 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 8810 __ lsr(RR_1, R_1, 2); 8811 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 8812 8813 // U_n is the current checksum 8814 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 8815 pack_26(U_0, U_1, U_2, acc_start); 8816 8817 static constexpr int BLOCK_LENGTH = 16; 8818 Label DONE, LOOP; 8819 8820 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 8821 __ br(Assembler::LT, DONE); { 8822 __ bind(LOOP); 8823 8824 // S_n is to be the sum of U_n and the next block of data 8825 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 8826 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 8827 __ adds(S_0, U_0, S_0); 8828 __ adcs(S_1, U_1, S_1); 8829 __ adc(S_2, U_2, zr); 8830 __ add(S_2, S_2, 1); 8831 8832 const Register U_0HI = *++regs, U_1HI = *++regs; 8833 8834 // NB: this logic depends on some of the special properties of 8835 // Poly1305 keys. In particular, because we know that the top 8836 // four bits of R_0 and R_1 are zero, we can add together 8837 // partial products without any risk of needing to propagate a 8838 // carry out. 8839 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 8840 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 8841 __ andr(U_2, R_0, 3); 8842 __ mul(U_2, S_2, U_2); 8843 8844 // Recycle registers S_0, S_1, S_2 8845 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 8846 8847 // Partial reduction mod 2**130 - 5 8848 __ adds(U_1, U_0HI, U_1); 8849 __ adc(U_2, U_1HI, U_2); 8850 // Sum now in U_2:U_1:U_0. 8851 // Dead: U_0HI, U_1HI. 8852 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 8853 8854 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 8855 8856 // First, U_2:U_1:U_0 += (U_2 >> 2) 8857 __ lsr(rscratch1, U_2, 2); 8858 __ andr(U_2, U_2, (u8)3); 8859 __ adds(U_0, U_0, rscratch1); 8860 __ adcs(U_1, U_1, zr); 8861 __ adc(U_2, U_2, zr); 8862 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 8863 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 8864 __ adcs(U_1, U_1, zr); 8865 __ adc(U_2, U_2, zr); 8866 8867 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 8868 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 8869 __ br(~ Assembler::LT, LOOP); 8870 } 8871 8872 // Further reduce modulo 2^130 - 5 8873 __ lsr(rscratch1, U_2, 2); 8874 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 8875 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 8876 __ adcs(U_1, U_1, zr); 8877 __ andr(U_2, U_2, (u1)3); 8878 __ adc(U_2, U_2, zr); 8879 8880 // Unpack the sum into five 26-bit limbs and write to memory. 
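// In C, approximately (the inverse of pack_26 above):
//
//   acc[0] =   U_0        & ((1 << 26) - 1);
//   acc[1] =  (U_0 >> 26) & ((1 << 26) - 1);
//   acc[2] = ((U_0 >> 52) | (U_1 << 12)) & ((1 << 26) - 1);
//   acc[3] =  (U_1 >> 14) & ((1 << 26) - 1);
//   acc[4] =  (U_1 >> 40) | (U_2 << 24);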
8881 __ ubfiz(rscratch1, U_0, 0, 26); 8882 __ ubfx(rscratch2, U_0, 26, 26); 8883 __ stp(rscratch1, rscratch2, Address(acc_start)); 8884 __ ubfx(rscratch1, U_0, 52, 12); 8885 __ bfi(rscratch1, U_1, 12, 14); 8886 __ ubfx(rscratch2, U_1, 14, 26); 8887 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 8888 __ ubfx(rscratch1, U_1, 40, 24); 8889 __ bfi(rscratch1, U_2, 24, 3); 8890 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 8891 8892 __ bind(DONE); 8893 __ pop(callee_saved, sp); 8894 __ leave(); 8895 __ ret(lr); 8896 8897 return start; 8898 } 8899 8900 // exception handler for upcall stubs 8901 address generate_upcall_stub_exception_handler() { 8902 StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id; 8903 StubCodeMark mark(this, stub_id); 8904 address start = __ pc(); 8905 8906 // Native caller has no idea how to handle exceptions, 8907 // so we just crash here. Up to callee to catch exceptions. 8908 __ verify_oop(r0); 8909 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 8910 __ blr(rscratch1); 8911 __ should_not_reach_here(); 8912 8913 return start; 8914 } 8915 8916 // load Method* target of MethodHandle 8917 // j_rarg0 = jobject receiver 8918 // rmethod = result 8919 address generate_upcall_stub_load_target() { 8920 StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id; 8921 StubCodeMark mark(this, stub_id); 8922 address start = __ pc(); 8923 8924 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 8925 // Load target method from receiver 8926 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 8927 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 8928 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 8929 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 8930 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 8931 noreg, noreg); 8932 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 8933 8934 __ ret(lr); 8935 8936 return start; 8937 } 8938 8939 #undef __ 8940 #define __ masm-> 8941 8942 class MontgomeryMultiplyGenerator : public MacroAssembler { 8943 8944 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 8945 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 8946 8947 RegSet _toSave; 8948 bool _squaring; 8949 8950 public: 8951 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 8952 : MacroAssembler(as->code()), _squaring(squaring) { 8953 8954 // Register allocation 8955 8956 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 8957 Pa_base = *regs; // Argument registers 8958 if (squaring) 8959 Pb_base = Pa_base; 8960 else 8961 Pb_base = *++regs; 8962 Pn_base = *++regs; 8963 Rlen= *++regs; 8964 inv = *++regs; 8965 Pm_base = *++regs; 8966 8967 // Working registers: 8968 Ra = *++regs; // The current digit of a, b, n, and m. 8969 Rb = *++regs; 8970 Rm = *++regs; 8971 Rn = *++regs; 8972 8973 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 8974 Pb = *++regs; 8975 Pm = *++regs; 8976 Pn = *++regs; 8977 8978 t0 = *++regs; // Three registers which form a 8979 t1 = *++regs; // triple-precision accumuator. 8980 t2 = *++regs; 8981 8982 Ri = *++regs; // Inner and outer loop indexes. 
8983 Rj = *++regs; 8984 8985 Rhi_ab = *++regs; // Product registers: low and high parts 8986 Rlo_ab = *++regs; // of a*b and m*n. 8987 Rhi_mn = *++regs; 8988 Rlo_mn = *++regs; 8989 8990 // r19 and up are callee-saved. 8991 _toSave = RegSet::range(r19, *regs) + Pm_base; 8992 } 8993 8994 private: 8995 void save_regs() { 8996 push(_toSave, sp); 8997 } 8998 8999 void restore_regs() { 9000 pop(_toSave, sp); 9001 } 9002 9003 template <typename T> 9004 void unroll_2(Register count, T block) { 9005 Label loop, end, odd; 9006 tbnz(count, 0, odd); 9007 cbz(count, end); 9008 align(16); 9009 bind(loop); 9010 (this->*block)(); 9011 bind(odd); 9012 (this->*block)(); 9013 subs(count, count, 2); 9014 br(Assembler::GT, loop); 9015 bind(end); 9016 } 9017 9018 template <typename T> 9019 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 9020 Label loop, end, odd; 9021 tbnz(count, 0, odd); 9022 cbz(count, end); 9023 align(16); 9024 bind(loop); 9025 (this->*block)(d, s, tmp); 9026 bind(odd); 9027 (this->*block)(d, s, tmp); 9028 subs(count, count, 2); 9029 br(Assembler::GT, loop); 9030 bind(end); 9031 } 9032 9033 void pre1(RegisterOrConstant i) { 9034 block_comment("pre1"); 9035 // Pa = Pa_base; 9036 // Pb = Pb_base + i; 9037 // Pm = Pm_base; 9038 // Pn = Pn_base + i; 9039 // Ra = *Pa; 9040 // Rb = *Pb; 9041 // Rm = *Pm; 9042 // Rn = *Pn; 9043 ldr(Ra, Address(Pa_base)); 9044 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 9045 ldr(Rm, Address(Pm_base)); 9046 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9047 lea(Pa, Address(Pa_base)); 9048 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 9049 lea(Pm, Address(Pm_base)); 9050 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9051 9052 // Zero the m*n result. 9053 mov(Rhi_mn, zr); 9054 mov(Rlo_mn, zr); 9055 } 9056 9057 // The core multiply-accumulate step of a Montgomery 9058 // multiplication. The idea is to schedule operations as a 9059 // pipeline so that instructions with long latencies (loads and 9060 // multiplies) have time to complete before their results are 9061 // used. This most benefits in-order implementations of the 9062 // architecture but out-of-order ones also benefit. 9063 void step() { 9064 block_comment("step"); 9065 // MACC(Ra, Rb, t0, t1, t2); 9066 // Ra = *++Pa; 9067 // Rb = *--Pb; 9068 umulh(Rhi_ab, Ra, Rb); 9069 mul(Rlo_ab, Ra, Rb); 9070 ldr(Ra, pre(Pa, wordSize)); 9071 ldr(Rb, pre(Pb, -wordSize)); 9072 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 9073 // previous iteration. 
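// At this point the a*b product for this step is still pending in
// Rhi_ab:Rlo_ab (it is accumulated at the end of this step), and the m*n
// product computed just below is left pending for the next step (or for
// post1). That is the pipelining described above: the loads and multiplies
// are issued well before their results are consumed.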
9074 // MACC(Rm, Rn, t0, t1, t2); 9075 // Rm = *++Pm; 9076 // Rn = *--Pn; 9077 umulh(Rhi_mn, Rm, Rn); 9078 mul(Rlo_mn, Rm, Rn); 9079 ldr(Rm, pre(Pm, wordSize)); 9080 ldr(Rn, pre(Pn, -wordSize)); 9081 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9082 } 9083 9084 void post1() { 9085 block_comment("post1"); 9086 9087 // MACC(Ra, Rb, t0, t1, t2); 9088 // Ra = *++Pa; 9089 // Rb = *--Pb; 9090 umulh(Rhi_ab, Ra, Rb); 9091 mul(Rlo_ab, Ra, Rb); 9092 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 9093 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9094 9095 // *Pm = Rm = t0 * inv; 9096 mul(Rm, t0, inv); 9097 str(Rm, Address(Pm)); 9098 9099 // MACC(Rm, Rn, t0, t1, t2); 9100 // t0 = t1; t1 = t2; t2 = 0; 9101 umulh(Rhi_mn, Rm, Rn); 9102 9103 #ifndef PRODUCT 9104 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 9105 { 9106 mul(Rlo_mn, Rm, Rn); 9107 add(Rlo_mn, t0, Rlo_mn); 9108 Label ok; 9109 cbz(Rlo_mn, ok); { 9110 stop("broken Montgomery multiply"); 9111 } bind(ok); 9112 } 9113 #endif 9114 // We have very carefully set things up so that 9115 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 9116 // the lower half of Rm * Rn because we know the result already: 9117 // it must be -t0. t0 + (-t0) must generate a carry iff 9118 // t0 != 0. So, rather than do a mul and an adds we just set 9119 // the carry flag iff t0 is nonzero. 9120 // 9121 // mul(Rlo_mn, Rm, Rn); 9122 // adds(zr, t0, Rlo_mn); 9123 subs(zr, t0, 1); // Set carry iff t0 is nonzero 9124 adcs(t0, t1, Rhi_mn); 9125 adc(t1, t2, zr); 9126 mov(t2, zr); 9127 } 9128 9129 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 9130 block_comment("pre2"); 9131 // Pa = Pa_base + i-len; 9132 // Pb = Pb_base + len; 9133 // Pm = Pm_base + i-len; 9134 // Pn = Pn_base + len; 9135 9136 if (i.is_register()) { 9137 sub(Rj, i.as_register(), len); 9138 } else { 9139 mov(Rj, i.as_constant()); 9140 sub(Rj, Rj, len); 9141 } 9142 // Rj == i-len 9143 9144 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 9145 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 9146 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 9147 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 9148 9149 // Ra = *++Pa; 9150 // Rb = *--Pb; 9151 // Rm = *++Pm; 9152 // Rn = *--Pn; 9153 ldr(Ra, pre(Pa, wordSize)); 9154 ldr(Rb, pre(Pb, -wordSize)); 9155 ldr(Rm, pre(Pm, wordSize)); 9156 ldr(Rn, pre(Pn, -wordSize)); 9157 9158 mov(Rhi_mn, zr); 9159 mov(Rlo_mn, zr); 9160 } 9161 9162 void post2(RegisterOrConstant i, RegisterOrConstant len) { 9163 block_comment("post2"); 9164 if (i.is_constant()) { 9165 mov(Rj, i.as_constant()-len.as_constant()); 9166 } else { 9167 sub(Rj, i.as_register(), len); 9168 } 9169 9170 adds(t0, t0, Rlo_mn); // The pending m*n, low part 9171 9172 // As soon as we know the least significant digit of our result, 9173 // store it. 9174 // Pm_base[i-len] = t0; 9175 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 9176 9177 // t0 = t1; t1 = t2; t2 = 0; 9178 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 9179 adc(t1, t2, zr); 9180 mov(t2, zr); 9181 } 9182 9183 // A carry in t0 after Montgomery multiplication means that we 9184 // should subtract multiples of n from our result in m. We'll 9185 // keep doing that until there is no carry. 
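// For reference, a C sketch of the conditional subtraction that normalize()
// repeats until the carry in t0 is gone (this mirrors the loop below; it is
// not called code):
//
//   julong sub(julong Pm[], julong Pn[], julong t0, int len) {
//     julong borrow = 0;
//     for (int i = 0; i < len; i++) {
//       julong a = Pm[i], b = Pn[i];
//       julong d = a - b - borrow;
//       borrow = (a < b) || (a == b && borrow);   // borrow out of this digit
//       Pm[i] = d;
//     }
//     return t0 - borrow;
//   }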
9186 void normalize(RegisterOrConstant len) { 9187 block_comment("normalize"); 9188 // while (t0) 9189 // t0 = sub(Pm_base, Pn_base, t0, len); 9190 Label loop, post, again; 9191 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 9192 cbz(t0, post); { 9193 bind(again); { 9194 mov(i, zr); 9195 mov(cnt, len); 9196 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 9197 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9198 subs(zr, zr, zr); // set carry flag, i.e. no borrow 9199 align(16); 9200 bind(loop); { 9201 sbcs(Rm, Rm, Rn); 9202 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 9203 add(i, i, 1); 9204 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 9205 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9206 sub(cnt, cnt, 1); 9207 } cbnz(cnt, loop); 9208 sbc(t0, t0, zr); 9209 } cbnz(t0, again); 9210 } bind(post); 9211 } 9212 9213 // Move memory at s to d, reversing words. 9214 // Increments d to end of copied memory 9215 // Destroys tmp1, tmp2 9216 // Preserves len 9217 // Leaves s pointing to the address which was in d at start 9218 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 9219 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 9220 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 9221 9222 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 9223 mov(tmp1, len); 9224 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 9225 sub(s, d, len, ext::uxtw, LogBytesPerWord); 9226 } 9227 // where 9228 void reverse1(Register d, Register s, Register tmp) { 9229 ldr(tmp, pre(s, -wordSize)); 9230 ror(tmp, tmp, 32); 9231 str(tmp, post(d, wordSize)); 9232 } 9233 9234 void step_squaring() { 9235 // An extra ACC 9236 step(); 9237 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9238 } 9239 9240 void last_squaring(RegisterOrConstant i) { 9241 Label dont; 9242 // if ((i & 1) == 0) { 9243 tbnz(i.as_register(), 0, dont); { 9244 // MACC(Ra, Rb, t0, t1, t2); 9245 // Ra = *++Pa; 9246 // Rb = *--Pb; 9247 umulh(Rhi_ab, Ra, Rb); 9248 mul(Rlo_ab, Ra, Rb); 9249 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9250 } bind(dont); 9251 } 9252 9253 void extra_step_squaring() { 9254 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 9255 9256 // MACC(Rm, Rn, t0, t1, t2); 9257 // Rm = *++Pm; 9258 // Rn = *--Pn; 9259 umulh(Rhi_mn, Rm, Rn); 9260 mul(Rlo_mn, Rm, Rn); 9261 ldr(Rm, pre(Pm, wordSize)); 9262 ldr(Rn, pre(Pn, -wordSize)); 9263 } 9264 9265 void post1_squaring() { 9266 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 9267 9268 // *Pm = Rm = t0 * inv; 9269 mul(Rm, t0, inv); 9270 str(Rm, Address(Pm)); 9271 9272 // MACC(Rm, Rn, t0, t1, t2); 9273 // t0 = t1; t1 = t2; t2 = 0; 9274 umulh(Rhi_mn, Rm, Rn); 9275 9276 #ifndef PRODUCT 9277 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 9278 { 9279 mul(Rlo_mn, Rm, Rn); 9280 add(Rlo_mn, t0, Rlo_mn); 9281 Label ok; 9282 cbz(Rlo_mn, ok); { 9283 stop("broken Montgomery multiply"); 9284 } bind(ok); 9285 } 9286 #endif 9287 // We have very carefully set things up so that 9288 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 9289 // the lower half of Rm * Rn because we know the result already: 9290 // it must be -t0. t0 + (-t0) must generate a carry iff 9291 // t0 != 0. So, rather than do a mul and an adds we just set 9292 // the carry flag iff t0 is nonzero. 
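// Concretely: subs(zr, t0, 1) computes t0 - 1 and leaves the carry flag set
// (no borrow) exactly when t0 != 0, which is the same condition under which
// the skipped adds(zr, t0, Rlo_mn), with Rlo_mn == -t0 mod 2^64, would have
// produced a carry out.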
9293 // 9294 // mul(Rlo_mn, Rm, Rn); 9295 // adds(zr, t0, Rlo_mn); 9296 subs(zr, t0, 1); // Set carry iff t0 is nonzero 9297 adcs(t0, t1, Rhi_mn); 9298 adc(t1, t2, zr); 9299 mov(t2, zr); 9300 } 9301 9302 void acc(Register Rhi, Register Rlo, 9303 Register t0, Register t1, Register t2) { 9304 adds(t0, t0, Rlo); 9305 adcs(t1, t1, Rhi); 9306 adc(t2, t2, zr); 9307 } 9308 9309 public: 9310 /** 9311 * Fast Montgomery multiplication. The derivation of the 9312 * algorithm is in A Cryptographic Library for the Motorola 9313 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 9314 * 9315 * Arguments: 9316 * 9317 * Inputs for multiplication: 9318 * c_rarg0 - int array elements a 9319 * c_rarg1 - int array elements b 9320 * c_rarg2 - int array elements n (the modulus) 9321 * c_rarg3 - int length 9322 * c_rarg4 - int inv 9323 * c_rarg5 - int array elements m (the result) 9324 * 9325 * Inputs for squaring: 9326 * c_rarg0 - int array elements a 9327 * c_rarg1 - int array elements n (the modulus) 9328 * c_rarg2 - int length 9329 * c_rarg3 - int inv 9330 * c_rarg4 - int array elements m (the result) 9331 * 9332 */ 9333 address generate_multiply() { 9334 Label argh, nothing; 9335 bind(argh); 9336 stop("MontgomeryMultiply total_allocation must be <= 8192"); 9337 9338 align(CodeEntryAlignment); 9339 address entry = pc(); 9340 9341 cbzw(Rlen, nothing); 9342 9343 enter(); 9344 9345 // Make room. 9346 cmpw(Rlen, 512); 9347 br(Assembler::HI, argh); 9348 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 9349 andr(sp, Ra, -2 * wordSize); 9350 9351 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 9352 9353 { 9354 // Copy input args, reversing as we go. We use Ra as a 9355 // temporary variable. 9356 reverse(Ra, Pa_base, Rlen, t0, t1); 9357 if (!_squaring) 9358 reverse(Ra, Pb_base, Rlen, t0, t1); 9359 reverse(Ra, Pn_base, Rlen, t0, t1); 9360 } 9361 9362 // Push all call-saved registers and also Pm_base which we'll need 9363 // at the end. 
9364 save_regs(); 9365 9366 #ifndef PRODUCT 9367 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 9368 { 9369 ldr(Rn, Address(Pn_base, 0)); 9370 mul(Rlo_mn, Rn, inv); 9371 subs(zr, Rlo_mn, -1); 9372 Label ok; 9373 br(EQ, ok); { 9374 stop("broken inverse in Montgomery multiply"); 9375 } bind(ok); 9376 } 9377 #endif 9378 9379 mov(Pm_base, Ra); 9380 9381 mov(t0, zr); 9382 mov(t1, zr); 9383 mov(t2, zr); 9384 9385 block_comment("for (int i = 0; i < len; i++) {"); 9386 mov(Ri, zr); { 9387 Label loop, end; 9388 cmpw(Ri, Rlen); 9389 br(Assembler::GE, end); 9390 9391 bind(loop); 9392 pre1(Ri); 9393 9394 block_comment(" for (j = i; j; j--) {"); { 9395 movw(Rj, Ri); 9396 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 9397 } block_comment(" } // j"); 9398 9399 post1(); 9400 addw(Ri, Ri, 1); 9401 cmpw(Ri, Rlen); 9402 br(Assembler::LT, loop); 9403 bind(end); 9404 block_comment("} // i"); 9405 } 9406 9407 block_comment("for (int i = len; i < 2*len; i++) {"); 9408 mov(Ri, Rlen); { 9409 Label loop, end; 9410 cmpw(Ri, Rlen, Assembler::LSL, 1); 9411 br(Assembler::GE, end); 9412 9413 bind(loop); 9414 pre2(Ri, Rlen); 9415 9416 block_comment(" for (j = len*2-i-1; j; j--) {"); { 9417 lslw(Rj, Rlen, 1); 9418 subw(Rj, Rj, Ri); 9419 subw(Rj, Rj, 1); 9420 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 9421 } block_comment(" } // j"); 9422 9423 post2(Ri, Rlen); 9424 addw(Ri, Ri, 1); 9425 cmpw(Ri, Rlen, Assembler::LSL, 1); 9426 br(Assembler::LT, loop); 9427 bind(end); 9428 } 9429 block_comment("} // i"); 9430 9431 normalize(Rlen); 9432 9433 mov(Ra, Pm_base); // Save Pm_base in Ra 9434 restore_regs(); // Restore caller's Pm_base 9435 9436 // Copy our result into caller's Pm_base 9437 reverse(Pm_base, Ra, Rlen, t0, t1); 9438 9439 leave(); 9440 bind(nothing); 9441 ret(lr); 9442 9443 return entry; 9444 } 9445 // In C, approximately: 9446 9447 // void 9448 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 9449 // julong Pn_base[], julong Pm_base[], 9450 // julong inv, int len) { 9451 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 9452 // julong *Pa, *Pb, *Pn, *Pm; 9453 // julong Ra, Rb, Rn, Rm; 9454 9455 // int i; 9456 9457 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 9458 9459 // for (i = 0; i < len; i++) { 9460 // int j; 9461 9462 // Pa = Pa_base; 9463 // Pb = Pb_base + i; 9464 // Pm = Pm_base; 9465 // Pn = Pn_base + i; 9466 9467 // Ra = *Pa; 9468 // Rb = *Pb; 9469 // Rm = *Pm; 9470 // Rn = *Pn; 9471 9472 // int iters = i; 9473 // for (j = 0; iters--; j++) { 9474 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 9475 // MACC(Ra, Rb, t0, t1, t2); 9476 // Ra = *++Pa; 9477 // Rb = *--Pb; 9478 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9479 // MACC(Rm, Rn, t0, t1, t2); 9480 // Rm = *++Pm; 9481 // Rn = *--Pn; 9482 // } 9483 9484 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 9485 // MACC(Ra, Rb, t0, t1, t2); 9486 // *Pm = Rm = t0 * inv; 9487 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 9488 // MACC(Rm, Rn, t0, t1, t2); 9489 9490 // assert(t0 == 0, "broken Montgomery multiply"); 9491 9492 // t0 = t1; t1 = t2; t2 = 0; 9493 // } 9494 9495 // for (i = len; i < 2*len; i++) { 9496 // int j; 9497 9498 // Pa = Pa_base + i-len; 9499 // Pb = Pb_base + len; 9500 // Pm = Pm_base + i-len; 9501 // Pn = Pn_base + len; 9502 9503 // Ra = *++Pa; 9504 // Rb = *--Pb; 9505 // Rm = *++Pm; 9506 // Rn = *--Pn; 9507 9508 // int iters = len*2-i-1; 9509 // for (j = i-len+1; iters--; j++) { 9510 // 
assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 9511 // MACC(Ra, Rb, t0, t1, t2); 9512 // Ra = *++Pa; 9513 // Rb = *--Pb; 9514 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9515 // MACC(Rm, Rn, t0, t1, t2); 9516 // Rm = *++Pm; 9517 // Rn = *--Pn; 9518 // } 9519 9520 // Pm_base[i-len] = t0; 9521 // t0 = t1; t1 = t2; t2 = 0; 9522 // } 9523 9524 // while (t0) 9525 // t0 = sub(Pm_base, Pn_base, t0, len); 9526 // } 9527 9528 /** 9529 * Fast Montgomery squaring. This uses asymptotically 25% fewer 9530 * multiplies than Montgomery multiplication so it should be up to 9531 * 25% faster. However, its loop control is more complex and it 9532 * may actually run slower on some machines. 9533 * 9534 * Arguments: 9535 * 9536 * Inputs: 9537 * c_rarg0 - int array elements a 9538 * c_rarg1 - int array elements n (the modulus) 9539 * c_rarg2 - int length 9540 * c_rarg3 - int inv 9541 * c_rarg4 - int array elements m (the result) 9542 * 9543 */ 9544 address generate_square() { 9545 Label argh; 9546 bind(argh); 9547 stop("MontgomeryMultiply total_allocation must be <= 8192"); 9548 9549 align(CodeEntryAlignment); 9550 address entry = pc(); 9551 9552 enter(); 9553 9554 // Make room. 9555 cmpw(Rlen, 512); 9556 br(Assembler::HI, argh); 9557 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 9558 andr(sp, Ra, -2 * wordSize); 9559 9560 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 9561 9562 { 9563 // Copy input args, reversing as we go. We use Ra as a 9564 // temporary variable. 9565 reverse(Ra, Pa_base, Rlen, t0, t1); 9566 reverse(Ra, Pn_base, Rlen, t0, t1); 9567 } 9568 9569 // Push all call-saved registers and also Pm_base which we'll need 9570 // at the end. 9571 save_regs(); 9572 9573 mov(Pm_base, Ra); 9574 9575 mov(t0, zr); 9576 mov(t1, zr); 9577 mov(t2, zr); 9578 9579 block_comment("for (int i = 0; i < len; i++) {"); 9580 mov(Ri, zr); { 9581 Label loop, end; 9582 bind(loop); 9583 cmp(Ri, Rlen); 9584 br(Assembler::GE, end); 9585 9586 pre1(Ri); 9587 9588 block_comment("for (j = (i+1)/2; j; j--) {"); { 9589 add(Rj, Ri, 1); 9590 lsr(Rj, Rj, 1); 9591 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 9592 } block_comment(" } // j"); 9593 9594 last_squaring(Ri); 9595 9596 block_comment(" for (j = i/2; j; j--) {"); { 9597 lsr(Rj, Ri, 1); 9598 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 9599 } block_comment(" } // j"); 9600 9601 post1_squaring(); 9602 add(Ri, Ri, 1); 9603 cmp(Ri, Rlen); 9604 br(Assembler::LT, loop); 9605 9606 bind(end); 9607 block_comment("} // i"); 9608 } 9609 9610 block_comment("for (int i = len; i < 2*len; i++) {"); 9611 mov(Ri, Rlen); { 9612 Label loop, end; 9613 bind(loop); 9614 cmp(Ri, Rlen, Assembler::LSL, 1); 9615 br(Assembler::GE, end); 9616 9617 pre2(Ri, Rlen); 9618 9619 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 9620 lsl(Rj, Rlen, 1); 9621 sub(Rj, Rj, Ri); 9622 sub(Rj, Rj, 1); 9623 lsr(Rj, Rj, 1); 9624 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 9625 } block_comment(" } // j"); 9626 9627 last_squaring(Ri); 9628 9629 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 9630 lsl(Rj, Rlen, 1); 9631 sub(Rj, Rj, Ri); 9632 lsr(Rj, Rj, 1); 9633 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 9634 } block_comment(" } // j"); 9635 9636 post2(Ri, Rlen); 9637 add(Ri, Ri, 1); 9638 cmp(Ri, Rlen, Assembler::LSL, 1); 9639 9640 br(Assembler::LT, loop); 9641 bind(end); 9642 block_comment("} // i"); 9643 } 9644 9645 normalize(Rlen); 9646 9647 mov(Ra, Pm_base); // Save Pm_base in Ra 9648 
restore_regs(); // Restore caller's Pm_base 9649 9650 // Copy our result into caller's Pm_base 9651 reverse(Pm_base, Ra, Rlen, t0, t1); 9652 9653 leave(); 9654 ret(lr); 9655 9656 return entry; 9657 } 9658 // In C, approximately: 9659 9660 // void 9661 // montgomery_square(julong Pa_base[], julong Pn_base[], 9662 // julong Pm_base[], julong inv, int len) { 9663 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 9664 // julong *Pa, *Pb, *Pn, *Pm; 9665 // julong Ra, Rb, Rn, Rm; 9666 9667 // int i; 9668 9669 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 9670 9671 // for (i = 0; i < len; i++) { 9672 // int j; 9673 9674 // Pa = Pa_base; 9675 // Pb = Pa_base + i; 9676 // Pm = Pm_base; 9677 // Pn = Pn_base + i; 9678 9679 // Ra = *Pa; 9680 // Rb = *Pb; 9681 // Rm = *Pm; 9682 // Rn = *Pn; 9683 9684 // int iters = (i+1)/2; 9685 // for (j = 0; iters--; j++) { 9686 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 9687 // MACC2(Ra, Rb, t0, t1, t2); 9688 // Ra = *++Pa; 9689 // Rb = *--Pb; 9690 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9691 // MACC(Rm, Rn, t0, t1, t2); 9692 // Rm = *++Pm; 9693 // Rn = *--Pn; 9694 // } 9695 // if ((i & 1) == 0) { 9696 // assert(Ra == Pa_base[j], "must be"); 9697 // MACC(Ra, Ra, t0, t1, t2); 9698 // } 9699 // iters = i/2; 9700 // assert(iters == i-j, "must be"); 9701 // for (; iters--; j++) { 9702 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9703 // MACC(Rm, Rn, t0, t1, t2); 9704 // Rm = *++Pm; 9705 // Rn = *--Pn; 9706 // } 9707 9708 // *Pm = Rm = t0 * inv; 9709 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 9710 // MACC(Rm, Rn, t0, t1, t2); 9711 9712 // assert(t0 == 0, "broken Montgomery multiply"); 9713 9714 // t0 = t1; t1 = t2; t2 = 0; 9715 // } 9716 9717 // for (i = len; i < 2*len; i++) { 9718 // int start = i-len+1; 9719 // int end = start + (len - start)/2; 9720 // int j; 9721 9722 // Pa = Pa_base + i-len; 9723 // Pb = Pa_base + len; 9724 // Pm = Pm_base + i-len; 9725 // Pn = Pn_base + len; 9726 9727 // Ra = *++Pa; 9728 // Rb = *--Pb; 9729 // Rm = *++Pm; 9730 // Rn = *--Pn; 9731 9732 // int iters = (2*len-i-1)/2; 9733 // assert(iters == end-start, "must be"); 9734 // for (j = start; iters--; j++) { 9735 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 9736 // MACC2(Ra, Rb, t0, t1, t2); 9737 // Ra = *++Pa; 9738 // Rb = *--Pb; 9739 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9740 // MACC(Rm, Rn, t0, t1, t2); 9741 // Rm = *++Pm; 9742 // Rn = *--Pn; 9743 // } 9744 // if ((i & 1) == 0) { 9745 // assert(Ra == Pa_base[j], "must be"); 9746 // MACC(Ra, Ra, t0, t1, t2); 9747 // } 9748 // iters = (2*len-i)/2; 9749 // assert(iters == len-j, "must be"); 9750 // for (; iters--; j++) { 9751 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9752 // MACC(Rm, Rn, t0, t1, t2); 9753 // Rm = *++Pm; 9754 // Rn = *--Pn; 9755 // } 9756 // Pm_base[i-len] = t0; 9757 // t0 = t1; t1 = t2; t2 = 0; 9758 // } 9759 9760 // while (t0) 9761 // t0 = sub(Pm_base, Pn_base, t0, len); 9762 // } 9763 }; 9764 9765 void generate_vector_math_stubs() { 9766 // Get native vector math stub routine addresses 9767 void* libsleef = nullptr; 9768 char ebuf[1024]; 9769 char dll_name[JVM_MAXPATHLEN]; 9770 if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) { 9771 libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf); 9772 } 9773 if (libsleef == nullptr) { 9774 log_info(library)("Failed to load native vector math library, %s!", ebuf); 
9775 return; 9776 } 9777 // Method naming convention 9778 // All the methods are named as <OP><T><N>_<U><suffix> 9779 // Where: 9780 // <OP> is the operation name, e.g. sin 9781 // <T> is optional to indicate float/double 9782 // "f/d" for vector float/double operation 9783 // <N> is the number of elements in the vector 9784 // "2/4" for neon, and "x" for sve 9785 // <U> is the precision level 9786 // "u10/u05" represents 1.0/0.5 ULP error bounds 9787 // We use "u10" for all operations by default 9788 // But for those functions do not have u10 support, we use "u05" instead 9789 // <suffix> indicates neon/sve 9790 // "sve/advsimd" for sve/neon implementations 9791 // e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions 9792 // cosd2_u10advsimd is the method for computing 2 elements vector double cos using NEON instructions 9793 // 9794 log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef)); 9795 9796 // Math vector stubs implemented with SVE for scalable vector size. 9797 if (UseSVE > 0) { 9798 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 9799 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 9800 // Skip "tanh" because there is performance regression 9801 if (vop == VectorSupport::VECTOR_OP_TANH) { 9802 continue; 9803 } 9804 9805 // The native library does not support u10 level of "hypot". 9806 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 9807 9808 snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf); 9809 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 9810 9811 snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf); 9812 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 9813 } 9814 } 9815 9816 // Math vector stubs implemented with NEON for 64/128 bits vector size. 9817 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 9818 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 9819 // Skip "tanh" because there is performance regression 9820 if (vop == VectorSupport::VECTOR_OP_TANH) { 9821 continue; 9822 } 9823 9824 // The native library does not support u10 level of "hypot". 9825 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 9826 9827 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 9828 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf); 9829 9830 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 9831 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 9832 9833 snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf); 9834 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 9835 } 9836 } 9837 9838 // Initialization 9839 void generate_initial_stubs() { 9840 // Generate initial stubs and initializes the entry points 9841 9842 // entry points that exist in all platforms Note: This is code 9843 // that could be shared among different platforms - however the 9844 // benefit seems to be smaller than the disadvantage of having a 9845 // much more complicated generator structure. See also comment in 9846 // stubRoutines.hpp. 
9847 9848 StubRoutines::_forward_exception_entry = generate_forward_exception(); 9849 9850 StubRoutines::_call_stub_entry = 9851 generate_call_stub(StubRoutines::_call_stub_return_address); 9852 9853 // is referenced by megamorphic call 9854 StubRoutines::_catch_exception_entry = generate_catch_exception(); 9855 9856 // Initialize table for copy memory (arraycopy) check. 9857 if (UnsafeMemoryAccess::_table == nullptr) { 9858 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory 9859 } 9860 9861 if (UseCRC32Intrinsics) { 9862 // set table address before stub generation which use it 9863 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 9864 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 9865 } 9866 9867 if (UseCRC32CIntrinsics) { 9868 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 9869 } 9870 9871 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 9872 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 9873 } 9874 9875 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 9876 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 9877 } 9878 9879 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) && 9880 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) { 9881 StubRoutines::_hf2f = generate_float16ToFloat(); 9882 StubRoutines::_f2hf = generate_floatToFloat16(); 9883 } 9884 } 9885 9886 void generate_continuation_stubs() { 9887 // Continuation stubs: 9888 StubRoutines::_cont_thaw = generate_cont_thaw(); 9889 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 9890 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 9891 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub(); 9892 } 9893 9894 void generate_final_stubs() { 9895 // support for verify_oop (must happen after universe_init) 9896 if (VerifyOops) { 9897 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 9898 } 9899 9900 // arraycopy stubs used by compilers 9901 generate_arraycopy_stubs(); 9902 9903 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 9904 if (bs_nm != nullptr) { 9905 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 9906 } 9907 9908 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 9909 9910 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 9911 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 9912 9913 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 9914 9915 generate_atomic_entry_points(); 9916 9917 #endif // LINUX 9918 9919 #ifdef COMPILER2 9920 if (UseSecondarySupersTable) { 9921 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub(); 9922 if (! InlineSecondarySupersTest) { 9923 generate_lookup_secondary_supers_table_stub(); 9924 } 9925 } 9926 #endif 9927 9928 StubRoutines::aarch64::set_completed(); // Inidicate that arraycopy and zero_blocks stubs are generated 9929 } 9930 9931 void generate_compiler_stubs() { 9932 #if COMPILER2_OR_JVMCI 9933 9934 if (UseSVE == 0) { 9935 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id); 9936 } 9937 9938 // array equals stub for large arrays. 
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // arrays_hashcode stub for large arrays.
    StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
    StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
    StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
    StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
    StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

    generate_vector_math_stubs();

#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_qrpar();
    }

    if (UseDilithiumIntrinsics) {
      StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
      StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
      StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
      StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
      StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
      StubRoutines::_double_keccak = generate_double_keccak();
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
    }

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
    switch (blob_id) {
    case initial_id:
      generate_initial_stubs();
      break;
    case continuation_id:
      generate_continuation_stubs();
      break;
    case compiler_id:
      generate_compiler_stubs();
      break;
    case final_id:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected blob id: %d", blob_id);
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
  StubGenerator g(code, blob_id);
}

#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                          \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl  \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                                   \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl        \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX
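
// Illustrative sketch (derived from the DEFAULT_ATOMIC_OP macro above, not
// additional generated code): each instantiation declares the hand-written
// default routine from atomic_aarch64.S and binds the dispatch pointer to it.
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands roughly to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//       (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//       = aarch64_atomic_fetch_add_4_default_impl;
//
// The stubs emitted by generate_atomic_entry_points() (called from
// generate_final_stubs() when __ARM_FEATURE_ATOMICS is not available) are
// expected to replace these default implementations at stub-generation time.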