/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/intpow.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address              address
  //    c_rarg1:   result                            address
  //    c_rarg2:   result type                       BasicType
  //    c_rarg3:   method                            Method*
  //    c_rarg4:   (interpreter) entry point         address
  //    c_rarg5:   parameters                        intptr_t*
  //    c_rarg6:   parameter size (in words)         int
  //    c_rarg7:   thread                            Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  //      -29 [ argument word 1      ]
  //      -28 [ saved Floating-point Control Register ]
  //      -26 [ saved v15             ] <--- sp_after_call
  //      -25 [ saved v14             ]
  //      -24 [ saved v13             ]
  //      -23 [ saved v12             ]
  //      -22 [ saved v11             ]
  //      -21 [ saved v10             ]
  //      -20 [ saved v9              ]
  //      -19 [ saved v8              ]
  //      -18 [ saved r28             ]
  //      -17 [ saved r27             ]
  //      -16 [ saved r26             ]
  //      -15 [ saved r25             ]
  //      -14 [ saved r24             ]
  //      -13 [ saved r23             ]
  //      -12 [ saved r22             ]
  //      -11 [ saved r21             ]
  //      -10 [ saved r20             ]
  //       -9 [ saved r19             ]
  //       -8 [ call wrapper    (r0)  ]
  //       -7 [ result          (r1)  ]
  //       -6 [ result type     (r2)  ]
  //       -5 [ method          (r3)  ]
  //       -4 [ entry point     (r4)  ]
  //       -3 [ parameters      (r5)  ]
  //       -2 [ parameter size  (r6)  ]
  //       -1 [ thread          (r7)  ]
  //        0 [ saved fp       (r29)  ] <--- fp == saved sp (r31)
  //        1 [ saved lr       (r30)  ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -28,

    fpcr_off           = sp_after_call_off,
    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubGenStubId stub_id = StubGenStubId::call_stub_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    const Address sp_after_call (rfp, sp_after_call_off * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);
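
    // (Sanity note on the arithmetic above: sp_after_call_off is -28, so the
    // sub computes sp = rfp - 28 * wordSize, leaving sp at the bottom of the
    // register save area laid out in the diagram above.)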

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7, thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    // All of j_rargN may be used to return inline type fields so be careful
    // not to clobber those.
    // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
    // assignment of Rresult below.
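    //
    // Roughly, the dispatch below behaves like (a sketch, not generated code):
    //
    //   switch (result_type) {
    //     case T_OBJECT:  // possibly scalarized, see check_prim below
    //     case T_LONG:    *(jlong*)result   = r0;       break;
    //     case T_FLOAT:   *(jfloat*)result  = j_farg0;  break;
    //     case T_DOUBLE:  *(jdouble*)result = j_farg0;  break;
    //     default:        *(jint*)result    = r0;       break;  // T_INT and friends
    //   }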
    Register Rresult = r14, Rresult_type = r15;
    __ ldr(Rresult, result);
    Label is_long, is_float, is_double, check_prim, exit;
    __ ldr(Rresult_type, result_type);
    __ cmp(Rresult_type, (u1)T_OBJECT);
    __ br(Assembler::EQ, check_prim);
    __ cmp(Rresult_type, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(Rresult_type, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(Rresult_type, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(Rresult));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9,  v8,  d9_save);

    __ ldp(r28, r27, r28_save);
    __ ldp(r26, r25, r26_save);
    __ ldp(r24, r23, r24_save);
    __ ldp(r22, r21, r22_save);
    __ ldp(r20, r19, r20_save);

    // restore fpcr
    __ ldr(rscratch1, fpcr_save);
    __ set_fpcr(rscratch1);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3, method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT
    __ BIND(check_prim);
    if (InlineTypeReturnedAsFields) {
      // Check for scalarized return value
      __ tbz(r0, 0, is_long);
      // Load pack handler address
      __ andr(rscratch1, r0, -2);
      __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
      __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
      __ blr(rscratch1);
      __ b(exit);
    }

    __ BIND(is_long);
    __ str(r0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubGenStubId stub_id = StubGenStubId::catch_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubGenStubId stub_id = StubGenStubId::forward_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {
    StubGenStubId stub_id = StubGenStubId::verify_oop_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  // Generate indices for iota vector.
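  //
  // (Descriptive note: the data emitted below is just the lane indices
  // 0, 1, 2, ... repeated once per element size -- bytes, halfwords, words
  // and doublewords -- plus floating-point variants 0.0f..3.0f and
  // 0.0d..1.0d, so that loading the appropriate slice yields an iota vector
  // for that element type.)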
  address generate_iota_indices(StubGenStubId stub_id) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  // for arraycopy stubs.
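  // (The helper does no work of its own: each copy_load_at_* / copy_store_at_*
  // call below simply forwards to BarrierSetAssembler::copy_load_at /
  // copy_store_at with the element type, decorators and GC temp registers
  // fixed at construction time.)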
  class ArrayCopyBarrierSetHelper : StackObj {
    BarrierSetAssembler* _bs_asm;
    MacroAssembler* _masm;
    DecoratorSet _decorators;
    BasicType _type;
    Register _gct1;
    Register _gct2;
    Register _gct3;
    FloatRegister _gcvt1;
    FloatRegister _gcvt2;
    FloatRegister _gcvt3;

  public:
    ArrayCopyBarrierSetHelper(MacroAssembler* masm,
                              DecoratorSet decorators,
                              BasicType type,
                              Register gct1,
                              Register gct2,
                              Register gct3,
                              FloatRegister gcvt1,
                              FloatRegister gcvt2,
                              FloatRegister gcvt3)
      : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
        _masm(masm),
        _decorators(decorators),
        _type(type),
        _gct1(gct1),
        _gct2(gct2),
        _gct3(gct3),
        _gcvt1(gcvt1),
        _gcvt2(gcvt2),
        _gcvt3(gcvt3) {
    }

    void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
                            dst1, dst2, src,
                            _gct1, _gct2, _gcvt1);
    }

    void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
    }

    void copy_load_at_16(Register dst1, Register dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
                            dst1, dst2, src,
                            _gct1);
    }

    void copy_store_at_16(Address dst, Register src1, Register src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3);
    }

    void copy_load_at_8(Register dst, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
                            dst, noreg, src,
                            _gct1);
    }

    void copy_store_at_8(Address dst, Register src) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
                             dst, src, noreg,
                             _gct1, _gct2, _gct3);
    }
  };

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
    BasicType type;
    copy_direction direction;

    switch (stub_id) {
    case copy_byte_f_id:
      direction = copy_forwards;
      type = T_BYTE;
      break;
    case copy_byte_b_id:
      direction = copy_backwards;
      type = T_BYTE;
      break;
    case copy_oop_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case copy_oop_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    case copy_oop_uninit_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case copy_oop_uninit_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    default:
      ShouldNotReachHere();
    }

    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }
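
    // (Main copy loop: each pass stores the 8 words loaded on the previous
    // pass and immediately reloads the next 8, so loads and stores stay
    // interleaved and count drops by 8 per iteration.)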

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
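    //
    // (Worked example of the dispatch below for a byte copy (granularity 1)
    // with count == 13 == 0b1101: bit 3 is set so 8 bytes are copied, bit 2
    // is set so 4 more follow, bit 1 is clear so the 2-byte copy is skipped,
    // and bit 0 is set so one final byte is copied -- 13 bytes in total.)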

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
    bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;
  Label copy_obj_f, copy_obj_b;
  Label copy_obj_uninit_f, copy_obj_uninit_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
    const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
    const Register send = r17, dend = r16;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
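
    // (Illustration of the inline small-copy strategy below: a 50-byte copy,
    // for instance, takes the 33..64 branch, which loads bytes 0..31 from the
    // front and bytes 18..49 from the back before storing anything, so the
    // overlapping middle is handled without a length-specific loop.)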
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue has more chances to happen when granularity of data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The most performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // can not be used as a temp register, as it contains the count.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
      bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(r15, s, 2 * wordSize - 1);
      } else {
        __ neg(r15, s);
        __ andr(r15, r15, 2 * wordSize - 1);
      }
      // r15 is the byte adjustment needed to align s.
      __ cbz(r15, aligned);
      int shift = exact_log2(granularity);
      if (shift > 0) {
        __ lsr(r15, r15, shift);
      }
      __ sub(count, count, r15);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, r15);
        __ sub(d, d, r15);
      } else {
        __ add(s, s, r15);
        __ add(d, d, r15);
      }
#else
      copy_memory_small(decorators, type, s, d, r15, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.  If the shift is zero
    // perform a move instead to benefit from zero latency moves.
    int shift = exact_log2(wordSize/granularity);
    if (shift > 0) {
      __ lsr(r15, count, shift);
    } else {
      __ mov(r15, count);
    }
    if (direction == copy_forwards) {
      if (type != T_OBJECT) {
        __ bl(copy_f);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_f);
      } else {
        __ bl(copy_obj_f);
      }
    } else {
      if (type != T_OBJECT) {
        __ bl(copy_b);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_b);
      } else {
        __ bl(copy_obj_b);
      }
    }

    // And the tail.
    copy_memory_small(decorators, type, s, d, count, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    RegSet clobbered
      = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
      __ mov(*it, rscratch1);
    }
#endif

  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
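  // (Used by the arraycopy stubs below when VerifyOops is set; for narrow
  // oops the element is decoded first, which performs the verification as a
  // side effect.)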
  void verify_oop_array (int size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, 1);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   stub_id - is used to name the stub and identify all details of
  //             how to perform the copy.
  //
  //   entry - is assigned to the stub's post push entry point unless
  //           it is null
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects: entry is set to the (post push) entry point so it
  //               can be used by the corresponding conjoint copy
  //               method
  //
  address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    int size;
    bool aligned;
    bool is_oop;
    bool dest_uninitialized;
    switch (stub_id) {
    case jbyte_disjoint_arraycopy_id:
      size = sizeof(jbyte);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jbyte_disjoint_arraycopy_id:
      size = sizeof(jbyte);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jshort_disjoint_arraycopy_id:
      size = sizeof(jshort);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jshort_disjoint_arraycopy_id:
      size = sizeof(jshort);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jint_disjoint_arraycopy_id:
      size = sizeof(jint);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jint_disjoint_arraycopy_id:
      size = sizeof(jint);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jlong_disjoint_arraycopy_id:
      // since this is always aligned we can (should!) use the same
      // stub as for case arrayof_jlong_disjoint_arraycopy
      ShouldNotReachHere();
      break;
    case arrayof_jlong_disjoint_arraycopy_id:
      size = sizeof(jlong);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case oop_disjoint_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case arrayof_oop_disjoint_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    case arrayof_oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
      break;
    }

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   stub_id - is used to name the stub and identify all details of
  //             how to perform the copy.
  //
  //   nooverlap_target - identifies the (post push) entry for the
  //                      corresponding disjoint copy routine which can be
  //                      jumped to if the ranges do not actually overlap
  //
  //   entry - is assigned to the stub's post push entry point unless
  //           it is null
  //
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1696 // 1697 // Side Effects: 1698 // entry is set to the no-overlap entry point so it can be used by 1699 // some other conjoint copy method 1700 // 1701 address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 1702 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1703 RegSet saved_regs = RegSet::of(s, d, count); 1704 int size; 1705 bool aligned; 1706 bool is_oop; 1707 bool dest_uninitialized; 1708 switch (stub_id) { 1709 case jbyte_arraycopy_id: 1710 size = sizeof(jbyte); 1711 aligned = false; 1712 is_oop = false; 1713 dest_uninitialized = false; 1714 break; 1715 case arrayof_jbyte_arraycopy_id: 1716 size = sizeof(jbyte); 1717 aligned = true; 1718 is_oop = false; 1719 dest_uninitialized = false; 1720 break; 1721 case jshort_arraycopy_id: 1722 size = sizeof(jshort); 1723 aligned = false; 1724 is_oop = false; 1725 dest_uninitialized = false; 1726 break; 1727 case arrayof_jshort_arraycopy_id: 1728 size = sizeof(jshort); 1729 aligned = true; 1730 is_oop = false; 1731 dest_uninitialized = false; 1732 break; 1733 case jint_arraycopy_id: 1734 size = sizeof(jint); 1735 aligned = false; 1736 is_oop = false; 1737 dest_uninitialized = false; 1738 break; 1739 case arrayof_jint_arraycopy_id: 1740 size = sizeof(jint); 1741 aligned = true; 1742 is_oop = false; 1743 dest_uninitialized = false; 1744 break; 1745 case jlong_arraycopy_id: 1746 // since this is always aligned we can (should!) use the same 1747 // stub as for case arrayof_jlong_disjoint_arraycopy 1748 ShouldNotReachHere(); 1749 break; 1750 case arrayof_jlong_arraycopy_id: 1751 size = sizeof(jlong); 1752 aligned = true; 1753 is_oop = false; 1754 dest_uninitialized = false; 1755 break; 1756 case oop_arraycopy_id: 1757 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1758 aligned = !UseCompressedOops; 1759 is_oop = true; 1760 dest_uninitialized = false; 1761 break; 1762 case arrayof_oop_arraycopy_id: 1763 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1764 aligned = !UseCompressedOops; 1765 is_oop = true; 1766 dest_uninitialized = false; 1767 break; 1768 case oop_arraycopy_uninit_id: 1769 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1770 aligned = !UseCompressedOops; 1771 is_oop = true; 1772 dest_uninitialized = true; 1773 break; 1774 case arrayof_oop_arraycopy_uninit_id: 1775 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1776 aligned = !UseCompressedOops; 1777 is_oop = true; 1778 dest_uninitialized = true; 1779 break; 1780 default: 1781 ShouldNotReachHere(); 1782 } 1783 1784 StubCodeMark mark(this, stub_id); 1785 address start = __ pc(); 1786 __ enter(); 1787 1788 if (entry != nullptr) { 1789 *entry = __ pc(); 1790 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1791 BLOCK_COMMENT("Entry:"); 1792 } 1793 1794 // use fwd copy when (d-s) above_equal (count*size) 1795 __ sub(rscratch1, d, s); 1796 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1797 __ br(Assembler::HS, nooverlap_target); 1798 1799 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1800 if (dest_uninitialized) { 1801 decorators |= IS_DEST_UNINITIALIZED; 1802 } 1803 if (aligned) { 1804 decorators |= ARRAYCOPY_ALIGNED; 1805 } 1806 1807 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1808 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1809 1810 if (is_oop) { 1811 // save regs before copy_memory 1812 __ push(RegSet::of(d, count), sp); 1813 } 1814 { 1815 // UnsafeMemoryAccess page error: continue after unsafe access 1816 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1817 UnsafeMemoryAccessMark umam(this, add_entry, true); 1818 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1819 } 1820 if (is_oop) { 1821 __ pop(RegSet::of(d, count), sp); 1822 if (VerifyOops) 1823 verify_oop_array(size, d, count, r16); 1824 } 1825 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1826 __ leave(); 1827 __ mov(r0, zr); // return 0 1828 __ ret(lr); 1829 return start; 1830 } 1831 1832 // Helper for generating a dynamic type check. 1833 // Smashes rscratch1, rscratch2. 1834 void generate_type_check(Register sub_klass, 1835 Register super_check_offset, 1836 Register super_klass, 1837 Register temp1, 1838 Register temp2, 1839 Register result, 1840 Label& L_success) { 1841 assert_different_registers(sub_klass, super_check_offset, super_klass); 1842 1843 BLOCK_COMMENT("type_check:"); 1844 1845 Label L_miss; 1846 1847 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1848 super_check_offset); 1849 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1850 1851 // Fall through on failure! 
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(StubGenStubId stub_id, address *entry) {
    bool dest_uninitialized;
    switch (stub_id) {
    case checkcast_arraycopy_id:
      dest_uninitialized = false;
      break;
    case checkcast_arraycopy_uninit_id:
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
    }

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
    const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register r19_klass   = r19;       // oop._klass

    // Registers used as gc temps (r5, r6, r7 are save-on-call)
    const Register gct1 = r5, gct2 = r6, gct3 = r7;

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      __ b(L);                  // conjoint check not yet implemented
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != nullptr) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array:  Nothing to do.
    __ cbz(count, L_done);
    __ push(RegSet::of(r19, r20, r21, r22), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
1943 { Label L; 1944 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1945 __ ldrw(start_to, Address(ckval, sco_offset)); 1946 __ cmpw(ckoff, start_to); 1947 __ br(Assembler::EQ, L); 1948 __ stop("super_check_offset inconsistent"); 1949 __ bind(L); 1950 } 1951 #endif //ASSERT 1952 1953 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1954 bool is_oop = true; 1955 int element_size = UseCompressedOops ? 4 : 8; 1956 if (dest_uninitialized) { 1957 decorators |= IS_DEST_UNINITIALIZED; 1958 } 1959 1960 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1961 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1962 1963 // save the original count 1964 __ mov(count_save, count); 1965 1966 // Copy from low to high addresses 1967 __ mov(start_to, to); // Save destination array start address 1968 __ b(L_load_element); 1969 1970 // ======== begin loop ======== 1971 // (Loop is rotated; its entry is L_load_element.) 1972 // Loop control: 1973 // for (; count != 0; count--) { 1974 // copied_oop = load_heap_oop(from++); 1975 // ... generate_type_check ...; 1976 // store_heap_oop(to++, copied_oop); 1977 // } 1978 __ align(OptoLoopAlignment); 1979 1980 __ BIND(L_store_element); 1981 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1982 __ post(to, element_size), copied_oop, noreg, 1983 gct1, gct2, gct3); 1984 __ sub(count, count, 1); 1985 __ cbz(count, L_do_card_marks); 1986 1987 // ======== loop entry is here ======== 1988 __ BIND(L_load_element); 1989 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1990 copied_oop, noreg, __ post(from, element_size), 1991 gct1); 1992 __ cbz(copied_oop, L_store_element); 1993 1994 __ load_klass(r19_klass, copied_oop);// query the object klass 1995 1996 BLOCK_COMMENT("type_check:"); 1997 generate_type_check(/*sub_klass*/r19_klass, 1998 /*super_check_offset*/ckoff, 1999 /*super_klass*/ckval, 2000 /*r_array_base*/gct1, 2001 /*temp2*/gct2, 2002 /*result*/r10, L_store_element); 2003 2004 // Fall through on failure! 2005 2006 // ======== end loop ======== 2007 2008 // It was a real error; we must depend on the caller to finish the job. 2009 // Register count = remaining oops, count_orig = total oops. 2010 // Emit GC store barriers for the oops we have copied and report 2011 // their number to the caller. 2012 2013 __ subs(count, count_save, count); // K = partially copied oop count 2014 __ eon(count, count, zr); // report (-1^K) to caller 2015 __ br(Assembler::EQ, L_done_pop); 2016 2017 __ BIND(L_do_card_marks); 2018 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2019 2020 __ bind(L_done_pop); 2021 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2022 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2023 2024 __ bind(L_done); 2025 __ mov(r0, count); 2026 __ leave(); 2027 __ ret(lr); 2028 2029 return start; 2030 } 2031 2032 // Perform range checks on the proposed arraycopy. 2033 // Kills temp, but nothing else. 2034 // Also, clean the sign bits of src_pos and dst_pos. 
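  //
  // Roughly equivalent logic, as an illustrative sketch only (the actual
  // comparisons below are unsigned 32-bit compares done in registers):
  //
  //   if ((juint)(src_pos + length) > (juint)arrayOop(src)->length()) goto L_failed;
  //   if ((juint)(dst_pos + length) > (juint)arrayOop(dst)->length()) goto L_failed;
  //   src_pos = (juint)src_pos;   // clear the high 32 bits
  //   dst_pos = (juint)dst_pos;
  //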
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  //  Examines the alignment of the operands and dispatches
  //  to a long, int, short, or byte copy loop.
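  //
  //  For example (illustrative): if the source address, destination address
  //  and byte count are all multiples of 8 the request goes to the long copy
  //  loop; if their OR is only 4-byte aligned it goes to the int loop, 2-byte
  //  aligned to the short loop, and anything else to the byte loop.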
2085 // 2086 address generate_unsafe_copy(address byte_copy_entry, 2087 address short_copy_entry, 2088 address int_copy_entry, 2089 address long_copy_entry) { 2090 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2091 2092 Label L_long_aligned, L_int_aligned, L_short_aligned; 2093 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2094 2095 __ align(CodeEntryAlignment); 2096 StubCodeMark mark(this, stub_id); 2097 address start = __ pc(); 2098 __ enter(); // required for proper stackwalking of RuntimeStub frame 2099 2100 // bump this on entry, not on exit: 2101 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2102 2103 __ orr(rscratch1, s, d); 2104 __ orr(rscratch1, rscratch1, count); 2105 2106 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2107 __ cbz(rscratch1, L_long_aligned); 2108 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2109 __ cbz(rscratch1, L_int_aligned); 2110 __ tbz(rscratch1, 0, L_short_aligned); 2111 __ b(RuntimeAddress(byte_copy_entry)); 2112 2113 __ BIND(L_short_aligned); 2114 __ lsr(count, count, LogBytesPerShort); // size => short_count 2115 __ b(RuntimeAddress(short_copy_entry)); 2116 __ BIND(L_int_aligned); 2117 __ lsr(count, count, LogBytesPerInt); // size => int_count 2118 __ b(RuntimeAddress(int_copy_entry)); 2119 __ BIND(L_long_aligned); 2120 __ lsr(count, count, LogBytesPerLong); // size => long_count 2121 __ b(RuntimeAddress(long_copy_entry)); 2122 2123 return start; 2124 } 2125 2126 // 2127 // Generate generic array copy stubs 2128 // 2129 // Input: 2130 // c_rarg0 - src oop 2131 // c_rarg1 - src_pos (32-bits) 2132 // c_rarg2 - dst oop 2133 // c_rarg3 - dst_pos (32-bits) 2134 // c_rarg4 - element count (32-bits) 2135 // 2136 // Output: 2137 // r0 == 0 - success 2138 // r0 == -1^K - failure, where K is partial transfer count 2139 // 2140 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2141 address int_copy_entry, address oop_copy_entry, 2142 address long_copy_entry, address checkcast_copy_entry) { 2143 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2144 2145 Label L_failed, L_objArray; 2146 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2147 2148 // Input registers 2149 const Register src = c_rarg0; // source array oop 2150 const Register src_pos = c_rarg1; // source position 2151 const Register dst = c_rarg2; // destination array oop 2152 const Register dst_pos = c_rarg3; // destination position 2153 const Register length = c_rarg4; 2154 2155 2156 // Registers used as temps 2157 const Register dst_klass = c_rarg5; 2158 2159 __ align(CodeEntryAlignment); 2160 2161 StubCodeMark mark(this, stub_id); 2162 2163 address start = __ pc(); 2164 2165 __ enter(); // required for proper stackwalking of RuntimeStub frame 2166 2167 // bump this on entry, not on exit: 2168 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2169 2170 //----------------------------------------------------------------------- 2171 // Assembler stub will be used for this call to arraycopy 2172 // if the following conditions are met: 2173 // 2174 // (1) src and dst must not be null. 2175 // (2) src_pos must not be negative. 2176 // (3) dst_pos must not be negative. 2177 // (4) length must not be negative. 2178 // (5) src klass and dst klass should be the same and not null. 2179 // (6) src and dst should be arrays. 2180 // (7) src_pos + length must not exceed length of src. 2181 // (8) dst_pos + length must not exceed length of dst. 
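  // In outline (an informal, illustrative sketch of the checks above; the
  // object-array and checkcast cases receive additional handling further down):
  //
  //   if (src == nullptr || dst == nullptr) return -1;
  //   if (src_pos < 0 || dst_pos < 0 || (int)length < 0) return -1;
  //   if (src->klass() == nullptr || src->klass() != dst->klass()) return -1;
  //   if (!src->is_array()) return -1;
  //   if (src_pos + length > src->length() ||
  //       dst_pos + length > dst->length()) return -1;
  //   dispatch to the byte/short/int/long/oop/checkcast copy loop;
  //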
2182 // 2183 2184 // if (src == nullptr) return -1; 2185 __ cbz(src, L_failed); 2186 2187 // if (src_pos < 0) return -1; 2188 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2189 2190 // if (dst == nullptr) return -1; 2191 __ cbz(dst, L_failed); 2192 2193 // if (dst_pos < 0) return -1; 2194 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2195 2196 // registers used as temp 2197 const Register scratch_length = r16; // elements count to copy 2198 const Register scratch_src_klass = r17; // array klass 2199 const Register lh = r15; // layout helper 2200 2201 // if (length < 0) return -1; 2202 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2203 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2204 2205 __ load_klass(scratch_src_klass, src); 2206 #ifdef ASSERT 2207 // assert(src->klass() != nullptr); 2208 { 2209 BLOCK_COMMENT("assert klasses not null {"); 2210 Label L1, L2; 2211 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2212 __ bind(L1); 2213 __ stop("broken null klass"); 2214 __ bind(L2); 2215 __ load_klass(rscratch1, dst); 2216 __ cbz(rscratch1, L1); // this would be broken also 2217 BLOCK_COMMENT("} assert klasses not null done"); 2218 } 2219 #endif 2220 2221 // Load layout helper (32-bits) 2222 // 2223 // |array_tag| | header_size | element_type | |log2_element_size| 2224 // 32 30 24 16 8 2 0 2225 // 2226 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2227 // 2228 2229 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2230 2231 // Handle objArrays completely differently... 2232 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2233 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2234 __ movw(rscratch1, objArray_lh); 2235 __ eorw(rscratch2, lh, rscratch1); 2236 __ cbzw(rscratch2, L_objArray); 2237 2238 // if (src->klass() != dst->klass()) return -1; 2239 __ load_klass(rscratch2, dst); 2240 __ eor(rscratch2, rscratch2, scratch_src_klass); 2241 __ cbnz(rscratch2, L_failed); 2242 2243 // Check for flat inline type array -> return -1 2244 __ test_flat_array_oop(src, rscratch2, L_failed); 2245 2246 // Check for null-free (non-flat) inline type array -> handle as object array 2247 __ test_null_free_array_oop(src, rscratch2, L_objArray); 2248 2249 // if (!src->is_Array()) return -1; 2250 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2251 2252 // At this point, it is known to be a typeArray (array_tag 0x3). 
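    // For example (illustrative), for an int[] the layout helper decoded
    // below has array_tag == 0x3 and log2_element_size == 2, with the
    // header_size field holding the offset of the array data.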
2253 #ifdef ASSERT 2254 { 2255 BLOCK_COMMENT("assert primitive array {"); 2256 Label L; 2257 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2258 __ cmpw(lh, rscratch2); 2259 __ br(Assembler::GE, L); 2260 __ stop("must be a primitive array"); 2261 __ bind(L); 2262 BLOCK_COMMENT("} assert primitive array done"); 2263 } 2264 #endif 2265 2266 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2267 rscratch2, L_failed); 2268 2269 // TypeArrayKlass 2270 // 2271 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2272 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2273 // 2274 2275 const Register rscratch1_offset = rscratch1; // array offset 2276 const Register r15_elsize = lh; // element size 2277 2278 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2279 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2280 __ add(src, src, rscratch1_offset); // src array offset 2281 __ add(dst, dst, rscratch1_offset); // dst array offset 2282 BLOCK_COMMENT("choose copy loop based on element size"); 2283 2284 // next registers should be set before the jump to corresponding stub 2285 const Register from = c_rarg0; // source array address 2286 const Register to = c_rarg1; // destination array address 2287 const Register count = c_rarg2; // elements count 2288 2289 // 'from', 'to', 'count' registers should be set in such order 2290 // since they are the same as 'src', 'src_pos', 'dst'. 2291 2292 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2293 2294 // The possible values of elsize are 0-3, i.e. exact_log2(element 2295 // size in bytes). We do a simple bitwise binary search. 2296 __ BIND(L_copy_bytes); 2297 __ tbnz(r15_elsize, 1, L_copy_ints); 2298 __ tbnz(r15_elsize, 0, L_copy_shorts); 2299 __ lea(from, Address(src, src_pos));// src_addr 2300 __ lea(to, Address(dst, dst_pos));// dst_addr 2301 __ movw(count, scratch_length); // length 2302 __ b(RuntimeAddress(byte_copy_entry)); 2303 2304 __ BIND(L_copy_shorts); 2305 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2306 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2307 __ movw(count, scratch_length); // length 2308 __ b(RuntimeAddress(short_copy_entry)); 2309 2310 __ BIND(L_copy_ints); 2311 __ tbnz(r15_elsize, 0, L_copy_longs); 2312 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2313 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2314 __ movw(count, scratch_length); // length 2315 __ b(RuntimeAddress(int_copy_entry)); 2316 2317 __ BIND(L_copy_longs); 2318 #ifdef ASSERT 2319 { 2320 BLOCK_COMMENT("assert long copy {"); 2321 Label L; 2322 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2323 __ cmpw(r15_elsize, LogBytesPerLong); 2324 __ br(Assembler::EQ, L); 2325 __ stop("must be long copy, but elsize is wrong"); 2326 __ bind(L); 2327 BLOCK_COMMENT("} assert long copy done"); 2328 } 2329 #endif 2330 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2331 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2332 __ movw(count, scratch_length); // length 2333 __ b(RuntimeAddress(long_copy_entry)); 2334 2335 // ObjArrayKlass 2336 __ BIND(L_objArray); 2337 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2338 2339 Label L_plain_copy, L_checkcast_copy; 2340 // test array classes for subtyping 2341 __ load_klass(r15, dst); 2342 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2343 __ br(Assembler::NE, L_checkcast_copy); 2344 2345 // Identically typed arrays can be copied without element-wise checks. 2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2347 rscratch2, L_failed); 2348 2349 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2350 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2351 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2352 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2353 __ movw(count, scratch_length); // length 2354 __ BIND(L_plain_copy); 2355 __ b(RuntimeAddress(oop_copy_entry)); 2356 2357 __ BIND(L_checkcast_copy); 2358 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2359 { 2360 // Before looking at dst.length, make sure dst is also an objArray. 2361 __ ldrw(rscratch1, Address(r15, lh_offset)); 2362 __ movw(rscratch2, objArray_lh); 2363 __ eorw(rscratch1, rscratch1, rscratch2); 2364 __ cbnzw(rscratch1, L_failed); 2365 2366 // It is safe to examine both src.length and dst.length. 2367 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2368 r15, L_failed); 2369 2370 __ load_klass(dst_klass, dst); // reload 2371 2372 // Marshal the base address arguments now, freeing registers. 2373 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2374 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2375 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2376 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2377 __ movw(count, length); // length (reloaded) 2378 Register sco_temp = c_rarg3; // this register is free now 2379 assert_different_registers(from, to, count, sco_temp, 2380 dst_klass, scratch_src_klass); 2381 // assert_clean_int(count, sco_temp); 2382 2383 // Generate the type check. 2384 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2385 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2386 2387 // Smashes rscratch1, rscratch2 2388 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2389 L_plain_copy); 2390 2391 // Fetch destination element klass from the ObjArrayKlass header. 2392 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2393 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2394 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2395 2396 // the checkcast_copy loop needs two extra arguments: 2397 assert(c_rarg3 == sco_temp, "#3 already in place"); 2398 // Set up arguments for checkcast_copy_entry. 2399 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2400 __ b(RuntimeAddress(checkcast_copy_entry)); 2401 } 2402 2403 __ BIND(L_failed); 2404 __ mov(r0, -1); 2405 __ leave(); // required for proper stackwalking of RuntimeStub frame 2406 __ ret(lr); 2407 2408 return start; 2409 } 2410 2411 // 2412 // Generate stub for array fill. If "aligned" is true, the 2413 // "to" address is assumed to be heapword aligned. 
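  //  The fill value is widened by replication before the bulk loop, e.g.
  //  (illustrative) a byte value of 0x5A becomes 0x5A5A5A5A5A5A5A5A before
  //  fill_words/zero_words handle the body of the array.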
2414 // 2415 // Arguments for generated stub: 2416 // to: c_rarg0 2417 // value: c_rarg1 2418 // count: c_rarg2 treated as signed 2419 // 2420 address generate_fill(StubGenStubId stub_id) { 2421 BasicType t; 2422 bool aligned; 2423 2424 switch (stub_id) { 2425 case jbyte_fill_id: 2426 t = T_BYTE; 2427 aligned = false; 2428 break; 2429 case jshort_fill_id: 2430 t = T_SHORT; 2431 aligned = false; 2432 break; 2433 case jint_fill_id: 2434 t = T_INT; 2435 aligned = false; 2436 break; 2437 case arrayof_jbyte_fill_id: 2438 t = T_BYTE; 2439 aligned = true; 2440 break; 2441 case arrayof_jshort_fill_id: 2442 t = T_SHORT; 2443 aligned = true; 2444 break; 2445 case arrayof_jint_fill_id: 2446 t = T_INT; 2447 aligned = true; 2448 break; 2449 default: 2450 ShouldNotReachHere(); 2451 }; 2452 2453 __ align(CodeEntryAlignment); 2454 StubCodeMark mark(this, stub_id); 2455 address start = __ pc(); 2456 2457 BLOCK_COMMENT("Entry:"); 2458 2459 const Register to = c_rarg0; // source array address 2460 const Register value = c_rarg1; // value 2461 const Register count = c_rarg2; // elements count 2462 2463 const Register bz_base = r10; // base for block_zero routine 2464 const Register cnt_words = r11; // temp register 2465 2466 __ enter(); 2467 2468 Label L_fill_elements, L_exit1; 2469 2470 int shift = -1; 2471 switch (t) { 2472 case T_BYTE: 2473 shift = 0; 2474 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2475 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2476 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2477 __ br(Assembler::LO, L_fill_elements); 2478 break; 2479 case T_SHORT: 2480 shift = 1; 2481 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2482 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2483 __ br(Assembler::LO, L_fill_elements); 2484 break; 2485 case T_INT: 2486 shift = 2; 2487 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2488 __ br(Assembler::LO, L_fill_elements); 2489 break; 2490 default: ShouldNotReachHere(); 2491 } 2492 2493 // Align source address at 8 bytes address boundary. 2494 Label L_skip_align1, L_skip_align2, L_skip_align4; 2495 if (!aligned) { 2496 switch (t) { 2497 case T_BYTE: 2498 // One byte misalignment happens only for byte arrays. 2499 __ tbz(to, 0, L_skip_align1); 2500 __ strb(value, Address(__ post(to, 1))); 2501 __ subw(count, count, 1); 2502 __ bind(L_skip_align1); 2503 // Fallthrough 2504 case T_SHORT: 2505 // Two bytes misalignment happens only for byte and short (char) arrays. 2506 __ tbz(to, 1, L_skip_align2); 2507 __ strh(value, Address(__ post(to, 2))); 2508 __ subw(count, count, 2 >> shift); 2509 __ bind(L_skip_align2); 2510 // Fallthrough 2511 case T_INT: 2512 // Align to 8 bytes, we know we are 4 byte aligned to start. 2513 __ tbz(to, 2, L_skip_align4); 2514 __ strw(value, Address(__ post(to, 4))); 2515 __ subw(count, count, 4 >> shift); 2516 __ bind(L_skip_align4); 2517 break; 2518 default: ShouldNotReachHere(); 2519 } 2520 } 2521 2522 // 2523 // Fill large chunks 2524 // 2525 __ lsrw(cnt_words, count, 3 - shift); // number of words 2526 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2527 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2528 if (UseBlockZeroing) { 2529 Label non_block_zeroing, rest; 2530 // If the fill value is zero we can use the fast zero_words(). 
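    // (zero_words returns nullptr if it could not emit the call, e.g. when
    // the code cache is full; that case is treated as fatal below.)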
2531 __ cbnz(value, non_block_zeroing); 2532 __ mov(bz_base, to); 2533 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2534 address tpc = __ zero_words(bz_base, cnt_words); 2535 if (tpc == nullptr) { 2536 fatal("CodeCache is full at generate_fill"); 2537 } 2538 __ b(rest); 2539 __ bind(non_block_zeroing); 2540 __ fill_words(to, cnt_words, value); 2541 __ bind(rest); 2542 } else { 2543 __ fill_words(to, cnt_words, value); 2544 } 2545 2546 // Remaining count is less than 8 bytes. Fill it by a single store. 2547 // Note that the total length is no less than 8 bytes. 2548 if (t == T_BYTE || t == T_SHORT) { 2549 Label L_exit1; 2550 __ cbzw(count, L_exit1); 2551 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2552 __ str(value, Address(to, -8)); // overwrite some elements 2553 __ bind(L_exit1); 2554 __ leave(); 2555 __ ret(lr); 2556 } 2557 2558 // Handle copies less than 8 bytes. 2559 Label L_fill_2, L_fill_4, L_exit2; 2560 __ bind(L_fill_elements); 2561 switch (t) { 2562 case T_BYTE: 2563 __ tbz(count, 0, L_fill_2); 2564 __ strb(value, Address(__ post(to, 1))); 2565 __ bind(L_fill_2); 2566 __ tbz(count, 1, L_fill_4); 2567 __ strh(value, Address(__ post(to, 2))); 2568 __ bind(L_fill_4); 2569 __ tbz(count, 2, L_exit2); 2570 __ strw(value, Address(to)); 2571 break; 2572 case T_SHORT: 2573 __ tbz(count, 0, L_fill_4); 2574 __ strh(value, Address(__ post(to, 2))); 2575 __ bind(L_fill_4); 2576 __ tbz(count, 1, L_exit2); 2577 __ strw(value, Address(to)); 2578 break; 2579 case T_INT: 2580 __ cbzw(count, L_exit2); 2581 __ strw(value, Address(to)); 2582 break; 2583 default: ShouldNotReachHere(); 2584 } 2585 __ bind(L_exit2); 2586 __ leave(); 2587 __ ret(lr); 2588 return start; 2589 } 2590 2591 address generate_data_cache_writeback() { 2592 const Register line = c_rarg0; // address of line to write back 2593 2594 __ align(CodeEntryAlignment); 2595 2596 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id; 2597 StubCodeMark mark(this, stub_id); 2598 2599 address start = __ pc(); 2600 __ enter(); 2601 __ cache_wb(Address(line, 0)); 2602 __ leave(); 2603 __ ret(lr); 2604 2605 return start; 2606 } 2607 2608 address generate_data_cache_writeback_sync() { 2609 const Register is_pre = c_rarg0; // pre or post sync 2610 2611 __ align(CodeEntryAlignment); 2612 2613 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id; 2614 StubCodeMark mark(this, stub_id); 2615 2616 // pre wbsync is a no-op 2617 // post wbsync translates to an sfence 2618 2619 Label skip; 2620 address start = __ pc(); 2621 __ enter(); 2622 __ cbnz(is_pre, skip); 2623 __ cache_wbsync(false); 2624 __ bind(skip); 2625 __ leave(); 2626 __ ret(lr); 2627 2628 return start; 2629 } 2630 2631 void generate_arraycopy_stubs() { 2632 address entry; 2633 address entry_jbyte_arraycopy; 2634 address entry_jshort_arraycopy; 2635 address entry_jint_arraycopy; 2636 address entry_oop_arraycopy; 2637 address entry_jlong_arraycopy; 2638 address entry_checkcast_arraycopy; 2639 2640 generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2641 generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2642 2643 generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2644 generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2645 2646 generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 
2647 generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2648 2649 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2650 2651 //*** jbyte 2652 // Always need aligned and unaligned versions 2653 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry); 2654 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2655 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry); 2656 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr); 2657 2658 //*** jshort 2659 // Always need aligned and unaligned versions 2660 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry); 2661 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2662 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry); 2663 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr); 2664 2665 //*** jint 2666 // Aligned versions 2667 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry); 2668 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2669 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2670 // entry_jint_arraycopy always points to the unaligned version 2671 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry); 2672 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy); 2673 2674 //*** jlong 2675 // It is always aligned 2676 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry); 2677 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2678 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2679 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2680 2681 //*** oops 2682 { 2683 // With compressed oops we need unaligned versions; notice that 2684 // we overwrite entry_oop_arraycopy. 
2685 bool aligned = !UseCompressedOops; 2686 2687 StubRoutines::_arrayof_oop_disjoint_arraycopy 2688 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry); 2689 StubRoutines::_arrayof_oop_arraycopy 2690 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2691 // Aligned versions without pre-barriers 2692 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2693 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2694 StubRoutines::_arrayof_oop_arraycopy_uninit 2695 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2696 } 2697 2698 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2699 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2700 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2701 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2702 2703 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2704 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr); 2705 2706 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2707 entry_jshort_arraycopy, 2708 entry_jint_arraycopy, 2709 entry_jlong_arraycopy); 2710 2711 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2712 entry_jshort_arraycopy, 2713 entry_jint_arraycopy, 2714 entry_oop_arraycopy, 2715 entry_jlong_arraycopy, 2716 entry_checkcast_arraycopy); 2717 2718 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 2719 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 2720 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 2721 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 2722 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 2723 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 2724 } 2725 2726 void generate_math_stubs() { Unimplemented(); } 2727 2728 // Arguments: 2729 // 2730 // Inputs: 2731 // c_rarg0 - source byte array address 2732 // c_rarg1 - destination byte array address 2733 // c_rarg2 - K (key) in little endian int array 2734 // 2735 address generate_aescrypt_encryptBlock() { 2736 __ align(CodeEntryAlignment); 2737 StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2738 StubCodeMark mark(this, stub_id); 2739 2740 const Register from = c_rarg0; // source array address 2741 const Register to = c_rarg1; // destination array address 2742 const Register key = c_rarg2; // key array address 2743 const Register keylen = rscratch1; 2744 2745 address start = __ pc(); 2746 __ enter(); 2747 2748 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2749 2750 __ aesenc_loadkeys(key, keylen); 2751 __ aesecb_encrypt(from, to, keylen); 2752 2753 __ mov(r0, 0); 2754 2755 __ leave(); 2756 __ ret(lr); 2757 2758 return start; 2759 } 2760 2761 // Arguments: 2762 // 2763 // Inputs: 2764 // c_rarg0 - source byte array address 2765 // c_rarg1 - destination byte array address 2766 // c_rarg2 - K (key) in little endian int array 2767 // 2768 address generate_aescrypt_decryptBlock() { 2769 assert(UseAES, "need 
AES cryptographic extension support"); 2770 __ align(CodeEntryAlignment); 2771 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2772 StubCodeMark mark(this, stub_id); 2773 Label L_doLast; 2774 2775 const Register from = c_rarg0; // source array address 2776 const Register to = c_rarg1; // destination array address 2777 const Register key = c_rarg2; // key array address 2778 const Register keylen = rscratch1; 2779 2780 address start = __ pc(); 2781 __ enter(); // required for proper stackwalking of RuntimeStub frame 2782 2783 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2784 2785 __ aesecb_decrypt(from, to, key, keylen); 2786 2787 __ mov(r0, 0); 2788 2789 __ leave(); 2790 __ ret(lr); 2791 2792 return start; 2793 } 2794 2795 // Arguments: 2796 // 2797 // Inputs: 2798 // c_rarg0 - source byte array address 2799 // c_rarg1 - destination byte array address 2800 // c_rarg2 - K (key) in little endian int array 2801 // c_rarg3 - r vector byte array address 2802 // c_rarg4 - input length 2803 // 2804 // Output: 2805 // x0 - input length 2806 // 2807 address generate_cipherBlockChaining_encryptAESCrypt() { 2808 assert(UseAES, "need AES cryptographic extension support"); 2809 __ align(CodeEntryAlignment); 2810 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id; 2811 StubCodeMark mark(this, stub_id); 2812 2813 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2814 2815 const Register from = c_rarg0; // source array address 2816 const Register to = c_rarg1; // destination array address 2817 const Register key = c_rarg2; // key array address 2818 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2819 // and left with the results of the last encryption block 2820 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2821 const Register keylen = rscratch1; 2822 2823 address start = __ pc(); 2824 2825 __ enter(); 2826 2827 __ movw(rscratch2, len_reg); 2828 2829 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2830 2831 __ ld1(v0, __ T16B, rvec); 2832 2833 __ cmpw(keylen, 52); 2834 __ br(Assembler::CC, L_loadkeys_44); 2835 __ br(Assembler::EQ, L_loadkeys_52); 2836 2837 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2838 __ rev32(v17, __ T16B, v17); 2839 __ rev32(v18, __ T16B, v18); 2840 __ BIND(L_loadkeys_52); 2841 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2842 __ rev32(v19, __ T16B, v19); 2843 __ rev32(v20, __ T16B, v20); 2844 __ BIND(L_loadkeys_44); 2845 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2846 __ rev32(v21, __ T16B, v21); 2847 __ rev32(v22, __ T16B, v22); 2848 __ rev32(v23, __ T16B, v23); 2849 __ rev32(v24, __ T16B, v24); 2850 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2851 __ rev32(v25, __ T16B, v25); 2852 __ rev32(v26, __ T16B, v26); 2853 __ rev32(v27, __ T16B, v27); 2854 __ rev32(v28, __ T16B, v28); 2855 __ ld1(v29, v30, v31, __ T16B, key); 2856 __ rev32(v29, __ T16B, v29); 2857 __ rev32(v30, __ T16B, v30); 2858 __ rev32(v31, __ T16B, v31); 2859 2860 __ BIND(L_aes_loop); 2861 __ ld1(v1, __ T16B, __ post(from, 16)); 2862 __ eor(v0, __ T16B, v0, v1); 2863 2864 __ br(Assembler::CC, L_rounds_44); 2865 __ br(Assembler::EQ, L_rounds_52); 2866 2867 __ aese(v0, v17); __ aesmc(v0, v0); 2868 __ aese(v0, v18); __ aesmc(v0, v0); 2869 __ BIND(L_rounds_52); 2870 __ aese(v0, v19); __ aesmc(v0, v0); 2871 __ aese(v0, v20); 
__ aesmc(v0, v0); 2872 __ BIND(L_rounds_44); 2873 __ aese(v0, v21); __ aesmc(v0, v0); 2874 __ aese(v0, v22); __ aesmc(v0, v0); 2875 __ aese(v0, v23); __ aesmc(v0, v0); 2876 __ aese(v0, v24); __ aesmc(v0, v0); 2877 __ aese(v0, v25); __ aesmc(v0, v0); 2878 __ aese(v0, v26); __ aesmc(v0, v0); 2879 __ aese(v0, v27); __ aesmc(v0, v0); 2880 __ aese(v0, v28); __ aesmc(v0, v0); 2881 __ aese(v0, v29); __ aesmc(v0, v0); 2882 __ aese(v0, v30); 2883 __ eor(v0, __ T16B, v0, v31); 2884 2885 __ st1(v0, __ T16B, __ post(to, 16)); 2886 2887 __ subw(len_reg, len_reg, 16); 2888 __ cbnzw(len_reg, L_aes_loop); 2889 2890 __ st1(v0, __ T16B, rvec); 2891 2892 __ mov(r0, rscratch2); 2893 2894 __ leave(); 2895 __ ret(lr); 2896 2897 return start; 2898 } 2899 2900 // Arguments: 2901 // 2902 // Inputs: 2903 // c_rarg0 - source byte array address 2904 // c_rarg1 - destination byte array address 2905 // c_rarg2 - K (key) in little endian int array 2906 // c_rarg3 - r vector byte array address 2907 // c_rarg4 - input length 2908 // 2909 // Output: 2910 // r0 - input length 2911 // 2912 address generate_cipherBlockChaining_decryptAESCrypt() { 2913 assert(UseAES, "need AES cryptographic extension support"); 2914 __ align(CodeEntryAlignment); 2915 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id; 2916 StubCodeMark mark(this, stub_id); 2917 2918 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2919 2920 const Register from = c_rarg0; // source array address 2921 const Register to = c_rarg1; // destination array address 2922 const Register key = c_rarg2; // key array address 2923 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2924 // and left with the results of the last encryption block 2925 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2926 const Register keylen = rscratch1; 2927 2928 address start = __ pc(); 2929 2930 __ enter(); 2931 2932 __ movw(rscratch2, len_reg); 2933 2934 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2935 2936 __ ld1(v2, __ T16B, rvec); 2937 2938 __ ld1(v31, __ T16B, __ post(key, 16)); 2939 __ rev32(v31, __ T16B, v31); 2940 2941 __ cmpw(keylen, 52); 2942 __ br(Assembler::CC, L_loadkeys_44); 2943 __ br(Assembler::EQ, L_loadkeys_52); 2944 2945 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2946 __ rev32(v17, __ T16B, v17); 2947 __ rev32(v18, __ T16B, v18); 2948 __ BIND(L_loadkeys_52); 2949 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2950 __ rev32(v19, __ T16B, v19); 2951 __ rev32(v20, __ T16B, v20); 2952 __ BIND(L_loadkeys_44); 2953 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2954 __ rev32(v21, __ T16B, v21); 2955 __ rev32(v22, __ T16B, v22); 2956 __ rev32(v23, __ T16B, v23); 2957 __ rev32(v24, __ T16B, v24); 2958 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2959 __ rev32(v25, __ T16B, v25); 2960 __ rev32(v26, __ T16B, v26); 2961 __ rev32(v27, __ T16B, v27); 2962 __ rev32(v28, __ T16B, v28); 2963 __ ld1(v29, v30, __ T16B, key); 2964 __ rev32(v29, __ T16B, v29); 2965 __ rev32(v30, __ T16B, v30); 2966 2967 __ BIND(L_aes_loop); 2968 __ ld1(v0, __ T16B, __ post(from, 16)); 2969 __ orr(v1, __ T16B, v0, v0); 2970 2971 __ br(Assembler::CC, L_rounds_44); 2972 __ br(Assembler::EQ, L_rounds_52); 2973 2974 __ aesd(v0, v17); __ aesimc(v0, v0); 2975 __ aesd(v0, v18); __ aesimc(v0, v0); 2976 __ BIND(L_rounds_52); 2977 __ aesd(v0, v19); __ aesimc(v0, v0); 2978 __ aesd(v0, v20); __ aesimc(v0, v0); 2979 
__ BIND(L_rounds_44); 2980 __ aesd(v0, v21); __ aesimc(v0, v0); 2981 __ aesd(v0, v22); __ aesimc(v0, v0); 2982 __ aesd(v0, v23); __ aesimc(v0, v0); 2983 __ aesd(v0, v24); __ aesimc(v0, v0); 2984 __ aesd(v0, v25); __ aesimc(v0, v0); 2985 __ aesd(v0, v26); __ aesimc(v0, v0); 2986 __ aesd(v0, v27); __ aesimc(v0, v0); 2987 __ aesd(v0, v28); __ aesimc(v0, v0); 2988 __ aesd(v0, v29); __ aesimc(v0, v0); 2989 __ aesd(v0, v30); 2990 __ eor(v0, __ T16B, v0, v31); 2991 __ eor(v0, __ T16B, v0, v2); 2992 2993 __ st1(v0, __ T16B, __ post(to, 16)); 2994 __ orr(v2, __ T16B, v1, v1); 2995 2996 __ subw(len_reg, len_reg, 16); 2997 __ cbnzw(len_reg, L_aes_loop); 2998 2999 __ st1(v2, __ T16B, rvec); 3000 3001 __ mov(r0, rscratch2); 3002 3003 __ leave(); 3004 __ ret(lr); 3005 3006 return start; 3007 } 3008 3009 // Big-endian 128-bit + 64-bit -> 128-bit addition. 3010 // Inputs: 128-bits. in is preserved. 3011 // The least-significant 64-bit word is in the upper dword of each vector. 3012 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 3013 // Output: result 3014 void be_add_128_64(FloatRegister result, FloatRegister in, 3015 FloatRegister inc, FloatRegister tmp) { 3016 assert_different_registers(result, tmp, inc); 3017 3018 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 3019 // input 3020 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3021 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3022 // MSD == 0 (must be!) to LSD 3023 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3024 } 3025 3026 // CTR AES crypt. 3027 // Arguments: 3028 // 3029 // Inputs: 3030 // c_rarg0 - source byte array address 3031 // c_rarg1 - destination byte array address 3032 // c_rarg2 - K (key) in little endian int array 3033 // c_rarg3 - counter vector byte array address 3034 // c_rarg4 - input length 3035 // c_rarg5 - saved encryptedCounter start 3036 // c_rarg6 - saved used length 3037 // 3038 // Output: 3039 // r0 - input length 3040 // 3041 address generate_counterMode_AESCrypt() { 3042 const Register in = c_rarg0; 3043 const Register out = c_rarg1; 3044 const Register key = c_rarg2; 3045 const Register counter = c_rarg3; 3046 const Register saved_len = c_rarg4, len = r10; 3047 const Register saved_encrypted_ctr = c_rarg5; 3048 const Register used_ptr = c_rarg6, used = r12; 3049 3050 const Register offset = r7; 3051 const Register keylen = r11; 3052 3053 const unsigned char block_size = 16; 3054 const int bulk_width = 4; 3055 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3056 // performance with larger data sizes, but it also means that the 3057 // fast path isn't used until you have at least 8 blocks, and up 3058 // to 127 bytes of data will be executed on the slow path. For 3059 // that reason, and also so as not to blow away too much icache, 4 3060 // blocks seems like a sensible compromise. 
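    // With bulk_width == 4 the bulk path below consumes 4 * 16 = 64 bytes
    // per iteration, so at most 63 trailing bytes are left to the
    // single-block and byte-at-a-time paths.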
3061 3062 // Algorithm: 3063 // 3064 // if (len == 0) { 3065 // goto DONE; 3066 // } 3067 // int result = len; 3068 // do { 3069 // if (used >= blockSize) { 3070 // if (len >= bulk_width * blockSize) { 3071 // CTR_large_block(); 3072 // if (len == 0) 3073 // goto DONE; 3074 // } 3075 // for (;;) { 3076 // 16ByteVector v0 = counter; 3077 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3078 // used = 0; 3079 // if (len < blockSize) 3080 // break; /* goto NEXT */ 3081 // 16ByteVector v1 = load16Bytes(in, offset); 3082 // v1 = v1 ^ encryptedCounter; 3083 // store16Bytes(out, offset); 3084 // used = blockSize; 3085 // offset += blockSize; 3086 // len -= blockSize; 3087 // if (len == 0) 3088 // goto DONE; 3089 // } 3090 // } 3091 // NEXT: 3092 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3093 // len--; 3094 // } while (len != 0); 3095 // DONE: 3096 // return result; 3097 // 3098 // CTR_large_block() 3099 // Wide bulk encryption of whole blocks. 3100 3101 __ align(CodeEntryAlignment); 3102 StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id; 3103 StubCodeMark mark(this, stub_id); 3104 const address start = __ pc(); 3105 __ enter(); 3106 3107 Label DONE, CTR_large_block, large_block_return; 3108 __ ldrw(used, Address(used_ptr)); 3109 __ cbzw(saved_len, DONE); 3110 3111 __ mov(len, saved_len); 3112 __ mov(offset, 0); 3113 3114 // Compute #rounds for AES based on the length of the key array 3115 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3116 3117 __ aesenc_loadkeys(key, keylen); 3118 3119 { 3120 Label L_CTR_loop, NEXT; 3121 3122 __ bind(L_CTR_loop); 3123 3124 __ cmp(used, block_size); 3125 __ br(__ LO, NEXT); 3126 3127 // Maybe we have a lot of data 3128 __ subsw(rscratch1, len, bulk_width * block_size); 3129 __ br(__ HS, CTR_large_block); 3130 __ BIND(large_block_return); 3131 __ cbzw(len, DONE); 3132 3133 // Setup the counter 3134 __ movi(v4, __ T4S, 0); 3135 __ movi(v5, __ T4S, 1); 3136 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3137 3138 // 128-bit big-endian increment 3139 __ ld1(v0, __ T16B, counter); 3140 __ rev64(v16, __ T16B, v0); 3141 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3142 __ rev64(v16, __ T16B, v16); 3143 __ st1(v16, __ T16B, counter); 3144 // Previous counter value is in v0 3145 // v4 contains { 0, 1 } 3146 3147 { 3148 // We have fewer than bulk_width blocks of data left. Encrypt 3149 // them one by one until there is less than a full block 3150 // remaining, being careful to save both the encrypted counter 3151 // and the counter. 3152 3153 Label inner_loop; 3154 __ bind(inner_loop); 3155 // Counter to encrypt is in v0 3156 __ aesecb_encrypt(noreg, noreg, keylen); 3157 __ st1(v0, __ T16B, saved_encrypted_ctr); 3158 3159 // Do we have a remaining full block? 
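    // If len >= block_size we XOR one full 16-byte block of input with the
    // encrypted counter here; otherwise we fall through to NEXT and the tail
    // is handled one byte at a time against the saved encrypted counter.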
3160 3161 __ mov(used, 0); 3162 __ cmp(len, block_size); 3163 __ br(__ LO, NEXT); 3164 3165 // Yes, we have a full block 3166 __ ldrq(v1, Address(in, offset)); 3167 __ eor(v1, __ T16B, v1, v0); 3168 __ strq(v1, Address(out, offset)); 3169 __ mov(used, block_size); 3170 __ add(offset, offset, block_size); 3171 3172 __ subw(len, len, block_size); 3173 __ cbzw(len, DONE); 3174 3175 // Increment the counter, store it back 3176 __ orr(v0, __ T16B, v16, v16); 3177 __ rev64(v16, __ T16B, v16); 3178 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3179 __ rev64(v16, __ T16B, v16); 3180 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3181 3182 __ b(inner_loop); 3183 } 3184 3185 __ BIND(NEXT); 3186 3187 // Encrypt a single byte, and loop. 3188 // We expect this to be a rare event. 3189 __ ldrb(rscratch1, Address(in, offset)); 3190 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3191 __ eor(rscratch1, rscratch1, rscratch2); 3192 __ strb(rscratch1, Address(out, offset)); 3193 __ add(offset, offset, 1); 3194 __ add(used, used, 1); 3195 __ subw(len, len,1); 3196 __ cbnzw(len, L_CTR_loop); 3197 } 3198 3199 __ bind(DONE); 3200 __ strw(used, Address(used_ptr)); 3201 __ mov(r0, saved_len); 3202 3203 __ leave(); // required for proper stackwalking of RuntimeStub frame 3204 __ ret(lr); 3205 3206 // Bulk encryption 3207 3208 __ BIND (CTR_large_block); 3209 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3210 3211 if (bulk_width == 8) { 3212 __ sub(sp, sp, 4 * 16); 3213 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3214 } 3215 __ sub(sp, sp, 4 * 16); 3216 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3217 RegSet saved_regs = (RegSet::of(in, out, offset) 3218 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3219 __ push(saved_regs, sp); 3220 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3221 __ add(in, in, offset); 3222 __ add(out, out, offset); 3223 3224 // Keys should already be loaded into the correct registers 3225 3226 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3227 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3228 3229 // AES/CTR loop 3230 { 3231 Label L_CTR_loop; 3232 __ BIND(L_CTR_loop); 3233 3234 // Setup the counters 3235 __ movi(v8, __ T4S, 0); 3236 __ movi(v9, __ T4S, 1); 3237 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3238 3239 for (int i = 0; i < bulk_width; i++) { 3240 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3241 __ rev64(v0_ofs, __ T16B, v16); 3242 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3243 } 3244 3245 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3246 3247 // Encrypt the counters 3248 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3249 3250 if (bulk_width == 8) { 3251 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3252 } 3253 3254 // XOR the encrypted counters with the inputs 3255 for (int i = 0; i < bulk_width; i++) { 3256 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3257 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3258 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3259 } 3260 3261 // Write the encrypted data 3262 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3263 if (bulk_width == 8) { 3264 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3265 } 3266 3267 __ subw(len, len, 16 * bulk_width); 3268 __ cbnzw(len, L_CTR_loop); 3269 } 3270 3271 // Save the counter back where it goes 3272 __ rev64(v16, __ T16B, v16); 3273 __ st1(v16, __ T16B, counter); 3274 3275 __ pop(saved_regs, sp); 
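    // Restore the callee-saved SIMD registers spilled on entry to
    // CTR_large_block (v8..v11, plus v12..v15 when bulk_width == 8).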
3276 3277 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3278 if (bulk_width == 8) { 3279 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3280 } 3281 3282 __ andr(rscratch1, len, -16 * bulk_width); 3283 __ sub(len, len, rscratch1); 3284 __ add(offset, offset, rscratch1); 3285 __ mov(used, 16); 3286 __ strw(used, Address(used_ptr)); 3287 __ b(large_block_return); 3288 3289 return start; 3290 } 3291 3292 // Vector AES Galois Counter Mode implementation. Parameters: 3293 // 3294 // in = c_rarg0 3295 // len = c_rarg1 3296 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3297 // out = c_rarg3 3298 // key = c_rarg4 3299 // state = c_rarg5 - GHASH.state 3300 // subkeyHtbl = c_rarg6 - powers of H 3301 // counter = c_rarg7 - 16 bytes of CTR 3302 // return - number of processed bytes 3303 address generate_galoisCounterMode_AESCrypt() { 3304 address ghash_polynomial = __ pc(); 3305 __ emit_int64(0x87); // The low-order bits of the field 3306 // polynomial (i.e. p = z^7+z^2+z+1) 3307 // repeated in the low and high parts of a 3308 // 128-bit vector 3309 __ emit_int64(0x87); 3310 3311 __ align(CodeEntryAlignment); 3312 StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id; 3313 StubCodeMark mark(this, stub_id); 3314 address start = __ pc(); 3315 __ enter(); 3316 3317 const Register in = c_rarg0; 3318 const Register len = c_rarg1; 3319 const Register ct = c_rarg2; 3320 const Register out = c_rarg3; 3321 // and updated with the incremented counter in the end 3322 3323 const Register key = c_rarg4; 3324 const Register state = c_rarg5; 3325 3326 const Register subkeyHtbl = c_rarg6; 3327 3328 const Register counter = c_rarg7; 3329 3330 const Register keylen = r10; 3331 // Save state before entering routine 3332 __ sub(sp, sp, 4 * 16); 3333 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3334 __ sub(sp, sp, 4 * 16); 3335 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3336 3337 // __ andr(len, len, -512); 3338 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3339 __ str(len, __ pre(sp, -2 * wordSize)); 3340 3341 Label DONE; 3342 __ cbz(len, DONE); 3343 3344 // Compute #rounds for AES based on the length of the key array 3345 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3346 3347 __ aesenc_loadkeys(key, keylen); 3348 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3349 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3350 3351 // AES/CTR loop 3352 { 3353 Label L_CTR_loop; 3354 __ BIND(L_CTR_loop); 3355 3356 // Setup the counters 3357 __ movi(v8, __ T4S, 0); 3358 __ movi(v9, __ T4S, 1); 3359 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3360 3361 assert(v0->encoding() < v8->encoding(), ""); 3362 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3363 FloatRegister f = as_FloatRegister(i); 3364 __ rev32(f, __ T16B, v16); 3365 __ addv(v16, __ T4S, v16, v8); 3366 } 3367 3368 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3369 3370 // Encrypt the counters 3371 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3372 3373 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3374 3375 // XOR the encrypted counters with the inputs 3376 for (int i = 0; i < 8; i++) { 3377 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3378 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3379 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3380 } 3381 __ st1(v0, v1, v2, v3, __ T16B, __ 
post(out, 4 * 16)); 3382 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3383 3384 __ subw(len, len, 16 * 8); 3385 __ cbnzw(len, L_CTR_loop); 3386 } 3387 3388 __ rev32(v16, __ T16B, v16); 3389 __ st1(v16, __ T16B, counter); 3390 3391 __ ldr(len, Address(sp)); 3392 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3393 3394 // GHASH/CTR loop 3395 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3396 len, /*unrolls*/4); 3397 3398 #ifdef ASSERT 3399 { Label L; 3400 __ cmp(len, (unsigned char)0); 3401 __ br(Assembler::EQ, L); 3402 __ stop("stubGenerator: abort"); 3403 __ bind(L); 3404 } 3405 #endif 3406 3407 __ bind(DONE); 3408 // Return the number of bytes processed 3409 __ ldr(r0, __ post(sp, 2 * wordSize)); 3410 3411 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3412 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3413 3414 __ leave(); // required for proper stackwalking of RuntimeStub frame 3415 __ ret(lr); 3416 return start; 3417 } 3418 3419 class Cached64Bytes { 3420 private: 3421 MacroAssembler *_masm; 3422 Register _regs[8]; 3423 3424 public: 3425 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3426 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3427 auto it = rs.begin(); 3428 for (auto &r: _regs) { 3429 r = *it; 3430 ++it; 3431 } 3432 } 3433 3434 void gen_loads(Register base) { 3435 for (int i = 0; i < 8; i += 2) { 3436 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3437 } 3438 } 3439 3440 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3441 void extract_u32(Register dest, int i) { 3442 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3443 } 3444 }; 3445 3446 // Utility routines for md5. 3447 // Clobbers r10 and r11. 
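  // For reference only (an illustrative sketch, not used by the generated
  // stub code), the boolean functions realized by md5_FF/GG/HH/II below are
  //   F(x, y, z) = (x & y) | (~x & z)
  //   G(x, y, z) = (x & z) | (y & ~z)
  //   H(x, y, z) = x ^ y ^ z
  //   I(x, y, z) = y ^ (x | ~z)
  // and each helper computes r1 = r2 + rol32(r1 + f(r2, r3, r4) + x[k] + t, s).
  static uint32_t md5_F_ref(uint32_t x, uint32_t y, uint32_t z) { return ((y ^ z) & x) ^ z; }
  static uint32_t md5_G_ref(uint32_t x, uint32_t y, uint32_t z) { return ((x ^ y) & z) ^ y; }
  static uint32_t md5_H_ref(uint32_t x, uint32_t y, uint32_t z) { return x ^ y ^ z; }
  static uint32_t md5_I_ref(uint32_t x, uint32_t y, uint32_t z) { return y ^ (x | ~z); }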
3448 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3449 int k, int s, int t) { 3450 Register rscratch3 = r10; 3451 Register rscratch4 = r11; 3452 3453 __ eorw(rscratch3, r3, r4); 3454 __ movw(rscratch2, t); 3455 __ andw(rscratch3, rscratch3, r2); 3456 __ addw(rscratch4, r1, rscratch2); 3457 reg_cache.extract_u32(rscratch1, k); 3458 __ eorw(rscratch3, rscratch3, r4); 3459 __ addw(rscratch4, rscratch4, rscratch1); 3460 __ addw(rscratch3, rscratch3, rscratch4); 3461 __ rorw(rscratch2, rscratch3, 32 - s); 3462 __ addw(r1, rscratch2, r2); 3463 } 3464 3465 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3466 int k, int s, int t) { 3467 Register rscratch3 = r10; 3468 Register rscratch4 = r11; 3469 3470 reg_cache.extract_u32(rscratch1, k); 3471 __ movw(rscratch2, t); 3472 __ addw(rscratch4, r1, rscratch2); 3473 __ addw(rscratch4, rscratch4, rscratch1); 3474 __ bicw(rscratch2, r3, r4); 3475 __ andw(rscratch3, r2, r4); 3476 __ addw(rscratch2, rscratch2, rscratch4); 3477 __ addw(rscratch2, rscratch2, rscratch3); 3478 __ rorw(rscratch2, rscratch2, 32 - s); 3479 __ addw(r1, rscratch2, r2); 3480 } 3481 3482 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3483 int k, int s, int t) { 3484 Register rscratch3 = r10; 3485 Register rscratch4 = r11; 3486 3487 __ eorw(rscratch3, r3, r4); 3488 __ movw(rscratch2, t); 3489 __ addw(rscratch4, r1, rscratch2); 3490 reg_cache.extract_u32(rscratch1, k); 3491 __ eorw(rscratch3, rscratch3, r2); 3492 __ addw(rscratch4, rscratch4, rscratch1); 3493 __ addw(rscratch3, rscratch3, rscratch4); 3494 __ rorw(rscratch2, rscratch3, 32 - s); 3495 __ addw(r1, rscratch2, r2); 3496 } 3497 3498 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3499 int k, int s, int t) { 3500 Register rscratch3 = r10; 3501 Register rscratch4 = r11; 3502 3503 __ movw(rscratch3, t); 3504 __ ornw(rscratch2, r2, r4); 3505 __ addw(rscratch4, r1, rscratch3); 3506 reg_cache.extract_u32(rscratch1, k); 3507 __ eorw(rscratch3, rscratch2, r3); 3508 __ addw(rscratch4, rscratch4, rscratch1); 3509 __ addw(rscratch3, rscratch3, rscratch4); 3510 __ rorw(rscratch2, rscratch3, 32 - s); 3511 __ addw(r1, rscratch2, r2); 3512 } 3513 3514 // Arguments: 3515 // 3516 // Inputs: 3517 // c_rarg0 - byte[] source+offset 3518 // c_rarg1 - int[] SHA.state 3519 // c_rarg2 - int offset 3520 // c_rarg3 - int limit 3521 // 3522 address generate_md5_implCompress(StubGenStubId stub_id) { 3523 bool multi_block; 3524 switch (stub_id) { 3525 case md5_implCompress_id: 3526 multi_block = false; 3527 break; 3528 case md5_implCompressMB_id: 3529 multi_block = true; 3530 break; 3531 default: 3532 ShouldNotReachHere(); 3533 } 3534 __ align(CodeEntryAlignment); 3535 3536 StubCodeMark mark(this, stub_id); 3537 address start = __ pc(); 3538 3539 Register buf = c_rarg0; 3540 Register state = c_rarg1; 3541 Register ofs = c_rarg2; 3542 Register limit = c_rarg3; 3543 Register a = r4; 3544 Register b = r5; 3545 Register c = r6; 3546 Register d = r7; 3547 Register rscratch3 = r10; 3548 Register rscratch4 = r11; 3549 3550 Register state_regs[2] = { r12, r13 }; 3551 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3552 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3553 3554 __ push(saved_regs, sp); 3555 3556 __ ldp(state_regs[0], state_regs[1], Address(state)); 3557 __ ubfx(a, state_regs[0], 0, 32); 3558 __ ubfx(b, state_regs[0], 32, 32); 3559 __ 
ubfx(c, state_regs[1], 0, 32); 3560 __ ubfx(d, state_regs[1], 32, 32); 3561 3562 Label md5_loop; 3563 __ BIND(md5_loop); 3564 3565 reg_cache.gen_loads(buf); 3566 3567 // Round 1 3568 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3569 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3570 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3571 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3572 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3573 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3574 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3575 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3576 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3577 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3578 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3579 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3580 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3581 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3582 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3583 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3584 3585 // Round 2 3586 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3587 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3588 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3589 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3590 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3591 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3592 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3593 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3594 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3595 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3596 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3597 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3598 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3599 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3600 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3601 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3602 3603 // Round 3 3604 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3605 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3606 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3607 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3608 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3609 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3610 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3611 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3612 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3613 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3614 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3615 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3616 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3617 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3618 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3619 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3620 3621 // Round 4 3622 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3623 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3624 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3625 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3626 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3627 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3628 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3629 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3630 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3631 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3632 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3633 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3634 
md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3635 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3636 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3637 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3638 3639 __ addw(a, state_regs[0], a); 3640 __ ubfx(rscratch2, state_regs[0], 32, 32); 3641 __ addw(b, rscratch2, b); 3642 __ addw(c, state_regs[1], c); 3643 __ ubfx(rscratch4, state_regs[1], 32, 32); 3644 __ addw(d, rscratch4, d); 3645 3646 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3647 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3648 3649 if (multi_block) { 3650 __ add(buf, buf, 64); 3651 __ add(ofs, ofs, 64); 3652 __ cmp(ofs, limit); 3653 __ br(Assembler::LE, md5_loop); 3654 __ mov(c_rarg0, ofs); // return ofs 3655 } 3656 3657 // write hash values back in the correct order 3658 __ stp(state_regs[0], state_regs[1], Address(state)); 3659 3660 __ pop(saved_regs, sp); 3661 3662 __ ret(lr); 3663 3664 return start; 3665 } 3666 3667 // Arguments: 3668 // 3669 // Inputs: 3670 // c_rarg0 - byte[] source+offset 3671 // c_rarg1 - int[] SHA.state 3672 // c_rarg2 - int offset 3673 // c_rarg3 - int limit 3674 // 3675 address generate_sha1_implCompress(StubGenStubId stub_id) { 3676 bool multi_block; 3677 switch (stub_id) { 3678 case sha1_implCompress_id: 3679 multi_block = false; 3680 break; 3681 case sha1_implCompressMB_id: 3682 multi_block = true; 3683 break; 3684 default: 3685 ShouldNotReachHere(); 3686 } 3687 3688 __ align(CodeEntryAlignment); 3689 3690 StubCodeMark mark(this, stub_id); 3691 address start = __ pc(); 3692 3693 Register buf = c_rarg0; 3694 Register state = c_rarg1; 3695 Register ofs = c_rarg2; 3696 Register limit = c_rarg3; 3697 3698 Label keys; 3699 Label sha1_loop; 3700 3701 // load the keys into v0..v3 3702 __ adr(rscratch1, keys); 3703 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3704 // load 5 words state into v6, v7 3705 __ ldrq(v6, Address(state, 0)); 3706 __ ldrs(v7, Address(state, 16)); 3707 3708 3709 __ BIND(sha1_loop); 3710 // load 64 bytes of data into v16..v19 3711 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3712 __ rev32(v16, __ T16B, v16); 3713 __ rev32(v17, __ T16B, v17); 3714 __ rev32(v18, __ T16B, v18); 3715 __ rev32(v19, __ T16B, v19); 3716 3717 // do the sha1 3718 __ addv(v4, __ T4S, v16, v0); 3719 __ orr(v20, __ T16B, v6, v6); 3720 3721 FloatRegister d0 = v16; 3722 FloatRegister d1 = v17; 3723 FloatRegister d2 = v18; 3724 FloatRegister d3 = v19; 3725 3726 for (int round = 0; round < 20; round++) { 3727 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3728 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3729 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3730 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3731 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3732 3733 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3734 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3735 __ sha1h(tmp2, __ T4S, v20); 3736 if (round < 5) 3737 __ sha1c(v20, __ T4S, tmp3, tmp4); 3738 else if (round < 10 || round >= 15) 3739 __ sha1p(v20, __ T4S, tmp3, tmp4); 3740 else 3741 __ sha1m(v20, __ T4S, tmp3, tmp4); 3742 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3743 3744 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3745 } 3746 3747 __ addv(v7, __ T2S, v7, v21); 3748 __ addv(v6, __ T4S, v6, v20); 3749 3750 if (multi_block) { 3751 __ add(ofs, ofs, 64); 3752 __ cmp(ofs, limit); 3753 __ br(Assembler::LE, sha1_loop); 3754 __ mov(c_rarg0, ofs); // return ofs 3755 } 3756 3757 __ strq(v6, Address(state, 0)); 3758 __ strs(v7, Address(state, 16)); 3759 3760 __ ret(lr); 3761 3762 __ bind(keys); 3763 __ emit_int32(0x5a827999); 3764 __ emit_int32(0x6ed9eba1); 3765 __ emit_int32(0x8f1bbcdc); 3766 __ emit_int32(0xca62c1d6); 3767 3768 return start; 3769 } 3770 3771 3772 // Arguments: 3773 // 3774 // Inputs: 3775 // c_rarg0 - byte[] source+offset 3776 // c_rarg1 - int[] SHA.state 3777 // c_rarg2 - int offset 3778 // c_rarg3 - int limit 3779 // 3780 address generate_sha256_implCompress(StubGenStubId stub_id) { 3781 bool multi_block; 3782 switch (stub_id) { 3783 case sha256_implCompress_id: 3784 multi_block = false; 3785 break; 3786 case sha256_implCompressMB_id: 3787 multi_block = true; 3788 break; 3789 default: 3790 ShouldNotReachHere(); 3791 } 3792 3793 static const uint32_t round_consts[64] = { 3794 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3795 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3796 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3797 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3798 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3799 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3800 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3801 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3802 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3803 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3804 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3805 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3806 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3807 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3808 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3809 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3810 }; 3811 3812 __ align(CodeEntryAlignment); 3813 3814 StubCodeMark mark(this, stub_id); 3815 address start = __ pc(); 3816 3817 Register buf = c_rarg0; 3818 Register state = c_rarg1; 3819 Register ofs = c_rarg2; 3820 Register limit = c_rarg3; 3821 3822 Label sha1_loop; 3823 3824 __ stpd(v8, v9, __ pre(sp, -32)); 3825 __ stpd(v10, v11, Address(sp, 16)); 3826 3827 // dga == v0 3828 // dgb == v1 3829 // dg0 == v2 3830 // dg1 == v3 3831 // dg2 == v4 3832 // t0 == v6 3833 // t1 == v7 3834 3835 // load 16 keys to v16..v31 3836 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3837 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3838 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3839 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3840 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3841 3842 // load 8 words (256 bits) state 3843 __ ldpq(v0, v1, state); 3844 3845 __ BIND(sha1_loop); 3846 // load 64 bytes of data into v8..v11 3847 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3848 __ rev32(v8, __ T16B, v8); 3849 __ rev32(v9, __ T16B, v9); 3850 __ rev32(v10, __ T16B, v10); 3851 __ rev32(v11, __ T16B, v11); 3852 3853 __ addv(v6, __ T4S, v8, v16); 3854 __ orr(v2, __ T16B, v0, v0); 3855 __ orr(v3, __ T16B, v1, v1); 3856 3857 FloatRegister d0 = v8; 3858 FloatRegister d1 = v9; 3859 FloatRegister d2 = v10; 3860 FloatRegister d3 = v11; 3861 3862 3863 for (int round = 0; round < 16; round++) { 3864 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3865 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3866 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3867 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3868 3869 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3870 __ orr(v4, __ T16B, v2, v2); 3871 if (round < 15) 3872 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3873 __ sha256h(v2, __ T4S, v3, tmp2); 3874 __ sha256h2(v3, __ T4S, v4, tmp2); 3875 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3876 3877 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3878 } 3879 3880 __ addv(v0, __ T4S, v0, v2); 3881 __ addv(v1, __ T4S, v1, v3); 3882 3883 if (multi_block) { 3884 __ add(ofs, ofs, 64); 3885 __ cmp(ofs, limit); 3886 __ br(Assembler::LE, sha1_loop); 3887 __ mov(c_rarg0, ofs); // return ofs 3888 } 3889 3890 __ ldpd(v10, v11, Address(sp, 16)); 3891 __ ldpd(v8, v9, __ post(sp, 32)); 3892 3893 __ stpq(v0, v1, state); 3894 3895 __ ret(lr); 3896 3897 return start; 3898 } 3899 3900 // Double rounds for sha512. 3901 void sha512_dround(int dr, 3902 FloatRegister vi0, FloatRegister vi1, 3903 FloatRegister vi2, FloatRegister vi3, 3904 FloatRegister vi4, FloatRegister vrc0, 3905 FloatRegister vrc1, FloatRegister vin0, 3906 FloatRegister vin1, FloatRegister vin2, 3907 FloatRegister vin3, FloatRegister vin4) { 3908 if (dr < 36) { 3909 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3910 } 3911 __ addv(v5, __ T2D, vrc0, vin0); 3912 __ ext(v6, __ T16B, vi2, vi3, 8); 3913 __ ext(v5, __ T16B, v5, v5, 8); 3914 __ ext(v7, __ T16B, vi1, vi2, 8); 3915 __ addv(vi3, __ T2D, vi3, v5); 3916 if (dr < 32) { 3917 __ ext(v5, __ T16B, vin3, vin4, 8); 3918 __ sha512su0(vin0, __ T2D, vin1); 3919 } 3920 __ sha512h(vi3, __ T2D, v6, v7); 3921 if (dr < 32) { 3922 __ sha512su1(vin0, __ T2D, vin2, v5); 3923 } 3924 __ addv(vi4, __ T2D, vi1, vi3); 3925 __ sha512h2(vi3, __ T2D, vi1, vi0); 3926 } 3927 3928 // Arguments: 3929 // 3930 // Inputs: 3931 // c_rarg0 - byte[] source+offset 3932 // c_rarg1 - int[] SHA.state 3933 // c_rarg2 - int offset 3934 // c_rarg3 - int limit 3935 // 3936 address generate_sha512_implCompress(StubGenStubId stub_id) { 3937 bool multi_block; 3938 switch (stub_id) { 3939 case sha512_implCompress_id: 3940 multi_block = false; 3941 break; 3942 case sha512_implCompressMB_id: 3943 multi_block = true; 3944 break; 3945 default: 3946 ShouldNotReachHere(); 3947 } 3948 3949 static const uint64_t round_consts[80] = { 3950 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3951 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3952 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3953 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3954 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3955 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3956 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3957 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3958 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3959 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 
3960 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3961 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3962 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3963 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3964 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3965 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3966 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3967 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3968 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3969 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3970 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3971 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3972 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3973 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3974 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3975 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3976 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3977 }; 3978 3979 __ align(CodeEntryAlignment); 3980 3981 StubCodeMark mark(this, stub_id); 3982 address start = __ pc(); 3983 3984 Register buf = c_rarg0; 3985 Register state = c_rarg1; 3986 Register ofs = c_rarg2; 3987 Register limit = c_rarg3; 3988 3989 __ stpd(v8, v9, __ pre(sp, -64)); 3990 __ stpd(v10, v11, Address(sp, 16)); 3991 __ stpd(v12, v13, Address(sp, 32)); 3992 __ stpd(v14, v15, Address(sp, 48)); 3993 3994 Label sha512_loop; 3995 3996 // load state 3997 __ ld1(v8, v9, v10, v11, __ T2D, state); 3998 3999 // load first 4 round constants 4000 __ lea(rscratch1, ExternalAddress((address)round_consts)); 4001 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 4002 4003 __ BIND(sha512_loop); 4004 // load 128B of data into v12..v19 4005 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 4006 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 4007 __ rev64(v12, __ T16B, v12); 4008 __ rev64(v13, __ T16B, v13); 4009 __ rev64(v14, __ T16B, v14); 4010 __ rev64(v15, __ T16B, v15); 4011 __ rev64(v16, __ T16B, v16); 4012 __ rev64(v17, __ T16B, v17); 4013 __ rev64(v18, __ T16B, v18); 4014 __ rev64(v19, __ T16B, v19); 4015 4016 __ mov(rscratch2, rscratch1); 4017 4018 __ mov(v0, __ T16B, v8); 4019 __ mov(v1, __ T16B, v9); 4020 __ mov(v2, __ T16B, v10); 4021 __ mov(v3, __ T16B, v11); 4022 4023 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4024 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4025 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4026 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4027 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4028 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4029 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4030 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4031 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4032 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4033 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4034 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4035 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4036 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, 
v14); 4037 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4038 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4039 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4040 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4041 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4042 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4043 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4044 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4045 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4046 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4047 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4048 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4049 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4050 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4051 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4052 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4053 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4054 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4055 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 4056 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4057 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4058 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4059 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4060 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4061 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4062 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4063 4064 __ addv(v8, __ T2D, v8, v0); 4065 __ addv(v9, __ T2D, v9, v1); 4066 __ addv(v10, __ T2D, v10, v2); 4067 __ addv(v11, __ T2D, v11, v3); 4068 4069 if (multi_block) { 4070 __ add(ofs, ofs, 128); 4071 __ cmp(ofs, limit); 4072 __ br(Assembler::LE, sha512_loop); 4073 __ mov(c_rarg0, ofs); // return ofs 4074 } 4075 4076 __ st1(v8, v9, v10, v11, __ T2D, state); 4077 4078 __ ldpd(v14, v15, Address(sp, 48)); 4079 __ ldpd(v12, v13, Address(sp, 32)); 4080 __ ldpd(v10, v11, Address(sp, 16)); 4081 __ ldpd(v8, v9, __ post(sp, 64)); 4082 4083 __ ret(lr); 4084 4085 return start; 4086 } 4087 4088 // Execute one round of keccak of two computations in parallel. 4089 // One of the states should be loaded into the lower halves of 4090 // the vector registers v0-v24, the other should be loaded into 4091 // the upper halves of those registers. The ld1r instruction loads 4092 // the round constant into both halves of register v31. 4093 // Intermediate results c0...c5 and d0...d5 are computed 4094 // in registers v25...v30. 4095 // All vector instructions that are used operate on both register 4096 // halves in parallel. 4097 // If only a single computation is needed, one can only load the lower halves. 
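  // For reference (illustrative only, nothing here is emitted by the stub),
  // the ARMv8.2-SHA3 instructions used below compute, per 64-bit lane:
  //   eor3(d, n, m, a) : d = n ^ m ^ a
  //   rax1(d, n, m)    : d = n ^ rol64(m, 1)
  //   xar(d, n, m, s)  : d = ror64(n ^ m, s)   (a left-rotate by r is written as s = 64 - r)
  //   bcax(d, n, m, a) : d = n ^ (m & ~a)
  // where rol64/ror64 are the usual 64-bit rotates, e.g.
  static uint64_t keccak_rol64_ref(uint64_t v, int r) { // reference helper; assumes 0 < r < 64
    return (v << r) | (v >> (64 - r));
  }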
4098 void keccak_round(Register rscratch1) { 4099 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14 4100 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a16 ^ a11 4101 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^a13 4102 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10 4103 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12 4104 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24 4105 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21 4106 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23 4107 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20 4108 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22 4109 4110 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1) 4111 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1) 4112 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1) 4113 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1) 4114 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1) 4115 4116 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0 4117 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1) 4118 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol(a6^d1), 44) 4119 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20) 4120 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61) 4121 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39) 4122 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18) 4123 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62) 4124 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43) 4125 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25) 4126 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8) 4127 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56) 4128 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41) 4129 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27) 4130 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14) 4131 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2) 4132 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55) 4133 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45) 4134 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36) 4135 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28) 4136 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21) 4137 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15) 4138 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10) 4139 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6) 4140 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3) 4141 4142 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22') 4143 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23) 4144 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24) 4145 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20') 4146 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21') 4147 4148 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i] 4149 4150 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19) 4151 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15') 4152 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16) 4153 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17') 4154 __ bcax(v16, __ T16B, v16, v3, v25); 
// a16 = a16 ^ (~a17' & a18') 4155 4156 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4157 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4158 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4159 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4160 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4161 4162 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4163 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4164 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4165 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4166 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4167 4168 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4169 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4170 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4171 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4172 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4173 4174 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4175 } 4176 4177 // Arguments: 4178 // 4179 // Inputs: 4180 // c_rarg0 - byte[] source+offset 4181 // c_rarg1 - byte[] SHA.state 4182 // c_rarg2 - int block_size 4183 // c_rarg3 - int offset 4184 // c_rarg4 - int limit 4185 // 4186 address generate_sha3_implCompress(StubGenStubId stub_id) { 4187 bool multi_block; 4188 switch (stub_id) { 4189 case sha3_implCompress_id: 4190 multi_block = false; 4191 break; 4192 case sha3_implCompressMB_id: 4193 multi_block = true; 4194 break; 4195 default: 4196 ShouldNotReachHere(); 4197 } 4198 4199 static const uint64_t round_consts[24] = { 4200 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4201 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4202 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4203 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4204 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4205 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4206 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4207 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4208 }; 4209 4210 __ align(CodeEntryAlignment); 4211 4212 StubCodeMark mark(this, stub_id); 4213 address start = __ pc(); 4214 4215 Register buf = c_rarg0; 4216 Register state = c_rarg1; 4217 Register block_size = c_rarg2; 4218 Register ofs = c_rarg3; 4219 Register limit = c_rarg4; 4220 4221 Label sha3_loop, rounds24_loop; 4222 Label sha3_512_or_sha3_384, shake128; 4223 4224 __ stpd(v8, v9, __ pre(sp, -64)); 4225 __ stpd(v10, v11, Address(sp, 16)); 4226 __ stpd(v12, v13, Address(sp, 32)); 4227 __ stpd(v14, v15, Address(sp, 48)); 4228 4229 // load state 4230 __ add(rscratch1, state, 32); 4231 __ ld1(v0, v1, v2, v3, __ T1D, state); 4232 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4233 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4234 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4235 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4236 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4237 __ ld1(v24, __ T1D, rscratch1); 4238 4239 __ BIND(sha3_loop); 4240 4241 // 24 keccak rounds 4242 __ movw(rscratch2, 24); 4243 4244 // load round_constants base 4245 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4246 4247 // load input 4248 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4249 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 
24)); 4250 __ eor(v0, __ T8B, v0, v25); 4251 __ eor(v1, __ T8B, v1, v26); 4252 __ eor(v2, __ T8B, v2, v27); 4253 __ eor(v3, __ T8B, v3, v28); 4254 __ eor(v4, __ T8B, v4, v29); 4255 __ eor(v5, __ T8B, v5, v30); 4256 __ eor(v6, __ T8B, v6, v31); 4257 4258 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4259 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4260 4261 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4262 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4263 __ eor(v7, __ T8B, v7, v25); 4264 __ eor(v8, __ T8B, v8, v26); 4265 __ eor(v9, __ T8B, v9, v27); 4266 __ eor(v10, __ T8B, v10, v28); 4267 __ eor(v11, __ T8B, v11, v29); 4268 __ eor(v12, __ T8B, v12, v30); 4269 __ eor(v13, __ T8B, v13, v31); 4270 4271 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4272 __ eor(v14, __ T8B, v14, v25); 4273 __ eor(v15, __ T8B, v15, v26); 4274 __ eor(v16, __ T8B, v16, v27); 4275 4276 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4277 __ andw(c_rarg5, block_size, 48); 4278 __ cbzw(c_rarg5, rounds24_loop); 4279 4280 __ tbnz(block_size, 5, shake128); 4281 // block_size == 144, bit5 == 0, SHA3-224 4282 __ ldrd(v28, __ post(buf, 8)); 4283 __ eor(v17, __ T8B, v17, v28); 4284 __ b(rounds24_loop); 4285 4286 __ BIND(shake128); 4287 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4288 __ eor(v17, __ T8B, v17, v28); 4289 __ eor(v18, __ T8B, v18, v29); 4290 __ eor(v19, __ T8B, v19, v30); 4291 __ eor(v20, __ T8B, v20, v31); 4292 __ b(rounds24_loop); // block_size == 168, SHAKE128 4293 4294 __ BIND(sha3_512_or_sha3_384); 4295 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4296 __ eor(v7, __ T8B, v7, v25); 4297 __ eor(v8, __ T8B, v8, v26); 4298 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4299 4300 // SHA3-384 4301 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4302 __ eor(v9, __ T8B, v9, v27); 4303 __ eor(v10, __ T8B, v10, v28); 4304 __ eor(v11, __ T8B, v11, v29); 4305 __ eor(v12, __ T8B, v12, v30); 4306 4307 __ BIND(rounds24_loop); 4308 __ subw(rscratch2, rscratch2, 1); 4309 4310 keccak_round(rscratch1); 4311 4312 __ cbnzw(rscratch2, rounds24_loop); 4313 4314 if (multi_block) { 4315 __ add(ofs, ofs, block_size); 4316 __ cmp(ofs, limit); 4317 __ br(Assembler::LE, sha3_loop); 4318 __ mov(c_rarg0, ofs); // return ofs 4319 } 4320 4321 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4322 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4323 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4324 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4325 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4326 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4327 __ st1(v24, __ T1D, state); 4328 4329 // restore callee-saved registers 4330 __ ldpd(v14, v15, Address(sp, 48)); 4331 __ ldpd(v12, v13, Address(sp, 32)); 4332 __ ldpd(v10, v11, Address(sp, 16)); 4333 __ ldpd(v8, v9, __ post(sp, 64)); 4334 4335 __ ret(lr); 4336 4337 return start; 4338 } 4339 4340 // Inputs: 4341 // c_rarg0 - long[] state0 4342 // c_rarg1 - long[] state1 4343 address generate_double_keccak() { 4344 static const uint64_t round_consts[24] = { 4345 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4346 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4347 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4348 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4349 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4350 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4351 0x000000000000800AL, 
0x800000008000000AL, 0x8000000080008081L, 4352 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4353 }; 4354 4355 // Implements the double_keccak() method of the 4356 // sun.secyrity.provider.SHA3Parallel class 4357 __ align(CodeEntryAlignment); 4358 StubCodeMark mark(this, "StubRoutines", "double_keccak"); 4359 address start = __ pc(); 4360 __ enter(); 4361 4362 Register state0 = c_rarg0; 4363 Register state1 = c_rarg1; 4364 4365 Label rounds24_loop; 4366 4367 // save callee-saved registers 4368 __ stpd(v8, v9, __ pre(sp, -64)); 4369 __ stpd(v10, v11, Address(sp, 16)); 4370 __ stpd(v12, v13, Address(sp, 32)); 4371 __ stpd(v14, v15, Address(sp, 48)); 4372 4373 // load states 4374 __ add(rscratch1, state0, 32); 4375 __ ld4(v0, v1, v2, v3, __ D, 0, state0); 4376 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32)); 4377 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32)); 4378 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32)); 4379 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32)); 4380 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32)); 4381 __ ld1(v24, __ D, 0, rscratch1); 4382 __ add(rscratch1, state1, 32); 4383 __ ld4(v0, v1, v2, v3, __ D, 1, state1); 4384 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32)); 4385 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32)); 4386 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32)); 4387 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32)); 4388 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32)); 4389 __ ld1(v24, __ D, 1, rscratch1); 4390 4391 // 24 keccak rounds 4392 __ movw(rscratch2, 24); 4393 4394 // load round_constants base 4395 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4396 4397 __ BIND(rounds24_loop); 4398 __ subw(rscratch2, rscratch2, 1); 4399 keccak_round(rscratch1); 4400 __ cbnzw(rscratch2, rounds24_loop); 4401 4402 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32)); 4403 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32)); 4404 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32)); 4405 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32)); 4406 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32)); 4407 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32)); 4408 __ st1(v24, __ D, 0, state0); 4409 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32)); 4410 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32)); 4411 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32)); 4412 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32)); 4413 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32)); 4414 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32)); 4415 __ st1(v24, __ D, 1, state1); 4416 4417 // restore callee-saved vector registers 4418 __ ldpd(v14, v15, Address(sp, 48)); 4419 __ ldpd(v12, v13, Address(sp, 32)); 4420 __ ldpd(v10, v11, Address(sp, 16)); 4421 __ ldpd(v8, v9, __ post(sp, 64)); 4422 4423 __ leave(); // required for proper stackwalking of RuntimeStub frame 4424 __ mov(r0, zr); // return 0 4425 __ ret(lr); 4426 4427 return start; 4428 } 4429 4430 /** 4431 * Arguments: 4432 * 4433 * Inputs: 4434 * c_rarg0 - int crc 4435 * c_rarg1 - byte* buf 4436 * c_rarg2 - int length 4437 * 4438 * Output: 4439 * rax - int crc result 4440 */ 4441 address generate_updateBytesCRC32() { 4442 assert(UseCRC32Intrinsics, "what are we doing here?"); 4443 4444 __ align(CodeEntryAlignment); 4445 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id; 4446 StubCodeMark mark(this, stub_id); 4447 4448 address start = __ pc(); 4449 
4450 const Register crc = c_rarg0; // crc 4451 const Register buf = c_rarg1; // source java byte array address 4452 const Register len = c_rarg2; // length 4453 const Register table0 = c_rarg3; // crc_table address 4454 const Register table1 = c_rarg4; 4455 const Register table2 = c_rarg5; 4456 const Register table3 = c_rarg6; 4457 const Register tmp3 = c_rarg7; 4458 4459 BLOCK_COMMENT("Entry:"); 4460 __ enter(); // required for proper stackwalking of RuntimeStub frame 4461 4462 __ kernel_crc32(crc, buf, len, 4463 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4464 4465 __ leave(); // required for proper stackwalking of RuntimeStub frame 4466 __ ret(lr); 4467 4468 return start; 4469 } 4470 4471 // ChaCha20 block function. This version parallelizes 4 quarter 4472 // round operations at a time. It uses 16 SIMD registers to 4473 // produce 4 blocks of key stream. 4474 // 4475 // state (int[16]) = c_rarg0 4476 // keystream (byte[256]) = c_rarg1 4477 // return - number of bytes of keystream (always 256) 4478 // 4479 // In this approach, we load the 512-bit start state sequentially into 4480 // 4 128-bit vectors. We then make 4 4-vector copies of that starting 4481 // state, with each successive set of 4 vectors having a +1 added into 4482 // the first 32-bit lane of the 4th vector in that group (the counter). 4483 // By doing this, we can perform the block function on 4 512-bit blocks 4484 // within one run of this intrinsic. 4485 // The alignment of the data across the 4-vector group is such that at 4486 // the start it is already aligned for the first round of each two-round 4487 // loop iteration. In other words, the corresponding lanes of each vector 4488 // will contain the values needed for that quarter round operation (e.g. 4489 // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.). 4490 // In between each full round, a lane shift must occur. Within a loop 4491 // iteration, between the first and second rounds, the 2nd, 3rd, and 4th 4492 // vectors are rotated left 32, 64 and 96 bits, respectively. The result 4493 // is effectively a diagonal orientation in columnar form. After the 4494 // second full round, those registers are left-rotated again, this time 4495 // 96, 64, and 32 bits - returning the vectors to their columnar organization. 4496 // After all 10 iterations, the original state is added to each 4-vector 4497 // working state along with the add mask, and the 4 vector groups are 4498 // sequentially written to the memory dedicated for the output key stream. 4499 // 4500 // For a more detailed explanation, see Goll and Gueron, "Vectorization of 4501 // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology: 4502 // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33 4503 address generate_chacha20Block_qrpar() { 4504 Label L_Q_twoRounds, L_Q_cc20_const; 4505 // The constant data is broken into two 128-bit segments to be loaded 4506 // onto SIMD registers. The first 128 bits are a counter add overlay 4507 // that adds +1/+0/+0/+0 to the vectors holding replicated state[12]. 4508 // The second 128-bits is a table constant used for 8-bit left rotations. 4509 // on 32-bit lanes within a SIMD register. 
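    // For reference (comment-only sketch, not part of the generated code),
    // each cc20_quarter_round(a, b, c, d, ...) used below applies the
    // standard ChaCha20 quarter round to the corresponding 32-bit lanes:
    //   a += b; d ^= a; d = rol32(d, 16);
    //   c += d; b ^= c; b = rol32(b, 12);
    //   a += b; d ^= a; d = rol32(d, 8);   // this rotation uses the lrot8Tbl constant
    //   c += d; b ^= c; b = rol32(b, 7);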
4510 __ BIND(L_Q_cc20_const); 4511 __ emit_int64(0x0000000000000001UL); 4512 __ emit_int64(0x0000000000000000UL); 4513 __ emit_int64(0x0605040702010003UL); 4514 __ emit_int64(0x0E0D0C0F0A09080BUL); 4515 4516 __ align(CodeEntryAlignment); 4517 StubGenStubId stub_id = StubGenStubId::chacha20Block_id; 4518 StubCodeMark mark(this, stub_id); 4519 address start = __ pc(); 4520 __ enter(); 4521 4522 const Register state = c_rarg0; 4523 const Register keystream = c_rarg1; 4524 const Register loopCtr = r10; 4525 const Register tmpAddr = r11; 4526 4527 const FloatRegister aState = v0; 4528 const FloatRegister bState = v1; 4529 const FloatRegister cState = v2; 4530 const FloatRegister dState = v3; 4531 const FloatRegister a1Vec = v4; 4532 const FloatRegister b1Vec = v5; 4533 const FloatRegister c1Vec = v6; 4534 const FloatRegister d1Vec = v7; 4535 // Skip the callee-saved registers v8 - v15 4536 const FloatRegister a2Vec = v16; 4537 const FloatRegister b2Vec = v17; 4538 const FloatRegister c2Vec = v18; 4539 const FloatRegister d2Vec = v19; 4540 const FloatRegister a3Vec = v20; 4541 const FloatRegister b3Vec = v21; 4542 const FloatRegister c3Vec = v22; 4543 const FloatRegister d3Vec = v23; 4544 const FloatRegister a4Vec = v24; 4545 const FloatRegister b4Vec = v25; 4546 const FloatRegister c4Vec = v26; 4547 const FloatRegister d4Vec = v27; 4548 const FloatRegister scratch = v28; 4549 const FloatRegister addMask = v29; 4550 const FloatRegister lrot8Tbl = v30; 4551 4552 // Load the initial state in the first 4 quadword registers, 4553 // then copy the initial state into the next 4 quadword registers 4554 // that will be used for the working state. 4555 __ ld1(aState, bState, cState, dState, __ T16B, Address(state)); 4556 4557 // Load the index register for 2 constant 128-bit data fields. 4558 // The first represents the +1/+0/+0/+0 add mask. The second is 4559 // the 8-bit left rotation. 4560 __ adr(tmpAddr, L_Q_cc20_const); 4561 __ ldpq(addMask, lrot8Tbl, Address(tmpAddr)); 4562 4563 __ mov(a1Vec, __ T16B, aState); 4564 __ mov(b1Vec, __ T16B, bState); 4565 __ mov(c1Vec, __ T16B, cState); 4566 __ mov(d1Vec, __ T16B, dState); 4567 4568 __ mov(a2Vec, __ T16B, aState); 4569 __ mov(b2Vec, __ T16B, bState); 4570 __ mov(c2Vec, __ T16B, cState); 4571 __ addv(d2Vec, __ T4S, d1Vec, addMask); 4572 4573 __ mov(a3Vec, __ T16B, aState); 4574 __ mov(b3Vec, __ T16B, bState); 4575 __ mov(c3Vec, __ T16B, cState); 4576 __ addv(d3Vec, __ T4S, d2Vec, addMask); 4577 4578 __ mov(a4Vec, __ T16B, aState); 4579 __ mov(b4Vec, __ T16B, bState); 4580 __ mov(c4Vec, __ T16B, cState); 4581 __ addv(d4Vec, __ T4S, d3Vec, addMask); 4582 4583 // Set up the 10 iteration loop 4584 __ mov(loopCtr, 10); 4585 __ BIND(L_Q_twoRounds); 4586 4587 // The first set of operations on the vectors covers the first 4 quarter 4588 // round operations: 4589 // Qround(state, 0, 4, 8,12) 4590 // Qround(state, 1, 5, 9,13) 4591 // Qround(state, 2, 6,10,14) 4592 // Qround(state, 3, 7,11,15) 4593 __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl); 4594 __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl); 4595 __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl); 4596 __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl); 4597 4598 // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to 4599 // diagonals. The a1Vec does not need to change orientation. 
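    // Reference picture (comment only): with the state viewed as a 4x4 matrix
    // of 32-bit words, the b/c/d rows are rotated by one, two and three lanes
    // respectively (the 32/64/96-bit left rotations described above), lining
    // the columns up as diagonals for the next four quarter rounds.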
4600 __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true); 4601 __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true); 4602 __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true); 4603 __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true); 4604 4605 // The second set of operations on the vectors covers the second 4 quarter 4606 // round operations, now acting on the diagonals: 4607 // Qround(state, 0, 5,10,15) 4608 // Qround(state, 1, 6,11,12) 4609 // Qround(state, 2, 7, 8,13) 4610 // Qround(state, 3, 4, 9,14) 4611 __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl); 4612 __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl); 4613 __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl); 4614 __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl); 4615 4616 // Before we start the next iteration, we need to perform shuffles 4617 // on the b/c/d vectors to move them back to columnar organizations 4618 // from their current diagonal orientation. 4619 __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false); 4620 __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false); 4621 __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false); 4622 __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false); 4623 4624 // Decrement and iterate 4625 __ sub(loopCtr, loopCtr, 1); 4626 __ cbnz(loopCtr, L_Q_twoRounds); 4627 4628 // Once the counter reaches zero, we fall out of the loop 4629 // and need to add the initial state back into the working state 4630 // represented by the a/b/c/d1Vec registers. This is destructive 4631 // on the dState register but we no longer will need it. 4632 __ addv(a1Vec, __ T4S, a1Vec, aState); 4633 __ addv(b1Vec, __ T4S, b1Vec, bState); 4634 __ addv(c1Vec, __ T4S, c1Vec, cState); 4635 __ addv(d1Vec, __ T4S, d1Vec, dState); 4636 4637 __ addv(a2Vec, __ T4S, a2Vec, aState); 4638 __ addv(b2Vec, __ T4S, b2Vec, bState); 4639 __ addv(c2Vec, __ T4S, c2Vec, cState); 4640 __ addv(dState, __ T4S, dState, addMask); 4641 __ addv(d2Vec, __ T4S, d2Vec, dState); 4642 4643 __ addv(a3Vec, __ T4S, a3Vec, aState); 4644 __ addv(b3Vec, __ T4S, b3Vec, bState); 4645 __ addv(c3Vec, __ T4S, c3Vec, cState); 4646 __ addv(dState, __ T4S, dState, addMask); 4647 __ addv(d3Vec, __ T4S, d3Vec, dState); 4648 4649 __ addv(a4Vec, __ T4S, a4Vec, aState); 4650 __ addv(b4Vec, __ T4S, b4Vec, bState); 4651 __ addv(c4Vec, __ T4S, c4Vec, cState); 4652 __ addv(dState, __ T4S, dState, addMask); 4653 __ addv(d4Vec, __ T4S, d4Vec, dState); 4654 4655 // Write the final state back to the result buffer 4656 __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64)); 4657 __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64)); 4658 __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64)); 4659 __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64)); 4660 4661 __ mov(r0, 256); // Return length of output keystream 4662 __ leave(); 4663 __ ret(lr); 4664 4665 return start; 4666 } 4667 4668 void dilithium_load16zetas(int o0, Register zetas) { 4669 __ ldpq(as_FloatRegister(o0), as_FloatRegister(o0 + 1), __ post (zetas, 32)); 4670 __ ldpq(as_FloatRegister(o0 + 2), as_FloatRegister(o0 + 3), __ post (zetas, 32)); 4671 4672 } 4673 4674 void dilithium_load32zetas(Register zetas) { 4675 dilithium_load16zetas(16, zetas); 4676 dilithium_load16zetas(20, zetas); 4677 } 4678 4679 // 2x16 32-bit Montgomery multiplications in parallel 4680 // See the montMul() method of the sun.security.provider.ML_DSA class. 4681 // Here MONT_R_BITS is 32, so the right shift by it is implicit. 
4682 // The constants qInv = MONT_Q_INV_MOD_R and q = MONT_Q are loaded in 4683 // (all 32-bit chunks of) vector registers v30 and v31, resp. 4684 // The inputs are b[i]s in v0-v7 and c[i]s v16-v23 and 4685 // the results are a[i]s in v16-v23, four 32-bit values in each register 4686 // and we do a_i = b_i * c_i * 2^-32 mod MONT_Q for all 4687 void dilithium_montmul32(bool by_constant) { 4688 FloatRegister vr0 = by_constant ? v29 : v0; 4689 FloatRegister vr1 = by_constant ? v29 : v1; 4690 FloatRegister vr2 = by_constant ? v29 : v2; 4691 FloatRegister vr3 = by_constant ? v29 : v3; 4692 FloatRegister vr4 = by_constant ? v29 : v4; 4693 FloatRegister vr5 = by_constant ? v29 : v5; 4694 FloatRegister vr6 = by_constant ? v29 : v6; 4695 FloatRegister vr7 = by_constant ? v29 : v7; 4696 4697 __ sqdmulh(v24, __ T4S, vr0, v16); // aHigh = hi32(2 * b * c) 4698 __ mulv(v16, __ T4S, vr0, v16); // aLow = lo32(b * c) 4699 __ sqdmulh(v25, __ T4S, vr1, v17); 4700 __ mulv(v17, __ T4S, vr1, v17); 4701 __ sqdmulh(v26, __ T4S, vr2, v18); 4702 __ mulv(v18, __ T4S, vr2, v18); 4703 __ sqdmulh(v27, __ T4S, vr3, v19); 4704 __ mulv(v19, __ T4S, vr3, v19); 4705 4706 __ mulv(v16, __ T4S, v16, v30); // m = aLow * qinv 4707 __ mulv(v17, __ T4S, v17, v30); 4708 __ mulv(v18, __ T4S, v18, v30); 4709 __ mulv(v19, __ T4S, v19, v30); 4710 4711 __ sqdmulh(v16, __ T4S, v16, v31); // n = hi32(2 * m * q) 4712 __ sqdmulh(v17, __ T4S, v17, v31); 4713 __ sqdmulh(v18, __ T4S, v18, v31); 4714 __ sqdmulh(v19, __ T4S, v19, v31); 4715 4716 __ shsubv(v16, __ T4S, v24, v16); // a = (aHigh - n) / 2 4717 __ shsubv(v17, __ T4S, v25, v17); 4718 __ shsubv(v18, __ T4S, v26, v18); 4719 __ shsubv(v19, __ T4S, v27, v19); 4720 4721 __ sqdmulh(v24, __ T4S, vr4, v20); 4722 __ mulv(v20, __ T4S, vr4, v20); 4723 __ sqdmulh(v25, __ T4S, vr5, v21); 4724 __ mulv(v21, __ T4S, vr5, v21); 4725 __ sqdmulh(v26, __ T4S, vr6, v22); 4726 __ mulv(v22, __ T4S, vr6, v22); 4727 __ sqdmulh(v27, __ T4S, vr7, v23); 4728 __ mulv(v23, __ T4S, vr7, v23); 4729 4730 __ mulv(v20, __ T4S, v20, v30); 4731 __ mulv(v21, __ T4S, v21, v30); 4732 __ mulv(v22, __ T4S, v22, v30); 4733 __ mulv(v23, __ T4S, v23, v30); 4734 4735 __ sqdmulh(v20, __ T4S, v20, v31); 4736 __ sqdmulh(v21, __ T4S, v21, v31); 4737 __ sqdmulh(v22, __ T4S, v22, v31); 4738 __ sqdmulh(v23, __ T4S, v23, v31); 4739 4740 __ shsubv(v20, __ T4S, v24, v20); 4741 __ shsubv(v21, __ T4S, v25, v21); 4742 __ shsubv(v22, __ T4S, v26, v22); 4743 __ shsubv(v23, __ T4S, v27, v23); 4744 } 4745 4746 // Do the addition and subtraction done in the ntt algorithm. 4747 // See sun.security.provider.ML_DSA.implDilithiumAlmostNttJava() 4748 void dilithium_add_sub32() { 4749 __ addv(v24, __ T4S, v0, v16); // coeffs[j] = coeffs[j] + tmp; 4750 __ addv(v25, __ T4S, v1, v17); 4751 __ addv(v26, __ T4S, v2, v18); 4752 __ addv(v27, __ T4S, v3, v19); 4753 __ addv(v28, __ T4S, v4, v20); 4754 __ addv(v29, __ T4S, v5, v21); 4755 __ addv(v30, __ T4S, v6, v22); 4756 __ addv(v31, __ T4S, v7, v23); 4757 4758 __ subv(v0, __ T4S, v0, v16); // coeffs[j + l] = coeffs[j] - tmp; 4759 __ subv(v1, __ T4S, v1, v17); 4760 __ subv(v2, __ T4S, v2, v18); 4761 __ subv(v3, __ T4S, v3, v19); 4762 __ subv(v4, __ T4S, v4, v20); 4763 __ subv(v5, __ T4S, v5, v21); 4764 __ subv(v6, __ T4S, v6, v22); 4765 __ subv(v7, __ T4S, v7, v23); 4766 } 4767 4768 // Do the same computation that 4769 // dilithium_montmul32() and dilithium_add_sub32() does, 4770 // except for only 4x4 32-bit vector elements and with 4771 // different register usage. 
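  // For reference (an illustrative sketch, not used by the generated code),
  // one scalar Montgomery product as computed by the sqdmulh/mulv/shsubv
  // sequences in dilithium_montmul32() above and in the helper below, with
  // q and qinv being the values loaded into v31 and v30:
  static int32_t dilithium_montmul_ref(int32_t b, int32_t c, int32_t q, int32_t qinv) {
    int64_t prod = (int64_t) b * c;                            // full 64-bit product
    int32_t m = (int32_t) ((uint32_t) prod * (uint32_t) qinv); // m = lo32(b * c) * qinv (mod 2^32)
    return (int32_t) ((prod - (int64_t) m * q) >> 32);         // == b * c * 2^-32 (mod q)
  }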
4772 void dilithium_montmul_sub_add16() { 4773 __ sqdmulh(v24, __ T4S, v1, v16); 4774 __ mulv(v16, __ T4S, v1, v16); 4775 __ sqdmulh(v25, __ T4S, v3, v17); 4776 __ mulv(v17, __ T4S, v3, v17); 4777 __ sqdmulh(v26, __ T4S, v5, v18); 4778 __ mulv(v18, __ T4S, v5, v18); 4779 __ sqdmulh(v27, __ T4S, v7, v19); 4780 __ mulv(v19, __ T4S, v7, v19); 4781 4782 __ mulv(v16, __ T4S, v16, v30); 4783 __ mulv(v17, __ T4S, v17, v30); 4784 __ mulv(v18, __ T4S, v18, v30); 4785 __ mulv(v19, __ T4S, v19, v30); 4786 4787 __ sqdmulh(v16, __ T4S, v16, v31); 4788 __ sqdmulh(v17, __ T4S, v17, v31); 4789 __ sqdmulh(v18, __ T4S, v18, v31); 4790 __ sqdmulh(v19, __ T4S, v19, v31); 4791 4792 __ shsubv(v16, __ T4S, v24, v16); 4793 __ shsubv(v17, __ T4S, v25, v17); 4794 __ shsubv(v18, __ T4S, v26, v18); 4795 __ shsubv(v19, __ T4S, v27, v19); 4796 4797 __ subv(v1, __ T4S, v0, v16); 4798 __ subv(v3, __ T4S, v2, v17); 4799 __ subv(v5, __ T4S, v4, v18); 4800 __ subv(v7, __ T4S, v6, v19); 4801 4802 __ addv(v0, __ T4S, v0, v16); 4803 __ addv(v2, __ T4S, v2, v17); 4804 __ addv(v4, __ T4S, v4, v18); 4805 __ addv(v6, __ T4S, v6, v19); 4806 } 4807 4808 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 4809 // in the Java implementation come in sequences of at least 8, so we 4810 // can use ldpq to collect the corresponding data into pairs of vector 4811 // registers. 4812 // We collect the coefficients corresponding to the 'j+l' indexes into 4813 // the vector registers v0-v7, the zetas into the vector registers v16-v23 4814 // then we do the (Montgomery) multiplications by the zetas in parallel 4815 // into v16-v23, load the coeffs corresponding to the 'j' indexes into 4816 // v0-v7, then do the additions into v24-v31 and the subtractions into 4817 // v0-v7 and finally save the results back to the coeffs array. 
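  // In scalar terms (reference only, mirroring the Java loop), each butterfly
  // performed here is:
  //   int tmp = montMul(zetas[k], coeffs[j + l]);
  //   coeffs[j + l] = coeffs[j] - tmp;
  //   coeffs[j]     = coeffs[j] + tmp;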
4818 void dilithiumNttLevel0_4(const Register dilithiumConsts, 4819 const Register coeffs, const Register zetas) { 4820 int c1 = 0; 4821 int c2 = 512; 4822 int startIncr; 4823 int incr1 = 32; 4824 int incr2 = 64; 4825 int incr3 = 96; 4826 4827 for (int level = 0; level < 5; level++) { 4828 int c1Start = c1; 4829 int c2Start = c2; 4830 if (level == 3) { 4831 incr1 = 32; 4832 incr2 = 128; 4833 incr3 = 160; 4834 } else if (level == 4) { 4835 incr1 = 64; 4836 incr2 = 128; 4837 incr3 = 192; 4838 } 4839 4840 for (int i = 0; i < 4; i++) { 4841 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 4842 __ ldpq(v0, v1, Address(coeffs, c2Start)); 4843 __ ldpq(v2, v3, Address(coeffs, c2Start + incr1)); 4844 __ ldpq(v4, v5, Address(coeffs, c2Start + incr2)); 4845 __ ldpq(v6, v7, Address(coeffs, c2Start + incr3)); 4846 dilithium_load32zetas(zetas); 4847 dilithium_montmul32(false); 4848 __ ldpq(v0, v1, Address(coeffs, c1Start)); 4849 __ ldpq(v2, v3, Address(coeffs, c1Start + incr1)); 4850 __ ldpq(v4, v5, Address(coeffs, c1Start + incr2)); 4851 __ ldpq(v6, v7, Address(coeffs, c1Start + incr3)); 4852 dilithium_add_sub32(); 4853 __ stpq(v24, v25, Address(coeffs, c1Start)); 4854 __ stpq(v26, v27, Address(coeffs, c1Start + incr1)); 4855 __ stpq(v28, v29, Address(coeffs, c1Start + incr2)); 4856 __ stpq(v30, v31, Address(coeffs, c1Start + incr3)); 4857 __ stpq(v0, v1, Address(coeffs, c2Start)); 4858 __ stpq(v2, v3, Address(coeffs, c2Start + incr1)); 4859 __ stpq(v4, v5, Address(coeffs, c2Start + incr2)); 4860 __ stpq(v6, v7, Address(coeffs, c2Start + incr3)); 4861 4862 int k = 4 * level + i; 4863 4864 if (k > 7) { 4865 startIncr = 256; 4866 } else if (k == 5) { 4867 startIncr = 384; 4868 } else { 4869 startIncr = 128; 4870 } 4871 4872 c1Start += startIncr; 4873 c2Start += startIncr; 4874 } 4875 4876 c2 /= 2; 4877 } 4878 } 4879 4880 // Dilithium NTT function except for the final "normalization" to |coeff| < Q. 
4881 // Implements the method 4882 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {} 4883 // of the Java class sun.security.provider 4884 // 4885 // coeffs (int[256]) = c_rarg0 4886 // zetas (int[256]) = c_rarg1 4887 address generate_dilithiumAlmostNtt() { 4888 4889 __ align(CodeEntryAlignment); 4890 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id; 4891 StubCodeMark mark(this, stub_id); 4892 address start = __ pc(); 4893 __ enter(); 4894 4895 const Register coeffs = c_rarg0; 4896 const Register zetas = c_rarg1; 4897 4898 const Register tmpAddr = r9; 4899 const Register dilithiumConsts = r10; 4900 const Register result = r11; 4901 4902 __ add(result, coeffs, 0); 4903 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 4904 4905 // Each level represents one iteration of the outer for loop of the Java version 4906 4907 // level 0-4 4908 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas); 4909 4910 // level 5 4911 for (int i = 0; i < 1024; i += 256) { 4912 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 4913 __ ldr(v0, __ Q, Address(coeffs, i + 16)); 4914 __ ldr(v1, __ Q, Address(coeffs, i + 48)); 4915 __ ldr(v2, __ Q, Address(coeffs, i + 80)); 4916 __ ldr(v3, __ Q, Address(coeffs, i + 112)); 4917 __ ldr(v4, __ Q, Address(coeffs, i + 144)); 4918 __ ldr(v5, __ Q, Address(coeffs, i + 176)); 4919 __ ldr(v6, __ Q, Address(coeffs, i + 208)); 4920 __ ldr(v7, __ Q, Address(coeffs, i + 240)); 4921 dilithium_load32zetas(zetas); 4922 dilithium_montmul32(false); 4923 __ ldr(v0, __ Q, Address(coeffs, i)); 4924 __ ldr(v1, __ Q, Address(coeffs, i + 32)); 4925 __ ldr(v2, __ Q, Address(coeffs, i + 64)); 4926 __ ldr(v3, __ Q, Address(coeffs, i + 96)); 4927 __ ldr(v4, __ Q, Address(coeffs, i + 128)); 4928 __ ldr(v5, __ Q, Address(coeffs, i + 160)); 4929 __ ldr(v6, __ Q, Address(coeffs, i + 192)); 4930 __ ldr(v7, __ Q, Address(coeffs, i + 224)); 4931 dilithium_add_sub32(); 4932 __ str(v24, __ Q, Address(coeffs, i)); 4933 __ str(v25, __ Q, Address(coeffs, i + 32)); 4934 __ str(v26, __ Q, Address(coeffs, i + 64)); 4935 __ str(v27, __ Q, Address(coeffs, i + 96)); 4936 __ str(v28, __ Q, Address(coeffs, i + 128)); 4937 __ str(v29, __ Q, Address(coeffs, i + 160)); 4938 __ str(v30, __ Q, Address(coeffs, i + 192)); 4939 __ str(v31, __ Q, Address(coeffs, i + 224)); 4940 __ str(v0, __ Q, Address(coeffs, i + 16)); 4941 __ str(v1, __ Q, Address(coeffs, i + 48)); 4942 __ str(v2, __ Q, Address(coeffs, i + 80)); 4943 __ str(v3, __ Q, Address(coeffs, i + 112)); 4944 __ str(v4, __ Q, Address(coeffs, i + 144)); 4945 __ str(v5, __ Q, Address(coeffs, i + 176)); 4946 __ str(v6, __ Q, Address(coeffs, i + 208)); 4947 __ str(v7, __ Q, Address(coeffs, i + 240)); 4948 } 4949 4950 // level 6 4951 for (int i = 0; i < 1024; i += 128) { 4952 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 4953 __ add(tmpAddr, coeffs, i); 4954 __ ld2(v0, v1, __ T2D, tmpAddr); 4955 __ add(tmpAddr, coeffs, i + 32); 4956 __ ld2(v2, v3, __ T2D, tmpAddr); 4957 __ add(tmpAddr, coeffs, i + 64); 4958 __ ld2(v4, v5, __ T2D, tmpAddr); 4959 __ add(tmpAddr, coeffs, i + 96); 4960 __ ld2(v6, v7, __ T2D, tmpAddr); 4961 dilithium_load16zetas(16, zetas); 4962 dilithium_montmul_sub_add16(); 4963 __ add(tmpAddr, coeffs, i); 4964 __ st2(v0, v1, __ T2D, tmpAddr); 4965 __ add(tmpAddr, coeffs, i + 32); 4966 __ st2(v2, v3, __ T2D, tmpAddr); 4967 __ add(tmpAddr, coeffs, i + 64); 4968 __ st2(v4, v5, __ T2D, tmpAddr); 4969 __ add(tmpAddr, coeffs, i + 96); 4970 __ st2(v6, v7, __ T2D, tmpAddr); 
4971 }
4972
4973 // level 7
4974 for (int i = 0; i < 1024; i += 128) {
4975 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q
4976 __ add(tmpAddr, coeffs, i);
4977 __ ld2(v0, v1, __ T4S, tmpAddr);
4978 __ add(tmpAddr, coeffs, i + 32);
4979 __ ld2(v2, v3, __ T4S, tmpAddr);
4980 __ add(tmpAddr, coeffs, i + 64);
4981 __ ld2(v4, v5, __ T4S, tmpAddr);
4982 __ add(tmpAddr, coeffs, i + 96);
4983 __ ld2(v6, v7, __ T4S, tmpAddr);
4984 dilithium_load16zetas(16, zetas);
4985 dilithium_montmul_sub_add16();
4986 __ add(tmpAddr, coeffs, i);
4987 __ st2(v0, v1, __ T4S, tmpAddr);
4988 __ add(tmpAddr, coeffs, i + 32);
4989 __ st2(v2, v3, __ T4S, tmpAddr);
4990 __ add(tmpAddr, coeffs, i + 64);
4991 __ st2(v4, v5, __ T4S, tmpAddr);
4992 __ add(tmpAddr, coeffs, i + 96);
4993 __ st2(v6, v7, __ T4S, tmpAddr);
4994 }
4995 __ leave(); // required for proper stackwalking of RuntimeStub frame
4996 __ mov(r0, zr); // return 0
4997 __ ret(lr);
4998
4999 return start;
5000
5001 }
5002
5003 // Do the computations that can be found in the body of the loop in
5004 // sun.security.provider.ML_DSA.implDilithiumAlmostInverseNttJava()
5005 // for 16 coefficients in parallel:
5006 // tmp = coeffs[j];
5007 // coeffs[j] = (tmp + coeffs[j + l]);
5008 // coeffs[j + l] = montMul(tmp - coeffs[j + l], -MONT_ZETAS_FOR_NTT[m]);
5009 // coeffs[j]s are loaded in v0, v2, v4 and v6,
5010 // coeffs[j + l]s in v1, v3, v5 and v7,
5011 // the corresponding zetas in v16, v17, v18 and v19.
5012 void dilithium_sub_add_montmul16() {
5013 __ subv(v20, __ T4S, v0, v1);
5014 __ subv(v21, __ T4S, v2, v3);
5015 __ subv(v22, __ T4S, v4, v5);
5016 __ subv(v23, __ T4S, v6, v7);
5017
5018 __ addv(v0, __ T4S, v0, v1);
5019 __ addv(v2, __ T4S, v2, v3);
5020 __ addv(v4, __ T4S, v4, v5);
5021 __ addv(v6, __ T4S, v6, v7);
5022
5023 __ sqdmulh(v24, __ T4S, v20, v16); // aHigh = hi32(2 * b * c)
5024 __ mulv(v1, __ T4S, v20, v16); // aLow = lo32(b * c)
5025 __ sqdmulh(v25, __ T4S, v21, v17);
5026 __ mulv(v3, __ T4S, v21, v17);
5027 __ sqdmulh(v26, __ T4S, v22, v18);
5028 __ mulv(v5, __ T4S, v22, v18);
5029 __ sqdmulh(v27, __ T4S, v23, v19);
5030 __ mulv(v7, __ T4S, v23, v19);
5031
5032 __ mulv(v1, __ T4S, v1, v30); // m = (aLow * q)
5033 __ mulv(v3, __ T4S, v3, v30);
5034 __ mulv(v5, __ T4S, v5, v30);
5035 __ mulv(v7, __ T4S, v7, v30);
5036
5037 __ sqdmulh(v1, __ T4S, v1, v31); // n = hi32(2 * m * q)
5038 __ sqdmulh(v3, __ T4S, v3, v31);
5039 __ sqdmulh(v5, __ T4S, v5, v31);
5040 __ sqdmulh(v7, __ T4S, v7, v31);
5041
5042 __ shsubv(v1, __ T4S, v24, v1); // a = (aHigh - n) / 2
5043 __ shsubv(v3, __ T4S, v25, v3);
5044 __ shsubv(v5, __ T4S, v26, v5);
5045 __ shsubv(v7, __ T4S, v27, v7);
5046 }
5047
5048 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
5049 // in the Java implementation come in sequences of at least 8, so we
5050 // can use ldpq to collect the corresponding data into pairs of vector
5051 // registers.
5052 // We collect the coefficients that correspond to the 'j's into v0-v7 and
5053 // the coefficients that correspond to the 'j+l's into v16-v23, then we
5054 // do the additions into v24-v31 and the subtractions into v0-v7, then we
5055 // save the result of the additions, load the zetas into v16-v23,
5056 // do the (Montgomery) multiplications by the zetas in parallel into v16-v23,
5057 // and finally save the results back to the coeffs array.
5058 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
5059 const Register coeffs, const Register zetas) {
5060 int c1 = 0;
5061 int c2 = 32;
5062 int startIncr;
5063 int incr1;
5064 int
incr2; 5065 int incr3; 5066 5067 for (int level = 3; level < 8; level++) { 5068 int c1Start = c1; 5069 int c2Start = c2; 5070 if (level == 3) { 5071 incr1 = 64; 5072 incr2 = 128; 5073 incr3 = 192; 5074 } else if (level == 4) { 5075 incr1 = 32; 5076 incr2 = 128; 5077 incr3 = 160; 5078 } else { 5079 incr1 = 32; 5080 incr2 = 64; 5081 incr3 = 96; 5082 } 5083 5084 for (int i = 0; i < 4; i++) { 5085 __ ldpq(v0, v1, Address(coeffs, c1Start)); 5086 __ ldpq(v2, v3, Address(coeffs, c1Start + incr1)); 5087 __ ldpq(v4, v5, Address(coeffs, c1Start + incr2)); 5088 __ ldpq(v6, v7, Address(coeffs, c1Start + incr3)); 5089 __ ldpq(v16, v17, Address(coeffs, c2Start)); 5090 __ ldpq(v18, v19, Address(coeffs, c2Start + incr1)); 5091 __ ldpq(v20, v21, Address(coeffs, c2Start + incr2)); 5092 __ ldpq(v22, v23, Address(coeffs, c2Start + incr3)); 5093 dilithium_add_sub32(); 5094 __ stpq(v24, v25, Address(coeffs, c1Start)); 5095 __ stpq(v26, v27, Address(coeffs, c1Start + incr1)); 5096 __ stpq(v28, v29, Address(coeffs, c1Start + incr2)); 5097 __ stpq(v30, v31, Address(coeffs, c1Start + incr3)); 5098 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 5099 dilithium_load32zetas(zetas); 5100 dilithium_montmul32(false); 5101 __ stpq(v16, v17, Address(coeffs, c2Start)); 5102 __ stpq(v18, v19, Address(coeffs, c2Start + incr1)); 5103 __ stpq(v20, v21, Address(coeffs, c2Start + incr2)); 5104 __ stpq(v22, v23, Address(coeffs, c2Start + incr3)); 5105 5106 int k = 4 * level + i; 5107 5108 if (k < 24) { 5109 startIncr = 256; 5110 } else if (k == 25) { 5111 startIncr = 384; 5112 } else { 5113 startIncr = 128; 5114 } 5115 5116 c1Start += startIncr; 5117 c2Start += startIncr; 5118 } 5119 5120 c2 *= 2; 5121 } 5122 } 5123 5124 // Dilithium Inverse NTT function except the final mod Q division by 2^256. 5125 // Implements the method 5126 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of 5127 // the sun.security.provider.ML_DSA class. 
5128 // 5129 // coeffs (int[256]) = c_rarg0 5130 // zetas (int[256]) = c_rarg1 5131 address generate_dilithiumAlmostInverseNtt() { 5132 5133 __ align(CodeEntryAlignment); 5134 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id; 5135 StubCodeMark mark(this, stub_id); 5136 address start = __ pc(); 5137 __ enter(); 5138 5139 const Register coeffs = c_rarg0; 5140 const Register zetas = c_rarg1; 5141 5142 const Register tmpAddr = r9; 5143 const Register dilithiumConsts = r10; 5144 const Register result = r11; 5145 5146 __ add(result, coeffs, 0); 5147 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 5148 5149 // Each level represents one iteration of the outer for loop of the Java version 5150 // level0 5151 for (int i = 0; i < 1024; i += 128) { 5152 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 5153 __ add(tmpAddr, coeffs, i); 5154 __ ld2(v0, v1, __ T4S, tmpAddr); 5155 __ add(tmpAddr, coeffs, i + 32); 5156 __ ld2(v2, v3, __ T4S, tmpAddr); 5157 __ add(tmpAddr, coeffs, i + 64); 5158 __ ld2(v4, v5, __ T4S, tmpAddr); 5159 __ add(tmpAddr, coeffs, i + 96); 5160 __ ld2(v6, v7, __ T4S, tmpAddr); 5161 dilithium_load16zetas(16, zetas); 5162 dilithium_sub_add_montmul16(); 5163 __ add(tmpAddr, coeffs, i); 5164 __ st2(v0, v1, __ T4S, tmpAddr); 5165 __ add(tmpAddr, coeffs, i + 32); 5166 __ st2(v2, v3, __ T4S, tmpAddr); 5167 __ add(tmpAddr, coeffs, i + 64); 5168 __ st2(v4, v5, __ T4S, tmpAddr); 5169 __ add(tmpAddr, coeffs, i + 96); 5170 __ st2(v6, v7, __ T4S, tmpAddr); 5171 } 5172 5173 // level 1 5174 for (int i = 0; i < 1024; i += 128) { 5175 __ add(tmpAddr, coeffs, i); 5176 __ ld2(v0, v1, __ T2D, tmpAddr); 5177 __ add(tmpAddr, coeffs, i + 32); 5178 __ ld2(v2, v3, __ T2D, tmpAddr); 5179 __ add(tmpAddr, coeffs, i + 64); 5180 __ ld2(v4, v5, __ T2D, tmpAddr); 5181 __ add(tmpAddr, coeffs, i + 96); 5182 __ ld2(v6, v7, __ T2D, tmpAddr); 5183 dilithium_load16zetas(16, zetas); 5184 dilithium_sub_add_montmul16(); 5185 __ add(tmpAddr, coeffs, i); 5186 __ st2(v0, v1, __ T2D, tmpAddr); 5187 __ add(tmpAddr, coeffs, i + 32); 5188 __ st2(v2, v3, __ T2D, tmpAddr); 5189 __ add(tmpAddr, coeffs, i + 64); 5190 __ st2(v4, v5, __ T2D, tmpAddr); 5191 __ add(tmpAddr, coeffs, i + 96); 5192 __ st2(v6, v7, __ T2D, tmpAddr); 5193 } 5194 5195 //level 2 5196 for (int i = 0; i < 1024; i += 256) { 5197 __ ldr(v0, __ Q, Address(coeffs, i)); 5198 __ ldr(v1, __ Q, Address(coeffs, i + 32)); 5199 __ ldr(v2, __ Q, Address(coeffs, i + 64)); 5200 __ ldr(v3, __ Q, Address(coeffs, i + 96)); 5201 __ ldr(v4, __ Q, Address(coeffs, i + 128)); 5202 __ ldr(v5, __ Q, Address(coeffs, i + 160)); 5203 __ ldr(v6, __ Q, Address(coeffs, i + 192)); 5204 __ ldr(v7, __ Q, Address(coeffs, i + 224)); 5205 __ ldr(v16, __ Q, Address(coeffs, i + 16)); 5206 __ ldr(v17, __ Q, Address(coeffs, i + 48)); 5207 __ ldr(v18, __ Q, Address(coeffs, i + 80)); 5208 __ ldr(v19, __ Q, Address(coeffs, i + 112)); 5209 __ ldr(v20, __ Q, Address(coeffs, i + 144)); 5210 __ ldr(v21, __ Q, Address(coeffs, i + 176)); 5211 __ ldr(v22, __ Q, Address(coeffs, i + 208)); 5212 __ ldr(v23, __ Q, Address(coeffs, i + 240)); 5213 dilithium_add_sub32(); 5214 __ str(v24, __ Q, Address(coeffs, i)); 5215 __ str(v25, __ Q, Address(coeffs, i + 32)); 5216 __ str(v26, __ Q, Address(coeffs, i + 64)); 5217 __ str(v27, __ Q, Address(coeffs, i + 96)); 5218 __ str(v28, __ Q, Address(coeffs, i + 128)); 5219 __ str(v29, __ Q, Address(coeffs, i + 160)); 5220 __ str(v30, __ Q, Address(coeffs, i + 192)); 5221 __ str(v31, __ Q, Address(coeffs, i + 
224)); 5222 dilithium_load32zetas(zetas); 5223 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 5224 dilithium_montmul32(false); 5225 __ str(v16, __ Q, Address(coeffs, i + 16)); 5226 __ str(v17, __ Q, Address(coeffs, i + 48)); 5227 __ str(v18, __ Q, Address(coeffs, i + 80)); 5228 __ str(v19, __ Q, Address(coeffs, i + 112)); 5229 __ str(v20, __ Q, Address(coeffs, i + 144)); 5230 __ str(v21, __ Q, Address(coeffs, i + 176)); 5231 __ str(v22, __ Q, Address(coeffs, i + 208)); 5232 __ str(v23, __ Q, Address(coeffs, i + 240)); 5233 } 5234 5235 // level 3-7 5236 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas); 5237 5238 __ leave(); // required for proper stackwalking of RuntimeStub frame 5239 __ mov(r0, zr); // return 0 5240 __ ret(lr); 5241 5242 return start; 5243 5244 } 5245 5246 // Dilithium multiply polynomials in the NTT domain. 5247 // Straightforward implementation of the method 5248 // static int implDilithiumNttMult( 5249 // int[] result, int[] ntta, int[] nttb {} of 5250 // the sun.security.provider.ML_DSA class. 5251 // 5252 // result (int[256]) = c_rarg0 5253 // poly1 (int[256]) = c_rarg1 5254 // poly2 (int[256]) = c_rarg2 5255 address generate_dilithiumNttMult() { 5256 5257 __ align(CodeEntryAlignment); 5258 StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id; 5259 StubCodeMark mark(this, stub_id); 5260 address start = __ pc(); 5261 __ enter(); 5262 5263 Label L_loop; 5264 5265 const Register result = c_rarg0; 5266 const Register poly1 = c_rarg1; 5267 const Register poly2 = c_rarg2; 5268 5269 const Register dilithiumConsts = r10; 5270 const Register len = r11; 5271 5272 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 5273 5274 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 5275 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare 5276 5277 __ mov(len, zr); 5278 __ add(len, len, 1024); 5279 5280 __ BIND(L_loop); 5281 5282 __ ldpq(v0, v1, __ post(poly1, 32)); 5283 __ ldpq(v2, v3, __ post(poly1, 32)); 5284 __ ldpq(v4, v5, __ post(poly1, 32)); 5285 __ ldpq(v6, v7, __ post(poly1, 32)); 5286 __ ldpq(v16, v17, __ post(poly2, 32)); 5287 __ ldpq(v18, v19, __ post(poly2, 32)); 5288 __ ldpq(v20, v21, __ post(poly2, 32)); 5289 __ ldpq(v22, v23, __ post(poly2, 32)); 5290 dilithium_montmul32(false); 5291 dilithium_montmul32(true); 5292 __ stpq(v16, v17, __ post(result, 32)); 5293 __ stpq(v18, v19, __ post(result, 32)); 5294 __ stpq(v20, v21, __ post(result, 32)); 5295 __ stpq(v22, v23, __ post(result, 32)); 5296 5297 __ sub(len, len, 128); 5298 __ cmp(len, (u1)128); 5299 __ br(Assembler::GE, L_loop); 5300 5301 __ leave(); // required for proper stackwalking of RuntimeStub frame 5302 __ mov(r0, zr); // return 0 5303 __ ret(lr); 5304 5305 return start; 5306 5307 } 5308 5309 // Dilithium Motgomery multiply an array by a constant. 
5310 // A straightforward implementation of the method 5311 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {} 5312 // of the sun.security.provider.MLDSA class 5313 // 5314 // coeffs (int[256]) = c_rarg0 5315 // constant (int) = c_rarg1 5316 address generate_dilithiumMontMulByConstant() { 5317 5318 __ align(CodeEntryAlignment); 5319 StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id; 5320 StubCodeMark mark(this, stub_id); 5321 address start = __ pc(); 5322 __ enter(); 5323 5324 Label L_loop; 5325 5326 const Register coeffs = c_rarg0; 5327 const Register constant = c_rarg1; 5328 5329 const Register dilithiumConsts = r10; 5330 const Register result = r11; 5331 const Register len = r12; 5332 5333 __ add(result, coeffs, 0); 5334 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 5335 5336 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 5337 __ dup(v29, __ T4S, constant); 5338 __ mov(len, zr); 5339 __ add(len, len, 1024); 5340 5341 __ BIND(L_loop); 5342 5343 __ ldpq(v16, v17, __ post(coeffs, 32)); 5344 __ ldpq(v18, v19, __ post(coeffs, 32)); 5345 __ ldpq(v20, v21, __ post(coeffs, 32)); 5346 __ ldpq(v22, v23, __ post(coeffs, 32)); 5347 dilithium_montmul32(true); 5348 __ stpq(v16, v17, __ post(result, 32)); 5349 __ stpq(v18, v19, __ post(result, 32)); 5350 __ stpq(v20, v21, __ post(result, 32)); 5351 __ stpq(v22, v23, __ post(result, 32)); 5352 5353 __ sub(len, len, 128); 5354 __ cmp(len, (u1)128); 5355 __ br(Assembler::GE, L_loop); 5356 5357 __ leave(); // required for proper stackwalking of RuntimeStub frame 5358 __ mov(r0, zr); // return 0 5359 __ ret(lr); 5360 5361 return start; 5362 } 5363 5364 // Dilithium decompose poly. 5365 // Implements the method 5366 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {} 5367 // of the sun.security.provider.ML_DSA class 5368 // 5369 // input (int[256]) = c_rarg0 5370 // lowPart (int[256]) = c_rarg1 5371 // highPart (int[256]) = c_rarg2 5372 // twoGamma2 (int) = c_rarg3 5373 // multiplier (int) = c_rarg4 5374 address generate_dilithiumDecomposePoly() { 5375 5376 __ align(CodeEntryAlignment); 5377 StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id; 5378 StubCodeMark mark(this, stub_id); 5379 address start = __ pc(); 5380 __ enter(); 5381 5382 Label L_loop; 5383 5384 const Register input = c_rarg0; 5385 const Register lowPart = c_rarg1; 5386 const Register highPart = c_rarg2; 5387 const Register twoGamma2 = c_rarg3; 5388 const Register multiplier = c_rarg4; 5389 5390 const Register len = r9; 5391 const Register dilithiumConsts = r10; 5392 const Register tmp = r11; 5393 5394 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 5395 5396 // save callee-saved registers 5397 __ stpd(v8, v9, __ pre(sp, -64)); 5398 __ stpd(v10, v11, Address(sp, 16)); 5399 __ stpd(v12, v13, Address(sp, 32)); 5400 __ stpd(v14, v15, Address(sp, 48)); 5401 5402 5403 __ mov(tmp, zr); 5404 __ add(tmp, tmp, 1); 5405 __ dup(v25, __ T4S, tmp); // 1 5406 __ ldr(v30, __ Q, Address(dilithiumConsts, 16)); // q 5407 __ ldr(v31, __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce 5408 __ dup(v28, __ T4S, twoGamma2); // 2 * gamma2 5409 __ dup(v29, __ T4S, multiplier); // multiplier for mod 2 * gamma reduce 5410 __ subv(v26, __ T4S, v30, v25); // q - 1 5411 __ sshr(v27, __ T4S, v28, 1); // gamma2 5412 5413 __ mov(len, zr); 5414 __ add(len, len, 1024); 5415 5416 __ BIND(L_loop); 5417 5418 __ ld4(v0, v1, v2, v3, __ T4S, 
__ post(input, 64)); 5419 5420 // rplus in v0 5421 // rplus = rplus - ((rplus + 5373807) >> 23) * dilithium_q; 5422 __ addv(v4, __ T4S, v0, v31); 5423 __ addv(v5, __ T4S, v1, v31); 5424 __ addv(v6, __ T4S, v2, v31); 5425 __ addv(v7, __ T4S, v3, v31); 5426 5427 __ sshr(v4, __ T4S, v4, 23); 5428 __ sshr(v5, __ T4S, v5, 23); 5429 __ sshr(v6, __ T4S, v6, 23); 5430 __ sshr(v7, __ T4S, v7, 23); 5431 5432 __ mulv(v4, __ T4S, v4, v30); 5433 __ mulv(v5, __ T4S, v5, v30); 5434 __ mulv(v6, __ T4S, v6, v30); 5435 __ mulv(v7, __ T4S, v7, v30); 5436 5437 __ subv(v0, __ T4S, v0, v4); 5438 __ subv(v1, __ T4S, v1, v5); 5439 __ subv(v2, __ T4S, v2, v6); 5440 __ subv(v3, __ T4S, v3, v7); 5441 5442 // rplus in v0 5443 // rplus = rplus + ((rplus >> 31) & dilithium_q); 5444 __ sshr(v4, __ T4S, v0, 31); 5445 __ sshr(v5, __ T4S, v1, 31); 5446 __ sshr(v6, __ T4S, v2, 31); 5447 __ sshr(v7, __ T4S, v3, 31); 5448 5449 __ andr(v4, __ T16B, v4, v30); 5450 __ andr(v5, __ T16B, v5, v30); 5451 __ andr(v6, __ T16B, v6, v30); 5452 __ andr(v7, __ T16B, v7, v30); 5453 5454 __ addv(v0, __ T4S, v0, v4); 5455 __ addv(v1, __ T4S, v1, v5); 5456 __ addv(v2, __ T4S, v2, v6); 5457 __ addv(v3, __ T4S, v3, v7); 5458 5459 // rplus in v0 5460 // int quotient = (rplus * multiplier) >> 22; 5461 __ mulv(v4, __ T4S, v0, v29); 5462 __ mulv(v5, __ T4S, v1, v29); 5463 __ mulv(v6, __ T4S, v2, v29); 5464 __ mulv(v7, __ T4S, v3, v29); 5465 5466 __ sshr(v4, __ T4S, v4, 22); 5467 __ sshr(v5, __ T4S, v5, 22); 5468 __ sshr(v6, __ T4S, v6, 22); 5469 __ sshr(v7, __ T4S, v7, 22); 5470 5471 // quotient in v4 5472 // int r0 = rplus - quotient * twoGamma2; 5473 __ mulv(v8, __ T4S, v4, v28); 5474 __ mulv(v9, __ T4S, v5, v28); 5475 __ mulv(v10, __ T4S, v6, v28); 5476 __ mulv(v11, __ T4S, v7, v28); 5477 5478 __ subv(v8, __ T4S, v0, v8); 5479 __ subv(v9, __ T4S, v1, v9); 5480 __ subv(v10, __ T4S, v2, v10); 5481 __ subv(v11, __ T4S, v3, v11); 5482 5483 // r0 in v8 5484 // int mask = (twoGamma2 - r0) >> 22; 5485 __ subv(v12, __ T4S, v28, v8); 5486 __ subv(v13, __ T4S, v28, v9); 5487 __ subv(v14, __ T4S, v28, v10); 5488 __ subv(v15, __ T4S, v28, v11); 5489 5490 __ sshr(v12, __ T4S, v12, 22); 5491 __ sshr(v13, __ T4S, v13, 22); 5492 __ sshr(v14, __ T4S, v14, 22); 5493 __ sshr(v15, __ T4S, v15, 22); 5494 5495 // mask in v12 5496 // r0 -= (mask & twoGamma2); 5497 __ andr(v16, __ T16B, v12, v28); 5498 __ andr(v17, __ T16B, v13, v28); 5499 __ andr(v18, __ T16B, v14, v28); 5500 __ andr(v19, __ T16B, v15, v28); 5501 5502 __ subv(v8, __ T4S, v8, v16); 5503 __ subv(v9, __ T4S, v9, v17); 5504 __ subv(v10, __ T4S, v10, v18); 5505 __ subv(v11, __ T4S, v11, v19); 5506 5507 // r0 in v8 5508 // quotient += (mask & 1); 5509 __ andr(v16, __ T16B, v12, v25); 5510 __ andr(v17, __ T16B, v13, v25); 5511 __ andr(v18, __ T16B, v14, v25); 5512 __ andr(v19, __ T16B, v15, v25); 5513 5514 __ addv(v4, __ T4S, v4, v16); 5515 __ addv(v5, __ T4S, v5, v17); 5516 __ addv(v6, __ T4S, v6, v18); 5517 __ addv(v7, __ T4S, v7, v19); 5518 5519 // mask = (twoGamma2 / 2 - r0) >> 31; 5520 __ subv(v12, __ T4S, v27, v8); 5521 __ subv(v13, __ T4S, v27, v9); 5522 __ subv(v14, __ T4S, v27, v10); 5523 __ subv(v15, __ T4S, v27, v11); 5524 5525 __ sshr(v12, __ T4S, v12, 31); 5526 __ sshr(v13, __ T4S, v13, 31); 5527 __ sshr(v14, __ T4S, v14, 31); 5528 __ sshr(v15, __ T4S, v15, 31); 5529 5530 // r0 -= (mask & twoGamma2); 5531 __ andr(v16, __ T16B, v12, v28); 5532 __ andr(v17, __ T16B, v13, v28); 5533 __ andr(v18, __ T16B, v14, v28); 5534 __ andr(v19, __ T16B, v15, v28); 5535 5536 __ subv(v8, __ T4S, v8, v16); 5537 
__ subv(v9, __ T4S, v9, v17); 5538 __ subv(v10, __ T4S, v10, v18); 5539 __ subv(v11, __ T4S, v11, v19); 5540 5541 // quotient += (mask & 1); 5542 __ andr(v16, __ T16B, v12, v25); 5543 __ andr(v17, __ T16B, v13, v25); 5544 __ andr(v18, __ T16B, v14, v25); 5545 __ andr(v19, __ T16B, v15, v25); 5546 5547 __ addv(v4, __ T4S, v4, v16); 5548 __ addv(v5, __ T4S, v5, v17); 5549 __ addv(v6, __ T4S, v6, v18); 5550 __ addv(v7, __ T4S, v7, v19); 5551 5552 // int r1 = rplus - r0 - (dilithium_q - 1); 5553 __ subv(v16, __ T4S, v0, v8); 5554 __ subv(v17, __ T4S, v1, v9); 5555 __ subv(v18, __ T4S, v2, v10); 5556 __ subv(v19, __ T4S, v3, v11); 5557 5558 __ subv(v16, __ T4S, v16, v26); 5559 __ subv(v17, __ T4S, v17, v26); 5560 __ subv(v18, __ T4S, v18, v26); 5561 __ subv(v19, __ T4S, v19, v26); 5562 5563 // r1 in v16 5564 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 5565 __ negr(v20, __ T4S, v16); 5566 __ negr(v21, __ T4S, v17); 5567 __ negr(v22, __ T4S, v18); 5568 __ negr(v23, __ T4S, v19); 5569 5570 __ orr(v16, __ T16B, v16, v20); 5571 __ orr(v17, __ T16B, v17, v21); 5572 __ orr(v18, __ T16B, v18, v22); 5573 __ orr(v19, __ T16B, v19, v23); 5574 5575 __ sshr(v0, __ T4S, v16, 31); 5576 __ sshr(v1, __ T4S, v17, 31); 5577 __ sshr(v2, __ T4S, v18, 31); 5578 __ sshr(v3, __ T4S, v19, 31); 5579 5580 // r1 in v0 5581 // r0 += ~r1; 5582 __ notr(v20, __ T16B, v0); 5583 __ notr(v21, __ T16B, v1); 5584 __ notr(v22, __ T16B, v2); 5585 __ notr(v23, __ T16B, v3); 5586 5587 __ addv(v8, __ T4S, v8, v20); 5588 __ addv(v9, __ T4S, v9, v21); 5589 __ addv(v10, __ T4S, v10, v22); 5590 __ addv(v11, __ T4S, v11, v23); 5591 5592 // r0 in v8 5593 // r1 = r1 & quotient; 5594 __ andr(v0, __ T16B, v4, v0); 5595 __ andr(v1, __ T16B, v5, v1); 5596 __ andr(v2, __ T16B, v6, v2); 5597 __ andr(v3, __ T16B, v7, v3); 5598 5599 // r1 in v0 5600 // lowPart[m] = r0; 5601 // highPart[m] = r1; 5602 __ st4(v8, v9, v10, v11, __ T4S, __ post(lowPart, 64)); 5603 __ st4(v0, v1, v2, v3, __ T4S, __ post(highPart, 64)); 5604 5605 5606 __ sub(len, len, 64); 5607 __ cmp(len, (u1)64); 5608 __ br(Assembler::GE, L_loop); 5609 5610 // restore callee-saved vector registers 5611 __ ldpd(v14, v15, Address(sp, 48)); 5612 __ ldpd(v12, v13, Address(sp, 32)); 5613 __ ldpd(v10, v11, Address(sp, 16)); 5614 __ ldpd(v8, v9, __ post(sp, 64)); 5615 5616 __ leave(); // required for proper stackwalking of RuntimeStub frame 5617 __ mov(r0, zr); // return 0 5618 __ ret(lr); 5619 5620 return start; 5621 } 5622 5623 /** 5624 * Arguments: 5625 * 5626 * Inputs: 5627 * c_rarg0 - int crc 5628 * c_rarg1 - byte* buf 5629 * c_rarg2 - int length 5630 * c_rarg3 - int* table 5631 * 5632 * Output: 5633 * r0 - int crc result 5634 */ 5635 address generate_updateBytesCRC32C() { 5636 assert(UseCRC32CIntrinsics, "what are we doing here?"); 5637 5638 __ align(CodeEntryAlignment); 5639 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id; 5640 StubCodeMark mark(this, stub_id); 5641 5642 address start = __ pc(); 5643 5644 const Register crc = c_rarg0; // crc 5645 const Register buf = c_rarg1; // source java byte array address 5646 const Register len = c_rarg2; // length 5647 const Register table0 = c_rarg3; // crc_table address 5648 const Register table1 = c_rarg4; 5649 const Register table2 = c_rarg5; 5650 const Register table3 = c_rarg6; 5651 const Register tmp3 = c_rarg7; 5652 5653 BLOCK_COMMENT("Entry:"); 5654 __ enter(); // required for proper stackwalking of RuntimeStub frame 5655 5656 __ kernel_crc32c(crc, buf, len, 5657 table0, table1, table2, 
table3, rscratch1, rscratch2, tmp3); 5658 5659 __ leave(); // required for proper stackwalking of RuntimeStub frame 5660 __ ret(lr); 5661 5662 return start; 5663 } 5664 5665 /*** 5666 * Arguments: 5667 * 5668 * Inputs: 5669 * c_rarg0 - int adler 5670 * c_rarg1 - byte* buff 5671 * c_rarg2 - int len 5672 * 5673 * Output: 5674 * c_rarg0 - int adler result 5675 */ 5676 address generate_updateBytesAdler32() { 5677 __ align(CodeEntryAlignment); 5678 StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id; 5679 StubCodeMark mark(this, stub_id); 5680 address start = __ pc(); 5681 5682 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 5683 5684 // Aliases 5685 Register adler = c_rarg0; 5686 Register s1 = c_rarg0; 5687 Register s2 = c_rarg3; 5688 Register buff = c_rarg1; 5689 Register len = c_rarg2; 5690 Register nmax = r4; 5691 Register base = r5; 5692 Register count = r6; 5693 Register temp0 = rscratch1; 5694 Register temp1 = rscratch2; 5695 FloatRegister vbytes = v0; 5696 FloatRegister vs1acc = v1; 5697 FloatRegister vs2acc = v2; 5698 FloatRegister vtable = v3; 5699 5700 // Max number of bytes we can process before having to take the mod 5701 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 5702 uint64_t BASE = 0xfff1; 5703 uint64_t NMAX = 0x15B0; 5704 5705 __ mov(base, BASE); 5706 __ mov(nmax, NMAX); 5707 5708 // Load accumulation coefficients for the upper 16 bits 5709 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 5710 __ ld1(vtable, __ T16B, Address(temp0)); 5711 5712 // s1 is initialized to the lower 16 bits of adler 5713 // s2 is initialized to the upper 16 bits of adler 5714 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 5715 __ uxth(s1, adler); // s1 = (adler & 0xffff) 5716 5717 // The pipelined loop needs at least 16 elements for 1 iteration 5718 // It does check this, but it is more effective to skip to the cleanup loop 5719 __ cmp(len, (u1)16); 5720 __ br(Assembler::HS, L_nmax); 5721 __ cbz(len, L_combine); 5722 5723 __ bind(L_simple_by1_loop); 5724 __ ldrb(temp0, Address(__ post(buff, 1))); 5725 __ add(s1, s1, temp0); 5726 __ add(s2, s2, s1); 5727 __ subs(len, len, 1); 5728 __ br(Assembler::HI, L_simple_by1_loop); 5729 5730 // s1 = s1 % BASE 5731 __ subs(temp0, s1, base); 5732 __ csel(s1, temp0, s1, Assembler::HS); 5733 5734 // s2 = s2 % BASE 5735 __ lsr(temp0, s2, 16); 5736 __ lsl(temp1, temp0, 4); 5737 __ sub(temp1, temp1, temp0); 5738 __ add(s2, temp1, s2, ext::uxth); 5739 5740 __ subs(temp0, s2, base); 5741 __ csel(s2, temp0, s2, Assembler::HS); 5742 5743 __ b(L_combine); 5744 5745 __ bind(L_nmax); 5746 __ subs(len, len, nmax); 5747 __ sub(count, nmax, 16); 5748 __ br(Assembler::LO, L_by16); 5749 5750 __ bind(L_nmax_loop); 5751 5752 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 5753 vbytes, vs1acc, vs2acc, vtable); 5754 5755 __ subs(count, count, 16); 5756 __ br(Assembler::HS, L_nmax_loop); 5757 5758 // s1 = s1 % BASE 5759 __ lsr(temp0, s1, 16); 5760 __ lsl(temp1, temp0, 4); 5761 __ sub(temp1, temp1, temp0); 5762 __ add(temp1, temp1, s1, ext::uxth); 5763 5764 __ lsr(temp0, temp1, 16); 5765 __ lsl(s1, temp0, 4); 5766 __ sub(s1, s1, temp0); 5767 __ add(s1, s1, temp1, ext:: uxth); 5768 5769 __ subs(temp0, s1, base); 5770 __ csel(s1, temp0, s1, Assembler::HS); 5771 5772 // s2 = s2 % BASE 5773 __ lsr(temp0, s2, 16); 5774 __ lsl(temp1, temp0, 4); 5775 __ sub(temp1, temp1, temp0); 5776 __ add(temp1, temp1, s2, ext::uxth); 
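// A note on the mod-BASE folding used throughout this stub (a sketch of the
// arithmetic, not a formal proof): BASE = 0xfff1 = 65521 and 2^16 mod 65521 = 15,
// so a value x can be shrunk without changing x mod BASE by folding
//   x -> (x & 0xffff) + 15 * (x >> 16)
// which is exactly what each lsr/lsl/sub/add group computes (15 * t as (t << 4) - t).
// Two folds bring a 32-bit accumulator below 2 * BASE, so the subs/csel pair that
// follows finishes the reduction with a single conditional subtraction.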
5777 5778 __ lsr(temp0, temp1, 16); 5779 __ lsl(s2, temp0, 4); 5780 __ sub(s2, s2, temp0); 5781 __ add(s2, s2, temp1, ext:: uxth); 5782 5783 __ subs(temp0, s2, base); 5784 __ csel(s2, temp0, s2, Assembler::HS); 5785 5786 __ subs(len, len, nmax); 5787 __ sub(count, nmax, 16); 5788 __ br(Assembler::HS, L_nmax_loop); 5789 5790 __ bind(L_by16); 5791 __ adds(len, len, count); 5792 __ br(Assembler::LO, L_by1); 5793 5794 __ bind(L_by16_loop); 5795 5796 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 5797 vbytes, vs1acc, vs2acc, vtable); 5798 5799 __ subs(len, len, 16); 5800 __ br(Assembler::HS, L_by16_loop); 5801 5802 __ bind(L_by1); 5803 __ adds(len, len, 15); 5804 __ br(Assembler::LO, L_do_mod); 5805 5806 __ bind(L_by1_loop); 5807 __ ldrb(temp0, Address(__ post(buff, 1))); 5808 __ add(s1, temp0, s1); 5809 __ add(s2, s2, s1); 5810 __ subs(len, len, 1); 5811 __ br(Assembler::HS, L_by1_loop); 5812 5813 __ bind(L_do_mod); 5814 // s1 = s1 % BASE 5815 __ lsr(temp0, s1, 16); 5816 __ lsl(temp1, temp0, 4); 5817 __ sub(temp1, temp1, temp0); 5818 __ add(temp1, temp1, s1, ext::uxth); 5819 5820 __ lsr(temp0, temp1, 16); 5821 __ lsl(s1, temp0, 4); 5822 __ sub(s1, s1, temp0); 5823 __ add(s1, s1, temp1, ext:: uxth); 5824 5825 __ subs(temp0, s1, base); 5826 __ csel(s1, temp0, s1, Assembler::HS); 5827 5828 // s2 = s2 % BASE 5829 __ lsr(temp0, s2, 16); 5830 __ lsl(temp1, temp0, 4); 5831 __ sub(temp1, temp1, temp0); 5832 __ add(temp1, temp1, s2, ext::uxth); 5833 5834 __ lsr(temp0, temp1, 16); 5835 __ lsl(s2, temp0, 4); 5836 __ sub(s2, s2, temp0); 5837 __ add(s2, s2, temp1, ext:: uxth); 5838 5839 __ subs(temp0, s2, base); 5840 __ csel(s2, temp0, s2, Assembler::HS); 5841 5842 // Combine lower bits and higher bits 5843 __ bind(L_combine); 5844 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 5845 5846 __ ret(lr); 5847 5848 return start; 5849 } 5850 5851 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 5852 Register temp0, Register temp1, FloatRegister vbytes, 5853 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 5854 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 5855 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 5856 // In non-vectorized code, we update s1 and s2 as: 5857 // s1 <- s1 + b1 5858 // s2 <- s2 + s1 5859 // s1 <- s1 + b2 5860 // s2 <- s2 + b1 5861 // ... 5862 // s1 <- s1 + b16 5863 // s2 <- s2 + s1 5864 // Putting above assignments together, we have: 5865 // s1_new = s1 + b1 + b2 + ... + b16 5866 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 5867 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 5868 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 5869 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 5870 5871 // s2 = s2 + s1 * 16 5872 __ add(s2, s2, s1, Assembler::LSL, 4); 5873 5874 // vs1acc = b1 + b2 + b3 + ... + b16 5875 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 5876 __ umullv(vs2acc, __ T8B, vtable, vbytes); 5877 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 5878 __ uaddlv(vs1acc, __ T16B, vbytes); 5879 __ uaddlv(vs2acc, __ T8H, vs2acc); 5880 5881 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 5882 __ fmovd(temp0, vs1acc); 5883 __ fmovd(temp1, vs2acc); 5884 __ add(s1, s1, temp0); 5885 __ add(s2, s2, temp1); 5886 } 5887 5888 /** 5889 * Arguments: 5890 * 5891 * Input: 5892 * c_rarg0 - x address 5893 * c_rarg1 - x length 5894 * c_rarg2 - y address 5895 * c_rarg3 - y length 5896 * c_rarg4 - z address 5897 */ 5898 address generate_multiplyToLen() { 5899 __ align(CodeEntryAlignment); 5900 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 5901 StubCodeMark mark(this, stub_id); 5902 5903 address start = __ pc(); 5904 const Register x = r0; 5905 const Register xlen = r1; 5906 const Register y = r2; 5907 const Register ylen = r3; 5908 const Register z = r4; 5909 5910 const Register tmp0 = r5; 5911 const Register tmp1 = r10; 5912 const Register tmp2 = r11; 5913 const Register tmp3 = r12; 5914 const Register tmp4 = r13; 5915 const Register tmp5 = r14; 5916 const Register tmp6 = r15; 5917 const Register tmp7 = r16; 5918 5919 BLOCK_COMMENT("Entry:"); 5920 __ enter(); // required for proper stackwalking of RuntimeStub frame 5921 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 5922 __ leave(); // required for proper stackwalking of RuntimeStub frame 5923 __ ret(lr); 5924 5925 return start; 5926 } 5927 5928 address generate_squareToLen() { 5929 // squareToLen algorithm for sizes 1..127 described in java code works 5930 // faster than multiply_to_len on some CPUs and slower on others, but 5931 // multiply_to_len shows a bit better overall results 5932 __ align(CodeEntryAlignment); 5933 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 5934 StubCodeMark mark(this, stub_id); 5935 address start = __ pc(); 5936 5937 const Register x = r0; 5938 const Register xlen = r1; 5939 const Register z = r2; 5940 const Register y = r4; // == x 5941 const Register ylen = r5; // == xlen 5942 5943 const Register tmp0 = r3; 5944 const Register tmp1 = r10; 5945 const Register tmp2 = r11; 5946 const Register tmp3 = r12; 5947 const Register tmp4 = r13; 5948 const Register tmp5 = r14; 5949 const Register tmp6 = r15; 5950 const Register tmp7 = r16; 5951 5952 RegSet spilled_regs = RegSet::of(y, ylen); 5953 BLOCK_COMMENT("Entry:"); 5954 __ enter(); 5955 __ push(spilled_regs, sp); 5956 __ mov(y, x); 5957 __ mov(ylen, xlen); 5958 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 5959 __ pop(spilled_regs, sp); 5960 __ leave(); 5961 __ ret(lr); 5962 return start; 5963 } 5964 5965 address generate_mulAdd() { 5966 __ align(CodeEntryAlignment); 5967 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 5968 StubCodeMark mark(this, stub_id); 5969 5970 address start = __ pc(); 5971 5972 const Register out = r0; 5973 const Register in = r1; 5974 const Register offset = r2; 5975 const Register len = r3; 5976 const Register k = r4; 5977 5978 BLOCK_COMMENT("Entry:"); 5979 __ enter(); 5980 __ mul_add(out, in, offset, len, k); 5981 __ leave(); 5982 __ ret(lr); 5983 5984 return start; 5985 } 5986 5987 // Arguments: 5988 // 5989 // Input: 5990 // c_rarg0 - newArr address 5991 // c_rarg1 - oldArr address 5992 // c_rarg2 - newIdx 5993 // c_rarg3 - shiftCount 5994 // c_rarg4 - numIter 5995 // 5996 address generate_bigIntegerRightShift() { 5997 __ align(CodeEntryAlignment); 5998 StubGenStubId stub_id = 
StubGenStubId::bigIntegerRightShiftWorker_id; 5999 StubCodeMark mark(this, stub_id); 6000 address start = __ pc(); 6001 6002 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 6003 6004 Register newArr = c_rarg0; 6005 Register oldArr = c_rarg1; 6006 Register newIdx = c_rarg2; 6007 Register shiftCount = c_rarg3; 6008 Register numIter = c_rarg4; 6009 Register idx = numIter; 6010 6011 Register newArrCur = rscratch1; 6012 Register shiftRevCount = rscratch2; 6013 Register oldArrCur = r13; 6014 Register oldArrNext = r14; 6015 6016 FloatRegister oldElem0 = v0; 6017 FloatRegister oldElem1 = v1; 6018 FloatRegister newElem = v2; 6019 FloatRegister shiftVCount = v3; 6020 FloatRegister shiftVRevCount = v4; 6021 6022 __ cbz(idx, Exit); 6023 6024 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 6025 6026 // left shift count 6027 __ movw(shiftRevCount, 32); 6028 __ subw(shiftRevCount, shiftRevCount, shiftCount); 6029 6030 // numIter too small to allow a 4-words SIMD loop, rolling back 6031 __ cmp(numIter, (u1)4); 6032 __ br(Assembler::LT, ShiftThree); 6033 6034 __ dup(shiftVCount, __ T4S, shiftCount); 6035 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 6036 __ negr(shiftVCount, __ T4S, shiftVCount); 6037 6038 __ BIND(ShiftSIMDLoop); 6039 6040 // Calculate the load addresses 6041 __ sub(idx, idx, 4); 6042 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 6043 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 6044 __ add(oldArrCur, oldArrNext, 4); 6045 6046 // Load 4 words and process 6047 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 6048 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 6049 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 6050 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 6051 __ orr(newElem, __ T16B, oldElem0, oldElem1); 6052 __ st1(newElem, __ T4S, Address(newArrCur)); 6053 6054 __ cmp(idx, (u1)4); 6055 __ br(Assembler::LT, ShiftTwoLoop); 6056 __ b(ShiftSIMDLoop); 6057 6058 __ BIND(ShiftTwoLoop); 6059 __ cbz(idx, Exit); 6060 __ cmp(idx, (u1)1); 6061 __ br(Assembler::EQ, ShiftOne); 6062 6063 // Calculate the load addresses 6064 __ sub(idx, idx, 2); 6065 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 6066 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 6067 __ add(oldArrCur, oldArrNext, 4); 6068 6069 // Load 2 words and process 6070 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 6071 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 6072 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 6073 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 6074 __ orr(newElem, __ T8B, oldElem0, oldElem1); 6075 __ st1(newElem, __ T2S, Address(newArrCur)); 6076 __ b(ShiftTwoLoop); 6077 6078 __ BIND(ShiftThree); 6079 __ tbz(idx, 1, ShiftOne); 6080 __ tbz(idx, 0, ShiftTwo); 6081 __ ldrw(r10, Address(oldArr, 12)); 6082 __ ldrw(r11, Address(oldArr, 8)); 6083 __ lsrvw(r10, r10, shiftCount); 6084 __ lslvw(r11, r11, shiftRevCount); 6085 __ orrw(r12, r10, r11); 6086 __ strw(r12, Address(newArr, 8)); 6087 6088 __ BIND(ShiftTwo); 6089 __ ldrw(r10, Address(oldArr, 8)); 6090 __ ldrw(r11, Address(oldArr, 4)); 6091 __ lsrvw(r10, r10, shiftCount); 6092 __ lslvw(r11, r11, shiftRevCount); 6093 __ orrw(r12, r10, r11); 6094 __ strw(r12, Address(newArr, 4)); 6095 6096 __ BIND(ShiftOne); 6097 __ ldrw(r10, Address(oldArr, 4)); 6098 __ ldrw(r11, Address(oldArr)); 6099 __ lsrvw(r10, r10, shiftCount); 6100 __ lslvw(r11, r11, shiftRevCount); 6101 __ orrw(r12, r10, r11); 6102 __ strw(r12, Address(newArr)); 6103 6104 __ BIND(Exit); 6105 __ ret(lr); 6106 6107 return start; 6108 } 6109 6110 // 
Arguments: 6111 // 6112 // Input: 6113 // c_rarg0 - newArr address 6114 // c_rarg1 - oldArr address 6115 // c_rarg2 - newIdx 6116 // c_rarg3 - shiftCount 6117 // c_rarg4 - numIter 6118 // 6119 address generate_bigIntegerLeftShift() { 6120 __ align(CodeEntryAlignment); 6121 StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id; 6122 StubCodeMark mark(this, stub_id); 6123 address start = __ pc(); 6124 6125 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 6126 6127 Register newArr = c_rarg0; 6128 Register oldArr = c_rarg1; 6129 Register newIdx = c_rarg2; 6130 Register shiftCount = c_rarg3; 6131 Register numIter = c_rarg4; 6132 6133 Register shiftRevCount = rscratch1; 6134 Register oldArrNext = rscratch2; 6135 6136 FloatRegister oldElem0 = v0; 6137 FloatRegister oldElem1 = v1; 6138 FloatRegister newElem = v2; 6139 FloatRegister shiftVCount = v3; 6140 FloatRegister shiftVRevCount = v4; 6141 6142 __ cbz(numIter, Exit); 6143 6144 __ add(oldArrNext, oldArr, 4); 6145 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 6146 6147 // right shift count 6148 __ movw(shiftRevCount, 32); 6149 __ subw(shiftRevCount, shiftRevCount, shiftCount); 6150 6151 // numIter too small to allow a 4-words SIMD loop, rolling back 6152 __ cmp(numIter, (u1)4); 6153 __ br(Assembler::LT, ShiftThree); 6154 6155 __ dup(shiftVCount, __ T4S, shiftCount); 6156 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 6157 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 6158 6159 __ BIND(ShiftSIMDLoop); 6160 6161 // load 4 words and process 6162 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 6163 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 6164 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 6165 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 6166 __ orr(newElem, __ T16B, oldElem0, oldElem1); 6167 __ st1(newElem, __ T4S, __ post(newArr, 16)); 6168 __ sub(numIter, numIter, 4); 6169 6170 __ cmp(numIter, (u1)4); 6171 __ br(Assembler::LT, ShiftTwoLoop); 6172 __ b(ShiftSIMDLoop); 6173 6174 __ BIND(ShiftTwoLoop); 6175 __ cbz(numIter, Exit); 6176 __ cmp(numIter, (u1)1); 6177 __ br(Assembler::EQ, ShiftOne); 6178 6179 // load 2 words and process 6180 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 6181 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 6182 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 6183 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 6184 __ orr(newElem, __ T8B, oldElem0, oldElem1); 6185 __ st1(newElem, __ T2S, __ post(newArr, 8)); 6186 __ sub(numIter, numIter, 2); 6187 __ b(ShiftTwoLoop); 6188 6189 __ BIND(ShiftThree); 6190 __ ldrw(r10, __ post(oldArr, 4)); 6191 __ ldrw(r11, __ post(oldArrNext, 4)); 6192 __ lslvw(r10, r10, shiftCount); 6193 __ lsrvw(r11, r11, shiftRevCount); 6194 __ orrw(r12, r10, r11); 6195 __ strw(r12, __ post(newArr, 4)); 6196 __ tbz(numIter, 1, Exit); 6197 __ tbz(numIter, 0, ShiftOne); 6198 6199 __ BIND(ShiftTwo); 6200 __ ldrw(r10, __ post(oldArr, 4)); 6201 __ ldrw(r11, __ post(oldArrNext, 4)); 6202 __ lslvw(r10, r10, shiftCount); 6203 __ lsrvw(r11, r11, shiftRevCount); 6204 __ orrw(r12, r10, r11); 6205 __ strw(r12, __ post(newArr, 4)); 6206 6207 __ BIND(ShiftOne); 6208 __ ldrw(r10, Address(oldArr)); 6209 __ ldrw(r11, Address(oldArrNext)); 6210 __ lslvw(r10, r10, shiftCount); 6211 __ lsrvw(r11, r11, shiftRevCount); 6212 __ orrw(r12, r10, r11); 6213 __ strw(r12, Address(newArr)); 6214 6215 __ BIND(Exit); 6216 __ ret(lr); 6217 6218 return start; 6219 } 6220 6221 address generate_count_positives(address &count_positives_long) { 6222 const u1 
large_loop_size = 64; 6223 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 6224 int dcache_line = VM_Version::dcache_line_size(); 6225 6226 Register ary1 = r1, len = r2, result = r0; 6227 6228 __ align(CodeEntryAlignment); 6229 6230 StubGenStubId stub_id = StubGenStubId::count_positives_id; 6231 StubCodeMark mark(this, stub_id); 6232 6233 address entry = __ pc(); 6234 6235 __ enter(); 6236 // precondition: a copy of len is already in result 6237 // __ mov(result, len); 6238 6239 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 6240 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 6241 6242 __ cmp(len, (u1)15); 6243 __ br(Assembler::GT, LEN_OVER_15); 6244 // The only case when execution falls into this code is when pointer is near 6245 // the end of memory page and we have to avoid reading next page 6246 __ add(ary1, ary1, len); 6247 __ subs(len, len, 8); 6248 __ br(Assembler::GT, LEN_OVER_8); 6249 __ ldr(rscratch2, Address(ary1, -8)); 6250 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 6251 __ lsrv(rscratch2, rscratch2, rscratch1); 6252 __ tst(rscratch2, UPPER_BIT_MASK); 6253 __ csel(result, zr, result, Assembler::NE); 6254 __ leave(); 6255 __ ret(lr); 6256 __ bind(LEN_OVER_8); 6257 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 6258 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 6259 __ tst(rscratch2, UPPER_BIT_MASK); 6260 __ br(Assembler::NE, RET_NO_POP); 6261 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 6262 __ lsrv(rscratch1, rscratch1, rscratch2); 6263 __ tst(rscratch1, UPPER_BIT_MASK); 6264 __ bind(RET_NO_POP); 6265 __ csel(result, zr, result, Assembler::NE); 6266 __ leave(); 6267 __ ret(lr); 6268 6269 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 6270 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 6271 6272 count_positives_long = __ pc(); // 2nd entry point 6273 6274 __ enter(); 6275 6276 __ bind(LEN_OVER_15); 6277 __ push(spilled_regs, sp); 6278 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 6279 __ cbz(rscratch2, ALIGNED); 6280 __ ldp(tmp6, tmp1, Address(ary1)); 6281 __ mov(tmp5, 16); 6282 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 6283 __ add(ary1, ary1, rscratch1); 6284 __ orr(tmp6, tmp6, tmp1); 6285 __ tst(tmp6, UPPER_BIT_MASK); 6286 __ br(Assembler::NE, RET_ADJUST); 6287 __ sub(len, len, rscratch1); 6288 6289 __ bind(ALIGNED); 6290 __ cmp(len, large_loop_size); 6291 __ br(Assembler::LT, CHECK_16); 6292 // Perform 16-byte load as early return in pre-loop to handle situation 6293 // when initially aligned large array has negative values at starting bytes, 6294 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 6295 // slower. Cases with negative bytes further ahead won't be affected that 6296 // much. In fact, it'll be faster due to early loads, less instructions and 6297 // less branches in LARGE_LOOP. 
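// The tst against UPPER_BIT_MASK used throughout this stub checks all 8 bytes of a
// 64-bit chunk at once: a byte is negative exactly when its top bit is set, so
//   (chunk & 0x8080808080808080) != 0
// is equivalent to the scalar check (illustrative sketch only)
//   for (int k = 0; k < 8; k++) { if ((bytes[k] & 0x80) != 0) { /* negative byte found */ } }
// which is why the code below only needs orr/tst chains rather than per-byte compares.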
6298 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 6299 __ sub(len, len, 16); 6300 __ orr(tmp6, tmp6, tmp1); 6301 __ tst(tmp6, UPPER_BIT_MASK); 6302 __ br(Assembler::NE, RET_ADJUST_16); 6303 __ cmp(len, large_loop_size); 6304 __ br(Assembler::LT, CHECK_16); 6305 6306 if (SoftwarePrefetchHintDistance >= 0 6307 && SoftwarePrefetchHintDistance >= dcache_line) { 6308 // initial prefetch 6309 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 6310 } 6311 __ bind(LARGE_LOOP); 6312 if (SoftwarePrefetchHintDistance >= 0) { 6313 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 6314 } 6315 // Issue load instructions first, since it can save few CPU/MEM cycles, also 6316 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 6317 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 6318 // instructions per cycle and have less branches, but this approach disables 6319 // early return, thus, all 64 bytes are loaded and checked every time. 6320 __ ldp(tmp2, tmp3, Address(ary1)); 6321 __ ldp(tmp4, tmp5, Address(ary1, 16)); 6322 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 6323 __ ldp(tmp6, tmp1, Address(ary1, 48)); 6324 __ add(ary1, ary1, large_loop_size); 6325 __ sub(len, len, large_loop_size); 6326 __ orr(tmp2, tmp2, tmp3); 6327 __ orr(tmp4, tmp4, tmp5); 6328 __ orr(rscratch1, rscratch1, rscratch2); 6329 __ orr(tmp6, tmp6, tmp1); 6330 __ orr(tmp2, tmp2, tmp4); 6331 __ orr(rscratch1, rscratch1, tmp6); 6332 __ orr(tmp2, tmp2, rscratch1); 6333 __ tst(tmp2, UPPER_BIT_MASK); 6334 __ br(Assembler::NE, RET_ADJUST_LONG); 6335 __ cmp(len, large_loop_size); 6336 __ br(Assembler::GE, LARGE_LOOP); 6337 6338 __ bind(CHECK_16); // small 16-byte load pre-loop 6339 __ cmp(len, (u1)16); 6340 __ br(Assembler::LT, POST_LOOP16); 6341 6342 __ bind(LOOP16); // small 16-byte load loop 6343 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 6344 __ sub(len, len, 16); 6345 __ orr(tmp2, tmp2, tmp3); 6346 __ tst(tmp2, UPPER_BIT_MASK); 6347 __ br(Assembler::NE, RET_ADJUST_16); 6348 __ cmp(len, (u1)16); 6349 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 6350 6351 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 6352 __ cmp(len, (u1)8); 6353 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 6354 __ ldr(tmp3, Address(__ post(ary1, 8))); 6355 __ tst(tmp3, UPPER_BIT_MASK); 6356 __ br(Assembler::NE, RET_ADJUST); 6357 __ sub(len, len, 8); 6358 6359 __ bind(POST_LOOP16_LOAD_TAIL); 6360 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 6361 __ ldr(tmp1, Address(ary1)); 6362 __ mov(tmp2, 64); 6363 __ sub(tmp4, tmp2, len, __ LSL, 3); 6364 __ lslv(tmp1, tmp1, tmp4); 6365 __ tst(tmp1, UPPER_BIT_MASK); 6366 __ br(Assembler::NE, RET_ADJUST); 6367 // Fallthrough 6368 6369 __ bind(RET_LEN); 6370 __ pop(spilled_regs, sp); 6371 __ leave(); 6372 __ ret(lr); 6373 6374 // difference result - len is the count of guaranteed to be 6375 // positive bytes 6376 6377 __ bind(RET_ADJUST_LONG); 6378 __ add(len, len, (u1)(large_loop_size - 16)); 6379 __ bind(RET_ADJUST_16); 6380 __ add(len, len, 16); 6381 __ bind(RET_ADJUST); 6382 __ pop(spilled_regs, sp); 6383 __ leave(); 6384 __ sub(result, result, len); 6385 __ ret(lr); 6386 6387 return entry; 6388 } 6389 6390 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 6391 bool usePrefetch, Label &NOT_EQUAL) { 6392 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 6393 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 6394 tmp7 = r12, tmp8 = r13; 6395 Label LOOP; 6396 
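// The loop below is software-pipelined: the two ldp's issued ahead of the LOOP label
// (and re-issued inside the loop body) fetch the next words while the eor/orr/cbnz
// chains test the words loaded previously; each full iteration checks 8 * wordSize
// bytes. The scalar equivalent of one such test is simply (sketch, not the exact
// Java/C source):
//   if ((a1[i] ^ a2[i]) != 0) goto NOT_EQUAL;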
6397 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 6398 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 6399 __ bind(LOOP); 6400 if (usePrefetch) { 6401 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 6402 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 6403 } 6404 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 6405 __ eor(tmp1, tmp1, tmp2); 6406 __ eor(tmp3, tmp3, tmp4); 6407 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 6408 __ orr(tmp1, tmp1, tmp3); 6409 __ cbnz(tmp1, NOT_EQUAL); 6410 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 6411 __ eor(tmp5, tmp5, tmp6); 6412 __ eor(tmp7, tmp7, tmp8); 6413 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 6414 __ orr(tmp5, tmp5, tmp7); 6415 __ cbnz(tmp5, NOT_EQUAL); 6416 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 6417 __ eor(tmp1, tmp1, tmp2); 6418 __ eor(tmp3, tmp3, tmp4); 6419 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 6420 __ orr(tmp1, tmp1, tmp3); 6421 __ cbnz(tmp1, NOT_EQUAL); 6422 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 6423 __ eor(tmp5, tmp5, tmp6); 6424 __ sub(cnt1, cnt1, 8 * wordSize); 6425 __ eor(tmp7, tmp7, tmp8); 6426 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 6427 // tmp6 is not used. MacroAssembler::subs is used here (rather than 6428 // cmp) because subs allows an unlimited range of immediate operand. 6429 __ subs(tmp6, cnt1, loopThreshold); 6430 __ orr(tmp5, tmp5, tmp7); 6431 __ cbnz(tmp5, NOT_EQUAL); 6432 __ br(__ GE, LOOP); 6433 // post-loop 6434 __ eor(tmp1, tmp1, tmp2); 6435 __ eor(tmp3, tmp3, tmp4); 6436 __ orr(tmp1, tmp1, tmp3); 6437 __ sub(cnt1, cnt1, 2 * wordSize); 6438 __ cbnz(tmp1, NOT_EQUAL); 6439 } 6440 6441 void generate_large_array_equals_loop_simd(int loopThreshold, 6442 bool usePrefetch, Label &NOT_EQUAL) { 6443 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 6444 tmp2 = rscratch2; 6445 Label LOOP; 6446 6447 __ bind(LOOP); 6448 if (usePrefetch) { 6449 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 6450 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 6451 } 6452 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 6453 __ sub(cnt1, cnt1, 8 * wordSize); 6454 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 6455 __ subs(tmp1, cnt1, loopThreshold); 6456 __ eor(v0, __ T16B, v0, v4); 6457 __ eor(v1, __ T16B, v1, v5); 6458 __ eor(v2, __ T16B, v2, v6); 6459 __ eor(v3, __ T16B, v3, v7); 6460 __ orr(v0, __ T16B, v0, v1); 6461 __ orr(v1, __ T16B, v2, v3); 6462 __ orr(v0, __ T16B, v0, v1); 6463 __ umov(tmp1, v0, __ D, 0); 6464 __ umov(tmp2, v0, __ D, 1); 6465 __ orr(tmp1, tmp1, tmp2); 6466 __ cbnz(tmp1, NOT_EQUAL); 6467 __ br(__ GE, LOOP); 6468 } 6469 6470 // a1 = r1 - array1 address 6471 // a2 = r2 - array2 address 6472 // result = r0 - return value. Already contains "false" 6473 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 6474 // r3-r5 are reserved temporary registers 6475 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 6476 address generate_large_array_equals() { 6477 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 6478 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 6479 tmp7 = r12, tmp8 = r13; 6480 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 6481 SMALL_LOOP, POST_LOOP; 6482 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 6483 // calculate if at least 32 prefetched bytes are used 6484 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 6485 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 6486 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 6487 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 6488 tmp5, tmp6, tmp7, tmp8); 6489 6490 __ align(CodeEntryAlignment); 6491 6492 StubGenStubId stub_id = StubGenStubId::large_array_equals_id; 6493 StubCodeMark mark(this, stub_id); 6494 6495 address entry = __ pc(); 6496 __ enter(); 6497 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 6498 // also advance pointers to use post-increment instead of pre-increment 6499 __ add(a1, a1, wordSize); 6500 __ add(a2, a2, wordSize); 6501 if (AvoidUnalignedAccesses) { 6502 // both implementations (SIMD/nonSIMD) are using relatively large load 6503 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 6504 // on some CPUs in case of address is not at least 16-byte aligned. 6505 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 6506 // load if needed at least for 1st address and make if 16-byte aligned. 6507 Label ALIGNED16; 6508 __ tbz(a1, 3, ALIGNED16); 6509 __ ldr(tmp1, Address(__ post(a1, wordSize))); 6510 __ ldr(tmp2, Address(__ post(a2, wordSize))); 6511 __ sub(cnt1, cnt1, wordSize); 6512 __ eor(tmp1, tmp1, tmp2); 6513 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 6514 __ bind(ALIGNED16); 6515 } 6516 if (UseSIMDForArrayEquals) { 6517 if (SoftwarePrefetchHintDistance >= 0) { 6518 __ subs(tmp1, cnt1, prefetchLoopThreshold); 6519 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 6520 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 6521 /* prfm = */ true, NOT_EQUAL); 6522 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 6523 __ br(__ LT, TAIL); 6524 } 6525 __ bind(NO_PREFETCH_LARGE_LOOP); 6526 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 6527 /* prfm = */ false, NOT_EQUAL); 6528 } else { 6529 __ push(spilled_regs, sp); 6530 if (SoftwarePrefetchHintDistance >= 0) { 6531 __ subs(tmp1, cnt1, prefetchLoopThreshold); 6532 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 6533 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 6534 /* prfm = */ true, NOT_EQUAL); 6535 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 6536 __ br(__ LT, TAIL); 6537 } 6538 __ bind(NO_PREFETCH_LARGE_LOOP); 6539 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 6540 /* prfm = */ false, NOT_EQUAL); 6541 } 6542 __ bind(TAIL); 6543 __ cbz(cnt1, EQUAL); 6544 __ subs(cnt1, cnt1, wordSize); 6545 __ br(__ LE, POST_LOOP); 6546 __ bind(SMALL_LOOP); 6547 __ ldr(tmp1, Address(__ post(a1, wordSize))); 6548 __ ldr(tmp2, Address(__ post(a2, wordSize))); 6549 __ subs(cnt1, cnt1, wordSize); 6550 __ eor(tmp1, tmp1, tmp2); 6551 __ cbnz(tmp1, NOT_EQUAL); 6552 __ br(__ GT, SMALL_LOOP); 6553 __ bind(POST_LOOP); 6554 __ ldr(tmp1, Address(a1, cnt1)); 6555 __ ldr(tmp2, Address(a2, cnt1)); 6556 __ eor(tmp1, tmp1, tmp2); 6557 __ cbnz(tmp1, NOT_EQUAL); 6558 __ bind(EQUAL); 6559 __ mov(result, true); 6560 __ bind(NOT_EQUAL); 6561 if (!UseSIMDForArrayEquals) { 6562 __ pop(spilled_regs, sp); 6563 } 6564 __ bind(NOT_EQUAL_NO_POP); 6565 __ leave(); 6566 __ ret(lr); 6567 return entry; 6568 } 6569 6570 // result = r0 - return value. Contains initial hashcode value on entry. 
6571 // ary = r1 - array address 6572 // cnt = r2 - elements count 6573 // Clobbers: v0-v13, rscratch1, rscratch2 6574 address generate_large_arrays_hashcode(BasicType eltype) { 6575 const Register result = r0, ary = r1, cnt = r2; 6576 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 6577 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 6578 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 6579 const FloatRegister vpowm = v13; 6580 6581 ARRAYS_HASHCODE_REGISTERS; 6582 6583 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 6584 6585 unsigned int vf; // vectorization factor 6586 bool multiply_by_halves; 6587 Assembler::SIMD_Arrangement load_arrangement; 6588 switch (eltype) { 6589 case T_BOOLEAN: 6590 case T_BYTE: 6591 load_arrangement = Assembler::T8B; 6592 multiply_by_halves = true; 6593 vf = 8; 6594 break; 6595 case T_CHAR: 6596 case T_SHORT: 6597 load_arrangement = Assembler::T8H; 6598 multiply_by_halves = true; 6599 vf = 8; 6600 break; 6601 case T_INT: 6602 load_arrangement = Assembler::T4S; 6603 multiply_by_halves = false; 6604 vf = 4; 6605 break; 6606 default: 6607 ShouldNotReachHere(); 6608 } 6609 6610 // Unroll factor 6611 const unsigned uf = 4; 6612 6613 // Effective vectorization factor 6614 const unsigned evf = vf * uf; 6615 6616 __ align(CodeEntryAlignment); 6617 6618 StubGenStubId stub_id; 6619 switch (eltype) { 6620 case T_BOOLEAN: 6621 stub_id = StubGenStubId::large_arrays_hashcode_boolean_id; 6622 break; 6623 case T_BYTE: 6624 stub_id = StubGenStubId::large_arrays_hashcode_byte_id; 6625 break; 6626 case T_CHAR: 6627 stub_id = StubGenStubId::large_arrays_hashcode_char_id; 6628 break; 6629 case T_SHORT: 6630 stub_id = StubGenStubId::large_arrays_hashcode_short_id; 6631 break; 6632 case T_INT: 6633 stub_id = StubGenStubId::large_arrays_hashcode_int_id; 6634 break; 6635 default: 6636 stub_id = StubGenStubId::NO_STUBID; 6637 ShouldNotReachHere(); 6638 }; 6639 6640 StubCodeMark mark(this, stub_id); 6641 6642 address entry = __ pc(); 6643 __ enter(); 6644 6645 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 6646 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 6647 // value shouldn't change throughout both loops. 6648 __ movw(rscratch1, intpow(31U, 3)); 6649 __ mov(vpow, Assembler::S, 0, rscratch1); 6650 __ movw(rscratch1, intpow(31U, 2)); 6651 __ mov(vpow, Assembler::S, 1, rscratch1); 6652 __ movw(rscratch1, intpow(31U, 1)); 6653 __ mov(vpow, Assembler::S, 2, rscratch1); 6654 __ movw(rscratch1, intpow(31U, 0)); 6655 __ mov(vpow, Assembler::S, 3, rscratch1); 6656 6657 __ mov(vmul0, Assembler::T16B, 0); 6658 __ mov(vmul0, Assembler::S, 3, result); 6659 6660 __ andr(rscratch2, cnt, (uf - 1) * vf); 6661 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 6662 6663 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf)); 6664 __ mov(vpowm, Assembler::S, 0, rscratch1); 6665 6666 // SMALL LOOP 6667 __ bind(SMALL_LOOP); 6668 6669 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype)))); 6670 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 6671 __ subsw(rscratch2, rscratch2, vf); 6672 6673 if (load_arrangement == Assembler::T8B) { 6674 // Extend 8B to 8H to be able to use vector multiply 6675 // instructions 6676 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 6677 if (is_signed_subword_type(eltype)) { 6678 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6679 } else { 6680 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6681 } 6682 } 6683 6684 switch (load_arrangement) { 6685 case Assembler::T4S: 6686 __ addv(vmul0, load_arrangement, vmul0, vdata0); 6687 break; 6688 case Assembler::T8B: 6689 case Assembler::T8H: 6690 assert(is_subword_type(eltype), "subword type expected"); 6691 if (is_signed_subword_type(eltype)) { 6692 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6693 } else { 6694 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6695 } 6696 break; 6697 default: 6698 __ should_not_reach_here(); 6699 } 6700 6701 // Process the upper half of a vector 6702 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 6703 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 6704 if (is_signed_subword_type(eltype)) { 6705 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6706 } else { 6707 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6708 } 6709 } 6710 6711 __ br(Assembler::HI, SMALL_LOOP); 6712 6713 // SMALL LOOP'S EPILOQUE 6714 __ lsr(rscratch2, cnt, exact_log2(evf)); 6715 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER); 6716 6717 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 6718 __ addv(vmul0, Assembler::T4S, vmul0); 6719 __ umov(result, vmul0, Assembler::S, 0); 6720 6721 // TAIL 6722 __ bind(TAIL); 6723 6724 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs 6725 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs. 
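  // Worked example (vf == 8): for cnt % vf == 3, rscratch2 == 3 and we branch to BR_BASE - 3 * 8 bytes, i.e. into the last 3 of the vf - 1 unrolled load + maddw pairs below (each pair is two 4-byte instructions, hence the shift by 3). The movw below reloads rscratch2 with 31 (0x1f), the multiplier used by every maddw.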
6726 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 6727 __ andr(rscratch2, cnt, vf - 1); 6728 __ bind(TAIL_SHORTCUT); 6729 __ adr(rscratch1, BR_BASE); 6730 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3); 6731 __ movw(rscratch2, 0x1f); 6732 __ br(rscratch1); 6733 6734 for (size_t i = 0; i < vf - 1; ++i) { 6735 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 6736 eltype); 6737 __ maddw(result, result, rscratch2, rscratch1); 6738 } 6739 __ bind(BR_BASE); 6740 6741 __ leave(); 6742 __ ret(lr); 6743 6744 // LARGE LOOP 6745 __ bind(LARGE_LOOP_PREHEADER); 6746 6747 __ lsr(rscratch2, cnt, exact_log2(evf)); 6748 6749 if (multiply_by_halves) { 6750 // 31^4 - multiplier between lower and upper parts of a register 6751 __ movw(rscratch1, intpow(31U, vf / 2)); 6752 __ mov(vpowm, Assembler::S, 1, rscratch1); 6753 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 6754 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 6755 __ mov(vpowm, Assembler::S, 0, rscratch1); 6756 } else { 6757 // 31^16 6758 __ movw(rscratch1, intpow(31U, evf)); 6759 __ mov(vpowm, Assembler::S, 0, rscratch1); 6760 } 6761 6762 __ mov(vmul3, Assembler::T16B, 0); 6763 __ mov(vmul2, Assembler::T16B, 0); 6764 __ mov(vmul1, Assembler::T16B, 0); 6765 6766 __ bind(LARGE_LOOP); 6767 6768 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 6769 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 6770 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 6771 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 6772 6773 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 6774 Address(__ post(ary, evf * type2aelembytes(eltype)))); 6775 6776 if (load_arrangement == Assembler::T8B) { 6777 // Extend 8B to 8H to be able to use vector multiply 6778 // instructions 6779 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 6780 if (is_signed_subword_type(eltype)) { 6781 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 6782 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 6783 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 6784 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6785 } else { 6786 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 6787 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 6788 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 6789 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6790 } 6791 } 6792 6793 switch (load_arrangement) { 6794 case Assembler::T4S: 6795 __ addv(vmul3, load_arrangement, vmul3, vdata3); 6796 __ addv(vmul2, load_arrangement, vmul2, vdata2); 6797 __ addv(vmul1, load_arrangement, vmul1, vdata1); 6798 __ addv(vmul0, load_arrangement, vmul0, vdata0); 6799 break; 6800 case Assembler::T8B: 6801 case Assembler::T8H: 6802 assert(is_subword_type(eltype), "subword type expected"); 6803 if (is_signed_subword_type(eltype)) { 6804 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 6805 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 6806 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 6807 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6808 } else { 6809 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 6810 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 6811 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 6812 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6813 } 6814 break; 6815 default: 6816 __ should_not_reach_here(); 
6817 } 6818 6819 // Process the upper half of a vector 6820 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 6821 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 6822 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 6823 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 6824 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 6825 if (is_signed_subword_type(eltype)) { 6826 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 6827 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 6828 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 6829 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6830 } else { 6831 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 6832 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 6833 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 6834 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6835 } 6836 } 6837 6838 __ subsw(rscratch2, rscratch2, 1); 6839 __ br(Assembler::HI, LARGE_LOOP); 6840 6841 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 6842 __ addv(vmul3, Assembler::T4S, vmul3); 6843 __ umov(result, vmul3, Assembler::S, 0); 6844 6845 __ mov(rscratch2, intpow(31U, vf)); 6846 6847 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 6848 __ addv(vmul2, Assembler::T4S, vmul2); 6849 __ umov(rscratch1, vmul2, Assembler::S, 0); 6850 __ maddw(result, result, rscratch2, rscratch1); 6851 6852 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 6853 __ addv(vmul1, Assembler::T4S, vmul1); 6854 __ umov(rscratch1, vmul1, Assembler::S, 0); 6855 __ maddw(result, result, rscratch2, rscratch1); 6856 6857 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 6858 __ addv(vmul0, Assembler::T4S, vmul0); 6859 __ umov(rscratch1, vmul0, Assembler::S, 0); 6860 __ maddw(result, result, rscratch2, rscratch1); 6861 6862 __ andr(rscratch2, cnt, vf - 1); 6863 __ cbnz(rscratch2, TAIL_SHORTCUT); 6864 6865 __ leave(); 6866 __ ret(lr); 6867 6868 return entry; 6869 } 6870 6871 address generate_dsin_dcos(bool isCos) { 6872 __ align(CodeEntryAlignment); 6873 StubGenStubId stub_id = (isCos ? 
StubGenStubId::dcos_id : StubGenStubId::dsin_id); 6874 StubCodeMark mark(this, stub_id); 6875 address start = __ pc(); 6876 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 6877 (address)StubRoutines::aarch64::_two_over_pi, 6878 (address)StubRoutines::aarch64::_pio2, 6879 (address)StubRoutines::aarch64::_dsin_coef, 6880 (address)StubRoutines::aarch64::_dcos_coef); 6881 return start; 6882 } 6883 6884 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 6885 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 6886 Label &DIFF2) { 6887 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 6888 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 6889 6890 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 6891 __ ldr(tmpU, Address(__ post(cnt1, 8))); 6892 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 6893 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 6894 6895 __ fmovd(tmpL, vtmp3); 6896 __ eor(rscratch2, tmp3, tmpL); 6897 __ cbnz(rscratch2, DIFF2); 6898 6899 __ ldr(tmp3, Address(__ post(cnt1, 8))); 6900 __ umov(tmpL, vtmp3, __ D, 1); 6901 __ eor(rscratch2, tmpU, tmpL); 6902 __ cbnz(rscratch2, DIFF1); 6903 6904 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 6905 __ ldr(tmpU, Address(__ post(cnt1, 8))); 6906 __ fmovd(tmpL, vtmp); 6907 __ eor(rscratch2, tmp3, tmpL); 6908 __ cbnz(rscratch2, DIFF2); 6909 6910 __ ldr(tmp3, Address(__ post(cnt1, 8))); 6911 __ umov(tmpL, vtmp, __ D, 1); 6912 __ eor(rscratch2, tmpU, tmpL); 6913 __ cbnz(rscratch2, DIFF1); 6914 } 6915 6916 // r0 = result 6917 // r1 = str1 6918 // r2 = cnt1 6919 // r3 = str2 6920 // r4 = cnt2 6921 // r10 = tmp1 6922 // r11 = tmp2 6923 address generate_compare_long_string_different_encoding(bool isLU) { 6924 __ align(CodeEntryAlignment); 6925 StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id); 6926 StubCodeMark mark(this, stub_id); 6927 address entry = __ pc(); 6928 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 6929 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 6930 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 6931 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 6932 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 6933 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 6934 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 6935 6936 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 6937 6938 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 6939 // cnt2 == amount of characters left to compare 6940 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 6941 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 6942 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 6943 __ add(str2, str2, isLU ? wordSize : wordSize/2); 6944 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 6945 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 6946 __ eor(rscratch2, tmp1, tmp2); 6947 __ mov(rscratch1, tmp2); 6948 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 6949 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 6950 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 6951 __ push(spilled_regs, sp); 6952 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 6953 __ mov(cnt1, isLU ? 
str2 : str1); // init the pointer to U next load 6954 6955 __ ldr(tmp3, Address(__ post(cnt1, 8))); 6956 6957 if (SoftwarePrefetchHintDistance >= 0) { 6958 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 6959 __ br(__ LT, NO_PREFETCH); 6960 __ bind(LARGE_LOOP_PREFETCH); 6961 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 6962 __ mov(tmp4, 2); 6963 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 6964 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 6965 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 6966 __ subs(tmp4, tmp4, 1); 6967 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 6968 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 6969 __ mov(tmp4, 2); 6970 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 6971 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 6972 __ subs(tmp4, tmp4, 1); 6973 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 6974 __ sub(cnt2, cnt2, 64); 6975 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 6976 __ br(__ GE, LARGE_LOOP_PREFETCH); 6977 } 6978 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 6979 __ bind(NO_PREFETCH); 6980 __ subs(cnt2, cnt2, 16); 6981 __ br(__ LT, TAIL); 6982 __ align(OptoLoopAlignment); 6983 __ bind(SMALL_LOOP); // smaller loop 6984 __ subs(cnt2, cnt2, 16); 6985 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 6986 __ br(__ GE, SMALL_LOOP); 6987 __ cmn(cnt2, (u1)16); 6988 __ br(__ EQ, LOAD_LAST); 6989 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 6990 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 6991 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 6992 __ ldr(tmp3, Address(cnt1, -8)); 6993 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 6994 __ b(LOAD_LAST); 6995 __ bind(DIFF2); 6996 __ mov(tmpU, tmp3); 6997 __ bind(DIFF1); 6998 __ pop(spilled_regs, sp); 6999 __ b(CALCULATE_DIFFERENCE); 7000 __ bind(LOAD_LAST); 7001 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 7002 // No need to load it again 7003 __ mov(tmpU, tmp3); 7004 __ pop(spilled_regs, sp); 7005 7006 // tmp2 points to the address of the last 4 Latin1 characters right now 7007 __ ldrs(vtmp, Address(tmp2)); 7008 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 7009 __ fmovd(tmpL, vtmp); 7010 7011 __ eor(rscratch2, tmpU, tmpL); 7012 __ cbz(rscratch2, DONE); 7013 7014 // Find the first different characters in the longwords and 7015 // compute their difference. 
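  // Here rscratch2 holds the XOR of the two 8-byte chunks being compared (both already widened to UTF-16). rev + clz locate the lowest (first in memory) non-zero byte of that XOR; rounding the bit index down to a multiple of 16 gives the offset of the first differing character, which the lsrv/uxthw pairs below extract from each operand before subtracting.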
7016 __ bind(CALCULATE_DIFFERENCE); 7017 __ rev(rscratch2, rscratch2); 7018 __ clz(rscratch2, rscratch2); 7019 __ andr(rscratch2, rscratch2, -16); 7020 __ lsrv(tmp1, tmp1, rscratch2); 7021 __ uxthw(tmp1, tmp1); 7022 __ lsrv(rscratch1, rscratch1, rscratch2); 7023 __ uxthw(rscratch1, rscratch1); 7024 __ subw(result, tmp1, rscratch1); 7025 __ bind(DONE); 7026 __ ret(lr); 7027 return entry; 7028 } 7029 7030 // r0 = input (float16) 7031 // v0 = result (float) 7032 // v1 = temporary float register 7033 address generate_float16ToFloat() { 7034 __ align(CodeEntryAlignment); 7035 StubGenStubId stub_id = StubGenStubId::hf2f_id; 7036 StubCodeMark mark(this, stub_id); 7037 address entry = __ pc(); 7038 BLOCK_COMMENT("Entry:"); 7039 __ flt16_to_flt(v0, r0, v1); 7040 __ ret(lr); 7041 return entry; 7042 } 7043 7044 // v0 = input (float) 7045 // r0 = result (float16) 7046 // v1 = temporary float register 7047 address generate_floatToFloat16() { 7048 __ align(CodeEntryAlignment); 7049 StubGenStubId stub_id = StubGenStubId::f2hf_id; 7050 StubCodeMark mark(this, stub_id); 7051 address entry = __ pc(); 7052 BLOCK_COMMENT("Entry:"); 7053 __ flt_to_flt16(r0, v0, v1); 7054 __ ret(lr); 7055 return entry; 7056 } 7057 7058 address generate_method_entry_barrier() { 7059 __ align(CodeEntryAlignment); 7060 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 7061 StubCodeMark mark(this, stub_id); 7062 7063 Label deoptimize_label; 7064 7065 address start = __ pc(); 7066 7067 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 7068 7069 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 7070 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 7071 // We can get here despite the nmethod being good, if we have not 7072 // yet applied our cross modification fence (or data fence). 7073 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 7074 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 7075 __ ldrw(rscratch2, rscratch2); 7076 __ strw(rscratch2, thread_epoch_addr); 7077 __ isb(); 7078 __ membar(__ LoadLoad); 7079 } 7080 7081 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 7082 7083 __ enter(); 7084 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 7085 7086 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 7087 7088 __ push_call_clobbered_registers(); 7089 7090 __ mov(c_rarg0, rscratch2); 7091 __ call_VM_leaf 7092 (CAST_FROM_FN_PTR 7093 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 7094 7095 __ reset_last_Java_frame(true); 7096 7097 __ mov(rscratch1, r0); 7098 7099 __ pop_call_clobbered_registers(); 7100 7101 __ cbnz(rscratch1, deoptimize_label); 7102 7103 __ leave(); 7104 __ ret(lr); 7105 7106 __ BIND(deoptimize_label); 7107 7108 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 7109 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 7110 7111 __ mov(sp, rscratch1); 7112 __ br(rscratch2); 7113 7114 return start; 7115 } 7116 7117 // r0 = result 7118 // r1 = str1 7119 // r2 = cnt1 7120 // r3 = str2 7121 // r4 = cnt2 7122 // r10 = tmp1 7123 // r11 = tmp2 7124 address generate_compare_long_string_same_encoding(bool isLL) { 7125 __ align(CodeEntryAlignment); 7126 StubGenStubId stub_id = (isLL ? 
StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id); 7127 StubCodeMark mark(this, stub_id); 7128 address entry = __ pc(); 7129 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 7130 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 7131 7132 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 7133 7134 // exit from large loop when less than 64 bytes left to read or we're about 7135 // to prefetch memory behind array border 7136 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 7137 7138 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 7139 __ eor(rscratch2, tmp1, tmp2); 7140 __ cbnz(rscratch2, CAL_DIFFERENCE); 7141 7142 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 7143 // update pointers, because of previous read 7144 __ add(str1, str1, wordSize); 7145 __ add(str2, str2, wordSize); 7146 if (SoftwarePrefetchHintDistance >= 0) { 7147 __ align(OptoLoopAlignment); 7148 __ bind(LARGE_LOOP_PREFETCH); 7149 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 7150 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 7151 7152 for (int i = 0; i < 4; i++) { 7153 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 7154 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 7155 __ cmp(tmp1, tmp2); 7156 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 7157 __ br(Assembler::NE, DIFF); 7158 } 7159 __ sub(cnt2, cnt2, isLL ? 64 : 32); 7160 __ add(str1, str1, 64); 7161 __ add(str2, str2, 64); 7162 __ subs(rscratch2, cnt2, largeLoopExitCondition); 7163 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 7164 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 7165 } 7166 7167 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 7168 __ br(Assembler::LE, LESS16); 7169 __ align(OptoLoopAlignment); 7170 __ bind(LOOP_COMPARE16); 7171 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 7172 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 7173 __ cmp(tmp1, tmp2); 7174 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 7175 __ br(Assembler::NE, DIFF); 7176 __ sub(cnt2, cnt2, isLL ? 16 : 8); 7177 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 7178 __ br(Assembler::LT, LESS16); 7179 7180 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 7181 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 7182 __ cmp(tmp1, tmp2); 7183 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 7184 __ br(Assembler::NE, DIFF); 7185 __ sub(cnt2, cnt2, isLL ? 16 : 8); 7186 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 7187 __ br(Assembler::GE, LOOP_COMPARE16); 7188 __ cbz(cnt2, LENGTH_DIFF); 7189 7190 __ bind(LESS16); 7191 // each 8 compare 7192 __ subs(cnt2, cnt2, isLL ? 8 : 4); 7193 __ br(Assembler::LE, LESS8); 7194 __ ldr(tmp1, Address(__ post(str1, 8))); 7195 __ ldr(tmp2, Address(__ post(str2, 8))); 7196 __ eor(rscratch2, tmp1, tmp2); 7197 __ cbnz(rscratch2, CAL_DIFFERENCE); 7198 __ sub(cnt2, cnt2, isLL ? 
8 : 4); 7199 7200 __ bind(LESS8); // directly load last 8 bytes 7201 if (!isLL) { 7202 __ add(cnt2, cnt2, cnt2); 7203 } 7204 __ ldr(tmp1, Address(str1, cnt2)); 7205 __ ldr(tmp2, Address(str2, cnt2)); 7206 __ eor(rscratch2, tmp1, tmp2); 7207 __ cbz(rscratch2, LENGTH_DIFF); 7208 __ b(CAL_DIFFERENCE); 7209 7210 __ bind(DIFF); 7211 __ cmp(tmp1, tmp2); 7212 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 7213 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 7214 // reuse rscratch2 register for the result of eor instruction 7215 __ eor(rscratch2, tmp1, tmp2); 7216 7217 __ bind(CAL_DIFFERENCE); 7218 __ rev(rscratch2, rscratch2); 7219 __ clz(rscratch2, rscratch2); 7220 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 7221 __ lsrv(tmp1, tmp1, rscratch2); 7222 __ lsrv(tmp2, tmp2, rscratch2); 7223 if (isLL) { 7224 __ uxtbw(tmp1, tmp1); 7225 __ uxtbw(tmp2, tmp2); 7226 } else { 7227 __ uxthw(tmp1, tmp1); 7228 __ uxthw(tmp2, tmp2); 7229 } 7230 __ subw(result, tmp1, tmp2); 7231 7232 __ bind(LENGTH_DIFF); 7233 __ ret(lr); 7234 return entry; 7235 } 7236 7237 enum string_compare_mode { 7238 LL, 7239 LU, 7240 UL, 7241 UU, 7242 }; 7243 7244 // The following registers are declared in aarch64.ad 7245 // r0 = result 7246 // r1 = str1 7247 // r2 = cnt1 7248 // r3 = str2 7249 // r4 = cnt2 7250 // r10 = tmp1 7251 // r11 = tmp2 7252 // z0 = ztmp1 7253 // z1 = ztmp2 7254 // p0 = pgtmp1 7255 // p1 = pgtmp2 7256 address generate_compare_long_string_sve(string_compare_mode mode) { 7257 StubGenStubId stub_id; 7258 switch (mode) { 7259 case LL: stub_id = StubGenStubId::compare_long_string_LL_id; break; 7260 case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break; 7261 case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break; 7262 case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break; 7263 default: ShouldNotReachHere(); 7264 } 7265 7266 __ align(CodeEntryAlignment); 7267 address entry = __ pc(); 7268 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 7269 tmp1 = r10, tmp2 = r11; 7270 7271 Label LOOP, DONE, MISMATCH; 7272 Register vec_len = tmp1; 7273 Register idx = tmp2; 7274 // The minimum of the string lengths has been stored in cnt2. 7275 Register cnt = cnt2; 7276 FloatRegister ztmp1 = z0, ztmp2 = z1; 7277 PRegister pgtmp1 = p0, pgtmp2 = p1; 7278 7279 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 7280 switch (mode) { \ 7281 case LL: \ 7282 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 7283 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 7284 break; \ 7285 case LU: \ 7286 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 7287 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 7288 break; \ 7289 case UL: \ 7290 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 7291 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 7292 break; \ 7293 case UU: \ 7294 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 7295 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 7296 break; \ 7297 default: \ 7298 ShouldNotReachHere(); \ 7299 } 7300 7301 StubCodeMark mark(this, stub_id); 7302 7303 __ mov(idx, 0); 7304 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 7305 7306 if (mode == LL) { 7307 __ sve_cntb(vec_len); 7308 } else { 7309 __ sve_cnth(vec_len); 7310 } 7311 7312 __ sub(rscratch1, cnt, vec_len); 7313 7314 __ bind(LOOP); 7315 7316 // main loop 7317 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 7318 __ add(idx, idx, vec_len); 7319 // Compare strings. 
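  // Under governing predicate pgtmp1, sve_cmp sets a lane of pgtmp2 for every element that differs and updates the flags, so NE (the SVE "ANY" condition) is taken as soon as at least one active lane mismatches.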
7320 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 7321 __ br(__ NE, MISMATCH); 7322 __ cmp(idx, rscratch1); 7323 __ br(__ LT, LOOP); 7324 7325 // post loop, last iteration 7326 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 7327 7328 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 7329 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 7330 __ br(__ EQ, DONE); 7331 7332 __ bind(MISMATCH); 7333 7334 // Crop the vector to find its location. 7335 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 7336 // Extract the first different characters of each string. 7337 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 7338 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 7339 7340 // Compute the difference of the first different characters. 7341 __ sub(result, rscratch1, rscratch2); 7342 7343 __ bind(DONE); 7344 __ ret(lr); 7345 #undef LOAD_PAIR 7346 return entry; 7347 } 7348 7349 void generate_compare_long_strings() { 7350 if (UseSVE == 0) { 7351 StubRoutines::aarch64::_compare_long_string_LL 7352 = generate_compare_long_string_same_encoding(true); 7353 StubRoutines::aarch64::_compare_long_string_UU 7354 = generate_compare_long_string_same_encoding(false); 7355 StubRoutines::aarch64::_compare_long_string_LU 7356 = generate_compare_long_string_different_encoding(true); 7357 StubRoutines::aarch64::_compare_long_string_UL 7358 = generate_compare_long_string_different_encoding(false); 7359 } else { 7360 StubRoutines::aarch64::_compare_long_string_LL 7361 = generate_compare_long_string_sve(LL); 7362 StubRoutines::aarch64::_compare_long_string_UU 7363 = generate_compare_long_string_sve(UU); 7364 StubRoutines::aarch64::_compare_long_string_LU 7365 = generate_compare_long_string_sve(LU); 7366 StubRoutines::aarch64::_compare_long_string_UL 7367 = generate_compare_long_string_sve(UL); 7368 } 7369 } 7370 7371 // R0 = result 7372 // R1 = str2 7373 // R2 = cnt1 7374 // R3 = str1 7375 // R4 = cnt2 7376 // Clobbers: rscratch1, rscratch2, v0, v1, rflags 7377 // 7378 // This generic linear code uses a few additional ideas which make it faster: 7379 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8) 7380 // in order to skip the initial load (helps on systems with 1 ld pipeline) 7381 // 2) we can use a "fast" algorithm of finding a single character to search for the 7382 // first symbol with fewer branches (1 branch per loaded register instead 7383 // of a branch per symbol); this is where constants like 7384 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from 7385 // 3) after loading and analyzing the 1st register of the source string, it can be 7386 // reused to search for every 1st-character entry, saving a few loads in 7387 // comparison with a "simpler-but-slower" implementation 7388 // 4) in order to avoid lots of push/pop operations, the code below heavily 7389 // re-uses/re-initializes/compresses register values, which makes the code 7390 // larger and a bit less readable; however, most of the extra operations are 7391 // issued during loads or branches, so the penalty is minimal 7392 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 7393 StubGenStubId stub_id; 7394 if (str1_isL) { 7395 if (str2_isL) { 7396 stub_id = StubGenStubId::string_indexof_linear_ll_id; 7397 } else { 7398 stub_id = StubGenStubId::string_indexof_linear_ul_id; 7399 } 7400 } else { 7401 if (str2_isL) { 7402 ShouldNotReachHere(); 7403 } else { 7404 stub_id =
StubGenStubId::string_indexof_linear_uu_id; 7405 } 7406 } 7407 __ align(CodeEntryAlignment); 7408 StubCodeMark mark(this, stub_id); 7409 address entry = __ pc(); 7410 7411 int str1_chr_size = str1_isL ? 1 : 2; 7412 int str2_chr_size = str2_isL ? 1 : 2; 7413 int str1_chr_shift = str1_isL ? 0 : 1; 7414 int str2_chr_shift = str2_isL ? 0 : 1; 7415 bool isL = str1_isL && str2_isL; 7416 // parameters 7417 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 7418 // temporary registers 7419 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 7420 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 7421 // redefinitions 7422 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 7423 7424 __ push(spilled_regs, sp); 7425 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 7426 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 7427 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 7428 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 7429 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 7430 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 7431 // Read whole register from str1. It is safe, because length >=8 here 7432 __ ldr(ch1, Address(str1)); 7433 // Read whole register from str2. It is safe, because length >=8 here 7434 __ ldr(ch2, Address(str2)); 7435 __ sub(cnt2, cnt2, cnt1); 7436 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 7437 if (str1_isL != str2_isL) { 7438 __ eor(v0, __ T16B, v0, v0); 7439 } 7440 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 7441 __ mul(first, first, tmp1); 7442 // check if we have less than 1 register to check 7443 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 7444 if (str1_isL != str2_isL) { 7445 __ fmovd(v1, ch1); 7446 } 7447 __ br(__ LE, L_SMALL); 7448 __ eor(ch2, first, ch2); 7449 if (str1_isL != str2_isL) { 7450 __ zip1(v1, __ T16B, v1, v0); 7451 } 7452 __ sub(tmp2, ch2, tmp1); 7453 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7454 __ bics(tmp2, tmp2, ch2); 7455 if (str1_isL != str2_isL) { 7456 __ fmovd(ch1, v1); 7457 } 7458 __ br(__ NE, L_HAS_ZERO); 7459 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 7460 __ add(result, result, wordSize/str2_chr_size); 7461 __ add(str2, str2, wordSize); 7462 __ br(__ LT, L_POST_LOOP); 7463 __ BIND(L_LOOP); 7464 __ ldr(ch2, Address(str2)); 7465 __ eor(ch2, first, ch2); 7466 __ sub(tmp2, ch2, tmp1); 7467 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7468 __ bics(tmp2, tmp2, ch2); 7469 __ br(__ NE, L_HAS_ZERO); 7470 __ BIND(L_LOOP_PROCEED); 7471 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 7472 __ add(str2, str2, wordSize); 7473 __ add(result, result, wordSize/str2_chr_size); 7474 __ br(__ GE, L_LOOP); 7475 __ BIND(L_POST_LOOP); 7476 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 7477 __ br(__ LE, NOMATCH); 7478 __ ldr(ch2, Address(str2)); 7479 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 7480 __ eor(ch2, first, ch2); 7481 __ sub(tmp2, ch2, tmp1); 7482 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7483 __ mov(tmp4, -1); // all bits set 7484 __ b(L_SMALL_PROCEED); 7485 __ align(OptoLoopAlignment); 7486 __ BIND(L_SMALL); 7487 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 7488 __ eor(ch2, first, ch2); 7489 if (str1_isL != str2_isL) { 7490 __ zip1(v1, __ T16B, v1, v0); 7491 } 7492 __ sub(tmp2, ch2, tmp1); 7493 __ mov(tmp4, -1); // all bits set 7494 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7495 if (str1_isL != str2_isL) { 7496 __ fmovd(ch1, v1); // move converted 4 symbols 7497 } 7498 __ BIND(L_SMALL_PROCEED); 7499 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 7500 __ bic(tmp2, tmp2, ch2); 7501 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 7502 __ rbit(tmp2, tmp2); 7503 __ br(__ EQ, NOMATCH); 7504 __ BIND(L_SMALL_HAS_ZERO_LOOP); 7505 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 7506 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 7507 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 7508 if (str2_isL) { // LL 7509 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 7510 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 7511 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 7512 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 7513 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7514 } else { 7515 __ mov(ch2, 0xE); // all bits in byte set except last one 7516 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7517 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7518 __ lslv(tmp2, tmp2, tmp4); 7519 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7520 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7521 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7522 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7523 } 7524 __ cmp(ch1, ch2); 7525 __ mov(tmp4, wordSize/str2_chr_size); 7526 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 7527 __ BIND(L_SMALL_CMP_LOOP); 7528 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 7529 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 7530 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 7531 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 7532 __ add(tmp4, tmp4, 1); 7533 __ cmp(tmp4, cnt1); 7534 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 7535 __ cmp(first, ch2); 7536 __ br(__ EQ, L_SMALL_CMP_LOOP); 7537 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 7538 __ cbz(tmp2, NOMATCH); // no more matches. exit 7539 __ clz(tmp4, tmp2); 7540 __ add(result, result, 1); // advance index 7541 __ add(str2, str2, str2_chr_size); // advance pointer 7542 __ b(L_SMALL_HAS_ZERO_LOOP); 7543 __ align(OptoLoopAlignment); 7544 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 7545 __ cmp(first, ch2); 7546 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 7547 __ b(DONE); 7548 __ align(OptoLoopAlignment); 7549 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 7550 if (str2_isL) { // LL 7551 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 7552 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 7553 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 7554 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 7555 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7556 } else { 7557 __ mov(ch2, 0xE); // all bits in byte set except last one 7558 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7559 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
7560 __ lslv(tmp2, tmp2, tmp4); 7561 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7562 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7563 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7564 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7565 } 7566 __ cmp(ch1, ch2); 7567 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 7568 __ b(DONE); 7569 __ align(OptoLoopAlignment); 7570 __ BIND(L_HAS_ZERO); 7571 __ rbit(tmp2, tmp2); 7572 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 7573 // Now, perform compression of counters(cnt2 and cnt1) into one register. 7574 // It's fine because both counters are 32bit and are not changed in this 7575 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 7576 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 7577 __ sub(result, result, 1); 7578 __ BIND(L_HAS_ZERO_LOOP); 7579 __ mov(cnt1, wordSize/str2_chr_size); 7580 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 7581 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 7582 if (str2_isL) { 7583 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 7584 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7585 __ lslv(tmp2, tmp2, tmp4); 7586 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7587 __ add(tmp4, tmp4, 1); 7588 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7589 __ lsl(tmp2, tmp2, 1); 7590 __ mov(tmp4, wordSize/str2_chr_size); 7591 } else { 7592 __ mov(ch2, 0xE); 7593 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7594 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7595 __ lslv(tmp2, tmp2, tmp4); 7596 __ add(tmp4, tmp4, 1); 7597 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7598 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 7599 __ lsl(tmp2, tmp2, 1); 7600 __ mov(tmp4, wordSize/str2_chr_size); 7601 __ sub(str2, str2, str2_chr_size); 7602 } 7603 __ cmp(ch1, ch2); 7604 __ mov(tmp4, wordSize/str2_chr_size); 7605 __ br(__ NE, L_CMP_LOOP_NOMATCH); 7606 __ BIND(L_CMP_LOOP); 7607 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 7608 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 7609 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 7610 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 7611 __ add(tmp4, tmp4, 1); 7612 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 7613 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 7614 __ cmp(cnt1, ch2); 7615 __ br(__ EQ, L_CMP_LOOP); 7616 __ BIND(L_CMP_LOOP_NOMATCH); 7617 // here we're not matched 7618 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 7619 __ clz(tmp4, tmp2); 7620 __ add(str2, str2, str2_chr_size); // advance pointer 7621 __ b(L_HAS_ZERO_LOOP); 7622 __ align(OptoLoopAlignment); 7623 __ BIND(L_CMP_LOOP_LAST_CMP); 7624 __ cmp(cnt1, ch2); 7625 __ br(__ NE, L_CMP_LOOP_NOMATCH); 7626 __ b(DONE); 7627 __ align(OptoLoopAlignment); 7628 __ BIND(L_CMP_LOOP_LAST_CMP2); 7629 if (str2_isL) { 7630 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 7631 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
7632 __ lslv(tmp2, tmp2, tmp4); 7633 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7634 __ add(tmp4, tmp4, 1); 7635 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7636 __ lsl(tmp2, tmp2, 1); 7637 } else { 7638 __ mov(ch2, 0xE); 7639 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7640 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7641 __ lslv(tmp2, tmp2, tmp4); 7642 __ add(tmp4, tmp4, 1); 7643 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7644 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 7645 __ lsl(tmp2, tmp2, 1); 7646 __ sub(str2, str2, str2_chr_size); 7647 } 7648 __ cmp(ch1, ch2); 7649 __ br(__ NE, L_CMP_LOOP_NOMATCH); 7650 __ b(DONE); 7651 __ align(OptoLoopAlignment); 7652 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 7653 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 7654 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 7655 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 7656 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 7657 // result by analyzed characters value, so, we can just reset lower bits 7658 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 7659 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 7660 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 7661 // index of last analyzed substring inside current octet. So, str2 in at 7662 // respective start address. We need to advance it to next octet 7663 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 7664 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 7665 __ bfm(result, zr, 0, 2 - str2_chr_shift); 7666 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 7667 __ movw(cnt2, cnt2); 7668 __ b(L_LOOP_PROCEED); 7669 __ align(OptoLoopAlignment); 7670 __ BIND(NOMATCH); 7671 __ mov(result, -1); 7672 __ BIND(DONE); 7673 __ pop(spilled_regs, sp); 7674 __ ret(lr); 7675 return entry; 7676 } 7677 7678 void generate_string_indexof_stubs() { 7679 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 7680 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 7681 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 7682 } 7683 7684 void inflate_and_store_2_fp_registers(bool generatePrfm, 7685 FloatRegister src1, FloatRegister src2) { 7686 Register dst = r1; 7687 __ zip1(v1, __ T16B, src1, v0); 7688 __ zip2(v2, __ T16B, src1, v0); 7689 if (generatePrfm) { 7690 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 7691 } 7692 __ zip1(v3, __ T16B, src2, v0); 7693 __ zip2(v4, __ T16B, src2, v0); 7694 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 7695 } 7696 7697 // R0 = src 7698 // R1 = dst 7699 // R2 = len 7700 // R3 = len >> 3 7701 // V0 = 0 7702 // v1 = loaded 8 bytes 7703 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 7704 address generate_large_byte_array_inflate() { 7705 __ align(CodeEntryAlignment); 7706 StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id; 7707 StubCodeMark mark(this, stub_id); 7708 address entry = __ pc(); 7709 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 7710 Register src = r0, dst = r1, len = r2, octetCounter = r3; 7711 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 7712 7713 // do one more 8-byte read to have address 16-byte aligned in 
most cases 7714 // also use single store instruction 7715 __ ldrd(v2, __ post(src, 8)); 7716 __ sub(octetCounter, octetCounter, 2); 7717 __ zip1(v1, __ T16B, v1, v0); 7718 __ zip1(v2, __ T16B, v2, v0); 7719 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 7720 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 7721 __ subs(rscratch1, octetCounter, large_loop_threshold); 7722 __ br(__ LE, LOOP_START); 7723 __ b(LOOP_PRFM_START); 7724 __ bind(LOOP_PRFM); 7725 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 7726 __ bind(LOOP_PRFM_START); 7727 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 7728 __ sub(octetCounter, octetCounter, 8); 7729 __ subs(rscratch1, octetCounter, large_loop_threshold); 7730 inflate_and_store_2_fp_registers(true, v3, v4); 7731 inflate_and_store_2_fp_registers(true, v5, v6); 7732 __ br(__ GT, LOOP_PRFM); 7733 __ cmp(octetCounter, (u1)8); 7734 __ br(__ LT, DONE); 7735 __ bind(LOOP); 7736 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 7737 __ bind(LOOP_START); 7738 __ sub(octetCounter, octetCounter, 8); 7739 __ cmp(octetCounter, (u1)8); 7740 inflate_and_store_2_fp_registers(false, v3, v4); 7741 inflate_and_store_2_fp_registers(false, v5, v6); 7742 __ br(__ GE, LOOP); 7743 __ bind(DONE); 7744 __ ret(lr); 7745 return entry; 7746 } 7747 7748 /** 7749 * Arguments: 7750 * 7751 * Input: 7752 * c_rarg0 - current state address 7753 * c_rarg1 - H key address 7754 * c_rarg2 - data address 7755 * c_rarg3 - number of blocks 7756 * 7757 * Output: 7758 * Updated state at c_rarg0 7759 */ 7760 address generate_ghash_processBlocks() { 7761 // Bafflingly, GCM uses little-endian for the byte order, but 7762 // big-endian for the bit order. For example, the polynomial 1 is 7763 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 7764 // 7765 // So, we must either reverse the bytes in each word and do 7766 // everything big-endian or reverse the bits in each byte and do 7767 // it little-endian. On AArch64 it's more idiomatic to reverse 7768 // the bits in each byte (we have an instruction, RBIT, to do 7769 // that) and keep the data in little-endian bit order through the 7770 // calculation, bit-reversing the inputs and outputs. 7771 7772 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id; 7773 StubCodeMark mark(this, stub_id); 7774 __ align(wordSize * 2); 7775 address p = __ pc(); 7776 __ emit_int64(0x87); // The low-order bits of the field 7777 // polynomial (i.e. 
p = z^7+z^2+z+1) 7778 // repeated in the low and high parts of a 7779 // 128-bit vector 7780 __ emit_int64(0x87); 7781 7782 __ align(CodeEntryAlignment); 7783 address start = __ pc(); 7784 7785 Register state = c_rarg0; 7786 Register subkeyH = c_rarg1; 7787 Register data = c_rarg2; 7788 Register blocks = c_rarg3; 7789 7790 FloatRegister vzr = v30; 7791 __ eor(vzr, __ T16B, vzr, vzr); // zero register 7792 7793 __ ldrq(v24, p); // The field polynomial 7794 7795 __ ldrq(v0, Address(state)); 7796 __ ldrq(v1, Address(subkeyH)); 7797 7798 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 7799 __ rbit(v0, __ T16B, v0); 7800 __ rev64(v1, __ T16B, v1); 7801 __ rbit(v1, __ T16B, v1); 7802 7803 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 7804 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 7805 7806 { 7807 Label L_ghash_loop; 7808 __ bind(L_ghash_loop); 7809 7810 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 7811 // reversing each byte 7812 __ rbit(v2, __ T16B, v2); 7813 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 7814 7815 // Multiply state in v2 by subkey in v1 7816 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 7817 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 7818 /*temps*/v6, v3, /*reuse/clobber b*/v2); 7819 // Reduce v7:v5 by the field polynomial 7820 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 7821 7822 __ sub(blocks, blocks, 1); 7823 __ cbnz(blocks, L_ghash_loop); 7824 } 7825 7826 // The bit-reversed result is at this point in v0 7827 __ rev64(v0, __ T16B, v0); 7828 __ rbit(v0, __ T16B, v0); 7829 7830 __ st1(v0, __ T16B, state); 7831 __ ret(lr); 7832 7833 return start; 7834 } 7835 7836 address generate_ghash_processBlocks_wide() { 7837 address small = generate_ghash_processBlocks(); 7838 7839 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id; 7840 StubCodeMark mark(this, stub_id); 7841 __ align(wordSize * 2); 7842 address p = __ pc(); 7843 __ emit_int64(0x87); // The low-order bits of the field 7844 // polynomial (i.e. p = z^7+z^2+z+1) 7845 // repeated in the low and high parts of a 7846 // 128-bit vector 7847 __ emit_int64(0x87); 7848 7849 __ align(CodeEntryAlignment); 7850 address start = __ pc(); 7851 7852 Register state = c_rarg0; 7853 Register subkeyH = c_rarg1; 7854 Register data = c_rarg2; 7855 Register blocks = c_rarg3; 7856 7857 const int unroll = 4; 7858 7859 __ cmp(blocks, (unsigned char)(unroll * 2)); 7860 __ br(__ LT, small); 7861 7862 if (unroll > 1) { 7863 // Save state before entering routine 7864 __ sub(sp, sp, 4 * 16); 7865 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 7866 __ sub(sp, sp, 4 * 16); 7867 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 7868 } 7869 7870 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 7871 7872 if (unroll > 1) { 7873 // And restore state 7874 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 7875 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 7876 } 7877 7878 __ cmp(blocks, (unsigned char)0); 7879 __ br(__ GT, small); 7880 7881 __ ret(lr); 7882 7883 return start; 7884 } 7885 7886 void generate_base64_encode_simdround(Register src, Register dst, 7887 FloatRegister codec, u8 size) { 7888 7889 FloatRegister in0 = v4, in1 = v5, in2 = v6; 7890 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 7891 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 7892 7893 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 7894 7895 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 7896 7897 __ ushr(ind0, arrangement, in0, 2); 7898 7899 __ ushr(ind1, arrangement, in1, 2); 7900 __ shl(in0, arrangement, in0, 6); 7901 __ orr(ind1, arrangement, ind1, in0); 7902 __ ushr(ind1, arrangement, ind1, 2); 7903 7904 __ ushr(ind2, arrangement, in2, 4); 7905 __ shl(in1, arrangement, in1, 4); 7906 __ orr(ind2, arrangement, in1, ind2); 7907 __ ushr(ind2, arrangement, ind2, 2); 7908 7909 __ shl(ind3, arrangement, in2, 2); 7910 __ ushr(ind3, arrangement, ind3, 2); 7911 7912 __ tbl(out0, arrangement, codec, 4, ind0); 7913 __ tbl(out1, arrangement, codec, 4, ind1); 7914 __ tbl(out2, arrangement, codec, 4, ind2); 7915 __ tbl(out3, arrangement, codec, 4, ind3); 7916 7917 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 7918 } 7919 7920 /** 7921 * Arguments: 7922 * 7923 * Input: 7924 * c_rarg0 - src_start 7925 * c_rarg1 - src_offset 7926 * c_rarg2 - src_length 7927 * c_rarg3 - dest_start 7928 * c_rarg4 - dest_offset 7929 * c_rarg5 - isURL 7930 * 7931 */ 7932 address generate_base64_encodeBlock() { 7933 7934 static const char toBase64[64] = { 7935 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 7936 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 7937 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 7938 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 7939 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 7940 }; 7941 7942 static const char toBase64URL[64] = { 7943 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 7944 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 7945 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 7946 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 7947 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 7948 }; 7949 7950 __ align(CodeEntryAlignment); 7951 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 7952 StubCodeMark mark(this, stub_id); 7953 address start = __ pc(); 7954 7955 Register src = c_rarg0; // source array 7956 Register soff = c_rarg1; // source start offset 7957 Register send = c_rarg2; // source end offset 7958 Register dst = c_rarg3; // dest array 7959 Register doff = c_rarg4; // position for writing to dest array 7960 Register isURL = c_rarg5; // Base64 or URL character set 7961 7962 // c_rarg6 and c_rarg7 are free to use as temps 7963 Register codec = c_rarg6; 7964 Register length = c_rarg7; 7965 7966 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 7967 7968 __ add(src, src, soff); 7969 __ add(dst, dst, doff); 7970 __ sub(length, send, soff); 7971 7972 // load the codec base address 7973 __ lea(codec, ExternalAddress((address) toBase64)); 7974 __ cbz(isURL, ProcessData); 7975 __ lea(codec, ExternalAddress((address) toBase64URL)); 7976 7977 __ BIND(ProcessData); 7978 7979 // too short to formup a SIMD loop, roll back 7980 __ cmp(length, (u1)24); 7981 __ br(Assembler::LT, Process3B); 7982 7983 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 7984 7985 __ BIND(Process48B); 7986 __ cmp(length, (u1)48); 7987 __ br(Assembler::LT, Process24B); 7988 generate_base64_encode_simdround(src, dst, v0, 16); 7989 __ sub(length, length, 48); 7990 __ b(Process48B); 7991 7992 __ BIND(Process24B); 7993 __ cmp(length, (u1)24); 7994 __ br(Assembler::LT, SIMDExit); 7995 generate_base64_encode_simdround(src, dst, v0, 8); 7996 __ sub(length, length, 24); 7997 7998 __ BIND(SIMDExit); 7999 
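  // Fewer than 24 bytes remain at this point: exit if nothing is left, otherwise finish with the scalar 3-byte loop below.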
__ cbz(length, Exit); 8000 8001 __ BIND(Process3B); 8002 // 3 src bytes, 24 bits 8003 __ ldrb(r10, __ post(src, 1)); 8004 __ ldrb(r11, __ post(src, 1)); 8005 __ ldrb(r12, __ post(src, 1)); 8006 __ orrw(r11, r11, r10, Assembler::LSL, 8); 8007 __ orrw(r12, r12, r11, Assembler::LSL, 8); 8008 // codec index 8009 __ ubfmw(r15, r12, 18, 23); 8010 __ ubfmw(r14, r12, 12, 17); 8011 __ ubfmw(r13, r12, 6, 11); 8012 __ andw(r12, r12, 63); 8013 // get the code based on the codec 8014 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 8015 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 8016 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 8017 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 8018 __ strb(r15, __ post(dst, 1)); 8019 __ strb(r14, __ post(dst, 1)); 8020 __ strb(r13, __ post(dst, 1)); 8021 __ strb(r12, __ post(dst, 1)); 8022 __ sub(length, length, 3); 8023 __ cbnz(length, Process3B); 8024 8025 __ BIND(Exit); 8026 __ ret(lr); 8027 8028 return start; 8029 } 8030 8031 void generate_base64_decode_simdround(Register src, Register dst, 8032 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 8033 8034 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 8035 FloatRegister out0 = v20, out1 = v21, out2 = v22; 8036 8037 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 8038 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 8039 8040 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 8041 8042 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 8043 8044 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 8045 8046 // we need unsigned saturating subtract, to make sure all input values 8047 // in range [0, 63] will have 0U value in the higher half lookup 8048 __ uqsubv(decH0, __ T16B, in0, v27); 8049 __ uqsubv(decH1, __ T16B, in1, v27); 8050 __ uqsubv(decH2, __ T16B, in2, v27); 8051 __ uqsubv(decH3, __ T16B, in3, v27); 8052 8053 // lower half lookup 8054 __ tbl(decL0, arrangement, codecL, 4, in0); 8055 __ tbl(decL1, arrangement, codecL, 4, in1); 8056 __ tbl(decL2, arrangement, codecL, 4, in2); 8057 __ tbl(decL3, arrangement, codecL, 4, in3); 8058 8059 // higher half lookup 8060 __ tbx(decH0, arrangement, codecH, 4, decH0); 8061 __ tbx(decH1, arrangement, codecH, 4, decH1); 8062 __ tbx(decH2, arrangement, codecH, 4, decH2); 8063 __ tbx(decH3, arrangement, codecH, 4, decH3); 8064 8065 // combine lower and higher 8066 __ orr(decL0, arrangement, decL0, decH0); 8067 __ orr(decL1, arrangement, decL1, decH1); 8068 __ orr(decL2, arrangement, decL2, decH2); 8069 __ orr(decL3, arrangement, decL3, decH3); 8070 8071 // check illegal inputs, value larger than 63 (maximum of 6 bits) 8072 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 8073 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 8074 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 8075 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 8076 __ orr(in0, arrangement, decH0, decH1); 8077 __ orr(in1, arrangement, decH2, decH3); 8078 __ orr(in2, arrangement, in0, in1); 8079 __ umaxv(in3, arrangement, in2); 8080 __ umov(rscratch2, in3, __ B, 0); 8081 8082 // get the data to output 8083 __ shl(out0, arrangement, decL0, 2); 8084 __ ushr(out1, arrangement, decL1, 4); 8085 __ orr(out0, arrangement, out0, out1); 8086 __ shl(out1, arrangement, decL1, 4); 8087 __ ushr(out2, arrangement, decL2, 2); 8088 __ orr(out1, arrangement, out1, out2); 8089 __ shl(out2, arrangement, decL2, 6); 8090 __ orr(out2, arrangement, out2, decL3); 8091 8092 __ 
cbz(rscratch2, NoIllegalData); 8093 8094 // handle illegal input 8095 __ umov(r10, in2, __ D, 0); 8096 if (size == 16) { 8097 __ cbnz(r10, ErrorInLowerHalf); 8098 8099 // illegal input is in higher half, store the lower half now. 8100 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 8101 8102 __ umov(r10, in2, __ D, 1); 8103 __ umov(r11, out0, __ D, 1); 8104 __ umov(r12, out1, __ D, 1); 8105 __ umov(r13, out2, __ D, 1); 8106 __ b(StoreLegalData); 8107 8108 __ BIND(ErrorInLowerHalf); 8109 } 8110 __ umov(r11, out0, __ D, 0); 8111 __ umov(r12, out1, __ D, 0); 8112 __ umov(r13, out2, __ D, 0); 8113 8114 __ BIND(StoreLegalData); 8115 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 8116 __ strb(r11, __ post(dst, 1)); 8117 __ strb(r12, __ post(dst, 1)); 8118 __ strb(r13, __ post(dst, 1)); 8119 __ lsr(r10, r10, 8); 8120 __ lsr(r11, r11, 8); 8121 __ lsr(r12, r12, 8); 8122 __ lsr(r13, r13, 8); 8123 __ b(StoreLegalData); 8124 8125 __ BIND(NoIllegalData); 8126 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 8127 } 8128 8129 8130 /** 8131 * Arguments: 8132 * 8133 * Input: 8134 * c_rarg0 - src_start 8135 * c_rarg1 - src_offset 8136 * c_rarg2 - src_length 8137 * c_rarg3 - dest_start 8138 * c_rarg4 - dest_offset 8139 * c_rarg5 - isURL 8140 * c_rarg6 - isMIME 8141 * 8142 */ 8143 address generate_base64_decodeBlock() { 8144 8145 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 8146 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 8147 // titled "Base64 decoding". 8148 8149 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 8150 // except the trailing character '=' is also treated illegal value in this intrinsic. That 8151 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
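  // Mapping '=' (like any other non-alphabet byte) to 255u makes the stub stop on it: Exit returns the number of bytes already written, and the Java caller is expected to deal with the remaining input, including '=' padding.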
8152 static const uint8_t fromBase64ForNoSIMD[256] = { 8153 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8154 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8155 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 8156 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8157 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 8158 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 8159 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 8160 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 8161 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8162 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8163 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8164 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8165 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8166 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8167 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8168 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8169 }; 8170 8171 static const uint8_t fromBase64URLForNoSIMD[256] = { 8172 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8173 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8174 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 8175 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8176 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 8177 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 8178 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 8179 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 8180 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8181 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8182 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8183 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8184 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8185 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8186 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8187 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8188 }; 8189 8190 // A legal value of base64 code is in range [0, 127]. We need two lookups 8191 // with tbl/tbx and combine them to get the decode data. The 1st table vector 8192 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 8193 // table vector lookup use tbx, out of range indices are unchanged in 8194 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 8195 // The value of index 64 is set to 0, so that we know that we already get the 8196 // decoded data with the 1st lookup. 8197 static const uint8_t fromBase64ForSIMD[128] = { 8198 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8199 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8200 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 8201 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8202 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 8203 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 8204 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 8205 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 8206 }; 8207 8208 static const uint8_t fromBase64URLForSIMD[128] = { 8209 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8210 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8211 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 8212 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8213 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 8214 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 8215 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 8216 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 8217 }; 8218 8219 __ align(CodeEntryAlignment); 8220 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 8221 StubCodeMark mark(this, stub_id); 8222 address start = __ pc(); 8223 8224 Register src = c_rarg0; // source array 8225 Register soff = c_rarg1; // source start offset 8226 Register send = c_rarg2; // source end offset 8227 Register dst = c_rarg3; // dest array 8228 Register doff = c_rarg4; // position for writing to dest array 8229 Register isURL = c_rarg5; // Base64 or URL character set 8230 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 8231 8232 Register length = send; // reuse send as length of source data to process 8233 8234 Register simd_codec = c_rarg6; 8235 Register nosimd_codec = c_rarg7; 8236 8237 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 8238 8239 __ enter(); 8240 8241 __ add(src, src, soff); 8242 __ add(dst, dst, doff); 8243 8244 __ mov(doff, dst); 8245 8246 __ sub(length, send, soff); 8247 __ bfm(length, zr, 0, 1); 8248 8249 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 8250 __ cbz(isURL, ProcessData); 8251 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 8252 8253 __ BIND(ProcessData); 8254 __ mov(rscratch1, length); 8255 __ cmp(length, (u1)144); // 144 = 80 + 64 8256 __ br(Assembler::LT, Process4B); 8257 8258 // In the MIME case, the line length cannot be more than 76 8259 // bytes (see RFC 2045). This is too short a block for SIMD 8260 // to be worthwhile, so we use non-SIMD here. 
8261 __ movw(rscratch1, 79); 8262 8263 __ BIND(Process4B); 8264 __ ldrw(r14, __ post(src, 4)); 8265 __ ubfxw(r10, r14, 0, 8); 8266 __ ubfxw(r11, r14, 8, 8); 8267 __ ubfxw(r12, r14, 16, 8); 8268 __ ubfxw(r13, r14, 24, 8); 8269 // get the de-code 8270 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 8271 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 8272 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 8273 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 8274 // error detection, 255u indicates an illegal input 8275 __ orrw(r14, r10, r11); 8276 __ orrw(r15, r12, r13); 8277 __ orrw(r14, r14, r15); 8278 __ tbnz(r14, 7, Exit); 8279 // recover the data 8280 __ lslw(r14, r10, 10); 8281 __ bfiw(r14, r11, 4, 6); 8282 __ bfmw(r14, r12, 2, 5); 8283 __ rev16w(r14, r14); 8284 __ bfiw(r13, r12, 6, 2); 8285 __ strh(r14, __ post(dst, 2)); 8286 __ strb(r13, __ post(dst, 1)); 8287 // non-simd loop 8288 __ subsw(rscratch1, rscratch1, 4); 8289 __ br(Assembler::GT, Process4B); 8290 8291 // if exiting from PreProcess80B, rscratch1 == -1; 8292 // otherwise, rscratch1 == 0. 8293 __ cbzw(rscratch1, Exit); 8294 __ sub(length, length, 80); 8295 8296 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 8297 __ cbz(isURL, SIMDEnter); 8298 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 8299 8300 __ BIND(SIMDEnter); 8301 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 8302 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 8303 __ mov(rscratch1, 63); 8304 __ dup(v27, __ T16B, rscratch1); 8305 8306 __ BIND(Process64B); 8307 __ cmp(length, (u1)64); 8308 __ br(Assembler::LT, Process32B); 8309 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 8310 __ sub(length, length, 64); 8311 __ b(Process64B); 8312 8313 __ BIND(Process32B); 8314 __ cmp(length, (u1)32); 8315 __ br(Assembler::LT, SIMDExit); 8316 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 8317 __ sub(length, length, 32); 8318 __ b(Process32B); 8319 8320 __ BIND(SIMDExit); 8321 __ cbz(length, Exit); 8322 __ movw(rscratch1, length); 8323 __ b(Process4B); 8324 8325 __ BIND(Exit); 8326 __ sub(c_rarg0, dst, doff); 8327 8328 __ leave(); 8329 __ ret(lr); 8330 8331 return start; 8332 } 8333 8334 // Support for spin waits. 8335 address generate_spin_wait() { 8336 __ align(CodeEntryAlignment); 8337 StubGenStubId stub_id = StubGenStubId::spin_wait_id; 8338 StubCodeMark mark(this, stub_id); 8339 address start = __ pc(); 8340 8341 __ spin_wait(); 8342 __ ret(lr); 8343 8344 return start; 8345 } 8346 8347 void generate_lookup_secondary_supers_table_stub() { 8348 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 8349 StubCodeMark mark(this, stub_id); 8350 8351 const Register 8352 r_super_klass = r0, 8353 r_array_base = r1, 8354 r_array_length = r2, 8355 r_array_index = r3, 8356 r_sub_klass = r4, 8357 r_bitmap = rscratch2, 8358 result = r5; 8359 const FloatRegister 8360 vtemp = v0; 8361 8362 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 8363 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 8364 Label L_success; 8365 __ enter(); 8366 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 8367 r_array_base, r_array_length, r_array_index, 8368 vtemp, result, slot, 8369 /*stub_is_near*/true); 8370 __ leave(); 8371 __ ret(lr); 8372 } 8373 } 8374 8375 // Slow path implementation for UseSecondarySupersTable. 
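  // Note (informal summary, not from the original comments): the table stubs
  // above probe a single hash-selected slot of the secondary supers array,
  // using r_bitmap to locate its packed index; the slow path below is entered
  // only when that first probe is inconclusive and the remainder of the packed
  // array must be searched.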
8376 address generate_lookup_secondary_supers_table_slow_path_stub() { 8377 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 8378 StubCodeMark mark(this, stub_id); 8379 8380 address start = __ pc(); 8381 const Register 8382 r_super_klass = r0, // argument 8383 r_array_base = r1, // argument 8384 temp1 = r2, // temp 8385 r_array_index = r3, // argument 8386 r_bitmap = rscratch2, // argument 8387 result = r5; // argument 8388 8389 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 8390 __ ret(lr); 8391 8392 return start; 8393 } 8394 8395 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 8396 8397 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 8398 // 8399 // If LSE is in use, generate LSE versions of all the stubs. The 8400 // non-LSE versions are in atomic_aarch64.S. 8401 8402 // class AtomicStubMark records the entry point of a stub and the 8403 // stub pointer which will point to it. The stub pointer is set to 8404 // the entry point when ~AtomicStubMark() is called, which must be 8405 // after ICache::invalidate_range. This ensures safe publication of 8406 // the generated code. 8407 class AtomicStubMark { 8408 address _entry_point; 8409 aarch64_atomic_stub_t *_stub; 8410 MacroAssembler *_masm; 8411 public: 8412 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 8413 _masm = masm; 8414 __ align(32); 8415 _entry_point = __ pc(); 8416 _stub = stub; 8417 } 8418 ~AtomicStubMark() { 8419 *_stub = (aarch64_atomic_stub_t)_entry_point; 8420 } 8421 }; 8422 8423 // NB: For memory_order_conservative we need a trailing membar after 8424 // LSE atomic operations but not a leading membar. 8425 // 8426 // We don't need a leading membar because a clause in the Arm ARM 8427 // says: 8428 // 8429 // Barrier-ordered-before 8430 // 8431 // Barrier instructions order prior Memory effects before subsequent 8432 // Memory effects generated by the same Observer. A read or a write 8433 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 8434 // Observer if and only if RW1 appears in program order before RW 2 8435 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 8436 // instruction with both Acquire and Release semantics. 8437 // 8438 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 8439 // and Release semantics, therefore we don't need a leading 8440 // barrier. However, there is no corresponding Barrier-ordered-after 8441 // relationship, therefore we need a trailing membar to prevent a 8442 // later store or load from being reordered with the store in an 8443 // atomic instruction. 8444 // 8445 // This was checked by using the herd7 consistency model simulator 8446 // (http://diy.inria.fr/) with this test case: 8447 // 8448 // AArch64 LseCas 8449 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 8450 // P0 | P1; 8451 // LDR W4, [X2] | MOV W3, #0; 8452 // DMB LD | MOV W4, #1; 8453 // LDR W3, [X1] | CASAL W3, W4, [X1]; 8454 // | DMB ISH; 8455 // | STR W4, [X2]; 8456 // exists 8457 // (0:X3=0 /\ 0:X4=1) 8458 // 8459 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 8460 // with the store to x in P1. Without the DMB in P1 this may happen. 8461 // 8462 // At the time of writing we don't know of any AArch64 hardware that 8463 // reorders stores in this way, but the Reference Manual permits it. 
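  // As a concrete (illustrative) sketch, the conservative 8-byte CAS entry
  // generated below comes out roughly as:
  //
  //   mov   x3, x1          // prev = compare_val
  //   casal x3, x2, [x0]    // CAS with both Acquire and Release semantics
  //   dmb   ish             // trailing membar, as discussed above
  //   mov   x0, x3          // return the old value
  //   ret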
8464 8465 void gen_cas_entry(Assembler::operand_size size, 8466 atomic_memory_order order) { 8467 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 8468 exchange_val = c_rarg2; 8469 bool acquire, release; 8470 switch (order) { 8471 case memory_order_relaxed: 8472 acquire = false; 8473 release = false; 8474 break; 8475 case memory_order_release: 8476 acquire = false; 8477 release = true; 8478 break; 8479 default: 8480 acquire = true; 8481 release = true; 8482 break; 8483 } 8484 __ mov(prev, compare_val); 8485 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 8486 if (order == memory_order_conservative) { 8487 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 8488 } 8489 if (size == Assembler::xword) { 8490 __ mov(r0, prev); 8491 } else { 8492 __ movw(r0, prev); 8493 } 8494 __ ret(lr); 8495 } 8496 8497 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 8498 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 8499 // If not relaxed, then default to conservative. Relaxed is the only 8500 // case we use enough to be worth specializing. 8501 if (order == memory_order_relaxed) { 8502 __ ldadd(size, incr, prev, addr); 8503 } else { 8504 __ ldaddal(size, incr, prev, addr); 8505 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 8506 } 8507 if (size == Assembler::xword) { 8508 __ mov(r0, prev); 8509 } else { 8510 __ movw(r0, prev); 8511 } 8512 __ ret(lr); 8513 } 8514 8515 void gen_swpal_entry(Assembler::operand_size size) { 8516 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 8517 __ swpal(size, incr, prev, addr); 8518 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 8519 if (size == Assembler::xword) { 8520 __ mov(r0, prev); 8521 } else { 8522 __ movw(r0, prev); 8523 } 8524 __ ret(lr); 8525 } 8526 8527 void generate_atomic_entry_points() { 8528 if (! 
UseLSE) { 8529 return; 8530 } 8531 __ align(CodeEntryAlignment); 8532 StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id; 8533 StubCodeMark mark(this, stub_id); 8534 address first_entry = __ pc(); 8535 8536 // ADD, memory_order_conservative 8537 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 8538 gen_ldadd_entry(Assembler::word, memory_order_conservative); 8539 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 8540 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 8541 8542 // ADD, memory_order_relaxed 8543 AtomicStubMark mark_fetch_add_4_relaxed 8544 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 8545 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 8546 AtomicStubMark mark_fetch_add_8_relaxed 8547 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 8548 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 8549 8550 // XCHG, memory_order_conservative 8551 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 8552 gen_swpal_entry(Assembler::word); 8553 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 8554 gen_swpal_entry(Assembler::xword); 8555 8556 // CAS, memory_order_conservative 8557 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 8558 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 8559 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 8560 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 8561 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 8562 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 8563 8564 // CAS, memory_order_relaxed 8565 AtomicStubMark mark_cmpxchg_1_relaxed 8566 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 8567 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 8568 AtomicStubMark mark_cmpxchg_4_relaxed 8569 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 8570 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 8571 AtomicStubMark mark_cmpxchg_8_relaxed 8572 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 8573 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 8574 8575 AtomicStubMark mark_cmpxchg_4_release 8576 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 8577 gen_cas_entry(MacroAssembler::word, memory_order_release); 8578 AtomicStubMark mark_cmpxchg_8_release 8579 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 8580 gen_cas_entry(MacroAssembler::xword, memory_order_release); 8581 8582 AtomicStubMark mark_cmpxchg_4_seq_cst 8583 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 8584 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 8585 AtomicStubMark mark_cmpxchg_8_seq_cst 8586 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 8587 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 8588 8589 ICache::invalidate_range(first_entry, __ pc() - first_entry); 8590 } 8591 #endif // LINUX 8592 8593 address generate_cont_thaw(Continuation::thaw_kind kind) { 8594 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 8595 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 8596 8597 address start = __ pc(); 8598 8599 if (return_barrier) { 8600 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 8601 __ mov(sp, rscratch1); 8602 } 8603 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 8604 8605 if (return_barrier) { 8606 // preserve 
possible return value from a method returning to the return barrier 8607 __ fmovd(rscratch1, v0); 8608 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 8609 } 8610 8611 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 8612 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 8613 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 8614 8615 if (return_barrier) { 8616 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 8617 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 8618 __ fmovd(v0, rscratch1); 8619 } 8620 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 8621 8622 8623 Label thaw_success; 8624 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 8625 __ cbnz(rscratch2, thaw_success); 8626 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 8627 __ br(rscratch1); 8628 __ bind(thaw_success); 8629 8630 // make room for the thawed frames 8631 __ sub(rscratch1, sp, rscratch2); 8632 __ andr(rscratch1, rscratch1, -16); // align 8633 __ mov(sp, rscratch1); 8634 8635 if (return_barrier) { 8636 // save original return value -- again 8637 __ fmovd(rscratch1, v0); 8638 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 8639 } 8640 8641 // If we want, we can templatize thaw by kind, and have three different entries 8642 __ movw(c_rarg1, (uint32_t)kind); 8643 8644 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 8645 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 8646 8647 if (return_barrier) { 8648 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 8649 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 8650 __ fmovd(v0, rscratch1); 8651 } else { 8652 __ mov(r0, zr); // return 0 (success) from doYield 8653 } 8654 8655 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 8656 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 8657 __ mov(rfp, sp); 8658 8659 if (return_barrier_exception) { 8660 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 8661 __ authenticate_return_address(c_rarg1); 8662 __ verify_oop(r0); 8663 // save return value containing the exception oop in callee-saved R19 8664 __ mov(r19, r0); 8665 8666 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 8667 8668 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
8669 // __ reinitialize_ptrue(); 8670 8671 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 8672 8673 __ mov(r1, r0); // the exception handler 8674 __ mov(r0, r19); // restore return value containing the exception oop 8675 __ verify_oop(r0); 8676 8677 __ leave(); 8678 __ mov(r3, lr); 8679 __ br(r1); // the exception handler 8680 } else { 8681 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 8682 __ leave(); 8683 __ ret(lr); 8684 } 8685 8686 return start; 8687 } 8688 8689 address generate_cont_thaw() { 8690 if (!Continuations::enabled()) return nullptr; 8691 8692 StubGenStubId stub_id = StubGenStubId::cont_thaw_id; 8693 StubCodeMark mark(this, stub_id); 8694 address start = __ pc(); 8695 generate_cont_thaw(Continuation::thaw_top); 8696 return start; 8697 } 8698 8699 address generate_cont_returnBarrier() { 8700 if (!Continuations::enabled()) return nullptr; 8701 8702 // TODO: will probably need multiple return barriers depending on return type 8703 StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id; 8704 StubCodeMark mark(this, stub_id); 8705 address start = __ pc(); 8706 8707 generate_cont_thaw(Continuation::thaw_return_barrier); 8708 8709 return start; 8710 } 8711 8712 address generate_cont_returnBarrier_exception() { 8713 if (!Continuations::enabled()) return nullptr; 8714 8715 StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id; 8716 StubCodeMark mark(this, stub_id); 8717 address start = __ pc(); 8718 8719 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 8720 8721 return start; 8722 } 8723 8724 address generate_cont_preempt_stub() { 8725 if (!Continuations::enabled()) return nullptr; 8726 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 8727 StubCodeMark mark(this, stub_id); 8728 address start = __ pc(); 8729 8730 __ reset_last_Java_frame(true); 8731 8732 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 8733 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 8734 __ mov(sp, rscratch2); 8735 8736 Label preemption_cancelled; 8737 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 8738 __ cbnz(rscratch1, preemption_cancelled); 8739 8740 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 8741 SharedRuntime::continuation_enter_cleanup(_masm); 8742 __ leave(); 8743 __ ret(lr); 8744 8745 // We acquired the monitor after freezing the frames so call thaw to continue execution. 8746 __ bind(preemption_cancelled); 8747 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 8748 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 8749 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 8750 __ ldr(rscratch1, Address(rscratch1)); 8751 __ br(rscratch1); 8752 8753 return start; 8754 } 8755 8756 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 8757 // are represented as long[5], with BITS_PER_LIMB = 26. 8758 // Pack five 26-bit limbs into three 64-bit registers. 
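  // In C, approximately (illustration only; s[] holds the five 26-bit limbs):
  //
  //   dest0 = s[0] | s[1] << 26 | s[2] << 52;        // low 12 bits of s[2]
  //   dest1 = s[2] >> 12 | s[3] << 14 | s[4] << 40;  // low 24 bits of s[4]
  //   dest2 = s[4] >> 24;                            // at most 2 bits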
8759 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 8760 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 8761 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 8762 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 8763 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 8764 8765 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 8766 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 8767 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 8768 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 8769 8770 if (dest2->is_valid()) { 8771 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 8772 } else { 8773 #ifdef ASSERT 8774 Label OK; 8775 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 8776 __ br(__ EQ, OK); 8777 __ stop("high bits of Poly1305 integer should be zero"); 8778 __ should_not_reach_here(); 8779 __ bind(OK); 8780 #endif 8781 } 8782 } 8783 8784 // As above, but return only a 128-bit integer, packed into two 8785 // 64-bit registers. 8786 void pack_26(Register dest0, Register dest1, Register src) { 8787 pack_26(dest0, dest1, noreg, src); 8788 } 8789 8790 // Multiply and multiply-accumulate unsigned 64-bit registers. 8791 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 8792 __ mul(prod_lo, n, m); 8793 __ umulh(prod_hi, n, m); 8794 } 8795 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 8796 wide_mul(rscratch1, rscratch2, n, m); 8797 __ adds(sum_lo, sum_lo, rscratch1); 8798 __ adc(sum_hi, sum_hi, rscratch2); 8799 } 8800 8801 // Poly1305, RFC 7539 8802 8803 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 8804 // description of the tricks used to simplify and accelerate this 8805 // computation. 8806 8807 address generate_poly1305_processBlocks() { 8808 __ align(CodeEntryAlignment); 8809 StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id; 8810 StubCodeMark mark(this, stub_id); 8811 address start = __ pc(); 8812 Label here; 8813 __ enter(); 8814 RegSet callee_saved = RegSet::range(r19, r28); 8815 __ push(callee_saved, sp); 8816 8817 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 8818 8819 // Arguments 8820 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 8821 8822 // R_n is the 128-bit randomly-generated key, packed into two 8823 // registers. The caller passes this key to us as long[5], with 8824 // BITS_PER_LIMB = 26. 
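    // (Illustrative note: 2^130 is congruent to 5 mod 2^130 - 5, and key
    // clamping guarantees that the low two bits of R_1 are zero, so product
    // terms that overflow 2^130 can be folded back into the low limbs by
    // multiplying the shifted key by 5. That is what the RR_n values below
    // precompute; see the link above for the full derivation.)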
8825 const Register R_0 = *++regs, R_1 = *++regs; 8826 pack_26(R_0, R_1, r_start); 8827 8828 // RR_n is (R_n >> 2) * 5 8829 const Register RR_0 = *++regs, RR_1 = *++regs; 8830 __ lsr(RR_0, R_0, 2); 8831 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 8832 __ lsr(RR_1, R_1, 2); 8833 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 8834 8835 // U_n is the current checksum 8836 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 8837 pack_26(U_0, U_1, U_2, acc_start); 8838 8839 static constexpr int BLOCK_LENGTH = 16; 8840 Label DONE, LOOP; 8841 8842 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 8843 __ br(Assembler::LT, DONE); { 8844 __ bind(LOOP); 8845 8846 // S_n is to be the sum of U_n and the next block of data 8847 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 8848 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 8849 __ adds(S_0, U_0, S_0); 8850 __ adcs(S_1, U_1, S_1); 8851 __ adc(S_2, U_2, zr); 8852 __ add(S_2, S_2, 1); 8853 8854 const Register U_0HI = *++regs, U_1HI = *++regs; 8855 8856 // NB: this logic depends on some of the special properties of 8857 // Poly1305 keys. In particular, because we know that the top 8858 // four bits of R_0 and R_1 are zero, we can add together 8859 // partial products without any risk of needing to propagate a 8860 // carry out. 8861 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 8862 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 8863 __ andr(U_2, R_0, 3); 8864 __ mul(U_2, S_2, U_2); 8865 8866 // Recycle registers S_0, S_1, S_2 8867 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 8868 8869 // Partial reduction mod 2**130 - 5 8870 __ adds(U_1, U_0HI, U_1); 8871 __ adc(U_2, U_1HI, U_2); 8872 // Sum now in U_2:U_1:U_0. 8873 // Dead: U_0HI, U_1HI. 8874 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 8875 8876 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 8877 8878 // First, U_2:U_1:U_0 += (U_2 >> 2) 8879 __ lsr(rscratch1, U_2, 2); 8880 __ andr(U_2, U_2, (u8)3); 8881 __ adds(U_0, U_0, rscratch1); 8882 __ adcs(U_1, U_1, zr); 8883 __ adc(U_2, U_2, zr); 8884 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 8885 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 8886 __ adcs(U_1, U_1, zr); 8887 __ adc(U_2, U_2, zr); 8888 8889 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 8890 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 8891 __ br(~ Assembler::LT, LOOP); 8892 } 8893 8894 // Further reduce modulo 2^130 - 5 8895 __ lsr(rscratch1, U_2, 2); 8896 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 8897 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 8898 __ adcs(U_1, U_1, zr); 8899 __ andr(U_2, U_2, (u1)3); 8900 __ adc(U_2, U_2, zr); 8901 8902 // Unpack the sum into five 26-bit limbs and write to memory. 
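    // In C, approximately (illustration only; acc[] holds the 26-bit limbs):
    //
    //   acc[0] =  U_0        & 0x3ffffff;
    //   acc[1] = (U_0 >> 26) & 0x3ffffff;
    //   acc[2] = (U_0 >> 52) | (U_1 & 0x3fff) << 12;
    //   acc[3] = (U_1 >> 14) & 0x3ffffff;
    //   acc[4] = (U_1 >> 40) | (U_2 & 0x7)    << 24;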
8903 __ ubfiz(rscratch1, U_0, 0, 26); 8904 __ ubfx(rscratch2, U_0, 26, 26); 8905 __ stp(rscratch1, rscratch2, Address(acc_start)); 8906 __ ubfx(rscratch1, U_0, 52, 12); 8907 __ bfi(rscratch1, U_1, 12, 14); 8908 __ ubfx(rscratch2, U_1, 14, 26); 8909 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 8910 __ ubfx(rscratch1, U_1, 40, 24); 8911 __ bfi(rscratch1, U_2, 24, 3); 8912 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 8913 8914 __ bind(DONE); 8915 __ pop(callee_saved, sp); 8916 __ leave(); 8917 __ ret(lr); 8918 8919 return start; 8920 } 8921 8922 // exception handler for upcall stubs 8923 address generate_upcall_stub_exception_handler() { 8924 StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id; 8925 StubCodeMark mark(this, stub_id); 8926 address start = __ pc(); 8927 8928 // Native caller has no idea how to handle exceptions, 8929 // so we just crash here. Up to callee to catch exceptions. 8930 __ verify_oop(r0); 8931 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 8932 __ blr(rscratch1); 8933 __ should_not_reach_here(); 8934 8935 return start; 8936 } 8937 8938 // load Method* target of MethodHandle 8939 // j_rarg0 = jobject receiver 8940 // rmethod = result 8941 address generate_upcall_stub_load_target() { 8942 StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id; 8943 StubCodeMark mark(this, stub_id); 8944 address start = __ pc(); 8945 8946 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 8947 // Load target method from receiver 8948 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 8949 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 8950 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 8951 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 8952 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 8953 noreg, noreg); 8954 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 8955 8956 __ ret(lr); 8957 8958 return start; 8959 } 8960 8961 #undef __ 8962 #define __ masm-> 8963 8964 class MontgomeryMultiplyGenerator : public MacroAssembler { 8965 8966 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 8967 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 8968 8969 RegSet _toSave; 8970 bool _squaring; 8971 8972 public: 8973 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 8974 : MacroAssembler(as->code()), _squaring(squaring) { 8975 8976 // Register allocation 8977 8978 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 8979 Pa_base = *regs; // Argument registers 8980 if (squaring) 8981 Pb_base = Pa_base; 8982 else 8983 Pb_base = *++regs; 8984 Pn_base = *++regs; 8985 Rlen= *++regs; 8986 inv = *++regs; 8987 Pm_base = *++regs; 8988 8989 // Working registers: 8990 Ra = *++regs; // The current digit of a, b, n, and m. 8991 Rb = *++regs; 8992 Rm = *++regs; 8993 Rn = *++regs; 8994 8995 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 8996 Pb = *++regs; 8997 Pm = *++regs; 8998 Pn = *++regs; 8999 9000 t0 = *++regs; // Three registers which form a 9001 t1 = *++regs; // triple-precision accumuator. 9002 t2 = *++regs; 9003 9004 Ri = *++regs; // Inner and outer loop indexes. 
9005 Rj = *++regs; 9006 9007 Rhi_ab = *++regs; // Product registers: low and high parts 9008 Rlo_ab = *++regs; // of a*b and m*n. 9009 Rhi_mn = *++regs; 9010 Rlo_mn = *++regs; 9011 9012 // r19 and up are callee-saved. 9013 _toSave = RegSet::range(r19, *regs) + Pm_base; 9014 } 9015 9016 private: 9017 void save_regs() { 9018 push(_toSave, sp); 9019 } 9020 9021 void restore_regs() { 9022 pop(_toSave, sp); 9023 } 9024 9025 template <typename T> 9026 void unroll_2(Register count, T block) { 9027 Label loop, end, odd; 9028 tbnz(count, 0, odd); 9029 cbz(count, end); 9030 align(16); 9031 bind(loop); 9032 (this->*block)(); 9033 bind(odd); 9034 (this->*block)(); 9035 subs(count, count, 2); 9036 br(Assembler::GT, loop); 9037 bind(end); 9038 } 9039 9040 template <typename T> 9041 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 9042 Label loop, end, odd; 9043 tbnz(count, 0, odd); 9044 cbz(count, end); 9045 align(16); 9046 bind(loop); 9047 (this->*block)(d, s, tmp); 9048 bind(odd); 9049 (this->*block)(d, s, tmp); 9050 subs(count, count, 2); 9051 br(Assembler::GT, loop); 9052 bind(end); 9053 } 9054 9055 void pre1(RegisterOrConstant i) { 9056 block_comment("pre1"); 9057 // Pa = Pa_base; 9058 // Pb = Pb_base + i; 9059 // Pm = Pm_base; 9060 // Pn = Pn_base + i; 9061 // Ra = *Pa; 9062 // Rb = *Pb; 9063 // Rm = *Pm; 9064 // Rn = *Pn; 9065 ldr(Ra, Address(Pa_base)); 9066 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 9067 ldr(Rm, Address(Pm_base)); 9068 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9069 lea(Pa, Address(Pa_base)); 9070 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 9071 lea(Pm, Address(Pm_base)); 9072 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9073 9074 // Zero the m*n result. 9075 mov(Rhi_mn, zr); 9076 mov(Rlo_mn, zr); 9077 } 9078 9079 // The core multiply-accumulate step of a Montgomery 9080 // multiplication. The idea is to schedule operations as a 9081 // pipeline so that instructions with long latencies (loads and 9082 // multiplies) have time to complete before their results are 9083 // used. This most benefits in-order implementations of the 9084 // architecture but out-of-order ones also benefit. 9085 void step() { 9086 block_comment("step"); 9087 // MACC(Ra, Rb, t0, t1, t2); 9088 // Ra = *++Pa; 9089 // Rb = *--Pb; 9090 umulh(Rhi_ab, Ra, Rb); 9091 mul(Rlo_ab, Ra, Rb); 9092 ldr(Ra, pre(Pa, wordSize)); 9093 ldr(Rb, pre(Pb, -wordSize)); 9094 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 9095 // previous iteration. 
9096 // MACC(Rm, Rn, t0, t1, t2); 9097 // Rm = *++Pm; 9098 // Rn = *--Pn; 9099 umulh(Rhi_mn, Rm, Rn); 9100 mul(Rlo_mn, Rm, Rn); 9101 ldr(Rm, pre(Pm, wordSize)); 9102 ldr(Rn, pre(Pn, -wordSize)); 9103 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9104 } 9105 9106 void post1() { 9107 block_comment("post1"); 9108 9109 // MACC(Ra, Rb, t0, t1, t2); 9110 // Ra = *++Pa; 9111 // Rb = *--Pb; 9112 umulh(Rhi_ab, Ra, Rb); 9113 mul(Rlo_ab, Ra, Rb); 9114 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 9115 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9116 9117 // *Pm = Rm = t0 * inv; 9118 mul(Rm, t0, inv); 9119 str(Rm, Address(Pm)); 9120 9121 // MACC(Rm, Rn, t0, t1, t2); 9122 // t0 = t1; t1 = t2; t2 = 0; 9123 umulh(Rhi_mn, Rm, Rn); 9124 9125 #ifndef PRODUCT 9126 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 9127 { 9128 mul(Rlo_mn, Rm, Rn); 9129 add(Rlo_mn, t0, Rlo_mn); 9130 Label ok; 9131 cbz(Rlo_mn, ok); { 9132 stop("broken Montgomery multiply"); 9133 } bind(ok); 9134 } 9135 #endif 9136 // We have very carefully set things up so that 9137 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 9138 // the lower half of Rm * Rn because we know the result already: 9139 // it must be -t0. t0 + (-t0) must generate a carry iff 9140 // t0 != 0. So, rather than do a mul and an adds we just set 9141 // the carry flag iff t0 is nonzero. 9142 // 9143 // mul(Rlo_mn, Rm, Rn); 9144 // adds(zr, t0, Rlo_mn); 9145 subs(zr, t0, 1); // Set carry iff t0 is nonzero 9146 adcs(t0, t1, Rhi_mn); 9147 adc(t1, t2, zr); 9148 mov(t2, zr); 9149 } 9150 9151 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 9152 block_comment("pre2"); 9153 // Pa = Pa_base + i-len; 9154 // Pb = Pb_base + len; 9155 // Pm = Pm_base + i-len; 9156 // Pn = Pn_base + len; 9157 9158 if (i.is_register()) { 9159 sub(Rj, i.as_register(), len); 9160 } else { 9161 mov(Rj, i.as_constant()); 9162 sub(Rj, Rj, len); 9163 } 9164 // Rj == i-len 9165 9166 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 9167 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 9168 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 9169 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 9170 9171 // Ra = *++Pa; 9172 // Rb = *--Pb; 9173 // Rm = *++Pm; 9174 // Rn = *--Pn; 9175 ldr(Ra, pre(Pa, wordSize)); 9176 ldr(Rb, pre(Pb, -wordSize)); 9177 ldr(Rm, pre(Pm, wordSize)); 9178 ldr(Rn, pre(Pn, -wordSize)); 9179 9180 mov(Rhi_mn, zr); 9181 mov(Rlo_mn, zr); 9182 } 9183 9184 void post2(RegisterOrConstant i, RegisterOrConstant len) { 9185 block_comment("post2"); 9186 if (i.is_constant()) { 9187 mov(Rj, i.as_constant()-len.as_constant()); 9188 } else { 9189 sub(Rj, i.as_register(), len); 9190 } 9191 9192 adds(t0, t0, Rlo_mn); // The pending m*n, low part 9193 9194 // As soon as we know the least significant digit of our result, 9195 // store it. 9196 // Pm_base[i-len] = t0; 9197 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 9198 9199 // t0 = t1; t1 = t2; t2 = 0; 9200 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 9201 adc(t1, t2, zr); 9202 mov(t2, zr); 9203 } 9204 9205 // A carry in t0 after Montgomery multiplication means that we 9206 // should subtract multiples of n from our result in m. We'll 9207 // keep doing that until there is no carry. 
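  // In C, sub() is approximately (illustration only):
  //
  //   julong sub(julong Pm_base[], julong Pn_base[], julong t0, int len) {
  //     bool borrow = false;
  //     for (int i = 0; i < len; i++) {
  //       julong d = Pm_base[i] - Pn_base[i] - borrow;
  //       borrow = (d > Pm_base[i]) || (borrow && d == Pm_base[i]);
  //       Pm_base[i] = d;
  //     }
  //     return t0 - borrow;
  //   }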
9208 void normalize(RegisterOrConstant len) { 9209 block_comment("normalize"); 9210 // while (t0) 9211 // t0 = sub(Pm_base, Pn_base, t0, len); 9212 Label loop, post, again; 9213 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 9214 cbz(t0, post); { 9215 bind(again); { 9216 mov(i, zr); 9217 mov(cnt, len); 9218 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 9219 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9220 subs(zr, zr, zr); // set carry flag, i.e. no borrow 9221 align(16); 9222 bind(loop); { 9223 sbcs(Rm, Rm, Rn); 9224 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 9225 add(i, i, 1); 9226 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 9227 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9228 sub(cnt, cnt, 1); 9229 } cbnz(cnt, loop); 9230 sbc(t0, t0, zr); 9231 } cbnz(t0, again); 9232 } bind(post); 9233 } 9234 9235 // Move memory at s to d, reversing words. 9236 // Increments d to end of copied memory 9237 // Destroys tmp1, tmp2 9238 // Preserves len 9239 // Leaves s pointing to the address which was in d at start 9240 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 9241 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 9242 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 9243 9244 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 9245 mov(tmp1, len); 9246 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 9247 sub(s, d, len, ext::uxtw, LogBytesPerWord); 9248 } 9249 // where 9250 void reverse1(Register d, Register s, Register tmp) { 9251 ldr(tmp, pre(s, -wordSize)); 9252 ror(tmp, tmp, 32); 9253 str(tmp, post(d, wordSize)); 9254 } 9255 9256 void step_squaring() { 9257 // An extra ACC 9258 step(); 9259 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9260 } 9261 9262 void last_squaring(RegisterOrConstant i) { 9263 Label dont; 9264 // if ((i & 1) == 0) { 9265 tbnz(i.as_register(), 0, dont); { 9266 // MACC(Ra, Rb, t0, t1, t2); 9267 // Ra = *++Pa; 9268 // Rb = *--Pb; 9269 umulh(Rhi_ab, Ra, Rb); 9270 mul(Rlo_ab, Ra, Rb); 9271 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9272 } bind(dont); 9273 } 9274 9275 void extra_step_squaring() { 9276 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 9277 9278 // MACC(Rm, Rn, t0, t1, t2); 9279 // Rm = *++Pm; 9280 // Rn = *--Pn; 9281 umulh(Rhi_mn, Rm, Rn); 9282 mul(Rlo_mn, Rm, Rn); 9283 ldr(Rm, pre(Pm, wordSize)); 9284 ldr(Rn, pre(Pn, -wordSize)); 9285 } 9286 9287 void post1_squaring() { 9288 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 9289 9290 // *Pm = Rm = t0 * inv; 9291 mul(Rm, t0, inv); 9292 str(Rm, Address(Pm)); 9293 9294 // MACC(Rm, Rn, t0, t1, t2); 9295 // t0 = t1; t1 = t2; t2 = 0; 9296 umulh(Rhi_mn, Rm, Rn); 9297 9298 #ifndef PRODUCT 9299 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 9300 { 9301 mul(Rlo_mn, Rm, Rn); 9302 add(Rlo_mn, t0, Rlo_mn); 9303 Label ok; 9304 cbz(Rlo_mn, ok); { 9305 stop("broken Montgomery multiply"); 9306 } bind(ok); 9307 } 9308 #endif 9309 // We have very carefully set things up so that 9310 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 9311 // the lower half of Rm * Rn because we know the result already: 9312 // it must be -t0. t0 + (-t0) must generate a carry iff 9313 // t0 != 0. So, rather than do a mul and an adds we just set 9314 // the carry flag iff t0 is nonzero. 
9315 // 9316 // mul(Rlo_mn, Rm, Rn); 9317 // adds(zr, t0, Rlo_mn); 9318 subs(zr, t0, 1); // Set carry iff t0 is nonzero 9319 adcs(t0, t1, Rhi_mn); 9320 adc(t1, t2, zr); 9321 mov(t2, zr); 9322 } 9323 9324 void acc(Register Rhi, Register Rlo, 9325 Register t0, Register t1, Register t2) { 9326 adds(t0, t0, Rlo); 9327 adcs(t1, t1, Rhi); 9328 adc(t2, t2, zr); 9329 } 9330 9331 public: 9332 /** 9333 * Fast Montgomery multiplication. The derivation of the 9334 * algorithm is in A Cryptographic Library for the Motorola 9335 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 9336 * 9337 * Arguments: 9338 * 9339 * Inputs for multiplication: 9340 * c_rarg0 - int array elements a 9341 * c_rarg1 - int array elements b 9342 * c_rarg2 - int array elements n (the modulus) 9343 * c_rarg3 - int length 9344 * c_rarg4 - int inv 9345 * c_rarg5 - int array elements m (the result) 9346 * 9347 * Inputs for squaring: 9348 * c_rarg0 - int array elements a 9349 * c_rarg1 - int array elements n (the modulus) 9350 * c_rarg2 - int length 9351 * c_rarg3 - int inv 9352 * c_rarg4 - int array elements m (the result) 9353 * 9354 */ 9355 address generate_multiply() { 9356 Label argh, nothing; 9357 bind(argh); 9358 stop("MontgomeryMultiply total_allocation must be <= 8192"); 9359 9360 align(CodeEntryAlignment); 9361 address entry = pc(); 9362 9363 cbzw(Rlen, nothing); 9364 9365 enter(); 9366 9367 // Make room. 9368 cmpw(Rlen, 512); 9369 br(Assembler::HI, argh); 9370 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 9371 andr(sp, Ra, -2 * wordSize); 9372 9373 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 9374 9375 { 9376 // Copy input args, reversing as we go. We use Ra as a 9377 // temporary variable. 9378 reverse(Ra, Pa_base, Rlen, t0, t1); 9379 if (!_squaring) 9380 reverse(Ra, Pb_base, Rlen, t0, t1); 9381 reverse(Ra, Pn_base, Rlen, t0, t1); 9382 } 9383 9384 // Push all call-saved registers and also Pm_base which we'll need 9385 // at the end. 
9386 save_regs(); 9387 9388 #ifndef PRODUCT 9389 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 9390 { 9391 ldr(Rn, Address(Pn_base, 0)); 9392 mul(Rlo_mn, Rn, inv); 9393 subs(zr, Rlo_mn, -1); 9394 Label ok; 9395 br(EQ, ok); { 9396 stop("broken inverse in Montgomery multiply"); 9397 } bind(ok); 9398 } 9399 #endif 9400 9401 mov(Pm_base, Ra); 9402 9403 mov(t0, zr); 9404 mov(t1, zr); 9405 mov(t2, zr); 9406 9407 block_comment("for (int i = 0; i < len; i++) {"); 9408 mov(Ri, zr); { 9409 Label loop, end; 9410 cmpw(Ri, Rlen); 9411 br(Assembler::GE, end); 9412 9413 bind(loop); 9414 pre1(Ri); 9415 9416 block_comment(" for (j = i; j; j--) {"); { 9417 movw(Rj, Ri); 9418 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 9419 } block_comment(" } // j"); 9420 9421 post1(); 9422 addw(Ri, Ri, 1); 9423 cmpw(Ri, Rlen); 9424 br(Assembler::LT, loop); 9425 bind(end); 9426 block_comment("} // i"); 9427 } 9428 9429 block_comment("for (int i = len; i < 2*len; i++) {"); 9430 mov(Ri, Rlen); { 9431 Label loop, end; 9432 cmpw(Ri, Rlen, Assembler::LSL, 1); 9433 br(Assembler::GE, end); 9434 9435 bind(loop); 9436 pre2(Ri, Rlen); 9437 9438 block_comment(" for (j = len*2-i-1; j; j--) {"); { 9439 lslw(Rj, Rlen, 1); 9440 subw(Rj, Rj, Ri); 9441 subw(Rj, Rj, 1); 9442 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 9443 } block_comment(" } // j"); 9444 9445 post2(Ri, Rlen); 9446 addw(Ri, Ri, 1); 9447 cmpw(Ri, Rlen, Assembler::LSL, 1); 9448 br(Assembler::LT, loop); 9449 bind(end); 9450 } 9451 block_comment("} // i"); 9452 9453 normalize(Rlen); 9454 9455 mov(Ra, Pm_base); // Save Pm_base in Ra 9456 restore_regs(); // Restore caller's Pm_base 9457 9458 // Copy our result into caller's Pm_base 9459 reverse(Pm_base, Ra, Rlen, t0, t1); 9460 9461 leave(); 9462 bind(nothing); 9463 ret(lr); 9464 9465 return entry; 9466 } 9467 // In C, approximately: 9468 9469 // void 9470 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 9471 // julong Pn_base[], julong Pm_base[], 9472 // julong inv, int len) { 9473 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 9474 // julong *Pa, *Pb, *Pn, *Pm; 9475 // julong Ra, Rb, Rn, Rm; 9476 9477 // int i; 9478 9479 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 9480 9481 // for (i = 0; i < len; i++) { 9482 // int j; 9483 9484 // Pa = Pa_base; 9485 // Pb = Pb_base + i; 9486 // Pm = Pm_base; 9487 // Pn = Pn_base + i; 9488 9489 // Ra = *Pa; 9490 // Rb = *Pb; 9491 // Rm = *Pm; 9492 // Rn = *Pn; 9493 9494 // int iters = i; 9495 // for (j = 0; iters--; j++) { 9496 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 9497 // MACC(Ra, Rb, t0, t1, t2); 9498 // Ra = *++Pa; 9499 // Rb = *--Pb; 9500 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9501 // MACC(Rm, Rn, t0, t1, t2); 9502 // Rm = *++Pm; 9503 // Rn = *--Pn; 9504 // } 9505 9506 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 9507 // MACC(Ra, Rb, t0, t1, t2); 9508 // *Pm = Rm = t0 * inv; 9509 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 9510 // MACC(Rm, Rn, t0, t1, t2); 9511 9512 // assert(t0 == 0, "broken Montgomery multiply"); 9513 9514 // t0 = t1; t1 = t2; t2 = 0; 9515 // } 9516 9517 // for (i = len; i < 2*len; i++) { 9518 // int j; 9519 9520 // Pa = Pa_base + i-len; 9521 // Pb = Pb_base + len; 9522 // Pm = Pm_base + i-len; 9523 // Pn = Pn_base + len; 9524 9525 // Ra = *++Pa; 9526 // Rb = *--Pb; 9527 // Rm = *++Pm; 9528 // Rn = *--Pn; 9529 9530 // int iters = len*2-i-1; 9531 // for (j = i-len+1; iters--; j++) { 9532 // 
assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 9533 // MACC(Ra, Rb, t0, t1, t2); 9534 // Ra = *++Pa; 9535 // Rb = *--Pb; 9536 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9537 // MACC(Rm, Rn, t0, t1, t2); 9538 // Rm = *++Pm; 9539 // Rn = *--Pn; 9540 // } 9541 9542 // Pm_base[i-len] = t0; 9543 // t0 = t1; t1 = t2; t2 = 0; 9544 // } 9545 9546 // while (t0) 9547 // t0 = sub(Pm_base, Pn_base, t0, len); 9548 // } 9549 9550 /** 9551 * Fast Montgomery squaring. This uses asymptotically 25% fewer 9552 * multiplies than Montgomery multiplication so it should be up to 9553 * 25% faster. However, its loop control is more complex and it 9554 * may actually run slower on some machines. 9555 * 9556 * Arguments: 9557 * 9558 * Inputs: 9559 * c_rarg0 - int array elements a 9560 * c_rarg1 - int array elements n (the modulus) 9561 * c_rarg2 - int length 9562 * c_rarg3 - int inv 9563 * c_rarg4 - int array elements m (the result) 9564 * 9565 */ 9566 address generate_square() { 9567 Label argh; 9568 bind(argh); 9569 stop("MontgomeryMultiply total_allocation must be <= 8192"); 9570 9571 align(CodeEntryAlignment); 9572 address entry = pc(); 9573 9574 enter(); 9575 9576 // Make room. 9577 cmpw(Rlen, 512); 9578 br(Assembler::HI, argh); 9579 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 9580 andr(sp, Ra, -2 * wordSize); 9581 9582 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 9583 9584 { 9585 // Copy input args, reversing as we go. We use Ra as a 9586 // temporary variable. 9587 reverse(Ra, Pa_base, Rlen, t0, t1); 9588 reverse(Ra, Pn_base, Rlen, t0, t1); 9589 } 9590 9591 // Push all call-saved registers and also Pm_base which we'll need 9592 // at the end. 9593 save_regs(); 9594 9595 mov(Pm_base, Ra); 9596 9597 mov(t0, zr); 9598 mov(t1, zr); 9599 mov(t2, zr); 9600 9601 block_comment("for (int i = 0; i < len; i++) {"); 9602 mov(Ri, zr); { 9603 Label loop, end; 9604 bind(loop); 9605 cmp(Ri, Rlen); 9606 br(Assembler::GE, end); 9607 9608 pre1(Ri); 9609 9610 block_comment("for (j = (i+1)/2; j; j--) {"); { 9611 add(Rj, Ri, 1); 9612 lsr(Rj, Rj, 1); 9613 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 9614 } block_comment(" } // j"); 9615 9616 last_squaring(Ri); 9617 9618 block_comment(" for (j = i/2; j; j--) {"); { 9619 lsr(Rj, Ri, 1); 9620 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 9621 } block_comment(" } // j"); 9622 9623 post1_squaring(); 9624 add(Ri, Ri, 1); 9625 cmp(Ri, Rlen); 9626 br(Assembler::LT, loop); 9627 9628 bind(end); 9629 block_comment("} // i"); 9630 } 9631 9632 block_comment("for (int i = len; i < 2*len; i++) {"); 9633 mov(Ri, Rlen); { 9634 Label loop, end; 9635 bind(loop); 9636 cmp(Ri, Rlen, Assembler::LSL, 1); 9637 br(Assembler::GE, end); 9638 9639 pre2(Ri, Rlen); 9640 9641 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 9642 lsl(Rj, Rlen, 1); 9643 sub(Rj, Rj, Ri); 9644 sub(Rj, Rj, 1); 9645 lsr(Rj, Rj, 1); 9646 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 9647 } block_comment(" } // j"); 9648 9649 last_squaring(Ri); 9650 9651 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 9652 lsl(Rj, Rlen, 1); 9653 sub(Rj, Rj, Ri); 9654 lsr(Rj, Rj, 1); 9655 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 9656 } block_comment(" } // j"); 9657 9658 post2(Ri, Rlen); 9659 add(Ri, Ri, 1); 9660 cmp(Ri, Rlen, Assembler::LSL, 1); 9661 9662 br(Assembler::LT, loop); 9663 bind(end); 9664 block_comment("} // i"); 9665 } 9666 9667 normalize(Rlen); 9668 9669 mov(Ra, Pm_base); // Save Pm_base in Ra 9670 
restore_regs(); // Restore caller's Pm_base 9671 9672 // Copy our result into caller's Pm_base 9673 reverse(Pm_base, Ra, Rlen, t0, t1); 9674 9675 leave(); 9676 ret(lr); 9677 9678 return entry; 9679 } 9680 // In C, approximately: 9681 9682 // void 9683 // montgomery_square(julong Pa_base[], julong Pn_base[], 9684 // julong Pm_base[], julong inv, int len) { 9685 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 9686 // julong *Pa, *Pb, *Pn, *Pm; 9687 // julong Ra, Rb, Rn, Rm; 9688 9689 // int i; 9690 9691 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 9692 9693 // for (i = 0; i < len; i++) { 9694 // int j; 9695 9696 // Pa = Pa_base; 9697 // Pb = Pa_base + i; 9698 // Pm = Pm_base; 9699 // Pn = Pn_base + i; 9700 9701 // Ra = *Pa; 9702 // Rb = *Pb; 9703 // Rm = *Pm; 9704 // Rn = *Pn; 9705 9706 // int iters = (i+1)/2; 9707 // for (j = 0; iters--; j++) { 9708 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 9709 // MACC2(Ra, Rb, t0, t1, t2); 9710 // Ra = *++Pa; 9711 // Rb = *--Pb; 9712 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9713 // MACC(Rm, Rn, t0, t1, t2); 9714 // Rm = *++Pm; 9715 // Rn = *--Pn; 9716 // } 9717 // if ((i & 1) == 0) { 9718 // assert(Ra == Pa_base[j], "must be"); 9719 // MACC(Ra, Ra, t0, t1, t2); 9720 // } 9721 // iters = i/2; 9722 // assert(iters == i-j, "must be"); 9723 // for (; iters--; j++) { 9724 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9725 // MACC(Rm, Rn, t0, t1, t2); 9726 // Rm = *++Pm; 9727 // Rn = *--Pn; 9728 // } 9729 9730 // *Pm = Rm = t0 * inv; 9731 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 9732 // MACC(Rm, Rn, t0, t1, t2); 9733 9734 // assert(t0 == 0, "broken Montgomery multiply"); 9735 9736 // t0 = t1; t1 = t2; t2 = 0; 9737 // } 9738 9739 // for (i = len; i < 2*len; i++) { 9740 // int start = i-len+1; 9741 // int end = start + (len - start)/2; 9742 // int j; 9743 9744 // Pa = Pa_base + i-len; 9745 // Pb = Pa_base + len; 9746 // Pm = Pm_base + i-len; 9747 // Pn = Pn_base + len; 9748 9749 // Ra = *++Pa; 9750 // Rb = *--Pb; 9751 // Rm = *++Pm; 9752 // Rn = *--Pn; 9753 9754 // int iters = (2*len-i-1)/2; 9755 // assert(iters == end-start, "must be"); 9756 // for (j = start; iters--; j++) { 9757 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 9758 // MACC2(Ra, Rb, t0, t1, t2); 9759 // Ra = *++Pa; 9760 // Rb = *--Pb; 9761 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9762 // MACC(Rm, Rn, t0, t1, t2); 9763 // Rm = *++Pm; 9764 // Rn = *--Pn; 9765 // } 9766 // if ((i & 1) == 0) { 9767 // assert(Ra == Pa_base[j], "must be"); 9768 // MACC(Ra, Ra, t0, t1, t2); 9769 // } 9770 // iters = (2*len-i)/2; 9771 // assert(iters == len-j, "must be"); 9772 // for (; iters--; j++) { 9773 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9774 // MACC(Rm, Rn, t0, t1, t2); 9775 // Rm = *++Pm; 9776 // Rn = *--Pn; 9777 // } 9778 // Pm_base[i-len] = t0; 9779 // t0 = t1; t1 = t2; t2 = 0; 9780 // } 9781 9782 // while (t0) 9783 // t0 = sub(Pm_base, Pn_base, t0, len); 9784 // } 9785 }; 9786 9787 void generate_vector_math_stubs() { 9788 // Get native vector math stub routine addresses 9789 void* libsleef = nullptr; 9790 char ebuf[1024]; 9791 char dll_name[JVM_MAXPATHLEN]; 9792 if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) { 9793 libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf); 9794 } 9795 if (libsleef == nullptr) { 9796 log_info(library)("Failed to load native vector math library, %s!", ebuf); 
9797 return; 9798 } 9799 // Method naming convention 9800 // All the methods are named as <OP><T><N>_<U><suffix> 9801 // Where: 9802 // <OP> is the operation name, e.g. sin 9803 // <T> is optional to indicate float/double 9804 // "f/d" for vector float/double operation 9805 // <N> is the number of elements in the vector 9806 // "2/4" for neon, and "x" for sve 9807 // <U> is the precision level 9808 // "u10/u05" represents 1.0/0.5 ULP error bounds 9809 // We use "u10" for all operations by default 9810 // But for those functions do not have u10 support, we use "u05" instead 9811 // <suffix> indicates neon/sve 9812 // "sve/advsimd" for sve/neon implementations 9813 // e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions 9814 // cosd2_u10advsimd is the method for computing 2 elements vector double cos using NEON instructions 9815 // 9816 log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef)); 9817 9818 // Math vector stubs implemented with SVE for scalable vector size. 9819 if (UseSVE > 0) { 9820 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 9821 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 9822 // Skip "tanh" because there is performance regression 9823 if (vop == VectorSupport::VECTOR_OP_TANH) { 9824 continue; 9825 } 9826 9827 // The native library does not support u10 level of "hypot". 9828 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 9829 9830 snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf); 9831 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 9832 9833 snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf); 9834 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 9835 } 9836 } 9837 9838 // Math vector stubs implemented with NEON for 64/128 bits vector size. 9839 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 9840 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 9841 // Skip "tanh" because there is performance regression 9842 if (vop == VectorSupport::VECTOR_OP_TANH) { 9843 continue; 9844 } 9845 9846 // The native library does not support u10 level of "hypot". 9847 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 9848 9849 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 9850 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf); 9851 9852 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 9853 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 9854 9855 snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf); 9856 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 9857 } 9858 } 9859 9860 // Call here from the interpreter or compiled code to either load 9861 // multiple returned values from the inline type instance being 9862 // returned to registers or to store returned values to a newly 9863 // allocated inline type instance. 9864 address generate_return_value_stub(address destination, const char* name, bool has_res) { 9865 // We need to save all registers the calling convention may use so 9866 // the runtime calls read or update those registers. 
This needs to 9867 // be in sync with SharedRuntime::java_return_convention(). 9868 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 9869 enum layout { 9870 j_rarg7_off = 0, j_rarg7_2, // j_rarg7 is r0 9871 j_rarg6_off, j_rarg6_2, 9872 j_rarg5_off, j_rarg5_2, 9873 j_rarg4_off, j_rarg4_2, 9874 j_rarg3_off, j_rarg3_2, 9875 j_rarg2_off, j_rarg2_2, 9876 j_rarg1_off, j_rarg1_2, 9877 j_rarg0_off, j_rarg0_2, 9878 9879 j_farg7_off, j_farg7_2, 9880 j_farg6_off, j_farg6_2, 9881 j_farg5_off, j_farg5_2, 9882 j_farg4_off, j_farg4_2, 9883 j_farg3_off, j_farg3_2, 9884 j_farg2_off, j_farg2_2, 9885 j_farg1_off, j_farg1_2, 9886 j_farg0_off, j_farg0_2, 9887 9888 rfp_off, rfp_off2, 9889 return_off, return_off2, 9890 9891 framesize // inclusive of return address 9892 }; 9893 9894 CodeBuffer code(name, 512, 64); 9895 MacroAssembler* masm = new MacroAssembler(&code); 9896 9897 int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16); 9898 assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned"); 9899 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt; 9900 int frame_size_in_words = frame_size_in_bytes / wordSize; 9901 9902 OopMapSet* oop_maps = new OopMapSet(); 9903 OopMap* map = new OopMap(frame_size_in_slots, 0); 9904 9905 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg()); 9906 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg()); 9907 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg()); 9908 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg()); 9909 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg()); 9910 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg()); 9911 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg()); 9912 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg()); 9913 9914 map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg()); 9915 map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg()); 9916 map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg()); 9917 map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg()); 9918 map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg()); 9919 map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg()); 9920 map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg()); 9921 map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg()); 9922 9923 address start = __ pc(); 9924 9925 __ enter(); // Save FP and LR before call 9926 9927 __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize))); 9928 __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize))); 9929 __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize))); 9930 __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize))); 9931 9932 __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize))); 9933 __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize))); 9934 __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize))); 9935 __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize))); 9936 9937 int frame_complete = __ offset(); 9938 9939 // Set up last_Java_sp and last_Java_fp 9940 address the_pc = __ pc(); 9941 __ set_last_Java_frame(sp, noreg, the_pc, rscratch1); 9942 9943 // Call runtime 9944 __ mov(c_rarg1, r0); 9945 __ mov(c_rarg0, rthread); 9946 9947 __ mov(rscratch1, destination); 
    __ blr(rscratch1);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(false);

    __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));

    __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));

    __ leave();

    // check for pending exceptions
    Label pending;
    __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
    __ cbnz(rscratch1, pending);

    if (has_res) {
      __ get_vm_result(r0, rthread);
    }

    __ ret(lr);

    __ bind(pending);
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

    // -------------
    // make sure all code is generated
    masm->flush();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
    return stub->entry_point();
  }

  // Initialization

  void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points.

    // Entry points that exist on all platforms. Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Initialize table for copy memory (arraycopy) check.
    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    if (UseCRC32Intrinsics) {
      // set table address before stub generation which uses it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }

    if (InlineTypeReturnedAsFields) {
      StubRoutines::_load_inline_type_fields_in_regs =
        generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
      StubRoutines::_store_inline_type_fields_to_buf =
        generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
    }
  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw             = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub     = generate_cont_preempt_stub();
  }

  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != nullptr) {
      StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
    }

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
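      // Illustrative note (inferred from the flag names, not specified here):
      // when InlineSecondarySupersTest is set, C2 is expected to emit the
      // secondary-supers hash probe inline and only needs the slow-path stub
      // generated above as a fallback; otherwise the full table-lookup stub
      // is generated below as well.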
      if (!InlineSecondarySupersTest) {
        generate_lookup_secondary_supers_table_stub();
      }
    }
#endif

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // arrays_hashcode stub for large arrays.
    StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
    StubRoutines::aarch64::_large_arrays_hashcode_byte    = generate_large_arrays_hashcode(T_BYTE);
    StubRoutines::aarch64::_large_arrays_hashcode_char    = generate_large_arrays_hashcode(T_CHAR);
    StubRoutines::aarch64::_large_arrays_hashcode_int     = generate_large_arrays_hashcode(T_INT);
    StubRoutines::aarch64::_large_arrays_hashcode_short   = generate_large_arrays_hashcode(T_SHORT);

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
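
    // Descriptive note: unlike the generate_* helpers above, the two Montgomery
    // blocks emit their code through MontgomeryMultiplyGenerator; the
    // StubCodeMark in each block records the stub's name and boundaries in the
    // code buffer so the stub shows up in stub printing and debugging output.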

    generate_vector_math_stubs();

#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_qrpar();
    }

    if (UseDilithiumIntrinsics) {
      StubRoutines::_dilithiumAlmostNtt         = generate_dilithiumAlmostNtt();
      StubRoutines::_dilithiumAlmostInverseNtt  = generate_dilithiumAlmostInverseNtt();
      StubRoutines::_dilithiumNttMult           = generate_dilithiumNttMult();
      StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
      StubRoutines::_dilithiumDecomposePoly     = generate_dilithiumDecomposePoly();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback      = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress   = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress   = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
      StubRoutines::_double_keccak       = generate_double_keccak();
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
    }

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
    switch (blob_id) {
    case initial_id:
      generate_initial_stubs();
      break;
    case continuation_id:
      generate_continuation_stubs();
      break;
    case compiler_id:
      generate_compiler_stubs();
      break;
    case final_id:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected blob id: %d", blob_id);
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
  StubGenerator g(code, blob_id);
}

#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                          \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                                  \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl       \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX
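
// Note (illustrative): a single expansion of the DEFAULT_ATOMIC_OP macro above,
// e.g. DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed), produces roughly
//
//   extern "C" uint64_t aarch64_atomic_cmpxchg_4_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_relaxed_impl
//     = aarch64_atomic_cmpxchg_4_relaxed_default_impl;
//
// i.e. each stub pointer initially refers to the hand-written default in
// atomic_aarch64.S and may later be repointed to generated code by
// generate_atomic_entry_points().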