1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "asm/register.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "code/SCCache.hpp" 31 #include "compiler/oopMap.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/barrierSetAssembler.hpp" 34 #include "gc/shared/gc_globals.hpp" 35 #include "gc/shared/tlab_globals.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "memory/universe.hpp" 38 #include "nativeInst_aarch64.hpp" 39 #include "oops/instanceOop.hpp" 40 #include "oops/method.hpp" 41 #include "oops/objArrayKlass.hpp" 42 #include "oops/oop.inline.hpp" 43 #include "prims/methodHandles.hpp" 44 #include "prims/upcallLinker.hpp" 45 #include "runtime/arguments.hpp" 46 #include "runtime/atomic.hpp" 47 #include "runtime/continuation.hpp" 48 #include "runtime/continuationEntry.inline.hpp" 49 #include "runtime/frame.inline.hpp" 50 #include "runtime/handles.inline.hpp" 51 #include "runtime/javaThread.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/stubCodeGenerator.hpp" 54 #include "runtime/stubRoutines.hpp" 55 #include "utilities/align.hpp" 56 #include "utilities/checkedCast.hpp" 57 #include "utilities/debug.hpp" 58 #include "utilities/globalDefinitions.hpp" 59 #include "utilities/intpow.hpp" 60 #include "utilities/powerOfTwo.hpp" 61 #ifdef COMPILER2 62 #include "opto/runtime.hpp" 63 #endif 64 #if INCLUDE_ZGC 65 #include "gc/z/zThreadLocalData.hpp" 66 #endif 67 68 // Declaration and definition of StubGenerator (no .hpp file). 
69 // For a more detailed description of the stub routine structure 70 // see the comment in stubRoutines.hpp 71 72 #undef __ 73 #define __ _masm-> 74 75 #ifdef PRODUCT 76 #define BLOCK_COMMENT(str) /* nothing */ 77 #else 78 #define BLOCK_COMMENT(str) __ block_comment(str) 79 #endif 80 81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 82 83 // Stub Code definitions 84 85 class StubGenerator: public StubCodeGenerator { 86 private: 87 88 #ifdef PRODUCT 89 #define inc_counter_np(counter) ((void)0) 90 #else 91 void inc_counter_np_(uint& counter) { 92 __ incrementw(ExternalAddress((address)&counter)); 93 } 94 #define inc_counter_np(counter) \ 95 BLOCK_COMMENT("inc_counter " #counter); \ 96 inc_counter_np_(counter); 97 #endif 98 99 // Call stubs are used to call Java from C 100 // 101 // Arguments: 102 // c_rarg0: call wrapper address address 103 // c_rarg1: result address 104 // c_rarg2: result type BasicType 105 // c_rarg3: method Method* 106 // c_rarg4: (interpreter) entry point address 107 // c_rarg5: parameters intptr_t* 108 // c_rarg6: parameter size (in words) int 109 // c_rarg7: thread Thread* 110 // 111 // There is no return from the stub itself as any Java result 112 // is written to result 113 // 114 // we save r30 (lr) as the return PC at the base of the frame and 115 // link r29 (fp) below it as the frame pointer installing sp (r31) 116 // into fp. 117 // 118 // we save r0-r7, which accounts for all the c arguments. 119 // 120 // TODO: strictly do we need to save them all? they are treated as 121 // volatile by C so could we omit saving the ones we are going to 122 // place in global registers (thread? method?) or those we only use 123 // during setup of the Java call? 124 // 125 // we don't need to save r8 which C uses as an indirect result location 126 // return register. 127 // 128 // we don't need to save r9-r15 which both C and Java treat as 129 // volatile 130 // 131 // we don't need to save r16-18 because Java does not use them 132 // 133 // we save r19-r28 which Java uses as scratch registers and C 134 // expects to be callee-save 135 // 136 // we save the bottom 64 bits of each value stored in v8-v15; it is 137 // the responsibility of the caller to preserve larger values. 138 // 139 // so the stub frame looks like this when we enter Java code 140 // 141 // [ return_from_Java ] <--- sp 142 // [ argument word n ] 143 // ... 
144 // -29 [ argument word 1 ] 145 // -28 [ saved Floating-point Control Register ] 146 // -26 [ saved v15 ] <--- sp_after_call 147 // -25 [ saved v14 ] 148 // -24 [ saved v13 ] 149 // -23 [ saved v12 ] 150 // -22 [ saved v11 ] 151 // -21 [ saved v10 ] 152 // -20 [ saved v9 ] 153 // -19 [ saved v8 ] 154 // -18 [ saved r28 ] 155 // -17 [ saved r27 ] 156 // -16 [ saved r26 ] 157 // -15 [ saved r25 ] 158 // -14 [ saved r24 ] 159 // -13 [ saved r23 ] 160 // -12 [ saved r22 ] 161 // -11 [ saved r21 ] 162 // -10 [ saved r20 ] 163 // -9 [ saved r19 ] 164 // -8 [ call wrapper (r0) ] 165 // -7 [ result (r1) ] 166 // -6 [ result type (r2) ] 167 // -5 [ method (r3) ] 168 // -4 [ entry point (r4) ] 169 // -3 [ parameters (r5) ] 170 // -2 [ parameter size (r6) ] 171 // -1 [ thread (r7) ] 172 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 173 // 1 [ saved lr (r30) ] 174 175 // Call stub stack layout word offsets from fp 176 enum call_stub_layout { 177 sp_after_call_off = -28, 178 179 fpcr_off = sp_after_call_off, 180 d15_off = -26, 181 d13_off = -24, 182 d11_off = -22, 183 d9_off = -20, 184 185 r28_off = -18, 186 r26_off = -16, 187 r24_off = -14, 188 r22_off = -12, 189 r20_off = -10, 190 call_wrapper_off = -8, 191 result_off = -7, 192 result_type_off = -6, 193 method_off = -5, 194 entry_point_off = -4, 195 parameter_size_off = -2, 196 thread_off = -1, 197 fp_f = 0, 198 retaddr_off = 1, 199 }; 200 201 address generate_call_stub(address& return_address) { 202 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 203 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 204 "adjust this code"); 205 206 StubGenStubId stub_id = StubGenStubId::call_stub_id; 207 StubCodeMark mark(this, stub_id); 208 address start = __ pc(); 209 210 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 211 212 const Address fpcr_save (rfp, fpcr_off * wordSize); 213 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 214 const Address result (rfp, result_off * wordSize); 215 const Address result_type (rfp, result_type_off * wordSize); 216 const Address method (rfp, method_off * wordSize); 217 const Address entry_point (rfp, entry_point_off * wordSize); 218 const Address parameter_size(rfp, parameter_size_off * wordSize); 219 220 const Address thread (rfp, thread_off * wordSize); 221 222 const Address d15_save (rfp, d15_off * wordSize); 223 const Address d13_save (rfp, d13_off * wordSize); 224 const Address d11_save (rfp, d11_off * wordSize); 225 const Address d9_save (rfp, d9_off * wordSize); 226 227 const Address r28_save (rfp, r28_off * wordSize); 228 const Address r26_save (rfp, r26_off * wordSize); 229 const Address r24_save (rfp, r24_off * wordSize); 230 const Address r22_save (rfp, r22_off * wordSize); 231 const Address r20_save (rfp, r20_off * wordSize); 232 233 // stub code 234 235 address aarch64_entry = __ pc(); 236 237 // set up frame and move sp to end of save area 238 __ enter(); 239 __ sub(sp, rfp, -sp_after_call_off * wordSize); 240 241 // save register parameters and Java scratch/global registers 242 // n.b. 
we save thread even though it gets installed in 243 // rthread because we want to sanity check rthread later 244 __ str(c_rarg7, thread); 245 __ strw(c_rarg6, parameter_size); 246 __ stp(c_rarg4, c_rarg5, entry_point); 247 __ stp(c_rarg2, c_rarg3, result_type); 248 __ stp(c_rarg0, c_rarg1, call_wrapper); 249 250 __ stp(r20, r19, r20_save); 251 __ stp(r22, r21, r22_save); 252 __ stp(r24, r23, r24_save); 253 __ stp(r26, r25, r26_save); 254 __ stp(r28, r27, r28_save); 255 256 __ stpd(v9, v8, d9_save); 257 __ stpd(v11, v10, d11_save); 258 __ stpd(v13, v12, d13_save); 259 __ stpd(v15, v14, d15_save); 260 261 __ get_fpcr(rscratch1); 262 __ str(rscratch1, fpcr_save); 263 // Set FPCR to the state we need. We do want Round to Nearest. We 264 // don't want non-IEEE rounding modes or floating-point traps. 265 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode 266 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12) 267 __ set_fpcr(rscratch1); 268 269 // install Java thread in global register now we have saved 270 // whatever value it held 271 __ mov(rthread, c_rarg7); 272 // And method 273 __ mov(rmethod, c_rarg3); 274 275 // set up the heapbase register 276 __ reinit_heapbase(); 277 278 #ifdef ASSERT 279 // make sure we have no pending exceptions 280 { 281 Label L; 282 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 283 __ cmp(rscratch1, (u1)NULL_WORD); 284 __ br(Assembler::EQ, L); 285 __ stop("StubRoutines::call_stub: entered with pending exception"); 286 __ BIND(L); 287 } 288 #endif 289 // pass parameters if any 290 __ mov(esp, sp); 291 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 292 __ andr(sp, rscratch1, -2 * wordSize); 293 294 BLOCK_COMMENT("pass parameters if any"); 295 Label parameters_done; 296 // parameter count is still in c_rarg6 297 // and parameter pointer identifying param 1 is in c_rarg5 298 __ cbzw(c_rarg6, parameters_done); 299 300 address loop = __ pc(); 301 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 302 __ subsw(c_rarg6, c_rarg6, 1); 303 __ push(rscratch1); 304 __ br(Assembler::GT, loop); 305 306 __ BIND(parameters_done); 307 308 // call Java entry -- passing methdoOop, and current sp 309 // rmethod: Method* 310 // r19_sender_sp: sender sp 311 BLOCK_COMMENT("call Java function"); 312 __ mov(r19_sender_sp, sp); 313 __ blr(c_rarg4); 314 315 // we do this here because the notify will already have been done 316 // if we get to the next instruction via an exception 317 // 318 // n.b. adding this instruction here affects the calculation of 319 // whether or not a routine returns to the call stub (used when 320 // doing stack walks) since the normal test is to check the return 321 // pc against the address saved below. so we may need to allow for 322 // this extra instruction in the check. 323 324 // save current address for use by exception handling code 325 326 return_address = __ pc(); 327 328 // store result depending on type (everything that is not 329 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 330 // n.b. 
this assumes Java returns an integral result in r0 331 // and a floating result in j_farg0 332 __ ldr(j_rarg2, result); 333 Label is_long, is_float, is_double, exit; 334 __ ldr(j_rarg1, result_type); 335 __ cmp(j_rarg1, (u1)T_OBJECT); 336 __ br(Assembler::EQ, is_long); 337 __ cmp(j_rarg1, (u1)T_LONG); 338 __ br(Assembler::EQ, is_long); 339 __ cmp(j_rarg1, (u1)T_FLOAT); 340 __ br(Assembler::EQ, is_float); 341 __ cmp(j_rarg1, (u1)T_DOUBLE); 342 __ br(Assembler::EQ, is_double); 343 344 // handle T_INT case 345 __ strw(r0, Address(j_rarg2)); 346 347 __ BIND(exit); 348 349 // pop parameters 350 __ sub(esp, rfp, -sp_after_call_off * wordSize); 351 352 #ifdef ASSERT 353 // verify that threads correspond 354 { 355 Label L, S; 356 __ ldr(rscratch1, thread); 357 __ cmp(rthread, rscratch1); 358 __ br(Assembler::NE, S); 359 __ get_thread(rscratch1); 360 __ cmp(rthread, rscratch1); 361 __ br(Assembler::EQ, L); 362 __ BIND(S); 363 __ stop("StubRoutines::call_stub: threads must correspond"); 364 __ BIND(L); 365 } 366 #endif 367 368 __ pop_cont_fastpath(rthread); 369 370 // restore callee-save registers 371 __ ldpd(v15, v14, d15_save); 372 __ ldpd(v13, v12, d13_save); 373 __ ldpd(v11, v10, d11_save); 374 __ ldpd(v9, v8, d9_save); 375 376 __ ldp(r28, r27, r28_save); 377 __ ldp(r26, r25, r26_save); 378 __ ldp(r24, r23, r24_save); 379 __ ldp(r22, r21, r22_save); 380 __ ldp(r20, r19, r20_save); 381 382 // restore fpcr 383 __ ldr(rscratch1, fpcr_save); 384 __ set_fpcr(rscratch1); 385 386 __ ldp(c_rarg0, c_rarg1, call_wrapper); 387 __ ldrw(c_rarg2, result_type); 388 __ ldr(c_rarg3, method); 389 __ ldp(c_rarg4, c_rarg5, entry_point); 390 __ ldp(c_rarg6, c_rarg7, parameter_size); 391 392 // leave frame and return to caller 393 __ leave(); 394 __ ret(lr); 395 396 // handle return types different from T_INT 397 398 __ BIND(is_long); 399 __ str(r0, Address(j_rarg2, 0)); 400 __ br(Assembler::AL, exit); 401 402 __ BIND(is_float); 403 __ strs(j_farg0, Address(j_rarg2, 0)); 404 __ br(Assembler::AL, exit); 405 406 __ BIND(is_double); 407 __ strd(j_farg0, Address(j_rarg2, 0)); 408 __ br(Assembler::AL, exit); 409 410 return start; 411 } 412 413 // Return point for a Java call if there's an exception thrown in 414 // Java code. The exception is caught and transformed into a 415 // pending exception stored in JavaThread that can be tested from 416 // within the VM. 417 // 418 // Note: Usually the parameters are removed by the callee. In case 419 // of an exception crossing an activation frame boundary, that is 420 // not the case if the callee is compiled code => need to setup the 421 // rsp. 
422 // 423 // r0: exception oop 424 425 address generate_catch_exception() { 426 StubGenStubId stub_id = StubGenStubId::catch_exception_id; 427 StubCodeMark mark(this, stub_id); 428 address start = __ pc(); 429 430 // same as in generate_call_stub(): 431 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 432 const Address thread (rfp, thread_off * wordSize); 433 434 #ifdef ASSERT 435 // verify that threads correspond 436 { 437 Label L, S; 438 __ ldr(rscratch1, thread); 439 __ cmp(rthread, rscratch1); 440 __ br(Assembler::NE, S); 441 __ get_thread(rscratch1); 442 __ cmp(rthread, rscratch1); 443 __ br(Assembler::EQ, L); 444 __ bind(S); 445 __ stop("StubRoutines::catch_exception: threads must correspond"); 446 __ bind(L); 447 } 448 #endif 449 450 // set pending exception 451 __ verify_oop(r0); 452 453 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 454 __ mov(rscratch1, (address)__FILE__); 455 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 456 __ movw(rscratch1, (int)__LINE__); 457 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 458 459 // complete return to VM 460 assert(StubRoutines::_call_stub_return_address != nullptr, 461 "_call_stub_return_address must have been generated before"); 462 __ b(StubRoutines::_call_stub_return_address); 463 464 return start; 465 } 466 467 // Continuation point for runtime calls returning with a pending 468 // exception. The pending exception check happened in the runtime 469 // or native call stub. The pending exception in Thread is 470 // converted into a Java-level exception. 471 // 472 // Contract with Java-level exception handlers: 473 // r0: exception 474 // r3: throwing pc 475 // 476 // NOTE: At entry of this stub, exception-pc must be in LR !! 477 478 // NOTE: this is always used as a jump target within generated code 479 // so it just needs to be generated code with no x86 prolog 480 481 address generate_forward_exception() { 482 StubGenStubId stub_id = StubGenStubId::forward_exception_id; 483 StubCodeMark mark(this, stub_id); 484 address start = __ pc(); 485 486 // Upon entry, LR points to the return address returning into 487 // Java (interpreted or compiled) code; i.e., the return address 488 // becomes the throwing pc. 489 // 490 // Arguments pushed before the runtime call are still on the stack 491 // but the exception handler will reset the stack pointer -> 492 // ignore them. A potential result in registers can be ignored as 493 // well. 494 495 #ifdef ASSERT 496 // make sure this code is only executed if there is a pending exception 497 { 498 Label L; 499 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 500 __ cbnz(rscratch1, L); 501 __ stop("StubRoutines::forward exception: no pending exception (1)"); 502 __ bind(L); 503 } 504 #endif 505 506 // compute exception handler into r19 507 508 // call the VM to find the handler address associated with the 509 // caller address. pass thread in r0 and caller pc (ret address) 510 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 511 // the stack. 512 __ mov(c_rarg1, lr); 513 // lr will be trashed by the VM call so we move it to R19 514 // (callee-saved) because we also need to pass it to the handler 515 // returned by this call. 
516 __ mov(r19, lr); 517 BLOCK_COMMENT("call exception_handler_for_return_address"); 518 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 519 SharedRuntime::exception_handler_for_return_address), 520 rthread, c_rarg1); 521 // Reinitialize the ptrue predicate register, in case the external runtime 522 // call clobbers ptrue reg, as we may return to SVE compiled code. 523 __ reinitialize_ptrue(); 524 525 // we should not really care that lr is no longer the callee 526 // address. we saved the value the handler needs in r19 so we can 527 // just copy it to r3. however, the C2 handler will push its own 528 // frame and then calls into the VM and the VM code asserts that 529 // the PC for the frame above the handler belongs to a compiled 530 // Java method. So, we restore lr here to satisfy that assert. 531 __ mov(lr, r19); 532 // setup r0 & r3 & clear pending exception 533 __ mov(r3, r19); 534 __ mov(r19, r0); 535 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 536 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 537 538 #ifdef ASSERT 539 // make sure exception is set 540 { 541 Label L; 542 __ cbnz(r0, L); 543 __ stop("StubRoutines::forward exception: no pending exception (2)"); 544 __ bind(L); 545 } 546 #endif 547 548 // continue at exception handler 549 // r0: exception 550 // r3: throwing pc 551 // r19: exception handler 552 __ verify_oop(r0); 553 __ br(r19); 554 555 return start; 556 } 557 558 // Non-destructive plausibility checks for oops 559 // 560 // Arguments: 561 // r0: oop to verify 562 // rscratch1: error message 563 // 564 // Stack after saving c_rarg3: 565 // [tos + 0]: saved c_rarg3 566 // [tos + 1]: saved c_rarg2 567 // [tos + 2]: saved lr 568 // [tos + 3]: saved rscratch2 569 // [tos + 4]: saved r0 570 // [tos + 5]: saved rscratch1 571 address generate_verify_oop() { 572 StubGenStubId stub_id = StubGenStubId::verify_oop_id; 573 StubCodeMark mark(this, stub_id); 574 address start = __ pc(); 575 576 Label exit, error; 577 578 // save c_rarg2 and c_rarg3 579 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 580 581 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 582 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 583 __ ldr(c_rarg3, Address(c_rarg2)); 584 __ add(c_rarg3, c_rarg3, 1); 585 __ str(c_rarg3, Address(c_rarg2)); 586 587 // object is in r0 588 // make sure object is 'reasonable' 589 __ cbz(r0, exit); // if obj is null it is OK 590 591 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 592 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 593 594 // return if everything seems ok 595 __ bind(exit); 596 597 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 598 __ ret(lr); 599 600 // handle errors 601 __ bind(error); 602 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 603 604 __ push(RegSet::range(r0, r29), sp); 605 // debug(char* msg, int64_t pc, int64_t regs[]) 606 __ mov(c_rarg0, rscratch1); // pass address of error message 607 __ mov(c_rarg1, lr); // pass return address 608 __ mov(c_rarg2, sp); // pass address of regs on stack 609 #ifndef PRODUCT 610 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 611 #endif 612 BLOCK_COMMENT("call MacroAssembler::debug"); 613 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 614 __ blr(rscratch1); 615 __ hlt(0); 616 617 return start; 618 } 619 620 // Generate indices for iota vector. 
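  // Each 16-byte row emitted below holds the ascending per-lane indices
  // 0, 1, 2, ... for one element size (B/H/S/D), followed by single- and
  // double-precision floating-point variants of the same sequence.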
621 address generate_iota_indices(StubGenStubId stub_id) { 622 __ align(CodeEntryAlignment); 623 StubCodeMark mark(this, stub_id); 624 address start = __ pc(); 625 // B 626 __ emit_data64(0x0706050403020100, relocInfo::none); 627 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 628 // H 629 __ emit_data64(0x0003000200010000, relocInfo::none); 630 __ emit_data64(0x0007000600050004, relocInfo::none); 631 // S 632 __ emit_data64(0x0000000100000000, relocInfo::none); 633 __ emit_data64(0x0000000300000002, relocInfo::none); 634 // D 635 __ emit_data64(0x0000000000000000, relocInfo::none); 636 __ emit_data64(0x0000000000000001, relocInfo::none); 637 // S - FP 638 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 639 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 640 // D - FP 641 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 642 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 643 return start; 644 } 645 646 // The inner part of zero_words(). This is the bulk operation, 647 // zeroing words in blocks, possibly using DC ZVA to do it. The 648 // caller is responsible for zeroing the last few words. 649 // 650 // Inputs: 651 // r10: the HeapWord-aligned base address of an array to zero. 652 // r11: the count in HeapWords, r11 > 0. 653 // 654 // Returns r10 and r11, adjusted for the caller to clear. 655 // r10: the base address of the tail of words left to clear. 656 // r11: the number of words in the tail. 657 // r11 < MacroAssembler::zero_words_block_size. 658 659 address generate_zero_blocks() { 660 Label done; 661 Label base_aligned; 662 663 Register base = r10, cnt = r11; 664 665 __ align(CodeEntryAlignment); 666 StubGenStubId stub_id = StubGenStubId::zero_blocks_id; 667 StubCodeMark mark(this, stub_id); 668 address start = __ pc(); 669 670 if (UseBlockZeroing) { 671 int zva_length = VM_Version::zva_length(); 672 673 // Ensure ZVA length can be divided by 16. This is required by 674 // the subsequent operations. 675 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 676 677 __ tbz(base, 3, base_aligned); 678 __ str(zr, Address(__ post(base, 8))); 679 __ sub(cnt, cnt, 1); 680 __ bind(base_aligned); 681 682 // Ensure count >= zva_length * 2 so that it still deserves a zva after 683 // alignment. 684 Label small; 685 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 686 __ subs(rscratch1, cnt, low_limit >> 3); 687 __ br(Assembler::LT, small); 688 __ zero_dcache_blocks(base, cnt); 689 __ bind(small); 690 } 691 692 { 693 // Number of stp instructions we'll unroll 694 const int unroll = 695 MacroAssembler::zero_words_block_size / 2; 696 // Clear the remaining blocks. 697 Label loop; 698 __ subs(cnt, cnt, unroll * 2); 699 __ br(Assembler::LT, done); 700 __ bind(loop); 701 for (int i = 0; i < unroll; i++) 702 __ stp(zr, zr, __ post(base, 16)); 703 __ subs(cnt, cnt, unroll * 2); 704 __ br(Assembler::GE, loop); 705 __ bind(done); 706 __ add(cnt, cnt, unroll * 2); 707 } 708 709 __ ret(lr); 710 711 return start; 712 } 713 714 715 typedef enum { 716 copy_forwards = 1, 717 copy_backwards = -1 718 } copy_direction; 719 720 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 721 // for arraycopy stubs. 
722 class ArrayCopyBarrierSetHelper : StackObj { 723 BarrierSetAssembler* _bs_asm; 724 MacroAssembler* _masm; 725 DecoratorSet _decorators; 726 BasicType _type; 727 Register _gct1; 728 Register _gct2; 729 Register _gct3; 730 FloatRegister _gcvt1; 731 FloatRegister _gcvt2; 732 FloatRegister _gcvt3; 733 734 public: 735 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 736 DecoratorSet decorators, 737 BasicType type, 738 Register gct1, 739 Register gct2, 740 Register gct3, 741 FloatRegister gcvt1, 742 FloatRegister gcvt2, 743 FloatRegister gcvt3) 744 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 745 _masm(masm), 746 _decorators(decorators), 747 _type(type), 748 _gct1(gct1), 749 _gct2(gct2), 750 _gct3(gct3), 751 _gcvt1(gcvt1), 752 _gcvt2(gcvt2), 753 _gcvt3(gcvt3) { 754 } 755 756 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 757 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 758 dst1, dst2, src, 759 _gct1, _gct2, _gcvt1); 760 } 761 762 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 763 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 764 dst, src1, src2, 765 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 766 } 767 768 void copy_load_at_16(Register dst1, Register dst2, Address src) { 769 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 770 dst1, dst2, src, 771 _gct1); 772 } 773 774 void copy_store_at_16(Address dst, Register src1, Register src2) { 775 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 776 dst, src1, src2, 777 _gct1, _gct2, _gct3); 778 } 779 780 void copy_load_at_8(Register dst, Address src) { 781 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 782 dst, noreg, src, 783 _gct1); 784 } 785 786 void copy_store_at_8(Address dst, Register src) { 787 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 788 dst, src, noreg, 789 _gct1, _gct2, _gct3); 790 } 791 }; 792 793 // Bulk copy of blocks of 8 words. 794 // 795 // count is a count of words. 796 // 797 // Precondition: count >= 8 798 // 799 // Postconditions: 800 // 801 // The least significant bit of count contains the remaining count 802 // of words to copy. The rest of count is trash. 803 // 804 // s and d are adjusted to point to the remaining words to copy 805 // 806 void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) { 807 BasicType type; 808 copy_direction direction; 809 810 switch (stub_id) { 811 case copy_byte_f_id: 812 direction = copy_forwards; 813 type = T_BYTE; 814 break; 815 case copy_byte_b_id: 816 direction = copy_backwards; 817 type = T_BYTE; 818 break; 819 case copy_oop_f_id: 820 direction = copy_forwards; 821 type = T_OBJECT; 822 break; 823 case copy_oop_b_id: 824 direction = copy_backwards; 825 type = T_OBJECT; 826 break; 827 case copy_oop_uninit_f_id: 828 direction = copy_forwards; 829 type = T_OBJECT; 830 break; 831 case copy_oop_uninit_b_id: 832 direction = copy_backwards; 833 type = T_OBJECT; 834 break; 835 default: 836 ShouldNotReachHere(); 837 } 838 839 int unit = wordSize * direction; 840 int bias = (UseSIMDForMemoryOps ? 
               4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
                   t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ?
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 907 908 if (UseSIMDForMemoryOps) { 909 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 910 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 911 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 912 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 913 } else { 914 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 915 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 916 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 917 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 918 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 919 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 920 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 921 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 922 } 923 924 __ subs(count, count, 8); 925 __ br(Assembler::HS, again); 926 927 // Drain 928 __ bind(drain); 929 if (UseSIMDForMemoryOps) { 930 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 931 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 932 } else { 933 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 934 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 935 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 936 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 937 } 938 939 { 940 Label L1, L2; 941 __ tbz(count, exact_log2(4), L1); 942 if (UseSIMDForMemoryOps) { 943 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit))); 944 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1); 945 } else { 946 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 947 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 948 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 949 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3); 950 } 951 __ bind(L1); 952 953 if (direction == copy_forwards) { 954 __ add(s, s, bias); 955 __ add(d, d, bias); 956 } 957 958 __ tbz(count, 1, L2); 959 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 960 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1); 961 __ bind(L2); 962 } 963 964 __ ret(lr); 965 966 if (AvoidUnalignedAccesses) { 967 Label drain, again; 968 // Register order for storing. Order is different for backward copy. 969 970 __ bind(unaligned_copy_long); 971 972 // source address is even aligned, target odd aligned 973 // 974 // when forward copying word pairs we read long pairs at offsets 975 // {0, 2, 4, 6} (in long words). when backwards copying we read 976 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 977 // address by -2 in the forwards case so we can compute the 978 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 979 // or -1. 980 // 981 // when forward copying we need to store 1 word, 3 pairs and 982 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 983 // zero offset We adjust the destination by -1 which means we 984 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 985 // 986 // When backwards copyng we need to store 1 word, 3 pairs and 987 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 988 // offsets {1, 3, 5, 7, 8} * unit. 
989 990 if (direction == copy_forwards) { 991 __ sub(s, s, 16); 992 __ sub(d, d, 8); 993 } 994 995 // Fill 8 registers 996 // 997 // for forwards copy s was offset by -16 from the original input 998 // value of s so the register contents are at these offsets 999 // relative to the 64 bit block addressed by that original input 1000 // and so on for each successive 64 byte block when s is updated 1001 // 1002 // t0 at offset 0, t1 at offset 8 1003 // t2 at offset 16, t3 at offset 24 1004 // t4 at offset 32, t5 at offset 40 1005 // t6 at offset 48, t7 at offset 56 1006 1007 // for backwards copy s was not offset so the register contents 1008 // are at these offsets into the preceding 64 byte block 1009 // relative to that original input and so on for each successive 1010 // preceding 64 byte block when s is updated. this explains the 1011 // slightly counter-intuitive looking pattern of register usage 1012 // in the stp instructions for backwards copy. 1013 // 1014 // t0 at offset -16, t1 at offset -8 1015 // t2 at offset -32, t3 at offset -24 1016 // t4 at offset -48, t5 at offset -40 1017 // t6 at offset -64, t7 at offset -56 1018 1019 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1020 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1021 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1022 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1023 1024 __ subs(count, count, 16); 1025 __ br(Assembler::LO, drain); 1026 1027 int prefetch = PrefetchCopyIntervalInBytes; 1028 bool use_stride = false; 1029 if (direction == copy_backwards) { 1030 use_stride = prefetch > 256; 1031 prefetch = -prefetch; 1032 if (use_stride) __ mov(stride, prefetch); 1033 } 1034 1035 __ bind(again); 1036 1037 if (PrefetchCopyIntervalInBytes > 0) 1038 __ prfm(use_stride ? 
              Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do. It does a lot of jumping, resulting in several
    // mispredicted branches. It might make more sense to do this
    // with something like Duff's device with a single computed branch.
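    // For example: for a byte copy (granularity == 1) the tests below peel
    // off an 8-, 4-, 2- and 1-byte tail by testing bits 3, 2, 1 and 0 of
    // count in turn; for a jint copy (granularity == 4) only the 8-byte and
    // 4-byte cases remain.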
1182 1183 __ tbz(count, 3 - exact_log2(granularity), Lword); 1184 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1185 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1186 __ bind(Lword); 1187 1188 if (granularity <= sizeof (jint)) { 1189 __ tbz(count, 2 - exact_log2(granularity), Lint); 1190 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1191 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1192 __ bind(Lint); 1193 } 1194 1195 if (granularity <= sizeof (jshort)) { 1196 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1197 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1198 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1199 __ bind(Lshort); 1200 } 1201 1202 if (granularity <= sizeof (jbyte)) { 1203 __ tbz(count, 0, Lbyte); 1204 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1205 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1206 __ bind(Lbyte); 1207 } 1208 } 1209 1210 Label copy_f, copy_b; 1211 Label copy_obj_f, copy_obj_b; 1212 Label copy_obj_uninit_f, copy_obj_uninit_b; 1213 1214 // All-singing all-dancing memory copy. 1215 // 1216 // Copy count units of memory from s to d. The size of a unit is 1217 // step, which can be positive or negative depending on the direction 1218 // of copy. If is_aligned is false, we align the source address. 1219 // 1220 1221 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1222 Register s, Register d, Register count, int step) { 1223 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1224 bool is_backwards = step < 0; 1225 unsigned int granularity = uabs(step); 1226 const Register t0 = r3, t1 = r4; 1227 1228 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1229 // load all the data before writing anything 1230 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1231 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1232 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1233 const Register send = r17, dend = r16; 1234 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1235 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1236 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1237 1238 if (PrefetchCopyIntervalInBytes > 0) 1239 __ prfm(Address(s, 0), PLDL1KEEP); 1240 __ cmp(count, u1((UseSIMDForMemoryOps ? 
                          96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely to happen when the granularity of the data
      // is less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The most performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
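      // For example, a 72-byte jbyte copy (count == 72) does not take the
      // copy96 branch below: it is done with the two 32-byte stores plus a
      // single 16-byte ldp/stp of the tail, rather than a third ldpq/stpq
      // pair.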
1296 if (granularity < sizeof (jint)) { 1297 Label copy96; 1298 __ cmp(count, u1(80/granularity)); 1299 __ br(Assembler::HI, copy96); 1300 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1301 1302 bs.copy_store_at_32(Address(d, 0), v0, v1); 1303 bs.copy_store_at_32(Address(d, 32), v2, v3); 1304 1305 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1306 __ b(finish); 1307 1308 __ bind(copy96); 1309 } 1310 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1311 1312 bs.copy_store_at_32(Address(d, 0), v0, v1); 1313 bs.copy_store_at_32(Address(d, 32), v2, v3); 1314 1315 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1316 } else { 1317 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1318 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1319 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1320 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1321 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1322 1323 bs.copy_store_at_16(Address(d, 0), t0, t1); 1324 bs.copy_store_at_16(Address(d, 16), t2, t3); 1325 bs.copy_store_at_16(Address(d, 32), t4, t5); 1326 bs.copy_store_at_16(Address(d, 48), t6, t7); 1327 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1328 } 1329 __ b(finish); 1330 1331 // 0..16 bytes 1332 __ bind(copy16); 1333 __ cmp(count, u1(8/granularity)); 1334 __ br(Assembler::LO, copy8); 1335 1336 // 8..16 bytes 1337 bs.copy_load_at_8(t0, Address(s, 0)); 1338 bs.copy_load_at_8(t1, Address(send, -8)); 1339 bs.copy_store_at_8(Address(d, 0), t0); 1340 bs.copy_store_at_8(Address(dend, -8), t1); 1341 __ b(finish); 1342 1343 if (granularity < 8) { 1344 // 4..7 bytes 1345 __ bind(copy8); 1346 __ tbz(count, 2 - exact_log2(granularity), copy4); 1347 __ ldrw(t0, Address(s, 0)); 1348 __ ldrw(t1, Address(send, -4)); 1349 __ strw(t0, Address(d, 0)); 1350 __ strw(t1, Address(dend, -4)); 1351 __ b(finish); 1352 if (granularity < 4) { 1353 // 0..3 bytes 1354 __ bind(copy4); 1355 __ cbz(count, finish); // get rid of 0 case 1356 if (granularity == 2) { 1357 __ ldrh(t0, Address(s, 0)); 1358 __ strh(t0, Address(d, 0)); 1359 } else { // granularity == 1 1360 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1361 // the first and last byte. 1362 // Handle the 3 byte case by loading and storing base + count/2 1363 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1364 // This does means in the 1 byte case we load/store the same 1365 // byte 3 times. 1366 __ lsr(count, count, 1); 1367 __ ldrb(t0, Address(s, 0)); 1368 __ ldrb(t1, Address(send, -1)); 1369 __ ldrb(t2, Address(s, count)); 1370 __ strb(t0, Address(d, 0)); 1371 __ strb(t1, Address(dend, -1)); 1372 __ strb(t2, Address(d, count)); 1373 } 1374 __ b(finish); 1375 } 1376 } 1377 1378 __ bind(copy_big); 1379 if (is_backwards) { 1380 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1381 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1382 } 1383 1384 // Now we've got the small case out of the way we can align the 1385 // source address on a 2-word boundary. 1386 1387 // Here we will materialize a count in r15, which is used by copy_memory_small 1388 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1389 // Up until here, we have used t9, which aliases r15, but from here on, that register 1390 // can not be used as a temp register, as it contains the count. 1391 1392 Label aligned; 1393 1394 if (is_aligned) { 1395 // We may have to adjust by 1 word to get s 2-word-aligned. 
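      // (e.g. a forward arrayof_jlong copy whose source starts at an address
      // that is 8 mod 16 copies a single word here, so the bulk loop below
      // sees a 16-byte-aligned s; a backward copy adjusts from the end of the
      // range in the same way.)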
1396 __ tbz(s, exact_log2(wordSize), aligned); 1397 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1398 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1399 __ sub(count, count, wordSize/granularity); 1400 } else { 1401 if (is_backwards) { 1402 __ andr(r15, s, 2 * wordSize - 1); 1403 } else { 1404 __ neg(r15, s); 1405 __ andr(r15, r15, 2 * wordSize - 1); 1406 } 1407 // r15 is the byte adjustment needed to align s. 1408 __ cbz(r15, aligned); 1409 int shift = exact_log2(granularity); 1410 if (shift > 0) { 1411 __ lsr(r15, r15, shift); 1412 } 1413 __ sub(count, count, r15); 1414 1415 #if 0 1416 // ?? This code is only correct for a disjoint copy. It may or 1417 // may not make sense to use it in that case. 1418 1419 // Copy the first pair; s and d may not be aligned. 1420 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1421 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1422 1423 // Align s and d, adjust count 1424 if (is_backwards) { 1425 __ sub(s, s, r15); 1426 __ sub(d, d, r15); 1427 } else { 1428 __ add(s, s, r15); 1429 __ add(d, d, r15); 1430 } 1431 #else 1432 copy_memory_small(decorators, type, s, d, r15, step); 1433 #endif 1434 } 1435 1436 __ bind(aligned); 1437 1438 // s is now 2-word-aligned. 1439 1440 // We have a count of units and some trailing bytes. Adjust the 1441 // count and do a bulk copy of words. If the shift is zero 1442 // perform a move instead to benefit from zero latency moves. 1443 int shift = exact_log2(wordSize/granularity); 1444 if (shift > 0) { 1445 __ lsr(r15, count, shift); 1446 } else { 1447 __ mov(r15, count); 1448 } 1449 if (direction == copy_forwards) { 1450 if (type != T_OBJECT) { 1451 __ bl(copy_f); 1452 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1453 __ bl(copy_obj_uninit_f); 1454 } else { 1455 __ bl(copy_obj_f); 1456 } 1457 } else { 1458 if (type != T_OBJECT) { 1459 __ bl(copy_b); 1460 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1461 __ bl(copy_obj_uninit_b); 1462 } else { 1463 __ bl(copy_obj_b); 1464 } 1465 } 1466 1467 // And the tail. 1468 copy_memory_small(decorators, type, s, d, count, step); 1469 1470 if (granularity >= 8) __ bind(copy8); 1471 if (granularity >= 4) __ bind(copy4); 1472 __ bind(finish); 1473 } 1474 1475 1476 void clobber_registers() { 1477 #ifdef ASSERT 1478 RegSet clobbered 1479 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1480 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1481 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1482 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1483 __ mov(*it, rscratch1); 1484 } 1485 #endif 1486 1487 } 1488 1489 // Scan over array at a for count oops, verifying each one. 1490 // Preserves a and count, clobbers rscratch1 and rscratch2. 
1491 void verify_oop_array (int size, Register a, Register count, Register temp) { 1492 Label loop, end; 1493 __ mov(rscratch1, a); 1494 __ mov(rscratch2, zr); 1495 __ bind(loop); 1496 __ cmp(rscratch2, count); 1497 __ br(Assembler::HS, end); 1498 if (size == wordSize) { 1499 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1500 __ verify_oop(temp); 1501 } else { 1502 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1503 __ decode_heap_oop(temp); // calls verify_oop 1504 } 1505 __ add(rscratch2, rscratch2, 1); 1506 __ b(loop); 1507 __ bind(end); 1508 } 1509 1510 // Arguments: 1511 // stub_id - is used to name the stub and identify all details of 1512 // how to perform the copy. 1513 // 1514 // entry - is assigned to the stub's post push entry point unless 1515 // it is null 1516 // 1517 // Inputs: 1518 // c_rarg0 - source array address 1519 // c_rarg1 - destination array address 1520 // c_rarg2 - element count, treated as ssize_t, can be zero 1521 // 1522 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1523 // the hardware handle it. The two dwords within qwords that span 1524 // cache line boundaries will still be loaded and stored atomically. 1525 // 1526 // Side Effects: entry is set to the (post push) entry point so it 1527 // can be used by the corresponding conjoint copy 1528 // method 1529 // 1530 address generate_disjoint_copy(StubGenStubId stub_id, address *entry) { 1531 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1532 RegSet saved_reg = RegSet::of(s, d, count); 1533 int size; 1534 bool aligned; 1535 bool is_oop; 1536 bool dest_uninitialized; 1537 switch (stub_id) { 1538 case jbyte_disjoint_arraycopy_id: 1539 size = sizeof(jbyte); 1540 aligned = false; 1541 is_oop = false; 1542 dest_uninitialized = false; 1543 break; 1544 case arrayof_jbyte_disjoint_arraycopy_id: 1545 size = sizeof(jbyte); 1546 aligned = true; 1547 is_oop = false; 1548 dest_uninitialized = false; 1549 break; 1550 case jshort_disjoint_arraycopy_id: 1551 size = sizeof(jshort); 1552 aligned = false; 1553 is_oop = false; 1554 dest_uninitialized = false; 1555 break; 1556 case arrayof_jshort_disjoint_arraycopy_id: 1557 size = sizeof(jshort); 1558 aligned = true; 1559 is_oop = false; 1560 dest_uninitialized = false; 1561 break; 1562 case jint_disjoint_arraycopy_id: 1563 size = sizeof(jint); 1564 aligned = false; 1565 is_oop = false; 1566 dest_uninitialized = false; 1567 break; 1568 case arrayof_jint_disjoint_arraycopy_id: 1569 size = sizeof(jint); 1570 aligned = true; 1571 is_oop = false; 1572 dest_uninitialized = false; 1573 break; 1574 case jlong_disjoint_arraycopy_id: 1575 // since this is always aligned we can (should!) use the same 1576 // stub as for case arrayof_jlong_disjoint_arraycopy 1577 ShouldNotReachHere(); 1578 break; 1579 case arrayof_jlong_disjoint_arraycopy_id: 1580 size = sizeof(jlong); 1581 aligned = true; 1582 is_oop = false; 1583 dest_uninitialized = false; 1584 break; 1585 case oop_disjoint_arraycopy_id: 1586 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1587 aligned = !UseCompressedOops; 1588 is_oop = true; 1589 dest_uninitialized = false; 1590 break; 1591 case arrayof_oop_disjoint_arraycopy_id: 1592 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1593 aligned = !UseCompressedOops; 1594 is_oop = true; 1595 dest_uninitialized = false; 1596 break; 1597 case oop_disjoint_arraycopy_uninit_id: 1598 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1599 aligned = !UseCompressedOops; 1600 is_oop = true; 1601 dest_uninitialized = true; 1602 break; 1603 case arrayof_oop_disjoint_arraycopy_uninit_id: 1604 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1605 aligned = !UseCompressedOops; 1606 is_oop = true; 1607 dest_uninitialized = true; 1608 break; 1609 default: 1610 ShouldNotReachHere(); 1611 break; 1612 } 1613 1614 __ align(CodeEntryAlignment); 1615 StubCodeMark mark(this, stub_id); 1616 address start = __ pc(); 1617 __ enter(); 1618 1619 if (entry != nullptr) { 1620 *entry = __ pc(); 1621 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1622 BLOCK_COMMENT("Entry:"); 1623 } 1624 1625 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1626 if (dest_uninitialized) { 1627 decorators |= IS_DEST_UNINITIALIZED; 1628 } 1629 if (aligned) { 1630 decorators |= ARRAYCOPY_ALIGNED; 1631 } 1632 1633 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1634 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1635 1636 if (is_oop) { 1637 // save regs before copy_memory 1638 __ push(RegSet::of(d, count), sp); 1639 } 1640 { 1641 // UnsafeMemoryAccess page error: continue after unsafe access 1642 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1643 UnsafeMemoryAccessMark umam(this, add_entry, true); 1644 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1645 } 1646 1647 if (is_oop) { 1648 __ pop(RegSet::of(d, count), sp); 1649 if (VerifyOops) 1650 verify_oop_array(size, d, count, r16); 1651 } 1652 1653 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1654 1655 __ leave(); 1656 __ mov(r0, zr); // return 0 1657 __ ret(lr); 1658 return start; 1659 } 1660 1661 // Arguments: 1662 // stub_id - is used to name the stub and identify all details of 1663 // how to perform the copy. 1664 // 1665 // nooverlap_target - identifes the (post push) entry for the 1666 // corresponding disjoint copy routine which can be 1667 // jumped to if the ranges do not actually overlap 1668 // 1669 // entry - is assigned to the stub's post push entry point unless 1670 // it is null 1671 // 1672 // 1673 // Inputs: 1674 // c_rarg0 - source array address 1675 // c_rarg1 - destination array address 1676 // c_rarg2 - element count, treated as ssize_t, can be zero 1677 // 1678 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1679 // the hardware handle it. The two dwords within qwords that span 1680 // cache line boundaries will still be loaded and stored atomically. 
1681 // 1682 // Side Effects: 1683 // entry is set to the no-overlap entry point so it can be used by 1684 // some other conjoint copy method 1685 // 1686 address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 1687 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1688 RegSet saved_regs = RegSet::of(s, d, count); 1689 int size; 1690 bool aligned; 1691 bool is_oop; 1692 bool dest_uninitialized; 1693 switch (stub_id) { 1694 case jbyte_arraycopy_id: 1695 size = sizeof(jbyte); 1696 aligned = false; 1697 is_oop = false; 1698 dest_uninitialized = false; 1699 break; 1700 case arrayof_jbyte_arraycopy_id: 1701 size = sizeof(jbyte); 1702 aligned = true; 1703 is_oop = false; 1704 dest_uninitialized = false; 1705 break; 1706 case jshort_arraycopy_id: 1707 size = sizeof(jshort); 1708 aligned = false; 1709 is_oop = false; 1710 dest_uninitialized = false; 1711 break; 1712 case arrayof_jshort_arraycopy_id: 1713 size = sizeof(jshort); 1714 aligned = true; 1715 is_oop = false; 1716 dest_uninitialized = false; 1717 break; 1718 case jint_arraycopy_id: 1719 size = sizeof(jint); 1720 aligned = false; 1721 is_oop = false; 1722 dest_uninitialized = false; 1723 break; 1724 case arrayof_jint_arraycopy_id: 1725 size = sizeof(jint); 1726 aligned = true; 1727 is_oop = false; 1728 dest_uninitialized = false; 1729 break; 1730 case jlong_arraycopy_id: 1731 // since this is always aligned we can (should!) use the same 1732 // stub as for case arrayof_jlong_disjoint_arraycopy 1733 ShouldNotReachHere(); 1734 break; 1735 case arrayof_jlong_arraycopy_id: 1736 size = sizeof(jlong); 1737 aligned = true; 1738 is_oop = false; 1739 dest_uninitialized = false; 1740 break; 1741 case oop_arraycopy_id: 1742 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1743 aligned = !UseCompressedOops; 1744 is_oop = true; 1745 dest_uninitialized = false; 1746 break; 1747 case arrayof_oop_arraycopy_id: 1748 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1749 aligned = !UseCompressedOops; 1750 is_oop = true; 1751 dest_uninitialized = false; 1752 break; 1753 case oop_arraycopy_uninit_id: 1754 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1755 aligned = !UseCompressedOops; 1756 is_oop = true; 1757 dest_uninitialized = true; 1758 break; 1759 case arrayof_oop_arraycopy_uninit_id: 1760 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1761 aligned = !UseCompressedOops; 1762 is_oop = true; 1763 dest_uninitialized = true; 1764 break; 1765 default: 1766 ShouldNotReachHere(); 1767 } 1768 1769 StubCodeMark mark(this, stub_id); 1770 address start = __ pc(); 1771 __ enter(); 1772 1773 if (entry != nullptr) { 1774 *entry = __ pc(); 1775 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1776 BLOCK_COMMENT("Entry:"); 1777 } 1778 1779 // use fwd copy when (d-s) above_equal (count*size) 1780 __ sub(rscratch1, d, s); 1781 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1782 __ br(Assembler::HS, nooverlap_target); 1783 1784 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1785 if (dest_uninitialized) { 1786 decorators |= IS_DEST_UNINITIALIZED; 1787 } 1788 if (aligned) { 1789 decorators |= ARRAYCOPY_ALIGNED; 1790 } 1791 1792 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1793 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1794 1795 if (is_oop) { 1796 // save regs before copy_memory 1797 __ push(RegSet::of(d, count), sp); 1798 } 1799 { 1800 // UnsafeMemoryAccess page error: continue after unsafe access 1801 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1802 UnsafeMemoryAccessMark umam(this, add_entry, true); 1803 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1804 } 1805 if (is_oop) { 1806 __ pop(RegSet::of(d, count), sp); 1807 if (VerifyOops) 1808 verify_oop_array(size, d, count, r16); 1809 } 1810 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1811 __ leave(); 1812 __ mov(r0, zr); // return 0 1813 __ ret(lr); 1814 return start; 1815 } 1816 1817 // Helper for generating a dynamic type check. 1818 // Smashes rscratch1, rscratch2. 1819 void generate_type_check(Register sub_klass, 1820 Register super_check_offset, 1821 Register super_klass, 1822 Register temp1, 1823 Register temp2, 1824 Register result, 1825 Label& L_success) { 1826 assert_different_registers(sub_klass, super_check_offset, super_klass); 1827 1828 BLOCK_COMMENT("type_check:"); 1829 1830 Label L_miss; 1831 1832 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1833 super_check_offset); 1834 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1835 1836 // Fall through on failure! 
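    // check_klass_subtype_fast_path covers the common cases inline (an
    // exact klass match, or a hit via the superclass display slot at
    // super_check_offset) and branches to L_success or L_miss when it
    // can decide on its own; otherwise check_klass_subtype_slow_path
    // scans the secondary supers, branching to L_success on a hit and
    // falling through to L_miss on failure.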
1837 __ BIND(L_miss); 1838 } 1839 1840 // 1841 // Generate checkcasting array copy stub 1842 // 1843 // Input: 1844 // c_rarg0 - source array address 1845 // c_rarg1 - destination array address 1846 // c_rarg2 - element count, treated as ssize_t, can be zero 1847 // c_rarg3 - size_t ckoff (super_check_offset) 1848 // c_rarg4 - oop ckval (super_klass) 1849 // 1850 // Output: 1851 // r0 == 0 - success 1852 // r0 == -1^K - failure, where K is partial transfer count 1853 // 1854 address generate_checkcast_copy(StubGenStubId stub_id, address *entry) { 1855 bool dest_uninitialized; 1856 switch (stub_id) { 1857 case checkcast_arraycopy_id: 1858 dest_uninitialized = false; 1859 break; 1860 case checkcast_arraycopy_uninit_id: 1861 dest_uninitialized = true; 1862 break; 1863 default: 1864 ShouldNotReachHere(); 1865 } 1866 1867 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1868 1869 // Input registers (after setup_arg_regs) 1870 const Register from = c_rarg0; // source array address 1871 const Register to = c_rarg1; // destination array address 1872 const Register count = c_rarg2; // elementscount 1873 const Register ckoff = c_rarg3; // super_check_offset 1874 const Register ckval = c_rarg4; // super_klass 1875 1876 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1877 RegSet wb_post_saved_regs = RegSet::of(count); 1878 1879 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1880 const Register copied_oop = r22; // actual oop copied 1881 const Register count_save = r21; // orig elementscount 1882 const Register start_to = r20; // destination array start address 1883 const Register r19_klass = r19; // oop._klass 1884 1885 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1886 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1887 1888 //--------------------------------------------------------------- 1889 // Assembler stub will be used for this call to arraycopy 1890 // if the two arrays are subtypes of Object[] but the 1891 // destination array type is not equal to or a supertype 1892 // of the source type. Each element must be separately 1893 // checked. 1894 1895 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1896 copied_oop, r19_klass, count_save); 1897 1898 __ align(CodeEntryAlignment); 1899 StubCodeMark mark(this, stub_id); 1900 address start = __ pc(); 1901 1902 __ enter(); // required for proper stackwalking of RuntimeStub frame 1903 1904 #ifdef ASSERT 1905 // caller guarantees that the arrays really are different 1906 // otherwise, we would have to make conjoint checks 1907 { Label L; 1908 __ b(L); // conjoint check not yet implemented 1909 __ stop("checkcast_copy within a single array"); 1910 __ bind(L); 1911 } 1912 #endif //ASSERT 1913 1914 // Caller of this entry point must set up the argument registers. 1915 if (entry != nullptr) { 1916 *entry = __ pc(); 1917 BLOCK_COMMENT("Entry:"); 1918 } 1919 1920 // Empty array: Nothing to do. 1921 __ cbz(count, L_done); 1922 __ push(RegSet::of(r19, r20, r21, r22), sp); 1923 1924 #ifdef ASSERT 1925 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1926 // The ckoff and ckval must be mutually consistent, 1927 // even though caller generates both. 
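    // (ckoff arrives as a raw super_check_offset and ckval as the
    // destination element klass; the check below simply reloads the
    // offset from ckval and compares the two values.)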
1928 { Label L; 1929 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1930 __ ldrw(start_to, Address(ckval, sco_offset)); 1931 __ cmpw(ckoff, start_to); 1932 __ br(Assembler::EQ, L); 1933 __ stop("super_check_offset inconsistent"); 1934 __ bind(L); 1935 } 1936 #endif //ASSERT 1937 1938 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1939 bool is_oop = true; 1940 int element_size = UseCompressedOops ? 4 : 8; 1941 if (dest_uninitialized) { 1942 decorators |= IS_DEST_UNINITIALIZED; 1943 } 1944 1945 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1946 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1947 1948 // save the original count 1949 __ mov(count_save, count); 1950 1951 // Copy from low to high addresses 1952 __ mov(start_to, to); // Save destination array start address 1953 __ b(L_load_element); 1954 1955 // ======== begin loop ======== 1956 // (Loop is rotated; its entry is L_load_element.) 1957 // Loop control: 1958 // for (; count != 0; count--) { 1959 // copied_oop = load_heap_oop(from++); 1960 // ... generate_type_check ...; 1961 // store_heap_oop(to++, copied_oop); 1962 // } 1963 __ align(OptoLoopAlignment); 1964 1965 __ BIND(L_store_element); 1966 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1967 __ post(to, element_size), copied_oop, noreg, 1968 gct1, gct2, gct3); 1969 __ sub(count, count, 1); 1970 __ cbz(count, L_do_card_marks); 1971 1972 // ======== loop entry is here ======== 1973 __ BIND(L_load_element); 1974 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1975 copied_oop, noreg, __ post(from, element_size), 1976 gct1); 1977 __ cbz(copied_oop, L_store_element); 1978 1979 __ load_klass(r19_klass, copied_oop);// query the object klass 1980 1981 BLOCK_COMMENT("type_check:"); 1982 generate_type_check(/*sub_klass*/r19_klass, 1983 /*super_check_offset*/ckoff, 1984 /*super_klass*/ckval, 1985 /*r_array_base*/gct1, 1986 /*temp2*/gct2, 1987 /*result*/r10, L_store_element); 1988 1989 // Fall through on failure! 1990 1991 // ======== end loop ======== 1992 1993 // It was a real error; we must depend on the caller to finish the job. 1994 // Register count = remaining oops, count_orig = total oops. 1995 // Emit GC store barriers for the oops we have copied and report 1996 // their number to the caller. 1997 1998 __ subs(count, count_save, count); // K = partially copied oop count 1999 __ eon(count, count, zr); // report (-1^K) to caller 2000 __ br(Assembler::EQ, L_done_pop); 2001 2002 __ BIND(L_do_card_marks); 2003 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2004 2005 __ bind(L_done_pop); 2006 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2007 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2008 2009 __ bind(L_done); 2010 __ mov(r0, count); 2011 __ leave(); 2012 __ ret(lr); 2013 2014 return start; 2015 } 2016 2017 // Perform range checks on the proposed arraycopy. 2018 // Kills temp, but nothing else. 2019 // Also, clean the sign bits of src_pos and dst_pos. 
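  //
  // A rough C model of the checks below (illustration only, not
  // emitted or compiled; length() stands for the 32-bit array length
  // field read via arrayOopDesc::length_offset_in_bytes()):
  //
  //   if ((uint32_t)(src_pos + length) > (uint32_t)src->length()) goto L_failed;
  //   if ((uint32_t)(dst_pos + length) > (uint32_t)dst->length()) goto L_failed;
  //   src_pos = (uint32_t)src_pos;   // movw zero-extends, clearing the high half
  //   dst_pos = (uint32_t)dst_pos;
  //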
2020 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2021 Register src_pos, // source position (c_rarg1) 2022 Register dst, // destination array oop (c_rarg2) 2023 Register dst_pos, // destination position (c_rarg3) 2024 Register length, 2025 Register temp, 2026 Label& L_failed) { 2027 BLOCK_COMMENT("arraycopy_range_checks:"); 2028 2029 assert_different_registers(rscratch1, temp); 2030 2031 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2032 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 2033 __ addw(temp, length, src_pos); 2034 __ cmpw(temp, rscratch1); 2035 __ br(Assembler::HI, L_failed); 2036 2037 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2038 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2039 __ addw(temp, length, dst_pos); 2040 __ cmpw(temp, rscratch1); 2041 __ br(Assembler::HI, L_failed); 2042 2043 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 2044 __ movw(src_pos, src_pos); 2045 __ movw(dst_pos, dst_pos); 2046 2047 BLOCK_COMMENT("arraycopy_range_checks done"); 2048 } 2049 2050 // These stubs get called from some dumb test routine. 2051 // I'll write them properly when they're called from 2052 // something that's actually doing something. 2053 static void fake_arraycopy_stub(address src, address dst, int count) { 2054 assert(count == 0, "huh?"); 2055 } 2056 2057 2058 // 2059 // Generate 'unsafe' array copy stub 2060 // Though just as safe as the other stubs, it takes an unscaled 2061 // size_t argument instead of an element count. 2062 // 2063 // Input: 2064 // c_rarg0 - source array address 2065 // c_rarg1 - destination array address 2066 // c_rarg2 - byte count, treated as ssize_t, can be zero 2067 // 2068 // Examines the alignment of the operands and dispatches 2069 // to a long, int, short, or byte copy loop.
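  //
  // A rough C model of the dispatch (illustration only, not emitted or
  // compiled; note that the byte count is scaled down to an element
  // count before tail-calling the chosen stub):
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if ((bits & 7) == 0) return long_copy (s, d, count >> 3);
  //   if ((bits & 3) == 0) return int_copy  (s, d, count >> 2);
  //   if ((bits & 1) == 0) return short_copy(s, d, count >> 1);
  //   return byte_copy(s, d, count);
  //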
2070 // 2071 address generate_unsafe_copy(address byte_copy_entry, 2072 address short_copy_entry, 2073 address int_copy_entry, 2074 address long_copy_entry) { 2075 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2076 2077 Label L_long_aligned, L_int_aligned, L_short_aligned; 2078 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2079 2080 __ align(CodeEntryAlignment); 2081 StubCodeMark mark(this, stub_id); 2082 address start = __ pc(); 2083 __ enter(); // required for proper stackwalking of RuntimeStub frame 2084 2085 // bump this on entry, not on exit: 2086 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2087 2088 __ orr(rscratch1, s, d); 2089 __ orr(rscratch1, rscratch1, count); 2090 2091 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2092 __ cbz(rscratch1, L_long_aligned); 2093 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2094 __ cbz(rscratch1, L_int_aligned); 2095 __ tbz(rscratch1, 0, L_short_aligned); 2096 __ b(RuntimeAddress(byte_copy_entry)); 2097 2098 __ BIND(L_short_aligned); 2099 __ lsr(count, count, LogBytesPerShort); // size => short_count 2100 __ b(RuntimeAddress(short_copy_entry)); 2101 __ BIND(L_int_aligned); 2102 __ lsr(count, count, LogBytesPerInt); // size => int_count 2103 __ b(RuntimeAddress(int_copy_entry)); 2104 __ BIND(L_long_aligned); 2105 __ lsr(count, count, LogBytesPerLong); // size => long_count 2106 __ b(RuntimeAddress(long_copy_entry)); 2107 2108 return start; 2109 } 2110 2111 // 2112 // Generate generic array copy stubs 2113 // 2114 // Input: 2115 // c_rarg0 - src oop 2116 // c_rarg1 - src_pos (32-bits) 2117 // c_rarg2 - dst oop 2118 // c_rarg3 - dst_pos (32-bits) 2119 // c_rarg4 - element count (32-bits) 2120 // 2121 // Output: 2122 // r0 == 0 - success 2123 // r0 == -1^K - failure, where K is partial transfer count 2124 // 2125 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2126 address int_copy_entry, address oop_copy_entry, 2127 address long_copy_entry, address checkcast_copy_entry) { 2128 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2129 2130 Label L_failed, L_objArray; 2131 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2132 2133 // Input registers 2134 const Register src = c_rarg0; // source array oop 2135 const Register src_pos = c_rarg1; // source position 2136 const Register dst = c_rarg2; // destination array oop 2137 const Register dst_pos = c_rarg3; // destination position 2138 const Register length = c_rarg4; 2139 2140 2141 // Registers used as temps 2142 const Register dst_klass = c_rarg5; 2143 2144 __ align(CodeEntryAlignment); 2145 2146 StubCodeMark mark(this, stub_id); 2147 2148 address start = __ pc(); 2149 2150 __ enter(); // required for proper stackwalking of RuntimeStub frame 2151 2152 // bump this on entry, not on exit: 2153 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2154 2155 //----------------------------------------------------------------------- 2156 // Assembler stub will be used for this call to arraycopy 2157 // if the following conditions are met: 2158 // 2159 // (1) src and dst must not be null. 2160 // (2) src_pos must not be negative. 2161 // (3) dst_pos must not be negative. 2162 // (4) length must not be negative. 2163 // (5) src klass and dst klass should be the same and not null. 2164 // (6) src and dst should be arrays. 2165 // (7) src_pos + length must not exceed length of src. 2166 // (8) dst_pos + length must not exceed length of dst. 
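    //
    // Roughly, in C (illustration only, not emitted or compiled; for
    // object arrays condition (5) is relaxed by dispatching to the
    // checkcast stub instead of failing):
    //
    //   if (src == NULL || dst == NULL)                   return -1;  // (1)
    //   if (src_pos < 0 || dst_pos < 0 || length < 0)     return -1;  // (2)(3)(4)
    //   if (src->klass() == NULL ||
    //       src->klass() != dst->klass())                 return -1;  // (5)
    //   if (!src->klass()->is_array_klass())              return -1;  // (6)
    //   if ((uint32_t)(src_pos + length) > src->length()) return -1;  // (7)
    //   if ((uint32_t)(dst_pos + length) > dst->length()) return -1;  // (8)
    //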
2167 // 2168 2169 // if (src == nullptr) return -1; 2170 __ cbz(src, L_failed); 2171 2172 // if (src_pos < 0) return -1; 2173 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2174 2175 // if (dst == nullptr) return -1; 2176 __ cbz(dst, L_failed); 2177 2178 // if (dst_pos < 0) return -1; 2179 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2180 2181 // registers used as temp 2182 const Register scratch_length = r16; // elements count to copy 2183 const Register scratch_src_klass = r17; // array klass 2184 const Register lh = r15; // layout helper 2185 2186 // if (length < 0) return -1; 2187 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2188 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2189 2190 __ load_klass(scratch_src_klass, src); 2191 #ifdef ASSERT 2192 // assert(src->klass() != nullptr); 2193 { 2194 BLOCK_COMMENT("assert klasses not null {"); 2195 Label L1, L2; 2196 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2197 __ bind(L1); 2198 __ stop("broken null klass"); 2199 __ bind(L2); 2200 __ load_klass(rscratch1, dst); 2201 __ cbz(rscratch1, L1); // this would be broken also 2202 BLOCK_COMMENT("} assert klasses not null done"); 2203 } 2204 #endif 2205 2206 // Load layout helper (32-bits) 2207 // 2208 // |array_tag| | header_size | element_type | |log2_element_size| 2209 // 32 30 24 16 8 2 0 2210 // 2211 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2212 // 2213 2214 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2215 2216 // Handle objArrays completely differently... 2217 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2218 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2219 __ movw(rscratch1, objArray_lh); 2220 __ eorw(rscratch2, lh, rscratch1); 2221 __ cbzw(rscratch2, L_objArray); 2222 2223 // if (src->klass() != dst->klass()) return -1; 2224 __ load_klass(rscratch2, dst); 2225 __ eor(rscratch2, rscratch2, scratch_src_klass); 2226 __ cbnz(rscratch2, L_failed); 2227 2228 // if (!src->is_Array()) return -1; 2229 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2230 2231 // At this point, it is known to be a typeArray (array_tag 0x3). 
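    //
    // For a typeArray the layout helper encodes everything the copy
    // needs. A rough C model of how the code below uses it
    // (illustration only, not emitted or compiled):
    //
    //   int header   = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   int l2elsize =  lh & Klass::_lh_log2_element_size_mask;   // 0..3
    //   from = (char*)src + header + ((size_t)src_pos << l2elsize);
    //   to   = (char*)dst + header + ((size_t)dst_pos << l2elsize);
    //
    // The bitwise binary search on the low two bits of the element
    // size then selects the byte, short, int or long copy loop.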
2232 #ifdef ASSERT 2233 { 2234 BLOCK_COMMENT("assert primitive array {"); 2235 Label L; 2236 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2237 __ cmpw(lh, rscratch2); 2238 __ br(Assembler::GE, L); 2239 __ stop("must be a primitive array"); 2240 __ bind(L); 2241 BLOCK_COMMENT("} assert primitive array done"); 2242 } 2243 #endif 2244 2245 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2246 rscratch2, L_failed); 2247 2248 // TypeArrayKlass 2249 // 2250 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2251 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2252 // 2253 2254 const Register rscratch1_offset = rscratch1; // array offset 2255 const Register r15_elsize = lh; // element size 2256 2257 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2258 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2259 __ add(src, src, rscratch1_offset); // src array offset 2260 __ add(dst, dst, rscratch1_offset); // dst array offset 2261 BLOCK_COMMENT("choose copy loop based on element size"); 2262 2263 // next registers should be set before the jump to corresponding stub 2264 const Register from = c_rarg0; // source array address 2265 const Register to = c_rarg1; // destination array address 2266 const Register count = c_rarg2; // elements count 2267 2268 // 'from', 'to', 'count' registers should be set in such order 2269 // since they are the same as 'src', 'src_pos', 'dst'. 2270 2271 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2272 2273 // The possible values of elsize are 0-3, i.e. exact_log2(element 2274 // size in bytes). We do a simple bitwise binary search. 2275 __ BIND(L_copy_bytes); 2276 __ tbnz(r15_elsize, 1, L_copy_ints); 2277 __ tbnz(r15_elsize, 0, L_copy_shorts); 2278 __ lea(from, Address(src, src_pos));// src_addr 2279 __ lea(to, Address(dst, dst_pos));// dst_addr 2280 __ movw(count, scratch_length); // length 2281 __ b(RuntimeAddress(byte_copy_entry)); 2282 2283 __ BIND(L_copy_shorts); 2284 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2285 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2286 __ movw(count, scratch_length); // length 2287 __ b(RuntimeAddress(short_copy_entry)); 2288 2289 __ BIND(L_copy_ints); 2290 __ tbnz(r15_elsize, 0, L_copy_longs); 2291 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2292 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2293 __ movw(count, scratch_length); // length 2294 __ b(RuntimeAddress(int_copy_entry)); 2295 2296 __ BIND(L_copy_longs); 2297 #ifdef ASSERT 2298 { 2299 BLOCK_COMMENT("assert long copy {"); 2300 Label L; 2301 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2302 __ cmpw(r15_elsize, LogBytesPerLong); 2303 __ br(Assembler::EQ, L); 2304 __ stop("must be long copy, but elsize is wrong"); 2305 __ bind(L); 2306 BLOCK_COMMENT("} assert long copy done"); 2307 } 2308 #endif 2309 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2310 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2311 __ movw(count, scratch_length); // length 2312 __ b(RuntimeAddress(long_copy_entry)); 2313 2314 // ObjArrayKlass 2315 __ BIND(L_objArray); 2316 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2317 2318 Label L_plain_copy, L_checkcast_copy; 2319 // test array classes for subtyping 2320 __ load_klass(r15, dst); 2321 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2322 __ br(Assembler::NE, L_checkcast_copy); 2323 2324 // Identically typed arrays can be copied without element-wise checks. 2325 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2326 rscratch2, L_failed); 2327 2328 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2329 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2330 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2331 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2332 __ movw(count, scratch_length); // length 2333 __ BIND(L_plain_copy); 2334 __ b(RuntimeAddress(oop_copy_entry)); 2335 2336 __ BIND(L_checkcast_copy); 2337 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2338 { 2339 // Before looking at dst.length, make sure dst is also an objArray. 2340 __ ldrw(rscratch1, Address(r15, lh_offset)); 2341 __ movw(rscratch2, objArray_lh); 2342 __ eorw(rscratch1, rscratch1, rscratch2); 2343 __ cbnzw(rscratch1, L_failed); 2344 2345 // It is safe to examine both src.length and dst.length. 2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2347 r15, L_failed); 2348 2349 __ load_klass(dst_klass, dst); // reload 2350 2351 // Marshal the base address arguments now, freeing registers. 2352 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2353 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2354 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2355 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2356 __ movw(count, length); // length (reloaded) 2357 Register sco_temp = c_rarg3; // this register is free now 2358 assert_different_registers(from, to, count, sco_temp, 2359 dst_klass, scratch_src_klass); 2360 // assert_clean_int(count, sco_temp); 2361 2362 // Generate the type check. 2363 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2364 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2365 2366 // Smashes rscratch1, rscratch2 2367 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2368 L_plain_copy); 2369 2370 // Fetch destination element klass from the ObjArrayKlass header. 2371 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2372 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2373 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2374 2375 // the checkcast_copy loop needs two extra arguments: 2376 assert(c_rarg3 == sco_temp, "#3 already in place"); 2377 // Set up arguments for checkcast_copy_entry. 2378 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2379 __ b(RuntimeAddress(checkcast_copy_entry)); 2380 } 2381 2382 __ BIND(L_failed); 2383 __ mov(r0, -1); 2384 __ leave(); // required for proper stackwalking of RuntimeStub frame 2385 __ ret(lr); 2386 2387 return start; 2388 } 2389 2390 // 2391 // Generate stub for array fill. If "aligned" is true, the 2392 // "to" address is assumed to be heapword aligned. 
2393 // 2394 // Arguments for generated stub: 2395 // to: c_rarg0 2396 // value: c_rarg1 2397 // count: c_rarg2 treated as signed 2398 // 2399 address generate_fill(StubGenStubId stub_id) { 2400 BasicType t; 2401 bool aligned; 2402 2403 switch (stub_id) { 2404 case jbyte_fill_id: 2405 t = T_BYTE; 2406 aligned = false; 2407 break; 2408 case jshort_fill_id: 2409 t = T_SHORT; 2410 aligned = false; 2411 break; 2412 case jint_fill_id: 2413 t = T_INT; 2414 aligned = false; 2415 break; 2416 case arrayof_jbyte_fill_id: 2417 t = T_BYTE; 2418 aligned = true; 2419 break; 2420 case arrayof_jshort_fill_id: 2421 t = T_SHORT; 2422 aligned = true; 2423 break; 2424 case arrayof_jint_fill_id: 2425 t = T_INT; 2426 aligned = true; 2427 break; 2428 default: 2429 ShouldNotReachHere(); 2430 }; 2431 2432 __ align(CodeEntryAlignment); 2433 StubCodeMark mark(this, stub_id); 2434 address start = __ pc(); 2435 2436 BLOCK_COMMENT("Entry:"); 2437 2438 const Register to = c_rarg0; // source array address 2439 const Register value = c_rarg1; // value 2440 const Register count = c_rarg2; // elements count 2441 2442 const Register bz_base = r10; // base for block_zero routine 2443 const Register cnt_words = r11; // temp register 2444 2445 __ enter(); 2446 2447 Label L_fill_elements, L_exit1; 2448 2449 int shift = -1; 2450 switch (t) { 2451 case T_BYTE: 2452 shift = 0; 2453 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2454 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2455 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2456 __ br(Assembler::LO, L_fill_elements); 2457 break; 2458 case T_SHORT: 2459 shift = 1; 2460 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2461 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2462 __ br(Assembler::LO, L_fill_elements); 2463 break; 2464 case T_INT: 2465 shift = 2; 2466 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2467 __ br(Assembler::LO, L_fill_elements); 2468 break; 2469 default: ShouldNotReachHere(); 2470 } 2471 2472 // Align source address at 8 bytes address boundary. 2473 Label L_skip_align1, L_skip_align2, L_skip_align4; 2474 if (!aligned) { 2475 switch (t) { 2476 case T_BYTE: 2477 // One byte misalignment happens only for byte arrays. 2478 __ tbz(to, 0, L_skip_align1); 2479 __ strb(value, Address(__ post(to, 1))); 2480 __ subw(count, count, 1); 2481 __ bind(L_skip_align1); 2482 // Fallthrough 2483 case T_SHORT: 2484 // Two bytes misalignment happens only for byte and short (char) arrays. 2485 __ tbz(to, 1, L_skip_align2); 2486 __ strh(value, Address(__ post(to, 2))); 2487 __ subw(count, count, 2 >> shift); 2488 __ bind(L_skip_align2); 2489 // Fallthrough 2490 case T_INT: 2491 // Align to 8 bytes, we know we are 4 byte aligned to start. 2492 __ tbz(to, 2, L_skip_align4); 2493 __ strw(value, Address(__ post(to, 4))); 2494 __ subw(count, count, 4 >> shift); 2495 __ bind(L_skip_align4); 2496 break; 2497 default: ShouldNotReachHere(); 2498 } 2499 } 2500 2501 // 2502 // Fill large chunks 2503 // 2504 __ lsrw(cnt_words, count, 3 - shift); // number of words 2505 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2506 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2507 if (UseBlockZeroing) { 2508 Label non_block_zeroing, rest; 2509 // If the fill value is zero we can use the fast zero_words(). 
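      // At this point 'value' has been widened with bfi so that the
      // element pattern fills a full 64-bit word (a byte fill of 0xAB
      // stores 0xABABABABABABABAB per word) and cnt_words holds the
      // number of 8-byte words to fill. zero_words() returns null when
      // the code cache is too full to emit it, which is fatal here.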
2510 __ cbnz(value, non_block_zeroing); 2511 __ mov(bz_base, to); 2512 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2513 address tpc = __ zero_words(bz_base, cnt_words); 2514 if (tpc == nullptr) { 2515 fatal("CodeCache is full at generate_fill"); 2516 } 2517 __ b(rest); 2518 __ bind(non_block_zeroing); 2519 __ fill_words(to, cnt_words, value); 2520 __ bind(rest); 2521 } else { 2522 __ fill_words(to, cnt_words, value); 2523 } 2524 2525 // Remaining count is less than 8 bytes. Fill it by a single store. 2526 // Note that the total length is no less than 8 bytes. 2527 if (t == T_BYTE || t == T_SHORT) { 2528 Label L_exit1; 2529 __ cbzw(count, L_exit1); 2530 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2531 __ str(value, Address(to, -8)); // overwrite some elements 2532 __ bind(L_exit1); 2533 __ leave(); 2534 __ ret(lr); 2535 } 2536 2537 // Handle copies less than 8 bytes. 2538 Label L_fill_2, L_fill_4, L_exit2; 2539 __ bind(L_fill_elements); 2540 switch (t) { 2541 case T_BYTE: 2542 __ tbz(count, 0, L_fill_2); 2543 __ strb(value, Address(__ post(to, 1))); 2544 __ bind(L_fill_2); 2545 __ tbz(count, 1, L_fill_4); 2546 __ strh(value, Address(__ post(to, 2))); 2547 __ bind(L_fill_4); 2548 __ tbz(count, 2, L_exit2); 2549 __ strw(value, Address(to)); 2550 break; 2551 case T_SHORT: 2552 __ tbz(count, 0, L_fill_4); 2553 __ strh(value, Address(__ post(to, 2))); 2554 __ bind(L_fill_4); 2555 __ tbz(count, 1, L_exit2); 2556 __ strw(value, Address(to)); 2557 break; 2558 case T_INT: 2559 __ cbzw(count, L_exit2); 2560 __ strw(value, Address(to)); 2561 break; 2562 default: ShouldNotReachHere(); 2563 } 2564 __ bind(L_exit2); 2565 __ leave(); 2566 __ ret(lr); 2567 return start; 2568 } 2569 2570 address generate_data_cache_writeback() { 2571 const Register line = c_rarg0; // address of line to write back 2572 2573 __ align(CodeEntryAlignment); 2574 2575 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id; 2576 StubCodeMark mark(this, stub_id); 2577 2578 address start = __ pc(); 2579 __ enter(); 2580 __ cache_wb(Address(line, 0)); 2581 __ leave(); 2582 __ ret(lr); 2583 2584 return start; 2585 } 2586 2587 address generate_data_cache_writeback_sync() { 2588 const Register is_pre = c_rarg0; // pre or post sync 2589 2590 __ align(CodeEntryAlignment); 2591 2592 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id; 2593 StubCodeMark mark(this, stub_id); 2594 2595 // pre wbsync is a no-op 2596 // post wbsync translates to an sfence 2597 2598 Label skip; 2599 address start = __ pc(); 2600 __ enter(); 2601 __ cbnz(is_pre, skip); 2602 __ cache_wbsync(false); 2603 __ bind(skip); 2604 __ leave(); 2605 __ ret(lr); 2606 2607 return start; 2608 } 2609 2610 void generate_arraycopy_stubs() { 2611 address entry; 2612 address entry_jbyte_arraycopy; 2613 address entry_jshort_arraycopy; 2614 address entry_jint_arraycopy; 2615 address entry_oop_arraycopy; 2616 address entry_jlong_arraycopy; 2617 address entry_checkcast_arraycopy; 2618 2619 generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2620 generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2621 2622 generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2623 generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2624 2625 generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 
2626 generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2627 2628 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2629 2630 //*** jbyte 2631 // Always need aligned and unaligned versions 2632 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry); 2633 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2634 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry); 2635 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr); 2636 2637 //*** jshort 2638 // Always need aligned and unaligned versions 2639 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry); 2640 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2641 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry); 2642 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr); 2643 2644 //*** jint 2645 // Aligned versions 2646 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry); 2647 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2648 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2649 // entry_jint_arraycopy always points to the unaligned version 2650 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry); 2651 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy); 2652 2653 //*** jlong 2654 // It is always aligned 2655 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry); 2656 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2657 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2658 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2659 2660 //*** oops 2661 { 2662 // With compressed oops we need unaligned versions; notice that 2663 // we overwrite entry_oop_arraycopy. 
2664 bool aligned = !UseCompressedOops; 2665 2666 StubRoutines::_arrayof_oop_disjoint_arraycopy 2667 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry); 2668 StubRoutines::_arrayof_oop_arraycopy 2669 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2670 // Aligned versions without pre-barriers 2671 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2672 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2673 StubRoutines::_arrayof_oop_arraycopy_uninit 2674 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2675 } 2676 2677 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2678 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2679 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2680 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2681 2682 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2683 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr); 2684 2685 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2686 entry_jshort_arraycopy, 2687 entry_jint_arraycopy, 2688 entry_jlong_arraycopy); 2689 2690 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2691 entry_jshort_arraycopy, 2692 entry_jint_arraycopy, 2693 entry_oop_arraycopy, 2694 entry_jlong_arraycopy, 2695 entry_checkcast_arraycopy); 2696 2697 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 2698 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 2699 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 2700 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 2701 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 2702 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 2703 } 2704 2705 void generate_math_stubs() { Unimplemented(); } 2706 2707 // Arguments: 2708 // 2709 // Inputs: 2710 // c_rarg0 - source byte array address 2711 // c_rarg1 - destination byte array address 2712 // c_rarg2 - K (key) in little endian int array 2713 // 2714 address generate_aescrypt_encryptBlock() { 2715 __ align(CodeEntryAlignment); 2716 StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2717 StubCodeMark mark(this, stub_id); 2718 2719 const Register from = c_rarg0; // source array address 2720 const Register to = c_rarg1; // destination array address 2721 const Register key = c_rarg2; // key array address 2722 const Register keylen = rscratch1; 2723 2724 address start = __ pc(); 2725 __ enter(); 2726 2727 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2728 2729 __ aesenc_loadkeys(key, keylen); 2730 __ aesecb_encrypt(from, to, keylen); 2731 2732 __ mov(r0, 0); 2733 2734 __ leave(); 2735 __ ret(lr); 2736 2737 return start; 2738 } 2739 2740 // Arguments: 2741 // 2742 // Inputs: 2743 // c_rarg0 - source byte array address 2744 // c_rarg1 - destination byte array address 2745 // c_rarg2 - K (key) in little endian int array 2746 // 2747 address generate_aescrypt_decryptBlock() { 2748 assert(UseAES, "need 
AES cryptographic extension support"); 2749 __ align(CodeEntryAlignment); 2750 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2751 StubCodeMark mark(this, stub_id); 2752 Label L_doLast; 2753 2754 const Register from = c_rarg0; // source array address 2755 const Register to = c_rarg1; // destination array address 2756 const Register key = c_rarg2; // key array address 2757 const Register keylen = rscratch1; 2758 2759 address start = __ pc(); 2760 __ enter(); // required for proper stackwalking of RuntimeStub frame 2761 2762 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2763 2764 __ aesecb_decrypt(from, to, key, keylen); 2765 2766 __ mov(r0, 0); 2767 2768 __ leave(); 2769 __ ret(lr); 2770 2771 return start; 2772 } 2773 2774 // Arguments: 2775 // 2776 // Inputs: 2777 // c_rarg0 - source byte array address 2778 // c_rarg1 - destination byte array address 2779 // c_rarg2 - K (key) in little endian int array 2780 // c_rarg3 - r vector byte array address 2781 // c_rarg4 - input length 2782 // 2783 // Output: 2784 // x0 - input length 2785 // 2786 address generate_cipherBlockChaining_encryptAESCrypt() { 2787 assert(UseAES, "need AES cryptographic extension support"); 2788 __ align(CodeEntryAlignment); 2789 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id; 2790 StubCodeMark mark(this, stub_id); 2791 2792 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2793 2794 const Register from = c_rarg0; // source array address 2795 const Register to = c_rarg1; // destination array address 2796 const Register key = c_rarg2; // key array address 2797 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2798 // and left with the results of the last encryption block 2799 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2800 const Register keylen = rscratch1; 2801 2802 address start = __ pc(); 2803 2804 __ enter(); 2805 2806 __ movw(rscratch2, len_reg); 2807 2808 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2809 2810 __ ld1(v0, __ T16B, rvec); 2811 2812 __ cmpw(keylen, 52); 2813 __ br(Assembler::CC, L_loadkeys_44); 2814 __ br(Assembler::EQ, L_loadkeys_52); 2815 2816 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2817 __ rev32(v17, __ T16B, v17); 2818 __ rev32(v18, __ T16B, v18); 2819 __ BIND(L_loadkeys_52); 2820 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2821 __ rev32(v19, __ T16B, v19); 2822 __ rev32(v20, __ T16B, v20); 2823 __ BIND(L_loadkeys_44); 2824 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2825 __ rev32(v21, __ T16B, v21); 2826 __ rev32(v22, __ T16B, v22); 2827 __ rev32(v23, __ T16B, v23); 2828 __ rev32(v24, __ T16B, v24); 2829 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2830 __ rev32(v25, __ T16B, v25); 2831 __ rev32(v26, __ T16B, v26); 2832 __ rev32(v27, __ T16B, v27); 2833 __ rev32(v28, __ T16B, v28); 2834 __ ld1(v29, v30, v31, __ T16B, key); 2835 __ rev32(v29, __ T16B, v29); 2836 __ rev32(v30, __ T16B, v30); 2837 __ rev32(v31, __ T16B, v31); 2838 2839 __ BIND(L_aes_loop); 2840 __ ld1(v1, __ T16B, __ post(from, 16)); 2841 __ eor(v0, __ T16B, v0, v1); 2842 2843 __ br(Assembler::CC, L_rounds_44); 2844 __ br(Assembler::EQ, L_rounds_52); 2845 2846 __ aese(v0, v17); __ aesmc(v0, v0); 2847 __ aese(v0, v18); __ aesmc(v0, v0); 2848 __ BIND(L_rounds_52); 2849 __ aese(v0, v19); __ aesmc(v0, v0); 2850 __ aese(v0, v20); 
__ aesmc(v0, v0); 2851 __ BIND(L_rounds_44); 2852 __ aese(v0, v21); __ aesmc(v0, v0); 2853 __ aese(v0, v22); __ aesmc(v0, v0); 2854 __ aese(v0, v23); __ aesmc(v0, v0); 2855 __ aese(v0, v24); __ aesmc(v0, v0); 2856 __ aese(v0, v25); __ aesmc(v0, v0); 2857 __ aese(v0, v26); __ aesmc(v0, v0); 2858 __ aese(v0, v27); __ aesmc(v0, v0); 2859 __ aese(v0, v28); __ aesmc(v0, v0); 2860 __ aese(v0, v29); __ aesmc(v0, v0); 2861 __ aese(v0, v30); 2862 __ eor(v0, __ T16B, v0, v31); 2863 2864 __ st1(v0, __ T16B, __ post(to, 16)); 2865 2866 __ subw(len_reg, len_reg, 16); 2867 __ cbnzw(len_reg, L_aes_loop); 2868 2869 __ st1(v0, __ T16B, rvec); 2870 2871 __ mov(r0, rscratch2); 2872 2873 __ leave(); 2874 __ ret(lr); 2875 2876 return start; 2877 } 2878 2879 // Arguments: 2880 // 2881 // Inputs: 2882 // c_rarg0 - source byte array address 2883 // c_rarg1 - destination byte array address 2884 // c_rarg2 - K (key) in little endian int array 2885 // c_rarg3 - r vector byte array address 2886 // c_rarg4 - input length 2887 // 2888 // Output: 2889 // r0 - input length 2890 // 2891 address generate_cipherBlockChaining_decryptAESCrypt() { 2892 assert(UseAES, "need AES cryptographic extension support"); 2893 __ align(CodeEntryAlignment); 2894 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id; 2895 StubCodeMark mark(this, stub_id); 2896 2897 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2898 2899 const Register from = c_rarg0; // source array address 2900 const Register to = c_rarg1; // destination array address 2901 const Register key = c_rarg2; // key array address 2902 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2903 // and left with the results of the last encryption block 2904 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2905 const Register keylen = rscratch1; 2906 2907 address start = __ pc(); 2908 2909 __ enter(); 2910 2911 __ movw(rscratch2, len_reg); 2912 2913 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2914 2915 __ ld1(v2, __ T16B, rvec); 2916 2917 __ ld1(v31, __ T16B, __ post(key, 16)); 2918 __ rev32(v31, __ T16B, v31); 2919 2920 __ cmpw(keylen, 52); 2921 __ br(Assembler::CC, L_loadkeys_44); 2922 __ br(Assembler::EQ, L_loadkeys_52); 2923 2924 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2925 __ rev32(v17, __ T16B, v17); 2926 __ rev32(v18, __ T16B, v18); 2927 __ BIND(L_loadkeys_52); 2928 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2929 __ rev32(v19, __ T16B, v19); 2930 __ rev32(v20, __ T16B, v20); 2931 __ BIND(L_loadkeys_44); 2932 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2933 __ rev32(v21, __ T16B, v21); 2934 __ rev32(v22, __ T16B, v22); 2935 __ rev32(v23, __ T16B, v23); 2936 __ rev32(v24, __ T16B, v24); 2937 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2938 __ rev32(v25, __ T16B, v25); 2939 __ rev32(v26, __ T16B, v26); 2940 __ rev32(v27, __ T16B, v27); 2941 __ rev32(v28, __ T16B, v28); 2942 __ ld1(v29, v30, __ T16B, key); 2943 __ rev32(v29, __ T16B, v29); 2944 __ rev32(v30, __ T16B, v30); 2945 2946 __ BIND(L_aes_loop); 2947 __ ld1(v0, __ T16B, __ post(from, 16)); 2948 __ orr(v1, __ T16B, v0, v0); 2949 2950 __ br(Assembler::CC, L_rounds_44); 2951 __ br(Assembler::EQ, L_rounds_52); 2952 2953 __ aesd(v0, v17); __ aesimc(v0, v0); 2954 __ aesd(v0, v18); __ aesimc(v0, v0); 2955 __ BIND(L_rounds_52); 2956 __ aesd(v0, v19); __ aesimc(v0, v0); 2957 __ aesd(v0, v20); __ aesimc(v0, v0); 2958 
__ BIND(L_rounds_44); 2959 __ aesd(v0, v21); __ aesimc(v0, v0); 2960 __ aesd(v0, v22); __ aesimc(v0, v0); 2961 __ aesd(v0, v23); __ aesimc(v0, v0); 2962 __ aesd(v0, v24); __ aesimc(v0, v0); 2963 __ aesd(v0, v25); __ aesimc(v0, v0); 2964 __ aesd(v0, v26); __ aesimc(v0, v0); 2965 __ aesd(v0, v27); __ aesimc(v0, v0); 2966 __ aesd(v0, v28); __ aesimc(v0, v0); 2967 __ aesd(v0, v29); __ aesimc(v0, v0); 2968 __ aesd(v0, v30); 2969 __ eor(v0, __ T16B, v0, v31); 2970 __ eor(v0, __ T16B, v0, v2); 2971 2972 __ st1(v0, __ T16B, __ post(to, 16)); 2973 __ orr(v2, __ T16B, v1, v1); 2974 2975 __ subw(len_reg, len_reg, 16); 2976 __ cbnzw(len_reg, L_aes_loop); 2977 2978 __ st1(v2, __ T16B, rvec); 2979 2980 __ mov(r0, rscratch2); 2981 2982 __ leave(); 2983 __ ret(lr); 2984 2985 return start; 2986 } 2987 2988 // Big-endian 128-bit + 64-bit -> 128-bit addition. 2989 // Inputs: 128-bits. in is preserved. 2990 // The least-significant 64-bit word is in the upper dword of each vector. 2991 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 2992 // Output: result 2993 void be_add_128_64(FloatRegister result, FloatRegister in, 2994 FloatRegister inc, FloatRegister tmp) { 2995 assert_different_registers(result, tmp, inc); 2996 2997 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 2998 // input 2999 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3000 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3001 // MSD == 0 (must be!) to LSD 3002 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3003 } 3004 3005 // CTR AES crypt. 3006 // Arguments: 3007 // 3008 // Inputs: 3009 // c_rarg0 - source byte array address 3010 // c_rarg1 - destination byte array address 3011 // c_rarg2 - K (key) in little endian int array 3012 // c_rarg3 - counter vector byte array address 3013 // c_rarg4 - input length 3014 // c_rarg5 - saved encryptedCounter start 3015 // c_rarg6 - saved used length 3016 // 3017 // Output: 3018 // r0 - input length 3019 // 3020 address generate_counterMode_AESCrypt() { 3021 const Register in = c_rarg0; 3022 const Register out = c_rarg1; 3023 const Register key = c_rarg2; 3024 const Register counter = c_rarg3; 3025 const Register saved_len = c_rarg4, len = r10; 3026 const Register saved_encrypted_ctr = c_rarg5; 3027 const Register used_ptr = c_rarg6, used = r12; 3028 3029 const Register offset = r7; 3030 const Register keylen = r11; 3031 3032 const unsigned char block_size = 16; 3033 const int bulk_width = 4; 3034 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3035 // performance with larger data sizes, but it also means that the 3036 // fast path isn't used until you have at least 8 blocks, and up 3037 // to 127 bytes of data will be executed on the slow path. For 3038 // that reason, and also so as not to blow away too much icache, 4 3039 // blocks seems like a sensible compromise. 
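    // A rough C model of the be_add_128_64 helper defined above
    // (illustration only, not emitted or compiled; the counter is kept
    // as two 64-bit dwords, most-significant first):
    //
    //   void be_add_128_64_ref(uint64_t *msd, uint64_t *lsd, uint64_t inc) {
    //     uint64_t lo = *lsd + inc;
    //     if (lo < inc) *msd += 1;   // carry out of the low dword
    //     *lsd = lo;
    //   }
    //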
3040 3041 // Algorithm: 3042 // 3043 // if (len == 0) { 3044 // goto DONE; 3045 // } 3046 // int result = len; 3047 // do { 3048 // if (used >= blockSize) { 3049 // if (len >= bulk_width * blockSize) { 3050 // CTR_large_block(); 3051 // if (len == 0) 3052 // goto DONE; 3053 // } 3054 // for (;;) { 3055 // 16ByteVector v0 = counter; 3056 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3057 // used = 0; 3058 // if (len < blockSize) 3059 // break; /* goto NEXT */ 3060 // 16ByteVector v1 = load16Bytes(in, offset); 3061 // v1 = v1 ^ encryptedCounter; 3062 // store16Bytes(out, offset); 3063 // used = blockSize; 3064 // offset += blockSize; 3065 // len -= blockSize; 3066 // if (len == 0) 3067 // goto DONE; 3068 // } 3069 // } 3070 // NEXT: 3071 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3072 // len--; 3073 // } while (len != 0); 3074 // DONE: 3075 // return result; 3076 // 3077 // CTR_large_block() 3078 // Wide bulk encryption of whole blocks. 3079 3080 __ align(CodeEntryAlignment); 3081 StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id; 3082 StubCodeMark mark(this, stub_id); 3083 const address start = __ pc(); 3084 __ enter(); 3085 3086 Label DONE, CTR_large_block, large_block_return; 3087 __ ldrw(used, Address(used_ptr)); 3088 __ cbzw(saved_len, DONE); 3089 3090 __ mov(len, saved_len); 3091 __ mov(offset, 0); 3092 3093 // Compute #rounds for AES based on the length of the key array 3094 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3095 3096 __ aesenc_loadkeys(key, keylen); 3097 3098 { 3099 Label L_CTR_loop, NEXT; 3100 3101 __ bind(L_CTR_loop); 3102 3103 __ cmp(used, block_size); 3104 __ br(__ LO, NEXT); 3105 3106 // Maybe we have a lot of data 3107 __ subsw(rscratch1, len, bulk_width * block_size); 3108 __ br(__ HS, CTR_large_block); 3109 __ BIND(large_block_return); 3110 __ cbzw(len, DONE); 3111 3112 // Setup the counter 3113 __ movi(v4, __ T4S, 0); 3114 __ movi(v5, __ T4S, 1); 3115 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3116 3117 // 128-bit big-endian increment 3118 __ ld1(v0, __ T16B, counter); 3119 __ rev64(v16, __ T16B, v0); 3120 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3121 __ rev64(v16, __ T16B, v16); 3122 __ st1(v16, __ T16B, counter); 3123 // Previous counter value is in v0 3124 // v4 contains { 0, 1 } 3125 3126 { 3127 // We have fewer than bulk_width blocks of data left. Encrypt 3128 // them one by one until there is less than a full block 3129 // remaining, being careful to save both the encrypted counter 3130 // and the counter. 3131 3132 Label inner_loop; 3133 __ bind(inner_loop); 3134 // Counter to encrypt is in v0 3135 __ aesecb_encrypt(noreg, noreg, keylen); 3136 __ st1(v0, __ T16B, saved_encrypted_ctr); 3137 3138 // Do we have a remaining full block? 
3139 3140 __ mov(used, 0); 3141 __ cmp(len, block_size); 3142 __ br(__ LO, NEXT); 3143 3144 // Yes, we have a full block 3145 __ ldrq(v1, Address(in, offset)); 3146 __ eor(v1, __ T16B, v1, v0); 3147 __ strq(v1, Address(out, offset)); 3148 __ mov(used, block_size); 3149 __ add(offset, offset, block_size); 3150 3151 __ subw(len, len, block_size); 3152 __ cbzw(len, DONE); 3153 3154 // Increment the counter, store it back 3155 __ orr(v0, __ T16B, v16, v16); 3156 __ rev64(v16, __ T16B, v16); 3157 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3158 __ rev64(v16, __ T16B, v16); 3159 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3160 3161 __ b(inner_loop); 3162 } 3163 3164 __ BIND(NEXT); 3165 3166 // Encrypt a single byte, and loop. 3167 // We expect this to be a rare event. 3168 __ ldrb(rscratch1, Address(in, offset)); 3169 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3170 __ eor(rscratch1, rscratch1, rscratch2); 3171 __ strb(rscratch1, Address(out, offset)); 3172 __ add(offset, offset, 1); 3173 __ add(used, used, 1); 3174 __ subw(len, len,1); 3175 __ cbnzw(len, L_CTR_loop); 3176 } 3177 3178 __ bind(DONE); 3179 __ strw(used, Address(used_ptr)); 3180 __ mov(r0, saved_len); 3181 3182 __ leave(); // required for proper stackwalking of RuntimeStub frame 3183 __ ret(lr); 3184 3185 // Bulk encryption 3186 3187 __ BIND (CTR_large_block); 3188 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3189 3190 if (bulk_width == 8) { 3191 __ sub(sp, sp, 4 * 16); 3192 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3193 } 3194 __ sub(sp, sp, 4 * 16); 3195 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3196 RegSet saved_regs = (RegSet::of(in, out, offset) 3197 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3198 __ push(saved_regs, sp); 3199 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3200 __ add(in, in, offset); 3201 __ add(out, out, offset); 3202 3203 // Keys should already be loaded into the correct registers 3204 3205 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3206 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3207 3208 // AES/CTR loop 3209 { 3210 Label L_CTR_loop; 3211 __ BIND(L_CTR_loop); 3212 3213 // Setup the counters 3214 __ movi(v8, __ T4S, 0); 3215 __ movi(v9, __ T4S, 1); 3216 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3217 3218 for (int i = 0; i < bulk_width; i++) { 3219 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3220 __ rev64(v0_ofs, __ T16B, v16); 3221 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3222 } 3223 3224 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3225 3226 // Encrypt the counters 3227 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3228 3229 if (bulk_width == 8) { 3230 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3231 } 3232 3233 // XOR the encrypted counters with the inputs 3234 for (int i = 0; i < bulk_width; i++) { 3235 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3236 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3237 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3238 } 3239 3240 // Write the encrypted data 3241 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3242 if (bulk_width == 8) { 3243 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3244 } 3245 3246 __ subw(len, len, 16 * bulk_width); 3247 __ cbnzw(len, L_CTR_loop); 3248 } 3249 3250 // Save the counter back where it goes 3251 __ rev64(v16, __ T16B, v16); 3252 __ st1(v16, __ T16B, counter); 3253 3254 __ pop(saved_regs, sp); 
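    // Restore the SIMD registers spilled on entry to CTR_large_block:
    // the low 64 bits of v8-v15 are callee-saved under AAPCS64, and
    // the bulk loop above uses v8 and up as scratch.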
3255 3256 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3257 if (bulk_width == 8) { 3258 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3259 } 3260 3261 __ andr(rscratch1, len, -16 * bulk_width); 3262 __ sub(len, len, rscratch1); 3263 __ add(offset, offset, rscratch1); 3264 __ mov(used, 16); 3265 __ strw(used, Address(used_ptr)); 3266 __ b(large_block_return); 3267 3268 return start; 3269 } 3270 3271 // Vector AES Galois Counter Mode implementation. Parameters: 3272 // 3273 // in = c_rarg0 3274 // len = c_rarg1 3275 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3276 // out = c_rarg3 3277 // key = c_rarg4 3278 // state = c_rarg5 - GHASH.state 3279 // subkeyHtbl = c_rarg6 - powers of H 3280 // counter = c_rarg7 - 16 bytes of CTR 3281 // return - number of processed bytes 3282 address generate_galoisCounterMode_AESCrypt() { 3283 address ghash_polynomial = __ pc(); 3284 __ emit_int64(0x87); // The low-order bits of the field 3285 // polynomial (i.e. p = z^7+z^2+z+1) 3286 // repeated in the low and high parts of a 3287 // 128-bit vector 3288 __ emit_int64(0x87); 3289 3290 __ align(CodeEntryAlignment); 3291 StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id; 3292 StubCodeMark mark(this, stub_id); 3293 address start = __ pc(); 3294 __ enter(); 3295 3296 const Register in = c_rarg0; 3297 const Register len = c_rarg1; 3298 const Register ct = c_rarg2; 3299 const Register out = c_rarg3; 3300 // and updated with the incremented counter in the end 3301 3302 const Register key = c_rarg4; 3303 const Register state = c_rarg5; 3304 3305 const Register subkeyHtbl = c_rarg6; 3306 3307 const Register counter = c_rarg7; 3308 3309 const Register keylen = r10; 3310 // Save state before entering routine 3311 __ sub(sp, sp, 4 * 16); 3312 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3313 __ sub(sp, sp, 4 * 16); 3314 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3315 3316 // __ andr(len, len, -512); 3317 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3318 __ str(len, __ pre(sp, -2 * wordSize)); 3319 3320 Label DONE; 3321 __ cbz(len, DONE); 3322 3323 // Compute #rounds for AES based on the length of the key array 3324 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3325 3326 __ aesenc_loadkeys(key, keylen); 3327 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3328 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3329 3330 // AES/CTR loop 3331 { 3332 Label L_CTR_loop; 3333 __ BIND(L_CTR_loop); 3334 3335 // Setup the counters 3336 __ movi(v8, __ T4S, 0); 3337 __ movi(v9, __ T4S, 1); 3338 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3339 3340 assert(v0->encoding() < v8->encoding(), ""); 3341 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3342 FloatRegister f = as_FloatRegister(i); 3343 __ rev32(f, __ T16B, v16); 3344 __ addv(v16, __ T4S, v16, v8); 3345 } 3346 3347 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3348 3349 // Encrypt the counters 3350 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3351 3352 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3353 3354 // XOR the encrypted counters with the inputs 3355 for (int i = 0; i < 8; i++) { 3356 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3357 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3358 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3359 } 3360 __ st1(v0, v1, v2, v3, __ T16B, __ 
post(out, 4 * 16)); 3361 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3362 3363 __ subw(len, len, 16 * 8); 3364 __ cbnzw(len, L_CTR_loop); 3365 } 3366 3367 __ rev32(v16, __ T16B, v16); 3368 __ st1(v16, __ T16B, counter); 3369 3370 __ ldr(len, Address(sp)); 3371 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3372 3373 // GHASH/CTR loop 3374 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3375 len, /*unrolls*/4); 3376 3377 #ifdef ASSERT 3378 { Label L; 3379 __ cmp(len, (unsigned char)0); 3380 __ br(Assembler::EQ, L); 3381 __ stop("stubGenerator: abort"); 3382 __ bind(L); 3383 } 3384 #endif 3385 3386 __ bind(DONE); 3387 // Return the number of bytes processed 3388 __ ldr(r0, __ post(sp, 2 * wordSize)); 3389 3390 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3391 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3392 3393 __ leave(); // required for proper stackwalking of RuntimeStub frame 3394 __ ret(lr); 3395 return start; 3396 } 3397 3398 class Cached64Bytes { 3399 private: 3400 MacroAssembler *_masm; 3401 Register _regs[8]; 3402 3403 public: 3404 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3405 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3406 auto it = rs.begin(); 3407 for (auto &r: _regs) { 3408 r = *it; 3409 ++it; 3410 } 3411 } 3412 3413 void gen_loads(Register base) { 3414 for (int i = 0; i < 8; i += 2) { 3415 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3416 } 3417 } 3418 3419 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3420 void extract_u32(Register dest, int i) { 3421 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3422 } 3423 }; 3424 3425 // Utility routines for md5. 3426 // Clobbers r10 and r11. 
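  // For reference only: the helpers below implement the four MD5 round
  // operations from RFC 1321. With f being one of
  //   F(x, y, z) = (x & y) | (~x & z)
  //   G(x, y, z) = (x & z) | (y & ~z)
  //   H(x, y, z) = x ^ y ^ z
  //   I(x, y, z) = y ^ (x | ~z)
  // each step computes, for message word X[k], constant t and shift s,
  //   r1 = r2 + rotl32(r1 + f(r2, r3, r4) + X[k] + t, s)
  // The boolean functions are evaluated in rearranged but equivalent forms,
  // e.g. F(x, y, z) == ((y ^ z) & x) ^ z.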
3427 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3428 int k, int s, int t) { 3429 Register rscratch3 = r10; 3430 Register rscratch4 = r11; 3431 3432 __ eorw(rscratch3, r3, r4); 3433 __ movw(rscratch2, t); 3434 __ andw(rscratch3, rscratch3, r2); 3435 __ addw(rscratch4, r1, rscratch2); 3436 reg_cache.extract_u32(rscratch1, k); 3437 __ eorw(rscratch3, rscratch3, r4); 3438 __ addw(rscratch4, rscratch4, rscratch1); 3439 __ addw(rscratch3, rscratch3, rscratch4); 3440 __ rorw(rscratch2, rscratch3, 32 - s); 3441 __ addw(r1, rscratch2, r2); 3442 } 3443 3444 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3445 int k, int s, int t) { 3446 Register rscratch3 = r10; 3447 Register rscratch4 = r11; 3448 3449 reg_cache.extract_u32(rscratch1, k); 3450 __ movw(rscratch2, t); 3451 __ addw(rscratch4, r1, rscratch2); 3452 __ addw(rscratch4, rscratch4, rscratch1); 3453 __ bicw(rscratch2, r3, r4); 3454 __ andw(rscratch3, r2, r4); 3455 __ addw(rscratch2, rscratch2, rscratch4); 3456 __ addw(rscratch2, rscratch2, rscratch3); 3457 __ rorw(rscratch2, rscratch2, 32 - s); 3458 __ addw(r1, rscratch2, r2); 3459 } 3460 3461 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3462 int k, int s, int t) { 3463 Register rscratch3 = r10; 3464 Register rscratch4 = r11; 3465 3466 __ eorw(rscratch3, r3, r4); 3467 __ movw(rscratch2, t); 3468 __ addw(rscratch4, r1, rscratch2); 3469 reg_cache.extract_u32(rscratch1, k); 3470 __ eorw(rscratch3, rscratch3, r2); 3471 __ addw(rscratch4, rscratch4, rscratch1); 3472 __ addw(rscratch3, rscratch3, rscratch4); 3473 __ rorw(rscratch2, rscratch3, 32 - s); 3474 __ addw(r1, rscratch2, r2); 3475 } 3476 3477 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3478 int k, int s, int t) { 3479 Register rscratch3 = r10; 3480 Register rscratch4 = r11; 3481 3482 __ movw(rscratch3, t); 3483 __ ornw(rscratch2, r2, r4); 3484 __ addw(rscratch4, r1, rscratch3); 3485 reg_cache.extract_u32(rscratch1, k); 3486 __ eorw(rscratch3, rscratch2, r3); 3487 __ addw(rscratch4, rscratch4, rscratch1); 3488 __ addw(rscratch3, rscratch3, rscratch4); 3489 __ rorw(rscratch2, rscratch3, 32 - s); 3490 __ addw(r1, rscratch2, r2); 3491 } 3492 3493 // Arguments: 3494 // 3495 // Inputs: 3496 // c_rarg0 - byte[] source+offset 3497 // c_rarg1 - int[] SHA.state 3498 // c_rarg2 - int offset 3499 // c_rarg3 - int limit 3500 // 3501 address generate_md5_implCompress(StubGenStubId stub_id) { 3502 bool multi_block; 3503 switch (stub_id) { 3504 case md5_implCompress_id: 3505 multi_block = false; 3506 break; 3507 case md5_implCompressMB_id: 3508 multi_block = true; 3509 break; 3510 default: 3511 ShouldNotReachHere(); 3512 } 3513 __ align(CodeEntryAlignment); 3514 3515 StubCodeMark mark(this, stub_id); 3516 address start = __ pc(); 3517 3518 Register buf = c_rarg0; 3519 Register state = c_rarg1; 3520 Register ofs = c_rarg2; 3521 Register limit = c_rarg3; 3522 Register a = r4; 3523 Register b = r5; 3524 Register c = r6; 3525 Register d = r7; 3526 Register rscratch3 = r10; 3527 Register rscratch4 = r11; 3528 3529 Register state_regs[2] = { r12, r13 }; 3530 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3531 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3532 3533 __ push(saved_regs, sp); 3534 3535 __ ldp(state_regs[0], state_regs[1], Address(state)); 3536 __ ubfx(a, state_regs[0], 0, 32); 3537 __ ubfx(b, state_regs[0], 32, 32); 3538 __ 
ubfx(c, state_regs[1], 0, 32); 3539 __ ubfx(d, state_regs[1], 32, 32); 3540 3541 Label md5_loop; 3542 __ BIND(md5_loop); 3543 3544 reg_cache.gen_loads(buf); 3545 3546 // Round 1 3547 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3548 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3549 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3550 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3551 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3552 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3553 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3554 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3555 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3556 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3557 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3558 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3559 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3560 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3561 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3562 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3563 3564 // Round 2 3565 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3566 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3567 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3568 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3569 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3570 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3571 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3572 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3573 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3574 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3575 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3576 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3577 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3578 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3579 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3580 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3581 3582 // Round 3 3583 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3584 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3585 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3586 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3587 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3588 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3589 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3590 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3591 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3592 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3593 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3594 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3595 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3596 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3597 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3598 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3599 3600 // Round 4 3601 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3602 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3603 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3604 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3605 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3606 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3607 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3608 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3609 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3610 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3611 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3612 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3613 
md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3614 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3615 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3616 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3617 3618 __ addw(a, state_regs[0], a); 3619 __ ubfx(rscratch2, state_regs[0], 32, 32); 3620 __ addw(b, rscratch2, b); 3621 __ addw(c, state_regs[1], c); 3622 __ ubfx(rscratch4, state_regs[1], 32, 32); 3623 __ addw(d, rscratch4, d); 3624 3625 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3626 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3627 3628 if (multi_block) { 3629 __ add(buf, buf, 64); 3630 __ add(ofs, ofs, 64); 3631 __ cmp(ofs, limit); 3632 __ br(Assembler::LE, md5_loop); 3633 __ mov(c_rarg0, ofs); // return ofs 3634 } 3635 3636 // write hash values back in the correct order 3637 __ stp(state_regs[0], state_regs[1], Address(state)); 3638 3639 __ pop(saved_regs, sp); 3640 3641 __ ret(lr); 3642 3643 return start; 3644 } 3645 3646 // Arguments: 3647 // 3648 // Inputs: 3649 // c_rarg0 - byte[] source+offset 3650 // c_rarg1 - int[] SHA.state 3651 // c_rarg2 - int offset 3652 // c_rarg3 - int limit 3653 // 3654 address generate_sha1_implCompress(StubGenStubId stub_id) { 3655 bool multi_block; 3656 switch (stub_id) { 3657 case sha1_implCompress_id: 3658 multi_block = false; 3659 break; 3660 case sha1_implCompressMB_id: 3661 multi_block = true; 3662 break; 3663 default: 3664 ShouldNotReachHere(); 3665 } 3666 3667 __ align(CodeEntryAlignment); 3668 3669 StubCodeMark mark(this, stub_id); 3670 address start = __ pc(); 3671 3672 Register buf = c_rarg0; 3673 Register state = c_rarg1; 3674 Register ofs = c_rarg2; 3675 Register limit = c_rarg3; 3676 3677 Label keys; 3678 Label sha1_loop; 3679 3680 // load the keys into v0..v3 3681 __ adr(rscratch1, keys); 3682 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3683 // load 5 words state into v6, v7 3684 __ ldrq(v6, Address(state, 0)); 3685 __ ldrs(v7, Address(state, 16)); 3686 3687 3688 __ BIND(sha1_loop); 3689 // load 64 bytes of data into v16..v19 3690 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3691 __ rev32(v16, __ T16B, v16); 3692 __ rev32(v17, __ T16B, v17); 3693 __ rev32(v18, __ T16B, v18); 3694 __ rev32(v19, __ T16B, v19); 3695 3696 // do the sha1 3697 __ addv(v4, __ T4S, v16, v0); 3698 __ orr(v20, __ T16B, v6, v6); 3699 3700 FloatRegister d0 = v16; 3701 FloatRegister d1 = v17; 3702 FloatRegister d2 = v18; 3703 FloatRegister d3 = v19; 3704 3705 for (int round = 0; round < 20; round++) { 3706 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3707 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3708 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3709 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3710 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3711 3712 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3713 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3714 __ sha1h(tmp2, __ T4S, v20); 3715 if (round < 5) 3716 __ sha1c(v20, __ T4S, tmp3, tmp4); 3717 else if (round < 10 || round >= 15) 3718 __ sha1p(v20, __ T4S, tmp3, tmp4); 3719 else 3720 __ sha1m(v20, __ T4S, tmp3, tmp4); 3721 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3722 3723 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3724 } 3725 3726 __ addv(v7, __ T2S, v7, v21); 3727 __ addv(v6, __ T4S, v6, v20); 3728 3729 if (multi_block) { 3730 __ add(ofs, ofs, 64); 3731 __ cmp(ofs, limit); 3732 __ br(Assembler::LE, sha1_loop); 3733 __ mov(c_rarg0, ofs); // return ofs 3734 } 3735 3736 __ strq(v6, Address(state, 0)); 3737 __ strs(v7, Address(state, 16)); 3738 3739 __ ret(lr); 3740 3741 __ bind(keys); 3742 __ emit_int32(0x5a827999); 3743 __ emit_int32(0x6ed9eba1); 3744 __ emit_int32(0x8f1bbcdc); 3745 __ emit_int32(0xca62c1d6); 3746 3747 return start; 3748 } 3749 3750 3751 // Arguments: 3752 // 3753 // Inputs: 3754 // c_rarg0 - byte[] source+offset 3755 // c_rarg1 - int[] SHA.state 3756 // c_rarg2 - int offset 3757 // c_rarg3 - int limit 3758 // 3759 address generate_sha256_implCompress(StubGenStubId stub_id) { 3760 bool multi_block; 3761 switch (stub_id) { 3762 case sha256_implCompress_id: 3763 multi_block = false; 3764 break; 3765 case sha256_implCompressMB_id: 3766 multi_block = true; 3767 break; 3768 default: 3769 ShouldNotReachHere(); 3770 } 3771 3772 static const uint32_t round_consts[64] = { 3773 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3774 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3775 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3776 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3777 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3778 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3779 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3780 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3781 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3782 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3783 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3784 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3785 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3786 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3787 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3788 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3789 }; 3790 3791 __ align(CodeEntryAlignment); 3792 3793 StubCodeMark mark(this, stub_id); 3794 address start = __ pc(); 3795 3796 Register buf = c_rarg0; 3797 Register state = c_rarg1; 3798 Register ofs = c_rarg2; 3799 Register limit = c_rarg3; 3800 3801 Label sha1_loop; 3802 3803 __ stpd(v8, v9, __ pre(sp, -32)); 3804 __ stpd(v10, v11, Address(sp, 16)); 3805 3806 // dga == v0 3807 // dgb == v1 3808 // dg0 == v2 3809 // dg1 == v3 3810 // dg2 == v4 3811 // t0 == v6 3812 // t1 == v7 3813 3814 // load 16 keys to v16..v31 3815 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3816 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3817 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3818 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3819 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3820 3821 // load 8 words (256 bits) state 3822 __ ldpq(v0, v1, state); 3823 3824 __ BIND(sha1_loop); 3825 // load 64 bytes of data into v8..v11 3826 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3827 __ rev32(v8, __ T16B, v8); 3828 __ rev32(v9, __ T16B, v9); 3829 __ rev32(v10, __ T16B, v10); 3830 __ rev32(v11, __ T16B, v11); 3831 3832 __ addv(v6, __ T4S, v8, v16); 3833 __ orr(v2, __ T16B, v0, v0); 3834 __ orr(v3, __ T16B, v1, v1); 3835 3836 FloatRegister d0 = v8; 3837 FloatRegister d1 = v9; 3838 FloatRegister d2 = v10; 3839 FloatRegister d3 = v11; 3840 3841 3842 for (int round = 0; round < 16; round++) { 3843 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3844 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3845 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3846 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3847 3848 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3849 __ orr(v4, __ T16B, v2, v2); 3850 if (round < 15) 3851 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3852 __ sha256h(v2, __ T4S, v3, tmp2); 3853 __ sha256h2(v3, __ T4S, v4, tmp2); 3854 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3855 3856 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3857 } 3858 3859 __ addv(v0, __ T4S, v0, v2); 3860 __ addv(v1, __ T4S, v1, v3); 3861 3862 if (multi_block) { 3863 __ add(ofs, ofs, 64); 3864 __ cmp(ofs, limit); 3865 __ br(Assembler::LE, sha1_loop); 3866 __ mov(c_rarg0, ofs); // return ofs 3867 } 3868 3869 __ ldpd(v10, v11, Address(sp, 16)); 3870 __ ldpd(v8, v9, __ post(sp, 32)); 3871 3872 __ stpq(v0, v1, state); 3873 3874 __ ret(lr); 3875 3876 return start; 3877 } 3878 3879 // Double rounds for sha512. 3880 void sha512_dround(int dr, 3881 FloatRegister vi0, FloatRegister vi1, 3882 FloatRegister vi2, FloatRegister vi3, 3883 FloatRegister vi4, FloatRegister vrc0, 3884 FloatRegister vrc1, FloatRegister vin0, 3885 FloatRegister vin1, FloatRegister vin2, 3886 FloatRegister vin3, FloatRegister vin4) { 3887 if (dr < 36) { 3888 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3889 } 3890 __ addv(v5, __ T2D, vrc0, vin0); 3891 __ ext(v6, __ T16B, vi2, vi3, 8); 3892 __ ext(v5, __ T16B, v5, v5, 8); 3893 __ ext(v7, __ T16B, vi1, vi2, 8); 3894 __ addv(vi3, __ T2D, vi3, v5); 3895 if (dr < 32) { 3896 __ ext(v5, __ T16B, vin3, vin4, 8); 3897 __ sha512su0(vin0, __ T2D, vin1); 3898 } 3899 __ sha512h(vi3, __ T2D, v6, v7); 3900 if (dr < 32) { 3901 __ sha512su1(vin0, __ T2D, vin2, v5); 3902 } 3903 __ addv(vi4, __ T2D, vi1, vi3); 3904 __ sha512h2(vi3, __ T2D, vi1, vi0); 3905 } 3906 3907 // Arguments: 3908 // 3909 // Inputs: 3910 // c_rarg0 - byte[] source+offset 3911 // c_rarg1 - int[] SHA.state 3912 // c_rarg2 - int offset 3913 // c_rarg3 - int limit 3914 // 3915 address generate_sha512_implCompress(StubGenStubId stub_id) { 3916 bool multi_block; 3917 switch (stub_id) { 3918 case sha512_implCompress_id: 3919 multi_block = false; 3920 break; 3921 case sha512_implCompressMB_id: 3922 multi_block = true; 3923 break; 3924 default: 3925 ShouldNotReachHere(); 3926 } 3927 3928 static const uint64_t round_consts[80] = { 3929 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3930 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3931 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3932 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3933 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3934 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3935 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3936 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3937 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3938 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 
3939 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3940 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3941 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3942 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3943 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3944 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3945 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3946 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3947 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3948 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3949 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3950 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3951 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3952 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3953 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3954 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3955 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3956 }; 3957 3958 __ align(CodeEntryAlignment); 3959 3960 StubCodeMark mark(this, stub_id); 3961 address start = __ pc(); 3962 3963 Register buf = c_rarg0; 3964 Register state = c_rarg1; 3965 Register ofs = c_rarg2; 3966 Register limit = c_rarg3; 3967 3968 __ stpd(v8, v9, __ pre(sp, -64)); 3969 __ stpd(v10, v11, Address(sp, 16)); 3970 __ stpd(v12, v13, Address(sp, 32)); 3971 __ stpd(v14, v15, Address(sp, 48)); 3972 3973 Label sha512_loop; 3974 3975 // load state 3976 __ ld1(v8, v9, v10, v11, __ T2D, state); 3977 3978 // load first 4 round constants 3979 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3980 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3981 3982 __ BIND(sha512_loop); 3983 // load 128B of data into v12..v19 3984 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3985 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3986 __ rev64(v12, __ T16B, v12); 3987 __ rev64(v13, __ T16B, v13); 3988 __ rev64(v14, __ T16B, v14); 3989 __ rev64(v15, __ T16B, v15); 3990 __ rev64(v16, __ T16B, v16); 3991 __ rev64(v17, __ T16B, v17); 3992 __ rev64(v18, __ T16B, v18); 3993 __ rev64(v19, __ T16B, v19); 3994 3995 __ mov(rscratch2, rscratch1); 3996 3997 __ mov(v0, __ T16B, v8); 3998 __ mov(v1, __ T16B, v9); 3999 __ mov(v2, __ T16B, v10); 4000 __ mov(v3, __ T16B, v11); 4001 4002 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4003 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4004 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4005 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4006 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4007 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4008 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4009 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4010 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4011 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4012 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4013 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4014 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4015 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, 
v14); 4016 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4017 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4018 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4019 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4020 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4021 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4022 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4023 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4024 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4025 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4026 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4027 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4028 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4029 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4030 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4031 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4032 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4033 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4034 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 4035 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4036 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4037 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4038 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4039 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4040 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4041 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4042 4043 __ addv(v8, __ T2D, v8, v0); 4044 __ addv(v9, __ T2D, v9, v1); 4045 __ addv(v10, __ T2D, v10, v2); 4046 __ addv(v11, __ T2D, v11, v3); 4047 4048 if (multi_block) { 4049 __ add(ofs, ofs, 128); 4050 __ cmp(ofs, limit); 4051 __ br(Assembler::LE, sha512_loop); 4052 __ mov(c_rarg0, ofs); // return ofs 4053 } 4054 4055 __ st1(v8, v9, v10, v11, __ T2D, state); 4056 4057 __ ldpd(v14, v15, Address(sp, 48)); 4058 __ ldpd(v12, v13, Address(sp, 32)); 4059 __ ldpd(v10, v11, Address(sp, 16)); 4060 __ ldpd(v8, v9, __ post(sp, 64)); 4061 4062 __ ret(lr); 4063 4064 return start; 4065 } 4066 4067 // Execute one round of keccak of two computations in parallel. 4068 // One of the states should be loaded into the lower halves of 4069 // the vector registers v0-v24, the other should be loaded into 4070 // the upper halves of those registers. The ld1r instruction loads 4071 // the round constant into both halves of register v31. 4072 // Intermediate results c0...c5 and d0...d5 are computed 4073 // in registers v25...v30. 4074 // All vector instructions that are used operate on both register 4075 // halves in parallel. 4076 // If only a single computation is needed, one can only load the lower halves. 
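  // For reference only, one round of Keccak-f[1600] (FIPS 202) in scalar
  // form, which the instruction sequence below interleaves and schedules for
  // the SIMD units (indices are mod 5; a[x,y] is the 64-bit lane in column x,
  // row y, held here as a[x + 5*y] in v0-v24):
  //
  //   C[x] = a[x,0] ^ a[x,1] ^ a[x,2] ^ a[x,3] ^ a[x,4]       // theta
  //   D[x] = C[x-1] ^ rol64(C[x+1], 1)
  //   a[x,y] ^= D[x]
  //   B[y, 2*x + 3*y] = rol64(a[x,y], r[x,y])                 // rho + pi
  //   a[x,y] = B[x,y] ^ (~B[x+1,y] & B[x+2,y])                // chi
  //   a[0,0] ^= RC[round]                                     // iota
  //
  // The eor3/rax1/xar/bcax instructions each fuse one xor-rotate or
  // and-not-xor step of the above.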
4077 void keccak_round(Register rscratch1) {
4078 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
4079 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
4080 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4081 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4082 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4083 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4084 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4085 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4086 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4087 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4088
4089 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4090 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4091 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4092 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4093 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4094
4095 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4096 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
4097 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4098 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4099 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4100 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4101 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4102 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4103 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4104 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4105 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4106 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4107 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4108 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4109 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4110 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4111 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4112 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4113 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4114 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4115 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4116 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4117 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4118 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4119 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4120
4121 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22')
4122 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4123 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4124 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4125 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4126
4127 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4128
4129 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4130 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15')
4131 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4132 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4133 __ bcax(v16, __ T16B, v16, v3, v25);
// a16 = a16 ^ (~a17' & a18') 4134 4135 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4136 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4137 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4138 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4139 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4140 4141 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4142 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4143 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4144 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4145 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4146 4147 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4148 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4149 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4150 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4151 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4152 4153 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4154 } 4155 4156 // Arguments: 4157 // 4158 // Inputs: 4159 // c_rarg0 - byte[] source+offset 4160 // c_rarg1 - byte[] SHA.state 4161 // c_rarg2 - int block_size 4162 // c_rarg3 - int offset 4163 // c_rarg4 - int limit 4164 // 4165 address generate_sha3_implCompress(StubGenStubId stub_id) { 4166 bool multi_block; 4167 switch (stub_id) { 4168 case sha3_implCompress_id: 4169 multi_block = false; 4170 break; 4171 case sha3_implCompressMB_id: 4172 multi_block = true; 4173 break; 4174 default: 4175 ShouldNotReachHere(); 4176 } 4177 4178 static const uint64_t round_consts[24] = { 4179 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4180 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4181 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4182 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4183 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4184 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4185 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4186 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4187 }; 4188 4189 __ align(CodeEntryAlignment); 4190 4191 StubCodeMark mark(this, stub_id); 4192 address start = __ pc(); 4193 4194 Register buf = c_rarg0; 4195 Register state = c_rarg1; 4196 Register block_size = c_rarg2; 4197 Register ofs = c_rarg3; 4198 Register limit = c_rarg4; 4199 4200 Label sha3_loop, rounds24_loop; 4201 Label sha3_512_or_sha3_384, shake128; 4202 4203 __ stpd(v8, v9, __ pre(sp, -64)); 4204 __ stpd(v10, v11, Address(sp, 16)); 4205 __ stpd(v12, v13, Address(sp, 32)); 4206 __ stpd(v14, v15, Address(sp, 48)); 4207 4208 // load state 4209 __ add(rscratch1, state, 32); 4210 __ ld1(v0, v1, v2, v3, __ T1D, state); 4211 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4212 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4213 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4214 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4215 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4216 __ ld1(v24, __ T1D, rscratch1); 4217 4218 __ BIND(sha3_loop); 4219 4220 // 24 keccak rounds 4221 __ movw(rscratch2, 24); 4222 4223 // load round_constants base 4224 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4225 4226 // load input 4227 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4228 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 
24)); 4229 __ eor(v0, __ T8B, v0, v25); 4230 __ eor(v1, __ T8B, v1, v26); 4231 __ eor(v2, __ T8B, v2, v27); 4232 __ eor(v3, __ T8B, v3, v28); 4233 __ eor(v4, __ T8B, v4, v29); 4234 __ eor(v5, __ T8B, v5, v30); 4235 __ eor(v6, __ T8B, v6, v31); 4236 4237 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4238 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4239 4240 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4241 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4242 __ eor(v7, __ T8B, v7, v25); 4243 __ eor(v8, __ T8B, v8, v26); 4244 __ eor(v9, __ T8B, v9, v27); 4245 __ eor(v10, __ T8B, v10, v28); 4246 __ eor(v11, __ T8B, v11, v29); 4247 __ eor(v12, __ T8B, v12, v30); 4248 __ eor(v13, __ T8B, v13, v31); 4249 4250 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4251 __ eor(v14, __ T8B, v14, v25); 4252 __ eor(v15, __ T8B, v15, v26); 4253 __ eor(v16, __ T8B, v16, v27); 4254 4255 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4256 __ andw(c_rarg5, block_size, 48); 4257 __ cbzw(c_rarg5, rounds24_loop); 4258 4259 __ tbnz(block_size, 5, shake128); 4260 // block_size == 144, bit5 == 0, SHA3-224 4261 __ ldrd(v28, __ post(buf, 8)); 4262 __ eor(v17, __ T8B, v17, v28); 4263 __ b(rounds24_loop); 4264 4265 __ BIND(shake128); 4266 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4267 __ eor(v17, __ T8B, v17, v28); 4268 __ eor(v18, __ T8B, v18, v29); 4269 __ eor(v19, __ T8B, v19, v30); 4270 __ eor(v20, __ T8B, v20, v31); 4271 __ b(rounds24_loop); // block_size == 168, SHAKE128 4272 4273 __ BIND(sha3_512_or_sha3_384); 4274 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4275 __ eor(v7, __ T8B, v7, v25); 4276 __ eor(v8, __ T8B, v8, v26); 4277 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4278 4279 // SHA3-384 4280 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4281 __ eor(v9, __ T8B, v9, v27); 4282 __ eor(v10, __ T8B, v10, v28); 4283 __ eor(v11, __ T8B, v11, v29); 4284 __ eor(v12, __ T8B, v12, v30); 4285 4286 __ BIND(rounds24_loop); 4287 __ subw(rscratch2, rscratch2, 1); 4288 4289 keccak_round(rscratch1); 4290 4291 __ cbnzw(rscratch2, rounds24_loop); 4292 4293 if (multi_block) { 4294 __ add(ofs, ofs, block_size); 4295 __ cmp(ofs, limit); 4296 __ br(Assembler::LE, sha3_loop); 4297 __ mov(c_rarg0, ofs); // return ofs 4298 } 4299 4300 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4301 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4302 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4303 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4304 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4305 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4306 __ st1(v24, __ T1D, state); 4307 4308 // restore callee-saved registers 4309 __ ldpd(v14, v15, Address(sp, 48)); 4310 __ ldpd(v12, v13, Address(sp, 32)); 4311 __ ldpd(v10, v11, Address(sp, 16)); 4312 __ ldpd(v8, v9, __ post(sp, 64)); 4313 4314 __ ret(lr); 4315 4316 return start; 4317 } 4318 4319 // Inputs: 4320 // c_rarg0 - long[] state0 4321 // c_rarg1 - long[] state1 4322 address generate_double_keccak() { 4323 static const uint64_t round_consts[24] = { 4324 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4325 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4326 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4327 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4328 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4329 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4330 0x000000000000800AL, 
0x800000008000000AL, 0x8000000080008081L,
4331 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4332 };
4333
4334 // Implements the double_keccak() method of the
4335 // sun.security.provider.SHA3Parallel class
4336 __ align(CodeEntryAlignment);
4337 StubCodeMark mark(this, "StubRoutines", "double_keccak");
4338 address start = __ pc();
4339 __ enter();
4340
4341 Register state0 = c_rarg0;
4342 Register state1 = c_rarg1;
4343
4344 Label rounds24_loop;
4345
4346 // save callee-saved registers
4347 __ stpd(v8, v9, __ pre(sp, -64));
4348 __ stpd(v10, v11, Address(sp, 16));
4349 __ stpd(v12, v13, Address(sp, 32));
4350 __ stpd(v14, v15, Address(sp, 48));
4351
4352 // load states
4353 __ add(rscratch1, state0, 32);
4354 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
4355 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
4356 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
4357 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
4358 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
4359 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
4360 __ ld1(v24, __ D, 0, rscratch1);
4361 __ add(rscratch1, state1, 32);
4362 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
4363 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
4364 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
4365 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
4366 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
4367 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
4368 __ ld1(v24, __ D, 1, rscratch1);
4369
4370 // 24 keccak rounds
4371 __ movw(rscratch2, 24);
4372
4373 // load round_constants base
4374 __ lea(rscratch1, ExternalAddress((address) round_consts));
4375
4376 __ BIND(rounds24_loop);
4377 __ subw(rscratch2, rscratch2, 1);
4378 keccak_round(rscratch1);
4379 __ cbnzw(rscratch2, rounds24_loop);
4380
4381 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
4382 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
4383 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
4384 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
4385 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
4386 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
4387 __ st1(v24, __ D, 0, state0);
4388 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
4389 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
4390 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
4391 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
4392 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
4393 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
4394 __ st1(v24, __ D, 1, state1);
4395
4396 // restore callee-saved vector registers
4397 __ ldpd(v14, v15, Address(sp, 48));
4398 __ ldpd(v12, v13, Address(sp, 32));
4399 __ ldpd(v10, v11, Address(sp, 16));
4400 __ ldpd(v8, v9, __ post(sp, 64));
4401
4402 __ leave(); // required for proper stackwalking of RuntimeStub frame
4403 __ mov(r0, zr); // return 0
4404 __ ret(lr);
4405
4406 return start;
4407 }
4408
4409 /**
4410 * Arguments:
4411 *
4412 * Inputs:
4413 * c_rarg0 - int crc
4414 * c_rarg1 - byte* buf
4415 * c_rarg2 - int length
4416 *
4417 * Output:
4418 * r0 - int crc result
4419 */
4420 address generate_updateBytesCRC32() {
4421 assert(UseCRC32Intrinsics, "what are we doing here?");
4422
4423 __ align(CodeEntryAlignment);
4424 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
4425 StubCodeMark mark(this, stub_id);
4426
4427 address start = __ pc();
4428
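  // For reference only: kernel_crc32 computes the same CRC-32 as
  // java.util.zip.CRC32 (reflected polynomial 0xEDB88320). A minimal
  // byte-at-a-time sketch of that checksum, purely illustrative ('table' is
  // the usual 256-entry lookup table, not a name used by this file):
  //
  //   uint32_t crc32(uint32_t crc, const uint8_t* buf, int len) {
  //     crc = ~crc;
  //     while (len-- > 0) {
  //       crc = (crc >> 8) ^ table[(crc ^ *buf++) & 0xff];
  //     }
  //     return ~crc;
  //   }
  //
  // The generated code below produces the same result while consuming many
  // bytes per iteration.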
4429 const Register crc = c_rarg0; // crc 4430 const Register buf = c_rarg1; // source java byte array address 4431 const Register len = c_rarg2; // length 4432 const Register table0 = c_rarg3; // crc_table address 4433 const Register table1 = c_rarg4; 4434 const Register table2 = c_rarg5; 4435 const Register table3 = c_rarg6; 4436 const Register tmp3 = c_rarg7; 4437 4438 BLOCK_COMMENT("Entry:"); 4439 __ enter(); // required for proper stackwalking of RuntimeStub frame 4440 4441 __ kernel_crc32(crc, buf, len, 4442 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4443 4444 __ leave(); // required for proper stackwalking of RuntimeStub frame 4445 __ ret(lr); 4446 4447 return start; 4448 } 4449 4450 // ChaCha20 block function. This version parallelizes 4 quarter 4451 // round operations at a time. It uses 16 SIMD registers to 4452 // produce 4 blocks of key stream. 4453 // 4454 // state (int[16]) = c_rarg0 4455 // keystream (byte[256]) = c_rarg1 4456 // return - number of bytes of keystream (always 256) 4457 // 4458 // In this approach, we load the 512-bit start state sequentially into 4459 // 4 128-bit vectors. We then make 4 4-vector copies of that starting 4460 // state, with each successive set of 4 vectors having a +1 added into 4461 // the first 32-bit lane of the 4th vector in that group (the counter). 4462 // By doing this, we can perform the block function on 4 512-bit blocks 4463 // within one run of this intrinsic. 4464 // The alignment of the data across the 4-vector group is such that at 4465 // the start it is already aligned for the first round of each two-round 4466 // loop iteration. In other words, the corresponding lanes of each vector 4467 // will contain the values needed for that quarter round operation (e.g. 4468 // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.). 4469 // In between each full round, a lane shift must occur. Within a loop 4470 // iteration, between the first and second rounds, the 2nd, 3rd, and 4th 4471 // vectors are rotated left 32, 64 and 96 bits, respectively. The result 4472 // is effectively a diagonal orientation in columnar form. After the 4473 // second full round, those registers are left-rotated again, this time 4474 // 96, 64, and 32 bits - returning the vectors to their columnar organization. 4475 // After all 10 iterations, the original state is added to each 4-vector 4476 // working state along with the add mask, and the 4 vector groups are 4477 // sequentially written to the memory dedicated for the output key stream. 4478 // 4479 // For a more detailed explanation, see Goll and Gueron, "Vectorization of 4480 // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology: 4481 // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33 4482 address generate_chacha20Block_qrpar() { 4483 Label L_Q_twoRounds, L_Q_cc20_const; 4484 // The constant data is broken into two 128-bit segments to be loaded 4485 // onto SIMD registers. The first 128 bits are a counter add overlay 4486 // that adds +1/+0/+0/+0 to the vectors holding replicated state[12]. 4487 // The second 128-bits is a table constant used for 8-bit left rotations. 4488 // on 32-bit lanes within a SIMD register. 
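    // (Reference note, illustrative only: selecting bytes {3, 0, 1, 2} of each
    //  32-bit lane, as the shuffle constant below encodes, is equivalent to
    //  rotating that lane left by 8 bits; the quarter-round helper uses it for
    //  its 8-bit rotation.)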
4489 __ BIND(L_Q_cc20_const); 4490 __ emit_int64(0x0000000000000001UL); 4491 __ emit_int64(0x0000000000000000UL); 4492 __ emit_int64(0x0605040702010003UL); 4493 __ emit_int64(0x0E0D0C0F0A09080BUL); 4494 4495 __ align(CodeEntryAlignment); 4496 StubGenStubId stub_id = StubGenStubId::chacha20Block_id; 4497 StubCodeMark mark(this, stub_id); 4498 address start = __ pc(); 4499 __ enter(); 4500 4501 const Register state = c_rarg0; 4502 const Register keystream = c_rarg1; 4503 const Register loopCtr = r10; 4504 const Register tmpAddr = r11; 4505 4506 const FloatRegister aState = v0; 4507 const FloatRegister bState = v1; 4508 const FloatRegister cState = v2; 4509 const FloatRegister dState = v3; 4510 const FloatRegister a1Vec = v4; 4511 const FloatRegister b1Vec = v5; 4512 const FloatRegister c1Vec = v6; 4513 const FloatRegister d1Vec = v7; 4514 // Skip the callee-saved registers v8 - v15 4515 const FloatRegister a2Vec = v16; 4516 const FloatRegister b2Vec = v17; 4517 const FloatRegister c2Vec = v18; 4518 const FloatRegister d2Vec = v19; 4519 const FloatRegister a3Vec = v20; 4520 const FloatRegister b3Vec = v21; 4521 const FloatRegister c3Vec = v22; 4522 const FloatRegister d3Vec = v23; 4523 const FloatRegister a4Vec = v24; 4524 const FloatRegister b4Vec = v25; 4525 const FloatRegister c4Vec = v26; 4526 const FloatRegister d4Vec = v27; 4527 const FloatRegister scratch = v28; 4528 const FloatRegister addMask = v29; 4529 const FloatRegister lrot8Tbl = v30; 4530 4531 // Load the initial state in the first 4 quadword registers, 4532 // then copy the initial state into the next 4 quadword registers 4533 // that will be used for the working state. 4534 __ ld1(aState, bState, cState, dState, __ T16B, Address(state)); 4535 4536 // Load the index register for 2 constant 128-bit data fields. 4537 // The first represents the +1/+0/+0/+0 add mask. The second is 4538 // the 8-bit left rotation. 4539 __ adr(tmpAddr, L_Q_cc20_const); 4540 __ ldpq(addMask, lrot8Tbl, Address(tmpAddr)); 4541 4542 __ mov(a1Vec, __ T16B, aState); 4543 __ mov(b1Vec, __ T16B, bState); 4544 __ mov(c1Vec, __ T16B, cState); 4545 __ mov(d1Vec, __ T16B, dState); 4546 4547 __ mov(a2Vec, __ T16B, aState); 4548 __ mov(b2Vec, __ T16B, bState); 4549 __ mov(c2Vec, __ T16B, cState); 4550 __ addv(d2Vec, __ T4S, d1Vec, addMask); 4551 4552 __ mov(a3Vec, __ T16B, aState); 4553 __ mov(b3Vec, __ T16B, bState); 4554 __ mov(c3Vec, __ T16B, cState); 4555 __ addv(d3Vec, __ T4S, d2Vec, addMask); 4556 4557 __ mov(a4Vec, __ T16B, aState); 4558 __ mov(b4Vec, __ T16B, bState); 4559 __ mov(c4Vec, __ T16B, cState); 4560 __ addv(d4Vec, __ T4S, d3Vec, addMask); 4561 4562 // Set up the 10 iteration loop 4563 __ mov(loopCtr, 10); 4564 __ BIND(L_Q_twoRounds); 4565 4566 // The first set of operations on the vectors covers the first 4 quarter 4567 // round operations: 4568 // Qround(state, 0, 4, 8,12) 4569 // Qround(state, 1, 5, 9,13) 4570 // Qround(state, 2, 6,10,14) 4571 // Qround(state, 3, 7,11,15) 4572 __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl); 4573 __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl); 4574 __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl); 4575 __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl); 4576 4577 // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to 4578 // diagonals. The a1Vec does not need to change orientation. 
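    // For reference only, each cc20_quarter_round above applies the standard
    // ChaCha20 quarter round (RFC 8439) to the corresponding 32-bit lanes of
    // its four vector arguments:
    //
    //   a += b; d ^= a; d = rotl32(d, 16);
    //   c += d; b ^= c; b = rotl32(b, 12);
    //   a += b; d ^= a; d = rotl32(d,  8);
    //   c += d; b ^= c; b = rotl32(b,  7);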
4579 __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true); 4580 __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true); 4581 __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true); 4582 __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true); 4583 4584 // The second set of operations on the vectors covers the second 4 quarter 4585 // round operations, now acting on the diagonals: 4586 // Qround(state, 0, 5,10,15) 4587 // Qround(state, 1, 6,11,12) 4588 // Qround(state, 2, 7, 8,13) 4589 // Qround(state, 3, 4, 9,14) 4590 __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl); 4591 __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl); 4592 __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl); 4593 __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl); 4594 4595 // Before we start the next iteration, we need to perform shuffles 4596 // on the b/c/d vectors to move them back to columnar organizations 4597 // from their current diagonal orientation. 4598 __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false); 4599 __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false); 4600 __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false); 4601 __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false); 4602 4603 // Decrement and iterate 4604 __ sub(loopCtr, loopCtr, 1); 4605 __ cbnz(loopCtr, L_Q_twoRounds); 4606 4607 // Once the counter reaches zero, we fall out of the loop 4608 // and need to add the initial state back into the working state 4609 // represented by the a/b/c/d1Vec registers. This is destructive 4610 // on the dState register but we no longer will need it. 4611 __ addv(a1Vec, __ T4S, a1Vec, aState); 4612 __ addv(b1Vec, __ T4S, b1Vec, bState); 4613 __ addv(c1Vec, __ T4S, c1Vec, cState); 4614 __ addv(d1Vec, __ T4S, d1Vec, dState); 4615 4616 __ addv(a2Vec, __ T4S, a2Vec, aState); 4617 __ addv(b2Vec, __ T4S, b2Vec, bState); 4618 __ addv(c2Vec, __ T4S, c2Vec, cState); 4619 __ addv(dState, __ T4S, dState, addMask); 4620 __ addv(d2Vec, __ T4S, d2Vec, dState); 4621 4622 __ addv(a3Vec, __ T4S, a3Vec, aState); 4623 __ addv(b3Vec, __ T4S, b3Vec, bState); 4624 __ addv(c3Vec, __ T4S, c3Vec, cState); 4625 __ addv(dState, __ T4S, dState, addMask); 4626 __ addv(d3Vec, __ T4S, d3Vec, dState); 4627 4628 __ addv(a4Vec, __ T4S, a4Vec, aState); 4629 __ addv(b4Vec, __ T4S, b4Vec, bState); 4630 __ addv(c4Vec, __ T4S, c4Vec, cState); 4631 __ addv(dState, __ T4S, dState, addMask); 4632 __ addv(d4Vec, __ T4S, d4Vec, dState); 4633 4634 // Write the final state back to the result buffer 4635 __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64)); 4636 __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64)); 4637 __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64)); 4638 __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64)); 4639 4640 __ mov(r0, 256); // Return length of output keystream 4641 __ leave(); 4642 __ ret(lr); 4643 4644 return start; 4645 } 4646 4647 // Helpers to schedule parallel operation bundles across vector 4648 // register sequences of size 2, 4 or 8. 
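  // For example (illustrative only), with VSeq<4> sequences starting at v0,
  // v4 and v8, a call such as vs_addv(va, __ T4S, vb, vc) below simply emits
  //   addv(v0, T4S, v4, v8);  addv(v1, T4S, v5, v9);
  //   addv(v2, T4S, v6, v10); addv(v3, T4S, v7, v11);
  // i.e. one independent instruction per sequence element, which the core can
  // issue in parallel.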
4649 4650 // Implement various primitive computations across vector sequences 4651 4652 template<int N> 4653 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4654 const VSeq<N>& v1, const VSeq<N>& v2) { 4655 for (int i = 0; i < N; i++) { 4656 __ addv(v[i], T, v1[i], v2[i]); 4657 } 4658 } 4659 4660 template<int N> 4661 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4662 const VSeq<N>& v1, const VSeq<N>& v2) { 4663 for (int i = 0; i < N; i++) { 4664 __ subv(v[i], T, v1[i], v2[i]); 4665 } 4666 } 4667 4668 template<int N> 4669 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4670 const VSeq<N>& v1, const VSeq<N>& v2) { 4671 for (int i = 0; i < N; i++) { 4672 __ mulv(v[i], T, v1[i], v2[i]); 4673 } 4674 } 4675 4676 template<int N> 4677 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) { 4678 for (int i = 0; i < N; i++) { 4679 __ negr(v[i], T, v1[i]); 4680 } 4681 } 4682 4683 template<int N> 4684 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4685 const VSeq<N>& v1, int shift) { 4686 for (int i = 0; i < N; i++) { 4687 __ sshr(v[i], T, v1[i], shift); 4688 } 4689 } 4690 4691 template<int N> 4692 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4693 for (int i = 0; i < N; i++) { 4694 __ andr(v[i], __ T16B, v1[i], v2[i]); 4695 } 4696 } 4697 4698 template<int N> 4699 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4700 for (int i = 0; i < N; i++) { 4701 __ orr(v[i], __ T16B, v1[i], v2[i]); 4702 } 4703 } 4704 4705 template<int N> 4706 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) { 4707 for (int i = 0; i < N; i++) { 4708 __ notr(v[i], __ T16B, v1[i]); 4709 } 4710 } 4711 4712 // load N/2 successive pairs of quadword values from memory in order 4713 // into N successive vector registers of the sequence via the 4714 // address supplied in base. 
4715 template<int N> 4716 void vs_ldpq(const VSeq<N>& v, Register base) { 4717 for (int i = 0; i < N; i += 2) { 4718 __ ldpq(v[i], v[i+1], Address(base, 32 * i)); 4719 } 4720 } 4721 4722 // load N/2 successive pairs of quadword values from memory in order 4723 // into N vector registers of the sequence via the address supplied 4724 // in base using post-increment addressing 4725 template<int N> 4726 void vs_ldpq_post(const VSeq<N>& v, Register base) { 4727 for (int i = 0; i < N; i += 2) { 4728 __ ldpq(v[i], v[i+1], __ post(base, 32)); 4729 } 4730 } 4731 4732 // store N successive vector registers of the sequence into N/2 4733 // successive pairs of quadword memory locations via the address 4734 // supplied in base using post-increment addressing 4735 template<int N> 4736 void vs_stpq_post(const VSeq<N>& v, Register base) { 4737 for (int i = 0; i < N; i += 2) { 4738 __ stpq(v[i], v[i+1], __ post(base, 32)); 4739 } 4740 } 4741 4742 // load N/2 pairs of quadword values from memory into N vector 4743 // registers via the address supplied in base with each pair indexed 4744 // using the the start offset plus the corresponding entry in the 4745 // offsets array 4746 template<int N> 4747 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) { 4748 for (int i = 0; i < N/2; i++) { 4749 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i])); 4750 } 4751 } 4752 4753 // store N vector registers into N/2 pairs of quadword memory 4754 // locations via the address supplied in base with each pair indexed 4755 // using the the start offset plus the corresponding entry in the 4756 // offsets array 4757 template<int N> 4758 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) { 4759 for (int i = 0; i < N/2; i++) { 4760 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i])); 4761 } 4762 } 4763 4764 // load N single quadword values from memory into N vector registers 4765 // via the address supplied in base with each value indexed using 4766 // the the start offset plus the corresponding entry in the offsets 4767 // array 4768 template<int N> 4769 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base, 4770 int start, int (&offsets)[N]) { 4771 for (int i = 0; i < N; i++) { 4772 __ ldr(v[i], T, Address(base, start + offsets[i])); 4773 } 4774 } 4775 4776 // store N vector registers into N single quadword memory locations 4777 // via the address supplied in base with each value indexed using 4778 // the the start offset plus the corresponding entry in the offsets 4779 // array 4780 template<int N> 4781 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base, 4782 int start, int (&offsets)[N]) { 4783 for (int i = 0; i < N; i++) { 4784 __ str(v[i], T, Address(base, start + offsets[i])); 4785 } 4786 } 4787 4788 // load N/2 pairs of quadword values from memory de-interleaved into 4789 // N vector registers 2 at a time via the address supplied in base 4790 // with each pair indexed using the the start offset plus the 4791 // corresponding entry in the offsets array 4792 template<int N> 4793 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base, 4794 Register tmp, int start, int (&offsets)[N/2]) { 4795 for (int i = 0; i < N/2; i++) { 4796 __ add(tmp, base, start + offsets[i]); 4797 __ ld2(v[2*i], v[2*i+1], T, tmp); 4798 } 4799 } 4800 4801 // store N vector registers 2 at a time interleaved into N/2 pairs 4802 // of quadword memory locations via the address supplied 
in base 4803 // with each pair indexed using the the start offset plus the 4804 // corresponding entry in the offsets array 4805 template<int N> 4806 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base, 4807 Register tmp, int start, int (&offsets)[N/2]) { 4808 for (int i = 0; i < N/2; i++) { 4809 __ add(tmp, base, start + offsets[i]); 4810 __ st2(v[2*i], v[2*i+1], T, tmp); 4811 } 4812 } 4813 4814 // Helper routines for various flavours of dilithium montgomery 4815 // multiply 4816 4817 // Perform 16 32-bit Montgomery multiplications in parallel 4818 // See the montMul() method of the sun.security.provider.ML_DSA class. 4819 // 4820 // Computes 4x4S results 4821 // a = b * c * 2^-32 mod MONT_Q 4822 // Inputs: vb, vc - 4x4S vector register sequences 4823 // vq - 2x4S constants <MONT_Q, MONT_Q_INV_MOD_R> 4824 // Temps: vtmp - 4x4S vector sequence trashed after call 4825 // Outputs: va - 4x4S vector register sequences 4826 // vb, vc, vtmp and vq must all be disjoint 4827 // va must be disjoint from all other inputs/temps or must equal vc 4828 // n.b. MONT_R_BITS is 32, so the right shift by it is implicit. 4829 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc, 4830 const VSeq<4>& vtmp, const VSeq<2>& vq) { 4831 assert(vs_disjoint(vb, vc), "vb and vc overlap"); 4832 assert(vs_disjoint(vb, vq), "vb and vq overlap"); 4833 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap"); 4834 4835 assert(vs_disjoint(vc, vq), "vc and vq overlap"); 4836 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap"); 4837 4838 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap"); 4839 4840 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal"); 4841 assert(vs_disjoint(va, vb), "va and vb overlap"); 4842 assert(vs_disjoint(va, vq), "va and vq overlap"); 4843 assert(vs_disjoint(va, vtmp), "va and vtmp overlap"); 4844 4845 // schedule 4 streams of instructions across the vector sequences 4846 for (int i = 0; i < 4; i++) { 4847 __ sqdmulh(vtmp[i], __ T4S, vb[i], vc[i]); // aHigh = hi32(2 * b * c) 4848 __ mulv(va[i], __ T4S, vb[i], vc[i]); // aLow = lo32(b * c) 4849 } 4850 4851 for (int i = 0; i < 4; i++) { 4852 __ mulv(va[i], __ T4S, va[i], vq[0]); // m = aLow * qinv 4853 } 4854 4855 for (int i = 0; i < 4; i++) { 4856 __ sqdmulh(va[i], __ T4S, va[i], vq[1]); // n = hi32(2 * m * q) 4857 } 4858 4859 for (int i = 0; i < 4; i++) { 4860 __ shsubv(va[i], __ T4S, vtmp[i], va[i]); // a = (aHigh - n) / 2 4861 } 4862 } 4863 4864 // Perform 2x16 32-bit Montgomery multiplications in parallel 4865 // See the montMul() method of the sun.security.provider.ML_DSA class. 4866 // 4867 // Computes 8x4S results 4868 // a = b * c * 2^-32 mod MONT_Q 4869 // Inputs: vb, vc - 8x4S vector register sequences 4870 // vq - 2x4S constants <MONT_Q, MONT_Q_INV_MOD_R> 4871 // Temps: vtmp - 4x4S vector sequence trashed after call 4872 // Outputs: va - 8x4S vector register sequences 4873 // vb, vc, vtmp and vq must all be disjoint 4874 // va must be disjoint from all other inputs/temps or must equal vc 4875 // n.b. MONT_R_BITS is 32, so the right shift by it is implicit. 4876 void vs_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc, 4877 const VSeq<4>& vtmp, const VSeq<2>& vq) { 4878 // vb, vc, vtmp and vq must be disjoint. 
va must either be
    // disjoint from all other registers or equal vc

    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");

    // we need to multiply the front and back halves of each sequence
    // 4x4S at a time because
    //
    // 1) we are currently only able to get 4-way instruction
    // parallelism at best
    //
    // 2) we need registers for the constants in vq and temporary
    // scratch registers to hold intermediate results, so vtmp can only
    // be a VSeq<4>, which means we only have 4 scratch slots

    dilithium_montmul16(vs_front(va), vs_front(vb), vs_front(vc), vtmp, vq);
    dilithium_montmul16(vs_back(va), vs_back(vb), vs_back(vc), vtmp, vq);
  }

  // perform combined montmul then add/sub on 4x4S vectors

  void dilithium_montmul16_sub_add(const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
                                   const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // compute a = montmul(a1, c)
    dilithium_montmul16(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
    vs_subv(va1, __ T4S, va0, vc);
    // and a0 = a0 + a
    vs_addv(va0, __ T4S, va0, vc);
  }

  // perform combined add/sub then montmul on 4x4S vectors

  void dilithium_sub_add_montmul16(const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
                                   const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
    // compute c = a0 - a1
    vs_subv(vtmp1, __ T4S, va0, va1);
    // output a0 = a0 + a1
    vs_addv(va0, __ T4S, va0, va1);
    // output a1 = b montmul c
    dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
  }

  // At these levels, the indices that correspond to the 'j's (and 'j+l's)
  // in the Java implementation come in sequences of at least 8, so we
  // can use ldpq to collect the corresponding data into pairs of vector
  // registers.
  // We collect the coefficients corresponding to the 'j+l' indexes into
  // the vector registers v0-v7 and the zetas into the vector registers
  // v16-v23, then we do the (Montgomery) multiplications by the zetas in
  // parallel into v16-v23, load the coeffs corresponding to the 'j'
  // indexes into v0-v7, then do the additions into v24-v31 and the
  // subtractions into v0-v7, and finally save the results back to the
  // coeffs array.
  void dilithiumNttLevel0_4(const Register dilithiumConsts,
                            const Register coeffs, const Register zetas) {
    int c1 = 0;
    int c2 = 512;
    int startIncr;
    // don't use callee save registers v8 - v15
    VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4S inputs/outputs
    VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
    VSeq<2> vq(30);                   // n.b.
constants overlap vs3 4952 int offsets[4] = { 0, 32, 64, 96 }; 4953 4954 for (int level = 0; level < 5; level++) { 4955 int c1Start = c1; 4956 int c2Start = c2; 4957 if (level == 3) { 4958 offsets[1] = 32; 4959 offsets[2] = 128; 4960 offsets[3] = 160; 4961 } else if (level == 4) { 4962 offsets[1] = 64; 4963 offsets[2] = 128; 4964 offsets[3] = 192; 4965 } 4966 4967 // for levels 1 - 4 we simply load 2 x 4 adjacent values at a 4968 // time at 4 different offsets and multiply them in order by the 4969 // next set of input values. So we employ indexed load and store 4970 // pair instructions with arrangement 4S 4971 for (int i = 0; i < 4; i++) { 4972 // reload q and qinv 4973 vs_ldpq(vq, dilithiumConsts); // qInv, q 4974 // load 8x4S coefficients via second start pos == c2 4975 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets); 4976 // load next 8x4S inputs == b 4977 vs_ldpq_post(vs2, zetas); 4978 // compute a == c2 * b mod MONT_Q 4979 vs_montmul32(vs2, vs1, vs2, vtmp, vq); 4980 // load 8x4s coefficients via first start pos == c1 4981 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 4982 // compute a1 = c1 + a 4983 vs_addv(vs3, __ T4S, vs1, vs2); 4984 // compute a2 = c1 - a 4985 vs_subv(vs1, __ T4S, vs1, vs2); 4986 // output a1 and a2 4987 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 4988 vs_stpq_indexed(vs1, coeffs, c2Start, offsets); 4989 4990 int k = 4 * level + i; 4991 4992 if (k > 7) { 4993 startIncr = 256; 4994 } else if (k == 5) { 4995 startIncr = 384; 4996 } else { 4997 startIncr = 128; 4998 } 4999 5000 c1Start += startIncr; 5001 c2Start += startIncr; 5002 } 5003 5004 c2 /= 2; 5005 } 5006 } 5007 5008 // Dilithium NTT function except for the final "normalization" to |coeff| < Q. 5009 // Implements the method 5010 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {} 5011 // of the Java class sun.security.provider 5012 // 5013 // coeffs (int[256]) = c_rarg0 5014 // zetas (int[256]) = c_rarg1 5015 address generate_dilithiumAlmostNtt() { 5016 5017 __ align(CodeEntryAlignment); 5018 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id; 5019 StubCodeMark mark(this, stub_id); 5020 address start = __ pc(); 5021 __ enter(); 5022 5023 const Register coeffs = c_rarg0; 5024 const Register zetas = c_rarg1; 5025 5026 const Register tmpAddr = r9; 5027 const Register dilithiumConsts = r10; 5028 const Register result = r11; 5029 // don't use callee save registers v8 - v15 5030 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 5031 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5032 VSeq<2> vq(30); // n.b. constants overlap vs3 5033 int offsets[4] = {0, 32, 64, 96}; 5034 int offsets1[8] = {16, 48, 80, 112, 144, 176, 208, 240 }; 5035 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5036 __ add(result, coeffs, 0); 5037 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 5038 5039 // Each level represents one iteration of the outer for loop of the Java version 5040 5041 // level 0-4 5042 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas); 5043 5044 // level 5 5045 5046 // at level 5 the coefficients we need to combine with the zetas 5047 // are grouped in memory in blocks of size 4. So, for both sets of 5048 // coefficients we load 4 adjacent values at 8 different offsets 5049 // using an indexed ldr with register variant Q and multiply them 5050 // in sequence order by the next set of inputs. Likewise we store 5051 // the resuls using an indexed str with register variant Q. 
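    // For orientation, the loops below vectorize the scalar Java loop
    // referenced above, whose per-element step is (in rough outline,
    // using the names from sun.security.provider.ML_DSA)
    //
    //   int a = montMul(zetas[k], coeffs[j + l]);
    //   coeffs[j + l] = coeffs[j] - a;
    //   coeffs[j]     = coeffs[j] + a;
    //
    // where montMul(b, c) returns b * c * 2^-32 mod MONT_Q. A plain scalar
    // sketch of that reduction (illustrative only, not generated code):
    //
    //   long prod = (long) b * c;
    //   int m = (int) prod * MONT_Q_INV_MOD_R;           // low 32 bits only
    //   int a = (int) ((prod - (long) m * MONT_Q) >> 32);
    //
    // The vector helpers above compute the same value using sqdmulh, which
    // yields hi32(2 * x * y), and shsubv, which halves the final subtract;
    // the low 32 bits of b * c and m * MONT_Q agree, so only the high
    // halves contribute to the result.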
    for (int i = 0; i < 1024; i += 256) {
      // reload constants q, qinv each iteration as they get clobbered later
      vs_ldpq(vq, dilithiumConsts); // qInv, q
      // load 32 (8x4S) coefficients via first offsets = c1
      vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
      // load next 32 (8x4S) inputs = b
      vs_ldpq_post(vs2, zetas);
      // a = b montmul c1
      vs_montmul32(vs2, vs1, vs2, vtmp, vq);
      // load 32 (8x4S) coefficients via second offsets = c2
      vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
      // add/sub with result of multiply
      vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a
      vs_subv(vs1, __ T4S, vs1, vs2); // a2 = c2 - a
      // write back new coefficients using same offsets
      vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
      vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
    }

    // level 6
    // at level 6 the coefficients we need to combine with the zetas
    // are grouped in memory in pairs, the first two being montmul
    // inputs and the second add/sub inputs. We can still implement
    // the montmul+sub+add using 4-way parallelism but only if we
    // combine the coefficients with the zetas 16 at a time. We load 8
    // adjacent values at 4 different offsets using an ld2 load with
    // arrangement 2D. That interleaves the lower and upper halves of
    // each pair of quadwords into successive vector registers. We
    // then need to montmul the 4 even elements of the coefficients
    // register sequence by the zetas in order and then add/sub the 4
    // odd elements of the coefficients register sequence. We use an
    // equivalent st2 operation to store the results back into memory
    // de-interleaved.
    for (int i = 0; i < 1024; i += 128) {
      // reload constants q, qinv each iteration as they get clobbered later
      vs_ldpq(vq, dilithiumConsts); // qInv, q
      // load interleaved 16 (4x2D) coefficients via offsets
      vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
      // load next 16 (4x4S) inputs
      vs_ldpq_post(vs_front(vs2), zetas);
      // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
      dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
                                  vs_front(vs2), vtmp, vq);
      // store interleaved 16 (4x2D) coefficients via offsets
      vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
    }

    // level 7
    // at level 7 the coefficients we need to combine with the zetas
    // occur singly, with montmul inputs alternating with add/sub
    // inputs. Once again we can use 4-way parallelism to combine 16
    // zetas at a time. However, we have to load 8 adjacent values at
    // 4 different offsets using an ld2 load with arrangement 4S. That
    // interleaves the odd words of each pair into one
    // coefficients vector register and the even words of the pair
    // into the next register. We then need to montmul the 4 even
    // elements of the coefficients register sequence by the zetas in
    // order and then add/sub the 4 odd elements of the coefficients
    // register sequence. We use an equivalent st2 operation to store
    // the results back into memory de-interleaved.
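    // As an illustrative sketch (not generated code): with arrangement 4S
    // an ld2 of the eight words w0..w7 at a given address splits them
    // across a register pair as
    //
    //   first register  <- { w0, w2, w4, w6 }
    //   second register <- { w1, w3, w5, w7 }
    //
    // and the matching st2 re-interleaves them on the way back to memory.
    // That is what lets the per-pair butterfly of this level,
    //
    //   a = montMul(zeta, coeffs[j + 1]);
    //   coeffs[j + 1] = coeffs[j] - a;
    //   coeffs[j]     = coeffs[j] + a;
    //
    // run four lanes at a time on the two halves of each register pair.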
5112 5113 for (int i = 0; i < 1024; i += 128) { 5114 // reload constants q, qinv each iteration as they get clobbered later 5115 vs_ldpq(vq, dilithiumConsts); // qInv, q 5116 // load interleaved 16 (4x4S) coefficients via offsets 5117 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 5118 // load next 16 (4x4S) inputs 5119 vs_ldpq_post(vs_front(vs2), zetas); 5120 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens 5121 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1), 5122 vs_front(vs2), vtmp, vq); 5123 // store interleaved 16 (4x4S) coefficients via offsets 5124 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 5125 } 5126 __ leave(); // required for proper stackwalking of RuntimeStub frame 5127 __ mov(r0, zr); // return 0 5128 __ ret(lr); 5129 5130 return start; 5131 } 5132 5133 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 5134 // in the Java implementation come in sequences of at least 8, so we 5135 // can use ldpq to collect the corresponding data into pairs of vector 5136 // registers 5137 // We collect the coefficients that correspond to the 'j's into vs1 5138 // the coefficiets that correspond to the 'j+l's into vs2 then 5139 // do the additions into vs3 and the subtractions into vs1 then 5140 // save the result of the additions, load the zetas into vs2 5141 // do the (Montgomery) multiplications by zeta in parallel into vs2 5142 // finally save the results back to the coeffs array 5143 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts, 5144 const Register coeffs, const Register zetas) { 5145 int c1 = 0; 5146 int c2 = 32; 5147 int startIncr; 5148 int offsets[4]; 5149 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 5150 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5151 VSeq<2> vq(30); // n.b. constants overlap vs3 5152 5153 offsets[0] = 0; 5154 5155 for (int level = 3; level < 8; level++) { 5156 int c1Start = c1; 5157 int c2Start = c2; 5158 if (level == 3) { 5159 offsets[1] = 64; 5160 offsets[2] = 128; 5161 offsets[3] = 192; 5162 } else if (level == 4) { 5163 offsets[1] = 32; 5164 offsets[2] = 128; 5165 offsets[3] = 160; 5166 } else { 5167 offsets[1] = 32; 5168 offsets[2] = 64; 5169 offsets[3] = 96; 5170 } 5171 5172 // for levels 3 - 7 we simply load 2 x 4 adjacent values at a 5173 // time at 4 different offsets and multiply them in order by the 5174 // next set of input values. So we employ indexed load and store 5175 // pair instructions with arrangement 4S 5176 for (int i = 0; i < 4; i++) { 5177 // load v1 32 (8x4S) coefficients relative to first start index 5178 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets); 5179 // load v2 32 (8x4S) coefficients relative to second start index 5180 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets); 5181 // a0 = v1 + v2 -- n.b. 
clobbers vqs 5182 vs_addv(vs3, __ T4S, vs1, vs2); 5183 // a1 = v1 - v2 5184 vs_subv(vs1, __ T4S, vs1, vs2); 5185 // save a1 relative to first start index 5186 vs_stpq_indexed(vs3, coeffs, c1Start, offsets); 5187 // load constants q, qinv each iteration as they get clobbered above 5188 vs_ldpq(vq, dilithiumConsts); // qInv, q 5189 // load b next 32 (8x4S) inputs 5190 vs_ldpq_post(vs2, zetas); 5191 // a = a1 montmul b 5192 vs_montmul32(vs2, vs1, vs2, vtmp, vq); 5193 // save a relative to second start index 5194 vs_stpq_indexed(vs2, coeffs, c2Start, offsets); 5195 5196 int k = 4 * level + i; 5197 5198 if (k < 24) { 5199 startIncr = 256; 5200 } else if (k == 25) { 5201 startIncr = 384; 5202 } else { 5203 startIncr = 128; 5204 } 5205 5206 c1Start += startIncr; 5207 c2Start += startIncr; 5208 } 5209 5210 c2 *= 2; 5211 } 5212 } 5213 5214 // Dilithium Inverse NTT function except the final mod Q division by 2^256. 5215 // Implements the method 5216 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of 5217 // the sun.security.provider.ML_DSA class. 5218 // 5219 // coeffs (int[256]) = c_rarg0 5220 // zetas (int[256]) = c_rarg1 5221 address generate_dilithiumAlmostInverseNtt() { 5222 5223 __ align(CodeEntryAlignment); 5224 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id; 5225 StubCodeMark mark(this, stub_id); 5226 address start = __ pc(); 5227 __ enter(); 5228 5229 const Register coeffs = c_rarg0; 5230 const Register zetas = c_rarg1; 5231 5232 const Register tmpAddr = r9; 5233 const Register dilithiumConsts = r10; 5234 const Register result = r11; 5235 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 5236 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5237 VSeq<2> vq(30); // n.b. constants overlap vs3 5238 int offsets[4] = { 0, 32, 64, 96 }; 5239 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 }; 5240 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 }; 5241 5242 __ add(result, coeffs, 0); 5243 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 5244 5245 // Each level represents one iteration of the outer for loop of the Java version 5246 // level0 5247 5248 // level 0 5249 // At level 0 we need to interleave adjacent quartets of 5250 // coefficients before we multiply and add/sub by the next 16 5251 // zetas just as we did for level 7 in the multiply code. So we 5252 // load and store the values using an ld2/st2 with arrangement 4S 5253 for (int i = 0; i < 1024; i += 128) { 5254 // load constants q, qinv 5255 // n.b. this can be moved out of the loop as they do not get 5256 // clobbered by first two loops 5257 vs_ldpq(vq, dilithiumConsts); // qInv, q 5258 // a0/a1 load interleaved 32 (8x4S) coefficients 5259 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 5260 // b load next 32 (8x4S) inputs 5261 vs_ldpq_post(vs_front(vs2), zetas); 5262 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 5263 // n.b. second half of vs2 provides temporary register storage 5264 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 5265 vs_front(vs2), vs_back(vs2), vtmp, vq); 5266 // a0/a1 store interleaved 32 (8x4S) coefficients 5267 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets); 5268 } 5269 5270 // level 1 5271 // At level 1 we need to interleave pairs of adjacent pairs of 5272 // coefficients before we multiply by the next 16 zetas just as we 5273 // did for level 6 in the multiply code. 
So we load and store the 5274 // values an ld2/st2 with arrangement 2D 5275 for (int i = 0; i < 1024; i += 128) { 5276 // a0/a1 load interleaved 32 (8x2D) coefficients 5277 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 5278 // b load next 16 (4x4S) inputs 5279 vs_ldpq_post(vs_front(vs2), zetas); 5280 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b) 5281 // n.b. second half of vs2 provides temporary register storage 5282 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1), 5283 vs_front(vs2), vs_back(vs2), vtmp, vq); 5284 // a0/a1 store interleaved 32 (8x2D) coefficients 5285 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets); 5286 } 5287 5288 // level 2 5289 // At level 2 coefficients come in blocks of 4. So, we load 4 5290 // adjacent coefficients at 8 distinct offsets for both the first 5291 // and second coefficient sequences, using an ldr with register 5292 // variant Q then combine them with next set of 32 zetas. Likewise 5293 // we store the results using an str with register variant Q. 5294 for (int i = 0; i < 1024; i += 256) { 5295 // c0 load 32 (8x4S) coefficients via first offsets 5296 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1); 5297 // c1 load 32 (8x4S) coefficients via second offsets 5298 vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2); 5299 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3 5300 vs_addv(vs3, __ T4S, vs1, vs2); 5301 // c = c0 - c1 5302 vs_subv(vs1, __ T4S, vs1, vs2); 5303 // store a0 32 (8x4S) coefficients via first offsets 5304 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1); 5305 // b load 32 (8x4S) next inputs 5306 vs_ldpq_post(vs2, zetas); 5307 // reload constants q, qinv -- they were clobbered earlier 5308 vs_ldpq(vq, dilithiumConsts); // qInv, q 5309 // compute a1 = b montmul c 5310 vs_montmul32(vs2, vs1, vs2, vtmp, vq); 5311 // store a1 32 (8x4S) coefficients via second offsets 5312 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2); 5313 } 5314 5315 // level 3-7 5316 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas); 5317 5318 __ leave(); // required for proper stackwalking of RuntimeStub frame 5319 __ mov(r0, zr); // return 0 5320 __ ret(lr); 5321 5322 return start; 5323 5324 } 5325 5326 // Dilithium multiply polynomials in the NTT domain. 5327 // Straightforward implementation of the method 5328 // static int implDilithiumNttMult( 5329 // int[] result, int[] ntta, int[] nttb {} of 5330 // the sun.security.provider.ML_DSA class. 5331 // 5332 // result (int[256]) = c_rarg0 5333 // poly1 (int[256]) = c_rarg1 5334 // poly2 (int[256]) = c_rarg2 5335 address generate_dilithiumNttMult() { 5336 5337 __ align(CodeEntryAlignment); 5338 StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id; 5339 StubCodeMark mark(this, stub_id); 5340 address start = __ pc(); 5341 __ enter(); 5342 5343 Label L_loop; 5344 5345 const Register result = c_rarg0; 5346 const Register poly1 = c_rarg1; 5347 const Register poly2 = c_rarg2; 5348 5349 const Register dilithiumConsts = r10; 5350 const Register len = r11; 5351 5352 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 5353 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5354 VSeq<2> vq(30); // n.b. 
constants overlap vs3 5355 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE 5356 5357 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 5358 5359 // load constants q, qinv 5360 vs_ldpq(vq, dilithiumConsts); // qInv, q 5361 // load constant rSquare into v29 5362 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare 5363 5364 __ mov(len, zr); 5365 __ add(len, len, 1024); 5366 5367 __ BIND(L_loop); 5368 5369 // b load 32 (8x4S) next inputs from poly1 5370 vs_ldpq_post(vs1, poly1); 5371 // c load 32 (8x4S) next inputs from poly2 5372 vs_ldpq_post(vs2, poly2); 5373 // compute a = b montmul c 5374 vs_montmul32(vs2, vs1, vs2, vtmp, vq); 5375 // compute a = rsquare montmul a 5376 vs_montmul32(vs2, vrsquare, vs2, vtmp, vq); 5377 // save a 32 (8x4S) results 5378 vs_stpq_post(vs2, result); 5379 5380 __ sub(len, len, 128); 5381 __ cmp(len, (u1)128); 5382 __ br(Assembler::GE, L_loop); 5383 5384 __ leave(); // required for proper stackwalking of RuntimeStub frame 5385 __ mov(r0, zr); // return 0 5386 __ ret(lr); 5387 5388 return start; 5389 5390 } 5391 5392 // Dilithium Motgomery multiply an array by a constant. 5393 // A straightforward implementation of the method 5394 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {} 5395 // of the sun.security.provider.MLDSA class 5396 // 5397 // coeffs (int[256]) = c_rarg0 5398 // constant (int) = c_rarg1 5399 address generate_dilithiumMontMulByConstant() { 5400 5401 __ align(CodeEntryAlignment); 5402 StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id; 5403 StubCodeMark mark(this, stub_id); 5404 address start = __ pc(); 5405 __ enter(); 5406 5407 Label L_loop; 5408 5409 const Register coeffs = c_rarg0; 5410 const Register constant = c_rarg1; 5411 5412 const Register dilithiumConsts = r10; 5413 const Register result = r11; 5414 const Register len = r12; 5415 5416 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs 5417 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3 5418 VSeq<2> vq(30); // n.b. constants overlap vs3 5419 VSeq<8> vconst(29, 0); // for montmul by constant 5420 5421 // results track inputs 5422 __ add(result, coeffs, 0); 5423 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 5424 5425 // load constants q, qinv -- they do not get clobbered by first two loops 5426 vs_ldpq(vq, dilithiumConsts); // qInv, q 5427 // copy caller supplied constant across vconst 5428 __ dup(vconst[0], __ T4S, constant); 5429 __ mov(len, zr); 5430 __ add(len, len, 1024); 5431 5432 __ BIND(L_loop); 5433 5434 // load next 32 inputs 5435 vs_ldpq_post(vs2, coeffs); 5436 // mont mul by constant 5437 vs_montmul32(vs2, vconst, vs2, vtmp, vq); 5438 // write next 32 results 5439 vs_stpq_post(vs2, result); 5440 5441 __ sub(len, len, 128); 5442 __ cmp(len, (u1)128); 5443 __ br(Assembler::GE, L_loop); 5444 5445 __ leave(); // required for proper stackwalking of RuntimeStub frame 5446 __ mov(r0, zr); // return 0 5447 __ ret(lr); 5448 5449 return start; 5450 5451 } 5452 5453 // Dilithium decompose poly. 
5454 // Implements the method 5455 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {} 5456 // of the sun.security.provider.ML_DSA class 5457 // 5458 // input (int[256]) = c_rarg0 5459 // lowPart (int[256]) = c_rarg1 5460 // highPart (int[256]) = c_rarg2 5461 // twoGamma2 (int) = c_rarg3 5462 // multiplier (int) = c_rarg4 5463 address generate_dilithiumDecomposePoly() { 5464 5465 __ align(CodeEntryAlignment); 5466 StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id; 5467 StubCodeMark mark(this, stub_id); 5468 address start = __ pc(); 5469 Label L_loop; 5470 5471 const Register input = c_rarg0; 5472 const Register lowPart = c_rarg1; 5473 const Register highPart = c_rarg2; 5474 const Register twoGamma2 = c_rarg3; 5475 const Register multiplier = c_rarg4; 5476 5477 const Register len = r9; 5478 const Register dilithiumConsts = r10; 5479 const Register tmp = r11; 5480 5481 VSeq<4> vs1(0), vs2(4), vs3(8); // 6 independent sets of 4x4s values 5482 VSeq<4> vs4(12), vs5(16), vtmp(20); 5483 VSeq<4> one(25, 0); // 7 constants for cross-multiplying 5484 VSeq<4> qminus1(26, 0); 5485 VSeq<4> g2(27, 0); 5486 VSeq<4> twog2(28, 0); 5487 VSeq<4> mult(29, 0); 5488 VSeq<4> q(30, 0); 5489 VSeq<4> qadd(31, 0); 5490 5491 __ enter(); 5492 5493 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 5494 5495 // save callee-saved registers 5496 __ stpd(v8, v9, __ pre(sp, -64)); 5497 __ stpd(v10, v11, Address(sp, 16)); 5498 __ stpd(v12, v13, Address(sp, 32)); 5499 __ stpd(v14, v15, Address(sp, 48)); 5500 5501 // populate constant registers 5502 __ mov(tmp, zr); 5503 __ add(tmp, tmp, 1); 5504 __ dup(one[0], __ T4S, tmp); // 1 5505 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q 5506 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce 5507 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2 5508 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce 5509 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1 5510 __ sshr(g2[0], __ T4S, v28, 1); // gamma2 5511 5512 __ mov(len, zr); 5513 __ add(len, len, 1024); 5514 5515 __ BIND(L_loop); 5516 5517 // load next 4x4S inputs interleaved: rplus --> vs1 5518 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64)); 5519 5520 // rplus = rplus - ((rplus + qadd) >> 23) * q 5521 vs_addv(vtmp, __ T4S, vs1, qadd); 5522 vs_sshr(vtmp, __ T4S, vtmp, 23); 5523 vs_mulv(vtmp, __ T4S, vtmp, q); 5524 vs_subv(vs1, __ T4S, vs1, vtmp); 5525 5526 // rplus = rplus + ((rplus >> 31) & dilithium_q); 5527 vs_sshr(vtmp, __ T4S, vs1, 31); 5528 vs_andr(vtmp, vtmp, q); 5529 vs_addv(vs1, __ T4S, vs1, vtmp); 5530 5531 // quotient --> vs2 5532 // int quotient = (rplus * multiplier) >> 22; 5533 vs_mulv(vtmp, __ T4S, vs1, mult); 5534 vs_sshr(vs2, __ T4S, vtmp, 22); 5535 5536 // r0 --> vs3 5537 // int r0 = rplus - quotient * twoGamma2; 5538 vs_mulv(vtmp, __ T4S, vs2, twog2); 5539 vs_subv(vs3, __ T4S, vs1, vtmp); 5540 5541 // mask --> vs4 5542 // int mask = (twoGamma2 - r0) >> 22; 5543 vs_subv(vtmp, __ T4S, twog2, vs3); 5544 vs_sshr(vs4, __ T4S, vtmp, 22); 5545 5546 // r0 -= (mask & twoGamma2); 5547 vs_andr(vtmp, vs4, twog2); 5548 vs_subv(vs3, __ T4S, vs3, vtmp); 5549 5550 // quotient += (mask & 1); 5551 vs_andr(vtmp, vs4, one); 5552 vs_addv(vs2, __ T4S, vs2, vtmp); 5553 5554 // mask = (twoGamma2 / 2 - r0) >> 31; 5555 vs_subv(vtmp, __ T4S, g2, vs3); 5556 vs_sshr(vs4, __ T4S, vtmp, 31); 5557 5558 // r0 -= (mask & twoGamma2); 5559 vs_andr(vtmp, vs4, twog2); 5560 vs_subv(vs3, __ T4S, 
vs3, vtmp); 5561 5562 // quotient += (mask & 1); 5563 vs_andr(vtmp, vs4, one); 5564 vs_addv(vs2, __ T4S, vs2, vtmp); 5565 5566 // r1 --> vs5 5567 // int r1 = rplus - r0 - (dilithium_q - 1); 5568 vs_subv(vtmp, __ T4S, vs1, vs3); 5569 vs_subv(vs5, __ T4S, vtmp, qminus1); 5570 5571 // r1 --> vs1 (overwriting rplus) 5572 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 5573 vs_negr(vtmp, __ T4S, vs5); 5574 vs_orr(vtmp, vs5, vtmp); 5575 vs_sshr(vs1, __ T4S, vtmp, 31); 5576 5577 // r0 += ~r1; 5578 vs_notr(vtmp, vs1); 5579 vs_addv(vs3, __ T4S, vs3, vtmp); 5580 5581 // r1 = r1 & quotient; 5582 vs_andr(vs1, vs2, vs1); 5583 5584 // store results inteleaved 5585 // lowPart[m] = r0; 5586 // highPart[m] = r1; 5587 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64)); 5588 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64)); 5589 5590 5591 __ sub(len, len, 64); 5592 __ cmp(len, (u1)64); 5593 __ br(Assembler::GE, L_loop); 5594 5595 // restore callee-saved vector registers 5596 __ ldpd(v14, v15, Address(sp, 48)); 5597 __ ldpd(v12, v13, Address(sp, 32)); 5598 __ ldpd(v10, v11, Address(sp, 16)); 5599 __ ldpd(v8, v9, __ post(sp, 64)); 5600 5601 __ leave(); // required for proper stackwalking of RuntimeStub frame 5602 __ mov(r0, zr); // return 0 5603 __ ret(lr); 5604 5605 return start; 5606 5607 } 5608 5609 /** 5610 * Arguments: 5611 * 5612 * Inputs: 5613 * c_rarg0 - int crc 5614 * c_rarg1 - byte* buf 5615 * c_rarg2 - int length 5616 * c_rarg3 - int* table 5617 * 5618 * Output: 5619 * r0 - int crc result 5620 */ 5621 address generate_updateBytesCRC32C() { 5622 assert(UseCRC32CIntrinsics, "what are we doing here?"); 5623 5624 __ align(CodeEntryAlignment); 5625 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id; 5626 StubCodeMark mark(this, stub_id); 5627 5628 address start = __ pc(); 5629 5630 const Register crc = c_rarg0; // crc 5631 const Register buf = c_rarg1; // source java byte array address 5632 const Register len = c_rarg2; // length 5633 const Register table0 = c_rarg3; // crc_table address 5634 const Register table1 = c_rarg4; 5635 const Register table2 = c_rarg5; 5636 const Register table3 = c_rarg6; 5637 const Register tmp3 = c_rarg7; 5638 5639 BLOCK_COMMENT("Entry:"); 5640 __ enter(); // required for proper stackwalking of RuntimeStub frame 5641 5642 __ kernel_crc32c(crc, buf, len, 5643 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 5644 5645 __ leave(); // required for proper stackwalking of RuntimeStub frame 5646 __ ret(lr); 5647 5648 return start; 5649 } 5650 5651 /*** 5652 * Arguments: 5653 * 5654 * Inputs: 5655 * c_rarg0 - int adler 5656 * c_rarg1 - byte* buff 5657 * c_rarg2 - int len 5658 * 5659 * Output: 5660 * c_rarg0 - int adler result 5661 */ 5662 address generate_updateBytesAdler32() { 5663 __ align(CodeEntryAlignment); 5664 StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id; 5665 StubCodeMark mark(this, stub_id); 5666 address start = __ pc(); 5667 5668 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 5669 5670 // Aliases 5671 Register adler = c_rarg0; 5672 Register s1 = c_rarg0; 5673 Register s2 = c_rarg3; 5674 Register buff = c_rarg1; 5675 Register len = c_rarg2; 5676 Register nmax = r4; 5677 Register base = r5; 5678 Register count = r6; 5679 Register temp0 = rscratch1; 5680 Register temp1 = rscratch2; 5681 FloatRegister vbytes = v0; 5682 FloatRegister vs1acc = v1; 5683 FloatRegister vs2acc = v2; 5684 FloatRegister vtable = v3; 
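    // For reference, the scalar update this stub vectorizes is (an
    // illustrative sketch, not generated code):
    //
    //   s1 = adler & 0xffff;  s2 = (adler >> 16) & 0xffff;
    //   for each input byte b { s1 += b; s2 += s1; }
    //   adler = (s2 << 16) | s1;
    //
    // with both sums reduced mod BASE often enough to avoid 32-bit
    // overflow. Since 2^16 == 15 (mod BASE), the reductions below avoid a
    // division by folding: x mod BASE is obtained from
    // 15 * (x >> 16) + (x & 0xffff), applied once or twice depending on
    // the size of x, followed by a single conditional subtract of BASE.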
5685 5686 // Max number of bytes we can process before having to take the mod 5687 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 5688 uint64_t BASE = 0xfff1; 5689 uint64_t NMAX = 0x15B0; 5690 5691 __ mov(base, BASE); 5692 __ mov(nmax, NMAX); 5693 5694 // Load accumulation coefficients for the upper 16 bits 5695 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 5696 __ ld1(vtable, __ T16B, Address(temp0)); 5697 5698 // s1 is initialized to the lower 16 bits of adler 5699 // s2 is initialized to the upper 16 bits of adler 5700 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 5701 __ uxth(s1, adler); // s1 = (adler & 0xffff) 5702 5703 // The pipelined loop needs at least 16 elements for 1 iteration 5704 // It does check this, but it is more effective to skip to the cleanup loop 5705 __ cmp(len, (u1)16); 5706 __ br(Assembler::HS, L_nmax); 5707 __ cbz(len, L_combine); 5708 5709 __ bind(L_simple_by1_loop); 5710 __ ldrb(temp0, Address(__ post(buff, 1))); 5711 __ add(s1, s1, temp0); 5712 __ add(s2, s2, s1); 5713 __ subs(len, len, 1); 5714 __ br(Assembler::HI, L_simple_by1_loop); 5715 5716 // s1 = s1 % BASE 5717 __ subs(temp0, s1, base); 5718 __ csel(s1, temp0, s1, Assembler::HS); 5719 5720 // s2 = s2 % BASE 5721 __ lsr(temp0, s2, 16); 5722 __ lsl(temp1, temp0, 4); 5723 __ sub(temp1, temp1, temp0); 5724 __ add(s2, temp1, s2, ext::uxth); 5725 5726 __ subs(temp0, s2, base); 5727 __ csel(s2, temp0, s2, Assembler::HS); 5728 5729 __ b(L_combine); 5730 5731 __ bind(L_nmax); 5732 __ subs(len, len, nmax); 5733 __ sub(count, nmax, 16); 5734 __ br(Assembler::LO, L_by16); 5735 5736 __ bind(L_nmax_loop); 5737 5738 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 5739 vbytes, vs1acc, vs2acc, vtable); 5740 5741 __ subs(count, count, 16); 5742 __ br(Assembler::HS, L_nmax_loop); 5743 5744 // s1 = s1 % BASE 5745 __ lsr(temp0, s1, 16); 5746 __ lsl(temp1, temp0, 4); 5747 __ sub(temp1, temp1, temp0); 5748 __ add(temp1, temp1, s1, ext::uxth); 5749 5750 __ lsr(temp0, temp1, 16); 5751 __ lsl(s1, temp0, 4); 5752 __ sub(s1, s1, temp0); 5753 __ add(s1, s1, temp1, ext:: uxth); 5754 5755 __ subs(temp0, s1, base); 5756 __ csel(s1, temp0, s1, Assembler::HS); 5757 5758 // s2 = s2 % BASE 5759 __ lsr(temp0, s2, 16); 5760 __ lsl(temp1, temp0, 4); 5761 __ sub(temp1, temp1, temp0); 5762 __ add(temp1, temp1, s2, ext::uxth); 5763 5764 __ lsr(temp0, temp1, 16); 5765 __ lsl(s2, temp0, 4); 5766 __ sub(s2, s2, temp0); 5767 __ add(s2, s2, temp1, ext:: uxth); 5768 5769 __ subs(temp0, s2, base); 5770 __ csel(s2, temp0, s2, Assembler::HS); 5771 5772 __ subs(len, len, nmax); 5773 __ sub(count, nmax, 16); 5774 __ br(Assembler::HS, L_nmax_loop); 5775 5776 __ bind(L_by16); 5777 __ adds(len, len, count); 5778 __ br(Assembler::LO, L_by1); 5779 5780 __ bind(L_by16_loop); 5781 5782 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 5783 vbytes, vs1acc, vs2acc, vtable); 5784 5785 __ subs(len, len, 16); 5786 __ br(Assembler::HS, L_by16_loop); 5787 5788 __ bind(L_by1); 5789 __ adds(len, len, 15); 5790 __ br(Assembler::LO, L_do_mod); 5791 5792 __ bind(L_by1_loop); 5793 __ ldrb(temp0, Address(__ post(buff, 1))); 5794 __ add(s1, temp0, s1); 5795 __ add(s2, s2, s1); 5796 __ subs(len, len, 1); 5797 __ br(Assembler::HS, L_by1_loop); 5798 5799 __ bind(L_do_mod); 5800 // s1 = s1 % BASE 5801 __ lsr(temp0, s1, 16); 5802 __ lsl(temp1, temp0, 4); 5803 __ sub(temp1, temp1, temp0); 5804 __ add(temp1, temp1, s1, ext::uxth); 5805 5806 __ lsr(temp0, temp1, 16); 
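    // second fold of the reduction: the lsl/sub/add below compute
    // s1 = 15 * (temp1 >> 16) + (temp1 & 0xffff), which leaves s1 below
    // 2 * BASE so that the conditional subtract which follows completes
    // s1 % BASE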
5807 __ lsl(s1, temp0, 4); 5808 __ sub(s1, s1, temp0); 5809 __ add(s1, s1, temp1, ext:: uxth); 5810 5811 __ subs(temp0, s1, base); 5812 __ csel(s1, temp0, s1, Assembler::HS); 5813 5814 // s2 = s2 % BASE 5815 __ lsr(temp0, s2, 16); 5816 __ lsl(temp1, temp0, 4); 5817 __ sub(temp1, temp1, temp0); 5818 __ add(temp1, temp1, s2, ext::uxth); 5819 5820 __ lsr(temp0, temp1, 16); 5821 __ lsl(s2, temp0, 4); 5822 __ sub(s2, s2, temp0); 5823 __ add(s2, s2, temp1, ext:: uxth); 5824 5825 __ subs(temp0, s2, base); 5826 __ csel(s2, temp0, s2, Assembler::HS); 5827 5828 // Combine lower bits and higher bits 5829 __ bind(L_combine); 5830 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 5831 5832 __ ret(lr); 5833 5834 return start; 5835 } 5836 5837 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 5838 Register temp0, Register temp1, FloatRegister vbytes, 5839 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 5840 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 5841 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 5842 // In non-vectorized code, we update s1 and s2 as: 5843 // s1 <- s1 + b1 5844 // s2 <- s2 + s1 5845 // s1 <- s1 + b2 5846 // s2 <- s2 + b1 5847 // ... 5848 // s1 <- s1 + b16 5849 // s2 <- s2 + s1 5850 // Putting above assignments together, we have: 5851 // s1_new = s1 + b1 + b2 + ... + b16 5852 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 5853 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 5854 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 5855 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 5856 5857 // s2 = s2 + s1 * 16 5858 __ add(s2, s2, s1, Assembler::LSL, 4); 5859 5860 // vs1acc = b1 + b2 + b3 + ... + b16 5861 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 5862 __ umullv(vs2acc, __ T8B, vtable, vbytes); 5863 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 5864 __ uaddlv(vs1acc, __ T16B, vbytes); 5865 __ uaddlv(vs2acc, __ T8H, vs2acc); 5866 5867 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 5868 __ fmovd(temp0, vs1acc); 5869 __ fmovd(temp1, vs2acc); 5870 __ add(s1, s1, temp0); 5871 __ add(s2, s2, temp1); 5872 } 5873 5874 /** 5875 * Arguments: 5876 * 5877 * Input: 5878 * c_rarg0 - x address 5879 * c_rarg1 - x length 5880 * c_rarg2 - y address 5881 * c_rarg3 - y length 5882 * c_rarg4 - z address 5883 */ 5884 address generate_multiplyToLen() { 5885 __ align(CodeEntryAlignment); 5886 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 5887 StubCodeMark mark(this, stub_id); 5888 5889 address start = __ pc(); 5890 5891 if (SCCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) { 5892 return start; 5893 } 5894 const Register x = r0; 5895 const Register xlen = r1; 5896 const Register y = r2; 5897 const Register ylen = r3; 5898 const Register z = r4; 5899 5900 const Register tmp0 = r5; 5901 const Register tmp1 = r10; 5902 const Register tmp2 = r11; 5903 const Register tmp3 = r12; 5904 const Register tmp4 = r13; 5905 const Register tmp5 = r14; 5906 const Register tmp6 = r15; 5907 const Register tmp7 = r16; 5908 5909 BLOCK_COMMENT("Entry:"); 5910 __ enter(); // required for proper stackwalking of RuntimeStub frame 5911 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 5912 __ leave(); // required for proper stackwalking of RuntimeStub frame 5913 __ ret(lr); 5914 5915 SCCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start); 5916 return start; 5917 } 5918 5919 address generate_squareToLen() { 5920 // squareToLen algorithm for sizes 1..127 described in java code works 5921 // faster than multiply_to_len on some CPUs and slower on others, but 5922 // multiply_to_len shows a bit better overall results 5923 __ align(CodeEntryAlignment); 5924 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 5925 StubCodeMark mark(this, stub_id); 5926 address start = __ pc(); 5927 5928 if (SCCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) { 5929 return start; 5930 } 5931 const Register x = r0; 5932 const Register xlen = r1; 5933 const Register z = r2; 5934 const Register y = r4; // == x 5935 const Register ylen = r5; // == xlen 5936 5937 const Register tmp0 = r3; 5938 const Register tmp1 = r10; 5939 const Register tmp2 = r11; 5940 const Register tmp3 = r12; 5941 const Register tmp4 = r13; 5942 const Register tmp5 = r14; 5943 const Register tmp6 = r15; 5944 const Register tmp7 = r16; 5945 5946 RegSet spilled_regs = RegSet::of(y, ylen); 5947 BLOCK_COMMENT("Entry:"); 5948 __ enter(); 5949 __ push(spilled_regs, sp); 5950 __ mov(y, x); 5951 __ mov(ylen, xlen); 5952 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 5953 __ pop(spilled_regs, sp); 5954 __ leave(); 5955 __ ret(lr); 5956 5957 SCCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start); 5958 return start; 5959 } 5960 5961 address generate_mulAdd() { 5962 __ align(CodeEntryAlignment); 5963 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 5964 StubCodeMark mark(this, stub_id); 5965 5966 address start = __ pc(); 5967 5968 if (SCCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) { 5969 return start; 5970 } 5971 const Register out = r0; 5972 const Register in = r1; 5973 const Register offset = r2; 5974 const Register len = r3; 5975 
const Register k = r4; 5976 5977 BLOCK_COMMENT("Entry:"); 5978 __ enter(); 5979 __ mul_add(out, in, offset, len, k); 5980 __ leave(); 5981 __ ret(lr); 5982 5983 SCCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start); 5984 return start; 5985 } 5986 5987 // Arguments: 5988 // 5989 // Input: 5990 // c_rarg0 - newArr address 5991 // c_rarg1 - oldArr address 5992 // c_rarg2 - newIdx 5993 // c_rarg3 - shiftCount 5994 // c_rarg4 - numIter 5995 // 5996 address generate_bigIntegerRightShift() { 5997 __ align(CodeEntryAlignment); 5998 StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id; 5999 StubCodeMark mark(this, stub_id); 6000 address start = __ pc(); 6001 6002 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 6003 6004 Register newArr = c_rarg0; 6005 Register oldArr = c_rarg1; 6006 Register newIdx = c_rarg2; 6007 Register shiftCount = c_rarg3; 6008 Register numIter = c_rarg4; 6009 Register idx = numIter; 6010 6011 Register newArrCur = rscratch1; 6012 Register shiftRevCount = rscratch2; 6013 Register oldArrCur = r13; 6014 Register oldArrNext = r14; 6015 6016 FloatRegister oldElem0 = v0; 6017 FloatRegister oldElem1 = v1; 6018 FloatRegister newElem = v2; 6019 FloatRegister shiftVCount = v3; 6020 FloatRegister shiftVRevCount = v4; 6021 6022 __ cbz(idx, Exit); 6023 6024 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 6025 6026 // left shift count 6027 __ movw(shiftRevCount, 32); 6028 __ subw(shiftRevCount, shiftRevCount, shiftCount); 6029 6030 // numIter too small to allow a 4-words SIMD loop, rolling back 6031 __ cmp(numIter, (u1)4); 6032 __ br(Assembler::LT, ShiftThree); 6033 6034 __ dup(shiftVCount, __ T4S, shiftCount); 6035 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 6036 __ negr(shiftVCount, __ T4S, shiftVCount); 6037 6038 __ BIND(ShiftSIMDLoop); 6039 6040 // Calculate the load addresses 6041 __ sub(idx, idx, 4); 6042 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 6043 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 6044 __ add(oldArrCur, oldArrNext, 4); 6045 6046 // Load 4 words and process 6047 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 6048 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 6049 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 6050 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 6051 __ orr(newElem, __ T16B, oldElem0, oldElem1); 6052 __ st1(newElem, __ T4S, Address(newArrCur)); 6053 6054 __ cmp(idx, (u1)4); 6055 __ br(Assembler::LT, ShiftTwoLoop); 6056 __ b(ShiftSIMDLoop); 6057 6058 __ BIND(ShiftTwoLoop); 6059 __ cbz(idx, Exit); 6060 __ cmp(idx, (u1)1); 6061 __ br(Assembler::EQ, ShiftOne); 6062 6063 // Calculate the load addresses 6064 __ sub(idx, idx, 2); 6065 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 6066 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 6067 __ add(oldArrCur, oldArrNext, 4); 6068 6069 // Load 2 words and process 6070 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 6071 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 6072 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 6073 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 6074 __ orr(newElem, __ T8B, oldElem0, oldElem1); 6075 __ st1(newElem, __ T2S, Address(newArrCur)); 6076 __ b(ShiftTwoLoop); 6077 6078 __ BIND(ShiftThree); 6079 __ tbz(idx, 1, ShiftOne); 6080 __ tbz(idx, 0, ShiftTwo); 6081 __ ldrw(r10, Address(oldArr, 12)); 6082 __ ldrw(r11, Address(oldArr, 8)); 6083 __ lsrvw(r10, r10, shiftCount); 6084 __ lslvw(r11, r11, shiftRevCount); 6085 __ orrw(r12, r10, r11); 6086 __ strw(r12, Address(newArr, 8)); 6087 6088 __ 
BIND(ShiftTwo); 6089 __ ldrw(r10, Address(oldArr, 8)); 6090 __ ldrw(r11, Address(oldArr, 4)); 6091 __ lsrvw(r10, r10, shiftCount); 6092 __ lslvw(r11, r11, shiftRevCount); 6093 __ orrw(r12, r10, r11); 6094 __ strw(r12, Address(newArr, 4)); 6095 6096 __ BIND(ShiftOne); 6097 __ ldrw(r10, Address(oldArr, 4)); 6098 __ ldrw(r11, Address(oldArr)); 6099 __ lsrvw(r10, r10, shiftCount); 6100 __ lslvw(r11, r11, shiftRevCount); 6101 __ orrw(r12, r10, r11); 6102 __ strw(r12, Address(newArr)); 6103 6104 __ BIND(Exit); 6105 __ ret(lr); 6106 6107 return start; 6108 } 6109 6110 // Arguments: 6111 // 6112 // Input: 6113 // c_rarg0 - newArr address 6114 // c_rarg1 - oldArr address 6115 // c_rarg2 - newIdx 6116 // c_rarg3 - shiftCount 6117 // c_rarg4 - numIter 6118 // 6119 address generate_bigIntegerLeftShift() { 6120 __ align(CodeEntryAlignment); 6121 StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id; 6122 StubCodeMark mark(this, stub_id); 6123 address start = __ pc(); 6124 6125 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 6126 6127 Register newArr = c_rarg0; 6128 Register oldArr = c_rarg1; 6129 Register newIdx = c_rarg2; 6130 Register shiftCount = c_rarg3; 6131 Register numIter = c_rarg4; 6132 6133 Register shiftRevCount = rscratch1; 6134 Register oldArrNext = rscratch2; 6135 6136 FloatRegister oldElem0 = v0; 6137 FloatRegister oldElem1 = v1; 6138 FloatRegister newElem = v2; 6139 FloatRegister shiftVCount = v3; 6140 FloatRegister shiftVRevCount = v4; 6141 6142 __ cbz(numIter, Exit); 6143 6144 __ add(oldArrNext, oldArr, 4); 6145 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 6146 6147 // right shift count 6148 __ movw(shiftRevCount, 32); 6149 __ subw(shiftRevCount, shiftRevCount, shiftCount); 6150 6151 // numIter too small to allow a 4-words SIMD loop, rolling back 6152 __ cmp(numIter, (u1)4); 6153 __ br(Assembler::LT, ShiftThree); 6154 6155 __ dup(shiftVCount, __ T4S, shiftCount); 6156 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 6157 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 6158 6159 __ BIND(ShiftSIMDLoop); 6160 6161 // load 4 words and process 6162 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 6163 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 6164 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 6165 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 6166 __ orr(newElem, __ T16B, oldElem0, oldElem1); 6167 __ st1(newElem, __ T4S, __ post(newArr, 16)); 6168 __ sub(numIter, numIter, 4); 6169 6170 __ cmp(numIter, (u1)4); 6171 __ br(Assembler::LT, ShiftTwoLoop); 6172 __ b(ShiftSIMDLoop); 6173 6174 __ BIND(ShiftTwoLoop); 6175 __ cbz(numIter, Exit); 6176 __ cmp(numIter, (u1)1); 6177 __ br(Assembler::EQ, ShiftOne); 6178 6179 // load 2 words and process 6180 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 6181 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 6182 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 6183 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 6184 __ orr(newElem, __ T8B, oldElem0, oldElem1); 6185 __ st1(newElem, __ T2S, __ post(newArr, 8)); 6186 __ sub(numIter, numIter, 2); 6187 __ b(ShiftTwoLoop); 6188 6189 __ BIND(ShiftThree); 6190 __ ldrw(r10, __ post(oldArr, 4)); 6191 __ ldrw(r11, __ post(oldArrNext, 4)); 6192 __ lslvw(r10, r10, shiftCount); 6193 __ lsrvw(r11, r11, shiftRevCount); 6194 __ orrw(r12, r10, r11); 6195 __ strw(r12, __ post(newArr, 4)); 6196 __ tbz(numIter, 1, Exit); 6197 __ tbz(numIter, 0, ShiftOne); 6198 6199 __ BIND(ShiftTwo); 6200 __ ldrw(r10, __ post(oldArr, 4)); 6201 __ ldrw(r11, __ post(oldArrNext, 
4)); 6202 __ lslvw(r10, r10, shiftCount); 6203 __ lsrvw(r11, r11, shiftRevCount); 6204 __ orrw(r12, r10, r11); 6205 __ strw(r12, __ post(newArr, 4)); 6206 6207 __ BIND(ShiftOne); 6208 __ ldrw(r10, Address(oldArr)); 6209 __ ldrw(r11, Address(oldArrNext)); 6210 __ lslvw(r10, r10, shiftCount); 6211 __ lsrvw(r11, r11, shiftRevCount); 6212 __ orrw(r12, r10, r11); 6213 __ strw(r12, Address(newArr)); 6214 6215 __ BIND(Exit); 6216 __ ret(lr); 6217 6218 return start; 6219 } 6220 6221 address generate_count_positives(address &count_positives_long) { 6222 const u1 large_loop_size = 64; 6223 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 6224 int dcache_line = VM_Version::dcache_line_size(); 6225 6226 Register ary1 = r1, len = r2, result = r0; 6227 6228 __ align(CodeEntryAlignment); 6229 6230 StubGenStubId stub_id = StubGenStubId::count_positives_id; 6231 StubCodeMark mark(this, stub_id); 6232 6233 address entry = __ pc(); 6234 6235 __ enter(); 6236 // precondition: a copy of len is already in result 6237 // __ mov(result, len); 6238 6239 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 6240 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 6241 6242 __ cmp(len, (u1)15); 6243 __ br(Assembler::GT, LEN_OVER_15); 6244 // The only case when execution falls into this code is when pointer is near 6245 // the end of memory page and we have to avoid reading next page 6246 __ add(ary1, ary1, len); 6247 __ subs(len, len, 8); 6248 __ br(Assembler::GT, LEN_OVER_8); 6249 __ ldr(rscratch2, Address(ary1, -8)); 6250 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 6251 __ lsrv(rscratch2, rscratch2, rscratch1); 6252 __ tst(rscratch2, UPPER_BIT_MASK); 6253 __ csel(result, zr, result, Assembler::NE); 6254 __ leave(); 6255 __ ret(lr); 6256 __ bind(LEN_OVER_8); 6257 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 6258 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 6259 __ tst(rscratch2, UPPER_BIT_MASK); 6260 __ br(Assembler::NE, RET_NO_POP); 6261 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 6262 __ lsrv(rscratch1, rscratch1, rscratch2); 6263 __ tst(rscratch1, UPPER_BIT_MASK); 6264 __ bind(RET_NO_POP); 6265 __ csel(result, zr, result, Assembler::NE); 6266 __ leave(); 6267 __ ret(lr); 6268 6269 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 6270 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 6271 6272 count_positives_long = __ pc(); // 2nd entry point 6273 6274 __ enter(); 6275 6276 __ bind(LEN_OVER_15); 6277 __ push(spilled_regs, sp); 6278 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 6279 __ cbz(rscratch2, ALIGNED); 6280 __ ldp(tmp6, tmp1, Address(ary1)); 6281 __ mov(tmp5, 16); 6282 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 6283 __ add(ary1, ary1, rscratch1); 6284 __ orr(tmp6, tmp6, tmp1); 6285 __ tst(tmp6, UPPER_BIT_MASK); 6286 __ br(Assembler::NE, RET_ADJUST); 6287 __ sub(len, len, rscratch1); 6288 6289 __ bind(ALIGNED); 6290 __ cmp(len, large_loop_size); 6291 __ br(Assembler::LT, CHECK_16); 6292 // Perform 16-byte load as early return in pre-loop to handle situation 6293 // when initially aligned large array has negative values at starting bytes, 6294 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 6295 // slower. Cases with negative bytes further ahead won't be affected that 6296 // much. 
In fact, it'll be faster due to early loads, less instructions and 6297 // less branches in LARGE_LOOP. 6298 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 6299 __ sub(len, len, 16); 6300 __ orr(tmp6, tmp6, tmp1); 6301 __ tst(tmp6, UPPER_BIT_MASK); 6302 __ br(Assembler::NE, RET_ADJUST_16); 6303 __ cmp(len, large_loop_size); 6304 __ br(Assembler::LT, CHECK_16); 6305 6306 if (SoftwarePrefetchHintDistance >= 0 6307 && SoftwarePrefetchHintDistance >= dcache_line) { 6308 // initial prefetch 6309 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 6310 } 6311 __ bind(LARGE_LOOP); 6312 if (SoftwarePrefetchHintDistance >= 0) { 6313 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 6314 } 6315 // Issue load instructions first, since it can save few CPU/MEM cycles, also 6316 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 6317 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 6318 // instructions per cycle and have less branches, but this approach disables 6319 // early return, thus, all 64 bytes are loaded and checked every time. 6320 __ ldp(tmp2, tmp3, Address(ary1)); 6321 __ ldp(tmp4, tmp5, Address(ary1, 16)); 6322 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 6323 __ ldp(tmp6, tmp1, Address(ary1, 48)); 6324 __ add(ary1, ary1, large_loop_size); 6325 __ sub(len, len, large_loop_size); 6326 __ orr(tmp2, tmp2, tmp3); 6327 __ orr(tmp4, tmp4, tmp5); 6328 __ orr(rscratch1, rscratch1, rscratch2); 6329 __ orr(tmp6, tmp6, tmp1); 6330 __ orr(tmp2, tmp2, tmp4); 6331 __ orr(rscratch1, rscratch1, tmp6); 6332 __ orr(tmp2, tmp2, rscratch1); 6333 __ tst(tmp2, UPPER_BIT_MASK); 6334 __ br(Assembler::NE, RET_ADJUST_LONG); 6335 __ cmp(len, large_loop_size); 6336 __ br(Assembler::GE, LARGE_LOOP); 6337 6338 __ bind(CHECK_16); // small 16-byte load pre-loop 6339 __ cmp(len, (u1)16); 6340 __ br(Assembler::LT, POST_LOOP16); 6341 6342 __ bind(LOOP16); // small 16-byte load loop 6343 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 6344 __ sub(len, len, 16); 6345 __ orr(tmp2, tmp2, tmp3); 6346 __ tst(tmp2, UPPER_BIT_MASK); 6347 __ br(Assembler::NE, RET_ADJUST_16); 6348 __ cmp(len, (u1)16); 6349 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 6350 6351 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 6352 __ cmp(len, (u1)8); 6353 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 6354 __ ldr(tmp3, Address(__ post(ary1, 8))); 6355 __ tst(tmp3, UPPER_BIT_MASK); 6356 __ br(Assembler::NE, RET_ADJUST); 6357 __ sub(len, len, 8); 6358 6359 __ bind(POST_LOOP16_LOAD_TAIL); 6360 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 6361 __ ldr(tmp1, Address(ary1)); 6362 __ mov(tmp2, 64); 6363 __ sub(tmp4, tmp2, len, __ LSL, 3); 6364 __ lslv(tmp1, tmp1, tmp4); 6365 __ tst(tmp1, UPPER_BIT_MASK); 6366 __ br(Assembler::NE, RET_ADJUST); 6367 // Fallthrough 6368 6369 __ bind(RET_LEN); 6370 __ pop(spilled_regs, sp); 6371 __ leave(); 6372 __ ret(lr); 6373 6374 // difference result - len is the count of guaranteed to be 6375 // positive bytes 6376 6377 __ bind(RET_ADJUST_LONG); 6378 __ add(len, len, (u1)(large_loop_size - 16)); 6379 __ bind(RET_ADJUST_16); 6380 __ add(len, len, 16); 6381 __ bind(RET_ADJUST); 6382 __ pop(spilled_regs, sp); 6383 __ leave(); 6384 __ sub(result, result, len); 6385 __ ret(lr); 6386 6387 return entry; 6388 } 6389 6390 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 6391 bool usePrefetch, Label &NOT_EQUAL) { 6392 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 6393 tmp2 = 
rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 6394 tmp7 = r12, tmp8 = r13; 6395 Label LOOP; 6396 6397 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 6398 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 6399 __ bind(LOOP); 6400 if (usePrefetch) { 6401 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 6402 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 6403 } 6404 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 6405 __ eor(tmp1, tmp1, tmp2); 6406 __ eor(tmp3, tmp3, tmp4); 6407 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 6408 __ orr(tmp1, tmp1, tmp3); 6409 __ cbnz(tmp1, NOT_EQUAL); 6410 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 6411 __ eor(tmp5, tmp5, tmp6); 6412 __ eor(tmp7, tmp7, tmp8); 6413 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 6414 __ orr(tmp5, tmp5, tmp7); 6415 __ cbnz(tmp5, NOT_EQUAL); 6416 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 6417 __ eor(tmp1, tmp1, tmp2); 6418 __ eor(tmp3, tmp3, tmp4); 6419 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 6420 __ orr(tmp1, tmp1, tmp3); 6421 __ cbnz(tmp1, NOT_EQUAL); 6422 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 6423 __ eor(tmp5, tmp5, tmp6); 6424 __ sub(cnt1, cnt1, 8 * wordSize); 6425 __ eor(tmp7, tmp7, tmp8); 6426 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 6427 // tmp6 is not used. MacroAssembler::subs is used here (rather than 6428 // cmp) because subs allows an unlimited range of immediate operand. 6429 __ subs(tmp6, cnt1, loopThreshold); 6430 __ orr(tmp5, tmp5, tmp7); 6431 __ cbnz(tmp5, NOT_EQUAL); 6432 __ br(__ GE, LOOP); 6433 // post-loop 6434 __ eor(tmp1, tmp1, tmp2); 6435 __ eor(tmp3, tmp3, tmp4); 6436 __ orr(tmp1, tmp1, tmp3); 6437 __ sub(cnt1, cnt1, 2 * wordSize); 6438 __ cbnz(tmp1, NOT_EQUAL); 6439 } 6440 6441 void generate_large_array_equals_loop_simd(int loopThreshold, 6442 bool usePrefetch, Label &NOT_EQUAL) { 6443 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 6444 tmp2 = rscratch2; 6445 Label LOOP; 6446 6447 __ bind(LOOP); 6448 if (usePrefetch) { 6449 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 6450 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 6451 } 6452 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 6453 __ sub(cnt1, cnt1, 8 * wordSize); 6454 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 6455 __ subs(tmp1, cnt1, loopThreshold); 6456 __ eor(v0, __ T16B, v0, v4); 6457 __ eor(v1, __ T16B, v1, v5); 6458 __ eor(v2, __ T16B, v2, v6); 6459 __ eor(v3, __ T16B, v3, v7); 6460 __ orr(v0, __ T16B, v0, v1); 6461 __ orr(v1, __ T16B, v2, v3); 6462 __ orr(v0, __ T16B, v0, v1); 6463 __ umov(tmp1, v0, __ D, 0); 6464 __ umov(tmp2, v0, __ D, 1); 6465 __ orr(tmp1, tmp1, tmp2); 6466 __ cbnz(tmp1, NOT_EQUAL); 6467 __ br(__ GE, LOOP); 6468 } 6469 6470 // a1 = r1 - array1 address 6471 // a2 = r2 - array2 address 6472 // result = r0 - return value. Already contains "false" 6473 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 6474 // r3-r5 are reserved temporary registers 6475 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 6476 address generate_large_array_equals() { 6477 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 6478 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 6479 tmp7 = r12, tmp8 = r13; 6480 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 6481 SMALL_LOOP, POST_LOOP; 6482 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 6483 // calculate if at least 32 prefetched bytes are used 6484 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 6485 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 6486 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 6487 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 6488 tmp5, tmp6, tmp7, tmp8); 6489 6490 __ align(CodeEntryAlignment); 6491 6492 StubGenStubId stub_id = StubGenStubId::large_array_equals_id; 6493 StubCodeMark mark(this, stub_id); 6494 6495 address entry = __ pc(); 6496 __ enter(); 6497 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 6498 // also advance pointers to use post-increment instead of pre-increment 6499 __ add(a1, a1, wordSize); 6500 __ add(a2, a2, wordSize); 6501 if (AvoidUnalignedAccesses) { 6502 // both implementations (SIMD/nonSIMD) are using relatively large load 6503 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 6504 // on some CPUs in case of address is not at least 16-byte aligned. 6505 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 6506 // load if needed at least for 1st address and make if 16-byte aligned. 6507 Label ALIGNED16; 6508 __ tbz(a1, 3, ALIGNED16); 6509 __ ldr(tmp1, Address(__ post(a1, wordSize))); 6510 __ ldr(tmp2, Address(__ post(a2, wordSize))); 6511 __ sub(cnt1, cnt1, wordSize); 6512 __ eor(tmp1, tmp1, tmp2); 6513 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 6514 __ bind(ALIGNED16); 6515 } 6516 if (UseSIMDForArrayEquals) { 6517 if (SoftwarePrefetchHintDistance >= 0) { 6518 __ subs(tmp1, cnt1, prefetchLoopThreshold); 6519 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 6520 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 6521 /* prfm = */ true, NOT_EQUAL); 6522 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 6523 __ br(__ LT, TAIL); 6524 } 6525 __ bind(NO_PREFETCH_LARGE_LOOP); 6526 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 6527 /* prfm = */ false, NOT_EQUAL); 6528 } else { 6529 __ push(spilled_regs, sp); 6530 if (SoftwarePrefetchHintDistance >= 0) { 6531 __ subs(tmp1, cnt1, prefetchLoopThreshold); 6532 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 6533 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 6534 /* prfm = */ true, NOT_EQUAL); 6535 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 6536 __ br(__ LT, TAIL); 6537 } 6538 __ bind(NO_PREFETCH_LARGE_LOOP); 6539 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 6540 /* prfm = */ false, NOT_EQUAL); 6541 } 6542 __ bind(TAIL); 6543 __ cbz(cnt1, EQUAL); 6544 __ subs(cnt1, cnt1, wordSize); 6545 __ br(__ LE, POST_LOOP); 6546 __ bind(SMALL_LOOP); 6547 __ ldr(tmp1, Address(__ post(a1, wordSize))); 6548 __ ldr(tmp2, Address(__ post(a2, wordSize))); 6549 __ subs(cnt1, cnt1, wordSize); 6550 __ eor(tmp1, tmp1, tmp2); 6551 __ cbnz(tmp1, NOT_EQUAL); 6552 __ br(__ GT, SMALL_LOOP); 6553 __ bind(POST_LOOP); 6554 __ ldr(tmp1, Address(a1, cnt1)); 6555 __ ldr(tmp2, Address(a2, cnt1)); 6556 __ eor(tmp1, tmp1, tmp2); 6557 __ cbnz(tmp1, NOT_EQUAL); 6558 __ bind(EQUAL); 6559 __ mov(result, true); 6560 __ bind(NOT_EQUAL); 6561 if (!UseSIMDForArrayEquals) { 6562 __ pop(spilled_regs, sp); 6563 } 6564 __ bind(NOT_EQUAL_NO_POP); 6565 __ leave(); 6566 __ ret(lr); 6567 return entry; 6568 } 6569 6570 // result = r0 - return value. Contains initial hashcode value on entry. 
6571 // ary = r1 - array address 6572 // cnt = r2 - elements count 6573 // Clobbers: v0-v13, rscratch1, rscratch2 6574 address generate_large_arrays_hashcode(BasicType eltype) { 6575 const Register result = r0, ary = r1, cnt = r2; 6576 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 6577 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 6578 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 6579 const FloatRegister vpowm = v13; 6580 6581 ARRAYS_HASHCODE_REGISTERS; 6582 6583 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 6584 6585 unsigned int vf; // vectorization factor 6586 bool multiply_by_halves; 6587 Assembler::SIMD_Arrangement load_arrangement; 6588 switch (eltype) { 6589 case T_BOOLEAN: 6590 case T_BYTE: 6591 load_arrangement = Assembler::T8B; 6592 multiply_by_halves = true; 6593 vf = 8; 6594 break; 6595 case T_CHAR: 6596 case T_SHORT: 6597 load_arrangement = Assembler::T8H; 6598 multiply_by_halves = true; 6599 vf = 8; 6600 break; 6601 case T_INT: 6602 load_arrangement = Assembler::T4S; 6603 multiply_by_halves = false; 6604 vf = 4; 6605 break; 6606 default: 6607 ShouldNotReachHere(); 6608 } 6609 6610 // Unroll factor 6611 const unsigned uf = 4; 6612 6613 // Effective vectorization factor 6614 const unsigned evf = vf * uf; 6615 6616 __ align(CodeEntryAlignment); 6617 6618 StubGenStubId stub_id; 6619 switch (eltype) { 6620 case T_BOOLEAN: 6621 stub_id = StubGenStubId::large_arrays_hashcode_boolean_id; 6622 break; 6623 case T_BYTE: 6624 stub_id = StubGenStubId::large_arrays_hashcode_byte_id; 6625 break; 6626 case T_CHAR: 6627 stub_id = StubGenStubId::large_arrays_hashcode_char_id; 6628 break; 6629 case T_SHORT: 6630 stub_id = StubGenStubId::large_arrays_hashcode_short_id; 6631 break; 6632 case T_INT: 6633 stub_id = StubGenStubId::large_arrays_hashcode_int_id; 6634 break; 6635 default: 6636 stub_id = StubGenStubId::NO_STUBID; 6637 ShouldNotReachHere(); 6638 }; 6639 6640 StubCodeMark mark(this, stub_id); 6641 6642 address entry = __ pc(); 6643 __ enter(); 6644 6645 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 6646 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 6647 // value shouldn't change throughout both loops. 6648 __ movw(rscratch1, intpow(31U, 3)); 6649 __ mov(vpow, Assembler::S, 0, rscratch1); 6650 __ movw(rscratch1, intpow(31U, 2)); 6651 __ mov(vpow, Assembler::S, 1, rscratch1); 6652 __ movw(rscratch1, intpow(31U, 1)); 6653 __ mov(vpow, Assembler::S, 2, rscratch1); 6654 __ movw(rscratch1, intpow(31U, 0)); 6655 __ mov(vpow, Assembler::S, 3, rscratch1); 6656 6657 __ mov(vmul0, Assembler::T16B, 0); 6658 __ mov(vmul0, Assembler::S, 3, result); 6659 6660 __ andr(rscratch2, cnt, (uf - 1) * vf); 6661 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 6662 6663 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf)); 6664 __ mov(vpowm, Assembler::S, 0, rscratch1); 6665 6666 // SMALL LOOP 6667 __ bind(SMALL_LOOP); 6668 6669 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype)))); 6670 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 6671 __ subsw(rscratch2, rscratch2, vf); 6672 6673 if (load_arrangement == Assembler::T8B) { 6674 // Extend 8B to 8H to be able to use vector multiply 6675 // instructions 6676 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 6677 if (is_signed_subword_type(eltype)) { 6678 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6679 } else { 6680 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6681 } 6682 } 6683 6684 switch (load_arrangement) { 6685 case Assembler::T4S: 6686 __ addv(vmul0, load_arrangement, vmul0, vdata0); 6687 break; 6688 case Assembler::T8B: 6689 case Assembler::T8H: 6690 assert(is_subword_type(eltype), "subword type expected"); 6691 if (is_signed_subword_type(eltype)) { 6692 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6693 } else { 6694 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6695 } 6696 break; 6697 default: 6698 __ should_not_reach_here(); 6699 } 6700 6701 // Process the upper half of a vector 6702 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 6703 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 6704 if (is_signed_subword_type(eltype)) { 6705 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6706 } else { 6707 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6708 } 6709 } 6710 6711 __ br(Assembler::HI, SMALL_LOOP); 6712 6713 // SMALL LOOP'S EPILOQUE 6714 __ lsr(rscratch2, cnt, exact_log2(evf)); 6715 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER); 6716 6717 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 6718 __ addv(vmul0, Assembler::T4S, vmul0); 6719 __ umov(result, vmul0, Assembler::S, 0); 6720 6721 // TAIL 6722 __ bind(TAIL); 6723 6724 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs 6725 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs. 
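    // For reference, the computed branch emitted below behaves like the scalar tail
    //
    //     for (unsigned i = cnt % vf; i != 0; i--) {
    //       result = 31 * result + *ary++;   // element widened according to eltype
    //     }
    //
    // unrolled vf - 1 times and entered part-way through: each load + madd pair is
    // two 4-byte instructions (hence the shift by 3 when the target PC is computed),
    // and 0x1f in rscratch2 is the multiplier 31 used by maddw. This is an
    // illustrative reading of the generated code, not an additional contract.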
6726 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 6727 __ andr(rscratch2, cnt, vf - 1); 6728 __ bind(TAIL_SHORTCUT); 6729 __ adr(rscratch1, BR_BASE); 6730 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3); 6731 __ movw(rscratch2, 0x1f); 6732 __ br(rscratch1); 6733 6734 for (size_t i = 0; i < vf - 1; ++i) { 6735 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 6736 eltype); 6737 __ maddw(result, result, rscratch2, rscratch1); 6738 } 6739 __ bind(BR_BASE); 6740 6741 __ leave(); 6742 __ ret(lr); 6743 6744 // LARGE LOOP 6745 __ bind(LARGE_LOOP_PREHEADER); 6746 6747 __ lsr(rscratch2, cnt, exact_log2(evf)); 6748 6749 if (multiply_by_halves) { 6750 // 31^4 - multiplier between lower and upper parts of a register 6751 __ movw(rscratch1, intpow(31U, vf / 2)); 6752 __ mov(vpowm, Assembler::S, 1, rscratch1); 6753 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 6754 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 6755 __ mov(vpowm, Assembler::S, 0, rscratch1); 6756 } else { 6757 // 31^16 6758 __ movw(rscratch1, intpow(31U, evf)); 6759 __ mov(vpowm, Assembler::S, 0, rscratch1); 6760 } 6761 6762 __ mov(vmul3, Assembler::T16B, 0); 6763 __ mov(vmul2, Assembler::T16B, 0); 6764 __ mov(vmul1, Assembler::T16B, 0); 6765 6766 __ bind(LARGE_LOOP); 6767 6768 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 6769 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 6770 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 6771 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 6772 6773 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 6774 Address(__ post(ary, evf * type2aelembytes(eltype)))); 6775 6776 if (load_arrangement == Assembler::T8B) { 6777 // Extend 8B to 8H to be able to use vector multiply 6778 // instructions 6779 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 6780 if (is_signed_subword_type(eltype)) { 6781 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 6782 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 6783 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 6784 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6785 } else { 6786 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 6787 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 6788 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 6789 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6790 } 6791 } 6792 6793 switch (load_arrangement) { 6794 case Assembler::T4S: 6795 __ addv(vmul3, load_arrangement, vmul3, vdata3); 6796 __ addv(vmul2, load_arrangement, vmul2, vdata2); 6797 __ addv(vmul1, load_arrangement, vmul1, vdata1); 6798 __ addv(vmul0, load_arrangement, vmul0, vdata0); 6799 break; 6800 case Assembler::T8B: 6801 case Assembler::T8H: 6802 assert(is_subword_type(eltype), "subword type expected"); 6803 if (is_signed_subword_type(eltype)) { 6804 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 6805 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 6806 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 6807 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6808 } else { 6809 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 6810 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 6811 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 6812 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6813 } 6814 break; 6815 default: 6816 __ should_not_reach_here(); 
6817 } 6818 6819 // Process the upper half of a vector 6820 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 6821 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 6822 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 6823 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 6824 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 6825 if (is_signed_subword_type(eltype)) { 6826 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 6827 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 6828 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 6829 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6830 } else { 6831 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 6832 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 6833 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 6834 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6835 } 6836 } 6837 6838 __ subsw(rscratch2, rscratch2, 1); 6839 __ br(Assembler::HI, LARGE_LOOP); 6840 6841 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 6842 __ addv(vmul3, Assembler::T4S, vmul3); 6843 __ umov(result, vmul3, Assembler::S, 0); 6844 6845 __ mov(rscratch2, intpow(31U, vf)); 6846 6847 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 6848 __ addv(vmul2, Assembler::T4S, vmul2); 6849 __ umov(rscratch1, vmul2, Assembler::S, 0); 6850 __ maddw(result, result, rscratch2, rscratch1); 6851 6852 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 6853 __ addv(vmul1, Assembler::T4S, vmul1); 6854 __ umov(rscratch1, vmul1, Assembler::S, 0); 6855 __ maddw(result, result, rscratch2, rscratch1); 6856 6857 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 6858 __ addv(vmul0, Assembler::T4S, vmul0); 6859 __ umov(rscratch1, vmul0, Assembler::S, 0); 6860 __ maddw(result, result, rscratch2, rscratch1); 6861 6862 __ andr(rscratch2, cnt, vf - 1); 6863 __ cbnz(rscratch2, TAIL_SHORTCUT); 6864 6865 __ leave(); 6866 __ ret(lr); 6867 6868 return entry; 6869 } 6870 6871 address generate_dsin_dcos(bool isCos) { 6872 __ align(CodeEntryAlignment); 6873 StubGenStubId stub_id = (isCos ? 
StubGenStubId::dcos_id : StubGenStubId::dsin_id); 6874 StubCodeMark mark(this, stub_id); 6875 address start = __ pc(); 6876 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 6877 (address)StubRoutines::aarch64::_two_over_pi, 6878 (address)StubRoutines::aarch64::_pio2, 6879 (address)StubRoutines::aarch64::_dsin_coef, 6880 (address)StubRoutines::aarch64::_dcos_coef); 6881 return start; 6882 } 6883 6884 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 6885 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 6886 Label &DIFF2) { 6887 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 6888 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 6889 6890 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 6891 __ ldr(tmpU, Address(__ post(cnt1, 8))); 6892 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 6893 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 6894 6895 __ fmovd(tmpL, vtmp3); 6896 __ eor(rscratch2, tmp3, tmpL); 6897 __ cbnz(rscratch2, DIFF2); 6898 6899 __ ldr(tmp3, Address(__ post(cnt1, 8))); 6900 __ umov(tmpL, vtmp3, __ D, 1); 6901 __ eor(rscratch2, tmpU, tmpL); 6902 __ cbnz(rscratch2, DIFF1); 6903 6904 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 6905 __ ldr(tmpU, Address(__ post(cnt1, 8))); 6906 __ fmovd(tmpL, vtmp); 6907 __ eor(rscratch2, tmp3, tmpL); 6908 __ cbnz(rscratch2, DIFF2); 6909 6910 __ ldr(tmp3, Address(__ post(cnt1, 8))); 6911 __ umov(tmpL, vtmp, __ D, 1); 6912 __ eor(rscratch2, tmpU, tmpL); 6913 __ cbnz(rscratch2, DIFF1); 6914 } 6915 6916 // r0 = result 6917 // r1 = str1 6918 // r2 = cnt1 6919 // r3 = str2 6920 // r4 = cnt2 6921 // r10 = tmp1 6922 // r11 = tmp2 6923 address generate_compare_long_string_different_encoding(bool isLU) { 6924 __ align(CodeEntryAlignment); 6925 StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id); 6926 StubCodeMark mark(this, stub_id); 6927 address entry = __ pc(); 6928 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 6929 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 6930 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 6931 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 6932 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 6933 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 6934 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 6935 6936 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 6937 6938 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 6939 // cnt2 == amount of characters left to compare 6940 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 6941 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 6942 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 6943 __ add(str2, str2, isLU ? wordSize : wordSize/2); 6944 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 6945 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 6946 __ eor(rscratch2, tmp1, tmp2); 6947 __ mov(rscratch1, tmp2); 6948 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 6949 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 6950 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 6951 __ push(spilled_regs, sp); 6952 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 6953 __ mov(cnt1, isLU ? 
str2 : str1); // init the pointer to U next load 6954 6955 __ ldr(tmp3, Address(__ post(cnt1, 8))); 6956 6957 if (SoftwarePrefetchHintDistance >= 0) { 6958 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 6959 __ br(__ LT, NO_PREFETCH); 6960 __ bind(LARGE_LOOP_PREFETCH); 6961 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 6962 __ mov(tmp4, 2); 6963 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 6964 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 6965 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 6966 __ subs(tmp4, tmp4, 1); 6967 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 6968 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 6969 __ mov(tmp4, 2); 6970 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 6971 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 6972 __ subs(tmp4, tmp4, 1); 6973 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 6974 __ sub(cnt2, cnt2, 64); 6975 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 6976 __ br(__ GE, LARGE_LOOP_PREFETCH); 6977 } 6978 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 6979 __ bind(NO_PREFETCH); 6980 __ subs(cnt2, cnt2, 16); 6981 __ br(__ LT, TAIL); 6982 __ align(OptoLoopAlignment); 6983 __ bind(SMALL_LOOP); // smaller loop 6984 __ subs(cnt2, cnt2, 16); 6985 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 6986 __ br(__ GE, SMALL_LOOP); 6987 __ cmn(cnt2, (u1)16); 6988 __ br(__ EQ, LOAD_LAST); 6989 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 6990 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 6991 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 6992 __ ldr(tmp3, Address(cnt1, -8)); 6993 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 6994 __ b(LOAD_LAST); 6995 __ bind(DIFF2); 6996 __ mov(tmpU, tmp3); 6997 __ bind(DIFF1); 6998 __ pop(spilled_regs, sp); 6999 __ b(CALCULATE_DIFFERENCE); 7000 __ bind(LOAD_LAST); 7001 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 7002 // No need to load it again 7003 __ mov(tmpU, tmp3); 7004 __ pop(spilled_regs, sp); 7005 7006 // tmp2 points to the address of the last 4 Latin1 characters right now 7007 __ ldrs(vtmp, Address(tmp2)); 7008 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 7009 __ fmovd(tmpL, vtmp); 7010 7011 __ eor(rscratch2, tmpU, tmpL); 7012 __ cbz(rscratch2, DONE); 7013 7014 // Find the first different characters in the longwords and 7015 // compute their difference. 
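    // In scalar terms the CALCULATE_DIFFERENCE block below does roughly the
    // following (illustrative sketch; a and b stand for the two 8-byte groups
    // of already-widened characters in tmp1 and rscratch1, x for their XOR in
    // rscratch2, which is known to be non-zero here):
    //
    //     int k = index of the lowest non-zero 16-bit half-word of x;   // 0..3
    //     return (int)((a >> (16 * k)) & 0xffff) - (int)((b >> (16 * k)) & 0xffff);
    //
    // rev + clz locate the first differing byte counting from the least
    // significant end, and the "andr ... -16" rounds that bit position down to
    // the start of the containing half-word before both operands are shifted
    // down and zero-extended.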
7016 __ bind(CALCULATE_DIFFERENCE); 7017 __ rev(rscratch2, rscratch2); 7018 __ clz(rscratch2, rscratch2); 7019 __ andr(rscratch2, rscratch2, -16); 7020 __ lsrv(tmp1, tmp1, rscratch2); 7021 __ uxthw(tmp1, tmp1); 7022 __ lsrv(rscratch1, rscratch1, rscratch2); 7023 __ uxthw(rscratch1, rscratch1); 7024 __ subw(result, tmp1, rscratch1); 7025 __ bind(DONE); 7026 __ ret(lr); 7027 return entry; 7028 } 7029 7030 // r0 = input (float16) 7031 // v0 = result (float) 7032 // v1 = temporary float register 7033 address generate_float16ToFloat() { 7034 __ align(CodeEntryAlignment); 7035 StubGenStubId stub_id = StubGenStubId::hf2f_id; 7036 StubCodeMark mark(this, stub_id); 7037 address entry = __ pc(); 7038 BLOCK_COMMENT("Entry:"); 7039 __ flt16_to_flt(v0, r0, v1); 7040 __ ret(lr); 7041 return entry; 7042 } 7043 7044 // v0 = input (float) 7045 // r0 = result (float16) 7046 // v1 = temporary float register 7047 address generate_floatToFloat16() { 7048 __ align(CodeEntryAlignment); 7049 StubGenStubId stub_id = StubGenStubId::f2hf_id; 7050 StubCodeMark mark(this, stub_id); 7051 address entry = __ pc(); 7052 BLOCK_COMMENT("Entry:"); 7053 __ flt_to_flt16(r0, v0, v1); 7054 __ ret(lr); 7055 return entry; 7056 } 7057 7058 address generate_method_entry_barrier() { 7059 __ align(CodeEntryAlignment); 7060 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 7061 StubCodeMark mark(this, stub_id); 7062 7063 Label deoptimize_label; 7064 7065 address start = __ pc(); 7066 7067 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 7068 7069 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 7070 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 7071 // We can get here despite the nmethod being good, if we have not 7072 // yet applied our cross modification fence (or data fence). 7073 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 7074 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 7075 __ ldrw(rscratch2, rscratch2); 7076 __ strw(rscratch2, thread_epoch_addr); 7077 __ isb(); 7078 __ membar(__ LoadLoad); 7079 } 7080 7081 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 7082 7083 __ enter(); 7084 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 7085 7086 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 7087 7088 __ push_call_clobbered_registers(); 7089 7090 __ mov(c_rarg0, rscratch2); 7091 __ call_VM_leaf 7092 (CAST_FROM_FN_PTR 7093 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 7094 7095 __ reset_last_Java_frame(true); 7096 7097 __ mov(rscratch1, r0); 7098 7099 __ pop_call_clobbered_registers(); 7100 7101 __ cbnz(rscratch1, deoptimize_label); 7102 7103 __ leave(); 7104 __ ret(lr); 7105 7106 __ BIND(deoptimize_label); 7107 7108 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 7109 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 7110 7111 __ mov(sp, rscratch1); 7112 __ br(rscratch2); 7113 7114 return start; 7115 } 7116 7117 // r0 = result 7118 // r1 = str1 7119 // r2 = cnt1 7120 // r3 = str2 7121 // r4 = cnt2 7122 // r10 = tmp1 7123 // r11 = tmp2 7124 address generate_compare_long_string_same_encoding(bool isLL) { 7125 __ align(CodeEntryAlignment); 7126 StubGenStubId stub_id = (isLL ? 
StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id); 7127 StubCodeMark mark(this, stub_id); 7128 address entry = __ pc(); 7129 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 7130 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 7131 7132 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 7133 7134 // exit from large loop when less than 64 bytes left to read or we're about 7135 // to prefetch memory behind array border 7136 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 7137 7138 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 7139 __ eor(rscratch2, tmp1, tmp2); 7140 __ cbnz(rscratch2, CAL_DIFFERENCE); 7141 7142 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 7143 // update pointers, because of previous read 7144 __ add(str1, str1, wordSize); 7145 __ add(str2, str2, wordSize); 7146 if (SoftwarePrefetchHintDistance >= 0) { 7147 __ align(OptoLoopAlignment); 7148 __ bind(LARGE_LOOP_PREFETCH); 7149 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 7150 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 7151 7152 for (int i = 0; i < 4; i++) { 7153 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 7154 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 7155 __ cmp(tmp1, tmp2); 7156 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 7157 __ br(Assembler::NE, DIFF); 7158 } 7159 __ sub(cnt2, cnt2, isLL ? 64 : 32); 7160 __ add(str1, str1, 64); 7161 __ add(str2, str2, 64); 7162 __ subs(rscratch2, cnt2, largeLoopExitCondition); 7163 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 7164 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 7165 } 7166 7167 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 7168 __ br(Assembler::LE, LESS16); 7169 __ align(OptoLoopAlignment); 7170 __ bind(LOOP_COMPARE16); 7171 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 7172 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 7173 __ cmp(tmp1, tmp2); 7174 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 7175 __ br(Assembler::NE, DIFF); 7176 __ sub(cnt2, cnt2, isLL ? 16 : 8); 7177 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 7178 __ br(Assembler::LT, LESS16); 7179 7180 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 7181 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 7182 __ cmp(tmp1, tmp2); 7183 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 7184 __ br(Assembler::NE, DIFF); 7185 __ sub(cnt2, cnt2, isLL ? 16 : 8); 7186 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 7187 __ br(Assembler::GE, LOOP_COMPARE16); 7188 __ cbz(cnt2, LENGTH_DIFF); 7189 7190 __ bind(LESS16); 7191 // each 8 compare 7192 __ subs(cnt2, cnt2, isLL ? 8 : 4); 7193 __ br(Assembler::LE, LESS8); 7194 __ ldr(tmp1, Address(__ post(str1, 8))); 7195 __ ldr(tmp2, Address(__ post(str2, 8))); 7196 __ eor(rscratch2, tmp1, tmp2); 7197 __ cbnz(rscratch2, CAL_DIFFERENCE); 7198 __ sub(cnt2, cnt2, isLL ? 
8 : 4); 7199 7200 __ bind(LESS8); // directly load last 8 bytes 7201 if (!isLL) { 7202 __ add(cnt2, cnt2, cnt2); 7203 } 7204 __ ldr(tmp1, Address(str1, cnt2)); 7205 __ ldr(tmp2, Address(str2, cnt2)); 7206 __ eor(rscratch2, tmp1, tmp2); 7207 __ cbz(rscratch2, LENGTH_DIFF); 7208 __ b(CAL_DIFFERENCE); 7209 7210 __ bind(DIFF); 7211 __ cmp(tmp1, tmp2); 7212 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 7213 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 7214 // reuse rscratch2 register for the result of eor instruction 7215 __ eor(rscratch2, tmp1, tmp2); 7216 7217 __ bind(CAL_DIFFERENCE); 7218 __ rev(rscratch2, rscratch2); 7219 __ clz(rscratch2, rscratch2); 7220 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 7221 __ lsrv(tmp1, tmp1, rscratch2); 7222 __ lsrv(tmp2, tmp2, rscratch2); 7223 if (isLL) { 7224 __ uxtbw(tmp1, tmp1); 7225 __ uxtbw(tmp2, tmp2); 7226 } else { 7227 __ uxthw(tmp1, tmp1); 7228 __ uxthw(tmp2, tmp2); 7229 } 7230 __ subw(result, tmp1, tmp2); 7231 7232 __ bind(LENGTH_DIFF); 7233 __ ret(lr); 7234 return entry; 7235 } 7236 7237 enum string_compare_mode { 7238 LL, 7239 LU, 7240 UL, 7241 UU, 7242 }; 7243 7244 // The following registers are declared in aarch64.ad 7245 // r0 = result 7246 // r1 = str1 7247 // r2 = cnt1 7248 // r3 = str2 7249 // r4 = cnt2 7250 // r10 = tmp1 7251 // r11 = tmp2 7252 // z0 = ztmp1 7253 // z1 = ztmp2 7254 // p0 = pgtmp1 7255 // p1 = pgtmp2 7256 address generate_compare_long_string_sve(string_compare_mode mode) { 7257 StubGenStubId stub_id; 7258 switch (mode) { 7259 case LL: stub_id = StubGenStubId::compare_long_string_LL_id; break; 7260 case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break; 7261 case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break; 7262 case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break; 7263 default: ShouldNotReachHere(); 7264 } 7265 7266 __ align(CodeEntryAlignment); 7267 address entry = __ pc(); 7268 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 7269 tmp1 = r10, tmp2 = r11; 7270 7271 Label LOOP, DONE, MISMATCH; 7272 Register vec_len = tmp1; 7273 Register idx = tmp2; 7274 // The minimum of the string lengths has been stored in cnt2. 7275 Register cnt = cnt2; 7276 FloatRegister ztmp1 = z0, ztmp2 = z1; 7277 PRegister pgtmp1 = p0, pgtmp2 = p1; 7278 7279 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 7280 switch (mode) { \ 7281 case LL: \ 7282 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 7283 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 7284 break; \ 7285 case LU: \ 7286 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 7287 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 7288 break; \ 7289 case UL: \ 7290 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 7291 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 7292 break; \ 7293 case UU: \ 7294 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 7295 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 7296 break; \ 7297 default: \ 7298 ShouldNotReachHere(); \ 7299 } 7300 7301 StubCodeMark mark(this, stub_id); 7302 7303 __ mov(idx, 0); 7304 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 7305 7306 if (mode == LL) { 7307 __ sve_cntb(vec_len); 7308 } else { 7309 __ sve_cnth(vec_len); 7310 } 7311 7312 __ sub(rscratch1, cnt, vec_len); 7313 7314 __ bind(LOOP); 7315 7316 // main loop 7317 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 7318 __ add(idx, idx, vec_len); 7319 // Compare strings. 
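    // The loads and the compare below are governed by the predicate pgtmp1:
    // whilelt only activates lanes whose element index is below cnt, so a
    // partially-filled vector compares just the characters that are really
    // there. A rough scalar equivalent of the loop structure (illustrative
    // only, not the exact instruction flow):
    //
    //     do {
    //       if (any active lane of str1[idx..] != str2[idx..]) goto MISMATCH;
    //       idx += vec_len;
    //     } while (idx < cnt - vec_len);
    //     // post loop: one final compare under a re-computed predicate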
    __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
    __ br(__ NE, MISMATCH);
    __ cmp(idx, rscratch1);
    __ br(__ LT, LOOP);

    // post loop, last iteration
    __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);

    LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
    __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
    __ br(__ EQ, DONE);

    __ bind(MISMATCH);

    // Crop the vector to find its location.
    __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
    // Extract the first different characters of each string.
    __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
    __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);

    // Compute the difference of the first different characters.
    __ sub(result, rscratch1, rscratch2);

    __ bind(DONE);
    __ ret(lr);
#undef LOAD_PAIR
    return entry;
  }

  void generate_compare_long_strings() {
    if (UseSVE == 0) {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_same_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_same_encoding(false);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_different_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_different_encoding(false);
    } else {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_sve(LL);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_sve(UU);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_sve(LU);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_sve(UL);
    }
  }

  // R0 = result
  // R1 = str2
  // R2 = cnt1
  // R3 = str1
  // R4 = cnt2
  // Clobbers: rscratch1, rscratch2, v0, v1, rflags
  //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) since the pattern length is >= 8, we can safely keep at least its first
  //    register worth of characters loaded, skipping repeated initial loads
  //    (this helps on systems with a single load pipeline)
  // 2) we use the "fast" single-character search algorithm to find the first
  //    symbol with fewer branches (one branch per loaded register instead of
  //    one branch per symbol); this is where constants like 0x0101...01,
  //    0x00010001...0001, 0x7f7f...7f and 0x7fff7fff...7fff come from
  // 3) after the first register of the source string has been loaded and
  //    analyzed, it can be reused to search for every occurrence of the first
  //    character, saving a few loads compared to a simpler-but-slower
  //    implementation
  // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
  //    re-initializes and compresses register values, which makes the code
  //    larger and a bit less readable; however, most of the extra operations
  //    are issued during loads or branches, so the penalty is minimal
  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
    StubGenStubId stub_id;
    if (str1_isL) {
      if (str2_isL) {
        stub_id = StubGenStubId::string_indexof_linear_ll_id;
      } else {
        stub_id = StubGenStubId::string_indexof_linear_ul_id;
      }
    } else {
      if (str2_isL) {
        ShouldNotReachHere();
      } else {
        stub_id =
StubGenStubId::string_indexof_linear_uu_id; 7405 } 7406 } 7407 __ align(CodeEntryAlignment); 7408 StubCodeMark mark(this, stub_id); 7409 address entry = __ pc(); 7410 7411 int str1_chr_size = str1_isL ? 1 : 2; 7412 int str2_chr_size = str2_isL ? 1 : 2; 7413 int str1_chr_shift = str1_isL ? 0 : 1; 7414 int str2_chr_shift = str2_isL ? 0 : 1; 7415 bool isL = str1_isL && str2_isL; 7416 // parameters 7417 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 7418 // temporary registers 7419 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 7420 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 7421 // redefinitions 7422 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 7423 7424 __ push(spilled_regs, sp); 7425 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 7426 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 7427 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 7428 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 7429 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 7430 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 7431 // Read whole register from str1. It is safe, because length >=8 here 7432 __ ldr(ch1, Address(str1)); 7433 // Read whole register from str2. It is safe, because length >=8 here 7434 __ ldr(ch2, Address(str2)); 7435 __ sub(cnt2, cnt2, cnt1); 7436 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 7437 if (str1_isL != str2_isL) { 7438 __ eor(v0, __ T16B, v0, v0); 7439 } 7440 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 7441 __ mul(first, first, tmp1); 7442 // check if we have less than 1 register to check 7443 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 7444 if (str1_isL != str2_isL) { 7445 __ fmovd(v1, ch1); 7446 } 7447 __ br(__ LE, L_SMALL); 7448 __ eor(ch2, first, ch2); 7449 if (str1_isL != str2_isL) { 7450 __ zip1(v1, __ T16B, v1, v0); 7451 } 7452 __ sub(tmp2, ch2, tmp1); 7453 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7454 __ bics(tmp2, tmp2, ch2); 7455 if (str1_isL != str2_isL) { 7456 __ fmovd(ch1, v1); 7457 } 7458 __ br(__ NE, L_HAS_ZERO); 7459 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 7460 __ add(result, result, wordSize/str2_chr_size); 7461 __ add(str2, str2, wordSize); 7462 __ br(__ LT, L_POST_LOOP); 7463 __ BIND(L_LOOP); 7464 __ ldr(ch2, Address(str2)); 7465 __ eor(ch2, first, ch2); 7466 __ sub(tmp2, ch2, tmp1); 7467 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7468 __ bics(tmp2, tmp2, ch2); 7469 __ br(__ NE, L_HAS_ZERO); 7470 __ BIND(L_LOOP_PROCEED); 7471 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 7472 __ add(str2, str2, wordSize); 7473 __ add(result, result, wordSize/str2_chr_size); 7474 __ br(__ GE, L_LOOP); 7475 __ BIND(L_POST_LOOP); 7476 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 7477 __ br(__ LE, NOMATCH); 7478 __ ldr(ch2, Address(str2)); 7479 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 7480 __ eor(ch2, first, ch2); 7481 __ sub(tmp2, ch2, tmp1); 7482 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7483 __ mov(tmp4, -1); // all bits set 7484 __ b(L_SMALL_PROCEED); 7485 __ align(OptoLoopAlignment); 7486 __ BIND(L_SMALL); 7487 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 7488 __ eor(ch2, first, ch2); 7489 if (str1_isL != str2_isL) { 7490 __ zip1(v1, __ T16B, v1, v0); 7491 } 7492 __ sub(tmp2, ch2, tmp1); 7493 __ mov(tmp4, -1); // all bits set 7494 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7495 if (str1_isL != str2_isL) { 7496 __ fmovd(ch1, v1); // move converted 4 symbols 7497 } 7498 __ BIND(L_SMALL_PROCEED); 7499 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 7500 __ bic(tmp2, tmp2, ch2); 7501 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 7502 __ rbit(tmp2, tmp2); 7503 __ br(__ EQ, NOMATCH); 7504 __ BIND(L_SMALL_HAS_ZERO_LOOP); 7505 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 7506 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 7507 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 7508 if (str2_isL) { // LL 7509 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 7510 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 7511 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 7512 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 7513 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7514 } else { 7515 __ mov(ch2, 0xE); // all bits in byte set except last one 7516 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7517 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7518 __ lslv(tmp2, tmp2, tmp4); 7519 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7520 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7521 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7522 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7523 } 7524 __ cmp(ch1, ch2); 7525 __ mov(tmp4, wordSize/str2_chr_size); 7526 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 7527 __ BIND(L_SMALL_CMP_LOOP); 7528 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 7529 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 7530 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 7531 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 7532 __ add(tmp4, tmp4, 1); 7533 __ cmp(tmp4, cnt1); 7534 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 7535 __ cmp(first, ch2); 7536 __ br(__ EQ, L_SMALL_CMP_LOOP); 7537 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 7538 __ cbz(tmp2, NOMATCH); // no more matches. exit 7539 __ clz(tmp4, tmp2); 7540 __ add(result, result, 1); // advance index 7541 __ add(str2, str2, str2_chr_size); // advance pointer 7542 __ b(L_SMALL_HAS_ZERO_LOOP); 7543 __ align(OptoLoopAlignment); 7544 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 7545 __ cmp(first, ch2); 7546 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 7547 __ b(DONE); 7548 __ align(OptoLoopAlignment); 7549 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 7550 if (str2_isL) { // LL 7551 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 7552 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 7553 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 7554 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 7555 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7556 } else { 7557 __ mov(ch2, 0xE); // all bits in byte set except last one 7558 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7559 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
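    // Bookkeeping note for the code above and below: tmp4 holds the bit
    // position of the current candidate match within the bit-reversed match
    // mask in tmp2. Shifting tmp2 left by tmp4 and then by one more bit
    // discards this candidate so that the next clz finds the following one,
    // while the same bit position, scaled down by LogBitsPerByte (plus
    // str2_chr_shift for UTF-16 data), gives the character offset used to
    // advance result and str2.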
7560 __ lslv(tmp2, tmp2, tmp4); 7561 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7562 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7563 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7564 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7565 } 7566 __ cmp(ch1, ch2); 7567 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 7568 __ b(DONE); 7569 __ align(OptoLoopAlignment); 7570 __ BIND(L_HAS_ZERO); 7571 __ rbit(tmp2, tmp2); 7572 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 7573 // Now, perform compression of counters(cnt2 and cnt1) into one register. 7574 // It's fine because both counters are 32bit and are not changed in this 7575 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 7576 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 7577 __ sub(result, result, 1); 7578 __ BIND(L_HAS_ZERO_LOOP); 7579 __ mov(cnt1, wordSize/str2_chr_size); 7580 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 7581 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 7582 if (str2_isL) { 7583 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 7584 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7585 __ lslv(tmp2, tmp2, tmp4); 7586 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7587 __ add(tmp4, tmp4, 1); 7588 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7589 __ lsl(tmp2, tmp2, 1); 7590 __ mov(tmp4, wordSize/str2_chr_size); 7591 } else { 7592 __ mov(ch2, 0xE); 7593 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7594 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7595 __ lslv(tmp2, tmp2, tmp4); 7596 __ add(tmp4, tmp4, 1); 7597 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7598 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 7599 __ lsl(tmp2, tmp2, 1); 7600 __ mov(tmp4, wordSize/str2_chr_size); 7601 __ sub(str2, str2, str2_chr_size); 7602 } 7603 __ cmp(ch1, ch2); 7604 __ mov(tmp4, wordSize/str2_chr_size); 7605 __ br(__ NE, L_CMP_LOOP_NOMATCH); 7606 __ BIND(L_CMP_LOOP); 7607 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 7608 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 7609 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 7610 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 7611 __ add(tmp4, tmp4, 1); 7612 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 7613 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 7614 __ cmp(cnt1, ch2); 7615 __ br(__ EQ, L_CMP_LOOP); 7616 __ BIND(L_CMP_LOOP_NOMATCH); 7617 // here we're not matched 7618 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 7619 __ clz(tmp4, tmp2); 7620 __ add(str2, str2, str2_chr_size); // advance pointer 7621 __ b(L_HAS_ZERO_LOOP); 7622 __ align(OptoLoopAlignment); 7623 __ BIND(L_CMP_LOOP_LAST_CMP); 7624 __ cmp(cnt1, ch2); 7625 __ br(__ NE, L_CMP_LOOP_NOMATCH); 7626 __ b(DONE); 7627 __ align(OptoLoopAlignment); 7628 __ BIND(L_CMP_LOOP_LAST_CMP2); 7629 if (str2_isL) { 7630 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 7631 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
7632 __ lslv(tmp2, tmp2, tmp4); 7633 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7634 __ add(tmp4, tmp4, 1); 7635 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7636 __ lsl(tmp2, tmp2, 1); 7637 } else { 7638 __ mov(ch2, 0xE); 7639 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7640 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7641 __ lslv(tmp2, tmp2, tmp4); 7642 __ add(tmp4, tmp4, 1); 7643 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7644 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 7645 __ lsl(tmp2, tmp2, 1); 7646 __ sub(str2, str2, str2_chr_size); 7647 } 7648 __ cmp(ch1, ch2); 7649 __ br(__ NE, L_CMP_LOOP_NOMATCH); 7650 __ b(DONE); 7651 __ align(OptoLoopAlignment); 7652 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 7653 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 7654 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 7655 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 7656 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 7657 // result by analyzed characters value, so, we can just reset lower bits 7658 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 7659 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 7660 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 7661 // index of last analyzed substring inside current octet. So, str2 in at 7662 // respective start address. We need to advance it to next octet 7663 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 7664 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 7665 __ bfm(result, zr, 0, 2 - str2_chr_shift); 7666 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 7667 __ movw(cnt2, cnt2); 7668 __ b(L_LOOP_PROCEED); 7669 __ align(OptoLoopAlignment); 7670 __ BIND(NOMATCH); 7671 __ mov(result, -1); 7672 __ BIND(DONE); 7673 __ pop(spilled_regs, sp); 7674 __ ret(lr); 7675 return entry; 7676 } 7677 7678 void generate_string_indexof_stubs() { 7679 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 7680 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 7681 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 7682 } 7683 7684 void inflate_and_store_2_fp_registers(bool generatePrfm, 7685 FloatRegister src1, FloatRegister src2) { 7686 Register dst = r1; 7687 __ zip1(v1, __ T16B, src1, v0); 7688 __ zip2(v2, __ T16B, src1, v0); 7689 if (generatePrfm) { 7690 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 7691 } 7692 __ zip1(v3, __ T16B, src2, v0); 7693 __ zip2(v4, __ T16B, src2, v0); 7694 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 7695 } 7696 7697 // R0 = src 7698 // R1 = dst 7699 // R2 = len 7700 // R3 = len >> 3 7701 // V0 = 0 7702 // v1 = loaded 8 bytes 7703 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 7704 address generate_large_byte_array_inflate() { 7705 __ align(CodeEntryAlignment); 7706 StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id; 7707 StubCodeMark mark(this, stub_id); 7708 address entry = __ pc(); 7709 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 7710 Register src = r0, dst = r1, len = r2, octetCounter = r3; 7711 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 7712 7713 // do one more 8-byte read to have address 16-byte aligned in 
most cases 7714 // also use single store instruction 7715 __ ldrd(v2, __ post(src, 8)); 7716 __ sub(octetCounter, octetCounter, 2); 7717 __ zip1(v1, __ T16B, v1, v0); 7718 __ zip1(v2, __ T16B, v2, v0); 7719 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 7720 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 7721 __ subs(rscratch1, octetCounter, large_loop_threshold); 7722 __ br(__ LE, LOOP_START); 7723 __ b(LOOP_PRFM_START); 7724 __ bind(LOOP_PRFM); 7725 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 7726 __ bind(LOOP_PRFM_START); 7727 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 7728 __ sub(octetCounter, octetCounter, 8); 7729 __ subs(rscratch1, octetCounter, large_loop_threshold); 7730 inflate_and_store_2_fp_registers(true, v3, v4); 7731 inflate_and_store_2_fp_registers(true, v5, v6); 7732 __ br(__ GT, LOOP_PRFM); 7733 __ cmp(octetCounter, (u1)8); 7734 __ br(__ LT, DONE); 7735 __ bind(LOOP); 7736 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 7737 __ bind(LOOP_START); 7738 __ sub(octetCounter, octetCounter, 8); 7739 __ cmp(octetCounter, (u1)8); 7740 inflate_and_store_2_fp_registers(false, v3, v4); 7741 inflate_and_store_2_fp_registers(false, v5, v6); 7742 __ br(__ GE, LOOP); 7743 __ bind(DONE); 7744 __ ret(lr); 7745 return entry; 7746 } 7747 7748 /** 7749 * Arguments: 7750 * 7751 * Input: 7752 * c_rarg0 - current state address 7753 * c_rarg1 - H key address 7754 * c_rarg2 - data address 7755 * c_rarg3 - number of blocks 7756 * 7757 * Output: 7758 * Updated state at c_rarg0 7759 */ 7760 address generate_ghash_processBlocks() { 7761 // Bafflingly, GCM uses little-endian for the byte order, but 7762 // big-endian for the bit order. For example, the polynomial 1 is 7763 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 7764 // 7765 // So, we must either reverse the bytes in each word and do 7766 // everything big-endian or reverse the bits in each byte and do 7767 // it little-endian. On AArch64 it's more idiomatic to reverse 7768 // the bits in each byte (we have an instruction, RBIT, to do 7769 // that) and keep the data in little-endian bit order through the 7770 // calculation, bit-reversing the inputs and outputs. 7771 7772 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id; 7773 StubCodeMark mark(this, stub_id); 7774 __ align(wordSize * 2); 7775 address p = __ pc(); 7776 __ emit_int64(0x87); // The low-order bits of the field 7777 // polynomial (i.e. 
p = z^7+z^2+z+1) 7778 // repeated in the low and high parts of a 7779 // 128-bit vector 7780 __ emit_int64(0x87); 7781 7782 __ align(CodeEntryAlignment); 7783 address start = __ pc(); 7784 7785 Register state = c_rarg0; 7786 Register subkeyH = c_rarg1; 7787 Register data = c_rarg2; 7788 Register blocks = c_rarg3; 7789 7790 FloatRegister vzr = v30; 7791 __ eor(vzr, __ T16B, vzr, vzr); // zero register 7792 7793 __ ldrq(v24, p); // The field polynomial 7794 7795 __ ldrq(v0, Address(state)); 7796 __ ldrq(v1, Address(subkeyH)); 7797 7798 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 7799 __ rbit(v0, __ T16B, v0); 7800 __ rev64(v1, __ T16B, v1); 7801 __ rbit(v1, __ T16B, v1); 7802 7803 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 7804 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 7805 7806 { 7807 Label L_ghash_loop; 7808 __ bind(L_ghash_loop); 7809 7810 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 7811 // reversing each byte 7812 __ rbit(v2, __ T16B, v2); 7813 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 7814 7815 // Multiply state in v2 by subkey in v1 7816 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 7817 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 7818 /*temps*/v6, v3, /*reuse/clobber b*/v2); 7819 // Reduce v7:v5 by the field polynomial 7820 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 7821 7822 __ sub(blocks, blocks, 1); 7823 __ cbnz(blocks, L_ghash_loop); 7824 } 7825 7826 // The bit-reversed result is at this point in v0 7827 __ rev64(v0, __ T16B, v0); 7828 __ rbit(v0, __ T16B, v0); 7829 7830 __ st1(v0, __ T16B, state); 7831 __ ret(lr); 7832 7833 return start; 7834 } 7835 7836 address generate_ghash_processBlocks_wide() { 7837 address small = generate_ghash_processBlocks(); 7838 7839 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id; 7840 StubCodeMark mark(this, stub_id); 7841 __ align(wordSize * 2); 7842 address p = __ pc(); 7843 __ emit_int64(0x87); // The low-order bits of the field 7844 // polynomial (i.e. p = z^7+z^2+z+1) 7845 // repeated in the low and high parts of a 7846 // 128-bit vector 7847 __ emit_int64(0x87); 7848 7849 __ align(CodeEntryAlignment); 7850 address start = __ pc(); 7851 7852 Register state = c_rarg0; 7853 Register subkeyH = c_rarg1; 7854 Register data = c_rarg2; 7855 Register blocks = c_rarg3; 7856 7857 const int unroll = 4; 7858 7859 __ cmp(blocks, (unsigned char)(unroll * 2)); 7860 __ br(__ LT, small); 7861 7862 if (unroll > 1) { 7863 // Save state before entering routine 7864 __ sub(sp, sp, 4 * 16); 7865 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 7866 __ sub(sp, sp, 4 * 16); 7867 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 7868 } 7869 7870 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 7871 7872 if (unroll > 1) { 7873 // And restore state 7874 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 7875 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 7876 } 7877 7878 __ cmp(blocks, (unsigned char)0); 7879 __ br(__ GT, small); 7880 7881 __ ret(lr); 7882 7883 return start; 7884 } 7885 7886 void generate_base64_encode_simdround(Register src, Register dst, 7887 FloatRegister codec, u8 size) { 7888 7889 FloatRegister in0 = v4, in1 = v5, in2 = v6; 7890 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 7891 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 7892 7893 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 7894 7895 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 7896 7897 __ ushr(ind0, arrangement, in0, 2); 7898 7899 __ ushr(ind1, arrangement, in1, 2); 7900 __ shl(in0, arrangement, in0, 6); 7901 __ orr(ind1, arrangement, ind1, in0); 7902 __ ushr(ind1, arrangement, ind1, 2); 7903 7904 __ ushr(ind2, arrangement, in2, 4); 7905 __ shl(in1, arrangement, in1, 4); 7906 __ orr(ind2, arrangement, in1, ind2); 7907 __ ushr(ind2, arrangement, ind2, 2); 7908 7909 __ shl(ind3, arrangement, in2, 2); 7910 __ ushr(ind3, arrangement, ind3, 2); 7911 7912 __ tbl(out0, arrangement, codec, 4, ind0); 7913 __ tbl(out1, arrangement, codec, 4, ind1); 7914 __ tbl(out2, arrangement, codec, 4, ind2); 7915 __ tbl(out3, arrangement, codec, 4, ind3); 7916 7917 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 7918 } 7919 7920 /** 7921 * Arguments: 7922 * 7923 * Input: 7924 * c_rarg0 - src_start 7925 * c_rarg1 - src_offset 7926 * c_rarg2 - src_length 7927 * c_rarg3 - dest_start 7928 * c_rarg4 - dest_offset 7929 * c_rarg5 - isURL 7930 * 7931 */ 7932 address generate_base64_encodeBlock() { 7933 7934 static const char toBase64[64] = { 7935 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 7936 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 7937 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 7938 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 7939 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 7940 }; 7941 7942 static const char toBase64URL[64] = { 7943 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 7944 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 7945 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 7946 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 7947 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 7948 }; 7949 7950 __ align(CodeEntryAlignment); 7951 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 7952 StubCodeMark mark(this, stub_id); 7953 address start = __ pc(); 7954 7955 Register src = c_rarg0; // source array 7956 Register soff = c_rarg1; // source start offset 7957 Register send = c_rarg2; // source end offset 7958 Register dst = c_rarg3; // dest array 7959 Register doff = c_rarg4; // position for writing to dest array 7960 Register isURL = c_rarg5; // Base64 or URL character set 7961 7962 // c_rarg6 and c_rarg7 are free to use as temps 7963 Register codec = c_rarg6; 7964 Register length = c_rarg7; 7965 7966 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 7967 7968 __ add(src, src, soff); 7969 __ add(dst, dst, doff); 7970 __ sub(length, send, soff); 7971 7972 // load the codec base address 7973 __ lea(codec, ExternalAddress((address) toBase64)); 7974 __ cbz(isURL, ProcessData); 7975 __ lea(codec, ExternalAddress((address) toBase64URL)); 7976 7977 __ BIND(ProcessData); 7978 7979 // too short to formup a SIMD loop, roll back 7980 __ cmp(length, (u1)24); 7981 __ br(Assembler::LT, Process3B); 7982 7983 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 7984 7985 __ BIND(Process48B); 7986 __ cmp(length, (u1)48); 7987 __ br(Assembler::LT, Process24B); 7988 generate_base64_encode_simdround(src, dst, v0, 16); 7989 __ sub(length, length, 48); 7990 __ b(Process48B); 7991 7992 __ BIND(Process24B); 7993 __ cmp(length, (u1)24); 7994 __ br(Assembler::LT, SIMDExit); 7995 generate_base64_encode_simdround(src, dst, v0, 8); 7996 __ sub(length, length, 24); 7997 7998 __ BIND(SIMDExit); 7999 
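    // Scalar tail (Process3B below): each group of 3 source bytes is split into
    // four 6-bit indices and looked up in the codec table. Roughly, as an
    // illustrative sketch (the loop relies on the remaining length being a
    // multiple of 3):
    //
    //     while (length > 0) {
    //       uint32_t bits = (src[0] << 16) | (src[1] << 8) | src[2];
    //       dst[0] = codec[(bits >> 18) & 63];
    //       dst[1] = codec[(bits >> 12) & 63];
    //       dst[2] = codec[(bits >>  6) & 63];
    //       dst[3] = codec[ bits        & 63];
    //       src += 3; dst += 4; length -= 3;
    //     }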
__ cbz(length, Exit); 8000 8001 __ BIND(Process3B); 8002 // 3 src bytes, 24 bits 8003 __ ldrb(r10, __ post(src, 1)); 8004 __ ldrb(r11, __ post(src, 1)); 8005 __ ldrb(r12, __ post(src, 1)); 8006 __ orrw(r11, r11, r10, Assembler::LSL, 8); 8007 __ orrw(r12, r12, r11, Assembler::LSL, 8); 8008 // codec index 8009 __ ubfmw(r15, r12, 18, 23); 8010 __ ubfmw(r14, r12, 12, 17); 8011 __ ubfmw(r13, r12, 6, 11); 8012 __ andw(r12, r12, 63); 8013 // get the code based on the codec 8014 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 8015 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 8016 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 8017 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 8018 __ strb(r15, __ post(dst, 1)); 8019 __ strb(r14, __ post(dst, 1)); 8020 __ strb(r13, __ post(dst, 1)); 8021 __ strb(r12, __ post(dst, 1)); 8022 __ sub(length, length, 3); 8023 __ cbnz(length, Process3B); 8024 8025 __ BIND(Exit); 8026 __ ret(lr); 8027 8028 return start; 8029 } 8030 8031 void generate_base64_decode_simdround(Register src, Register dst, 8032 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 8033 8034 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 8035 FloatRegister out0 = v20, out1 = v21, out2 = v22; 8036 8037 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 8038 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 8039 8040 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 8041 8042 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 8043 8044 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 8045 8046 // we need unsigned saturating subtract, to make sure all input values 8047 // in range [0, 63] will have 0U value in the higher half lookup 8048 __ uqsubv(decH0, __ T16B, in0, v27); 8049 __ uqsubv(decH1, __ T16B, in1, v27); 8050 __ uqsubv(decH2, __ T16B, in2, v27); 8051 __ uqsubv(decH3, __ T16B, in3, v27); 8052 8053 // lower half lookup 8054 __ tbl(decL0, arrangement, codecL, 4, in0); 8055 __ tbl(decL1, arrangement, codecL, 4, in1); 8056 __ tbl(decL2, arrangement, codecL, 4, in2); 8057 __ tbl(decL3, arrangement, codecL, 4, in3); 8058 8059 // higher half lookup 8060 __ tbx(decH0, arrangement, codecH, 4, decH0); 8061 __ tbx(decH1, arrangement, codecH, 4, decH1); 8062 __ tbx(decH2, arrangement, codecH, 4, decH2); 8063 __ tbx(decH3, arrangement, codecH, 4, decH3); 8064 8065 // combine lower and higher 8066 __ orr(decL0, arrangement, decL0, decH0); 8067 __ orr(decL1, arrangement, decL1, decH1); 8068 __ orr(decL2, arrangement, decL2, decH2); 8069 __ orr(decL3, arrangement, decL3, decH3); 8070 8071 // check illegal inputs, value larger than 63 (maximum of 6 bits) 8072 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 8073 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 8074 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 8075 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 8076 __ orr(in0, arrangement, decH0, decH1); 8077 __ orr(in1, arrangement, decH2, decH3); 8078 __ orr(in2, arrangement, in0, in1); 8079 __ umaxv(in3, arrangement, in2); 8080 __ umov(rscratch2, in3, __ B, 0); 8081 8082 // get the data to output 8083 __ shl(out0, arrangement, decL0, 2); 8084 __ ushr(out1, arrangement, decL1, 4); 8085 __ orr(out0, arrangement, out0, out1); 8086 __ shl(out1, arrangement, decL1, 4); 8087 __ ushr(out2, arrangement, decL2, 2); 8088 __ orr(out1, arrangement, out1, out2); 8089 __ shl(out2, arrangement, decL2, 6); 8090 __ orr(out2, arrangement, out2, decL3); 8091 8092 __ 
cbz(rscratch2, NoIllegalData);

    // handle illegal input
    __ umov(r10, in2, __ D, 0);
    if (size == 16) {
      __ cbnz(r10, ErrorInLowerHalf);

      // illegal input is in the higher half, store the lower half now.
      __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));

      __ umov(r10, in2, __ D, 1);
      __ umov(r11, out0, __ D, 1);
      __ umov(r12, out1, __ D, 1);
      __ umov(r13, out2, __ D, 1);
      __ b(StoreLegalData);

      __ BIND(ErrorInLowerHalf);
    }
    __ umov(r11, out0, __ D, 0);
    __ umov(r12, out1, __ D, 0);
    __ umov(r13, out2, __ D, 0);

    __ BIND(StoreLegalData);
    __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
    __ strb(r11, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ lsr(r10, r10, 8);
    __ lsr(r11, r11, 8);
    __ lsr(r12, r12, 8);
    __ lsr(r13, r13, 8);
    __ b(StoreLegalData);

    __ BIND(NoIllegalData);
    __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
  }


  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - src_start
   *   c_rarg1   - src_offset
   *   c_rarg2   - src_length
   *   c_rarg3   - dest_start
   *   c_rarg4   - dest_offset
   *   c_rarg5   - isURL
   *   c_rarg6   - isMIME
   *
   */
  address generate_base64_decodeBlock() {

    // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
    // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the section
    // titled "Base64 decoding".

    // Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic. That is, java.util.Base64.fromBase64['='] = -2,
    // while fromBase(URL)64ForNoSIMD['='] = 255 here.
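    // For reference, the non-SIMD path applies these 256-entry tables in the
    // usual scalar way (illustrative sketch only, with 'table' standing for
    // either array below and error handling reduced to its essence):
    //
    //     uint8_t a = table[src[0]], b = table[src[1]],
    //             c = table[src[2]], d = table[src[3]];
    //     if (a == 255u || b == 255u || c == 255u || d == 255u)
    //       stop and report the number of bytes already written;   // illegal character
    //     dst[0] = (uint8_t)((a << 2) | (b >> 4));
    //     dst[1] = (uint8_t)((b << 4) | (c >> 2));
    //     dst[2] = (uint8_t)((c << 6) | d);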
8152 static const uint8_t fromBase64ForNoSIMD[256] = { 8153 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8154 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8155 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 8156 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8157 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 8158 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 8159 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 8160 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 8161 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8162 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8163 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8164 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8165 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8166 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8167 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8168 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8169 }; 8170 8171 static const uint8_t fromBase64URLForNoSIMD[256] = { 8172 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8173 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8174 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 8175 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8176 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 8177 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 8178 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 8179 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 8180 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8181 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8182 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8183 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8184 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8185 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8186 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8187 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8188 }; 8189 8190 // A legal value of base64 code is in range [0, 127]. We need two lookups 8191 // with tbl/tbx and combine them to get the decode data. The 1st table vector 8192 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 8193 // table vector lookup use tbx, out of range indices are unchanged in 8194 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 8195 // The value of index 64 is set to 0, so that we know that we already get the 8196 // decoded data with the 1st lookup. 8197 static const uint8_t fromBase64ForSIMD[128] = { 8198 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8199 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8200 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 8201 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8202 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 8203 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 8204 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 8205 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 8206 }; 8207 8208 static const uint8_t fromBase64URLForSIMD[128] = { 8209 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8210 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8211 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 8212 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8213 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 8214 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 8215 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 8216 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 8217 }; 8218 8219 __ align(CodeEntryAlignment); 8220 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 8221 StubCodeMark mark(this, stub_id); 8222 address start = __ pc(); 8223 8224 Register src = c_rarg0; // source array 8225 Register soff = c_rarg1; // source start offset 8226 Register send = c_rarg2; // source end offset 8227 Register dst = c_rarg3; // dest array 8228 Register doff = c_rarg4; // position for writing to dest array 8229 Register isURL = c_rarg5; // Base64 or URL character set 8230 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 8231 8232 Register length = send; // reuse send as length of source data to process 8233 8234 Register simd_codec = c_rarg6; 8235 Register nosimd_codec = c_rarg7; 8236 8237 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 8238 8239 __ enter(); 8240 8241 __ add(src, src, soff); 8242 __ add(dst, dst, doff); 8243 8244 __ mov(doff, dst); 8245 8246 __ sub(length, send, soff); 8247 __ bfm(length, zr, 0, 1); 8248 8249 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 8250 __ cbz(isURL, ProcessData); 8251 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 8252 8253 __ BIND(ProcessData); 8254 __ mov(rscratch1, length); 8255 __ cmp(length, (u1)144); // 144 = 80 + 64 8256 __ br(Assembler::LT, Process4B); 8257 8258 // In the MIME case, the line length cannot be more than 76 8259 // bytes (see RFC 2045). This is too short a block for SIMD 8260 // to be worthwhile, so we use non-SIMD here. 
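// rscratch1 is the byte budget for the scalar Process4B loop below: the whole
// input (rounded down to a multiple of 4) when length < 144, otherwise only the
// first 80 bytes, so that at least 64 bytes remain for the SIMD loops. Starting
// the counter at 79 rather than 80 makes the scalar loop finish at -1 instead
// of 0, which is how the code after Process4B tells the two cases apart.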
8261 __ movw(rscratch1, 79); 8262 8263 __ BIND(Process4B); 8264 __ ldrw(r14, __ post(src, 4)); 8265 __ ubfxw(r10, r14, 0, 8); 8266 __ ubfxw(r11, r14, 8, 8); 8267 __ ubfxw(r12, r14, 16, 8); 8268 __ ubfxw(r13, r14, 24, 8); 8269 // get the de-code 8270 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 8271 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 8272 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 8273 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 8274 // error detection, 255u indicates an illegal input 8275 __ orrw(r14, r10, r11); 8276 __ orrw(r15, r12, r13); 8277 __ orrw(r14, r14, r15); 8278 __ tbnz(r14, 7, Exit); 8279 // recover the data 8280 __ lslw(r14, r10, 10); 8281 __ bfiw(r14, r11, 4, 6); 8282 __ bfmw(r14, r12, 2, 5); 8283 __ rev16w(r14, r14); 8284 __ bfiw(r13, r12, 6, 2); 8285 __ strh(r14, __ post(dst, 2)); 8286 __ strb(r13, __ post(dst, 1)); 8287 // non-simd loop 8288 __ subsw(rscratch1, rscratch1, 4); 8289 __ br(Assembler::GT, Process4B); 8290 8291 // if exiting from PreProcess80B, rscratch1 == -1; 8292 // otherwise, rscratch1 == 0. 8293 __ cbzw(rscratch1, Exit); 8294 __ sub(length, length, 80); 8295 8296 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 8297 __ cbz(isURL, SIMDEnter); 8298 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 8299 8300 __ BIND(SIMDEnter); 8301 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 8302 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 8303 __ mov(rscratch1, 63); 8304 __ dup(v27, __ T16B, rscratch1); 8305 8306 __ BIND(Process64B); 8307 __ cmp(length, (u1)64); 8308 __ br(Assembler::LT, Process32B); 8309 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 8310 __ sub(length, length, 64); 8311 __ b(Process64B); 8312 8313 __ BIND(Process32B); 8314 __ cmp(length, (u1)32); 8315 __ br(Assembler::LT, SIMDExit); 8316 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 8317 __ sub(length, length, 32); 8318 __ b(Process32B); 8319 8320 __ BIND(SIMDExit); 8321 __ cbz(length, Exit); 8322 __ movw(rscratch1, length); 8323 __ b(Process4B); 8324 8325 __ BIND(Exit); 8326 __ sub(c_rarg0, dst, doff); 8327 8328 __ leave(); 8329 __ ret(lr); 8330 8331 return start; 8332 } 8333 8334 // Support for spin waits. 8335 address generate_spin_wait() { 8336 __ align(CodeEntryAlignment); 8337 StubGenStubId stub_id = StubGenStubId::spin_wait_id; 8338 StubCodeMark mark(this, stub_id); 8339 address start = __ pc(); 8340 8341 __ spin_wait(); 8342 __ ret(lr); 8343 8344 return start; 8345 } 8346 8347 void generate_lookup_secondary_supers_table_stub() { 8348 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 8349 StubCodeMark mark(this, stub_id); 8350 8351 const Register 8352 r_super_klass = r0, 8353 r_array_base = r1, 8354 r_array_length = r2, 8355 r_array_index = r3, 8356 r_sub_klass = r4, 8357 r_bitmap = rscratch2, 8358 result = r5; 8359 const FloatRegister 8360 vtemp = v0; 8361 8362 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 8363 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 8364 Label L_success; 8365 __ enter(); 8366 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 8367 r_array_base, r_array_length, r_array_index, 8368 vtemp, result, slot, 8369 /*stub_is_near*/true); 8370 __ leave(); 8371 __ ret(lr); 8372 } 8373 } 8374 8375 // Slow path implementation for UseSecondarySupersTable. 
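// Rough description (an assumption about the surrounding machinery, not taken
// from this stub): the per-slot stubs generated above probe a single hashed
// position of the receiver klass's packed secondary-supers array, guided by its
// 64-bit bitmap; when that one probe is inconclusive (a hash collision), they
// fall back to this slow-path stub, which scans the remaining entries linearly.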
8376 address generate_lookup_secondary_supers_table_slow_path_stub() { 8377 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 8378 StubCodeMark mark(this, stub_id); 8379 8380 address start = __ pc(); 8381 const Register 8382 r_super_klass = r0, // argument 8383 r_array_base = r1, // argument 8384 temp1 = r2, // temp 8385 r_array_index = r3, // argument 8386 r_bitmap = rscratch2, // argument 8387 result = r5; // argument 8388 8389 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 8390 __ ret(lr); 8391 8392 return start; 8393 } 8394 8395 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 8396 8397 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 8398 // 8399 // If LSE is in use, generate LSE versions of all the stubs. The 8400 // non-LSE versions are in atomic_aarch64.S. 8401 8402 // class AtomicStubMark records the entry point of a stub and the 8403 // stub pointer which will point to it. The stub pointer is set to 8404 // the entry point when ~AtomicStubMark() is called, which must be 8405 // after ICache::invalidate_range. This ensures safe publication of 8406 // the generated code. 8407 class AtomicStubMark { 8408 address _entry_point; 8409 aarch64_atomic_stub_t *_stub; 8410 MacroAssembler *_masm; 8411 public: 8412 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 8413 _masm = masm; 8414 __ align(32); 8415 _entry_point = __ pc(); 8416 _stub = stub; 8417 } 8418 ~AtomicStubMark() { 8419 *_stub = (aarch64_atomic_stub_t)_entry_point; 8420 } 8421 }; 8422 8423 // NB: For memory_order_conservative we need a trailing membar after 8424 // LSE atomic operations but not a leading membar. 8425 // 8426 // We don't need a leading membar because a clause in the Arm ARM 8427 // says: 8428 // 8429 // Barrier-ordered-before 8430 // 8431 // Barrier instructions order prior Memory effects before subsequent 8432 // Memory effects generated by the same Observer. A read or a write 8433 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 8434 // Observer if and only if RW1 appears in program order before RW 2 8435 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 8436 // instruction with both Acquire and Release semantics. 8437 // 8438 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 8439 // and Release semantics, therefore we don't need a leading 8440 // barrier. However, there is no corresponding Barrier-ordered-after 8441 // relationship, therefore we need a trailing membar to prevent a 8442 // later store or load from being reordered with the store in an 8443 // atomic instruction. 8444 // 8445 // This was checked by using the herd7 consistency model simulator 8446 // (http://diy.inria.fr/) with this test case: 8447 // 8448 // AArch64 LseCas 8449 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 8450 // P0 | P1; 8451 // LDR W4, [X2] | MOV W3, #0; 8452 // DMB LD | MOV W4, #1; 8453 // LDR W3, [X1] | CASAL W3, W4, [X1]; 8454 // | DMB ISH; 8455 // | STR W4, [X2]; 8456 // exists 8457 // (0:X3=0 /\ 0:X4=1) 8458 // 8459 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 8460 // with the store to x in P1. Without the DMB in P1 this may happen. 8461 // 8462 // At the time of writing we don't know of any AArch64 hardware that 8463 // reorders stores in this way, but the Reference Manual permits it. 
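// For illustration only: a rough C-level analogue of the conservative CAS stub
// generated below, written with GCC/Clang __atomic builtins (the real stub is
// the hand-written LSE sequence that follows; the function name here is ours).
// It shows the shape argued for above: an acquire+release CAS with a trailing
// full barrier and no leading barrier.
//
//   uint64_t cmpxchg_8_conservative(uint64_t* ptr, uint64_t compare_val,
//                                   uint64_t exchange_val) {
//     uint64_t prev = compare_val;
//     // CASAL: compare-and-swap with both Acquire and Release semantics
//     __atomic_compare_exchange_n(ptr, &prev, exchange_val, /*weak*/false,
//                                 __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
//     __atomic_thread_fence(__ATOMIC_SEQ_CST);  // trailing membar (DMB ISH)
//     return prev;                              // old value, as the stubs return
//   }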
8464 8465 void gen_cas_entry(Assembler::operand_size size, 8466 atomic_memory_order order) { 8467 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 8468 exchange_val = c_rarg2; 8469 bool acquire, release; 8470 switch (order) { 8471 case memory_order_relaxed: 8472 acquire = false; 8473 release = false; 8474 break; 8475 case memory_order_release: 8476 acquire = false; 8477 release = true; 8478 break; 8479 default: 8480 acquire = true; 8481 release = true; 8482 break; 8483 } 8484 __ mov(prev, compare_val); 8485 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 8486 if (order == memory_order_conservative) { 8487 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 8488 } 8489 if (size == Assembler::xword) { 8490 __ mov(r0, prev); 8491 } else { 8492 __ movw(r0, prev); 8493 } 8494 __ ret(lr); 8495 } 8496 8497 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 8498 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 8499 // If not relaxed, then default to conservative. Relaxed is the only 8500 // case we use enough to be worth specializing. 8501 if (order == memory_order_relaxed) { 8502 __ ldadd(size, incr, prev, addr); 8503 } else { 8504 __ ldaddal(size, incr, prev, addr); 8505 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 8506 } 8507 if (size == Assembler::xword) { 8508 __ mov(r0, prev); 8509 } else { 8510 __ movw(r0, prev); 8511 } 8512 __ ret(lr); 8513 } 8514 8515 void gen_swpal_entry(Assembler::operand_size size) { 8516 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 8517 __ swpal(size, incr, prev, addr); 8518 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 8519 if (size == Assembler::xword) { 8520 __ mov(r0, prev); 8521 } else { 8522 __ movw(r0, prev); 8523 } 8524 __ ret(lr); 8525 } 8526 8527 void generate_atomic_entry_points() { 8528 if (! 
UseLSE) { 8529 return; 8530 } 8531 __ align(CodeEntryAlignment); 8532 StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id; 8533 StubCodeMark mark(this, stub_id); 8534 address first_entry = __ pc(); 8535 8536 // ADD, memory_order_conservative 8537 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 8538 gen_ldadd_entry(Assembler::word, memory_order_conservative); 8539 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 8540 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 8541 8542 // ADD, memory_order_relaxed 8543 AtomicStubMark mark_fetch_add_4_relaxed 8544 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 8545 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 8546 AtomicStubMark mark_fetch_add_8_relaxed 8547 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 8548 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 8549 8550 // XCHG, memory_order_conservative 8551 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 8552 gen_swpal_entry(Assembler::word); 8553 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 8554 gen_swpal_entry(Assembler::xword); 8555 8556 // CAS, memory_order_conservative 8557 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 8558 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 8559 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 8560 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 8561 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 8562 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 8563 8564 // CAS, memory_order_relaxed 8565 AtomicStubMark mark_cmpxchg_1_relaxed 8566 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 8567 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 8568 AtomicStubMark mark_cmpxchg_4_relaxed 8569 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 8570 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 8571 AtomicStubMark mark_cmpxchg_8_relaxed 8572 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 8573 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 8574 8575 AtomicStubMark mark_cmpxchg_4_release 8576 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 8577 gen_cas_entry(MacroAssembler::word, memory_order_release); 8578 AtomicStubMark mark_cmpxchg_8_release 8579 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 8580 gen_cas_entry(MacroAssembler::xword, memory_order_release); 8581 8582 AtomicStubMark mark_cmpxchg_4_seq_cst 8583 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 8584 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 8585 AtomicStubMark mark_cmpxchg_8_seq_cst 8586 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 8587 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 8588 8589 ICache::invalidate_range(first_entry, __ pc() - first_entry); 8590 } 8591 #endif // LINUX 8592 8593 address generate_cont_thaw(Continuation::thaw_kind kind) { 8594 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 8595 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 8596 8597 address start = __ pc(); 8598 8599 if (return_barrier) { 8600 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 8601 __ mov(sp, rscratch1); 8602 } 8603 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 8604 8605 if (return_barrier) { 8606 // preserve 
possible return value from a method returning to the return barrier 8607 __ fmovd(rscratch1, v0); 8608 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 8609 } 8610 8611 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 8612 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 8613 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 8614 8615 if (return_barrier) { 8616 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 8617 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 8618 __ fmovd(v0, rscratch1); 8619 } 8620 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 8621 8622 8623 Label thaw_success; 8624 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 8625 __ cbnz(rscratch2, thaw_success); 8626 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 8627 __ br(rscratch1); 8628 __ bind(thaw_success); 8629 8630 // make room for the thawed frames 8631 __ sub(rscratch1, sp, rscratch2); 8632 __ andr(rscratch1, rscratch1, -16); // align 8633 __ mov(sp, rscratch1); 8634 8635 if (return_barrier) { 8636 // save original return value -- again 8637 __ fmovd(rscratch1, v0); 8638 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 8639 } 8640 8641 // If we want, we can templatize thaw by kind, and have three different entries 8642 __ movw(c_rarg1, (uint32_t)kind); 8643 8644 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 8645 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 8646 8647 if (return_barrier) { 8648 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 8649 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 8650 __ fmovd(v0, rscratch1); 8651 } else { 8652 __ mov(r0, zr); // return 0 (success) from doYield 8653 } 8654 8655 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 8656 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 8657 __ mov(rfp, sp); 8658 8659 if (return_barrier_exception) { 8660 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 8661 __ authenticate_return_address(c_rarg1); 8662 __ verify_oop(r0); 8663 // save return value containing the exception oop in callee-saved R19 8664 __ mov(r19, r0); 8665 8666 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 8667 8668 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
8669 // __ reinitialize_ptrue(); 8670 8671 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 8672 8673 __ mov(r1, r0); // the exception handler 8674 __ mov(r0, r19); // restore return value containing the exception oop 8675 __ verify_oop(r0); 8676 8677 __ leave(); 8678 __ mov(r3, lr); 8679 __ br(r1); // the exception handler 8680 } else { 8681 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 8682 __ leave(); 8683 __ ret(lr); 8684 } 8685 8686 return start; 8687 } 8688 8689 address generate_cont_thaw() { 8690 if (!Continuations::enabled()) return nullptr; 8691 8692 StubGenStubId stub_id = StubGenStubId::cont_thaw_id; 8693 StubCodeMark mark(this, stub_id); 8694 address start = __ pc(); 8695 generate_cont_thaw(Continuation::thaw_top); 8696 return start; 8697 } 8698 8699 address generate_cont_returnBarrier() { 8700 if (!Continuations::enabled()) return nullptr; 8701 8702 // TODO: will probably need multiple return barriers depending on return type 8703 StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id; 8704 StubCodeMark mark(this, stub_id); 8705 address start = __ pc(); 8706 8707 generate_cont_thaw(Continuation::thaw_return_barrier); 8708 8709 return start; 8710 } 8711 8712 address generate_cont_returnBarrier_exception() { 8713 if (!Continuations::enabled()) return nullptr; 8714 8715 StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id; 8716 StubCodeMark mark(this, stub_id); 8717 address start = __ pc(); 8718 8719 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 8720 8721 return start; 8722 } 8723 8724 address generate_cont_preempt_stub() { 8725 if (!Continuations::enabled()) return nullptr; 8726 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 8727 StubCodeMark mark(this, stub_id); 8728 address start = __ pc(); 8729 8730 __ reset_last_Java_frame(true); 8731 8732 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 8733 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 8734 __ mov(sp, rscratch2); 8735 8736 Label preemption_cancelled; 8737 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 8738 __ cbnz(rscratch1, preemption_cancelled); 8739 8740 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 8741 SharedRuntime::continuation_enter_cleanup(_masm); 8742 __ leave(); 8743 __ ret(lr); 8744 8745 // We acquired the monitor after freezing the frames so call thaw to continue execution. 8746 __ bind(preemption_cancelled); 8747 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 8748 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 8749 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 8750 __ ldr(rscratch1, Address(rscratch1)); 8751 __ br(rscratch1); 8752 8753 return start; 8754 } 8755 8756 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 8757 // are represented as long[5], with BITS_PER_LIMB = 26. 8758 // Pack five 26-bit limbs into three 64-bit registers. 
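// In C, approximately (a sketch; `limbs` is the long[5] the caller hands us,
// and dest2 may be null for the 128-bit variant further below):
//
//   void pack_26(julong* dest0, julong* dest1, julong* dest2, const julong limbs[5]) {
//     *dest0 = limbs[0] + (limbs[1] << 26) + (limbs[2] << 52);         // low 64 bits
//     *dest1 = (limbs[2] >> 12) + (limbs[3] << 14) + (limbs[4] << 40); // next 64 bits
//     if (dest2 != nullptr) *dest2 = limbs[4] >> 24;                   // top 2 bits
//   }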
8759 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 8760 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 8761 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 8762 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 8763 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 8764 8765 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 8766 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 8767 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 8768 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 8769 8770 if (dest2->is_valid()) { 8771 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 8772 } else { 8773 #ifdef ASSERT 8774 Label OK; 8775 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 8776 __ br(__ EQ, OK); 8777 __ stop("high bits of Poly1305 integer should be zero"); 8778 __ should_not_reach_here(); 8779 __ bind(OK); 8780 #endif 8781 } 8782 } 8783 8784 // As above, but return only a 128-bit integer, packed into two 8785 // 64-bit registers. 8786 void pack_26(Register dest0, Register dest1, Register src) { 8787 pack_26(dest0, dest1, noreg, src); 8788 } 8789 8790 // Multiply and multiply-accumulate unsigned 64-bit registers. 8791 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 8792 __ mul(prod_lo, n, m); 8793 __ umulh(prod_hi, n, m); 8794 } 8795 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 8796 wide_mul(rscratch1, rscratch2, n, m); 8797 __ adds(sum_lo, sum_lo, rscratch1); 8798 __ adc(sum_hi, sum_hi, rscratch2); 8799 } 8800 8801 // Poly1305, RFC 7539 8802 8803 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 8804 // description of the tricks used to simplify and accelerate this 8805 // computation. 8806 8807 address generate_poly1305_processBlocks() { 8808 __ align(CodeEntryAlignment); 8809 StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id; 8810 StubCodeMark mark(this, stub_id); 8811 address start = __ pc(); 8812 Label here; 8813 __ enter(); 8814 RegSet callee_saved = RegSet::range(r19, r28); 8815 __ push(callee_saved, sp); 8816 8817 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 8818 8819 // Arguments 8820 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 8821 8822 // R_n is the 128-bit randomly-generated key, packed into two 8823 // registers. The caller passes this key to us as long[5], with 8824 // BITS_PER_LIMB = 26. 
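// Roughly, with the key r held in R_1:R_0 (128 bits) and the accumulator h in
// U_2:U_1:U_0 (about 130 bits), each full 16-byte block m below is absorbed as
//
//   h = ((h + m + 2^128) * r) mod (2^130 - 5)
//
// where the "+ 2^128" is the usual Poly1305 pad bit for a full block (the
// add(S_2, S_2, 1) in the loop).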
8825 const Register R_0 = *++regs, R_1 = *++regs; 8826 pack_26(R_0, R_1, r_start); 8827 8828 // RR_n is (R_n >> 2) * 5 8829 const Register RR_0 = *++regs, RR_1 = *++regs; 8830 __ lsr(RR_0, R_0, 2); 8831 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 8832 __ lsr(RR_1, R_1, 2); 8833 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 8834 8835 // U_n is the current checksum 8836 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 8837 pack_26(U_0, U_1, U_2, acc_start); 8838 8839 static constexpr int BLOCK_LENGTH = 16; 8840 Label DONE, LOOP; 8841 8842 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 8843 __ br(Assembler::LT, DONE); { 8844 __ bind(LOOP); 8845 8846 // S_n is to be the sum of U_n and the next block of data 8847 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 8848 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 8849 __ adds(S_0, U_0, S_0); 8850 __ adcs(S_1, U_1, S_1); 8851 __ adc(S_2, U_2, zr); 8852 __ add(S_2, S_2, 1); 8853 8854 const Register U_0HI = *++regs, U_1HI = *++regs; 8855 8856 // NB: this logic depends on some of the special properties of 8857 // Poly1305 keys. In particular, because we know that the top 8858 // four bits of R_0 and R_1 are zero, we can add together 8859 // partial products without any risk of needing to propagate a 8860 // carry out. 8861 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 8862 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 8863 __ andr(U_2, R_0, 3); 8864 __ mul(U_2, S_2, U_2); 8865 8866 // Recycle registers S_0, S_1, S_2 8867 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 8868 8869 // Partial reduction mod 2**130 - 5 8870 __ adds(U_1, U_0HI, U_1); 8871 __ adc(U_2, U_1HI, U_2); 8872 // Sum now in U_2:U_1:U_0. 8873 // Dead: U_0HI, U_1HI. 8874 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 8875 8876 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 8877 8878 // First, U_2:U_1:U_0 += (U_2 >> 2) 8879 __ lsr(rscratch1, U_2, 2); 8880 __ andr(U_2, U_2, (u8)3); 8881 __ adds(U_0, U_0, rscratch1); 8882 __ adcs(U_1, U_1, zr); 8883 __ adc(U_2, U_2, zr); 8884 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 8885 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 8886 __ adcs(U_1, U_1, zr); 8887 __ adc(U_2, U_2, zr); 8888 8889 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 8890 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 8891 __ br(~ Assembler::LT, LOOP); 8892 } 8893 8894 // Further reduce modulo 2^130 - 5 8895 __ lsr(rscratch1, U_2, 2); 8896 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 8897 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 8898 __ adcs(U_1, U_1, zr); 8899 __ andr(U_2, U_2, (u1)3); 8900 __ adc(U_2, U_2, zr); 8901 8902 // Unpack the sum into five 26-bit limbs and write to memory. 
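// In C, approximately (a sketch of the store back into the long[5] accumulator):
//
//   acc[0] =  U_0        & 0x3ffffff;
//   acc[1] = (U_0 >> 26) & 0x3ffffff;
//   acc[2] = (U_0 >> 52) | ((U_1 & 0x3fff) << 12);
//   acc[3] = (U_1 >> 14) & 0x3ffffff;
//   acc[4] = (U_1 >> 40) | ((U_2 & 0x7)  << 24);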
8903 __ ubfiz(rscratch1, U_0, 0, 26); 8904 __ ubfx(rscratch2, U_0, 26, 26); 8905 __ stp(rscratch1, rscratch2, Address(acc_start)); 8906 __ ubfx(rscratch1, U_0, 52, 12); 8907 __ bfi(rscratch1, U_1, 12, 14); 8908 __ ubfx(rscratch2, U_1, 14, 26); 8909 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 8910 __ ubfx(rscratch1, U_1, 40, 24); 8911 __ bfi(rscratch1, U_2, 24, 3); 8912 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 8913 8914 __ bind(DONE); 8915 __ pop(callee_saved, sp); 8916 __ leave(); 8917 __ ret(lr); 8918 8919 return start; 8920 } 8921 8922 // exception handler for upcall stubs 8923 address generate_upcall_stub_exception_handler() { 8924 StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id; 8925 StubCodeMark mark(this, stub_id); 8926 address start = __ pc(); 8927 8928 // Native caller has no idea how to handle exceptions, 8929 // so we just crash here. Up to callee to catch exceptions. 8930 __ verify_oop(r0); 8931 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 8932 __ blr(rscratch1); 8933 __ should_not_reach_here(); 8934 8935 return start; 8936 } 8937 8938 // load Method* target of MethodHandle 8939 // j_rarg0 = jobject receiver 8940 // rmethod = result 8941 address generate_upcall_stub_load_target() { 8942 StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id; 8943 StubCodeMark mark(this, stub_id); 8944 address start = __ pc(); 8945 8946 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 8947 // Load target method from receiver 8948 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 8949 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 8950 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 8951 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 8952 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 8953 noreg, noreg); 8954 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 8955 8956 __ ret(lr); 8957 8958 return start; 8959 } 8960 8961 #undef __ 8962 #define __ masm-> 8963 8964 class MontgomeryMultiplyGenerator : public MacroAssembler { 8965 8966 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 8967 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 8968 8969 RegSet _toSave; 8970 bool _squaring; 8971 8972 public: 8973 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 8974 : MacroAssembler(as->code()), _squaring(squaring) { 8975 8976 // Register allocation 8977 8978 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 8979 Pa_base = *regs; // Argument registers 8980 if (squaring) 8981 Pb_base = Pa_base; 8982 else 8983 Pb_base = *++regs; 8984 Pn_base = *++regs; 8985 Rlen= *++regs; 8986 inv = *++regs; 8987 Pm_base = *++regs; 8988 8989 // Working registers: 8990 Ra = *++regs; // The current digit of a, b, n, and m. 8991 Rb = *++regs; 8992 Rm = *++regs; 8993 Rn = *++regs; 8994 8995 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 8996 Pb = *++regs; 8997 Pm = *++regs; 8998 Pn = *++regs; 8999 9000 t0 = *++regs; // Three registers which form a 9001 t1 = *++regs; // triple-precision accumuator. 9002 t2 = *++regs; 9003 9004 Ri = *++regs; // Inner and outer loop indexes. 
9005 Rj = *++regs; 9006 9007 Rhi_ab = *++regs; // Product registers: low and high parts 9008 Rlo_ab = *++regs; // of a*b and m*n. 9009 Rhi_mn = *++regs; 9010 Rlo_mn = *++regs; 9011 9012 // r19 and up are callee-saved. 9013 _toSave = RegSet::range(r19, *regs) + Pm_base; 9014 } 9015 9016 private: 9017 void save_regs() { 9018 push(_toSave, sp); 9019 } 9020 9021 void restore_regs() { 9022 pop(_toSave, sp); 9023 } 9024 9025 template <typename T> 9026 void unroll_2(Register count, T block) { 9027 Label loop, end, odd; 9028 tbnz(count, 0, odd); 9029 cbz(count, end); 9030 align(16); 9031 bind(loop); 9032 (this->*block)(); 9033 bind(odd); 9034 (this->*block)(); 9035 subs(count, count, 2); 9036 br(Assembler::GT, loop); 9037 bind(end); 9038 } 9039 9040 template <typename T> 9041 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 9042 Label loop, end, odd; 9043 tbnz(count, 0, odd); 9044 cbz(count, end); 9045 align(16); 9046 bind(loop); 9047 (this->*block)(d, s, tmp); 9048 bind(odd); 9049 (this->*block)(d, s, tmp); 9050 subs(count, count, 2); 9051 br(Assembler::GT, loop); 9052 bind(end); 9053 } 9054 9055 void pre1(RegisterOrConstant i) { 9056 block_comment("pre1"); 9057 // Pa = Pa_base; 9058 // Pb = Pb_base + i; 9059 // Pm = Pm_base; 9060 // Pn = Pn_base + i; 9061 // Ra = *Pa; 9062 // Rb = *Pb; 9063 // Rm = *Pm; 9064 // Rn = *Pn; 9065 ldr(Ra, Address(Pa_base)); 9066 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 9067 ldr(Rm, Address(Pm_base)); 9068 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9069 lea(Pa, Address(Pa_base)); 9070 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 9071 lea(Pm, Address(Pm_base)); 9072 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9073 9074 // Zero the m*n result. 9075 mov(Rhi_mn, zr); 9076 mov(Rlo_mn, zr); 9077 } 9078 9079 // The core multiply-accumulate step of a Montgomery 9080 // multiplication. The idea is to schedule operations as a 9081 // pipeline so that instructions with long latencies (loads and 9082 // multiplies) have time to complete before their results are 9083 // used. This most benefits in-order implementations of the 9084 // architecture but out-of-order ones also benefit. 9085 void step() { 9086 block_comment("step"); 9087 // MACC(Ra, Rb, t0, t1, t2); 9088 // Ra = *++Pa; 9089 // Rb = *--Pb; 9090 umulh(Rhi_ab, Ra, Rb); 9091 mul(Rlo_ab, Ra, Rb); 9092 ldr(Ra, pre(Pa, wordSize)); 9093 ldr(Rb, pre(Pb, -wordSize)); 9094 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 9095 // previous iteration. 
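// For reference: in the commented pseudo-code here and in the "In C,
// approximately" blocks later in this class, MACC(A, B, t0, t1, t2) denotes a
// 64x64->128-bit multiply accumulated into the triple-precision accumulator
// t2:t1:t0 (MACC2 is the same with the product counted twice), i.e. roughly:
//
//   void MACC(julong A, julong B, julong& t0, julong& t1, julong& t2) {
//     unsigned __int128 p  = (unsigned __int128)A * B;          // umulh/mul pair
//     unsigned __int128 lo = (unsigned __int128)t0 + (julong)p; // adds
//     t0 = (julong)lo;
//     unsigned __int128 hi = (unsigned __int128)t1 + (julong)(p >> 64) + (julong)(lo >> 64); // adcs
//     t1 = (julong)hi;
//     t2 += (julong)(hi >> 64);                                 // adc
//   }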
9096 // MACC(Rm, Rn, t0, t1, t2); 9097 // Rm = *++Pm; 9098 // Rn = *--Pn; 9099 umulh(Rhi_mn, Rm, Rn); 9100 mul(Rlo_mn, Rm, Rn); 9101 ldr(Rm, pre(Pm, wordSize)); 9102 ldr(Rn, pre(Pn, -wordSize)); 9103 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9104 } 9105 9106 void post1() { 9107 block_comment("post1"); 9108 9109 // MACC(Ra, Rb, t0, t1, t2); 9110 // Ra = *++Pa; 9111 // Rb = *--Pb; 9112 umulh(Rhi_ab, Ra, Rb); 9113 mul(Rlo_ab, Ra, Rb); 9114 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 9115 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9116 9117 // *Pm = Rm = t0 * inv; 9118 mul(Rm, t0, inv); 9119 str(Rm, Address(Pm)); 9120 9121 // MACC(Rm, Rn, t0, t1, t2); 9122 // t0 = t1; t1 = t2; t2 = 0; 9123 umulh(Rhi_mn, Rm, Rn); 9124 9125 #ifndef PRODUCT 9126 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 9127 { 9128 mul(Rlo_mn, Rm, Rn); 9129 add(Rlo_mn, t0, Rlo_mn); 9130 Label ok; 9131 cbz(Rlo_mn, ok); { 9132 stop("broken Montgomery multiply"); 9133 } bind(ok); 9134 } 9135 #endif 9136 // We have very carefully set things up so that 9137 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 9138 // the lower half of Rm * Rn because we know the result already: 9139 // it must be -t0. t0 + (-t0) must generate a carry iff 9140 // t0 != 0. So, rather than do a mul and an adds we just set 9141 // the carry flag iff t0 is nonzero. 9142 // 9143 // mul(Rlo_mn, Rm, Rn); 9144 // adds(zr, t0, Rlo_mn); 9145 subs(zr, t0, 1); // Set carry iff t0 is nonzero 9146 adcs(t0, t1, Rhi_mn); 9147 adc(t1, t2, zr); 9148 mov(t2, zr); 9149 } 9150 9151 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 9152 block_comment("pre2"); 9153 // Pa = Pa_base + i-len; 9154 // Pb = Pb_base + len; 9155 // Pm = Pm_base + i-len; 9156 // Pn = Pn_base + len; 9157 9158 if (i.is_register()) { 9159 sub(Rj, i.as_register(), len); 9160 } else { 9161 mov(Rj, i.as_constant()); 9162 sub(Rj, Rj, len); 9163 } 9164 // Rj == i-len 9165 9166 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 9167 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 9168 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 9169 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 9170 9171 // Ra = *++Pa; 9172 // Rb = *--Pb; 9173 // Rm = *++Pm; 9174 // Rn = *--Pn; 9175 ldr(Ra, pre(Pa, wordSize)); 9176 ldr(Rb, pre(Pb, -wordSize)); 9177 ldr(Rm, pre(Pm, wordSize)); 9178 ldr(Rn, pre(Pn, -wordSize)); 9179 9180 mov(Rhi_mn, zr); 9181 mov(Rlo_mn, zr); 9182 } 9183 9184 void post2(RegisterOrConstant i, RegisterOrConstant len) { 9185 block_comment("post2"); 9186 if (i.is_constant()) { 9187 mov(Rj, i.as_constant()-len.as_constant()); 9188 } else { 9189 sub(Rj, i.as_register(), len); 9190 } 9191 9192 adds(t0, t0, Rlo_mn); // The pending m*n, low part 9193 9194 // As soon as we know the least significant digit of our result, 9195 // store it. 9196 // Pm_base[i-len] = t0; 9197 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 9198 9199 // t0 = t1; t1 = t2; t2 = 0; 9200 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 9201 adc(t1, t2, zr); 9202 mov(t2, zr); 9203 } 9204 9205 // A carry in t0 after Montgomery multiplication means that we 9206 // should subtract multiples of n from our result in m. We'll 9207 // keep doing that until there is no carry. 
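// The helper sub() referenced in the comments below is, approximately (a sketch
// using a 128-bit intermediate, not the generated code): subtract n from m once
// with borrow propagation, and fold the final borrow into t0.
//
//   static julong sub(julong Pm[], julong Pn[], julong t0, int len) {
//     julong borrow = 0;
//     for (int i = 0; i < len; i++) {
//       unsigned __int128 d = (unsigned __int128)Pm[i] - Pn[i] - borrow;
//       Pm[i]  = (julong)d;
//       borrow = (julong)(d >> 64) & 1;   // 1 iff the subtraction borrowed
//     }
//     return t0 - borrow;
//   }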
9208 void normalize(RegisterOrConstant len) { 9209 block_comment("normalize"); 9210 // while (t0) 9211 // t0 = sub(Pm_base, Pn_base, t0, len); 9212 Label loop, post, again; 9213 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 9214 cbz(t0, post); { 9215 bind(again); { 9216 mov(i, zr); 9217 mov(cnt, len); 9218 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 9219 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9220 subs(zr, zr, zr); // set carry flag, i.e. no borrow 9221 align(16); 9222 bind(loop); { 9223 sbcs(Rm, Rm, Rn); 9224 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 9225 add(i, i, 1); 9226 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 9227 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9228 sub(cnt, cnt, 1); 9229 } cbnz(cnt, loop); 9230 sbc(t0, t0, zr); 9231 } cbnz(t0, again); 9232 } bind(post); 9233 } 9234 9235 // Move memory at s to d, reversing words. 9236 // Increments d to end of copied memory 9237 // Destroys tmp1, tmp2 9238 // Preserves len 9239 // Leaves s pointing to the address which was in d at start 9240 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 9241 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 9242 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 9243 9244 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 9245 mov(tmp1, len); 9246 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 9247 sub(s, d, len, ext::uxtw, LogBytesPerWord); 9248 } 9249 // where 9250 void reverse1(Register d, Register s, Register tmp) { 9251 ldr(tmp, pre(s, -wordSize)); 9252 ror(tmp, tmp, 32); 9253 str(tmp, post(d, wordSize)); 9254 } 9255 9256 void step_squaring() { 9257 // An extra ACC 9258 step(); 9259 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9260 } 9261 9262 void last_squaring(RegisterOrConstant i) { 9263 Label dont; 9264 // if ((i & 1) == 0) { 9265 tbnz(i.as_register(), 0, dont); { 9266 // MACC(Ra, Rb, t0, t1, t2); 9267 // Ra = *++Pa; 9268 // Rb = *--Pb; 9269 umulh(Rhi_ab, Ra, Rb); 9270 mul(Rlo_ab, Ra, Rb); 9271 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9272 } bind(dont); 9273 } 9274 9275 void extra_step_squaring() { 9276 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 9277 9278 // MACC(Rm, Rn, t0, t1, t2); 9279 // Rm = *++Pm; 9280 // Rn = *--Pn; 9281 umulh(Rhi_mn, Rm, Rn); 9282 mul(Rlo_mn, Rm, Rn); 9283 ldr(Rm, pre(Pm, wordSize)); 9284 ldr(Rn, pre(Pn, -wordSize)); 9285 } 9286 9287 void post1_squaring() { 9288 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 9289 9290 // *Pm = Rm = t0 * inv; 9291 mul(Rm, t0, inv); 9292 str(Rm, Address(Pm)); 9293 9294 // MACC(Rm, Rn, t0, t1, t2); 9295 // t0 = t1; t1 = t2; t2 = 0; 9296 umulh(Rhi_mn, Rm, Rn); 9297 9298 #ifndef PRODUCT 9299 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 9300 { 9301 mul(Rlo_mn, Rm, Rn); 9302 add(Rlo_mn, t0, Rlo_mn); 9303 Label ok; 9304 cbz(Rlo_mn, ok); { 9305 stop("broken Montgomery multiply"); 9306 } bind(ok); 9307 } 9308 #endif 9309 // We have very carefully set things up so that 9310 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 9311 // the lower half of Rm * Rn because we know the result already: 9312 // it must be -t0. t0 + (-t0) must generate a carry iff 9313 // t0 != 0. So, rather than do a mul and an adds we just set 9314 // the carry flag iff t0 is nonzero. 
9315 // 9316 // mul(Rlo_mn, Rm, Rn); 9317 // adds(zr, t0, Rlo_mn); 9318 subs(zr, t0, 1); // Set carry iff t0 is nonzero 9319 adcs(t0, t1, Rhi_mn); 9320 adc(t1, t2, zr); 9321 mov(t2, zr); 9322 } 9323 9324 void acc(Register Rhi, Register Rlo, 9325 Register t0, Register t1, Register t2) { 9326 adds(t0, t0, Rlo); 9327 adcs(t1, t1, Rhi); 9328 adc(t2, t2, zr); 9329 } 9330 9331 public: 9332 /** 9333 * Fast Montgomery multiplication. The derivation of the 9334 * algorithm is in A Cryptographic Library for the Motorola 9335 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 9336 * 9337 * Arguments: 9338 * 9339 * Inputs for multiplication: 9340 * c_rarg0 - int array elements a 9341 * c_rarg1 - int array elements b 9342 * c_rarg2 - int array elements n (the modulus) 9343 * c_rarg3 - int length 9344 * c_rarg4 - int inv 9345 * c_rarg5 - int array elements m (the result) 9346 * 9347 * Inputs for squaring: 9348 * c_rarg0 - int array elements a 9349 * c_rarg1 - int array elements n (the modulus) 9350 * c_rarg2 - int length 9351 * c_rarg3 - int inv 9352 * c_rarg4 - int array elements m (the result) 9353 * 9354 */ 9355 address generate_multiply() { 9356 Label argh, nothing; 9357 bind(argh); 9358 stop("MontgomeryMultiply total_allocation must be <= 8192"); 9359 9360 align(CodeEntryAlignment); 9361 address entry = pc(); 9362 9363 cbzw(Rlen, nothing); 9364 9365 enter(); 9366 9367 // Make room. 9368 cmpw(Rlen, 512); 9369 br(Assembler::HI, argh); 9370 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 9371 andr(sp, Ra, -2 * wordSize); 9372 9373 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 9374 9375 { 9376 // Copy input args, reversing as we go. We use Ra as a 9377 // temporary variable. 9378 reverse(Ra, Pa_base, Rlen, t0, t1); 9379 if (!_squaring) 9380 reverse(Ra, Pb_base, Rlen, t0, t1); 9381 reverse(Ra, Pn_base, Rlen, t0, t1); 9382 } 9383 9384 // Push all call-saved registers and also Pm_base which we'll need 9385 // at the end. 
9386 save_regs(); 9387 9388 #ifndef PRODUCT 9389 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 9390 { 9391 ldr(Rn, Address(Pn_base, 0)); 9392 mul(Rlo_mn, Rn, inv); 9393 subs(zr, Rlo_mn, -1); 9394 Label ok; 9395 br(EQ, ok); { 9396 stop("broken inverse in Montgomery multiply"); 9397 } bind(ok); 9398 } 9399 #endif 9400 9401 mov(Pm_base, Ra); 9402 9403 mov(t0, zr); 9404 mov(t1, zr); 9405 mov(t2, zr); 9406 9407 block_comment("for (int i = 0; i < len; i++) {"); 9408 mov(Ri, zr); { 9409 Label loop, end; 9410 cmpw(Ri, Rlen); 9411 br(Assembler::GE, end); 9412 9413 bind(loop); 9414 pre1(Ri); 9415 9416 block_comment(" for (j = i; j; j--) {"); { 9417 movw(Rj, Ri); 9418 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 9419 } block_comment(" } // j"); 9420 9421 post1(); 9422 addw(Ri, Ri, 1); 9423 cmpw(Ri, Rlen); 9424 br(Assembler::LT, loop); 9425 bind(end); 9426 block_comment("} // i"); 9427 } 9428 9429 block_comment("for (int i = len; i < 2*len; i++) {"); 9430 mov(Ri, Rlen); { 9431 Label loop, end; 9432 cmpw(Ri, Rlen, Assembler::LSL, 1); 9433 br(Assembler::GE, end); 9434 9435 bind(loop); 9436 pre2(Ri, Rlen); 9437 9438 block_comment(" for (j = len*2-i-1; j; j--) {"); { 9439 lslw(Rj, Rlen, 1); 9440 subw(Rj, Rj, Ri); 9441 subw(Rj, Rj, 1); 9442 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 9443 } block_comment(" } // j"); 9444 9445 post2(Ri, Rlen); 9446 addw(Ri, Ri, 1); 9447 cmpw(Ri, Rlen, Assembler::LSL, 1); 9448 br(Assembler::LT, loop); 9449 bind(end); 9450 } 9451 block_comment("} // i"); 9452 9453 normalize(Rlen); 9454 9455 mov(Ra, Pm_base); // Save Pm_base in Ra 9456 restore_regs(); // Restore caller's Pm_base 9457 9458 // Copy our result into caller's Pm_base 9459 reverse(Pm_base, Ra, Rlen, t0, t1); 9460 9461 leave(); 9462 bind(nothing); 9463 ret(lr); 9464 9465 return entry; 9466 } 9467 // In C, approximately: 9468 9469 // void 9470 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 9471 // julong Pn_base[], julong Pm_base[], 9472 // julong inv, int len) { 9473 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 9474 // julong *Pa, *Pb, *Pn, *Pm; 9475 // julong Ra, Rb, Rn, Rm; 9476 9477 // int i; 9478 9479 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 9480 9481 // for (i = 0; i < len; i++) { 9482 // int j; 9483 9484 // Pa = Pa_base; 9485 // Pb = Pb_base + i; 9486 // Pm = Pm_base; 9487 // Pn = Pn_base + i; 9488 9489 // Ra = *Pa; 9490 // Rb = *Pb; 9491 // Rm = *Pm; 9492 // Rn = *Pn; 9493 9494 // int iters = i; 9495 // for (j = 0; iters--; j++) { 9496 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 9497 // MACC(Ra, Rb, t0, t1, t2); 9498 // Ra = *++Pa; 9499 // Rb = *--Pb; 9500 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9501 // MACC(Rm, Rn, t0, t1, t2); 9502 // Rm = *++Pm; 9503 // Rn = *--Pn; 9504 // } 9505 9506 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 9507 // MACC(Ra, Rb, t0, t1, t2); 9508 // *Pm = Rm = t0 * inv; 9509 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 9510 // MACC(Rm, Rn, t0, t1, t2); 9511 9512 // assert(t0 == 0, "broken Montgomery multiply"); 9513 9514 // t0 = t1; t1 = t2; t2 = 0; 9515 // } 9516 9517 // for (i = len; i < 2*len; i++) { 9518 // int j; 9519 9520 // Pa = Pa_base + i-len; 9521 // Pb = Pb_base + len; 9522 // Pm = Pm_base + i-len; 9523 // Pn = Pn_base + len; 9524 9525 // Ra = *++Pa; 9526 // Rb = *--Pb; 9527 // Rm = *++Pm; 9528 // Rn = *--Pn; 9529 9530 // int iters = len*2-i-1; 9531 // for (j = i-len+1; iters--; j++) { 9532 // 
assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 9533 // MACC(Ra, Rb, t0, t1, t2); 9534 // Ra = *++Pa; 9535 // Rb = *--Pb; 9536 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9537 // MACC(Rm, Rn, t0, t1, t2); 9538 // Rm = *++Pm; 9539 // Rn = *--Pn; 9540 // } 9541 9542 // Pm_base[i-len] = t0; 9543 // t0 = t1; t1 = t2; t2 = 0; 9544 // } 9545 9546 // while (t0) 9547 // t0 = sub(Pm_base, Pn_base, t0, len); 9548 // } 9549 9550 /** 9551 * Fast Montgomery squaring. This uses asymptotically 25% fewer 9552 * multiplies than Montgomery multiplication so it should be up to 9553 * 25% faster. However, its loop control is more complex and it 9554 * may actually run slower on some machines. 9555 * 9556 * Arguments: 9557 * 9558 * Inputs: 9559 * c_rarg0 - int array elements a 9560 * c_rarg1 - int array elements n (the modulus) 9561 * c_rarg2 - int length 9562 * c_rarg3 - int inv 9563 * c_rarg4 - int array elements m (the result) 9564 * 9565 */ 9566 address generate_square() { 9567 Label argh; 9568 bind(argh); 9569 stop("MontgomeryMultiply total_allocation must be <= 8192"); 9570 9571 align(CodeEntryAlignment); 9572 address entry = pc(); 9573 9574 enter(); 9575 9576 // Make room. 9577 cmpw(Rlen, 512); 9578 br(Assembler::HI, argh); 9579 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 9580 andr(sp, Ra, -2 * wordSize); 9581 9582 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 9583 9584 { 9585 // Copy input args, reversing as we go. We use Ra as a 9586 // temporary variable. 9587 reverse(Ra, Pa_base, Rlen, t0, t1); 9588 reverse(Ra, Pn_base, Rlen, t0, t1); 9589 } 9590 9591 // Push all call-saved registers and also Pm_base which we'll need 9592 // at the end. 9593 save_regs(); 9594 9595 mov(Pm_base, Ra); 9596 9597 mov(t0, zr); 9598 mov(t1, zr); 9599 mov(t2, zr); 9600 9601 block_comment("for (int i = 0; i < len; i++) {"); 9602 mov(Ri, zr); { 9603 Label loop, end; 9604 bind(loop); 9605 cmp(Ri, Rlen); 9606 br(Assembler::GE, end); 9607 9608 pre1(Ri); 9609 9610 block_comment("for (j = (i+1)/2; j; j--) {"); { 9611 add(Rj, Ri, 1); 9612 lsr(Rj, Rj, 1); 9613 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 9614 } block_comment(" } // j"); 9615 9616 last_squaring(Ri); 9617 9618 block_comment(" for (j = i/2; j; j--) {"); { 9619 lsr(Rj, Ri, 1); 9620 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 9621 } block_comment(" } // j"); 9622 9623 post1_squaring(); 9624 add(Ri, Ri, 1); 9625 cmp(Ri, Rlen); 9626 br(Assembler::LT, loop); 9627 9628 bind(end); 9629 block_comment("} // i"); 9630 } 9631 9632 block_comment("for (int i = len; i < 2*len; i++) {"); 9633 mov(Ri, Rlen); { 9634 Label loop, end; 9635 bind(loop); 9636 cmp(Ri, Rlen, Assembler::LSL, 1); 9637 br(Assembler::GE, end); 9638 9639 pre2(Ri, Rlen); 9640 9641 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 9642 lsl(Rj, Rlen, 1); 9643 sub(Rj, Rj, Ri); 9644 sub(Rj, Rj, 1); 9645 lsr(Rj, Rj, 1); 9646 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 9647 } block_comment(" } // j"); 9648 9649 last_squaring(Ri); 9650 9651 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 9652 lsl(Rj, Rlen, 1); 9653 sub(Rj, Rj, Ri); 9654 lsr(Rj, Rj, 1); 9655 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 9656 } block_comment(" } // j"); 9657 9658 post2(Ri, Rlen); 9659 add(Ri, Ri, 1); 9660 cmp(Ri, Rlen, Assembler::LSL, 1); 9661 9662 br(Assembler::LT, loop); 9663 bind(end); 9664 block_comment("} // i"); 9665 } 9666 9667 normalize(Rlen); 9668 9669 mov(Ra, Pm_base); // Save Pm_base in Ra 9670 
restore_regs(); // Restore caller's Pm_base 9671 9672 // Copy our result into caller's Pm_base 9673 reverse(Pm_base, Ra, Rlen, t0, t1); 9674 9675 leave(); 9676 ret(lr); 9677 9678 return entry; 9679 } 9680 // In C, approximately: 9681 9682 // void 9683 // montgomery_square(julong Pa_base[], julong Pn_base[], 9684 // julong Pm_base[], julong inv, int len) { 9685 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 9686 // julong *Pa, *Pb, *Pn, *Pm; 9687 // julong Ra, Rb, Rn, Rm; 9688 9689 // int i; 9690 9691 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 9692 9693 // for (i = 0; i < len; i++) { 9694 // int j; 9695 9696 // Pa = Pa_base; 9697 // Pb = Pa_base + i; 9698 // Pm = Pm_base; 9699 // Pn = Pn_base + i; 9700 9701 // Ra = *Pa; 9702 // Rb = *Pb; 9703 // Rm = *Pm; 9704 // Rn = *Pn; 9705 9706 // int iters = (i+1)/2; 9707 // for (j = 0; iters--; j++) { 9708 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 9709 // MACC2(Ra, Rb, t0, t1, t2); 9710 // Ra = *++Pa; 9711 // Rb = *--Pb; 9712 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9713 // MACC(Rm, Rn, t0, t1, t2); 9714 // Rm = *++Pm; 9715 // Rn = *--Pn; 9716 // } 9717 // if ((i & 1) == 0) { 9718 // assert(Ra == Pa_base[j], "must be"); 9719 // MACC(Ra, Ra, t0, t1, t2); 9720 // } 9721 // iters = i/2; 9722 // assert(iters == i-j, "must be"); 9723 // for (; iters--; j++) { 9724 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9725 // MACC(Rm, Rn, t0, t1, t2); 9726 // Rm = *++Pm; 9727 // Rn = *--Pn; 9728 // } 9729 9730 // *Pm = Rm = t0 * inv; 9731 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 9732 // MACC(Rm, Rn, t0, t1, t2); 9733 9734 // assert(t0 == 0, "broken Montgomery multiply"); 9735 9736 // t0 = t1; t1 = t2; t2 = 0; 9737 // } 9738 9739 // for (i = len; i < 2*len; i++) { 9740 // int start = i-len+1; 9741 // int end = start + (len - start)/2; 9742 // int j; 9743 9744 // Pa = Pa_base + i-len; 9745 // Pb = Pa_base + len; 9746 // Pm = Pm_base + i-len; 9747 // Pn = Pn_base + len; 9748 9749 // Ra = *++Pa; 9750 // Rb = *--Pb; 9751 // Rm = *++Pm; 9752 // Rn = *--Pn; 9753 9754 // int iters = (2*len-i-1)/2; 9755 // assert(iters == end-start, "must be"); 9756 // for (j = start; iters--; j++) { 9757 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 9758 // MACC2(Ra, Rb, t0, t1, t2); 9759 // Ra = *++Pa; 9760 // Rb = *--Pb; 9761 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9762 // MACC(Rm, Rn, t0, t1, t2); 9763 // Rm = *++Pm; 9764 // Rn = *--Pn; 9765 // } 9766 // if ((i & 1) == 0) { 9767 // assert(Ra == Pa_base[j], "must be"); 9768 // MACC(Ra, Ra, t0, t1, t2); 9769 // } 9770 // iters = (2*len-i)/2; 9771 // assert(iters == len-j, "must be"); 9772 // for (; iters--; j++) { 9773 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9774 // MACC(Rm, Rn, t0, t1, t2); 9775 // Rm = *++Pm; 9776 // Rn = *--Pn; 9777 // } 9778 // Pm_base[i-len] = t0; 9779 // t0 = t1; t1 = t2; t2 = 0; 9780 // } 9781 9782 // while (t0) 9783 // t0 = sub(Pm_base, Pn_base, t0, len); 9784 // } 9785 }; 9786 9787 void generate_vector_math_stubs() { 9788 // Get native vector math stub routine addresses 9789 void* libsleef = nullptr; 9790 char ebuf[1024]; 9791 char dll_name[JVM_MAXPATHLEN]; 9792 if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) { 9793 libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf); 9794 } 9795 if (libsleef == nullptr) { 9796 log_info(library)("Failed to load native vector math library, %s!", ebuf); 
9797 return; 9798 } 9799 // Method naming convention 9800 // All the methods are named as <OP><T><N>_<U><suffix> 9801 // Where: 9802 // <OP> is the operation name, e.g. sin 9803 // <T> is optional to indicate float/double 9804 // "f/d" for vector float/double operation 9805 // <N> is the number of elements in the vector 9806 // "2/4" for neon, and "x" for sve 9807 // <U> is the precision level 9808 // "u10/u05" represents 1.0/0.5 ULP error bounds 9809 // We use "u10" for all operations by default 9810 // But for those functions do not have u10 support, we use "u05" instead 9811 // <suffix> indicates neon/sve 9812 // "sve/advsimd" for sve/neon implementations 9813 // e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions 9814 // cosd2_u10advsimd is the method for computing 2 elements vector double cos using NEON instructions 9815 // 9816 log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef)); 9817 9818 // Math vector stubs implemented with SVE for scalable vector size. 9819 if (UseSVE > 0) { 9820 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 9821 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 9822 // Skip "tanh" because there is performance regression 9823 if (vop == VectorSupport::VECTOR_OP_TANH) { 9824 continue; 9825 } 9826 9827 // The native library does not support u10 level of "hypot". 9828 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 9829 9830 snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf); 9831 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 9832 9833 snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf); 9834 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 9835 } 9836 } 9837 9838 // Math vector stubs implemented with NEON for 64/128 bits vector size. 9839 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 9840 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 9841 // Skip "tanh" because there is performance regression 9842 if (vop == VectorSupport::VECTOR_OP_TANH) { 9843 continue; 9844 } 9845 9846 // The native library does not support u10 level of "hypot". 9847 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 9848 9849 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 9850 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf); 9851 9852 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 9853 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 9854 9855 snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf); 9856 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 9857 } 9858 } 9859 9860 // Initialization 9861 void generate_initial_stubs() { 9862 // Generate initial stubs and initializes the entry points 9863 9864 // entry points that exist in all platforms Note: This is code 9865 // that could be shared among different platforms - however the 9866 // benefit seems to be smaller than the disadvantage of having a 9867 // much more complicated generator structure. See also comment in 9868 // stubRoutines.hpp. 

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Initialize table for copy memory (arraycopy) check.
    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    if (UseCRC32Intrinsics) {
      // set table address before stub generation which uses it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }
  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
  }

  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (! InlineSecondarySupersTest) {
        generate_lookup_secondary_supers_table_stub();
      }
    }
#endif

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // arrays_hashcode stub for large arrays.
    StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
    StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
    StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
    StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
    StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

    generate_vector_math_stubs();

#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_qrpar();
    }

    if (UseDilithiumIntrinsics) {
      StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
      StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
      StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
      StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
      StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
      StubRoutines::_double_keccak = generate_double_keccak();
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
    }

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
    switch(blob_id) {
    case initial_id:
      generate_initial_stubs();
      break;
    case continuation_id:
      generate_continuation_stubs();
      break;
    case compiler_id:
      generate_compiler_stubs();
      break;
    case final_id:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected blob id: %d", blob_id);
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
  StubGenerator g(code, blob_id);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                          \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl  \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                                   \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl        \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX
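
// For illustration only (not compiled): a sketch of what a single use of the
// DEFAULT_ATOMIC_OP macro above expands to, e.g. DEFAULT_ATOMIC_OP(cmpxchg, 4, _release):
//
//   extern "C" uint64_t aarch64_atomic_cmpxchg_4_release_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_release_impl
//     = aarch64_atomic_cmpxchg_4_release_default_impl;
//
// Each function pointer starts out at the generic implementation in
// atomic_aarch64.S; when __ARM_FEATURE_ATOMICS is not available,
// generate_atomic_entry_points() (called from generate_final_stubs() above)
// is expected to repoint it at a freshly generated stub.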