1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "asm/register.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "code/SCCache.hpp" 31 #include "compiler/oopMap.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/barrierSetAssembler.hpp" 34 #include "gc/shared/gc_globals.hpp" 35 #include "gc/shared/tlab_globals.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "memory/universe.hpp" 38 #include "nativeInst_aarch64.hpp" 39 #include "oops/instanceOop.hpp" 40 #include "oops/method.hpp" 41 #include "oops/objArrayKlass.hpp" 42 #include "oops/oop.inline.hpp" 43 #include "prims/methodHandles.hpp" 44 #include "prims/upcallLinker.hpp" 45 #include "runtime/arguments.hpp" 46 #include "runtime/atomic.hpp" 47 #include "runtime/continuation.hpp" 48 #include "runtime/continuationEntry.inline.hpp" 49 #include "runtime/frame.inline.hpp" 50 #include "runtime/handles.inline.hpp" 51 #include "runtime/javaThread.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/stubCodeGenerator.hpp" 54 #include "runtime/stubRoutines.hpp" 55 #include "utilities/align.hpp" 56 #include "utilities/checkedCast.hpp" 57 #include "utilities/debug.hpp" 58 #include "utilities/globalDefinitions.hpp" 59 #include "utilities/intpow.hpp" 60 #include "utilities/powerOfTwo.hpp" 61 #ifdef COMPILER2 62 #include "opto/runtime.hpp" 63 #endif 64 #if INCLUDE_ZGC 65 #include "gc/z/zThreadLocalData.hpp" 66 #endif 67 68 // Declaration and definition of StubGenerator (no .hpp file). 
69 // For a more detailed description of the stub routine structure 70 // see the comment in stubRoutines.hpp 71 72 #undef __ 73 #define __ _masm-> 74 75 #ifdef PRODUCT 76 #define BLOCK_COMMENT(str) /* nothing */ 77 #else 78 #define BLOCK_COMMENT(str) __ block_comment(str) 79 #endif 80 81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 82 83 // Stub Code definitions 84 85 class StubGenerator: public StubCodeGenerator { 86 private: 87 88 #ifdef PRODUCT 89 #define inc_counter_np(counter) ((void)0) 90 #else 91 void inc_counter_np_(uint& counter) { 92 __ incrementw(ExternalAddress((address)&counter)); 93 } 94 #define inc_counter_np(counter) \ 95 BLOCK_COMMENT("inc_counter " #counter); \ 96 inc_counter_np_(counter); 97 #endif 98 99 // Call stubs are used to call Java from C 100 // 101 // Arguments: 102 // c_rarg0: call wrapper address address 103 // c_rarg1: result address 104 // c_rarg2: result type BasicType 105 // c_rarg3: method Method* 106 // c_rarg4: (interpreter) entry point address 107 // c_rarg5: parameters intptr_t* 108 // c_rarg6: parameter size (in words) int 109 // c_rarg7: thread Thread* 110 // 111 // There is no return from the stub itself as any Java result 112 // is written to result 113 // 114 // we save r30 (lr) as the return PC at the base of the frame and 115 // link r29 (fp) below it as the frame pointer installing sp (r31) 116 // into fp. 117 // 118 // we save r0-r7, which accounts for all the c arguments. 119 // 120 // TODO: strictly do we need to save them all? they are treated as 121 // volatile by C so could we omit saving the ones we are going to 122 // place in global registers (thread? method?) or those we only use 123 // during setup of the Java call? 124 // 125 // we don't need to save r8 which C uses as an indirect result location 126 // return register. 127 // 128 // we don't need to save r9-r15 which both C and Java treat as 129 // volatile 130 // 131 // we don't need to save r16-18 because Java does not use them 132 // 133 // we save r19-r28 which Java uses as scratch registers and C 134 // expects to be callee-save 135 // 136 // we save the bottom 64 bits of each value stored in v8-v15; it is 137 // the responsibility of the caller to preserve larger values. 138 // 139 // so the stub frame looks like this when we enter Java code 140 // 141 // [ return_from_Java ] <--- sp 142 // [ argument word n ] 143 // ... 
144 // -29 [ argument word 1 ] 145 // -28 [ saved Floating-point Control Register ] 146 // -26 [ saved v15 ] <--- sp_after_call 147 // -25 [ saved v14 ] 148 // -24 [ saved v13 ] 149 // -23 [ saved v12 ] 150 // -22 [ saved v11 ] 151 // -21 [ saved v10 ] 152 // -20 [ saved v9 ] 153 // -19 [ saved v8 ] 154 // -18 [ saved r28 ] 155 // -17 [ saved r27 ] 156 // -16 [ saved r26 ] 157 // -15 [ saved r25 ] 158 // -14 [ saved r24 ] 159 // -13 [ saved r23 ] 160 // -12 [ saved r22 ] 161 // -11 [ saved r21 ] 162 // -10 [ saved r20 ] 163 // -9 [ saved r19 ] 164 // -8 [ call wrapper (r0) ] 165 // -7 [ result (r1) ] 166 // -6 [ result type (r2) ] 167 // -5 [ method (r3) ] 168 // -4 [ entry point (r4) ] 169 // -3 [ parameters (r5) ] 170 // -2 [ parameter size (r6) ] 171 // -1 [ thread (r7) ] 172 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 173 // 1 [ saved lr (r30) ] 174 175 // Call stub stack layout word offsets from fp 176 enum call_stub_layout { 177 sp_after_call_off = -28, 178 179 fpcr_off = sp_after_call_off, 180 d15_off = -26, 181 d13_off = -24, 182 d11_off = -22, 183 d9_off = -20, 184 185 r28_off = -18, 186 r26_off = -16, 187 r24_off = -14, 188 r22_off = -12, 189 r20_off = -10, 190 call_wrapper_off = -8, 191 result_off = -7, 192 result_type_off = -6, 193 method_off = -5, 194 entry_point_off = -4, 195 parameter_size_off = -2, 196 thread_off = -1, 197 fp_f = 0, 198 retaddr_off = 1, 199 }; 200 201 address generate_call_stub(address& return_address) { 202 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 203 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 204 "adjust this code"); 205 206 StubGenStubId stub_id = StubGenStubId::call_stub_id; 207 StubCodeMark mark(this, stub_id); 208 address start = __ pc(); 209 210 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 211 212 const Address fpcr_save (rfp, fpcr_off * wordSize); 213 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 214 const Address result (rfp, result_off * wordSize); 215 const Address result_type (rfp, result_type_off * wordSize); 216 const Address method (rfp, method_off * wordSize); 217 const Address entry_point (rfp, entry_point_off * wordSize); 218 const Address parameter_size(rfp, parameter_size_off * wordSize); 219 220 const Address thread (rfp, thread_off * wordSize); 221 222 const Address d15_save (rfp, d15_off * wordSize); 223 const Address d13_save (rfp, d13_off * wordSize); 224 const Address d11_save (rfp, d11_off * wordSize); 225 const Address d9_save (rfp, d9_off * wordSize); 226 227 const Address r28_save (rfp, r28_off * wordSize); 228 const Address r26_save (rfp, r26_off * wordSize); 229 const Address r24_save (rfp, r24_off * wordSize); 230 const Address r22_save (rfp, r22_off * wordSize); 231 const Address r20_save (rfp, r20_off * wordSize); 232 233 // stub code 234 235 address aarch64_entry = __ pc(); 236 237 // set up frame and move sp to end of save area 238 __ enter(); 239 __ sub(sp, rfp, -sp_after_call_off * wordSize); 240 241 // save register parameters and Java scratch/global registers 242 // n.b. 
we save thread even though it gets installed in 243 // rthread because we want to sanity check rthread later 244 __ str(c_rarg7, thread); 245 __ strw(c_rarg6, parameter_size); 246 __ stp(c_rarg4, c_rarg5, entry_point); 247 __ stp(c_rarg2, c_rarg3, result_type); 248 __ stp(c_rarg0, c_rarg1, call_wrapper); 249 250 __ stp(r20, r19, r20_save); 251 __ stp(r22, r21, r22_save); 252 __ stp(r24, r23, r24_save); 253 __ stp(r26, r25, r26_save); 254 __ stp(r28, r27, r28_save); 255 256 __ stpd(v9, v8, d9_save); 257 __ stpd(v11, v10, d11_save); 258 __ stpd(v13, v12, d13_save); 259 __ stpd(v15, v14, d15_save); 260 261 __ get_fpcr(rscratch1); 262 __ str(rscratch1, fpcr_save); 263 // Set FPCR to the state we need. We do want Round to Nearest. We 264 // don't want non-IEEE rounding modes or floating-point traps. 265 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode 266 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12) 267 __ set_fpcr(rscratch1); 268 269 // install Java thread in global register now that we have saved 270 // whatever value it held 271 __ mov(rthread, c_rarg7); 272 // And method 273 __ mov(rmethod, c_rarg3); 274 275 // set up the heapbase register 276 __ reinit_heapbase(); 277 278 #ifdef ASSERT 279 // make sure we have no pending exceptions 280 { 281 Label L; 282 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 283 __ cmp(rscratch1, (u1)NULL_WORD); 284 __ br(Assembler::EQ, L); 285 __ stop("StubRoutines::call_stub: entered with pending exception"); 286 __ BIND(L); 287 } 288 #endif 289 // pass parameters if any 290 __ mov(esp, sp); 291 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 292 __ andr(sp, rscratch1, -2 * wordSize); 293 294 BLOCK_COMMENT("pass parameters if any"); 295 Label parameters_done; 296 // parameter count is still in c_rarg6 297 // and parameter pointer identifying param 1 is in c_rarg5 298 __ cbzw(c_rarg6, parameters_done); 299 300 address loop = __ pc(); 301 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 302 __ subsw(c_rarg6, c_rarg6, 1); 303 __ push(rscratch1); 304 __ br(Assembler::GT, loop); 305 306 __ BIND(parameters_done); 307 308 // call Java entry -- passing Method* and current sp 309 // rmethod: Method* 310 // r19_sender_sp: sender sp 311 BLOCK_COMMENT("call Java function"); 312 __ mov(r19_sender_sp, sp); 313 __ blr(c_rarg4); 314 315 // we do this here because the notify will already have been done 316 // if we get to the next instruction via an exception 317 // 318 // n.b. adding this instruction here affects the calculation of 319 // whether or not a routine returns to the call stub (used when 320 // doing stack walks) since the normal test is to check the return 321 // pc against the address saved below. so we may need to allow for 322 // this extra instruction in the check. 323 324 // save current address for use by exception handling code 325 326 return_address = __ pc(); 327 328 // store result depending on type (everything that is not 329 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 330 // n.b.
this assumes Java returns an integral result in r0 331 // and a floating result in j_farg0 332 __ ldr(j_rarg2, result); 333 Label is_long, is_float, is_double, exit; 334 __ ldr(j_rarg1, result_type); 335 __ cmp(j_rarg1, (u1)T_OBJECT); 336 __ br(Assembler::EQ, is_long); 337 __ cmp(j_rarg1, (u1)T_LONG); 338 __ br(Assembler::EQ, is_long); 339 __ cmp(j_rarg1, (u1)T_FLOAT); 340 __ br(Assembler::EQ, is_float); 341 __ cmp(j_rarg1, (u1)T_DOUBLE); 342 __ br(Assembler::EQ, is_double); 343 344 // handle T_INT case 345 __ strw(r0, Address(j_rarg2)); 346 347 __ BIND(exit); 348 349 // pop parameters 350 __ sub(esp, rfp, -sp_after_call_off * wordSize); 351 352 #ifdef ASSERT 353 // verify that threads correspond 354 { 355 Label L, S; 356 __ ldr(rscratch1, thread); 357 __ cmp(rthread, rscratch1); 358 __ br(Assembler::NE, S); 359 __ get_thread(rscratch1); 360 __ cmp(rthread, rscratch1); 361 __ br(Assembler::EQ, L); 362 __ BIND(S); 363 __ stop("StubRoutines::call_stub: threads must correspond"); 364 __ BIND(L); 365 } 366 #endif 367 368 __ pop_cont_fastpath(rthread); 369 370 // restore callee-save registers 371 __ ldpd(v15, v14, d15_save); 372 __ ldpd(v13, v12, d13_save); 373 __ ldpd(v11, v10, d11_save); 374 __ ldpd(v9, v8, d9_save); 375 376 __ ldp(r28, r27, r28_save); 377 __ ldp(r26, r25, r26_save); 378 __ ldp(r24, r23, r24_save); 379 __ ldp(r22, r21, r22_save); 380 __ ldp(r20, r19, r20_save); 381 382 // restore fpcr 383 __ ldr(rscratch1, fpcr_save); 384 __ set_fpcr(rscratch1); 385 386 __ ldp(c_rarg0, c_rarg1, call_wrapper); 387 __ ldrw(c_rarg2, result_type); 388 __ ldr(c_rarg3, method); 389 __ ldp(c_rarg4, c_rarg5, entry_point); 390 __ ldp(c_rarg6, c_rarg7, parameter_size); 391 392 // leave frame and return to caller 393 __ leave(); 394 __ ret(lr); 395 396 // handle return types different from T_INT 397 398 __ BIND(is_long); 399 __ str(r0, Address(j_rarg2, 0)); 400 __ br(Assembler::AL, exit); 401 402 __ BIND(is_float); 403 __ strs(j_farg0, Address(j_rarg2, 0)); 404 __ br(Assembler::AL, exit); 405 406 __ BIND(is_double); 407 __ strd(j_farg0, Address(j_rarg2, 0)); 408 __ br(Assembler::AL, exit); 409 410 return start; 411 } 412 413 // Return point for a Java call if there's an exception thrown in 414 // Java code. The exception is caught and transformed into a 415 // pending exception stored in JavaThread that can be tested from 416 // within the VM. 417 // 418 // Note: Usually the parameters are removed by the callee. In case 419 // of an exception crossing an activation frame boundary, that is 420 // not the case if the callee is compiled code => need to setup the 421 // rsp. 
422 // 423 // r0: exception oop 424 425 address generate_catch_exception() { 426 StubGenStubId stub_id = StubGenStubId::catch_exception_id; 427 StubCodeMark mark(this, stub_id); 428 address start = __ pc(); 429 430 // same as in generate_call_stub(): 431 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 432 const Address thread (rfp, thread_off * wordSize); 433 434 #ifdef ASSERT 435 // verify that threads correspond 436 { 437 Label L, S; 438 __ ldr(rscratch1, thread); 439 __ cmp(rthread, rscratch1); 440 __ br(Assembler::NE, S); 441 __ get_thread(rscratch1); 442 __ cmp(rthread, rscratch1); 443 __ br(Assembler::EQ, L); 444 __ bind(S); 445 __ stop("StubRoutines::catch_exception: threads must correspond"); 446 __ bind(L); 447 } 448 #endif 449 450 // set pending exception 451 __ verify_oop(r0); 452 453 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 454 __ mov(rscratch1, (address)__FILE__); 455 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 456 __ movw(rscratch1, (int)__LINE__); 457 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 458 459 // complete return to VM 460 assert(StubRoutines::_call_stub_return_address != nullptr, 461 "_call_stub_return_address must have been generated before"); 462 __ b(StubRoutines::_call_stub_return_address); 463 464 return start; 465 } 466 467 // Continuation point for runtime calls returning with a pending 468 // exception. The pending exception check happened in the runtime 469 // or native call stub. The pending exception in Thread is 470 // converted into a Java-level exception. 471 // 472 // Contract with Java-level exception handlers: 473 // r0: exception 474 // r3: throwing pc 475 // 476 // NOTE: At entry of this stub, exception-pc must be in LR !! 477 478 // NOTE: this is always used as a jump target within generated code 479 // so it just needs to be generated code with no x86 prolog 480 481 address generate_forward_exception() { 482 StubGenStubId stub_id = StubGenStubId::forward_exception_id; 483 StubCodeMark mark(this, stub_id); 484 address start = __ pc(); 485 486 // Upon entry, LR points to the return address returning into 487 // Java (interpreted or compiled) code; i.e., the return address 488 // becomes the throwing pc. 489 // 490 // Arguments pushed before the runtime call are still on the stack 491 // but the exception handler will reset the stack pointer -> 492 // ignore them. A potential result in registers can be ignored as 493 // well. 494 495 #ifdef ASSERT 496 // make sure this code is only executed if there is a pending exception 497 { 498 Label L; 499 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 500 __ cbnz(rscratch1, L); 501 __ stop("StubRoutines::forward exception: no pending exception (1)"); 502 __ bind(L); 503 } 504 #endif 505 506 // compute exception handler into r19 507 508 // call the VM to find the handler address associated with the 509 // caller address. pass thread in r0 and caller pc (ret address) 510 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 511 // the stack. 512 __ mov(c_rarg1, lr); 513 // lr will be trashed by the VM call so we move it to R19 514 // (callee-saved) because we also need to pass it to the handler 515 // returned by this call. 
516 __ mov(r19, lr); 517 BLOCK_COMMENT("call exception_handler_for_return_address"); 518 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 519 SharedRuntime::exception_handler_for_return_address), 520 rthread, c_rarg1); 521 // Reinitialize the ptrue predicate register, in case the external runtime 522 // call clobbers ptrue reg, as we may return to SVE compiled code. 523 __ reinitialize_ptrue(); 524 525 // we should not really care that lr is no longer the callee 526 // address. we saved the value the handler needs in r19 so we can 527 // just copy it to r3. however, the C2 handler will push its own 528 // frame and then calls into the VM and the VM code asserts that 529 // the PC for the frame above the handler belongs to a compiled 530 // Java method. So, we restore lr here to satisfy that assert. 531 __ mov(lr, r19); 532 // setup r0 & r3 & clear pending exception 533 __ mov(r3, r19); 534 __ mov(r19, r0); 535 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 536 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 537 538 #ifdef ASSERT 539 // make sure exception is set 540 { 541 Label L; 542 __ cbnz(r0, L); 543 __ stop("StubRoutines::forward exception: no pending exception (2)"); 544 __ bind(L); 545 } 546 #endif 547 548 // continue at exception handler 549 // r0: exception 550 // r3: throwing pc 551 // r19: exception handler 552 __ verify_oop(r0); 553 __ br(r19); 554 555 return start; 556 } 557 558 // Non-destructive plausibility checks for oops 559 // 560 // Arguments: 561 // r0: oop to verify 562 // rscratch1: error message 563 // 564 // Stack after saving c_rarg3: 565 // [tos + 0]: saved c_rarg3 566 // [tos + 1]: saved c_rarg2 567 // [tos + 2]: saved lr 568 // [tos + 3]: saved rscratch2 569 // [tos + 4]: saved r0 570 // [tos + 5]: saved rscratch1 571 address generate_verify_oop() { 572 StubGenStubId stub_id = StubGenStubId::verify_oop_id; 573 StubCodeMark mark(this, stub_id); 574 address start = __ pc(); 575 576 Label exit, error; 577 578 // save c_rarg2 and c_rarg3 579 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 580 581 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 582 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 583 __ ldr(c_rarg3, Address(c_rarg2)); 584 __ add(c_rarg3, c_rarg3, 1); 585 __ str(c_rarg3, Address(c_rarg2)); 586 587 // object is in r0 588 // make sure object is 'reasonable' 589 __ cbz(r0, exit); // if obj is null it is OK 590 591 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 592 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 593 594 // return if everything seems ok 595 __ bind(exit); 596 597 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 598 __ ret(lr); 599 600 // handle errors 601 __ bind(error); 602 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 603 604 __ push(RegSet::range(r0, r29), sp); 605 // debug(char* msg, int64_t pc, int64_t regs[]) 606 __ mov(c_rarg0, rscratch1); // pass address of error message 607 __ mov(c_rarg1, lr); // pass return address 608 __ mov(c_rarg2, sp); // pass address of regs on stack 609 #ifndef PRODUCT 610 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 611 #endif 612 BLOCK_COMMENT("call MacroAssembler::debug"); 613 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 614 __ blr(rscratch1); 615 __ hlt(0); 616 617 return start; 618 } 619 620 // Generate indices for iota vector. 
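// Each 128-bit row emitted below is an iota constant for one element size:
// lane i of the B, H, S and D rows holds the integer value i, and the two
// trailing rows hold the same sequence as floats (0.0f..3.0f) and doubles
// (0.0d, 1.0d).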
621 address generate_iota_indices(StubGenStubId stub_id) { 622 __ align(CodeEntryAlignment); 623 StubCodeMark mark(this, stub_id); 624 address start = __ pc(); 625 // B 626 __ emit_data64(0x0706050403020100, relocInfo::none); 627 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 628 // H 629 __ emit_data64(0x0003000200010000, relocInfo::none); 630 __ emit_data64(0x0007000600050004, relocInfo::none); 631 // S 632 __ emit_data64(0x0000000100000000, relocInfo::none); 633 __ emit_data64(0x0000000300000002, relocInfo::none); 634 // D 635 __ emit_data64(0x0000000000000000, relocInfo::none); 636 __ emit_data64(0x0000000000000001, relocInfo::none); 637 // S - FP 638 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 639 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 640 // D - FP 641 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 642 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 643 return start; 644 } 645 646 // The inner part of zero_words(). This is the bulk operation, 647 // zeroing words in blocks, possibly using DC ZVA to do it. The 648 // caller is responsible for zeroing the last few words. 649 // 650 // Inputs: 651 // r10: the HeapWord-aligned base address of an array to zero. 652 // r11: the count in HeapWords, r11 > 0. 653 // 654 // Returns r10 and r11, adjusted for the caller to clear. 655 // r10: the base address of the tail of words left to clear. 656 // r11: the number of words in the tail. 657 // r11 < MacroAssembler::zero_words_block_size. 658 659 address generate_zero_blocks() { 660 Label done; 661 Label base_aligned; 662 663 Register base = r10, cnt = r11; 664 665 __ align(CodeEntryAlignment); 666 StubGenStubId stub_id = StubGenStubId::zero_blocks_id; 667 StubCodeMark mark(this, stub_id); 668 address start = __ pc(); 669 670 if (UseBlockZeroing) { 671 int zva_length = VM_Version::zva_length(); 672 673 // Ensure ZVA length can be divided by 16. This is required by 674 // the subsequent operations. 675 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 676 677 __ tbz(base, 3, base_aligned); 678 __ str(zr, Address(__ post(base, 8))); 679 __ sub(cnt, cnt, 1); 680 __ bind(base_aligned); 681 682 // Ensure count >= zva_length * 2 so that it still deserves a zva after 683 // alignment. 684 Label small; 685 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 686 __ subs(rscratch1, cnt, low_limit >> 3); 687 __ br(Assembler::LT, small); 688 __ zero_dcache_blocks(base, cnt); 689 __ bind(small); 690 } 691 692 { 693 // Number of stp instructions we'll unroll 694 const int unroll = 695 MacroAssembler::zero_words_block_size / 2; 696 // Clear the remaining blocks. 697 Label loop; 698 __ subs(cnt, cnt, unroll * 2); 699 __ br(Assembler::LT, done); 700 __ bind(loop); 701 for (int i = 0; i < unroll; i++) 702 __ stp(zr, zr, __ post(base, 16)); 703 __ subs(cnt, cnt, unroll * 2); 704 __ br(Assembler::GE, loop); 705 __ bind(done); 706 __ add(cnt, cnt, unroll * 2); 707 } 708 709 __ ret(lr); 710 711 return start; 712 } 713 714 715 typedef enum { 716 copy_forwards = 1, 717 copy_backwards = -1 718 } copy_direction; 719 720 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 721 // for arraycopy stubs. 
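// The copy_load_at_N/copy_store_at_N helpers move N bytes per call: 32 bytes
// through a pair of FloatRegisters, 16 bytes through a pair of general
// registers and 8 bytes through a single register, forwarding the gct*/gcvt*
// temporaries that the barrier set code may need.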
722 class ArrayCopyBarrierSetHelper : StackObj { 723 BarrierSetAssembler* _bs_asm; 724 MacroAssembler* _masm; 725 DecoratorSet _decorators; 726 BasicType _type; 727 Register _gct1; 728 Register _gct2; 729 Register _gct3; 730 FloatRegister _gcvt1; 731 FloatRegister _gcvt2; 732 FloatRegister _gcvt3; 733 734 public: 735 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 736 DecoratorSet decorators, 737 BasicType type, 738 Register gct1, 739 Register gct2, 740 Register gct3, 741 FloatRegister gcvt1, 742 FloatRegister gcvt2, 743 FloatRegister gcvt3) 744 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 745 _masm(masm), 746 _decorators(decorators), 747 _type(type), 748 _gct1(gct1), 749 _gct2(gct2), 750 _gct3(gct3), 751 _gcvt1(gcvt1), 752 _gcvt2(gcvt2), 753 _gcvt3(gcvt3) { 754 } 755 756 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 757 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 758 dst1, dst2, src, 759 _gct1, _gct2, _gcvt1); 760 } 761 762 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 763 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 764 dst, src1, src2, 765 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 766 } 767 768 void copy_load_at_16(Register dst1, Register dst2, Address src) { 769 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 770 dst1, dst2, src, 771 _gct1); 772 } 773 774 void copy_store_at_16(Address dst, Register src1, Register src2) { 775 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 776 dst, src1, src2, 777 _gct1, _gct2, _gct3); 778 } 779 780 void copy_load_at_8(Register dst, Address src) { 781 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 782 dst, noreg, src, 783 _gct1); 784 } 785 786 void copy_store_at_8(Address dst, Register src) { 787 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 788 dst, src, noreg, 789 _gct1, _gct2, _gct3); 790 } 791 }; 792 793 // Bulk copy of blocks of 8 words. 794 // 795 // count is a count of words. 796 // 797 // Precondition: count >= 8 798 // 799 // Postconditions: 800 // 801 // The least significant bit of count contains the remaining count 802 // of words to copy. The rest of count is trash. 803 // 804 // s and d are adjusted to point to the remaining words to copy 805 // 806 void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) { 807 BasicType type; 808 copy_direction direction; 809 810 switch (stub_id) { 811 case copy_byte_f_id: 812 direction = copy_forwards; 813 type = T_BYTE; 814 break; 815 case copy_byte_b_id: 816 direction = copy_backwards; 817 type = T_BYTE; 818 break; 819 case copy_oop_f_id: 820 direction = copy_forwards; 821 type = T_OBJECT; 822 break; 823 case copy_oop_b_id: 824 direction = copy_backwards; 825 type = T_OBJECT; 826 break; 827 case copy_oop_uninit_f_id: 828 direction = copy_forwards; 829 type = T_OBJECT; 830 break; 831 case copy_oop_uninit_b_id: 832 direction = copy_backwards; 833 type = T_OBJECT; 834 break; 835 default: 836 ShouldNotReachHere(); 837 } 838 839 int unit = wordSize * direction; 840 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize; 841 842 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 843 t4 = r7, t5 = r11, t6 = r12, t7 = r13; 844 const Register stride = r14; 845 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 846 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 847 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 848 849 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7); 850 assert_different_registers(s, d, count, rscratch1, rscratch2); 851 852 Label again, drain; 853 854 __ align(CodeEntryAlignment); 855 856 StubCodeMark mark(this, stub_id); 857 858 __ bind(start); 859 860 Label unaligned_copy_long; 861 if (AvoidUnalignedAccesses) { 862 __ tbnz(d, 3, unaligned_copy_long); 863 } 864 865 if (direction == copy_forwards) { 866 __ sub(s, s, bias); 867 __ sub(d, d, bias); 868 } 869 870 #ifdef ASSERT 871 // Make sure we are never given < 8 words 872 { 873 Label L; 874 __ cmp(count, (u1)8); 875 __ br(Assembler::GE, L); 876 __ stop("generate_copy_longs called with < 8 words"); 877 __ bind(L); 878 } 879 #endif 880 881 // Fill 8 registers 882 if (UseSIMDForMemoryOps) { 883 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 884 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 885 } else { 886 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 887 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 888 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 889 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 890 } 891 892 __ subs(count, count, 16); 893 __ br(Assembler::LO, drain); 894 895 int prefetch = PrefetchCopyIntervalInBytes; 896 bool use_stride = false; 897 if (direction == copy_backwards) { 898 use_stride = prefetch > 256; 899 prefetch = -prefetch; 900 if (use_stride) __ mov(stride, prefetch); 901 } 902 903 __ bind(again); 904 905 if (PrefetchCopyIntervalInBytes > 0) 906 __ prfm(use_stride ?
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 907 908 if (UseSIMDForMemoryOps) { 909 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 910 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 911 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 912 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 913 } else { 914 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 915 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 916 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 917 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 918 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 919 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 920 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 921 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 922 } 923 924 __ subs(count, count, 8); 925 __ br(Assembler::HS, again); 926 927 // Drain 928 __ bind(drain); 929 if (UseSIMDForMemoryOps) { 930 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 931 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 932 } else { 933 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 934 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 935 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 936 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 937 } 938 939 { 940 Label L1, L2; 941 __ tbz(count, exact_log2(4), L1); 942 if (UseSIMDForMemoryOps) { 943 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit))); 944 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1); 945 } else { 946 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 947 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 948 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 949 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3); 950 } 951 __ bind(L1); 952 953 if (direction == copy_forwards) { 954 __ add(s, s, bias); 955 __ add(d, d, bias); 956 } 957 958 __ tbz(count, 1, L2); 959 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 960 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1); 961 __ bind(L2); 962 } 963 964 __ ret(lr); 965 966 if (AvoidUnalignedAccesses) { 967 Label drain, again; 968 // Register order for storing. Order is different for backward copy. 969 970 __ bind(unaligned_copy_long); 971 972 // source address is even aligned, target odd aligned 973 // 974 // when forward copying word pairs we read long pairs at offsets 975 // {0, 2, 4, 6} (in long words). when backwards copying we read 976 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 977 // address by -2 in the forwards case so we can compute the 978 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 979 // or -1. 980 // 981 // when forward copying we need to store 1 word, 3 pairs and 982 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 983 // zero offset we adjust the destination by -1 which means we 984 // have to use offsets {1, 2, 4, 6, 8} * unit for the stores. 985 // 986 // When backwards copying we need to store 1 word, 3 pairs and 987 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 988 // offsets {1, 3, 5, 7, 8} * unit.
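// A concrete sketch of the forwards case: unit == wordSize here, so after
// the s -= 16 / d -= 8 adjustment below, the first copy_load_at_16 at
// Address(s, 2 * unit) reads the first two words of the original source
// block and the first copy_store_at_8 at Address(d, 1 * unit) writes the
// first word at the original, odd-aligned destination.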
989 990 if (direction == copy_forwards) { 991 __ sub(s, s, 16); 992 __ sub(d, d, 8); 993 } 994 995 // Fill 8 registers 996 // 997 // for forwards copy s was offset by -16 from the original input 998 // value of s so the register contents are at these offsets 999 // relative to the 64 bit block addressed by that original input 1000 // and so on for each successive 64 byte block when s is updated 1001 // 1002 // t0 at offset 0, t1 at offset 8 1003 // t2 at offset 16, t3 at offset 24 1004 // t4 at offset 32, t5 at offset 40 1005 // t6 at offset 48, t7 at offset 56 1006 1007 // for backwards copy s was not offset so the register contents 1008 // are at these offsets into the preceding 64 byte block 1009 // relative to that original input and so on for each successive 1010 // preceding 64 byte block when s is updated. this explains the 1011 // slightly counter-intuitive looking pattern of register usage 1012 // in the stp instructions for backwards copy. 1013 // 1014 // t0 at offset -16, t1 at offset -8 1015 // t2 at offset -32, t3 at offset -24 1016 // t4 at offset -48, t5 at offset -40 1017 // t6 at offset -64, t7 at offset -56 1018 1019 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1020 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1021 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1022 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1023 1024 __ subs(count, count, 16); 1025 __ br(Assembler::LO, drain); 1026 1027 int prefetch = PrefetchCopyIntervalInBytes; 1028 bool use_stride = false; 1029 if (direction == copy_backwards) { 1030 use_stride = prefetch > 256; 1031 prefetch = -prefetch; 1032 if (use_stride) __ mov(stride, prefetch); 1033 } 1034 1035 __ bind(again); 1036 1037 if (PrefetchCopyIntervalInBytes > 0) 1038 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1039 1040 if (direction == copy_forwards) { 1041 // allowing for the offset of -8 the store instructions place 1042 // registers into the target 64 bit block at the following 1043 // offsets 1044 // 1045 // t0 at offset 0 1046 // t1 at offset 8, t2 at offset 16 1047 // t3 at offset 24, t4 at offset 32 1048 // t5 at offset 40, t6 at offset 48 1049 // t7 at offset 56 1050 1051 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1052 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1053 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1054 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1055 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1056 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1057 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1058 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1059 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1060 } else { 1061 // d was not offset when we started so the registers are 1062 // written into the 64 bit block preceding d with the following 1063 // offsets 1064 // 1065 // t1 at offset -8 1066 // t3 at offset -24, t0 at offset -16 1067 // t5 at offset -40, t2 at offset -32 1068 // t7 at offset -56, t4 at offset -48 1069 // t6 at offset -64 1070 // 1071 // note that this matches the offsets previously noted for the 1072 // loads 1073 1074 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1075 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1076 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1077 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1078 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1079 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1080 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1081 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1082 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1083 } 1084 1085 __ subs(count, count, 8); 1086 __ br(Assembler::HS, again); 1087 1088 // Drain 1089 // 1090 // this uses the same pattern of offsets and register arguments 1091 // as above 1092 __ bind(drain); 1093 if (direction == copy_forwards) { 1094 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1095 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1096 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1097 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1098 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1099 } else { 1100 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1101 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1102 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1103 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1104 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1105 } 1106 // now we need to copy any remaining part block which may 1107 // include a 4 word subblock and/or a 2 word subblock.
1108 // bits 2 and 1 in the count are the tell-tale for whether we 1109 // have each such subblock 1110 { 1111 Label L1, L2; 1112 __ tbz(count, exact_log2(4), L1); 1113 // this is the same as above but copying only 4 longs hence 1114 // with only one intervening stp between the str instructions 1115 // but note that the offsets and registers still follow the 1116 // same pattern 1117 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1118 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 1119 if (direction == copy_forwards) { 1120 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1121 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1122 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3); 1123 } else { 1124 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1125 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1126 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2); 1127 } 1128 __ bind(L1); 1129 1130 __ tbz(count, 1, L2); 1131 // this is the same as above but copying only 2 longs hence 1132 // there is no intervening stp between the str instructions 1133 // but note that the offset and register patterns are still 1134 // the same 1135 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit))); 1136 if (direction == copy_forwards) { 1137 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1138 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1); 1139 } else { 1140 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1141 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0); 1142 } 1143 __ bind(L2); 1144 1145 // for forwards copy we need to re-adjust the offsets we 1146 // applied so that s and d follow the last words written 1147 1148 if (direction == copy_forwards) { 1149 __ add(s, s, 16); 1150 __ add(d, d, 8); 1151 } 1152 1153 } 1154 1155 __ ret(lr); 1156 } 1157 } 1158 1159 // Small copy: less than 16 bytes. 1160 // 1161 // NB: Ignores all of the bits of count which represent more than 15 1162 // bytes, so a caller doesn't have to mask them. 1163 1164 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) { 1165 bool is_backwards = step < 0; 1166 size_t granularity = uabs(step); 1167 int direction = is_backwards ? -1 : 1; 1168 1169 Label Lword, Lint, Lshort, Lbyte; 1170 1171 assert(granularity 1172 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1173 1174 const Register t0 = r3; 1175 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1176 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg); 1177 1178 // ??? I don't know if this bit-test-and-branch is the right thing 1179 // to do. It does a lot of jumping, resulting in several 1180 // mispredicted branches. It might make more sense to do this 1181 // with something like Duff's device with a single computed branch.
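// A worked example: with granularity == 2 (jshort elements),
// exact_log2(granularity) == 1, so bit 2 of count selects the single-word
// (4-element) copy, bit 1 selects the jint-sized copy and bit 0 the
// jshort-sized copy, while the jbyte branch is compiled out.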
1182 1183 __ tbz(count, 3 - exact_log2(granularity), Lword); 1184 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1185 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1186 __ bind(Lword); 1187 1188 if (granularity <= sizeof (jint)) { 1189 __ tbz(count, 2 - exact_log2(granularity), Lint); 1190 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1191 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1192 __ bind(Lint); 1193 } 1194 1195 if (granularity <= sizeof (jshort)) { 1196 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1197 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1198 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1199 __ bind(Lshort); 1200 } 1201 1202 if (granularity <= sizeof (jbyte)) { 1203 __ tbz(count, 0, Lbyte); 1204 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1205 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1206 __ bind(Lbyte); 1207 } 1208 } 1209 1210 Label copy_f, copy_b; 1211 Label copy_obj_f, copy_obj_b; 1212 Label copy_obj_uninit_f, copy_obj_uninit_b; 1213 1214 // All-singing all-dancing memory copy. 1215 // 1216 // Copy count units of memory from s to d. The size of a unit is 1217 // step, which can be positive or negative depending on the direction 1218 // of copy. If is_aligned is false, we align the source address. 1219 // 1220 1221 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1222 Register s, Register d, Register count, int step) { 1223 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1224 bool is_backwards = step < 0; 1225 unsigned int granularity = uabs(step); 1226 const Register t0 = r3, t1 = r4; 1227 1228 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1229 // load all the data before writing anything 1230 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1231 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1232 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1233 const Register send = r17, dend = r16; 1234 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1235 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1236 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1237 1238 if (PrefetchCopyIntervalInBytes > 0) 1239 __ prfm(Address(s, 0), PLDL1KEEP); 1240 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity)); 1241 __ br(Assembler::HI, copy_big); 1242 1243 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity)))); 1244 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity)))); 1245 1246 __ cmp(count, u1(16/granularity)); 1247 __ br(Assembler::LS, copy16); 1248 1249 __ cmp(count, u1(64/granularity)); 1250 __ br(Assembler::HI, copy80); 1251 1252 __ cmp(count, u1(32/granularity)); 1253 __ br(Assembler::LS, copy32); 1254 1255 // 33..64 bytes 1256 if (UseSIMDForMemoryOps) { 1257 bs.copy_load_at_32(v0, v1, Address(s, 0)); 1258 bs.copy_load_at_32(v2, v3, Address(send, -32)); 1259 bs.copy_store_at_32(Address(d, 0), v0, v1); 1260 bs.copy_store_at_32(Address(dend, -32), v2, v3); 1261 } else { 1262 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1263 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1264 bs.copy_load_at_16(t4, t5, Address(send, -32)); 1265 bs.copy_load_at_16(t6, t7, Address(send, -16)); 1266 1267 bs.copy_store_at_16(Address(d, 0), t0, t1); 1268 bs.copy_store_at_16(Address(d, 16), t2, t3); 1269 bs.copy_store_at_16(Address(dend, -32), t4, t5); 1270 bs.copy_store_at_16(Address(dend, -16), t6, t7); 1271 } 1272 __ b(finish); 1273 1274 // 17..32 bytes 1275 __ bind(copy32); 1276 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1277 bs.copy_load_at_16(t6, t7, Address(send, -16)); 1278 1279 bs.copy_store_at_16(Address(d, 0), t0, t1); 1280 bs.copy_store_at_16(Address(dend, -16), t6, t7); 1281 __ b(finish); 1282 1283 // 65..80/96 bytes 1284 // (96 bytes if SIMD because we do 32 bytes per instruction) 1285 __ bind(copy80); 1286 if (UseSIMDForMemoryOps) { 1287 bs.copy_load_at_32(v0, v1, Address(s, 0)); 1288 bs.copy_load_at_32(v2, v3, Address(s, 32)); 1289 // Unaligned pointers can be an issue for copying. 1290 // The issue is more likely to happen when the granularity of the data is 1291 // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least 1292 // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned. 1293 // The largest performance drop has been seen for the range 65-80 bytes. 1294 // For such cases using the pair of ldp/stp instead of the third pair of 1295 // ldpq/stpq fixes the performance issue.
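// For example, a 72-byte jbyte copy (count == 72) falls in the 65..80 range:
// the branch to copy96 below is not taken and the tail is finished with a
// single 16-byte ldp/stp pair instead of a third 32-byte ldpq/stpq pair.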
1296 if (granularity < sizeof (jint)) { 1297 Label copy96; 1298 __ cmp(count, u1(80/granularity)); 1299 __ br(Assembler::HI, copy96); 1300 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1301 1302 bs.copy_store_at_32(Address(d, 0), v0, v1); 1303 bs.copy_store_at_32(Address(d, 32), v2, v3); 1304 1305 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1306 __ b(finish); 1307 1308 __ bind(copy96); 1309 } 1310 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1311 1312 bs.copy_store_at_32(Address(d, 0), v0, v1); 1313 bs.copy_store_at_32(Address(d, 32), v2, v3); 1314 1315 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1316 } else { 1317 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1318 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1319 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1320 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1321 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1322 1323 bs.copy_store_at_16(Address(d, 0), t0, t1); 1324 bs.copy_store_at_16(Address(d, 16), t2, t3); 1325 bs.copy_store_at_16(Address(d, 32), t4, t5); 1326 bs.copy_store_at_16(Address(d, 48), t6, t7); 1327 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1328 } 1329 __ b(finish); 1330 1331 // 0..16 bytes 1332 __ bind(copy16); 1333 __ cmp(count, u1(8/granularity)); 1334 __ br(Assembler::LO, copy8); 1335 1336 // 8..16 bytes 1337 bs.copy_load_at_8(t0, Address(s, 0)); 1338 bs.copy_load_at_8(t1, Address(send, -8)); 1339 bs.copy_store_at_8(Address(d, 0), t0); 1340 bs.copy_store_at_8(Address(dend, -8), t1); 1341 __ b(finish); 1342 1343 if (granularity < 8) { 1344 // 4..7 bytes 1345 __ bind(copy8); 1346 __ tbz(count, 2 - exact_log2(granularity), copy4); 1347 __ ldrw(t0, Address(s, 0)); 1348 __ ldrw(t1, Address(send, -4)); 1349 __ strw(t0, Address(d, 0)); 1350 __ strw(t1, Address(dend, -4)); 1351 __ b(finish); 1352 if (granularity < 4) { 1353 // 0..3 bytes 1354 __ bind(copy4); 1355 __ cbz(count, finish); // get rid of 0 case 1356 if (granularity == 2) { 1357 __ ldrh(t0, Address(s, 0)); 1358 __ strh(t0, Address(d, 0)); 1359 } else { // granularity == 1 1360 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1361 // the first and last byte. 1362 // Handle the 3 byte case by loading and storing base + count/2 1363 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1364 // This does means in the 1 byte case we load/store the same 1365 // byte 3 times. 1366 __ lsr(count, count, 1); 1367 __ ldrb(t0, Address(s, 0)); 1368 __ ldrb(t1, Address(send, -1)); 1369 __ ldrb(t2, Address(s, count)); 1370 __ strb(t0, Address(d, 0)); 1371 __ strb(t1, Address(dend, -1)); 1372 __ strb(t2, Address(d, count)); 1373 } 1374 __ b(finish); 1375 } 1376 } 1377 1378 __ bind(copy_big); 1379 if (is_backwards) { 1380 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1381 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1382 } 1383 1384 // Now we've got the small case out of the way we can align the 1385 // source address on a 2-word boundary. 1386 1387 // Here we will materialize a count in r15, which is used by copy_memory_small 1388 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1389 // Up until here, we have used t9, which aliases r15, but from here on, that register 1390 // can not be used as a temp register, as it contains the count. 1391 1392 Label aligned; 1393 1394 if (is_aligned) { 1395 // We may have to adjust by 1 word to get s 2-word-aligned. 
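// (is_aligned callers guarantee s is at least word aligned, so one word
// move is enough to reach a 2-word boundary; count is then reduced by
// wordSize/granularity to account for the elements already copied.)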
1396 __ tbz(s, exact_log2(wordSize), aligned); 1397 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1398 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1399 __ sub(count, count, wordSize/granularity); 1400 } else { 1401 if (is_backwards) { 1402 __ andr(r15, s, 2 * wordSize - 1); 1403 } else { 1404 __ neg(r15, s); 1405 __ andr(r15, r15, 2 * wordSize - 1); 1406 } 1407 // r15 is the byte adjustment needed to align s. 1408 __ cbz(r15, aligned); 1409 int shift = exact_log2(granularity); 1410 if (shift > 0) { 1411 __ lsr(r15, r15, shift); 1412 } 1413 __ sub(count, count, r15); 1414 1415 #if 0 1416 // ?? This code is only correct for a disjoint copy. It may or 1417 // may not make sense to use it in that case. 1418 1419 // Copy the first pair; s and d may not be aligned. 1420 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1421 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1422 1423 // Align s and d, adjust count 1424 if (is_backwards) { 1425 __ sub(s, s, r15); 1426 __ sub(d, d, r15); 1427 } else { 1428 __ add(s, s, r15); 1429 __ add(d, d, r15); 1430 } 1431 #else 1432 copy_memory_small(decorators, type, s, d, r15, step); 1433 #endif 1434 } 1435 1436 __ bind(aligned); 1437 1438 // s is now 2-word-aligned. 1439 1440 // We have a count of units and some trailing bytes. Adjust the 1441 // count and do a bulk copy of words. If the shift is zero 1442 // perform a move instead to benefit from zero latency moves. 1443 int shift = exact_log2(wordSize/granularity); 1444 if (shift > 0) { 1445 __ lsr(r15, count, shift); 1446 } else { 1447 __ mov(r15, count); 1448 } 1449 if (direction == copy_forwards) { 1450 if (type != T_OBJECT) { 1451 __ bl(copy_f); 1452 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1453 __ bl(copy_obj_uninit_f); 1454 } else { 1455 __ bl(copy_obj_f); 1456 } 1457 } else { 1458 if (type != T_OBJECT) { 1459 __ bl(copy_b); 1460 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1461 __ bl(copy_obj_uninit_b); 1462 } else { 1463 __ bl(copy_obj_b); 1464 } 1465 } 1466 1467 // And the tail. 1468 copy_memory_small(decorators, type, s, d, count, step); 1469 1470 if (granularity >= 8) __ bind(copy8); 1471 if (granularity >= 4) __ bind(copy4); 1472 __ bind(finish); 1473 } 1474 1475 1476 void clobber_registers() { 1477 #ifdef ASSERT 1478 RegSet clobbered 1479 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1480 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1481 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1482 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1483 __ mov(*it, rscratch1); 1484 } 1485 #endif 1486 1487 } 1488 1489 // Scan over array at a for count oops, verifying each one. 1490 // Preserves a and count, clobbers rscratch1 and rscratch2. 
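// (temp is used to hold each loaded element, so it is clobbered as well.)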
1491 void verify_oop_array (int size, Register a, Register count, Register temp) { 1492 Label loop, end; 1493 __ mov(rscratch1, a); 1494 __ mov(rscratch2, zr); 1495 __ bind(loop); 1496 __ cmp(rscratch2, count); 1497 __ br(Assembler::HS, end); 1498 if (size == wordSize) { 1499 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1500 __ verify_oop(temp); 1501 } else { 1502 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1503 __ decode_heap_oop(temp); // calls verify_oop 1504 } 1505 __ add(rscratch2, rscratch2, 1); 1506 __ b(loop); 1507 __ bind(end); 1508 } 1509 1510 // Arguments: 1511 // stub_id - is used to name the stub and identify all details of 1512 // how to perform the copy. 1513 // 1514 // entry - is assigned to the stub's post push entry point unless 1515 // it is null 1516 // 1517 // Inputs: 1518 // c_rarg0 - source array address 1519 // c_rarg1 - destination array address 1520 // c_rarg2 - element count, treated as ssize_t, can be zero 1521 // 1522 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1523 // the hardware handle it. The two dwords within qwords that span 1524 // cache line boundaries will still be loaded and stored atomically. 1525 // 1526 // Side Effects: entry is set to the (post push) entry point so it 1527 // can be used by the corresponding conjoint copy 1528 // method 1529 // 1530 address generate_disjoint_copy(StubGenStubId stub_id, address *entry) { 1531 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1532 RegSet saved_reg = RegSet::of(s, d, count); 1533 int size; 1534 bool aligned; 1535 bool is_oop; 1536 bool dest_uninitialized; 1537 switch (stub_id) { 1538 case jbyte_disjoint_arraycopy_id: 1539 size = sizeof(jbyte); 1540 aligned = false; 1541 is_oop = false; 1542 dest_uninitialized = false; 1543 break; 1544 case arrayof_jbyte_disjoint_arraycopy_id: 1545 size = sizeof(jbyte); 1546 aligned = true; 1547 is_oop = false; 1548 dest_uninitialized = false; 1549 break; 1550 case jshort_disjoint_arraycopy_id: 1551 size = sizeof(jshort); 1552 aligned = false; 1553 is_oop = false; 1554 dest_uninitialized = false; 1555 break; 1556 case arrayof_jshort_disjoint_arraycopy_id: 1557 size = sizeof(jshort); 1558 aligned = true; 1559 is_oop = false; 1560 dest_uninitialized = false; 1561 break; 1562 case jint_disjoint_arraycopy_id: 1563 size = sizeof(jint); 1564 aligned = false; 1565 is_oop = false; 1566 dest_uninitialized = false; 1567 break; 1568 case arrayof_jint_disjoint_arraycopy_id: 1569 size = sizeof(jint); 1570 aligned = true; 1571 is_oop = false; 1572 dest_uninitialized = false; 1573 break; 1574 case jlong_disjoint_arraycopy_id: 1575 // since this is always aligned we can (should!) use the same 1576 // stub as for case arrayof_jlong_disjoint_arraycopy 1577 ShouldNotReachHere(); 1578 break; 1579 case arrayof_jlong_disjoint_arraycopy_id: 1580 size = sizeof(jlong); 1581 aligned = true; 1582 is_oop = false; 1583 dest_uninitialized = false; 1584 break; 1585 case oop_disjoint_arraycopy_id: 1586 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1587 aligned = !UseCompressedOops; 1588 is_oop = true; 1589 dest_uninitialized = false; 1590 break; 1591 case arrayof_oop_disjoint_arraycopy_id: 1592 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1593 aligned = !UseCompressedOops; 1594 is_oop = true; 1595 dest_uninitialized = false; 1596 break; 1597 case oop_disjoint_arraycopy_uninit_id: 1598 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1599 aligned = !UseCompressedOops; 1600 is_oop = true; 1601 dest_uninitialized = true; 1602 break; 1603 case arrayof_oop_disjoint_arraycopy_uninit_id: 1604 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1605 aligned = !UseCompressedOops; 1606 is_oop = true; 1607 dest_uninitialized = true; 1608 break; 1609 default: 1610 ShouldNotReachHere(); 1611 break; 1612 } 1613 1614 __ align(CodeEntryAlignment); 1615 StubCodeMark mark(this, stub_id); 1616 address start = __ pc(); 1617 __ enter(); 1618 1619 if (entry != nullptr) { 1620 *entry = __ pc(); 1621 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1622 BLOCK_COMMENT("Entry:"); 1623 } 1624 1625 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1626 if (dest_uninitialized) { 1627 decorators |= IS_DEST_UNINITIALIZED; 1628 } 1629 if (aligned) { 1630 decorators |= ARRAYCOPY_ALIGNED; 1631 } 1632 1633 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1634 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1635 1636 if (is_oop) { 1637 // save regs before copy_memory 1638 __ push(RegSet::of(d, count), sp); 1639 } 1640 { 1641 // UnsafeMemoryAccess page error: continue after unsafe access 1642 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1643 UnsafeMemoryAccessMark umam(this, add_entry, true); 1644 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1645 } 1646 1647 if (is_oop) { 1648 __ pop(RegSet::of(d, count), sp); 1649 if (VerifyOops) 1650 verify_oop_array(size, d, count, r16); 1651 } 1652 1653 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1654 1655 __ leave(); 1656 __ mov(r0, zr); // return 0 1657 __ ret(lr); 1658 return start; 1659 } 1660 1661 // Arguments: 1662 // stub_id - is used to name the stub and identify all details of 1663 // how to perform the copy. 1664 // 1665 // nooverlap_target - identifies the (post push) entry for the 1666 // corresponding disjoint copy routine which can be 1667 // jumped to if the ranges do not actually overlap 1668 // 1669 // entry - is assigned to the stub's post push entry point unless 1670 // it is null 1671 // 1672 // 1673 // Inputs: 1674 // c_rarg0 - source array address 1675 // c_rarg1 - destination array address 1676 // c_rarg2 - element count, treated as ssize_t, can be zero 1677 // 1678 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1679 // the hardware handle it. The two dwords within qwords that span 1680 // cache line boundaries will still be loaded and stored atomically.
1681 // 1682 // Side Effects: 1683 // entry is set to the no-overlap entry point so it can be used by 1684 // some other conjoint copy method 1685 // 1686 address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 1687 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1688 RegSet saved_regs = RegSet::of(s, d, count); 1689 int size; 1690 bool aligned; 1691 bool is_oop; 1692 bool dest_uninitialized; 1693 switch (stub_id) { 1694 case jbyte_arraycopy_id: 1695 size = sizeof(jbyte); 1696 aligned = false; 1697 is_oop = false; 1698 dest_uninitialized = false; 1699 break; 1700 case arrayof_jbyte_arraycopy_id: 1701 size = sizeof(jbyte); 1702 aligned = true; 1703 is_oop = false; 1704 dest_uninitialized = false; 1705 break; 1706 case jshort_arraycopy_id: 1707 size = sizeof(jshort); 1708 aligned = false; 1709 is_oop = false; 1710 dest_uninitialized = false; 1711 break; 1712 case arrayof_jshort_arraycopy_id: 1713 size = sizeof(jshort); 1714 aligned = true; 1715 is_oop = false; 1716 dest_uninitialized = false; 1717 break; 1718 case jint_arraycopy_id: 1719 size = sizeof(jint); 1720 aligned = false; 1721 is_oop = false; 1722 dest_uninitialized = false; 1723 break; 1724 case arrayof_jint_arraycopy_id: 1725 size = sizeof(jint); 1726 aligned = true; 1727 is_oop = false; 1728 dest_uninitialized = false; 1729 break; 1730 case jlong_arraycopy_id: 1731 // since this is always aligned we can (should!) use the same 1732 // stub as for case arrayof_jlong_disjoint_arraycopy 1733 ShouldNotReachHere(); 1734 break; 1735 case arrayof_jlong_arraycopy_id: 1736 size = sizeof(jlong); 1737 aligned = true; 1738 is_oop = false; 1739 dest_uninitialized = false; 1740 break; 1741 case oop_arraycopy_id: 1742 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1743 aligned = !UseCompressedOops; 1744 is_oop = true; 1745 dest_uninitialized = false; 1746 break; 1747 case arrayof_oop_arraycopy_id: 1748 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1749 aligned = !UseCompressedOops; 1750 is_oop = true; 1751 dest_uninitialized = false; 1752 break; 1753 case oop_arraycopy_uninit_id: 1754 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1755 aligned = !UseCompressedOops; 1756 is_oop = true; 1757 dest_uninitialized = true; 1758 break; 1759 case arrayof_oop_arraycopy_uninit_id: 1760 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1761 aligned = !UseCompressedOops; 1762 is_oop = true; 1763 dest_uninitialized = true; 1764 break; 1765 default: 1766 ShouldNotReachHere(); 1767 } 1768 1769 StubCodeMark mark(this, stub_id); 1770 address start = __ pc(); 1771 __ enter(); 1772 1773 if (entry != nullptr) { 1774 *entry = __ pc(); 1775 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1776 BLOCK_COMMENT("Entry:"); 1777 } 1778 1779 // use fwd copy when (d-s) above_equal (count*size) 1780 __ sub(rscratch1, d, s); 1781 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1782 __ br(Assembler::HS, nooverlap_target); 1783 1784 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1785 if (dest_uninitialized) { 1786 decorators |= IS_DEST_UNINITIALIZED; 1787 } 1788 if (aligned) { 1789 decorators |= ARRAYCOPY_ALIGNED; 1790 } 1791 1792 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1793 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1794 1795 if (is_oop) { 1796 // save regs before copy_memory 1797 __ push(RegSet::of(d, count), sp); 1798 } 1799 { 1800 // UnsafeMemoryAccess page error: continue after unsafe access 1801 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1802 UnsafeMemoryAccessMark umam(this, add_entry, true); 1803 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1804 } 1805 if (is_oop) { 1806 __ pop(RegSet::of(d, count), sp); 1807 if (VerifyOops) 1808 verify_oop_array(size, d, count, r16); 1809 } 1810 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1811 __ leave(); 1812 __ mov(r0, zr); // return 0 1813 __ ret(lr); 1814 return start; 1815 } 1816 1817 // Helper for generating a dynamic type check. 1818 // Smashes rscratch1, rscratch2. 1819 void generate_type_check(Register sub_klass, 1820 Register super_check_offset, 1821 Register super_klass, 1822 Register temp1, 1823 Register temp2, 1824 Register result, 1825 Label& L_success) { 1826 assert_different_registers(sub_klass, super_check_offset, super_klass); 1827 1828 BLOCK_COMMENT("type_check:"); 1829 1830 Label L_miss; 1831 1832 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1833 super_check_offset); 1834 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1835 1836 // Fall through on failure! 
1837 __ BIND(L_miss); 1838 } 1839 1840 // 1841 // Generate checkcasting array copy stub 1842 // 1843 // Input: 1844 // c_rarg0 - source array address 1845 // c_rarg1 - destination array address 1846 // c_rarg2 - element count, treated as ssize_t, can be zero 1847 // c_rarg3 - size_t ckoff (super_check_offset) 1848 // c_rarg4 - oop ckval (super_klass) 1849 // 1850 // Output: 1851 // r0 == 0 - success 1852 // r0 == -1^K - failure, where K is partial transfer count 1853 // 1854 address generate_checkcast_copy(StubGenStubId stub_id, address *entry) { 1855 bool dest_uninitialized; 1856 switch (stub_id) { 1857 case checkcast_arraycopy_id: 1858 dest_uninitialized = false; 1859 break; 1860 case checkcast_arraycopy_uninit_id: 1861 dest_uninitialized = true; 1862 break; 1863 default: 1864 ShouldNotReachHere(); 1865 } 1866 1867 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1868 1869 // Input registers (after setup_arg_regs) 1870 const Register from = c_rarg0; // source array address 1871 const Register to = c_rarg1; // destination array address 1872 const Register count = c_rarg2; // elementscount 1873 const Register ckoff = c_rarg3; // super_check_offset 1874 const Register ckval = c_rarg4; // super_klass 1875 1876 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1877 RegSet wb_post_saved_regs = RegSet::of(count); 1878 1879 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1880 const Register copied_oop = r22; // actual oop copied 1881 const Register count_save = r21; // orig elementscount 1882 const Register start_to = r20; // destination array start address 1883 const Register r19_klass = r19; // oop._klass 1884 1885 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1886 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1887 1888 //--------------------------------------------------------------- 1889 // Assembler stub will be used for this call to arraycopy 1890 // if the two arrays are subtypes of Object[] but the 1891 // destination array type is not equal to or a supertype 1892 // of the source type. Each element must be separately 1893 // checked. 1894 1895 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1896 copied_oop, r19_klass, count_save); 1897 1898 __ align(CodeEntryAlignment); 1899 StubCodeMark mark(this, stub_id); 1900 address start = __ pc(); 1901 1902 __ enter(); // required for proper stackwalking of RuntimeStub frame 1903 1904 #ifdef ASSERT 1905 // caller guarantees that the arrays really are different 1906 // otherwise, we would have to make conjoint checks 1907 { Label L; 1908 __ b(L); // conjoint check not yet implemented 1909 __ stop("checkcast_copy within a single array"); 1910 __ bind(L); 1911 } 1912 #endif //ASSERT 1913 1914 // Caller of this entry point must set up the argument registers. 1915 if (entry != nullptr) { 1916 *entry = __ pc(); 1917 BLOCK_COMMENT("Entry:"); 1918 } 1919 1920 // Empty array: Nothing to do. 1921 __ cbz(count, L_done); 1922 __ push(RegSet::of(r19, r20, r21, r22), sp); 1923 1924 #ifdef ASSERT 1925 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1926 // The ckoff and ckval must be mutually consistent, 1927 // even though caller generates both. 
1928 { Label L; 1929 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1930 __ ldrw(start_to, Address(ckval, sco_offset)); 1931 __ cmpw(ckoff, start_to); 1932 __ br(Assembler::EQ, L); 1933 __ stop("super_check_offset inconsistent"); 1934 __ bind(L); 1935 } 1936 #endif //ASSERT 1937 1938 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1939 bool is_oop = true; 1940 int element_size = UseCompressedOops ? 4 : 8; 1941 if (dest_uninitialized) { 1942 decorators |= IS_DEST_UNINITIALIZED; 1943 } 1944 1945 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1946 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1947 1948 // save the original count 1949 __ mov(count_save, count); 1950 1951 // Copy from low to high addresses 1952 __ mov(start_to, to); // Save destination array start address 1953 __ b(L_load_element); 1954 1955 // ======== begin loop ======== 1956 // (Loop is rotated; its entry is L_load_element.) 1957 // Loop control: 1958 // for (; count != 0; count--) { 1959 // copied_oop = load_heap_oop(from++); 1960 // ... generate_type_check ...; 1961 // store_heap_oop(to++, copied_oop); 1962 // } 1963 __ align(OptoLoopAlignment); 1964 1965 __ BIND(L_store_element); 1966 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1967 __ post(to, element_size), copied_oop, noreg, 1968 gct1, gct2, gct3); 1969 __ sub(count, count, 1); 1970 __ cbz(count, L_do_card_marks); 1971 1972 // ======== loop entry is here ======== 1973 __ BIND(L_load_element); 1974 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1975 copied_oop, noreg, __ post(from, element_size), 1976 gct1); 1977 __ cbz(copied_oop, L_store_element); 1978 1979 __ load_klass(r19_klass, copied_oop);// query the object klass 1980 1981 BLOCK_COMMENT("type_check:"); 1982 generate_type_check(/*sub_klass*/r19_klass, 1983 /*super_check_offset*/ckoff, 1984 /*super_klass*/ckval, 1985 /*r_array_base*/gct1, 1986 /*temp2*/gct2, 1987 /*result*/r10, L_store_element); 1988 1989 // Fall through on failure! 1990 1991 // ======== end loop ======== 1992 1993 // It was a real error; we must depend on the caller to finish the job. 1994 // Register count = remaining oops, count_orig = total oops. 1995 // Emit GC store barriers for the oops we have copied and report 1996 // their number to the caller. 1997 1998 __ subs(count, count_save, count); // K = partially copied oop count 1999 __ eon(count, count, zr); // report (-1^K) to caller 2000 __ br(Assembler::EQ, L_done_pop); 2001 2002 __ BIND(L_do_card_marks); 2003 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2004 2005 __ bind(L_done_pop); 2006 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2007 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2008 2009 __ bind(L_done); 2010 __ mov(r0, count); 2011 __ leave(); 2012 __ ret(lr); 2013 2014 return start; 2015 } 2016 2017 // Perform range checks on the proposed arraycopy. 2018 // Kills temp, but nothing else. 2019 // Also, clean the sign bits of src_pos and dst_pos. 
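  // Note (illustrative sketch, not generated code): the "clean up" below
  // relies on 32-bit register-to-register moves zero-extending into the
  // full 64-bit registers, roughly
  //   src_pos = (uint32_t)src_pos;
  //   dst_pos = (uint32_t)dst_pos;
  // so callers can use src_pos/dst_pos directly in 64-bit address
  // arithmetic afterwards.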
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length()) FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
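  //
  // A rough C-level sketch of the alignment dispatch below (illustrative
  // only, not generated code):
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if ((bits & (BytesPerLong - 1)) == 0)     goto long_copy;
  //   else if ((bits & (BytesPerInt - 1)) == 0) goto int_copy;
  //   else if ((bits & 1) == 0)                 goto short_copy;
  //   else                                      goto byte_copy;
  // The byte count in c_rarg2 is scaled down to an element count (lsr by
  // the element size's log2) before tail-calling the chosen copy stub.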
2070 // 2071 address generate_unsafe_copy(address byte_copy_entry, 2072 address short_copy_entry, 2073 address int_copy_entry, 2074 address long_copy_entry) { 2075 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2076 2077 Label L_long_aligned, L_int_aligned, L_short_aligned; 2078 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2079 2080 __ align(CodeEntryAlignment); 2081 StubCodeMark mark(this, stub_id); 2082 address start = __ pc(); 2083 __ enter(); // required for proper stackwalking of RuntimeStub frame 2084 2085 // bump this on entry, not on exit: 2086 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2087 2088 __ orr(rscratch1, s, d); 2089 __ orr(rscratch1, rscratch1, count); 2090 2091 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2092 __ cbz(rscratch1, L_long_aligned); 2093 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2094 __ cbz(rscratch1, L_int_aligned); 2095 __ tbz(rscratch1, 0, L_short_aligned); 2096 __ b(RuntimeAddress(byte_copy_entry)); 2097 2098 __ BIND(L_short_aligned); 2099 __ lsr(count, count, LogBytesPerShort); // size => short_count 2100 __ b(RuntimeAddress(short_copy_entry)); 2101 __ BIND(L_int_aligned); 2102 __ lsr(count, count, LogBytesPerInt); // size => int_count 2103 __ b(RuntimeAddress(int_copy_entry)); 2104 __ BIND(L_long_aligned); 2105 __ lsr(count, count, LogBytesPerLong); // size => long_count 2106 __ b(RuntimeAddress(long_copy_entry)); 2107 2108 return start; 2109 } 2110 2111 // 2112 // Generate generic array copy stubs 2113 // 2114 // Input: 2115 // c_rarg0 - src oop 2116 // c_rarg1 - src_pos (32-bits) 2117 // c_rarg2 - dst oop 2118 // c_rarg3 - dst_pos (32-bits) 2119 // c_rarg4 - element count (32-bits) 2120 // 2121 // Output: 2122 // r0 == 0 - success 2123 // r0 == -1^K - failure, where K is partial transfer count 2124 // 2125 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2126 address int_copy_entry, address oop_copy_entry, 2127 address long_copy_entry, address checkcast_copy_entry) { 2128 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2129 2130 Label L_failed, L_objArray; 2131 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2132 2133 // Input registers 2134 const Register src = c_rarg0; // source array oop 2135 const Register src_pos = c_rarg1; // source position 2136 const Register dst = c_rarg2; // destination array oop 2137 const Register dst_pos = c_rarg3; // destination position 2138 const Register length = c_rarg4; 2139 2140 2141 // Registers used as temps 2142 const Register dst_klass = c_rarg5; 2143 2144 __ align(CodeEntryAlignment); 2145 2146 StubCodeMark mark(this, stub_id); 2147 2148 address start = __ pc(); 2149 2150 __ enter(); // required for proper stackwalking of RuntimeStub frame 2151 2152 // bump this on entry, not on exit: 2153 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2154 2155 //----------------------------------------------------------------------- 2156 // Assembler stub will be used for this call to arraycopy 2157 // if the following conditions are met: 2158 // 2159 // (1) src and dst must not be null. 2160 // (2) src_pos must not be negative. 2161 // (3) dst_pos must not be negative. 2162 // (4) length must not be negative. 2163 // (5) src klass and dst klass should be the same and not null. 2164 // (6) src and dst should be arrays. 2165 // (7) src_pos + length must not exceed length of src. 2166 // (8) dst_pos + length must not exceed length of dst. 
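  //
  // If any of the checks above fails, the stub returns -1 (that is, ~K
  // with K == 0 elements transferred). Illustrative example: if a
  // checkcast copy dispatched from here fails after transferring 3
  // elements, the caller sees ~3 == -4 in r0 and recovers K by
  // bitwise-negating the result.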
2167 // 2168 2169 // if (src == nullptr) return -1; 2170 __ cbz(src, L_failed); 2171 2172 // if (src_pos < 0) return -1; 2173 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2174 2175 // if (dst == nullptr) return -1; 2176 __ cbz(dst, L_failed); 2177 2178 // if (dst_pos < 0) return -1; 2179 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2180 2181 // registers used as temp 2182 const Register scratch_length = r16; // elements count to copy 2183 const Register scratch_src_klass = r17; // array klass 2184 const Register lh = r15; // layout helper 2185 2186 // if (length < 0) return -1; 2187 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2188 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2189 2190 __ load_klass(scratch_src_klass, src); 2191 #ifdef ASSERT 2192 // assert(src->klass() != nullptr); 2193 { 2194 BLOCK_COMMENT("assert klasses not null {"); 2195 Label L1, L2; 2196 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2197 __ bind(L1); 2198 __ stop("broken null klass"); 2199 __ bind(L2); 2200 __ load_klass(rscratch1, dst); 2201 __ cbz(rscratch1, L1); // this would be broken also 2202 BLOCK_COMMENT("} assert klasses not null done"); 2203 } 2204 #endif 2205 2206 // Load layout helper (32-bits) 2207 // 2208 // |array_tag| | header_size | element_type | |log2_element_size| 2209 // 32 30 24 16 8 2 0 2210 // 2211 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2212 // 2213 2214 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2215 2216 // Handle objArrays completely differently... 2217 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2218 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2219 __ movw(rscratch1, objArray_lh); 2220 __ eorw(rscratch2, lh, rscratch1); 2221 __ cbzw(rscratch2, L_objArray); 2222 2223 // if (src->klass() != dst->klass()) return -1; 2224 __ load_klass(rscratch2, dst); 2225 __ eor(rscratch2, rscratch2, scratch_src_klass); 2226 __ cbnz(rscratch2, L_failed); 2227 2228 // if (!src->is_Array()) return -1; 2229 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2230 2231 // At this point, it is known to be a typeArray (array_tag 0x3). 
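  // Illustrative sketch (not generated code) of how the layout helper is
  // decoded on the typeArray path below:
  //   int header_size = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
  //   int log2_elsize = lh & Klass::_lh_log2_element_size_mask;  // 0..3
  //   src_addr = src + header_size + (src_pos << log2_elsize);
  //   dst_addr = dst + header_size + (dst_pos << log2_elsize);
  // The ubfx below extracts header_size, and the element size is resolved
  // by the bitwise binary search on the low two bits of lh.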
2232 #ifdef ASSERT 2233 { 2234 BLOCK_COMMENT("assert primitive array {"); 2235 Label L; 2236 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2237 __ cmpw(lh, rscratch2); 2238 __ br(Assembler::GE, L); 2239 __ stop("must be a primitive array"); 2240 __ bind(L); 2241 BLOCK_COMMENT("} assert primitive array done"); 2242 } 2243 #endif 2244 2245 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2246 rscratch2, L_failed); 2247 2248 // TypeArrayKlass 2249 // 2250 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2251 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2252 // 2253 2254 const Register rscratch1_offset = rscratch1; // array offset 2255 const Register r15_elsize = lh; // element size 2256 2257 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2258 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2259 __ add(src, src, rscratch1_offset); // src array offset 2260 __ add(dst, dst, rscratch1_offset); // dst array offset 2261 BLOCK_COMMENT("choose copy loop based on element size"); 2262 2263 // next registers should be set before the jump to corresponding stub 2264 const Register from = c_rarg0; // source array address 2265 const Register to = c_rarg1; // destination array address 2266 const Register count = c_rarg2; // elements count 2267 2268 // 'from', 'to', 'count' registers should be set in such order 2269 // since they are the same as 'src', 'src_pos', 'dst'. 2270 2271 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2272 2273 // The possible values of elsize are 0-3, i.e. exact_log2(element 2274 // size in bytes). We do a simple bitwise binary search. 2275 __ BIND(L_copy_bytes); 2276 __ tbnz(r15_elsize, 1, L_copy_ints); 2277 __ tbnz(r15_elsize, 0, L_copy_shorts); 2278 __ lea(from, Address(src, src_pos));// src_addr 2279 __ lea(to, Address(dst, dst_pos));// dst_addr 2280 __ movw(count, scratch_length); // length 2281 __ b(RuntimeAddress(byte_copy_entry)); 2282 2283 __ BIND(L_copy_shorts); 2284 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2285 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2286 __ movw(count, scratch_length); // length 2287 __ b(RuntimeAddress(short_copy_entry)); 2288 2289 __ BIND(L_copy_ints); 2290 __ tbnz(r15_elsize, 0, L_copy_longs); 2291 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2292 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2293 __ movw(count, scratch_length); // length 2294 __ b(RuntimeAddress(int_copy_entry)); 2295 2296 __ BIND(L_copy_longs); 2297 #ifdef ASSERT 2298 { 2299 BLOCK_COMMENT("assert long copy {"); 2300 Label L; 2301 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2302 __ cmpw(r15_elsize, LogBytesPerLong); 2303 __ br(Assembler::EQ, L); 2304 __ stop("must be long copy, but elsize is wrong"); 2305 __ bind(L); 2306 BLOCK_COMMENT("} assert long copy done"); 2307 } 2308 #endif 2309 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2310 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2311 __ movw(count, scratch_length); // length 2312 __ b(RuntimeAddress(long_copy_entry)); 2313 2314 // ObjArrayKlass 2315 __ BIND(L_objArray); 2316 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2317 2318 Label L_plain_copy, L_checkcast_copy; 2319 // test array classes for subtyping 2320 __ load_klass(r15, dst); 2321 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2322 __ br(Assembler::NE, L_checkcast_copy); 2323 2324 // Identically typed arrays can be copied without element-wise checks. 2325 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2326 rscratch2, L_failed); 2327 2328 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2329 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2330 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2331 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2332 __ movw(count, scratch_length); // length 2333 __ BIND(L_plain_copy); 2334 __ b(RuntimeAddress(oop_copy_entry)); 2335 2336 __ BIND(L_checkcast_copy); 2337 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2338 { 2339 // Before looking at dst.length, make sure dst is also an objArray. 2340 __ ldrw(rscratch1, Address(r15, lh_offset)); 2341 __ movw(rscratch2, objArray_lh); 2342 __ eorw(rscratch1, rscratch1, rscratch2); 2343 __ cbnzw(rscratch1, L_failed); 2344 2345 // It is safe to examine both src.length and dst.length. 2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2347 r15, L_failed); 2348 2349 __ load_klass(dst_klass, dst); // reload 2350 2351 // Marshal the base address arguments now, freeing registers. 2352 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2353 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2354 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2355 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2356 __ movw(count, length); // length (reloaded) 2357 Register sco_temp = c_rarg3; // this register is free now 2358 assert_different_registers(from, to, count, sco_temp, 2359 dst_klass, scratch_src_klass); 2360 // assert_clean_int(count, sco_temp); 2361 2362 // Generate the type check. 2363 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2364 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2365 2366 // Smashes rscratch1, rscratch2 2367 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2368 L_plain_copy); 2369 2370 // Fetch destination element klass from the ObjArrayKlass header. 2371 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2372 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2373 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2374 2375 // the checkcast_copy loop needs two extra arguments: 2376 assert(c_rarg3 == sco_temp, "#3 already in place"); 2377 // Set up arguments for checkcast_copy_entry. 2378 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2379 __ b(RuntimeAddress(checkcast_copy_entry)); 2380 } 2381 2382 __ BIND(L_failed); 2383 __ mov(r0, -1); 2384 __ leave(); // required for proper stackwalking of RuntimeStub frame 2385 __ ret(lr); 2386 2387 return start; 2388 } 2389 2390 // 2391 // Generate stub for array fill. If "aligned" is true, the 2392 // "to" address is assumed to be heapword aligned. 
2393 // 2394 // Arguments for generated stub: 2395 // to: c_rarg0 2396 // value: c_rarg1 2397 // count: c_rarg2 treated as signed 2398 // 2399 address generate_fill(StubGenStubId stub_id) { 2400 BasicType t; 2401 bool aligned; 2402 2403 switch (stub_id) { 2404 case jbyte_fill_id: 2405 t = T_BYTE; 2406 aligned = false; 2407 break; 2408 case jshort_fill_id: 2409 t = T_SHORT; 2410 aligned = false; 2411 break; 2412 case jint_fill_id: 2413 t = T_INT; 2414 aligned = false; 2415 break; 2416 case arrayof_jbyte_fill_id: 2417 t = T_BYTE; 2418 aligned = true; 2419 break; 2420 case arrayof_jshort_fill_id: 2421 t = T_SHORT; 2422 aligned = true; 2423 break; 2424 case arrayof_jint_fill_id: 2425 t = T_INT; 2426 aligned = true; 2427 break; 2428 default: 2429 ShouldNotReachHere(); 2430 }; 2431 2432 __ align(CodeEntryAlignment); 2433 StubCodeMark mark(this, stub_id); 2434 address start = __ pc(); 2435 2436 BLOCK_COMMENT("Entry:"); 2437 2438 const Register to = c_rarg0; // source array address 2439 const Register value = c_rarg1; // value 2440 const Register count = c_rarg2; // elements count 2441 2442 const Register bz_base = r10; // base for block_zero routine 2443 const Register cnt_words = r11; // temp register 2444 2445 __ enter(); 2446 2447 Label L_fill_elements, L_exit1; 2448 2449 int shift = -1; 2450 switch (t) { 2451 case T_BYTE: 2452 shift = 0; 2453 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2454 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2455 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2456 __ br(Assembler::LO, L_fill_elements); 2457 break; 2458 case T_SHORT: 2459 shift = 1; 2460 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2461 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2462 __ br(Assembler::LO, L_fill_elements); 2463 break; 2464 case T_INT: 2465 shift = 2; 2466 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2467 __ br(Assembler::LO, L_fill_elements); 2468 break; 2469 default: ShouldNotReachHere(); 2470 } 2471 2472 // Align source address at 8 bytes address boundary. 2473 Label L_skip_align1, L_skip_align2, L_skip_align4; 2474 if (!aligned) { 2475 switch (t) { 2476 case T_BYTE: 2477 // One byte misalignment happens only for byte arrays. 2478 __ tbz(to, 0, L_skip_align1); 2479 __ strb(value, Address(__ post(to, 1))); 2480 __ subw(count, count, 1); 2481 __ bind(L_skip_align1); 2482 // Fallthrough 2483 case T_SHORT: 2484 // Two bytes misalignment happens only for byte and short (char) arrays. 2485 __ tbz(to, 1, L_skip_align2); 2486 __ strh(value, Address(__ post(to, 2))); 2487 __ subw(count, count, 2 >> shift); 2488 __ bind(L_skip_align2); 2489 // Fallthrough 2490 case T_INT: 2491 // Align to 8 bytes, we know we are 4 byte aligned to start. 2492 __ tbz(to, 2, L_skip_align4); 2493 __ strw(value, Address(__ post(to, 4))); 2494 __ subw(count, count, 4 >> shift); 2495 __ bind(L_skip_align4); 2496 break; 2497 default: ShouldNotReachHere(); 2498 } 2499 } 2500 2501 // 2502 // Fill large chunks 2503 // 2504 __ lsrw(cnt_words, count, 3 - shift); // number of words 2505 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2506 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2507 if (UseBlockZeroing) { 2508 Label non_block_zeroing, rest; 2509 // If the fill value is zero we can use the fast zero_words(). 
2510 __ cbnz(value, non_block_zeroing); 2511 __ mov(bz_base, to); 2512 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2513 address tpc = __ zero_words(bz_base, cnt_words); 2514 if (tpc == nullptr) { 2515 fatal("CodeCache is full at generate_fill"); 2516 } 2517 __ b(rest); 2518 __ bind(non_block_zeroing); 2519 __ fill_words(to, cnt_words, value); 2520 __ bind(rest); 2521 } else { 2522 __ fill_words(to, cnt_words, value); 2523 } 2524 2525 // Remaining count is less than 8 bytes. Fill it by a single store. 2526 // Note that the total length is no less than 8 bytes. 2527 if (t == T_BYTE || t == T_SHORT) { 2528 Label L_exit1; 2529 __ cbzw(count, L_exit1); 2530 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2531 __ str(value, Address(to, -8)); // overwrite some elements 2532 __ bind(L_exit1); 2533 __ leave(); 2534 __ ret(lr); 2535 } 2536 2537 // Handle copies less than 8 bytes. 2538 Label L_fill_2, L_fill_4, L_exit2; 2539 __ bind(L_fill_elements); 2540 switch (t) { 2541 case T_BYTE: 2542 __ tbz(count, 0, L_fill_2); 2543 __ strb(value, Address(__ post(to, 1))); 2544 __ bind(L_fill_2); 2545 __ tbz(count, 1, L_fill_4); 2546 __ strh(value, Address(__ post(to, 2))); 2547 __ bind(L_fill_4); 2548 __ tbz(count, 2, L_exit2); 2549 __ strw(value, Address(to)); 2550 break; 2551 case T_SHORT: 2552 __ tbz(count, 0, L_fill_4); 2553 __ strh(value, Address(__ post(to, 2))); 2554 __ bind(L_fill_4); 2555 __ tbz(count, 1, L_exit2); 2556 __ strw(value, Address(to)); 2557 break; 2558 case T_INT: 2559 __ cbzw(count, L_exit2); 2560 __ strw(value, Address(to)); 2561 break; 2562 default: ShouldNotReachHere(); 2563 } 2564 __ bind(L_exit2); 2565 __ leave(); 2566 __ ret(lr); 2567 return start; 2568 } 2569 2570 address generate_data_cache_writeback() { 2571 const Register line = c_rarg0; // address of line to write back 2572 2573 __ align(CodeEntryAlignment); 2574 2575 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id; 2576 StubCodeMark mark(this, stub_id); 2577 2578 address start = __ pc(); 2579 __ enter(); 2580 __ cache_wb(Address(line, 0)); 2581 __ leave(); 2582 __ ret(lr); 2583 2584 return start; 2585 } 2586 2587 address generate_data_cache_writeback_sync() { 2588 const Register is_pre = c_rarg0; // pre or post sync 2589 2590 __ align(CodeEntryAlignment); 2591 2592 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id; 2593 StubCodeMark mark(this, stub_id); 2594 2595 // pre wbsync is a no-op 2596 // post wbsync translates to an sfence 2597 2598 Label skip; 2599 address start = __ pc(); 2600 __ enter(); 2601 __ cbnz(is_pre, skip); 2602 __ cache_wbsync(false); 2603 __ bind(skip); 2604 __ leave(); 2605 __ ret(lr); 2606 2607 return start; 2608 } 2609 2610 void generate_arraycopy_stubs() { 2611 address entry; 2612 address entry_jbyte_arraycopy; 2613 address entry_jshort_arraycopy; 2614 address entry_jint_arraycopy; 2615 address entry_oop_arraycopy; 2616 address entry_jlong_arraycopy; 2617 address entry_checkcast_arraycopy; 2618 2619 generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2620 generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2621 2622 generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2623 generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2624 2625 generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 
2626 generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2627 2628 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2629 2630 //*** jbyte 2631 // Always need aligned and unaligned versions 2632 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry); 2633 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2634 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry); 2635 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr); 2636 2637 //*** jshort 2638 // Always need aligned and unaligned versions 2639 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry); 2640 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2641 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry); 2642 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr); 2643 2644 //*** jint 2645 // Aligned versions 2646 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry); 2647 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2648 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2649 // entry_jint_arraycopy always points to the unaligned version 2650 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry); 2651 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy); 2652 2653 //*** jlong 2654 // It is always aligned 2655 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry); 2656 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2657 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2658 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2659 2660 //*** oops 2661 { 2662 // With compressed oops we need unaligned versions; notice that 2663 // we overwrite entry_oop_arraycopy. 
2664 bool aligned = !UseCompressedOops; 2665 2666 StubRoutines::_arrayof_oop_disjoint_arraycopy 2667 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry); 2668 StubRoutines::_arrayof_oop_arraycopy 2669 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2670 // Aligned versions without pre-barriers 2671 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2672 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2673 StubRoutines::_arrayof_oop_arraycopy_uninit 2674 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2675 } 2676 2677 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2678 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2679 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2680 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2681 2682 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2683 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr); 2684 2685 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2686 entry_jshort_arraycopy, 2687 entry_jint_arraycopy, 2688 entry_jlong_arraycopy); 2689 2690 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2691 entry_jshort_arraycopy, 2692 entry_jint_arraycopy, 2693 entry_oop_arraycopy, 2694 entry_jlong_arraycopy, 2695 entry_checkcast_arraycopy); 2696 2697 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 2698 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 2699 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 2700 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 2701 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 2702 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 2703 } 2704 2705 void generate_math_stubs() { Unimplemented(); } 2706 2707 // Arguments: 2708 // 2709 // Inputs: 2710 // c_rarg0 - source byte array address 2711 // c_rarg1 - destination byte array address 2712 // c_rarg2 - K (key) in little endian int array 2713 // 2714 address generate_aescrypt_encryptBlock() { 2715 __ align(CodeEntryAlignment); 2716 StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2717 StubCodeMark mark(this, stub_id); 2718 2719 const Register from = c_rarg0; // source array address 2720 const Register to = c_rarg1; // destination array address 2721 const Register key = c_rarg2; // key array address 2722 const Register keylen = rscratch1; 2723 2724 address start = __ pc(); 2725 __ enter(); 2726 2727 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2728 2729 __ aesenc_loadkeys(key, keylen); 2730 __ aesecb_encrypt(from, to, keylen); 2731 2732 __ mov(r0, 0); 2733 2734 __ leave(); 2735 __ ret(lr); 2736 2737 return start; 2738 } 2739 2740 // Arguments: 2741 // 2742 // Inputs: 2743 // c_rarg0 - source byte array address 2744 // c_rarg1 - destination byte array address 2745 // c_rarg2 - K (key) in little endian int array 2746 // 2747 address generate_aescrypt_decryptBlock() { 2748 assert(UseAES, "need 
AES cryptographic extension support"); 2749 __ align(CodeEntryAlignment); 2750 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2751 StubCodeMark mark(this, stub_id); 2752 Label L_doLast; 2753 2754 const Register from = c_rarg0; // source array address 2755 const Register to = c_rarg1; // destination array address 2756 const Register key = c_rarg2; // key array address 2757 const Register keylen = rscratch1; 2758 2759 address start = __ pc(); 2760 __ enter(); // required for proper stackwalking of RuntimeStub frame 2761 2762 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2763 2764 __ aesecb_decrypt(from, to, key, keylen); 2765 2766 __ mov(r0, 0); 2767 2768 __ leave(); 2769 __ ret(lr); 2770 2771 return start; 2772 } 2773 2774 // Arguments: 2775 // 2776 // Inputs: 2777 // c_rarg0 - source byte array address 2778 // c_rarg1 - destination byte array address 2779 // c_rarg2 - K (key) in little endian int array 2780 // c_rarg3 - r vector byte array address 2781 // c_rarg4 - input length 2782 // 2783 // Output: 2784 // x0 - input length 2785 // 2786 address generate_cipherBlockChaining_encryptAESCrypt() { 2787 assert(UseAES, "need AES cryptographic extension support"); 2788 __ align(CodeEntryAlignment); 2789 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id; 2790 StubCodeMark mark(this, stub_id); 2791 2792 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2793 2794 const Register from = c_rarg0; // source array address 2795 const Register to = c_rarg1; // destination array address 2796 const Register key = c_rarg2; // key array address 2797 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2798 // and left with the results of the last encryption block 2799 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2800 const Register keylen = rscratch1; 2801 2802 address start = __ pc(); 2803 2804 __ enter(); 2805 2806 __ movw(rscratch2, len_reg); 2807 2808 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2809 2810 __ ld1(v0, __ T16B, rvec); 2811 2812 __ cmpw(keylen, 52); 2813 __ br(Assembler::CC, L_loadkeys_44); 2814 __ br(Assembler::EQ, L_loadkeys_52); 2815 2816 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2817 __ rev32(v17, __ T16B, v17); 2818 __ rev32(v18, __ T16B, v18); 2819 __ BIND(L_loadkeys_52); 2820 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2821 __ rev32(v19, __ T16B, v19); 2822 __ rev32(v20, __ T16B, v20); 2823 __ BIND(L_loadkeys_44); 2824 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2825 __ rev32(v21, __ T16B, v21); 2826 __ rev32(v22, __ T16B, v22); 2827 __ rev32(v23, __ T16B, v23); 2828 __ rev32(v24, __ T16B, v24); 2829 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2830 __ rev32(v25, __ T16B, v25); 2831 __ rev32(v26, __ T16B, v26); 2832 __ rev32(v27, __ T16B, v27); 2833 __ rev32(v28, __ T16B, v28); 2834 __ ld1(v29, v30, v31, __ T16B, key); 2835 __ rev32(v29, __ T16B, v29); 2836 __ rev32(v30, __ T16B, v30); 2837 __ rev32(v31, __ T16B, v31); 2838 2839 __ BIND(L_aes_loop); 2840 __ ld1(v1, __ T16B, __ post(from, 16)); 2841 __ eor(v0, __ T16B, v0, v1); 2842 2843 __ br(Assembler::CC, L_rounds_44); 2844 __ br(Assembler::EQ, L_rounds_52); 2845 2846 __ aese(v0, v17); __ aesmc(v0, v0); 2847 __ aese(v0, v18); __ aesmc(v0, v0); 2848 __ BIND(L_rounds_52); 2849 __ aese(v0, v19); __ aesmc(v0, v0); 2850 __ aese(v0, v20); 
__ aesmc(v0, v0); 2851 __ BIND(L_rounds_44); 2852 __ aese(v0, v21); __ aesmc(v0, v0); 2853 __ aese(v0, v22); __ aesmc(v0, v0); 2854 __ aese(v0, v23); __ aesmc(v0, v0); 2855 __ aese(v0, v24); __ aesmc(v0, v0); 2856 __ aese(v0, v25); __ aesmc(v0, v0); 2857 __ aese(v0, v26); __ aesmc(v0, v0); 2858 __ aese(v0, v27); __ aesmc(v0, v0); 2859 __ aese(v0, v28); __ aesmc(v0, v0); 2860 __ aese(v0, v29); __ aesmc(v0, v0); 2861 __ aese(v0, v30); 2862 __ eor(v0, __ T16B, v0, v31); 2863 2864 __ st1(v0, __ T16B, __ post(to, 16)); 2865 2866 __ subw(len_reg, len_reg, 16); 2867 __ cbnzw(len_reg, L_aes_loop); 2868 2869 __ st1(v0, __ T16B, rvec); 2870 2871 __ mov(r0, rscratch2); 2872 2873 __ leave(); 2874 __ ret(lr); 2875 2876 return start; 2877 } 2878 2879 // Arguments: 2880 // 2881 // Inputs: 2882 // c_rarg0 - source byte array address 2883 // c_rarg1 - destination byte array address 2884 // c_rarg2 - K (key) in little endian int array 2885 // c_rarg3 - r vector byte array address 2886 // c_rarg4 - input length 2887 // 2888 // Output: 2889 // r0 - input length 2890 // 2891 address generate_cipherBlockChaining_decryptAESCrypt() { 2892 assert(UseAES, "need AES cryptographic extension support"); 2893 __ align(CodeEntryAlignment); 2894 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id; 2895 StubCodeMark mark(this, stub_id); 2896 2897 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2898 2899 const Register from = c_rarg0; // source array address 2900 const Register to = c_rarg1; // destination array address 2901 const Register key = c_rarg2; // key array address 2902 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2903 // and left with the results of the last encryption block 2904 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2905 const Register keylen = rscratch1; 2906 2907 address start = __ pc(); 2908 2909 __ enter(); 2910 2911 __ movw(rscratch2, len_reg); 2912 2913 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2914 2915 __ ld1(v2, __ T16B, rvec); 2916 2917 __ ld1(v31, __ T16B, __ post(key, 16)); 2918 __ rev32(v31, __ T16B, v31); 2919 2920 __ cmpw(keylen, 52); 2921 __ br(Assembler::CC, L_loadkeys_44); 2922 __ br(Assembler::EQ, L_loadkeys_52); 2923 2924 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2925 __ rev32(v17, __ T16B, v17); 2926 __ rev32(v18, __ T16B, v18); 2927 __ BIND(L_loadkeys_52); 2928 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2929 __ rev32(v19, __ T16B, v19); 2930 __ rev32(v20, __ T16B, v20); 2931 __ BIND(L_loadkeys_44); 2932 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2933 __ rev32(v21, __ T16B, v21); 2934 __ rev32(v22, __ T16B, v22); 2935 __ rev32(v23, __ T16B, v23); 2936 __ rev32(v24, __ T16B, v24); 2937 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2938 __ rev32(v25, __ T16B, v25); 2939 __ rev32(v26, __ T16B, v26); 2940 __ rev32(v27, __ T16B, v27); 2941 __ rev32(v28, __ T16B, v28); 2942 __ ld1(v29, v30, __ T16B, key); 2943 __ rev32(v29, __ T16B, v29); 2944 __ rev32(v30, __ T16B, v30); 2945 2946 __ BIND(L_aes_loop); 2947 __ ld1(v0, __ T16B, __ post(from, 16)); 2948 __ orr(v1, __ T16B, v0, v0); 2949 2950 __ br(Assembler::CC, L_rounds_44); 2951 __ br(Assembler::EQ, L_rounds_52); 2952 2953 __ aesd(v0, v17); __ aesimc(v0, v0); 2954 __ aesd(v0, v18); __ aesimc(v0, v0); 2955 __ BIND(L_rounds_52); 2956 __ aesd(v0, v19); __ aesimc(v0, v0); 2957 __ aesd(v0, v20); __ aesimc(v0, v0); 2958 
__ BIND(L_rounds_44); 2959 __ aesd(v0, v21); __ aesimc(v0, v0); 2960 __ aesd(v0, v22); __ aesimc(v0, v0); 2961 __ aesd(v0, v23); __ aesimc(v0, v0); 2962 __ aesd(v0, v24); __ aesimc(v0, v0); 2963 __ aesd(v0, v25); __ aesimc(v0, v0); 2964 __ aesd(v0, v26); __ aesimc(v0, v0); 2965 __ aesd(v0, v27); __ aesimc(v0, v0); 2966 __ aesd(v0, v28); __ aesimc(v0, v0); 2967 __ aesd(v0, v29); __ aesimc(v0, v0); 2968 __ aesd(v0, v30); 2969 __ eor(v0, __ T16B, v0, v31); 2970 __ eor(v0, __ T16B, v0, v2); 2971 2972 __ st1(v0, __ T16B, __ post(to, 16)); 2973 __ orr(v2, __ T16B, v1, v1); 2974 2975 __ subw(len_reg, len_reg, 16); 2976 __ cbnzw(len_reg, L_aes_loop); 2977 2978 __ st1(v2, __ T16B, rvec); 2979 2980 __ mov(r0, rscratch2); 2981 2982 __ leave(); 2983 __ ret(lr); 2984 2985 return start; 2986 } 2987 2988 // Big-endian 128-bit + 64-bit -> 128-bit addition. 2989 // Inputs: 128-bits. in is preserved. 2990 // The least-significant 64-bit word is in the upper dword of each vector. 2991 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 2992 // Output: result 2993 void be_add_128_64(FloatRegister result, FloatRegister in, 2994 FloatRegister inc, FloatRegister tmp) { 2995 assert_different_registers(result, tmp, inc); 2996 2997 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 2998 // input 2999 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3000 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3001 // MSD == 0 (must be!) to LSD 3002 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3003 } 3004 3005 // CTR AES crypt. 3006 // Arguments: 3007 // 3008 // Inputs: 3009 // c_rarg0 - source byte array address 3010 // c_rarg1 - destination byte array address 3011 // c_rarg2 - K (key) in little endian int array 3012 // c_rarg3 - counter vector byte array address 3013 // c_rarg4 - input length 3014 // c_rarg5 - saved encryptedCounter start 3015 // c_rarg6 - saved used length 3016 // 3017 // Output: 3018 // r0 - input length 3019 // 3020 address generate_counterMode_AESCrypt() { 3021 const Register in = c_rarg0; 3022 const Register out = c_rarg1; 3023 const Register key = c_rarg2; 3024 const Register counter = c_rarg3; 3025 const Register saved_len = c_rarg4, len = r10; 3026 const Register saved_encrypted_ctr = c_rarg5; 3027 const Register used_ptr = c_rarg6, used = r12; 3028 3029 const Register offset = r7; 3030 const Register keylen = r11; 3031 3032 const unsigned char block_size = 16; 3033 const int bulk_width = 4; 3034 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3035 // performance with larger data sizes, but it also means that the 3036 // fast path isn't used until you have at least 8 blocks, and up 3037 // to 127 bytes of data will be executed on the slow path. For 3038 // that reason, and also so as not to blow away too much icache, 4 3039 // blocks seems like a sensible compromise. 
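    // Concretely (illustrative): with bulk_width == 4 the wide
    // CTR_large_block path below consumes 4 * 16 == 64 bytes per
    // iteration, so inputs shorter than 64 bytes, as well as the tail of
    // longer inputs, are handled by the single-block loop that follows.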
3040 3041 // Algorithm: 3042 // 3043 // if (len == 0) { 3044 // goto DONE; 3045 // } 3046 // int result = len; 3047 // do { 3048 // if (used >= blockSize) { 3049 // if (len >= bulk_width * blockSize) { 3050 // CTR_large_block(); 3051 // if (len == 0) 3052 // goto DONE; 3053 // } 3054 // for (;;) { 3055 // 16ByteVector v0 = counter; 3056 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3057 // used = 0; 3058 // if (len < blockSize) 3059 // break; /* goto NEXT */ 3060 // 16ByteVector v1 = load16Bytes(in, offset); 3061 // v1 = v1 ^ encryptedCounter; 3062 // store16Bytes(out, offset); 3063 // used = blockSize; 3064 // offset += blockSize; 3065 // len -= blockSize; 3066 // if (len == 0) 3067 // goto DONE; 3068 // } 3069 // } 3070 // NEXT: 3071 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3072 // len--; 3073 // } while (len != 0); 3074 // DONE: 3075 // return result; 3076 // 3077 // CTR_large_block() 3078 // Wide bulk encryption of whole blocks. 3079 3080 __ align(CodeEntryAlignment); 3081 StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id; 3082 StubCodeMark mark(this, stub_id); 3083 const address start = __ pc(); 3084 __ enter(); 3085 3086 Label DONE, CTR_large_block, large_block_return; 3087 __ ldrw(used, Address(used_ptr)); 3088 __ cbzw(saved_len, DONE); 3089 3090 __ mov(len, saved_len); 3091 __ mov(offset, 0); 3092 3093 // Compute #rounds for AES based on the length of the key array 3094 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3095 3096 __ aesenc_loadkeys(key, keylen); 3097 3098 { 3099 Label L_CTR_loop, NEXT; 3100 3101 __ bind(L_CTR_loop); 3102 3103 __ cmp(used, block_size); 3104 __ br(__ LO, NEXT); 3105 3106 // Maybe we have a lot of data 3107 __ subsw(rscratch1, len, bulk_width * block_size); 3108 __ br(__ HS, CTR_large_block); 3109 __ BIND(large_block_return); 3110 __ cbzw(len, DONE); 3111 3112 // Setup the counter 3113 __ movi(v4, __ T4S, 0); 3114 __ movi(v5, __ T4S, 1); 3115 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3116 3117 // 128-bit big-endian increment 3118 __ ld1(v0, __ T16B, counter); 3119 __ rev64(v16, __ T16B, v0); 3120 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3121 __ rev64(v16, __ T16B, v16); 3122 __ st1(v16, __ T16B, counter); 3123 // Previous counter value is in v0 3124 // v4 contains { 0, 1 } 3125 3126 { 3127 // We have fewer than bulk_width blocks of data left. Encrypt 3128 // them one by one until there is less than a full block 3129 // remaining, being careful to save both the encrypted counter 3130 // and the counter. 3131 3132 Label inner_loop; 3133 __ bind(inner_loop); 3134 // Counter to encrypt is in v0 3135 __ aesecb_encrypt(noreg, noreg, keylen); 3136 __ st1(v0, __ T16B, saved_encrypted_ctr); 3137 3138 // Do we have a remaining full block? 
3139 3140 __ mov(used, 0); 3141 __ cmp(len, block_size); 3142 __ br(__ LO, NEXT); 3143 3144 // Yes, we have a full block 3145 __ ldrq(v1, Address(in, offset)); 3146 __ eor(v1, __ T16B, v1, v0); 3147 __ strq(v1, Address(out, offset)); 3148 __ mov(used, block_size); 3149 __ add(offset, offset, block_size); 3150 3151 __ subw(len, len, block_size); 3152 __ cbzw(len, DONE); 3153 3154 // Increment the counter, store it back 3155 __ orr(v0, __ T16B, v16, v16); 3156 __ rev64(v16, __ T16B, v16); 3157 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3158 __ rev64(v16, __ T16B, v16); 3159 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3160 3161 __ b(inner_loop); 3162 } 3163 3164 __ BIND(NEXT); 3165 3166 // Encrypt a single byte, and loop. 3167 // We expect this to be a rare event. 3168 __ ldrb(rscratch1, Address(in, offset)); 3169 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3170 __ eor(rscratch1, rscratch1, rscratch2); 3171 __ strb(rscratch1, Address(out, offset)); 3172 __ add(offset, offset, 1); 3173 __ add(used, used, 1); 3174 __ subw(len, len,1); 3175 __ cbnzw(len, L_CTR_loop); 3176 } 3177 3178 __ bind(DONE); 3179 __ strw(used, Address(used_ptr)); 3180 __ mov(r0, saved_len); 3181 3182 __ leave(); // required for proper stackwalking of RuntimeStub frame 3183 __ ret(lr); 3184 3185 // Bulk encryption 3186 3187 __ BIND (CTR_large_block); 3188 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3189 3190 if (bulk_width == 8) { 3191 __ sub(sp, sp, 4 * 16); 3192 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3193 } 3194 __ sub(sp, sp, 4 * 16); 3195 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3196 RegSet saved_regs = (RegSet::of(in, out, offset) 3197 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3198 __ push(saved_regs, sp); 3199 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3200 __ add(in, in, offset); 3201 __ add(out, out, offset); 3202 3203 // Keys should already be loaded into the correct registers 3204 3205 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3206 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3207 3208 // AES/CTR loop 3209 { 3210 Label L_CTR_loop; 3211 __ BIND(L_CTR_loop); 3212 3213 // Setup the counters 3214 __ movi(v8, __ T4S, 0); 3215 __ movi(v9, __ T4S, 1); 3216 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3217 3218 for (int i = 0; i < bulk_width; i++) { 3219 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3220 __ rev64(v0_ofs, __ T16B, v16); 3221 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3222 } 3223 3224 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3225 3226 // Encrypt the counters 3227 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3228 3229 if (bulk_width == 8) { 3230 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3231 } 3232 3233 // XOR the encrypted counters with the inputs 3234 for (int i = 0; i < bulk_width; i++) { 3235 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3236 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3237 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3238 } 3239 3240 // Write the encrypted data 3241 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3242 if (bulk_width == 8) { 3243 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3244 } 3245 3246 __ subw(len, len, 16 * bulk_width); 3247 __ cbnzw(len, L_CTR_loop); 3248 } 3249 3250 // Save the counter back where it goes 3251 __ rev64(v16, __ T16B, v16); 3252 __ st1(v16, __ T16B, counter); 3253 3254 __ pop(saved_regs, sp); 
3255 3256 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3257 if (bulk_width == 8) { 3258 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3259 } 3260 3261 __ andr(rscratch1, len, -16 * bulk_width); 3262 __ sub(len, len, rscratch1); 3263 __ add(offset, offset, rscratch1); 3264 __ mov(used, 16); 3265 __ strw(used, Address(used_ptr)); 3266 __ b(large_block_return); 3267 3268 return start; 3269 } 3270 3271 // Vector AES Galois Counter Mode implementation. Parameters: 3272 // 3273 // in = c_rarg0 3274 // len = c_rarg1 3275 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3276 // out = c_rarg3 3277 // key = c_rarg4 3278 // state = c_rarg5 - GHASH.state 3279 // subkeyHtbl = c_rarg6 - powers of H 3280 // counter = c_rarg7 - 16 bytes of CTR 3281 // return - number of processed bytes 3282 address generate_galoisCounterMode_AESCrypt() { 3283 address ghash_polynomial = __ pc(); 3284 __ emit_int64(0x87); // The low-order bits of the field 3285 // polynomial (i.e. p = z^7+z^2+z+1) 3286 // repeated in the low and high parts of a 3287 // 128-bit vector 3288 __ emit_int64(0x87); 3289 3290 __ align(CodeEntryAlignment); 3291 StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id; 3292 StubCodeMark mark(this, stub_id); 3293 address start = __ pc(); 3294 __ enter(); 3295 3296 const Register in = c_rarg0; 3297 const Register len = c_rarg1; 3298 const Register ct = c_rarg2; 3299 const Register out = c_rarg3; 3300 // and updated with the incremented counter in the end 3301 3302 const Register key = c_rarg4; 3303 const Register state = c_rarg5; 3304 3305 const Register subkeyHtbl = c_rarg6; 3306 3307 const Register counter = c_rarg7; 3308 3309 const Register keylen = r10; 3310 // Save state before entering routine 3311 __ sub(sp, sp, 4 * 16); 3312 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3313 __ sub(sp, sp, 4 * 16); 3314 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3315 3316 // __ andr(len, len, -512); 3317 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3318 __ str(len, __ pre(sp, -2 * wordSize)); 3319 3320 Label DONE; 3321 __ cbz(len, DONE); 3322 3323 // Compute #rounds for AES based on the length of the key array 3324 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3325 3326 __ aesenc_loadkeys(key, keylen); 3327 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3328 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3329 3330 // AES/CTR loop 3331 { 3332 Label L_CTR_loop; 3333 __ BIND(L_CTR_loop); 3334 3335 // Setup the counters 3336 __ movi(v8, __ T4S, 0); 3337 __ movi(v9, __ T4S, 1); 3338 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3339 3340 assert(v0->encoding() < v8->encoding(), ""); 3341 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3342 FloatRegister f = as_FloatRegister(i); 3343 __ rev32(f, __ T16B, v16); 3344 __ addv(v16, __ T4S, v16, v8); 3345 } 3346 3347 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3348 3349 // Encrypt the counters 3350 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3351 3352 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3353 3354 // XOR the encrypted counters with the inputs 3355 for (int i = 0; i < 8; i++) { 3356 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3357 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3358 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3359 } 3360 __ st1(v0, v1, v2, v3, __ T16B, __ 
post(out, 4 * 16)); 3361 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3362 3363 __ subw(len, len, 16 * 8); 3364 __ cbnzw(len, L_CTR_loop); 3365 } 3366 3367 __ rev32(v16, __ T16B, v16); 3368 __ st1(v16, __ T16B, counter); 3369 3370 __ ldr(len, Address(sp)); 3371 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3372 3373 // GHASH/CTR loop 3374 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3375 len, /*unrolls*/4); 3376 3377 #ifdef ASSERT 3378 { Label L; 3379 __ cmp(len, (unsigned char)0); 3380 __ br(Assembler::EQ, L); 3381 __ stop("stubGenerator: abort"); 3382 __ bind(L); 3383 } 3384 #endif 3385 3386 __ bind(DONE); 3387 // Return the number of bytes processed 3388 __ ldr(r0, __ post(sp, 2 * wordSize)); 3389 3390 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3391 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3392 3393 __ leave(); // required for proper stackwalking of RuntimeStub frame 3394 __ ret(lr); 3395 return start; 3396 } 3397 3398 class Cached64Bytes { 3399 private: 3400 MacroAssembler *_masm; 3401 Register _regs[8]; 3402 3403 public: 3404 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3405 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3406 auto it = rs.begin(); 3407 for (auto &r: _regs) { 3408 r = *it; 3409 ++it; 3410 } 3411 } 3412 3413 void gen_loads(Register base) { 3414 for (int i = 0; i < 8; i += 2) { 3415 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3416 } 3417 } 3418 3419 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3420 void extract_u32(Register dest, int i) { 3421 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3422 } 3423 }; 3424 3425 // Utility routines for md5. 3426 // Clobbers r10 and r11. 
3427 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3428 int k, int s, int t) { 3429 Register rscratch3 = r10; 3430 Register rscratch4 = r11; 3431 3432 __ eorw(rscratch3, r3, r4); 3433 __ movw(rscratch2, t); 3434 __ andw(rscratch3, rscratch3, r2); 3435 __ addw(rscratch4, r1, rscratch2); 3436 reg_cache.extract_u32(rscratch1, k); 3437 __ eorw(rscratch3, rscratch3, r4); 3438 __ addw(rscratch4, rscratch4, rscratch1); 3439 __ addw(rscratch3, rscratch3, rscratch4); 3440 __ rorw(rscratch2, rscratch3, 32 - s); 3441 __ addw(r1, rscratch2, r2); 3442 } 3443 3444 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3445 int k, int s, int t) { 3446 Register rscratch3 = r10; 3447 Register rscratch4 = r11; 3448 3449 reg_cache.extract_u32(rscratch1, k); 3450 __ movw(rscratch2, t); 3451 __ addw(rscratch4, r1, rscratch2); 3452 __ addw(rscratch4, rscratch4, rscratch1); 3453 __ bicw(rscratch2, r3, r4); 3454 __ andw(rscratch3, r2, r4); 3455 __ addw(rscratch2, rscratch2, rscratch4); 3456 __ addw(rscratch2, rscratch2, rscratch3); 3457 __ rorw(rscratch2, rscratch2, 32 - s); 3458 __ addw(r1, rscratch2, r2); 3459 } 3460 3461 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3462 int k, int s, int t) { 3463 Register rscratch3 = r10; 3464 Register rscratch4 = r11; 3465 3466 __ eorw(rscratch3, r3, r4); 3467 __ movw(rscratch2, t); 3468 __ addw(rscratch4, r1, rscratch2); 3469 reg_cache.extract_u32(rscratch1, k); 3470 __ eorw(rscratch3, rscratch3, r2); 3471 __ addw(rscratch4, rscratch4, rscratch1); 3472 __ addw(rscratch3, rscratch3, rscratch4); 3473 __ rorw(rscratch2, rscratch3, 32 - s); 3474 __ addw(r1, rscratch2, r2); 3475 } 3476 3477 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3478 int k, int s, int t) { 3479 Register rscratch3 = r10; 3480 Register rscratch4 = r11; 3481 3482 __ movw(rscratch3, t); 3483 __ ornw(rscratch2, r2, r4); 3484 __ addw(rscratch4, r1, rscratch3); 3485 reg_cache.extract_u32(rscratch1, k); 3486 __ eorw(rscratch3, rscratch2, r3); 3487 __ addw(rscratch4, rscratch4, rscratch1); 3488 __ addw(rscratch3, rscratch3, rscratch4); 3489 __ rorw(rscratch2, rscratch3, 32 - s); 3490 __ addw(r1, rscratch2, r2); 3491 } 3492 3493 // Arguments: 3494 // 3495 // Inputs: 3496 // c_rarg0 - byte[] source+offset 3497 // c_rarg1 - int[] SHA.state 3498 // c_rarg2 - int offset 3499 // c_rarg3 - int limit 3500 // 3501 address generate_md5_implCompress(StubGenStubId stub_id) { 3502 bool multi_block; 3503 switch (stub_id) { 3504 case md5_implCompress_id: 3505 multi_block = false; 3506 break; 3507 case md5_implCompressMB_id: 3508 multi_block = true; 3509 break; 3510 default: 3511 ShouldNotReachHere(); 3512 } 3513 __ align(CodeEntryAlignment); 3514 3515 StubCodeMark mark(this, stub_id); 3516 address start = __ pc(); 3517 3518 Register buf = c_rarg0; 3519 Register state = c_rarg1; 3520 Register ofs = c_rarg2; 3521 Register limit = c_rarg3; 3522 Register a = r4; 3523 Register b = r5; 3524 Register c = r6; 3525 Register d = r7; 3526 Register rscratch3 = r10; 3527 Register rscratch4 = r11; 3528 3529 Register state_regs[2] = { r12, r13 }; 3530 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3531 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3532 3533 __ push(saved_regs, sp); 3534 3535 __ ldp(state_regs[0], state_regs[1], Address(state)); 3536 __ ubfx(a, state_regs[0], 0, 32); 3537 __ ubfx(b, state_regs[0], 32, 32); 3538 __ 
ubfx(c, state_regs[1], 0, 32); 3539 __ ubfx(d, state_regs[1], 32, 32); 3540 3541 Label md5_loop; 3542 __ BIND(md5_loop); 3543 3544 reg_cache.gen_loads(buf); 3545 3546 // Round 1 3547 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3548 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3549 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3550 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3551 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3552 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3553 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3554 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3555 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3556 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3557 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3558 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3559 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3560 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3561 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3562 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3563 3564 // Round 2 3565 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3566 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3567 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3568 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3569 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3570 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3571 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3572 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3573 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3574 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3575 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3576 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3577 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3578 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3579 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3580 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3581 3582 // Round 3 3583 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3584 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3585 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3586 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3587 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3588 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3589 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3590 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3591 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3592 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3593 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3594 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3595 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3596 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3597 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3598 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3599 3600 // Round 4 3601 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3602 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3603 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3604 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3605 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3606 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3607 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3608 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3609 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3610 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3611 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3612 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3613 
md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3614 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3615 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3616 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3617 3618 __ addw(a, state_regs[0], a); 3619 __ ubfx(rscratch2, state_regs[0], 32, 32); 3620 __ addw(b, rscratch2, b); 3621 __ addw(c, state_regs[1], c); 3622 __ ubfx(rscratch4, state_regs[1], 32, 32); 3623 __ addw(d, rscratch4, d); 3624 3625 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3626 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3627 3628 if (multi_block) { 3629 __ add(buf, buf, 64); 3630 __ add(ofs, ofs, 64); 3631 __ cmp(ofs, limit); 3632 __ br(Assembler::LE, md5_loop); 3633 __ mov(c_rarg0, ofs); // return ofs 3634 } 3635 3636 // write hash values back in the correct order 3637 __ stp(state_regs[0], state_regs[1], Address(state)); 3638 3639 __ pop(saved_regs, sp); 3640 3641 __ ret(lr); 3642 3643 return start; 3644 } 3645 3646 // Arguments: 3647 // 3648 // Inputs: 3649 // c_rarg0 - byte[] source+offset 3650 // c_rarg1 - int[] SHA.state 3651 // c_rarg2 - int offset 3652 // c_rarg3 - int limit 3653 // 3654 address generate_sha1_implCompress(StubGenStubId stub_id) { 3655 bool multi_block; 3656 switch (stub_id) { 3657 case sha1_implCompress_id: 3658 multi_block = false; 3659 break; 3660 case sha1_implCompressMB_id: 3661 multi_block = true; 3662 break; 3663 default: 3664 ShouldNotReachHere(); 3665 } 3666 3667 __ align(CodeEntryAlignment); 3668 3669 StubCodeMark mark(this, stub_id); 3670 address start = __ pc(); 3671 3672 Register buf = c_rarg0; 3673 Register state = c_rarg1; 3674 Register ofs = c_rarg2; 3675 Register limit = c_rarg3; 3676 3677 Label keys; 3678 Label sha1_loop; 3679 3680 // load the keys into v0..v3 3681 __ adr(rscratch1, keys); 3682 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3683 // load 5 words state into v6, v7 3684 __ ldrq(v6, Address(state, 0)); 3685 __ ldrs(v7, Address(state, 16)); 3686 3687 3688 __ BIND(sha1_loop); 3689 // load 64 bytes of data into v16..v19 3690 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3691 __ rev32(v16, __ T16B, v16); 3692 __ rev32(v17, __ T16B, v17); 3693 __ rev32(v18, __ T16B, v18); 3694 __ rev32(v19, __ T16B, v19); 3695 3696 // do the sha1 3697 __ addv(v4, __ T4S, v16, v0); 3698 __ orr(v20, __ T16B, v6, v6); 3699 3700 FloatRegister d0 = v16; 3701 FloatRegister d1 = v17; 3702 FloatRegister d2 = v18; 3703 FloatRegister d3 = v19; 3704 3705 for (int round = 0; round < 20; round++) { 3706 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3707 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3708 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3709 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3710 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3711 3712 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3713 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3714 __ sha1h(tmp2, __ T4S, v20); 3715 if (round < 5) 3716 __ sha1c(v20, __ T4S, tmp3, tmp4); 3717 else if (round < 10 || round >= 15) 3718 __ sha1p(v20, __ T4S, tmp3, tmp4); 3719 else 3720 __ sha1m(v20, __ T4S, tmp3, tmp4); 3721 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3722 3723 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3724 } 3725 3726 __ addv(v7, __ T2S, v7, v21); 3727 __ addv(v6, __ T4S, v6, v20); 3728 3729 if (multi_block) { 3730 __ add(ofs, ofs, 64); 3731 __ cmp(ofs, limit); 3732 __ br(Assembler::LE, sha1_loop); 3733 __ mov(c_rarg0, ofs); // return ofs 3734 } 3735 3736 __ strq(v6, Address(state, 0)); 3737 __ strs(v7, Address(state, 16)); 3738 3739 __ ret(lr); 3740 3741 __ bind(keys); 3742 __ emit_int32(0x5a827999); 3743 __ emit_int32(0x6ed9eba1); 3744 __ emit_int32(0x8f1bbcdc); 3745 __ emit_int32(0xca62c1d6); 3746 3747 return start; 3748 } 3749 3750 3751 // Arguments: 3752 // 3753 // Inputs: 3754 // c_rarg0 - byte[] source+offset 3755 // c_rarg1 - int[] SHA.state 3756 // c_rarg2 - int offset 3757 // c_rarg3 - int limit 3758 // 3759 address generate_sha256_implCompress(StubGenStubId stub_id) { 3760 bool multi_block; 3761 switch (stub_id) { 3762 case sha256_implCompress_id: 3763 multi_block = false; 3764 break; 3765 case sha256_implCompressMB_id: 3766 multi_block = true; 3767 break; 3768 default: 3769 ShouldNotReachHere(); 3770 } 3771 3772 static const uint32_t round_consts[64] = { 3773 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3774 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3775 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3776 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3777 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3778 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3779 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3780 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3781 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3782 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3783 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3784 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3785 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3786 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3787 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3788 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3789 }; 3790 3791 __ align(CodeEntryAlignment); 3792 3793 StubCodeMark mark(this, stub_id); 3794 address start = __ pc(); 3795 3796 Register buf = c_rarg0; 3797 Register state = c_rarg1; 3798 Register ofs = c_rarg2; 3799 Register limit = c_rarg3; 3800 3801 Label sha1_loop; 3802 3803 __ stpd(v8, v9, __ pre(sp, -32)); 3804 __ stpd(v10, v11, Address(sp, 16)); 3805 3806 // dga == v0 3807 // dgb == v1 3808 // dg0 == v2 3809 // dg1 == v3 3810 // dg2 == v4 3811 // t0 == v6 3812 // t1 == v7 3813 3814 // load 16 keys to v16..v31 3815 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3816 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3817 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3818 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3819 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3820 3821 // load 8 words (256 bits) state 3822 __ ldpq(v0, v1, state); 3823 3824 __ BIND(sha1_loop); 3825 // load 64 bytes of data into v8..v11 3826 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3827 __ rev32(v8, __ T16B, v8); 3828 __ rev32(v9, __ T16B, v9); 3829 __ rev32(v10, __ T16B, v10); 3830 __ rev32(v11, __ T16B, v11); 3831 3832 __ addv(v6, __ T4S, v8, v16); 3833 __ orr(v2, __ T16B, v0, v0); 3834 __ orr(v3, __ T16B, v1, v1); 3835 3836 FloatRegister d0 = v8; 3837 FloatRegister d1 = v9; 3838 FloatRegister d2 = v10; 3839 FloatRegister d3 = v11; 3840 3841 3842 for (int round = 0; round < 16; round++) { 3843 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3844 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3845 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3846 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3847 3848 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3849 __ orr(v4, __ T16B, v2, v2); 3850 if (round < 15) 3851 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3852 __ sha256h(v2, __ T4S, v3, tmp2); 3853 __ sha256h2(v3, __ T4S, v4, tmp2); 3854 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3855 3856 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3857 } 3858 3859 __ addv(v0, __ T4S, v0, v2); 3860 __ addv(v1, __ T4S, v1, v3); 3861 3862 if (multi_block) { 3863 __ add(ofs, ofs, 64); 3864 __ cmp(ofs, limit); 3865 __ br(Assembler::LE, sha1_loop); 3866 __ mov(c_rarg0, ofs); // return ofs 3867 } 3868 3869 __ ldpd(v10, v11, Address(sp, 16)); 3870 __ ldpd(v8, v9, __ post(sp, 32)); 3871 3872 __ stpq(v0, v1, state); 3873 3874 __ ret(lr); 3875 3876 return start; 3877 } 3878 3879 // Double rounds for sha512. 3880 void sha512_dround(int dr, 3881 FloatRegister vi0, FloatRegister vi1, 3882 FloatRegister vi2, FloatRegister vi3, 3883 FloatRegister vi4, FloatRegister vrc0, 3884 FloatRegister vrc1, FloatRegister vin0, 3885 FloatRegister vin1, FloatRegister vin2, 3886 FloatRegister vin3, FloatRegister vin4) { 3887 if (dr < 36) { 3888 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3889 } 3890 __ addv(v5, __ T2D, vrc0, vin0); 3891 __ ext(v6, __ T16B, vi2, vi3, 8); 3892 __ ext(v5, __ T16B, v5, v5, 8); 3893 __ ext(v7, __ T16B, vi1, vi2, 8); 3894 __ addv(vi3, __ T2D, vi3, v5); 3895 if (dr < 32) { 3896 __ ext(v5, __ T16B, vin3, vin4, 8); 3897 __ sha512su0(vin0, __ T2D, vin1); 3898 } 3899 __ sha512h(vi3, __ T2D, v6, v7); 3900 if (dr < 32) { 3901 __ sha512su1(vin0, __ T2D, vin2, v5); 3902 } 3903 __ addv(vi4, __ T2D, vi1, vi3); 3904 __ sha512h2(vi3, __ T2D, vi1, vi0); 3905 } 3906 3907 // Arguments: 3908 // 3909 // Inputs: 3910 // c_rarg0 - byte[] source+offset 3911 // c_rarg1 - int[] SHA.state 3912 // c_rarg2 - int offset 3913 // c_rarg3 - int limit 3914 // 3915 address generate_sha512_implCompress(StubGenStubId stub_id) { 3916 bool multi_block; 3917 switch (stub_id) { 3918 case sha512_implCompress_id: 3919 multi_block = false; 3920 break; 3921 case sha512_implCompressMB_id: 3922 multi_block = true; 3923 break; 3924 default: 3925 ShouldNotReachHere(); 3926 } 3927 3928 static const uint64_t round_consts[80] = { 3929 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3930 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3931 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3932 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3933 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3934 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3935 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3936 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3937 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3938 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 
3939 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3940 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3941 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3942 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3943 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3944 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3945 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3946 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3947 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3948 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3949 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3950 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3951 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3952 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3953 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3954 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3955 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3956 }; 3957 3958 __ align(CodeEntryAlignment); 3959 3960 StubCodeMark mark(this, stub_id); 3961 address start = __ pc(); 3962 3963 Register buf = c_rarg0; 3964 Register state = c_rarg1; 3965 Register ofs = c_rarg2; 3966 Register limit = c_rarg3; 3967 3968 __ stpd(v8, v9, __ pre(sp, -64)); 3969 __ stpd(v10, v11, Address(sp, 16)); 3970 __ stpd(v12, v13, Address(sp, 32)); 3971 __ stpd(v14, v15, Address(sp, 48)); 3972 3973 Label sha512_loop; 3974 3975 // load state 3976 __ ld1(v8, v9, v10, v11, __ T2D, state); 3977 3978 // load first 4 round constants 3979 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3980 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3981 3982 __ BIND(sha512_loop); 3983 // load 128B of data into v12..v19 3984 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3985 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3986 __ rev64(v12, __ T16B, v12); 3987 __ rev64(v13, __ T16B, v13); 3988 __ rev64(v14, __ T16B, v14); 3989 __ rev64(v15, __ T16B, v15); 3990 __ rev64(v16, __ T16B, v16); 3991 __ rev64(v17, __ T16B, v17); 3992 __ rev64(v18, __ T16B, v18); 3993 __ rev64(v19, __ T16B, v19); 3994 3995 __ mov(rscratch2, rscratch1); 3996 3997 __ mov(v0, __ T16B, v8); 3998 __ mov(v1, __ T16B, v9); 3999 __ mov(v2, __ T16B, v10); 4000 __ mov(v3, __ T16B, v11); 4001 4002 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4003 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4004 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4005 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4006 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4007 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4008 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4009 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4010 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4011 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4012 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4013 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4014 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4015 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, 
v14); 4016 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4017 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4018 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4019 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4020 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4021 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4022 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4023 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4024 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4025 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4026 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4027 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4028 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4029 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4030 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4031 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4032 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4033 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4034 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 4035 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4036 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4037 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4038 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4039 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4040 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4041 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4042 4043 __ addv(v8, __ T2D, v8, v0); 4044 __ addv(v9, __ T2D, v9, v1); 4045 __ addv(v10, __ T2D, v10, v2); 4046 __ addv(v11, __ T2D, v11, v3); 4047 4048 if (multi_block) { 4049 __ add(ofs, ofs, 128); 4050 __ cmp(ofs, limit); 4051 __ br(Assembler::LE, sha512_loop); 4052 __ mov(c_rarg0, ofs); // return ofs 4053 } 4054 4055 __ st1(v8, v9, v10, v11, __ T2D, state); 4056 4057 __ ldpd(v14, v15, Address(sp, 48)); 4058 __ ldpd(v12, v13, Address(sp, 32)); 4059 __ ldpd(v10, v11, Address(sp, 16)); 4060 __ ldpd(v8, v9, __ post(sp, 64)); 4061 4062 __ ret(lr); 4063 4064 return start; 4065 } 4066 4067 // Execute one round of keccak of two computations in parallel. 4068 // One of the states should be loaded into the lower halves of 4069 // the vector registers v0-v24, the other should be loaded into 4070 // the upper halves of those registers. The ld1r instruction loads 4071 // the round constant into both halves of register v31. 4072 // Intermediate results c0...c5 and d0...d5 are computed 4073 // in registers v25...v30. 4074 // All vector instructions that are used operate on both register 4075 // halves in parallel. 4076 // If only a single computation is needed, one can only load the lower halves. 
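  // For reference only (not generated code): one Keccak-f[1600] round on a
  // 5x5 lane state a[0..24] (index x + 5*y) consists of the following steps;
  // the instruction sequence below merges them, operating on both states in
  // the two 64-bit halves of each vector register:
  //
  //   theta: c[x] = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20]      (eor3)
  //          d[x] = c[(x+4)%5] ^ rol(c[(x+1)%5], 1)                  (rax1)
  //          a[i] ^= d[i%5]
  //   rho/pi: each lane is rotated and moved to its new position      (xar)
  //   chi:   a[x,y] ^= ~a[x+1,y] & a[x+2,y], x taken mod 5 per row    (bcax)
  //   iota:  a[0] ^= round_constant                                   (ld1r + eor)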
  void keccak_round(Register rscratch1) {
    __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
    __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
    __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
    __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
    __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
    __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
    __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
    __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
    __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
    __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22

    __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
    __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
    __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
    __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
    __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)

    __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
    __ xar(v29, __ T2D, v1, v25, (64 - 1));   // a10' = rol((a1^d1), 1)
    __ xar(v1, __ T2D, v6, v25, (64 - 44));   // a1 = rol((a6^d1), 44)
    __ xar(v6, __ T2D, v9, v28, (64 - 20));   // a6 = rol((a9^d4), 20)
    __ xar(v9, __ T2D, v22, v26, (64 - 61));  // a9 = rol((a22^d2), 61)
    __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
    __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
    __ xar(v31, __ T2D, v2, v26, (64 - 62));  // a20' = rol((a2^d2), 62)
    __ xar(v2, __ T2D, v12, v26, (64 - 43));  // a2 = rol((a12^d2), 43)
    __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
    __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
    __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
    __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
    __ xar(v15, __ T2D, v4, v28, (64 - 27));  // a15 = rol((a4^d4), 27)
    __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
    __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
    __ xar(v8, __ T2D, v8, v27, (64 - 55));   // a21' = rol((a8^d3), 55)
    __ xar(v4, __ T2D, v16, v25, (64 - 45));  // a8' = rol((a16^d1), 45)
    __ xar(v16, __ T2D, v5, v30, (64 - 36));  // a16 = rol((a5^d0), 36)
    __ xar(v5, __ T2D, v3, v27, (64 - 28));   // a5 = rol((a3^d3), 28)
    __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
    __ xar(v3, __ T2D, v17, v26, (64 - 15));  // a18' = rol((a17^d2), 15)
    __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
    __ xar(v26, __ T2D, v7, v26, (64 - 6));   // a11' = rol((a7^d2), 6)
    __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)

    __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
    __ bcax(v21, __ T16B, v8, v23, v22);      // a21 = a21' ^ (~a22 & a23)
    __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
    __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
    __ bcax(v24, __ T16B, v24, v8, v31);      // a24 = a24 ^ (~a20' & a21')

    __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]

    __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
    __ bcax(v18, __ T16B, v3, v15, v19);      // a18 = a18' ^ (~a19 & a15')
    __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
    __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
    __ bcax(v16, __ T16B, v16, v3, v25);
// a16 = a16 ^ (~a17' & a18') 4134 4135 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4136 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4137 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4138 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4139 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4140 4141 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4142 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4143 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4144 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4145 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4146 4147 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4148 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4149 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4150 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4151 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4152 4153 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4154 } 4155 4156 // Arguments: 4157 // 4158 // Inputs: 4159 // c_rarg0 - byte[] source+offset 4160 // c_rarg1 - byte[] SHA.state 4161 // c_rarg2 - int block_size 4162 // c_rarg3 - int offset 4163 // c_rarg4 - int limit 4164 // 4165 address generate_sha3_implCompress(StubGenStubId stub_id) { 4166 bool multi_block; 4167 switch (stub_id) { 4168 case sha3_implCompress_id: 4169 multi_block = false; 4170 break; 4171 case sha3_implCompressMB_id: 4172 multi_block = true; 4173 break; 4174 default: 4175 ShouldNotReachHere(); 4176 } 4177 4178 static const uint64_t round_consts[24] = { 4179 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4180 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4181 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4182 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4183 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4184 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4185 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4186 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4187 }; 4188 4189 __ align(CodeEntryAlignment); 4190 4191 StubCodeMark mark(this, stub_id); 4192 address start = __ pc(); 4193 4194 Register buf = c_rarg0; 4195 Register state = c_rarg1; 4196 Register block_size = c_rarg2; 4197 Register ofs = c_rarg3; 4198 Register limit = c_rarg4; 4199 4200 Label sha3_loop, rounds24_loop; 4201 Label sha3_512_or_sha3_384, shake128; 4202 4203 __ stpd(v8, v9, __ pre(sp, -64)); 4204 __ stpd(v10, v11, Address(sp, 16)); 4205 __ stpd(v12, v13, Address(sp, 32)); 4206 __ stpd(v14, v15, Address(sp, 48)); 4207 4208 // load state 4209 __ add(rscratch1, state, 32); 4210 __ ld1(v0, v1, v2, v3, __ T1D, state); 4211 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4212 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4213 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4214 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4215 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4216 __ ld1(v24, __ T1D, rscratch1); 4217 4218 __ BIND(sha3_loop); 4219 4220 // 24 keccak rounds 4221 __ movw(rscratch2, 24); 4222 4223 // load round_constants base 4224 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4225 4226 // load input 4227 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4228 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 
24)); 4229 __ eor(v0, __ T8B, v0, v25); 4230 __ eor(v1, __ T8B, v1, v26); 4231 __ eor(v2, __ T8B, v2, v27); 4232 __ eor(v3, __ T8B, v3, v28); 4233 __ eor(v4, __ T8B, v4, v29); 4234 __ eor(v5, __ T8B, v5, v30); 4235 __ eor(v6, __ T8B, v6, v31); 4236 4237 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4238 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4239 4240 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4241 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4242 __ eor(v7, __ T8B, v7, v25); 4243 __ eor(v8, __ T8B, v8, v26); 4244 __ eor(v9, __ T8B, v9, v27); 4245 __ eor(v10, __ T8B, v10, v28); 4246 __ eor(v11, __ T8B, v11, v29); 4247 __ eor(v12, __ T8B, v12, v30); 4248 __ eor(v13, __ T8B, v13, v31); 4249 4250 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4251 __ eor(v14, __ T8B, v14, v25); 4252 __ eor(v15, __ T8B, v15, v26); 4253 __ eor(v16, __ T8B, v16, v27); 4254 4255 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4256 __ andw(c_rarg5, block_size, 48); 4257 __ cbzw(c_rarg5, rounds24_loop); 4258 4259 __ tbnz(block_size, 5, shake128); 4260 // block_size == 144, bit5 == 0, SHA3-224 4261 __ ldrd(v28, __ post(buf, 8)); 4262 __ eor(v17, __ T8B, v17, v28); 4263 __ b(rounds24_loop); 4264 4265 __ BIND(shake128); 4266 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4267 __ eor(v17, __ T8B, v17, v28); 4268 __ eor(v18, __ T8B, v18, v29); 4269 __ eor(v19, __ T8B, v19, v30); 4270 __ eor(v20, __ T8B, v20, v31); 4271 __ b(rounds24_loop); // block_size == 168, SHAKE128 4272 4273 __ BIND(sha3_512_or_sha3_384); 4274 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4275 __ eor(v7, __ T8B, v7, v25); 4276 __ eor(v8, __ T8B, v8, v26); 4277 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4278 4279 // SHA3-384 4280 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4281 __ eor(v9, __ T8B, v9, v27); 4282 __ eor(v10, __ T8B, v10, v28); 4283 __ eor(v11, __ T8B, v11, v29); 4284 __ eor(v12, __ T8B, v12, v30); 4285 4286 __ BIND(rounds24_loop); 4287 __ subw(rscratch2, rscratch2, 1); 4288 4289 keccak_round(rscratch1); 4290 4291 __ cbnzw(rscratch2, rounds24_loop); 4292 4293 if (multi_block) { 4294 __ add(ofs, ofs, block_size); 4295 __ cmp(ofs, limit); 4296 __ br(Assembler::LE, sha3_loop); 4297 __ mov(c_rarg0, ofs); // return ofs 4298 } 4299 4300 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4301 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4302 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4303 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4304 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4305 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4306 __ st1(v24, __ T1D, state); 4307 4308 // restore callee-saved registers 4309 __ ldpd(v14, v15, Address(sp, 48)); 4310 __ ldpd(v12, v13, Address(sp, 32)); 4311 __ ldpd(v10, v11, Address(sp, 16)); 4312 __ ldpd(v8, v9, __ post(sp, 64)); 4313 4314 __ ret(lr); 4315 4316 return start; 4317 } 4318 4319 // Inputs: 4320 // c_rarg0 - long[] state0 4321 // c_rarg1 - long[] state1 4322 address generate_double_keccak() { 4323 static const uint64_t round_consts[24] = { 4324 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4325 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4326 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4327 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4328 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4329 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4330 0x000000000000800AL, 
0x800000008000000AL, 0x8000000080008081L, 4331 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4332 }; 4333 4334 // Implements the double_keccak() method of the 4335 // sun.secyrity.provider.SHA3Parallel class 4336 __ align(CodeEntryAlignment); 4337 StubCodeMark mark(this, "StubRoutines", "double_keccak"); 4338 address start = __ pc(); 4339 __ enter(); 4340 4341 Register state0 = c_rarg0; 4342 Register state1 = c_rarg1; 4343 4344 Label rounds24_loop; 4345 4346 // save callee-saved registers 4347 __ stpd(v8, v9, __ pre(sp, -64)); 4348 __ stpd(v10, v11, Address(sp, 16)); 4349 __ stpd(v12, v13, Address(sp, 32)); 4350 __ stpd(v14, v15, Address(sp, 48)); 4351 4352 // load states 4353 __ add(rscratch1, state0, 32); 4354 __ ld4(v0, v1, v2, v3, __ D, 0, state0); 4355 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32)); 4356 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32)); 4357 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32)); 4358 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32)); 4359 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32)); 4360 __ ld1(v24, __ D, 0, rscratch1); 4361 __ add(rscratch1, state1, 32); 4362 __ ld4(v0, v1, v2, v3, __ D, 1, state1); 4363 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32)); 4364 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32)); 4365 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32)); 4366 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32)); 4367 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32)); 4368 __ ld1(v24, __ D, 1, rscratch1); 4369 4370 // 24 keccak rounds 4371 __ movw(rscratch2, 24); 4372 4373 // load round_constants base 4374 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4375 4376 __ BIND(rounds24_loop); 4377 __ subw(rscratch2, rscratch2, 1); 4378 keccak_round(rscratch1); 4379 __ cbnzw(rscratch2, rounds24_loop); 4380 4381 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32)); 4382 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32)); 4383 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32)); 4384 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32)); 4385 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32)); 4386 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32)); 4387 __ st1(v24, __ D, 0, state0); 4388 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32)); 4389 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32)); 4390 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32)); 4391 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32)); 4392 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32)); 4393 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32)); 4394 __ st1(v24, __ D, 1, state1); 4395 4396 // restore callee-saved vector registers 4397 __ ldpd(v14, v15, Address(sp, 48)); 4398 __ ldpd(v12, v13, Address(sp, 32)); 4399 __ ldpd(v10, v11, Address(sp, 16)); 4400 __ ldpd(v8, v9, __ post(sp, 64)); 4401 4402 __ leave(); // required for proper stackwalking of RuntimeStub frame 4403 __ mov(r0, zr); // return 0 4404 __ ret(lr); 4405 4406 return start; 4407 } 4408 4409 /** 4410 * Arguments: 4411 * 4412 * Inputs: 4413 * c_rarg0 - int crc 4414 * c_rarg1 - byte* buf 4415 * c_rarg2 - int length 4416 * 4417 * Output: 4418 * rax - int crc result 4419 */ 4420 address generate_updateBytesCRC32() { 4421 assert(UseCRC32Intrinsics, "what are we doing here?"); 4422 4423 __ align(CodeEntryAlignment); 4424 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id; 4425 StubCodeMark mark(this, stub_id); 4426 4427 address start = __ pc(); 4428 
4429 const Register crc = c_rarg0; // crc 4430 const Register buf = c_rarg1; // source java byte array address 4431 const Register len = c_rarg2; // length 4432 const Register table0 = c_rarg3; // crc_table address 4433 const Register table1 = c_rarg4; 4434 const Register table2 = c_rarg5; 4435 const Register table3 = c_rarg6; 4436 const Register tmp3 = c_rarg7; 4437 4438 BLOCK_COMMENT("Entry:"); 4439 __ enter(); // required for proper stackwalking of RuntimeStub frame 4440 4441 __ kernel_crc32(crc, buf, len, 4442 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4443 4444 __ leave(); // required for proper stackwalking of RuntimeStub frame 4445 __ ret(lr); 4446 4447 return start; 4448 } 4449 4450 // ChaCha20 block function. This version parallelizes 4 quarter 4451 // round operations at a time. It uses 16 SIMD registers to 4452 // produce 4 blocks of key stream. 4453 // 4454 // state (int[16]) = c_rarg0 4455 // keystream (byte[256]) = c_rarg1 4456 // return - number of bytes of keystream (always 256) 4457 // 4458 // In this approach, we load the 512-bit start state sequentially into 4459 // 4 128-bit vectors. We then make 4 4-vector copies of that starting 4460 // state, with each successive set of 4 vectors having a +1 added into 4461 // the first 32-bit lane of the 4th vector in that group (the counter). 4462 // By doing this, we can perform the block function on 4 512-bit blocks 4463 // within one run of this intrinsic. 4464 // The alignment of the data across the 4-vector group is such that at 4465 // the start it is already aligned for the first round of each two-round 4466 // loop iteration. In other words, the corresponding lanes of each vector 4467 // will contain the values needed for that quarter round operation (e.g. 4468 // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.). 4469 // In between each full round, a lane shift must occur. Within a loop 4470 // iteration, between the first and second rounds, the 2nd, 3rd, and 4th 4471 // vectors are rotated left 32, 64 and 96 bits, respectively. The result 4472 // is effectively a diagonal orientation in columnar form. After the 4473 // second full round, those registers are left-rotated again, this time 4474 // 96, 64, and 32 bits - returning the vectors to their columnar organization. 4475 // After all 10 iterations, the original state is added to each 4-vector 4476 // working state along with the add mask, and the 4 vector groups are 4477 // sequentially written to the memory dedicated for the output key stream. 4478 // 4479 // For a more detailed explanation, see Goll and Gueron, "Vectorization of 4480 // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology: 4481 // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33 4482 address generate_chacha20Block_qrpar() { 4483 Label L_Q_twoRounds, L_Q_cc20_const; 4484 // The constant data is broken into two 128-bit segments to be loaded 4485 // onto SIMD registers. The first 128 bits are a counter add overlay 4486 // that adds +1/+0/+0/+0 to the vectors holding replicated state[12]. 4487 // The second 128-bits is a table constant used for 8-bit left rotations. 4488 // on 32-bit lanes within a SIMD register. 
4489 __ BIND(L_Q_cc20_const); 4490 __ emit_int64(0x0000000000000001UL); 4491 __ emit_int64(0x0000000000000000UL); 4492 __ emit_int64(0x0605040702010003UL); 4493 __ emit_int64(0x0E0D0C0F0A09080BUL); 4494 4495 __ align(CodeEntryAlignment); 4496 StubGenStubId stub_id = StubGenStubId::chacha20Block_id; 4497 StubCodeMark mark(this, stub_id); 4498 address start = __ pc(); 4499 __ enter(); 4500 4501 const Register state = c_rarg0; 4502 const Register keystream = c_rarg1; 4503 const Register loopCtr = r10; 4504 const Register tmpAddr = r11; 4505 4506 const FloatRegister aState = v0; 4507 const FloatRegister bState = v1; 4508 const FloatRegister cState = v2; 4509 const FloatRegister dState = v3; 4510 const FloatRegister a1Vec = v4; 4511 const FloatRegister b1Vec = v5; 4512 const FloatRegister c1Vec = v6; 4513 const FloatRegister d1Vec = v7; 4514 // Skip the callee-saved registers v8 - v15 4515 const FloatRegister a2Vec = v16; 4516 const FloatRegister b2Vec = v17; 4517 const FloatRegister c2Vec = v18; 4518 const FloatRegister d2Vec = v19; 4519 const FloatRegister a3Vec = v20; 4520 const FloatRegister b3Vec = v21; 4521 const FloatRegister c3Vec = v22; 4522 const FloatRegister d3Vec = v23; 4523 const FloatRegister a4Vec = v24; 4524 const FloatRegister b4Vec = v25; 4525 const FloatRegister c4Vec = v26; 4526 const FloatRegister d4Vec = v27; 4527 const FloatRegister scratch = v28; 4528 const FloatRegister addMask = v29; 4529 const FloatRegister lrot8Tbl = v30; 4530 4531 // Load the initial state in the first 4 quadword registers, 4532 // then copy the initial state into the next 4 quadword registers 4533 // that will be used for the working state. 4534 __ ld1(aState, bState, cState, dState, __ T16B, Address(state)); 4535 4536 // Load the index register for 2 constant 128-bit data fields. 4537 // The first represents the +1/+0/+0/+0 add mask. The second is 4538 // the 8-bit left rotation. 4539 __ adr(tmpAddr, L_Q_cc20_const); 4540 __ ldpq(addMask, lrot8Tbl, Address(tmpAddr)); 4541 4542 __ mov(a1Vec, __ T16B, aState); 4543 __ mov(b1Vec, __ T16B, bState); 4544 __ mov(c1Vec, __ T16B, cState); 4545 __ mov(d1Vec, __ T16B, dState); 4546 4547 __ mov(a2Vec, __ T16B, aState); 4548 __ mov(b2Vec, __ T16B, bState); 4549 __ mov(c2Vec, __ T16B, cState); 4550 __ addv(d2Vec, __ T4S, d1Vec, addMask); 4551 4552 __ mov(a3Vec, __ T16B, aState); 4553 __ mov(b3Vec, __ T16B, bState); 4554 __ mov(c3Vec, __ T16B, cState); 4555 __ addv(d3Vec, __ T4S, d2Vec, addMask); 4556 4557 __ mov(a4Vec, __ T16B, aState); 4558 __ mov(b4Vec, __ T16B, bState); 4559 __ mov(c4Vec, __ T16B, cState); 4560 __ addv(d4Vec, __ T4S, d3Vec, addMask); 4561 4562 // Set up the 10 iteration loop 4563 __ mov(loopCtr, 10); 4564 __ BIND(L_Q_twoRounds); 4565 4566 // The first set of operations on the vectors covers the first 4 quarter 4567 // round operations: 4568 // Qround(state, 0, 4, 8,12) 4569 // Qround(state, 1, 5, 9,13) 4570 // Qround(state, 2, 6,10,14) 4571 // Qround(state, 3, 7,11,15) 4572 __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl); 4573 __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl); 4574 __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl); 4575 __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl); 4576 4577 // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to 4578 // diagonals. The a1Vec does not need to change orientation. 
4579 __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true); 4580 __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true); 4581 __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true); 4582 __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true); 4583 4584 // The second set of operations on the vectors covers the second 4 quarter 4585 // round operations, now acting on the diagonals: 4586 // Qround(state, 0, 5,10,15) 4587 // Qround(state, 1, 6,11,12) 4588 // Qround(state, 2, 7, 8,13) 4589 // Qround(state, 3, 4, 9,14) 4590 __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl); 4591 __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl); 4592 __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl); 4593 __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl); 4594 4595 // Before we start the next iteration, we need to perform shuffles 4596 // on the b/c/d vectors to move them back to columnar organizations 4597 // from their current diagonal orientation. 4598 __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false); 4599 __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false); 4600 __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false); 4601 __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false); 4602 4603 // Decrement and iterate 4604 __ sub(loopCtr, loopCtr, 1); 4605 __ cbnz(loopCtr, L_Q_twoRounds); 4606 4607 // Once the counter reaches zero, we fall out of the loop 4608 // and need to add the initial state back into the working state 4609 // represented by the a/b/c/d1Vec registers. This is destructive 4610 // on the dState register but we no longer will need it. 4611 __ addv(a1Vec, __ T4S, a1Vec, aState); 4612 __ addv(b1Vec, __ T4S, b1Vec, bState); 4613 __ addv(c1Vec, __ T4S, c1Vec, cState); 4614 __ addv(d1Vec, __ T4S, d1Vec, dState); 4615 4616 __ addv(a2Vec, __ T4S, a2Vec, aState); 4617 __ addv(b2Vec, __ T4S, b2Vec, bState); 4618 __ addv(c2Vec, __ T4S, c2Vec, cState); 4619 __ addv(dState, __ T4S, dState, addMask); 4620 __ addv(d2Vec, __ T4S, d2Vec, dState); 4621 4622 __ addv(a3Vec, __ T4S, a3Vec, aState); 4623 __ addv(b3Vec, __ T4S, b3Vec, bState); 4624 __ addv(c3Vec, __ T4S, c3Vec, cState); 4625 __ addv(dState, __ T4S, dState, addMask); 4626 __ addv(d3Vec, __ T4S, d3Vec, dState); 4627 4628 __ addv(a4Vec, __ T4S, a4Vec, aState); 4629 __ addv(b4Vec, __ T4S, b4Vec, bState); 4630 __ addv(c4Vec, __ T4S, c4Vec, cState); 4631 __ addv(dState, __ T4S, dState, addMask); 4632 __ addv(d4Vec, __ T4S, d4Vec, dState); 4633 4634 // Write the final state back to the result buffer 4635 __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64)); 4636 __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64)); 4637 __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64)); 4638 __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64)); 4639 4640 __ mov(r0, 256); // Return length of output keystream 4641 __ leave(); 4642 __ ret(lr); 4643 4644 return start; 4645 } 4646 4647 void dilithium_load16zetas(int o0, Register zetas) { 4648 __ ldpq(as_FloatRegister(o0), as_FloatRegister(o0 + 1), __ post (zetas, 32)); 4649 __ ldpq(as_FloatRegister(o0 + 2), as_FloatRegister(o0 + 3), __ post (zetas, 32)); 4650 4651 } 4652 4653 void dilithium_load32zetas(Register zetas) { 4654 dilithium_load16zetas(16, zetas); 4655 dilithium_load16zetas(20, zetas); 4656 } 4657 4658 // 2x16 32-bit Montgomery multiplications in parallel 4659 // See the montMul() method of the sun.security.provider.ML_DSA class. 4660 // Here MONT_R_BITS is 32, so the right shift by it is implicit. 
4661 // The constants qInv = MONT_Q_INV_MOD_R and q = MONT_Q are loaded in 4662 // (all 32-bit chunks of) vector registers v30 and v31, resp. 4663 // The inputs are b[i]s in v0-v7 and c[i]s v16-v23 and 4664 // the results are a[i]s in v16-v23, four 32-bit values in each register 4665 // and we do a_i = b_i * c_i * 2^-32 mod MONT_Q for all 4666 void dilithium_montmul32(bool by_constant) { 4667 FloatRegister vr0 = by_constant ? v29 : v0; 4668 FloatRegister vr1 = by_constant ? v29 : v1; 4669 FloatRegister vr2 = by_constant ? v29 : v2; 4670 FloatRegister vr3 = by_constant ? v29 : v3; 4671 FloatRegister vr4 = by_constant ? v29 : v4; 4672 FloatRegister vr5 = by_constant ? v29 : v5; 4673 FloatRegister vr6 = by_constant ? v29 : v6; 4674 FloatRegister vr7 = by_constant ? v29 : v7; 4675 4676 __ sqdmulh(v24, __ T4S, vr0, v16); // aHigh = hi32(2 * b * c) 4677 __ mulv(v16, __ T4S, vr0, v16); // aLow = lo32(b * c) 4678 __ sqdmulh(v25, __ T4S, vr1, v17); 4679 __ mulv(v17, __ T4S, vr1, v17); 4680 __ sqdmulh(v26, __ T4S, vr2, v18); 4681 __ mulv(v18, __ T4S, vr2, v18); 4682 __ sqdmulh(v27, __ T4S, vr3, v19); 4683 __ mulv(v19, __ T4S, vr3, v19); 4684 4685 __ mulv(v16, __ T4S, v16, v30); // m = aLow * qinv 4686 __ mulv(v17, __ T4S, v17, v30); 4687 __ mulv(v18, __ T4S, v18, v30); 4688 __ mulv(v19, __ T4S, v19, v30); 4689 4690 __ sqdmulh(v16, __ T4S, v16, v31); // n = hi32(2 * m * q) 4691 __ sqdmulh(v17, __ T4S, v17, v31); 4692 __ sqdmulh(v18, __ T4S, v18, v31); 4693 __ sqdmulh(v19, __ T4S, v19, v31); 4694 4695 __ shsubv(v16, __ T4S, v24, v16); // a = (aHigh - n) / 2 4696 __ shsubv(v17, __ T4S, v25, v17); 4697 __ shsubv(v18, __ T4S, v26, v18); 4698 __ shsubv(v19, __ T4S, v27, v19); 4699 4700 __ sqdmulh(v24, __ T4S, vr4, v20); 4701 __ mulv(v20, __ T4S, vr4, v20); 4702 __ sqdmulh(v25, __ T4S, vr5, v21); 4703 __ mulv(v21, __ T4S, vr5, v21); 4704 __ sqdmulh(v26, __ T4S, vr6, v22); 4705 __ mulv(v22, __ T4S, vr6, v22); 4706 __ sqdmulh(v27, __ T4S, vr7, v23); 4707 __ mulv(v23, __ T4S, vr7, v23); 4708 4709 __ mulv(v20, __ T4S, v20, v30); 4710 __ mulv(v21, __ T4S, v21, v30); 4711 __ mulv(v22, __ T4S, v22, v30); 4712 __ mulv(v23, __ T4S, v23, v30); 4713 4714 __ sqdmulh(v20, __ T4S, v20, v31); 4715 __ sqdmulh(v21, __ T4S, v21, v31); 4716 __ sqdmulh(v22, __ T4S, v22, v31); 4717 __ sqdmulh(v23, __ T4S, v23, v31); 4718 4719 __ shsubv(v20, __ T4S, v24, v20); 4720 __ shsubv(v21, __ T4S, v25, v21); 4721 __ shsubv(v22, __ T4S, v26, v22); 4722 __ shsubv(v23, __ T4S, v27, v23); 4723 } 4724 4725 // Do the addition and subtraction done in the ntt algorithm. 4726 // See sun.security.provider.ML_DSA.implDilithiumAlmostNttJava() 4727 void dilithium_add_sub32() { 4728 __ addv(v24, __ T4S, v0, v16); // coeffs[j] = coeffs[j] + tmp; 4729 __ addv(v25, __ T4S, v1, v17); 4730 __ addv(v26, __ T4S, v2, v18); 4731 __ addv(v27, __ T4S, v3, v19); 4732 __ addv(v28, __ T4S, v4, v20); 4733 __ addv(v29, __ T4S, v5, v21); 4734 __ addv(v30, __ T4S, v6, v22); 4735 __ addv(v31, __ T4S, v7, v23); 4736 4737 __ subv(v0, __ T4S, v0, v16); // coeffs[j + l] = coeffs[j] - tmp; 4738 __ subv(v1, __ T4S, v1, v17); 4739 __ subv(v2, __ T4S, v2, v18); 4740 __ subv(v3, __ T4S, v3, v19); 4741 __ subv(v4, __ T4S, v4, v20); 4742 __ subv(v5, __ T4S, v5, v21); 4743 __ subv(v6, __ T4S, v6, v22); 4744 __ subv(v7, __ T4S, v7, v23); 4745 } 4746 4747 // Do the same computation that 4748 // dilithium_montmul32() and dilithium_add_sub32() does, 4749 // except for only 4x4 32-bit vector elements and with 4750 // different register usage. 
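  // For reference only (not generated code): a scalar sketch of the Montgomery
  // step that each sqdmulh/mulv/sqdmulh/shsubv group performs per 32-bit
  // element, here and in dilithium_montmul32(); variable names follow the
  // comments above (q = MONT_Q, qinv = MONT_Q_INV_MOD_R), and the exact Java
  // code in sun.security.provider.ML_DSA may be arranged differently:
  //
  //   int64_t bc = (int64_t) b * c;
  //   int32_t m  = (int32_t) bc * qinv;                      // low 32 bits only
  //   int32_t a  = (int32_t) ((bc - (int64_t) m * q) >> 32); // == b*c*2^-32 (mod q)
  //
  // i.e. a == hi32(b*c) - hi32(m*q), which is what (aHigh - n) / 2 computes,
  // since sqdmulh returns hi32(2*x*y).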
4751 void dilithium_montmul_sub_add16() { 4752 __ sqdmulh(v24, __ T4S, v1, v16); 4753 __ mulv(v16, __ T4S, v1, v16); 4754 __ sqdmulh(v25, __ T4S, v3, v17); 4755 __ mulv(v17, __ T4S, v3, v17); 4756 __ sqdmulh(v26, __ T4S, v5, v18); 4757 __ mulv(v18, __ T4S, v5, v18); 4758 __ sqdmulh(v27, __ T4S, v7, v19); 4759 __ mulv(v19, __ T4S, v7, v19); 4760 4761 __ mulv(v16, __ T4S, v16, v30); 4762 __ mulv(v17, __ T4S, v17, v30); 4763 __ mulv(v18, __ T4S, v18, v30); 4764 __ mulv(v19, __ T4S, v19, v30); 4765 4766 __ sqdmulh(v16, __ T4S, v16, v31); 4767 __ sqdmulh(v17, __ T4S, v17, v31); 4768 __ sqdmulh(v18, __ T4S, v18, v31); 4769 __ sqdmulh(v19, __ T4S, v19, v31); 4770 4771 __ shsubv(v16, __ T4S, v24, v16); 4772 __ shsubv(v17, __ T4S, v25, v17); 4773 __ shsubv(v18, __ T4S, v26, v18); 4774 __ shsubv(v19, __ T4S, v27, v19); 4775 4776 __ subv(v1, __ T4S, v0, v16); 4777 __ subv(v3, __ T4S, v2, v17); 4778 __ subv(v5, __ T4S, v4, v18); 4779 __ subv(v7, __ T4S, v6, v19); 4780 4781 __ addv(v0, __ T4S, v0, v16); 4782 __ addv(v2, __ T4S, v2, v17); 4783 __ addv(v4, __ T4S, v4, v18); 4784 __ addv(v6, __ T4S, v6, v19); 4785 } 4786 4787 // At these levels, the indices that correspond to the 'j's (and 'j+l's) 4788 // in the Java implementation come in sequences of at least 8, so we 4789 // can use ldpq to collect the corresponding data into pairs of vector 4790 // registers. 4791 // We collect the coefficients corresponding to the 'j+l' indexes into 4792 // the vector registers v0-v7, the zetas into the vector registers v16-v23 4793 // then we do the (Montgomery) multiplications by the zetas in parallel 4794 // into v16-v23, load the coeffs corresponding to the 'j' indexes into 4795 // v0-v7, then do the additions into v24-v31 and the subtractions into 4796 // v0-v7 and finally save the results back to the coeffs array. 
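  // For reference only (not generated code): each (j, j + l) pair processed
  // here corresponds to one butterfly of the Java loop; the names zetas[k] and
  // montMul() are assumed from the comments above, so the actual Java source
  // may differ in detail:
  //
  //   int tmp = montMul(zetas[k], coeffs[j + l]);
  //   coeffs[j + l] = coeffs[j] - tmp;
  //   coeffs[j]     = coeffs[j] + tmp;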
4797 void dilithiumNttLevel0_4(const Register dilithiumConsts, 4798 const Register coeffs, const Register zetas) { 4799 int c1 = 0; 4800 int c2 = 512; 4801 int startIncr; 4802 int incr1 = 32; 4803 int incr2 = 64; 4804 int incr3 = 96; 4805 4806 for (int level = 0; level < 5; level++) { 4807 int c1Start = c1; 4808 int c2Start = c2; 4809 if (level == 3) { 4810 incr1 = 32; 4811 incr2 = 128; 4812 incr3 = 160; 4813 } else if (level == 4) { 4814 incr1 = 64; 4815 incr2 = 128; 4816 incr3 = 192; 4817 } 4818 4819 for (int i = 0; i < 4; i++) { 4820 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 4821 __ ldpq(v0, v1, Address(coeffs, c2Start)); 4822 __ ldpq(v2, v3, Address(coeffs, c2Start + incr1)); 4823 __ ldpq(v4, v5, Address(coeffs, c2Start + incr2)); 4824 __ ldpq(v6, v7, Address(coeffs, c2Start + incr3)); 4825 dilithium_load32zetas(zetas); 4826 dilithium_montmul32(false); 4827 __ ldpq(v0, v1, Address(coeffs, c1Start)); 4828 __ ldpq(v2, v3, Address(coeffs, c1Start + incr1)); 4829 __ ldpq(v4, v5, Address(coeffs, c1Start + incr2)); 4830 __ ldpq(v6, v7, Address(coeffs, c1Start + incr3)); 4831 dilithium_add_sub32(); 4832 __ stpq(v24, v25, Address(coeffs, c1Start)); 4833 __ stpq(v26, v27, Address(coeffs, c1Start + incr1)); 4834 __ stpq(v28, v29, Address(coeffs, c1Start + incr2)); 4835 __ stpq(v30, v31, Address(coeffs, c1Start + incr3)); 4836 __ stpq(v0, v1, Address(coeffs, c2Start)); 4837 __ stpq(v2, v3, Address(coeffs, c2Start + incr1)); 4838 __ stpq(v4, v5, Address(coeffs, c2Start + incr2)); 4839 __ stpq(v6, v7, Address(coeffs, c2Start + incr3)); 4840 4841 int k = 4 * level + i; 4842 4843 if (k > 7) { 4844 startIncr = 256; 4845 } else if (k == 5) { 4846 startIncr = 384; 4847 } else { 4848 startIncr = 128; 4849 } 4850 4851 c1Start += startIncr; 4852 c2Start += startIncr; 4853 } 4854 4855 c2 /= 2; 4856 } 4857 } 4858 4859 // Dilithium NTT function except for the final "normalization" to |coeff| < Q. 
4860 // Implements the method
4861 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
4862 // of the Java class sun.security.provider.ML_DSA
4863 //
4864 // coeffs (int[256]) = c_rarg0
4865 // zetas (int[256]) = c_rarg1
4866 address generate_dilithiumAlmostNtt() {
4867
4868 __ align(CodeEntryAlignment);
4869 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id;
4870 StubCodeMark mark(this, stub_id);
4871 address start = __ pc();
4872 __ enter();
4873
4874 const Register coeffs = c_rarg0;
4875 const Register zetas = c_rarg1;
4876
4877 const Register tmpAddr = r9;
4878 const Register dilithiumConsts = r10;
4879 const Register result = r11;
4880
4881 __ add(result, coeffs, 0);
4882 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
4883
4884 // Each level represents one iteration of the outer for loop of the Java version
4885
4886 // level 0-4
4887 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
4888
4889 // level 5
4890 for (int i = 0; i < 1024; i += 256) {
4891 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q
4892 __ ldr(v0, __ Q, Address(coeffs, i + 16));
4893 __ ldr(v1, __ Q, Address(coeffs, i + 48));
4894 __ ldr(v2, __ Q, Address(coeffs, i + 80));
4895 __ ldr(v3, __ Q, Address(coeffs, i + 112));
4896 __ ldr(v4, __ Q, Address(coeffs, i + 144));
4897 __ ldr(v5, __ Q, Address(coeffs, i + 176));
4898 __ ldr(v6, __ Q, Address(coeffs, i + 208));
4899 __ ldr(v7, __ Q, Address(coeffs, i + 240));
4900 dilithium_load32zetas(zetas);
4901 dilithium_montmul32(false);
4902 __ ldr(v0, __ Q, Address(coeffs, i));
4903 __ ldr(v1, __ Q, Address(coeffs, i + 32));
4904 __ ldr(v2, __ Q, Address(coeffs, i + 64));
4905 __ ldr(v3, __ Q, Address(coeffs, i + 96));
4906 __ ldr(v4, __ Q, Address(coeffs, i + 128));
4907 __ ldr(v5, __ Q, Address(coeffs, i + 160));
4908 __ ldr(v6, __ Q, Address(coeffs, i + 192));
4909 __ ldr(v7, __ Q, Address(coeffs, i + 224));
4910 dilithium_add_sub32();
4911 __ str(v24, __ Q, Address(coeffs, i));
4912 __ str(v25, __ Q, Address(coeffs, i + 32));
4913 __ str(v26, __ Q, Address(coeffs, i + 64));
4914 __ str(v27, __ Q, Address(coeffs, i + 96));
4915 __ str(v28, __ Q, Address(coeffs, i + 128));
4916 __ str(v29, __ Q, Address(coeffs, i + 160));
4917 __ str(v30, __ Q, Address(coeffs, i + 192));
4918 __ str(v31, __ Q, Address(coeffs, i + 224));
4919 __ str(v0, __ Q, Address(coeffs, i + 16));
4920 __ str(v1, __ Q, Address(coeffs, i + 48));
4921 __ str(v2, __ Q, Address(coeffs, i + 80));
4922 __ str(v3, __ Q, Address(coeffs, i + 112));
4923 __ str(v4, __ Q, Address(coeffs, i + 144));
4924 __ str(v5, __ Q, Address(coeffs, i + 176));
4925 __ str(v6, __ Q, Address(coeffs, i + 208));
4926 __ str(v7, __ Q, Address(coeffs, i + 240));
4927 }
4928
4929 // level 6
4930 for (int i = 0; i < 1024; i += 128) {
4931 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q
4932 __ add(tmpAddr, coeffs, i);
4933 __ ld2(v0, v1, __ T2D, tmpAddr);
4934 __ add(tmpAddr, coeffs, i + 32);
4935 __ ld2(v2, v3, __ T2D, tmpAddr);
4936 __ add(tmpAddr, coeffs, i + 64);
4937 __ ld2(v4, v5, __ T2D, tmpAddr);
4938 __ add(tmpAddr, coeffs, i + 96);
4939 __ ld2(v6, v7, __ T2D, tmpAddr);
4940 dilithium_load16zetas(16, zetas);
4941 dilithium_montmul_sub_add16();
4942 __ add(tmpAddr, coeffs, i);
4943 __ st2(v0, v1, __ T2D, tmpAddr);
4944 __ add(tmpAddr, coeffs, i + 32);
4945 __ st2(v2, v3, __ T2D, tmpAddr);
4946 __ add(tmpAddr, coeffs, i + 64);
4947 __ st2(v4, v5, __ T2D, tmpAddr);
4948 __ add(tmpAddr, coeffs, i + 96);
4949 __ st2(v6, v7, __ T2D, tmpAddr);
4950 }
4951
4952 // level 7
4953 for (int i = 0; i < 1024; i += 128) {
4954 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q
4955 __ add(tmpAddr, coeffs, i);
4956 __ ld2(v0, v1, __ T4S, tmpAddr);
4957 __ add(tmpAddr, coeffs, i + 32);
4958 __ ld2(v2, v3, __ T4S, tmpAddr);
4959 __ add(tmpAddr, coeffs, i + 64);
4960 __ ld2(v4, v5, __ T4S, tmpAddr);
4961 __ add(tmpAddr, coeffs, i + 96);
4962 __ ld2(v6, v7, __ T4S, tmpAddr);
4963 dilithium_load16zetas(16, zetas);
4964 dilithium_montmul_sub_add16();
4965 __ add(tmpAddr, coeffs, i);
4966 __ st2(v0, v1, __ T4S, tmpAddr);
4967 __ add(tmpAddr, coeffs, i + 32);
4968 __ st2(v2, v3, __ T4S, tmpAddr);
4969 __ add(tmpAddr, coeffs, i + 64);
4970 __ st2(v4, v5, __ T4S, tmpAddr);
4971 __ add(tmpAddr, coeffs, i + 96);
4972 __ st2(v6, v7, __ T4S, tmpAddr);
4973 }
4974 __ leave(); // required for proper stackwalking of RuntimeStub frame
4975 __ mov(r0, zr); // return 0
4976 __ ret(lr);
4977
4978 return start;
4979
4980 }
4981
4982 // Do the computations that can be found in the body of the loop in
4983 // sun.security.provider.ML_DSA.implDilithiumAlmostInverseNttJava()
4984 // for 16 coefficients in parallel:
4985 // tmp = coeffs[j];
4986 // coeffs[j] = (tmp + coeffs[j + l]);
4987 // coeffs[j + l] = montMul(tmp - coeffs[j + l], -MONT_ZETAS_FOR_NTT[m]);
4988 // coeffs[j]s are loaded in v0, v2, v4 and v6,
4989 // coeffs[j + l]s in v1, v3, v5 and v7,
4990 // the corresponding zetas in v16, v17, v18 and v19.
4991 void dilithium_sub_add_montmul16() {
4992 __ subv(v20, __ T4S, v0, v1);
4993 __ subv(v21, __ T4S, v2, v3);
4994 __ subv(v22, __ T4S, v4, v5);
4995 __ subv(v23, __ T4S, v6, v7);
4996
4997 __ addv(v0, __ T4S, v0, v1);
4998 __ addv(v2, __ T4S, v2, v3);
4999 __ addv(v4, __ T4S, v4, v5);
5000 __ addv(v6, __ T4S, v6, v7);
5001
5002 __ sqdmulh(v24, __ T4S, v20, v16); // aHigh = hi32(2 * b * c)
5003 __ mulv(v1, __ T4S, v20, v16); // aLow = lo32(b * c)
5004 __ sqdmulh(v25, __ T4S, v21, v17);
5005 __ mulv(v3, __ T4S, v21, v17);
5006 __ sqdmulh(v26, __ T4S, v22, v18);
5007 __ mulv(v5, __ T4S, v22, v18);
5008 __ sqdmulh(v27, __ T4S, v23, v19);
5009 __ mulv(v7, __ T4S, v23, v19);
5010
5011 __ mulv(v1, __ T4S, v1, v30); // m = lo32(aLow * qInv)
5012 __ mulv(v3, __ T4S, v3, v30);
5013 __ mulv(v5, __ T4S, v5, v30);
5014 __ mulv(v7, __ T4S, v7, v30);
5015
5016 __ sqdmulh(v1, __ T4S, v1, v31); // n = hi32(2 * m * q)
5017 __ sqdmulh(v3, __ T4S, v3, v31);
5018 __ sqdmulh(v5, __ T4S, v5, v31);
5019 __ sqdmulh(v7, __ T4S, v7, v31);
5020
5021 __ shsubv(v1, __ T4S, v24, v1); // a = (aHigh - n) / 2
5022 __ shsubv(v3, __ T4S, v25, v3);
5023 __ shsubv(v5, __ T4S, v26, v5);
5024 __ shsubv(v7, __ T4S, v27, v7);
5025 }
5026
5027 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
5028 // in the Java implementation come in sequences of at least 8, so we
5029 // can use ldpq to collect the corresponding data into pairs of vector
5030 // registers.
5031 // We collect the coefficients that correspond to the 'j's into v0-v7,
5032 // the coefficients that correspond to the 'j+l's into v16-v23, then
5033 // do the additions into v24-v31 and the subtractions into v0-v7, then
5034 // save the result of the additions, load the zetas into v16-v23,
5035 // do the (Montgomery) multiplications by zeta in parallel into v16-v23,
5036 // and finally save the results back to the coeffs array.
5037 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
5038 const Register coeffs, const Register zetas) {
5039 int c1 = 0;
5040 int c2 = 32;
5041 int startIncr;
5042 int incr1;
5043 int
incr2; 5044 int incr3; 5045 5046 for (int level = 3; level < 8; level++) { 5047 int c1Start = c1; 5048 int c2Start = c2; 5049 if (level == 3) { 5050 incr1 = 64; 5051 incr2 = 128; 5052 incr3 = 192; 5053 } else if (level == 4) { 5054 incr1 = 32; 5055 incr2 = 128; 5056 incr3 = 160; 5057 } else { 5058 incr1 = 32; 5059 incr2 = 64; 5060 incr3 = 96; 5061 } 5062 5063 for (int i = 0; i < 4; i++) { 5064 __ ldpq(v0, v1, Address(coeffs, c1Start)); 5065 __ ldpq(v2, v3, Address(coeffs, c1Start + incr1)); 5066 __ ldpq(v4, v5, Address(coeffs, c1Start + incr2)); 5067 __ ldpq(v6, v7, Address(coeffs, c1Start + incr3)); 5068 __ ldpq(v16, v17, Address(coeffs, c2Start)); 5069 __ ldpq(v18, v19, Address(coeffs, c2Start + incr1)); 5070 __ ldpq(v20, v21, Address(coeffs, c2Start + incr2)); 5071 __ ldpq(v22, v23, Address(coeffs, c2Start + incr3)); 5072 dilithium_add_sub32(); 5073 __ stpq(v24, v25, Address(coeffs, c1Start)); 5074 __ stpq(v26, v27, Address(coeffs, c1Start + incr1)); 5075 __ stpq(v28, v29, Address(coeffs, c1Start + incr2)); 5076 __ stpq(v30, v31, Address(coeffs, c1Start + incr3)); 5077 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 5078 dilithium_load32zetas(zetas); 5079 dilithium_montmul32(false); 5080 __ stpq(v16, v17, Address(coeffs, c2Start)); 5081 __ stpq(v18, v19, Address(coeffs, c2Start + incr1)); 5082 __ stpq(v20, v21, Address(coeffs, c2Start + incr2)); 5083 __ stpq(v22, v23, Address(coeffs, c2Start + incr3)); 5084 5085 int k = 4 * level + i; 5086 5087 if (k < 24) { 5088 startIncr = 256; 5089 } else if (k == 25) { 5090 startIncr = 384; 5091 } else { 5092 startIncr = 128; 5093 } 5094 5095 c1Start += startIncr; 5096 c2Start += startIncr; 5097 } 5098 5099 c2 *= 2; 5100 } 5101 } 5102 5103 // Dilithium Inverse NTT function except the final mod Q division by 2^256. 5104 // Implements the method 5105 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of 5106 // the sun.security.provider.ML_DSA class. 
5107 // 5108 // coeffs (int[256]) = c_rarg0 5109 // zetas (int[256]) = c_rarg1 5110 address generate_dilithiumAlmostInverseNtt() { 5111 5112 __ align(CodeEntryAlignment); 5113 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id; 5114 StubCodeMark mark(this, stub_id); 5115 address start = __ pc(); 5116 __ enter(); 5117 5118 const Register coeffs = c_rarg0; 5119 const Register zetas = c_rarg1; 5120 5121 const Register tmpAddr = r9; 5122 const Register dilithiumConsts = r10; 5123 const Register result = r11; 5124 5125 __ add(result, coeffs, 0); 5126 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 5127 5128 // Each level represents one iteration of the outer for loop of the Java version 5129 // level0 5130 for (int i = 0; i < 1024; i += 128) { 5131 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q 5132 __ add(tmpAddr, coeffs, i); 5133 __ ld2(v0, v1, __ T4S, tmpAddr); 5134 __ add(tmpAddr, coeffs, i + 32); 5135 __ ld2(v2, v3, __ T4S, tmpAddr); 5136 __ add(tmpAddr, coeffs, i + 64); 5137 __ ld2(v4, v5, __ T4S, tmpAddr); 5138 __ add(tmpAddr, coeffs, i + 96); 5139 __ ld2(v6, v7, __ T4S, tmpAddr); 5140 dilithium_load16zetas(16, zetas); 5141 dilithium_sub_add_montmul16(); 5142 __ add(tmpAddr, coeffs, i); 5143 __ st2(v0, v1, __ T4S, tmpAddr); 5144 __ add(tmpAddr, coeffs, i + 32); 5145 __ st2(v2, v3, __ T4S, tmpAddr); 5146 __ add(tmpAddr, coeffs, i + 64); 5147 __ st2(v4, v5, __ T4S, tmpAddr); 5148 __ add(tmpAddr, coeffs, i + 96); 5149 __ st2(v6, v7, __ T4S, tmpAddr); 5150 } 5151 5152 // level 1 5153 for (int i = 0; i < 1024; i += 128) { 5154 __ add(tmpAddr, coeffs, i); 5155 __ ld2(v0, v1, __ T2D, tmpAddr); 5156 __ add(tmpAddr, coeffs, i + 32); 5157 __ ld2(v2, v3, __ T2D, tmpAddr); 5158 __ add(tmpAddr, coeffs, i + 64); 5159 __ ld2(v4, v5, __ T2D, tmpAddr); 5160 __ add(tmpAddr, coeffs, i + 96); 5161 __ ld2(v6, v7, __ T2D, tmpAddr); 5162 dilithium_load16zetas(16, zetas); 5163 dilithium_sub_add_montmul16(); 5164 __ add(tmpAddr, coeffs, i); 5165 __ st2(v0, v1, __ T2D, tmpAddr); 5166 __ add(tmpAddr, coeffs, i + 32); 5167 __ st2(v2, v3, __ T2D, tmpAddr); 5168 __ add(tmpAddr, coeffs, i + 64); 5169 __ st2(v4, v5, __ T2D, tmpAddr); 5170 __ add(tmpAddr, coeffs, i + 96); 5171 __ st2(v6, v7, __ T2D, tmpAddr); 5172 } 5173 5174 //level 2 5175 for (int i = 0; i < 1024; i += 256) { 5176 __ ldr(v0, __ Q, Address(coeffs, i)); 5177 __ ldr(v1, __ Q, Address(coeffs, i + 32)); 5178 __ ldr(v2, __ Q, Address(coeffs, i + 64)); 5179 __ ldr(v3, __ Q, Address(coeffs, i + 96)); 5180 __ ldr(v4, __ Q, Address(coeffs, i + 128)); 5181 __ ldr(v5, __ Q, Address(coeffs, i + 160)); 5182 __ ldr(v6, __ Q, Address(coeffs, i + 192)); 5183 __ ldr(v7, __ Q, Address(coeffs, i + 224)); 5184 __ ldr(v16, __ Q, Address(coeffs, i + 16)); 5185 __ ldr(v17, __ Q, Address(coeffs, i + 48)); 5186 __ ldr(v18, __ Q, Address(coeffs, i + 80)); 5187 __ ldr(v19, __ Q, Address(coeffs, i + 112)); 5188 __ ldr(v20, __ Q, Address(coeffs, i + 144)); 5189 __ ldr(v21, __ Q, Address(coeffs, i + 176)); 5190 __ ldr(v22, __ Q, Address(coeffs, i + 208)); 5191 __ ldr(v23, __ Q, Address(coeffs, i + 240)); 5192 dilithium_add_sub32(); 5193 __ str(v24, __ Q, Address(coeffs, i)); 5194 __ str(v25, __ Q, Address(coeffs, i + 32)); 5195 __ str(v26, __ Q, Address(coeffs, i + 64)); 5196 __ str(v27, __ Q, Address(coeffs, i + 96)); 5197 __ str(v28, __ Q, Address(coeffs, i + 128)); 5198 __ str(v29, __ Q, Address(coeffs, i + 160)); 5199 __ str(v30, __ Q, Address(coeffs, i + 192)); 5200 __ str(v31, __ Q, Address(coeffs, i + 
224));
5201 dilithium_load32zetas(zetas);
5202 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q
5203 dilithium_montmul32(false);
5204 __ str(v16, __ Q, Address(coeffs, i + 16));
5205 __ str(v17, __ Q, Address(coeffs, i + 48));
5206 __ str(v18, __ Q, Address(coeffs, i + 80));
5207 __ str(v19, __ Q, Address(coeffs, i + 112));
5208 __ str(v20, __ Q, Address(coeffs, i + 144));
5209 __ str(v21, __ Q, Address(coeffs, i + 176));
5210 __ str(v22, __ Q, Address(coeffs, i + 208));
5211 __ str(v23, __ Q, Address(coeffs, i + 240));
5212 }
5213
5214 // level 3-7
5215 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
5216
5217 __ leave(); // required for proper stackwalking of RuntimeStub frame
5218 __ mov(r0, zr); // return 0
5219 __ ret(lr);
5220
5221 return start;
5222
5223 }
5224
5225 // Dilithium multiply polynomials in the NTT domain.
5226 // Straightforward implementation of the method
5227 // static int implDilithiumNttMult(
5228 // int[] result, int[] ntta, int[] nttb) {} of
5229 // the sun.security.provider.ML_DSA class.
5230 //
5231 // result (int[256]) = c_rarg0
5232 // poly1 (int[256]) = c_rarg1
5233 // poly2 (int[256]) = c_rarg2
5234 address generate_dilithiumNttMult() {
5235
5236 __ align(CodeEntryAlignment);
5237 StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id;
5238 StubCodeMark mark(this, stub_id);
5239 address start = __ pc();
5240 __ enter();
5241
5242 Label L_loop;
5243
5244 const Register result = c_rarg0;
5245 const Register poly1 = c_rarg1;
5246 const Register poly2 = c_rarg2;
5247
5248 const Register dilithiumConsts = r10;
5249 const Register len = r11;
5250
5251 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
5252
5253 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q
5254 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
5255
5256 __ mov(len, zr);
5257 __ add(len, len, 1024);
5258
5259 __ BIND(L_loop);
5260
5261 __ ldpq(v0, v1, __ post(poly1, 32));
5262 __ ldpq(v2, v3, __ post(poly1, 32));
5263 __ ldpq(v4, v5, __ post(poly1, 32));
5264 __ ldpq(v6, v7, __ post(poly1, 32));
5265 __ ldpq(v16, v17, __ post(poly2, 32));
5266 __ ldpq(v18, v19, __ post(poly2, 32));
5267 __ ldpq(v20, v21, __ post(poly2, 32));
5268 __ ldpq(v22, v23, __ post(poly2, 32));
5269 dilithium_montmul32(false);
5270 dilithium_montmul32(true);
5271 __ stpq(v16, v17, __ post(result, 32));
5272 __ stpq(v18, v19, __ post(result, 32));
5273 __ stpq(v20, v21, __ post(result, 32));
5274 __ stpq(v22, v23, __ post(result, 32));
5275
5276 __ sub(len, len, 128);
5277 __ cmp(len, (u1)128);
5278 __ br(Assembler::GE, L_loop);
5279
5280 __ leave(); // required for proper stackwalking of RuntimeStub frame
5281 __ mov(r0, zr); // return 0
5282 __ ret(lr);
5283
5284 return start;
5285
5286 }
5287
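// The Montgomery multiplications generated above (see the inline comments in
// dilithium_sub_add_montmul16) all use the same sqdmulh/mulv/shsubv pattern.
// As a sketch, with v30/v31 holding the qInv/q pair from _dilithiumConsts and
// the intermediate names being illustrative only:
//
//   aHigh = sqdmulh(a, b)      // hi32(2 * a * b)
//   aLow  = mulv(a, b)         // lo32(a * b)
//   m     = mulv(aLow, qInv)   // lo32(aLow * qInv)
//   n     = sqdmulh(m, q)      // hi32(2 * m * q)
//   res   = shsubv(aHigh, n)   // (aHigh - n) / 2
//
// i.e. in effect a signed Montgomery multiplication (the product picks up a
// 2^-32 factor mod q), which appears to be why generate_dilithiumNttMult
// above also multiplies by rSquare (dilithium_montmul32(true) with v29 ==
// rSquare) to compensate for that factor.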
5288 // Dilithium Montgomery multiply an array by a constant.
5289 // A straightforward implementation of the method
5290 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
5291 // of the sun.security.provider.ML_DSA class
5292 //
5293 // coeffs (int[256]) = c_rarg0
5294 // constant (int) = c_rarg1
5295 address generate_dilithiumMontMulByConstant() {
5296
5297 __ align(CodeEntryAlignment);
5298 StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id;
5299 StubCodeMark mark(this, stub_id);
5300 address start = __ pc();
5301 __ enter();
5302
5303 Label L_loop;
5304
5305 const Register coeffs = c_rarg0;
5306 const Register constant = c_rarg1;
5307
5308 const Register dilithiumConsts = r10;
5309 const Register result = r11;
5310 const Register len = r12;
5311
5312 __ add(result, coeffs, 0);
5313 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
5314
5315 __ ldpq(v30, v31, Address(dilithiumConsts, 0)); // qInv, q
5316 __ dup(v29, __ T4S, constant);
5317 __ mov(len, zr);
5318 __ add(len, len, 1024);
5319
5320 __ BIND(L_loop);
5321
5322 __ ldpq(v16, v17, __ post(coeffs, 32));
5323 __ ldpq(v18, v19, __ post(coeffs, 32));
5324 __ ldpq(v20, v21, __ post(coeffs, 32));
5325 __ ldpq(v22, v23, __ post(coeffs, 32));
5326 dilithium_montmul32(true);
5327 __ stpq(v16, v17, __ post(result, 32));
5328 __ stpq(v18, v19, __ post(result, 32));
5329 __ stpq(v20, v21, __ post(result, 32));
5330 __ stpq(v22, v23, __ post(result, 32));
5331
5332 __ sub(len, len, 128);
5333 __ cmp(len, (u1)128);
5334 __ br(Assembler::GE, L_loop);
5335
5336 __ leave(); // required for proper stackwalking of RuntimeStub frame
5337 __ mov(r0, zr); // return 0
5338 __ ret(lr);
5339
5340 return start;
5341 }
5342
5343 // Dilithium decompose poly.
5344 // Implements the method
5345 // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, int twoGamma2, int multiplier) {}
5346 // of the sun.security.provider.ML_DSA class
5347 //
5348 // input (int[256]) = c_rarg0
5349 // lowPart (int[256]) = c_rarg1
5350 // highPart (int[256]) = c_rarg2
5351 // twoGamma2 (int) = c_rarg3
5352 // multiplier (int) = c_rarg4
5353 address generate_dilithiumDecomposePoly() {
5354
5355 __ align(CodeEntryAlignment);
5356 StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id;
5357 StubCodeMark mark(this, stub_id);
5358 address start = __ pc();
5359 __ enter();
5360
5361 Label L_loop;
5362
5363 const Register input = c_rarg0;
5364 const Register lowPart = c_rarg1;
5365 const Register highPart = c_rarg2;
5366 const Register twoGamma2 = c_rarg3;
5367 const Register multiplier = c_rarg4;
5368
5369 const Register len = r9;
5370 const Register dilithiumConsts = r10;
5371 const Register tmp = r11;
5372
5373 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
5374
5375 // save callee-saved registers
5376 __ stpd(v8, v9, __ pre(sp, -64));
5377 __ stpd(v10, v11, Address(sp, 16));
5378 __ stpd(v12, v13, Address(sp, 32));
5379 __ stpd(v14, v15, Address(sp, 48));
5380
5381
5382 __ mov(tmp, zr);
5383 __ add(tmp, tmp, 1);
5384 __ dup(v25, __ T4S, tmp); // 1
5385 __ ldr(v30, __ Q, Address(dilithiumConsts, 16)); // q
5386 __ ldr(v31, __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
5387 __ dup(v28, __ T4S, twoGamma2); // 2 * gamma2
5388 __ dup(v29, __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
5389 __ subv(v26, __ T4S, v30, v25); // q - 1
5390 __ sshr(v27, __ T4S, v28, 1); // gamma2
5391
5392 __ mov(len, zr);
5393 __ add(len, len, 1024);
5394
5395 __ BIND(L_loop);
5396
5397 __ ld4(v0, v1, v2, v3, __ T4S,
__ post(input, 64)); 5398 5399 // rplus in v0 5400 // rplus = rplus - ((rplus + 5373807) >> 23) * dilithium_q; 5401 __ addv(v4, __ T4S, v0, v31); 5402 __ addv(v5, __ T4S, v1, v31); 5403 __ addv(v6, __ T4S, v2, v31); 5404 __ addv(v7, __ T4S, v3, v31); 5405 5406 __ sshr(v4, __ T4S, v4, 23); 5407 __ sshr(v5, __ T4S, v5, 23); 5408 __ sshr(v6, __ T4S, v6, 23); 5409 __ sshr(v7, __ T4S, v7, 23); 5410 5411 __ mulv(v4, __ T4S, v4, v30); 5412 __ mulv(v5, __ T4S, v5, v30); 5413 __ mulv(v6, __ T4S, v6, v30); 5414 __ mulv(v7, __ T4S, v7, v30); 5415 5416 __ subv(v0, __ T4S, v0, v4); 5417 __ subv(v1, __ T4S, v1, v5); 5418 __ subv(v2, __ T4S, v2, v6); 5419 __ subv(v3, __ T4S, v3, v7); 5420 5421 // rplus in v0 5422 // rplus = rplus + ((rplus >> 31) & dilithium_q); 5423 __ sshr(v4, __ T4S, v0, 31); 5424 __ sshr(v5, __ T4S, v1, 31); 5425 __ sshr(v6, __ T4S, v2, 31); 5426 __ sshr(v7, __ T4S, v3, 31); 5427 5428 __ andr(v4, __ T16B, v4, v30); 5429 __ andr(v5, __ T16B, v5, v30); 5430 __ andr(v6, __ T16B, v6, v30); 5431 __ andr(v7, __ T16B, v7, v30); 5432 5433 __ addv(v0, __ T4S, v0, v4); 5434 __ addv(v1, __ T4S, v1, v5); 5435 __ addv(v2, __ T4S, v2, v6); 5436 __ addv(v3, __ T4S, v3, v7); 5437 5438 // rplus in v0 5439 // int quotient = (rplus * multiplier) >> 22; 5440 __ mulv(v4, __ T4S, v0, v29); 5441 __ mulv(v5, __ T4S, v1, v29); 5442 __ mulv(v6, __ T4S, v2, v29); 5443 __ mulv(v7, __ T4S, v3, v29); 5444 5445 __ sshr(v4, __ T4S, v4, 22); 5446 __ sshr(v5, __ T4S, v5, 22); 5447 __ sshr(v6, __ T4S, v6, 22); 5448 __ sshr(v7, __ T4S, v7, 22); 5449 5450 // quotient in v4 5451 // int r0 = rplus - quotient * twoGamma2; 5452 __ mulv(v8, __ T4S, v4, v28); 5453 __ mulv(v9, __ T4S, v5, v28); 5454 __ mulv(v10, __ T4S, v6, v28); 5455 __ mulv(v11, __ T4S, v7, v28); 5456 5457 __ subv(v8, __ T4S, v0, v8); 5458 __ subv(v9, __ T4S, v1, v9); 5459 __ subv(v10, __ T4S, v2, v10); 5460 __ subv(v11, __ T4S, v3, v11); 5461 5462 // r0 in v8 5463 // int mask = (twoGamma2 - r0) >> 22; 5464 __ subv(v12, __ T4S, v28, v8); 5465 __ subv(v13, __ T4S, v28, v9); 5466 __ subv(v14, __ T4S, v28, v10); 5467 __ subv(v15, __ T4S, v28, v11); 5468 5469 __ sshr(v12, __ T4S, v12, 22); 5470 __ sshr(v13, __ T4S, v13, 22); 5471 __ sshr(v14, __ T4S, v14, 22); 5472 __ sshr(v15, __ T4S, v15, 22); 5473 5474 // mask in v12 5475 // r0 -= (mask & twoGamma2); 5476 __ andr(v16, __ T16B, v12, v28); 5477 __ andr(v17, __ T16B, v13, v28); 5478 __ andr(v18, __ T16B, v14, v28); 5479 __ andr(v19, __ T16B, v15, v28); 5480 5481 __ subv(v8, __ T4S, v8, v16); 5482 __ subv(v9, __ T4S, v9, v17); 5483 __ subv(v10, __ T4S, v10, v18); 5484 __ subv(v11, __ T4S, v11, v19); 5485 5486 // r0 in v8 5487 // quotient += (mask & 1); 5488 __ andr(v16, __ T16B, v12, v25); 5489 __ andr(v17, __ T16B, v13, v25); 5490 __ andr(v18, __ T16B, v14, v25); 5491 __ andr(v19, __ T16B, v15, v25); 5492 5493 __ addv(v4, __ T4S, v4, v16); 5494 __ addv(v5, __ T4S, v5, v17); 5495 __ addv(v6, __ T4S, v6, v18); 5496 __ addv(v7, __ T4S, v7, v19); 5497 5498 // mask = (twoGamma2 / 2 - r0) >> 31; 5499 __ subv(v12, __ T4S, v27, v8); 5500 __ subv(v13, __ T4S, v27, v9); 5501 __ subv(v14, __ T4S, v27, v10); 5502 __ subv(v15, __ T4S, v27, v11); 5503 5504 __ sshr(v12, __ T4S, v12, 31); 5505 __ sshr(v13, __ T4S, v13, 31); 5506 __ sshr(v14, __ T4S, v14, 31); 5507 __ sshr(v15, __ T4S, v15, 31); 5508 5509 // r0 -= (mask & twoGamma2); 5510 __ andr(v16, __ T16B, v12, v28); 5511 __ andr(v17, __ T16B, v13, v28); 5512 __ andr(v18, __ T16B, v14, v28); 5513 __ andr(v19, __ T16B, v15, v28); 5514 5515 __ subv(v8, __ T4S, v8, v16); 5516 
__ subv(v9, __ T4S, v9, v17); 5517 __ subv(v10, __ T4S, v10, v18); 5518 __ subv(v11, __ T4S, v11, v19); 5519 5520 // quotient += (mask & 1); 5521 __ andr(v16, __ T16B, v12, v25); 5522 __ andr(v17, __ T16B, v13, v25); 5523 __ andr(v18, __ T16B, v14, v25); 5524 __ andr(v19, __ T16B, v15, v25); 5525 5526 __ addv(v4, __ T4S, v4, v16); 5527 __ addv(v5, __ T4S, v5, v17); 5528 __ addv(v6, __ T4S, v6, v18); 5529 __ addv(v7, __ T4S, v7, v19); 5530 5531 // int r1 = rplus - r0 - (dilithium_q - 1); 5532 __ subv(v16, __ T4S, v0, v8); 5533 __ subv(v17, __ T4S, v1, v9); 5534 __ subv(v18, __ T4S, v2, v10); 5535 __ subv(v19, __ T4S, v3, v11); 5536 5537 __ subv(v16, __ T4S, v16, v26); 5538 __ subv(v17, __ T4S, v17, v26); 5539 __ subv(v18, __ T4S, v18, v26); 5540 __ subv(v19, __ T4S, v19, v26); 5541 5542 // r1 in v16 5543 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 5544 __ negr(v20, __ T4S, v16); 5545 __ negr(v21, __ T4S, v17); 5546 __ negr(v22, __ T4S, v18); 5547 __ negr(v23, __ T4S, v19); 5548 5549 __ orr(v16, __ T16B, v16, v20); 5550 __ orr(v17, __ T16B, v17, v21); 5551 __ orr(v18, __ T16B, v18, v22); 5552 __ orr(v19, __ T16B, v19, v23); 5553 5554 __ sshr(v0, __ T4S, v16, 31); 5555 __ sshr(v1, __ T4S, v17, 31); 5556 __ sshr(v2, __ T4S, v18, 31); 5557 __ sshr(v3, __ T4S, v19, 31); 5558 5559 // r1 in v0 5560 // r0 += ~r1; 5561 __ notr(v20, __ T16B, v0); 5562 __ notr(v21, __ T16B, v1); 5563 __ notr(v22, __ T16B, v2); 5564 __ notr(v23, __ T16B, v3); 5565 5566 __ addv(v8, __ T4S, v8, v20); 5567 __ addv(v9, __ T4S, v9, v21); 5568 __ addv(v10, __ T4S, v10, v22); 5569 __ addv(v11, __ T4S, v11, v23); 5570 5571 // r0 in v8 5572 // r1 = r1 & quotient; 5573 __ andr(v0, __ T16B, v4, v0); 5574 __ andr(v1, __ T16B, v5, v1); 5575 __ andr(v2, __ T16B, v6, v2); 5576 __ andr(v3, __ T16B, v7, v3); 5577 5578 // r1 in v0 5579 // lowPart[m] = r0; 5580 // highPart[m] = r1; 5581 __ st4(v8, v9, v10, v11, __ T4S, __ post(lowPart, 64)); 5582 __ st4(v0, v1, v2, v3, __ T4S, __ post(highPart, 64)); 5583 5584 5585 __ sub(len, len, 64); 5586 __ cmp(len, (u1)64); 5587 __ br(Assembler::GE, L_loop); 5588 5589 // restore callee-saved vector registers 5590 __ ldpd(v14, v15, Address(sp, 48)); 5591 __ ldpd(v12, v13, Address(sp, 32)); 5592 __ ldpd(v10, v11, Address(sp, 16)); 5593 __ ldpd(v8, v9, __ post(sp, 64)); 5594 5595 __ leave(); // required for proper stackwalking of RuntimeStub frame 5596 __ mov(r0, zr); // return 0 5597 __ ret(lr); 5598 5599 return start; 5600 } 5601 5602 /** 5603 * Arguments: 5604 * 5605 * Inputs: 5606 * c_rarg0 - int crc 5607 * c_rarg1 - byte* buf 5608 * c_rarg2 - int length 5609 * c_rarg3 - int* table 5610 * 5611 * Output: 5612 * r0 - int crc result 5613 */ 5614 address generate_updateBytesCRC32C() { 5615 assert(UseCRC32CIntrinsics, "what are we doing here?"); 5616 5617 __ align(CodeEntryAlignment); 5618 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id; 5619 StubCodeMark mark(this, stub_id); 5620 5621 address start = __ pc(); 5622 5623 const Register crc = c_rarg0; // crc 5624 const Register buf = c_rarg1; // source java byte array address 5625 const Register len = c_rarg2; // length 5626 const Register table0 = c_rarg3; // crc_table address 5627 const Register table1 = c_rarg4; 5628 const Register table2 = c_rarg5; 5629 const Register table3 = c_rarg6; 5630 const Register tmp3 = c_rarg7; 5631 5632 BLOCK_COMMENT("Entry:"); 5633 __ enter(); // required for proper stackwalking of RuntimeStub frame 5634 5635 __ kernel_crc32c(crc, buf, len, 5636 table0, table1, table2, 
table3, rscratch1, rscratch2, tmp3); 5637 5638 __ leave(); // required for proper stackwalking of RuntimeStub frame 5639 __ ret(lr); 5640 5641 return start; 5642 } 5643 5644 /*** 5645 * Arguments: 5646 * 5647 * Inputs: 5648 * c_rarg0 - int adler 5649 * c_rarg1 - byte* buff 5650 * c_rarg2 - int len 5651 * 5652 * Output: 5653 * c_rarg0 - int adler result 5654 */ 5655 address generate_updateBytesAdler32() { 5656 __ align(CodeEntryAlignment); 5657 StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id; 5658 StubCodeMark mark(this, stub_id); 5659 address start = __ pc(); 5660 5661 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 5662 5663 // Aliases 5664 Register adler = c_rarg0; 5665 Register s1 = c_rarg0; 5666 Register s2 = c_rarg3; 5667 Register buff = c_rarg1; 5668 Register len = c_rarg2; 5669 Register nmax = r4; 5670 Register base = r5; 5671 Register count = r6; 5672 Register temp0 = rscratch1; 5673 Register temp1 = rscratch2; 5674 FloatRegister vbytes = v0; 5675 FloatRegister vs1acc = v1; 5676 FloatRegister vs2acc = v2; 5677 FloatRegister vtable = v3; 5678 5679 // Max number of bytes we can process before having to take the mod 5680 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 5681 uint64_t BASE = 0xfff1; 5682 uint64_t NMAX = 0x15B0; 5683 5684 __ mov(base, BASE); 5685 __ mov(nmax, NMAX); 5686 5687 // Load accumulation coefficients for the upper 16 bits 5688 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 5689 __ ld1(vtable, __ T16B, Address(temp0)); 5690 5691 // s1 is initialized to the lower 16 bits of adler 5692 // s2 is initialized to the upper 16 bits of adler 5693 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 5694 __ uxth(s1, adler); // s1 = (adler & 0xffff) 5695 5696 // The pipelined loop needs at least 16 elements for 1 iteration 5697 // It does check this, but it is more effective to skip to the cleanup loop 5698 __ cmp(len, (u1)16); 5699 __ br(Assembler::HS, L_nmax); 5700 __ cbz(len, L_combine); 5701 5702 __ bind(L_simple_by1_loop); 5703 __ ldrb(temp0, Address(__ post(buff, 1))); 5704 __ add(s1, s1, temp0); 5705 __ add(s2, s2, s1); 5706 __ subs(len, len, 1); 5707 __ br(Assembler::HI, L_simple_by1_loop); 5708 5709 // s1 = s1 % BASE 5710 __ subs(temp0, s1, base); 5711 __ csel(s1, temp0, s1, Assembler::HS); 5712 5713 // s2 = s2 % BASE 5714 __ lsr(temp0, s2, 16); 5715 __ lsl(temp1, temp0, 4); 5716 __ sub(temp1, temp1, temp0); 5717 __ add(s2, temp1, s2, ext::uxth); 5718 5719 __ subs(temp0, s2, base); 5720 __ csel(s2, temp0, s2, Assembler::HS); 5721 5722 __ b(L_combine); 5723 5724 __ bind(L_nmax); 5725 __ subs(len, len, nmax); 5726 __ sub(count, nmax, 16); 5727 __ br(Assembler::LO, L_by16); 5728 5729 __ bind(L_nmax_loop); 5730 5731 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 5732 vbytes, vs1acc, vs2acc, vtable); 5733 5734 __ subs(count, count, 16); 5735 __ br(Assembler::HS, L_nmax_loop); 5736 5737 // s1 = s1 % BASE 5738 __ lsr(temp0, s1, 16); 5739 __ lsl(temp1, temp0, 4); 5740 __ sub(temp1, temp1, temp0); 5741 __ add(temp1, temp1, s1, ext::uxth); 5742 5743 __ lsr(temp0, temp1, 16); 5744 __ lsl(s1, temp0, 4); 5745 __ sub(s1, s1, temp0); 5746 __ add(s1, s1, temp1, ext:: uxth); 5747 5748 __ subs(temp0, s1, base); 5749 __ csel(s1, temp0, s1, Assembler::HS); 5750 5751 // s2 = s2 % BASE 5752 __ lsr(temp0, s2, 16); 5753 __ lsl(temp1, temp0, 4); 5754 __ sub(temp1, temp1, temp0); 5755 __ add(temp1, temp1, s2, ext::uxth); 
5756 5757 __ lsr(temp0, temp1, 16); 5758 __ lsl(s2, temp0, 4); 5759 __ sub(s2, s2, temp0); 5760 __ add(s2, s2, temp1, ext:: uxth); 5761 5762 __ subs(temp0, s2, base); 5763 __ csel(s2, temp0, s2, Assembler::HS); 5764 5765 __ subs(len, len, nmax); 5766 __ sub(count, nmax, 16); 5767 __ br(Assembler::HS, L_nmax_loop); 5768 5769 __ bind(L_by16); 5770 __ adds(len, len, count); 5771 __ br(Assembler::LO, L_by1); 5772 5773 __ bind(L_by16_loop); 5774 5775 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 5776 vbytes, vs1acc, vs2acc, vtable); 5777 5778 __ subs(len, len, 16); 5779 __ br(Assembler::HS, L_by16_loop); 5780 5781 __ bind(L_by1); 5782 __ adds(len, len, 15); 5783 __ br(Assembler::LO, L_do_mod); 5784 5785 __ bind(L_by1_loop); 5786 __ ldrb(temp0, Address(__ post(buff, 1))); 5787 __ add(s1, temp0, s1); 5788 __ add(s2, s2, s1); 5789 __ subs(len, len, 1); 5790 __ br(Assembler::HS, L_by1_loop); 5791 5792 __ bind(L_do_mod); 5793 // s1 = s1 % BASE 5794 __ lsr(temp0, s1, 16); 5795 __ lsl(temp1, temp0, 4); 5796 __ sub(temp1, temp1, temp0); 5797 __ add(temp1, temp1, s1, ext::uxth); 5798 5799 __ lsr(temp0, temp1, 16); 5800 __ lsl(s1, temp0, 4); 5801 __ sub(s1, s1, temp0); 5802 __ add(s1, s1, temp1, ext:: uxth); 5803 5804 __ subs(temp0, s1, base); 5805 __ csel(s1, temp0, s1, Assembler::HS); 5806 5807 // s2 = s2 % BASE 5808 __ lsr(temp0, s2, 16); 5809 __ lsl(temp1, temp0, 4); 5810 __ sub(temp1, temp1, temp0); 5811 __ add(temp1, temp1, s2, ext::uxth); 5812 5813 __ lsr(temp0, temp1, 16); 5814 __ lsl(s2, temp0, 4); 5815 __ sub(s2, s2, temp0); 5816 __ add(s2, s2, temp1, ext:: uxth); 5817 5818 __ subs(temp0, s2, base); 5819 __ csel(s2, temp0, s2, Assembler::HS); 5820 5821 // Combine lower bits and higher bits 5822 __ bind(L_combine); 5823 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 5824 5825 __ ret(lr); 5826 5827 return start; 5828 } 5829 5830 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 5831 Register temp0, Register temp1, FloatRegister vbytes, 5832 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 5833 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 5834 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 5835 // In non-vectorized code, we update s1 and s2 as: 5836 // s1 <- s1 + b1 5837 // s2 <- s2 + s1 5838 // s1 <- s1 + b2 5839 // s2 <- s2 + b1 5840 // ... 5841 // s1 <- s1 + b16 5842 // s2 <- s2 + s1 5843 // Putting above assignments together, we have: 5844 // s1_new = s1 + b1 + b2 + ... + b16 5845 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 5846 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 5847 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 5848 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 5849 5850 // s2 = s2 + s1 * 16 5851 __ add(s2, s2, s1, Assembler::LSL, 4); 5852 5853 // vs1acc = b1 + b2 + b3 + ... + b16 5854 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 5855 __ umullv(vs2acc, __ T8B, vtable, vbytes); 5856 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 5857 __ uaddlv(vs1acc, __ T16B, vbytes); 5858 __ uaddlv(vs2acc, __ T8H, vs2acc); 5859 5860 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 5861 __ fmovd(temp0, vs1acc); 5862 __ fmovd(temp1, vs2acc); 5863 __ add(s1, s1, temp0); 5864 __ add(s2, s2, temp1); 5865 } 5866 5867 /** 5868 * Arguments: 5869 * 5870 * Input: 5871 * c_rarg0 - x address 5872 * c_rarg1 - x length 5873 * c_rarg2 - y address 5874 * c_rarg3 - y length 5875 * c_rarg4 - z address 5876 */ 5877 address generate_multiplyToLen() { 5878 __ align(CodeEntryAlignment); 5879 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 5880 StubCodeMark mark(this, stub_id); 5881 5882 address start = __ pc(); 5883 5884 if (SCCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) { 5885 return start; 5886 } 5887 const Register x = r0; 5888 const Register xlen = r1; 5889 const Register y = r2; 5890 const Register ylen = r3; 5891 const Register z = r4; 5892 5893 const Register tmp0 = r5; 5894 const Register tmp1 = r10; 5895 const Register tmp2 = r11; 5896 const Register tmp3 = r12; 5897 const Register tmp4 = r13; 5898 const Register tmp5 = r14; 5899 const Register tmp6 = r15; 5900 const Register tmp7 = r16; 5901 5902 BLOCK_COMMENT("Entry:"); 5903 __ enter(); // required for proper stackwalking of RuntimeStub frame 5904 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 5905 __ leave(); // required for proper stackwalking of RuntimeStub frame 5906 __ ret(lr); 5907 5908 SCCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start); 5909 return start; 5910 } 5911 5912 address generate_squareToLen() { 5913 // squareToLen algorithm for sizes 1..127 described in java code works 5914 // faster than multiply_to_len on some CPUs and slower on others, but 5915 // multiply_to_len shows a bit better overall results 5916 __ align(CodeEntryAlignment); 5917 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 5918 StubCodeMark mark(this, stub_id); 5919 address start = __ pc(); 5920 5921 if (SCCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) { 5922 return start; 5923 } 5924 const Register x = r0; 5925 const Register xlen = r1; 5926 const Register z = r2; 5927 const Register y = r4; // == x 5928 const Register ylen = r5; // == xlen 5929 5930 const Register tmp0 = r3; 5931 const Register tmp1 = r10; 5932 const Register tmp2 = r11; 5933 const Register tmp3 = r12; 5934 const Register tmp4 = r13; 5935 const Register tmp5 = r14; 5936 const Register tmp6 = r15; 5937 const Register tmp7 = r16; 5938 5939 RegSet spilled_regs = RegSet::of(y, ylen); 5940 BLOCK_COMMENT("Entry:"); 5941 __ enter(); 5942 __ push(spilled_regs, sp); 5943 __ mov(y, x); 5944 __ mov(ylen, xlen); 5945 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 5946 __ pop(spilled_regs, sp); 5947 __ leave(); 5948 __ ret(lr); 5949 5950 SCCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start); 5951 return start; 5952 } 5953 5954 address generate_mulAdd() { 5955 __ align(CodeEntryAlignment); 5956 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 5957 StubCodeMark mark(this, stub_id); 5958 5959 address start = __ pc(); 5960 5961 if (SCCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) { 5962 return start; 5963 } 5964 const Register out = r0; 5965 const Register in = r1; 5966 const Register offset = r2; 5967 const Register len = r3; 5968 
const Register k = r4; 5969 5970 BLOCK_COMMENT("Entry:"); 5971 __ enter(); 5972 __ mul_add(out, in, offset, len, k); 5973 __ leave(); 5974 __ ret(lr); 5975 5976 SCCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start); 5977 return start; 5978 } 5979 5980 // Arguments: 5981 // 5982 // Input: 5983 // c_rarg0 - newArr address 5984 // c_rarg1 - oldArr address 5985 // c_rarg2 - newIdx 5986 // c_rarg3 - shiftCount 5987 // c_rarg4 - numIter 5988 // 5989 address generate_bigIntegerRightShift() { 5990 __ align(CodeEntryAlignment); 5991 StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id; 5992 StubCodeMark mark(this, stub_id); 5993 address start = __ pc(); 5994 5995 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 5996 5997 Register newArr = c_rarg0; 5998 Register oldArr = c_rarg1; 5999 Register newIdx = c_rarg2; 6000 Register shiftCount = c_rarg3; 6001 Register numIter = c_rarg4; 6002 Register idx = numIter; 6003 6004 Register newArrCur = rscratch1; 6005 Register shiftRevCount = rscratch2; 6006 Register oldArrCur = r13; 6007 Register oldArrNext = r14; 6008 6009 FloatRegister oldElem0 = v0; 6010 FloatRegister oldElem1 = v1; 6011 FloatRegister newElem = v2; 6012 FloatRegister shiftVCount = v3; 6013 FloatRegister shiftVRevCount = v4; 6014 6015 __ cbz(idx, Exit); 6016 6017 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 6018 6019 // left shift count 6020 __ movw(shiftRevCount, 32); 6021 __ subw(shiftRevCount, shiftRevCount, shiftCount); 6022 6023 // numIter too small to allow a 4-words SIMD loop, rolling back 6024 __ cmp(numIter, (u1)4); 6025 __ br(Assembler::LT, ShiftThree); 6026 6027 __ dup(shiftVCount, __ T4S, shiftCount); 6028 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 6029 __ negr(shiftVCount, __ T4S, shiftVCount); 6030 6031 __ BIND(ShiftSIMDLoop); 6032 6033 // Calculate the load addresses 6034 __ sub(idx, idx, 4); 6035 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 6036 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 6037 __ add(oldArrCur, oldArrNext, 4); 6038 6039 // Load 4 words and process 6040 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 6041 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 6042 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 6043 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 6044 __ orr(newElem, __ T16B, oldElem0, oldElem1); 6045 __ st1(newElem, __ T4S, Address(newArrCur)); 6046 6047 __ cmp(idx, (u1)4); 6048 __ br(Assembler::LT, ShiftTwoLoop); 6049 __ b(ShiftSIMDLoop); 6050 6051 __ BIND(ShiftTwoLoop); 6052 __ cbz(idx, Exit); 6053 __ cmp(idx, (u1)1); 6054 __ br(Assembler::EQ, ShiftOne); 6055 6056 // Calculate the load addresses 6057 __ sub(idx, idx, 2); 6058 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 6059 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 6060 __ add(oldArrCur, oldArrNext, 4); 6061 6062 // Load 2 words and process 6063 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 6064 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 6065 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 6066 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 6067 __ orr(newElem, __ T8B, oldElem0, oldElem1); 6068 __ st1(newElem, __ T2S, Address(newArrCur)); 6069 __ b(ShiftTwoLoop); 6070 6071 __ BIND(ShiftThree); 6072 __ tbz(idx, 1, ShiftOne); 6073 __ tbz(idx, 0, ShiftTwo); 6074 __ ldrw(r10, Address(oldArr, 12)); 6075 __ ldrw(r11, Address(oldArr, 8)); 6076 __ lsrvw(r10, r10, shiftCount); 6077 __ lslvw(r11, r11, shiftRevCount); 6078 __ orrw(r12, r10, r11); 6079 __ strw(r12, Address(newArr, 8)); 6080 6081 __ 
BIND(ShiftTwo); 6082 __ ldrw(r10, Address(oldArr, 8)); 6083 __ ldrw(r11, Address(oldArr, 4)); 6084 __ lsrvw(r10, r10, shiftCount); 6085 __ lslvw(r11, r11, shiftRevCount); 6086 __ orrw(r12, r10, r11); 6087 __ strw(r12, Address(newArr, 4)); 6088 6089 __ BIND(ShiftOne); 6090 __ ldrw(r10, Address(oldArr, 4)); 6091 __ ldrw(r11, Address(oldArr)); 6092 __ lsrvw(r10, r10, shiftCount); 6093 __ lslvw(r11, r11, shiftRevCount); 6094 __ orrw(r12, r10, r11); 6095 __ strw(r12, Address(newArr)); 6096 6097 __ BIND(Exit); 6098 __ ret(lr); 6099 6100 return start; 6101 } 6102 6103 // Arguments: 6104 // 6105 // Input: 6106 // c_rarg0 - newArr address 6107 // c_rarg1 - oldArr address 6108 // c_rarg2 - newIdx 6109 // c_rarg3 - shiftCount 6110 // c_rarg4 - numIter 6111 // 6112 address generate_bigIntegerLeftShift() { 6113 __ align(CodeEntryAlignment); 6114 StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id; 6115 StubCodeMark mark(this, stub_id); 6116 address start = __ pc(); 6117 6118 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 6119 6120 Register newArr = c_rarg0; 6121 Register oldArr = c_rarg1; 6122 Register newIdx = c_rarg2; 6123 Register shiftCount = c_rarg3; 6124 Register numIter = c_rarg4; 6125 6126 Register shiftRevCount = rscratch1; 6127 Register oldArrNext = rscratch2; 6128 6129 FloatRegister oldElem0 = v0; 6130 FloatRegister oldElem1 = v1; 6131 FloatRegister newElem = v2; 6132 FloatRegister shiftVCount = v3; 6133 FloatRegister shiftVRevCount = v4; 6134 6135 __ cbz(numIter, Exit); 6136 6137 __ add(oldArrNext, oldArr, 4); 6138 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 6139 6140 // right shift count 6141 __ movw(shiftRevCount, 32); 6142 __ subw(shiftRevCount, shiftRevCount, shiftCount); 6143 6144 // numIter too small to allow a 4-words SIMD loop, rolling back 6145 __ cmp(numIter, (u1)4); 6146 __ br(Assembler::LT, ShiftThree); 6147 6148 __ dup(shiftVCount, __ T4S, shiftCount); 6149 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 6150 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 6151 6152 __ BIND(ShiftSIMDLoop); 6153 6154 // load 4 words and process 6155 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 6156 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 6157 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 6158 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 6159 __ orr(newElem, __ T16B, oldElem0, oldElem1); 6160 __ st1(newElem, __ T4S, __ post(newArr, 16)); 6161 __ sub(numIter, numIter, 4); 6162 6163 __ cmp(numIter, (u1)4); 6164 __ br(Assembler::LT, ShiftTwoLoop); 6165 __ b(ShiftSIMDLoop); 6166 6167 __ BIND(ShiftTwoLoop); 6168 __ cbz(numIter, Exit); 6169 __ cmp(numIter, (u1)1); 6170 __ br(Assembler::EQ, ShiftOne); 6171 6172 // load 2 words and process 6173 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 6174 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 6175 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 6176 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 6177 __ orr(newElem, __ T8B, oldElem0, oldElem1); 6178 __ st1(newElem, __ T2S, __ post(newArr, 8)); 6179 __ sub(numIter, numIter, 2); 6180 __ b(ShiftTwoLoop); 6181 6182 __ BIND(ShiftThree); 6183 __ ldrw(r10, __ post(oldArr, 4)); 6184 __ ldrw(r11, __ post(oldArrNext, 4)); 6185 __ lslvw(r10, r10, shiftCount); 6186 __ lsrvw(r11, r11, shiftRevCount); 6187 __ orrw(r12, r10, r11); 6188 __ strw(r12, __ post(newArr, 4)); 6189 __ tbz(numIter, 1, Exit); 6190 __ tbz(numIter, 0, ShiftOne); 6191 6192 __ BIND(ShiftTwo); 6193 __ ldrw(r10, __ post(oldArr, 4)); 6194 __ ldrw(r11, __ post(oldArrNext, 
4)); 6195 __ lslvw(r10, r10, shiftCount); 6196 __ lsrvw(r11, r11, shiftRevCount); 6197 __ orrw(r12, r10, r11); 6198 __ strw(r12, __ post(newArr, 4)); 6199 6200 __ BIND(ShiftOne); 6201 __ ldrw(r10, Address(oldArr)); 6202 __ ldrw(r11, Address(oldArrNext)); 6203 __ lslvw(r10, r10, shiftCount); 6204 __ lsrvw(r11, r11, shiftRevCount); 6205 __ orrw(r12, r10, r11); 6206 __ strw(r12, Address(newArr)); 6207 6208 __ BIND(Exit); 6209 __ ret(lr); 6210 6211 return start; 6212 } 6213 6214 address generate_count_positives(address &count_positives_long) { 6215 const u1 large_loop_size = 64; 6216 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 6217 int dcache_line = VM_Version::dcache_line_size(); 6218 6219 Register ary1 = r1, len = r2, result = r0; 6220 6221 __ align(CodeEntryAlignment); 6222 6223 StubGenStubId stub_id = StubGenStubId::count_positives_id; 6224 StubCodeMark mark(this, stub_id); 6225 6226 address entry = __ pc(); 6227 6228 __ enter(); 6229 // precondition: a copy of len is already in result 6230 // __ mov(result, len); 6231 6232 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 6233 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 6234 6235 __ cmp(len, (u1)15); 6236 __ br(Assembler::GT, LEN_OVER_15); 6237 // The only case when execution falls into this code is when pointer is near 6238 // the end of memory page and we have to avoid reading next page 6239 __ add(ary1, ary1, len); 6240 __ subs(len, len, 8); 6241 __ br(Assembler::GT, LEN_OVER_8); 6242 __ ldr(rscratch2, Address(ary1, -8)); 6243 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 6244 __ lsrv(rscratch2, rscratch2, rscratch1); 6245 __ tst(rscratch2, UPPER_BIT_MASK); 6246 __ csel(result, zr, result, Assembler::NE); 6247 __ leave(); 6248 __ ret(lr); 6249 __ bind(LEN_OVER_8); 6250 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 6251 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 6252 __ tst(rscratch2, UPPER_BIT_MASK); 6253 __ br(Assembler::NE, RET_NO_POP); 6254 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 6255 __ lsrv(rscratch1, rscratch1, rscratch2); 6256 __ tst(rscratch1, UPPER_BIT_MASK); 6257 __ bind(RET_NO_POP); 6258 __ csel(result, zr, result, Assembler::NE); 6259 __ leave(); 6260 __ ret(lr); 6261 6262 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 6263 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 6264 6265 count_positives_long = __ pc(); // 2nd entry point 6266 6267 __ enter(); 6268 6269 __ bind(LEN_OVER_15); 6270 __ push(spilled_regs, sp); 6271 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 6272 __ cbz(rscratch2, ALIGNED); 6273 __ ldp(tmp6, tmp1, Address(ary1)); 6274 __ mov(tmp5, 16); 6275 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 6276 __ add(ary1, ary1, rscratch1); 6277 __ orr(tmp6, tmp6, tmp1); 6278 __ tst(tmp6, UPPER_BIT_MASK); 6279 __ br(Assembler::NE, RET_ADJUST); 6280 __ sub(len, len, rscratch1); 6281 6282 __ bind(ALIGNED); 6283 __ cmp(len, large_loop_size); 6284 __ br(Assembler::LT, CHECK_16); 6285 // Perform 16-byte load as early return in pre-loop to handle situation 6286 // when initially aligned large array has negative values at starting bytes, 6287 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 6288 // slower. Cases with negative bytes further ahead won't be affected that 6289 // much. 
In fact, it'll be faster due to early loads, less instructions and 6290 // less branches in LARGE_LOOP. 6291 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 6292 __ sub(len, len, 16); 6293 __ orr(tmp6, tmp6, tmp1); 6294 __ tst(tmp6, UPPER_BIT_MASK); 6295 __ br(Assembler::NE, RET_ADJUST_16); 6296 __ cmp(len, large_loop_size); 6297 __ br(Assembler::LT, CHECK_16); 6298 6299 if (SoftwarePrefetchHintDistance >= 0 6300 && SoftwarePrefetchHintDistance >= dcache_line) { 6301 // initial prefetch 6302 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 6303 } 6304 __ bind(LARGE_LOOP); 6305 if (SoftwarePrefetchHintDistance >= 0) { 6306 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 6307 } 6308 // Issue load instructions first, since it can save few CPU/MEM cycles, also 6309 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 6310 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 6311 // instructions per cycle and have less branches, but this approach disables 6312 // early return, thus, all 64 bytes are loaded and checked every time. 6313 __ ldp(tmp2, tmp3, Address(ary1)); 6314 __ ldp(tmp4, tmp5, Address(ary1, 16)); 6315 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 6316 __ ldp(tmp6, tmp1, Address(ary1, 48)); 6317 __ add(ary1, ary1, large_loop_size); 6318 __ sub(len, len, large_loop_size); 6319 __ orr(tmp2, tmp2, tmp3); 6320 __ orr(tmp4, tmp4, tmp5); 6321 __ orr(rscratch1, rscratch1, rscratch2); 6322 __ orr(tmp6, tmp6, tmp1); 6323 __ orr(tmp2, tmp2, tmp4); 6324 __ orr(rscratch1, rscratch1, tmp6); 6325 __ orr(tmp2, tmp2, rscratch1); 6326 __ tst(tmp2, UPPER_BIT_MASK); 6327 __ br(Assembler::NE, RET_ADJUST_LONG); 6328 __ cmp(len, large_loop_size); 6329 __ br(Assembler::GE, LARGE_LOOP); 6330 6331 __ bind(CHECK_16); // small 16-byte load pre-loop 6332 __ cmp(len, (u1)16); 6333 __ br(Assembler::LT, POST_LOOP16); 6334 6335 __ bind(LOOP16); // small 16-byte load loop 6336 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 6337 __ sub(len, len, 16); 6338 __ orr(tmp2, tmp2, tmp3); 6339 __ tst(tmp2, UPPER_BIT_MASK); 6340 __ br(Assembler::NE, RET_ADJUST_16); 6341 __ cmp(len, (u1)16); 6342 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 6343 6344 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 6345 __ cmp(len, (u1)8); 6346 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 6347 __ ldr(tmp3, Address(__ post(ary1, 8))); 6348 __ tst(tmp3, UPPER_BIT_MASK); 6349 __ br(Assembler::NE, RET_ADJUST); 6350 __ sub(len, len, 8); 6351 6352 __ bind(POST_LOOP16_LOAD_TAIL); 6353 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 6354 __ ldr(tmp1, Address(ary1)); 6355 __ mov(tmp2, 64); 6356 __ sub(tmp4, tmp2, len, __ LSL, 3); 6357 __ lslv(tmp1, tmp1, tmp4); 6358 __ tst(tmp1, UPPER_BIT_MASK); 6359 __ br(Assembler::NE, RET_ADJUST); 6360 // Fallthrough 6361 6362 __ bind(RET_LEN); 6363 __ pop(spilled_regs, sp); 6364 __ leave(); 6365 __ ret(lr); 6366 6367 // difference result - len is the count of guaranteed to be 6368 // positive bytes 6369 6370 __ bind(RET_ADJUST_LONG); 6371 __ add(len, len, (u1)(large_loop_size - 16)); 6372 __ bind(RET_ADJUST_16); 6373 __ add(len, len, 16); 6374 __ bind(RET_ADJUST); 6375 __ pop(spilled_regs, sp); 6376 __ leave(); 6377 __ sub(result, result, len); 6378 __ ret(lr); 6379 6380 return entry; 6381 } 6382 6383 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 6384 bool usePrefetch, Label &NOT_EQUAL) { 6385 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 6386 tmp2 = 
rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 6387 tmp7 = r12, tmp8 = r13; 6388 Label LOOP; 6389 6390 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 6391 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 6392 __ bind(LOOP); 6393 if (usePrefetch) { 6394 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 6395 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 6396 } 6397 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 6398 __ eor(tmp1, tmp1, tmp2); 6399 __ eor(tmp3, tmp3, tmp4); 6400 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 6401 __ orr(tmp1, tmp1, tmp3); 6402 __ cbnz(tmp1, NOT_EQUAL); 6403 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 6404 __ eor(tmp5, tmp5, tmp6); 6405 __ eor(tmp7, tmp7, tmp8); 6406 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 6407 __ orr(tmp5, tmp5, tmp7); 6408 __ cbnz(tmp5, NOT_EQUAL); 6409 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 6410 __ eor(tmp1, tmp1, tmp2); 6411 __ eor(tmp3, tmp3, tmp4); 6412 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 6413 __ orr(tmp1, tmp1, tmp3); 6414 __ cbnz(tmp1, NOT_EQUAL); 6415 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 6416 __ eor(tmp5, tmp5, tmp6); 6417 __ sub(cnt1, cnt1, 8 * wordSize); 6418 __ eor(tmp7, tmp7, tmp8); 6419 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 6420 // tmp6 is not used. MacroAssembler::subs is used here (rather than 6421 // cmp) because subs allows an unlimited range of immediate operand. 6422 __ subs(tmp6, cnt1, loopThreshold); 6423 __ orr(tmp5, tmp5, tmp7); 6424 __ cbnz(tmp5, NOT_EQUAL); 6425 __ br(__ GE, LOOP); 6426 // post-loop 6427 __ eor(tmp1, tmp1, tmp2); 6428 __ eor(tmp3, tmp3, tmp4); 6429 __ orr(tmp1, tmp1, tmp3); 6430 __ sub(cnt1, cnt1, 2 * wordSize); 6431 __ cbnz(tmp1, NOT_EQUAL); 6432 } 6433 6434 void generate_large_array_equals_loop_simd(int loopThreshold, 6435 bool usePrefetch, Label &NOT_EQUAL) { 6436 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 6437 tmp2 = rscratch2; 6438 Label LOOP; 6439 6440 __ bind(LOOP); 6441 if (usePrefetch) { 6442 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 6443 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 6444 } 6445 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 6446 __ sub(cnt1, cnt1, 8 * wordSize); 6447 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 6448 __ subs(tmp1, cnt1, loopThreshold); 6449 __ eor(v0, __ T16B, v0, v4); 6450 __ eor(v1, __ T16B, v1, v5); 6451 __ eor(v2, __ T16B, v2, v6); 6452 __ eor(v3, __ T16B, v3, v7); 6453 __ orr(v0, __ T16B, v0, v1); 6454 __ orr(v1, __ T16B, v2, v3); 6455 __ orr(v0, __ T16B, v0, v1); 6456 __ umov(tmp1, v0, __ D, 0); 6457 __ umov(tmp2, v0, __ D, 1); 6458 __ orr(tmp1, tmp1, tmp2); 6459 __ cbnz(tmp1, NOT_EQUAL); 6460 __ br(__ GE, LOOP); 6461 } 6462 6463 // a1 = r1 - array1 address 6464 // a2 = r2 - array2 address 6465 // result = r0 - return value. Already contains "false" 6466 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 6467 // r3-r5 are reserved temporary registers 6468 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 6469 address generate_large_array_equals() { 6470 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 6471 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 6472 tmp7 = r12, tmp8 = r13; 6473 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 6474 SMALL_LOOP, POST_LOOP; 6475 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 6476 // calculate if at least 32 prefetched bytes are used 6477 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 6478 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 6479 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 6480 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 6481 tmp5, tmp6, tmp7, tmp8); 6482 6483 __ align(CodeEntryAlignment); 6484 6485 StubGenStubId stub_id = StubGenStubId::large_array_equals_id; 6486 StubCodeMark mark(this, stub_id); 6487 6488 address entry = __ pc(); 6489 __ enter(); 6490 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 6491 // also advance pointers to use post-increment instead of pre-increment 6492 __ add(a1, a1, wordSize); 6493 __ add(a2, a2, wordSize); 6494 if (AvoidUnalignedAccesses) { 6495 // both implementations (SIMD/nonSIMD) are using relatively large load 6496 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 6497 // on some CPUs in case of address is not at least 16-byte aligned. 6498 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 6499 // load if needed at least for 1st address and make if 16-byte aligned. 6500 Label ALIGNED16; 6501 __ tbz(a1, 3, ALIGNED16); 6502 __ ldr(tmp1, Address(__ post(a1, wordSize))); 6503 __ ldr(tmp2, Address(__ post(a2, wordSize))); 6504 __ sub(cnt1, cnt1, wordSize); 6505 __ eor(tmp1, tmp1, tmp2); 6506 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 6507 __ bind(ALIGNED16); 6508 } 6509 if (UseSIMDForArrayEquals) { 6510 if (SoftwarePrefetchHintDistance >= 0) { 6511 __ subs(tmp1, cnt1, prefetchLoopThreshold); 6512 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 6513 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 6514 /* prfm = */ true, NOT_EQUAL); 6515 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 6516 __ br(__ LT, TAIL); 6517 } 6518 __ bind(NO_PREFETCH_LARGE_LOOP); 6519 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 6520 /* prfm = */ false, NOT_EQUAL); 6521 } else { 6522 __ push(spilled_regs, sp); 6523 if (SoftwarePrefetchHintDistance >= 0) { 6524 __ subs(tmp1, cnt1, prefetchLoopThreshold); 6525 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 6526 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 6527 /* prfm = */ true, NOT_EQUAL); 6528 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 6529 __ br(__ LT, TAIL); 6530 } 6531 __ bind(NO_PREFETCH_LARGE_LOOP); 6532 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 6533 /* prfm = */ false, NOT_EQUAL); 6534 } 6535 __ bind(TAIL); 6536 __ cbz(cnt1, EQUAL); 6537 __ subs(cnt1, cnt1, wordSize); 6538 __ br(__ LE, POST_LOOP); 6539 __ bind(SMALL_LOOP); 6540 __ ldr(tmp1, Address(__ post(a1, wordSize))); 6541 __ ldr(tmp2, Address(__ post(a2, wordSize))); 6542 __ subs(cnt1, cnt1, wordSize); 6543 __ eor(tmp1, tmp1, tmp2); 6544 __ cbnz(tmp1, NOT_EQUAL); 6545 __ br(__ GT, SMALL_LOOP); 6546 __ bind(POST_LOOP); 6547 __ ldr(tmp1, Address(a1, cnt1)); 6548 __ ldr(tmp2, Address(a2, cnt1)); 6549 __ eor(tmp1, tmp1, tmp2); 6550 __ cbnz(tmp1, NOT_EQUAL); 6551 __ bind(EQUAL); 6552 __ mov(result, true); 6553 __ bind(NOT_EQUAL); 6554 if (!UseSIMDForArrayEquals) { 6555 __ pop(spilled_regs, sp); 6556 } 6557 __ bind(NOT_EQUAL_NO_POP); 6558 __ leave(); 6559 __ ret(lr); 6560 return entry; 6561 } 6562 6563 // result = r0 - return value. Contains initial hashcode value on entry. 
6564 // ary = r1 - array address 6565 // cnt = r2 - elements count 6566 // Clobbers: v0-v13, rscratch1, rscratch2 6567 address generate_large_arrays_hashcode(BasicType eltype) { 6568 const Register result = r0, ary = r1, cnt = r2; 6569 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 6570 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 6571 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 6572 const FloatRegister vpowm = v13; 6573 6574 ARRAYS_HASHCODE_REGISTERS; 6575 6576 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 6577 6578 unsigned int vf; // vectorization factor 6579 bool multiply_by_halves; 6580 Assembler::SIMD_Arrangement load_arrangement; 6581 switch (eltype) { 6582 case T_BOOLEAN: 6583 case T_BYTE: 6584 load_arrangement = Assembler::T8B; 6585 multiply_by_halves = true; 6586 vf = 8; 6587 break; 6588 case T_CHAR: 6589 case T_SHORT: 6590 load_arrangement = Assembler::T8H; 6591 multiply_by_halves = true; 6592 vf = 8; 6593 break; 6594 case T_INT: 6595 load_arrangement = Assembler::T4S; 6596 multiply_by_halves = false; 6597 vf = 4; 6598 break; 6599 default: 6600 ShouldNotReachHere(); 6601 } 6602 6603 // Unroll factor 6604 const unsigned uf = 4; 6605 6606 // Effective vectorization factor 6607 const unsigned evf = vf * uf; 6608 6609 __ align(CodeEntryAlignment); 6610 6611 StubGenStubId stub_id; 6612 switch (eltype) { 6613 case T_BOOLEAN: 6614 stub_id = StubGenStubId::large_arrays_hashcode_boolean_id; 6615 break; 6616 case T_BYTE: 6617 stub_id = StubGenStubId::large_arrays_hashcode_byte_id; 6618 break; 6619 case T_CHAR: 6620 stub_id = StubGenStubId::large_arrays_hashcode_char_id; 6621 break; 6622 case T_SHORT: 6623 stub_id = StubGenStubId::large_arrays_hashcode_short_id; 6624 break; 6625 case T_INT: 6626 stub_id = StubGenStubId::large_arrays_hashcode_int_id; 6627 break; 6628 default: 6629 stub_id = StubGenStubId::NO_STUBID; 6630 ShouldNotReachHere(); 6631 }; 6632 6633 StubCodeMark mark(this, stub_id); 6634 6635 address entry = __ pc(); 6636 __ enter(); 6637 6638 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 6639 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 6640 // value shouldn't change throughout both loops. 6641 __ movw(rscratch1, intpow(31U, 3)); 6642 __ mov(vpow, Assembler::S, 0, rscratch1); 6643 __ movw(rscratch1, intpow(31U, 2)); 6644 __ mov(vpow, Assembler::S, 1, rscratch1); 6645 __ movw(rscratch1, intpow(31U, 1)); 6646 __ mov(vpow, Assembler::S, 2, rscratch1); 6647 __ movw(rscratch1, intpow(31U, 0)); 6648 __ mov(vpow, Assembler::S, 3, rscratch1); 6649 6650 __ mov(vmul0, Assembler::T16B, 0); 6651 __ mov(vmul0, Assembler::S, 3, result); 6652 6653 __ andr(rscratch2, cnt, (uf - 1) * vf); 6654 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 6655 6656 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf));
6657 __ mov(vpowm, Assembler::S, 0, rscratch1);
6658
6659 // SMALL LOOP
6660 __ bind(SMALL_LOOP);
6661
6662 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
6663 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
6664 __ subsw(rscratch2, rscratch2, vf);
6665
6666 if (load_arrangement == Assembler::T8B) {
6667 // Extend 8B to 8H to be able to use vector multiply
6668 // instructions
6669 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
6670 if (is_signed_subword_type(eltype)) {
6671 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
6672 } else {
6673 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
6674 }
6675 }
6676
6677 switch (load_arrangement) {
6678 case Assembler::T4S:
6679 __ addv(vmul0, load_arrangement, vmul0, vdata0);
6680 break;
6681 case Assembler::T8B:
6682 case Assembler::T8H:
6683 assert(is_subword_type(eltype), "subword type expected");
6684 if (is_signed_subword_type(eltype)) {
6685 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
6686 } else {
6687 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
6688 }
6689 break;
6690 default:
6691 __ should_not_reach_here();
6692 }
6693
6694 // Process the upper half of a vector
6695 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
6696 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
6697 if (is_signed_subword_type(eltype)) {
6698 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
6699 } else {
6700 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
6701 }
6702 }
6703
6704 __ br(Assembler::HI, SMALL_LOOP);
6705
6706 // SMALL LOOP'S EPILOGUE
6707 __ lsr(rscratch2, cnt, exact_log2(evf));
6708 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
6709
6710 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
6711 __ addv(vmul0, Assembler::T4S, vmul0);
6712 __ umov(result, vmul0, Assembler::S, 0);
6713
6714 // TAIL
6715 __ bind(TAIL);
6716
6717 // The andr computes cnt % vf. Subtracting that value, shifted left by 3, from BR_BASE
6718 // skips the first vf - 1 - (cnt % vf) load + madd pairs, i.e. only cnt % vf load + madd pairs execute.
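// Illustrative sketch, not generated code: assuming vf == 8 (byte/char/short
// elements), the unrolled sequence below contains vf - 1 == 7 load + madd
// pairs, each performing one step of
//   result = 31 * result + (int)ary[i];
// Each pair is two 4-byte instructions, so for e.g. cnt % vf == 3 the adr/sub
// pair computes BR_BASE - 3 * 8 bytes, branching over the first 4 pairs so
// that exactly the last 3 execute. rscratch2 is then reloaded with 0x1f (= 31)
// to serve as the maddw multiplier.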
6719 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 6720 __ andr(rscratch2, cnt, vf - 1); 6721 __ bind(TAIL_SHORTCUT); 6722 __ adr(rscratch1, BR_BASE); 6723 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3); 6724 __ movw(rscratch2, 0x1f); 6725 __ br(rscratch1); 6726 6727 for (size_t i = 0; i < vf - 1; ++i) { 6728 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 6729 eltype); 6730 __ maddw(result, result, rscratch2, rscratch1); 6731 } 6732 __ bind(BR_BASE); 6733 6734 __ leave(); 6735 __ ret(lr); 6736 6737 // LARGE LOOP 6738 __ bind(LARGE_LOOP_PREHEADER); 6739 6740 __ lsr(rscratch2, cnt, exact_log2(evf)); 6741 6742 if (multiply_by_halves) { 6743 // 31^4 - multiplier between lower and upper parts of a register 6744 __ movw(rscratch1, intpow(31U, vf / 2)); 6745 __ mov(vpowm, Assembler::S, 1, rscratch1); 6746 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 6747 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 6748 __ mov(vpowm, Assembler::S, 0, rscratch1); 6749 } else { 6750 // 31^16 6751 __ movw(rscratch1, intpow(31U, evf)); 6752 __ mov(vpowm, Assembler::S, 0, rscratch1); 6753 } 6754 6755 __ mov(vmul3, Assembler::T16B, 0); 6756 __ mov(vmul2, Assembler::T16B, 0); 6757 __ mov(vmul1, Assembler::T16B, 0); 6758 6759 __ bind(LARGE_LOOP); 6760 6761 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 6762 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 6763 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 6764 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 6765 6766 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 6767 Address(__ post(ary, evf * type2aelembytes(eltype)))); 6768 6769 if (load_arrangement == Assembler::T8B) { 6770 // Extend 8B to 8H to be able to use vector multiply 6771 // instructions 6772 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 6773 if (is_signed_subword_type(eltype)) { 6774 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 6775 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 6776 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 6777 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6778 } else { 6779 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 6780 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 6781 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 6782 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6783 } 6784 } 6785 6786 switch (load_arrangement) { 6787 case Assembler::T4S: 6788 __ addv(vmul3, load_arrangement, vmul3, vdata3); 6789 __ addv(vmul2, load_arrangement, vmul2, vdata2); 6790 __ addv(vmul1, load_arrangement, vmul1, vdata1); 6791 __ addv(vmul0, load_arrangement, vmul0, vdata0); 6792 break; 6793 case Assembler::T8B: 6794 case Assembler::T8H: 6795 assert(is_subword_type(eltype), "subword type expected"); 6796 if (is_signed_subword_type(eltype)) { 6797 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 6798 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 6799 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 6800 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6801 } else { 6802 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 6803 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 6804 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 6805 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6806 } 6807 break; 6808 default: 6809 __ should_not_reach_here(); 
6810 } 6811 6812 // Process the upper half of a vector 6813 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 6814 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 6815 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 6816 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 6817 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 6818 if (is_signed_subword_type(eltype)) { 6819 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 6820 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 6821 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 6822 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6823 } else { 6824 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 6825 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 6826 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 6827 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6828 } 6829 } 6830 6831 __ subsw(rscratch2, rscratch2, 1); 6832 __ br(Assembler::HI, LARGE_LOOP); 6833 6834 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 6835 __ addv(vmul3, Assembler::T4S, vmul3); 6836 __ umov(result, vmul3, Assembler::S, 0); 6837 6838 __ mov(rscratch2, intpow(31U, vf)); 6839 6840 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 6841 __ addv(vmul2, Assembler::T4S, vmul2); 6842 __ umov(rscratch1, vmul2, Assembler::S, 0); 6843 __ maddw(result, result, rscratch2, rscratch1); 6844 6845 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 6846 __ addv(vmul1, Assembler::T4S, vmul1); 6847 __ umov(rscratch1, vmul1, Assembler::S, 0); 6848 __ maddw(result, result, rscratch2, rscratch1); 6849 6850 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 6851 __ addv(vmul0, Assembler::T4S, vmul0); 6852 __ umov(rscratch1, vmul0, Assembler::S, 0); 6853 __ maddw(result, result, rscratch2, rscratch1); 6854 6855 __ andr(rscratch2, cnt, vf - 1); 6856 __ cbnz(rscratch2, TAIL_SHORTCUT); 6857 6858 __ leave(); 6859 __ ret(lr); 6860 6861 return entry; 6862 } 6863 6864 address generate_dsin_dcos(bool isCos) { 6865 __ align(CodeEntryAlignment); 6866 StubGenStubId stub_id = (isCos ? 
StubGenStubId::dcos_id : StubGenStubId::dsin_id); 6867 StubCodeMark mark(this, stub_id); 6868 address start = __ pc(); 6869 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 6870 (address)StubRoutines::aarch64::_two_over_pi, 6871 (address)StubRoutines::aarch64::_pio2, 6872 (address)StubRoutines::aarch64::_dsin_coef, 6873 (address)StubRoutines::aarch64::_dcos_coef); 6874 return start; 6875 } 6876 6877 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 6878 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 6879 Label &DIFF2) { 6880 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 6881 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 6882 6883 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 6884 __ ldr(tmpU, Address(__ post(cnt1, 8))); 6885 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 6886 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 6887 6888 __ fmovd(tmpL, vtmp3); 6889 __ eor(rscratch2, tmp3, tmpL); 6890 __ cbnz(rscratch2, DIFF2); 6891 6892 __ ldr(tmp3, Address(__ post(cnt1, 8))); 6893 __ umov(tmpL, vtmp3, __ D, 1); 6894 __ eor(rscratch2, tmpU, tmpL); 6895 __ cbnz(rscratch2, DIFF1); 6896 6897 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 6898 __ ldr(tmpU, Address(__ post(cnt1, 8))); 6899 __ fmovd(tmpL, vtmp); 6900 __ eor(rscratch2, tmp3, tmpL); 6901 __ cbnz(rscratch2, DIFF2); 6902 6903 __ ldr(tmp3, Address(__ post(cnt1, 8))); 6904 __ umov(tmpL, vtmp, __ D, 1); 6905 __ eor(rscratch2, tmpU, tmpL); 6906 __ cbnz(rscratch2, DIFF1); 6907 } 6908 6909 // r0 = result 6910 // r1 = str1 6911 // r2 = cnt1 6912 // r3 = str2 6913 // r4 = cnt2 6914 // r10 = tmp1 6915 // r11 = tmp2 6916 address generate_compare_long_string_different_encoding(bool isLU) { 6917 __ align(CodeEntryAlignment); 6918 StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id); 6919 StubCodeMark mark(this, stub_id); 6920 address entry = __ pc(); 6921 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 6922 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 6923 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 6924 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 6925 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 6926 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 6927 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 6928 6929 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 6930 6931 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 6932 // cnt2 == amount of characters left to compare 6933 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 6934 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 6935 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 6936 __ add(str2, str2, isLU ? wordSize : wordSize/2); 6937 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 6938 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 6939 __ eor(rscratch2, tmp1, tmp2); 6940 __ mov(rscratch1, tmp2); 6941 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 6942 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 6943 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 6944 __ push(spilled_regs, sp); 6945 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 6946 __ mov(cnt1, isLU ? 
str2 : str1); // init the pointer to U next load 6947 6948 __ ldr(tmp3, Address(__ post(cnt1, 8))); 6949 6950 if (SoftwarePrefetchHintDistance >= 0) { 6951 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 6952 __ br(__ LT, NO_PREFETCH); 6953 __ bind(LARGE_LOOP_PREFETCH); 6954 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 6955 __ mov(tmp4, 2); 6956 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 6957 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 6958 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 6959 __ subs(tmp4, tmp4, 1); 6960 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 6961 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 6962 __ mov(tmp4, 2); 6963 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 6964 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 6965 __ subs(tmp4, tmp4, 1); 6966 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 6967 __ sub(cnt2, cnt2, 64); 6968 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 6969 __ br(__ GE, LARGE_LOOP_PREFETCH); 6970 } 6971 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 6972 __ bind(NO_PREFETCH); 6973 __ subs(cnt2, cnt2, 16); 6974 __ br(__ LT, TAIL); 6975 __ align(OptoLoopAlignment); 6976 __ bind(SMALL_LOOP); // smaller loop 6977 __ subs(cnt2, cnt2, 16); 6978 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 6979 __ br(__ GE, SMALL_LOOP); 6980 __ cmn(cnt2, (u1)16); 6981 __ br(__ EQ, LOAD_LAST); 6982 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 6983 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 6984 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 6985 __ ldr(tmp3, Address(cnt1, -8)); 6986 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 6987 __ b(LOAD_LAST); 6988 __ bind(DIFF2); 6989 __ mov(tmpU, tmp3); 6990 __ bind(DIFF1); 6991 __ pop(spilled_regs, sp); 6992 __ b(CALCULATE_DIFFERENCE); 6993 __ bind(LOAD_LAST); 6994 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 6995 // No need to load it again 6996 __ mov(tmpU, tmp3); 6997 __ pop(spilled_regs, sp); 6998 6999 // tmp2 points to the address of the last 4 Latin1 characters right now 7000 __ ldrs(vtmp, Address(tmp2)); 7001 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 7002 __ fmovd(tmpL, vtmp); 7003 7004 __ eor(rscratch2, tmpU, tmpL); 7005 __ cbz(rscratch2, DONE); 7006 7007 // Find the first different characters in the longwords and 7008 // compute their difference. 
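// For exposition: rscratch2 holds the XOR of the two 8-byte chunks being
// compared, so its lowest non-zero 16-bit lane marks the first differing
// character. rev reverses the byte order so that clz counts from the low
// (first-in-memory) end; andr with -16 rounds the resulting bit index down
// to a 16-bit lane boundary; lsrv then shifts that lane to the bottom of
// each word, and uxthw/subw produce the difference of the two characters.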
7009 __ bind(CALCULATE_DIFFERENCE); 7010 __ rev(rscratch2, rscratch2); 7011 __ clz(rscratch2, rscratch2); 7012 __ andr(rscratch2, rscratch2, -16); 7013 __ lsrv(tmp1, tmp1, rscratch2); 7014 __ uxthw(tmp1, tmp1); 7015 __ lsrv(rscratch1, rscratch1, rscratch2); 7016 __ uxthw(rscratch1, rscratch1); 7017 __ subw(result, tmp1, rscratch1); 7018 __ bind(DONE); 7019 __ ret(lr); 7020 return entry; 7021 } 7022 7023 // r0 = input (float16) 7024 // v0 = result (float) 7025 // v1 = temporary float register 7026 address generate_float16ToFloat() { 7027 __ align(CodeEntryAlignment); 7028 StubGenStubId stub_id = StubGenStubId::hf2f_id; 7029 StubCodeMark mark(this, stub_id); 7030 address entry = __ pc(); 7031 BLOCK_COMMENT("Entry:"); 7032 __ flt16_to_flt(v0, r0, v1); 7033 __ ret(lr); 7034 return entry; 7035 } 7036 7037 // v0 = input (float) 7038 // r0 = result (float16) 7039 // v1 = temporary float register 7040 address generate_floatToFloat16() { 7041 __ align(CodeEntryAlignment); 7042 StubGenStubId stub_id = StubGenStubId::f2hf_id; 7043 StubCodeMark mark(this, stub_id); 7044 address entry = __ pc(); 7045 BLOCK_COMMENT("Entry:"); 7046 __ flt_to_flt16(r0, v0, v1); 7047 __ ret(lr); 7048 return entry; 7049 } 7050 7051 address generate_method_entry_barrier() { 7052 __ align(CodeEntryAlignment); 7053 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 7054 StubCodeMark mark(this, stub_id); 7055 7056 Label deoptimize_label; 7057 7058 address start = __ pc(); 7059 7060 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 7061 7062 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 7063 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 7064 // We can get here despite the nmethod being good, if we have not 7065 // yet applied our cross modification fence (or data fence). 7066 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 7067 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 7068 __ ldrw(rscratch2, rscratch2); 7069 __ strw(rscratch2, thread_epoch_addr); 7070 __ isb(); 7071 __ membar(__ LoadLoad); 7072 } 7073 7074 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 7075 7076 __ enter(); 7077 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 7078 7079 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 7080 7081 __ push_call_clobbered_registers(); 7082 7083 __ mov(c_rarg0, rscratch2); 7084 __ call_VM_leaf 7085 (CAST_FROM_FN_PTR 7086 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 7087 7088 __ reset_last_Java_frame(true); 7089 7090 __ mov(rscratch1, r0); 7091 7092 __ pop_call_clobbered_registers(); 7093 7094 __ cbnz(rscratch1, deoptimize_label); 7095 7096 __ leave(); 7097 __ ret(lr); 7098 7099 __ BIND(deoptimize_label); 7100 7101 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 7102 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 7103 7104 __ mov(sp, rscratch1); 7105 __ br(rscratch2); 7106 7107 return start; 7108 } 7109 7110 // r0 = result 7111 // r1 = str1 7112 // r2 = cnt1 7113 // r3 = str2 7114 // r4 = cnt2 7115 // r10 = tmp1 7116 // r11 = tmp2 7117 address generate_compare_long_string_same_encoding(bool isLL) { 7118 __ align(CodeEntryAlignment); 7119 StubGenStubId stub_id = (isLL ? 
StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id); 7120 StubCodeMark mark(this, stub_id); 7121 address entry = __ pc(); 7122 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 7123 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 7124 7125 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 7126 7127 // exit from large loop when less than 64 bytes left to read or we're about 7128 // to prefetch memory behind array border 7129 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 7130 7131 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 7132 __ eor(rscratch2, tmp1, tmp2); 7133 __ cbnz(rscratch2, CAL_DIFFERENCE); 7134 7135 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 7136 // update pointers, because of previous read 7137 __ add(str1, str1, wordSize); 7138 __ add(str2, str2, wordSize); 7139 if (SoftwarePrefetchHintDistance >= 0) { 7140 __ align(OptoLoopAlignment); 7141 __ bind(LARGE_LOOP_PREFETCH); 7142 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 7143 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 7144 7145 for (int i = 0; i < 4; i++) { 7146 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 7147 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 7148 __ cmp(tmp1, tmp2); 7149 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 7150 __ br(Assembler::NE, DIFF); 7151 } 7152 __ sub(cnt2, cnt2, isLL ? 64 : 32); 7153 __ add(str1, str1, 64); 7154 __ add(str2, str2, 64); 7155 __ subs(rscratch2, cnt2, largeLoopExitCondition); 7156 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 7157 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 7158 } 7159 7160 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 7161 __ br(Assembler::LE, LESS16); 7162 __ align(OptoLoopAlignment); 7163 __ bind(LOOP_COMPARE16); 7164 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 7165 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 7166 __ cmp(tmp1, tmp2); 7167 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 7168 __ br(Assembler::NE, DIFF); 7169 __ sub(cnt2, cnt2, isLL ? 16 : 8); 7170 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 7171 __ br(Assembler::LT, LESS16); 7172 7173 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 7174 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 7175 __ cmp(tmp1, tmp2); 7176 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 7177 __ br(Assembler::NE, DIFF); 7178 __ sub(cnt2, cnt2, isLL ? 16 : 8); 7179 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 7180 __ br(Assembler::GE, LOOP_COMPARE16); 7181 __ cbz(cnt2, LENGTH_DIFF); 7182 7183 __ bind(LESS16); 7184 // each 8 compare 7185 __ subs(cnt2, cnt2, isLL ? 8 : 4); 7186 __ br(Assembler::LE, LESS8); 7187 __ ldr(tmp1, Address(__ post(str1, 8))); 7188 __ ldr(tmp2, Address(__ post(str2, 8))); 7189 __ eor(rscratch2, tmp1, tmp2); 7190 __ cbnz(rscratch2, CAL_DIFFERENCE); 7191 __ sub(cnt2, cnt2, isLL ? 
8 : 4); 7192 7193 __ bind(LESS8); // directly load last 8 bytes 7194 if (!isLL) { 7195 __ add(cnt2, cnt2, cnt2); 7196 } 7197 __ ldr(tmp1, Address(str1, cnt2)); 7198 __ ldr(tmp2, Address(str2, cnt2)); 7199 __ eor(rscratch2, tmp1, tmp2); 7200 __ cbz(rscratch2, LENGTH_DIFF); 7201 __ b(CAL_DIFFERENCE); 7202 7203 __ bind(DIFF); 7204 __ cmp(tmp1, tmp2); 7205 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 7206 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 7207 // reuse rscratch2 register for the result of eor instruction 7208 __ eor(rscratch2, tmp1, tmp2); 7209 7210 __ bind(CAL_DIFFERENCE); 7211 __ rev(rscratch2, rscratch2); 7212 __ clz(rscratch2, rscratch2); 7213 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 7214 __ lsrv(tmp1, tmp1, rscratch2); 7215 __ lsrv(tmp2, tmp2, rscratch2); 7216 if (isLL) { 7217 __ uxtbw(tmp1, tmp1); 7218 __ uxtbw(tmp2, tmp2); 7219 } else { 7220 __ uxthw(tmp1, tmp1); 7221 __ uxthw(tmp2, tmp2); 7222 } 7223 __ subw(result, tmp1, tmp2); 7224 7225 __ bind(LENGTH_DIFF); 7226 __ ret(lr); 7227 return entry; 7228 } 7229 7230 enum string_compare_mode { 7231 LL, 7232 LU, 7233 UL, 7234 UU, 7235 }; 7236 7237 // The following registers are declared in aarch64.ad 7238 // r0 = result 7239 // r1 = str1 7240 // r2 = cnt1 7241 // r3 = str2 7242 // r4 = cnt2 7243 // r10 = tmp1 7244 // r11 = tmp2 7245 // z0 = ztmp1 7246 // z1 = ztmp2 7247 // p0 = pgtmp1 7248 // p1 = pgtmp2 7249 address generate_compare_long_string_sve(string_compare_mode mode) { 7250 StubGenStubId stub_id; 7251 switch (mode) { 7252 case LL: stub_id = StubGenStubId::compare_long_string_LL_id; break; 7253 case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break; 7254 case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break; 7255 case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break; 7256 default: ShouldNotReachHere(); 7257 } 7258 7259 __ align(CodeEntryAlignment); 7260 address entry = __ pc(); 7261 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 7262 tmp1 = r10, tmp2 = r11; 7263 7264 Label LOOP, DONE, MISMATCH; 7265 Register vec_len = tmp1; 7266 Register idx = tmp2; 7267 // The minimum of the string lengths has been stored in cnt2. 7268 Register cnt = cnt2; 7269 FloatRegister ztmp1 = z0, ztmp2 = z1; 7270 PRegister pgtmp1 = p0, pgtmp2 = p1; 7271 7272 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 7273 switch (mode) { \ 7274 case LL: \ 7275 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 7276 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 7277 break; \ 7278 case LU: \ 7279 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 7280 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 7281 break; \ 7282 case UL: \ 7283 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 7284 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 7285 break; \ 7286 case UU: \ 7287 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 7288 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 7289 break; \ 7290 default: \ 7291 ShouldNotReachHere(); \ 7292 } 7293 7294 StubCodeMark mark(this, stub_id); 7295 7296 __ mov(idx, 0); 7297 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 7298 7299 if (mode == LL) { 7300 __ sve_cntb(vec_len); 7301 } else { 7302 __ sve_cnth(vec_len); 7303 } 7304 7305 __ sub(rscratch1, cnt, vec_len); 7306 7307 __ bind(LOOP); 7308 7309 // main loop 7310 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 7311 __ add(idx, idx, vec_len); 7312 // Compare strings. 
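// (The sve_cmp below sets, in pgtmp2, every active lane whose characters
// differ; if any lane is set the flags report NE and we branch to MISMATCH.)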
7313 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
7314 __ br(__ NE, MISMATCH);
7315 __ cmp(idx, rscratch1);
7316 __ br(__ LT, LOOP);
7317
7318 // post loop, last iteration
7319 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
7320
7321 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
7322 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
7323 __ br(__ EQ, DONE);
7324
7325 __ bind(MISMATCH);
7326
7327 // Crop the vector to find its location.
7328 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
7329 // Extract the first different characters of each string.
7330 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
7331 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
7332
7333 // Compute the difference of the first different characters.
7334 __ sub(result, rscratch1, rscratch2);
7335
7336 __ bind(DONE);
7337 __ ret(lr);
7338 #undef LOAD_PAIR
7339 return entry;
7340 }
7341
7342 void generate_compare_long_strings() {
7343 if (UseSVE == 0) {
7344 StubRoutines::aarch64::_compare_long_string_LL
7345 = generate_compare_long_string_same_encoding(true);
7346 StubRoutines::aarch64::_compare_long_string_UU
7347 = generate_compare_long_string_same_encoding(false);
7348 StubRoutines::aarch64::_compare_long_string_LU
7349 = generate_compare_long_string_different_encoding(true);
7350 StubRoutines::aarch64::_compare_long_string_UL
7351 = generate_compare_long_string_different_encoding(false);
7352 } else {
7353 StubRoutines::aarch64::_compare_long_string_LL
7354 = generate_compare_long_string_sve(LL);
7355 StubRoutines::aarch64::_compare_long_string_UU
7356 = generate_compare_long_string_sve(UU);
7357 StubRoutines::aarch64::_compare_long_string_LU
7358 = generate_compare_long_string_sve(LU);
7359 StubRoutines::aarch64::_compare_long_string_UL
7360 = generate_compare_long_string_sve(UL);
7361 }
7362 }
7363
7364 // R0 = result
7365 // R1 = str2
7366 // R2 = cnt1
7367 // R3 = str1
7368 // R4 = cnt2
7369 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
7370 //
7371 // This generic linear code uses a few additional ideas that make it faster:
7372 // 1) we can safely keep at least the 1st register of the pattern (since its length >= 8),
7373 // which lets us skip the initial load (this helps on systems with a single load pipeline)
7374 // 2) we can use the "fast" algorithm for finding a single character, searching for the
7375 // first symbol with fewer branches (one branch per loaded register instead of
7376 // one per symbol); this is where constants like
7377 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
7378 // 3) after loading and analyzing the 1st register of the source string, it can be
7379 // reused to search for every 1st-character match, saving a few loads compared
7380 // with a simpler-but-slower implementation
7381 // 4) to avoid lots of push/pop operations, the code below heavily reuses,
7382 // re-initializes and compresses register values, which makes the code
7383 // larger and a bit less readable; however, most of the extra operations are
7384 // issued during loads or branches, so the penalty is minimal
7385 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
7386 StubGenStubId stub_id;
7387 if (str1_isL) {
7388 if (str2_isL) {
7389 stub_id = StubGenStubId::string_indexof_linear_ll_id;
7390 } else {
7391 stub_id = StubGenStubId::string_indexof_linear_ul_id;
7392 }
7393 } else {
7394 if (str2_isL) {
7395 ShouldNotReachHere();
7396 } else {
7397 stub_id =
StubGenStubId::string_indexof_linear_uu_id; 7398 } 7399 } 7400 __ align(CodeEntryAlignment); 7401 StubCodeMark mark(this, stub_id); 7402 address entry = __ pc(); 7403 7404 int str1_chr_size = str1_isL ? 1 : 2; 7405 int str2_chr_size = str2_isL ? 1 : 2; 7406 int str1_chr_shift = str1_isL ? 0 : 1; 7407 int str2_chr_shift = str2_isL ? 0 : 1; 7408 bool isL = str1_isL && str2_isL; 7409 // parameters 7410 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 7411 // temporary registers 7412 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 7413 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 7414 // redefinitions 7415 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 7416 7417 __ push(spilled_regs, sp); 7418 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 7419 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 7420 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 7421 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 7422 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 7423 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 7424 // Read whole register from str1. It is safe, because length >=8 here 7425 __ ldr(ch1, Address(str1)); 7426 // Read whole register from str2. It is safe, because length >=8 here 7427 __ ldr(ch2, Address(str2)); 7428 __ sub(cnt2, cnt2, cnt1); 7429 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 7430 if (str1_isL != str2_isL) { 7431 __ eor(v0, __ T16B, v0, v0); 7432 } 7433 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 7434 __ mul(first, first, tmp1); 7435 // check if we have less than 1 register to check 7436 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 7437 if (str1_isL != str2_isL) { 7438 __ fmovd(v1, ch1); 7439 } 7440 __ br(__ LE, L_SMALL); 7441 __ eor(ch2, first, ch2); 7442 if (str1_isL != str2_isL) { 7443 __ zip1(v1, __ T16B, v1, v0); 7444 } 7445 __ sub(tmp2, ch2, tmp1); 7446 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7447 __ bics(tmp2, tmp2, ch2); 7448 if (str1_isL != str2_isL) { 7449 __ fmovd(ch1, v1); 7450 } 7451 __ br(__ NE, L_HAS_ZERO); 7452 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 7453 __ add(result, result, wordSize/str2_chr_size); 7454 __ add(str2, str2, wordSize); 7455 __ br(__ LT, L_POST_LOOP); 7456 __ BIND(L_LOOP); 7457 __ ldr(ch2, Address(str2)); 7458 __ eor(ch2, first, ch2); 7459 __ sub(tmp2, ch2, tmp1); 7460 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7461 __ bics(tmp2, tmp2, ch2); 7462 __ br(__ NE, L_HAS_ZERO); 7463 __ BIND(L_LOOP_PROCEED); 7464 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 7465 __ add(str2, str2, wordSize); 7466 __ add(result, result, wordSize/str2_chr_size); 7467 __ br(__ GE, L_LOOP); 7468 __ BIND(L_POST_LOOP); 7469 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 7470 __ br(__ LE, NOMATCH); 7471 __ ldr(ch2, Address(str2)); 7472 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 7473 __ eor(ch2, first, ch2); 7474 __ sub(tmp2, ch2, tmp1); 7475 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7476 __ mov(tmp4, -1); // all bits set 7477 __ b(L_SMALL_PROCEED); 7478 __ align(OptoLoopAlignment); 7479 __ BIND(L_SMALL); 7480 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 7481 __ eor(ch2, first, ch2); 7482 if (str1_isL != str2_isL) { 7483 __ zip1(v1, __ T16B, v1, v0); 7484 } 7485 __ sub(tmp2, ch2, tmp1); 7486 __ mov(tmp4, -1); // all bits set 7487 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7488 if (str1_isL != str2_isL) { 7489 __ fmovd(ch1, v1); // move converted 4 symbols 7490 } 7491 __ BIND(L_SMALL_PROCEED); 7492 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 7493 __ bic(tmp2, tmp2, ch2); 7494 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 7495 __ rbit(tmp2, tmp2); 7496 __ br(__ EQ, NOMATCH); 7497 __ BIND(L_SMALL_HAS_ZERO_LOOP); 7498 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 7499 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 7500 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 7501 if (str2_isL) { // LL 7502 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 7503 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 7504 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 7505 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 7506 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7507 } else { 7508 __ mov(ch2, 0xE); // all bits in byte set except last one 7509 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7510 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7511 __ lslv(tmp2, tmp2, tmp4); 7512 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7513 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7514 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7515 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7516 } 7517 __ cmp(ch1, ch2); 7518 __ mov(tmp4, wordSize/str2_chr_size); 7519 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 7520 __ BIND(L_SMALL_CMP_LOOP); 7521 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 7522 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 7523 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 7524 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 7525 __ add(tmp4, tmp4, 1); 7526 __ cmp(tmp4, cnt1); 7527 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 7528 __ cmp(first, ch2); 7529 __ br(__ EQ, L_SMALL_CMP_LOOP); 7530 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 7531 __ cbz(tmp2, NOMATCH); // no more matches. exit 7532 __ clz(tmp4, tmp2); 7533 __ add(result, result, 1); // advance index 7534 __ add(str2, str2, str2_chr_size); // advance pointer 7535 __ b(L_SMALL_HAS_ZERO_LOOP); 7536 __ align(OptoLoopAlignment); 7537 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 7538 __ cmp(first, ch2); 7539 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 7540 __ b(DONE); 7541 __ align(OptoLoopAlignment); 7542 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 7543 if (str2_isL) { // LL 7544 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 7545 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 7546 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 7547 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 7548 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7549 } else { 7550 __ mov(ch2, 0xE); // all bits in byte set except last one 7551 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7552 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
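// Note: tmp4 is a bit index into the match mask, so shifting it right by
// LogBitsPerByte yields a byte offset; masking that offset with 0xE keeps it
// even, so the 8-byte read above stays aligned to a UTF-16 character.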
7553 __ lslv(tmp2, tmp2, tmp4); 7554 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7555 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7556 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7557 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7558 } 7559 __ cmp(ch1, ch2); 7560 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 7561 __ b(DONE); 7562 __ align(OptoLoopAlignment); 7563 __ BIND(L_HAS_ZERO); 7564 __ rbit(tmp2, tmp2); 7565 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 7566 // Now, perform compression of counters(cnt2 and cnt1) into one register. 7567 // It's fine because both counters are 32bit and are not changed in this 7568 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 7569 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 7570 __ sub(result, result, 1); 7571 __ BIND(L_HAS_ZERO_LOOP); 7572 __ mov(cnt1, wordSize/str2_chr_size); 7573 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 7574 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 7575 if (str2_isL) { 7576 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 7577 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7578 __ lslv(tmp2, tmp2, tmp4); 7579 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7580 __ add(tmp4, tmp4, 1); 7581 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7582 __ lsl(tmp2, tmp2, 1); 7583 __ mov(tmp4, wordSize/str2_chr_size); 7584 } else { 7585 __ mov(ch2, 0xE); 7586 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7587 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7588 __ lslv(tmp2, tmp2, tmp4); 7589 __ add(tmp4, tmp4, 1); 7590 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7591 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 7592 __ lsl(tmp2, tmp2, 1); 7593 __ mov(tmp4, wordSize/str2_chr_size); 7594 __ sub(str2, str2, str2_chr_size); 7595 } 7596 __ cmp(ch1, ch2); 7597 __ mov(tmp4, wordSize/str2_chr_size); 7598 __ br(__ NE, L_CMP_LOOP_NOMATCH); 7599 __ BIND(L_CMP_LOOP); 7600 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 7601 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 7602 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 7603 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 7604 __ add(tmp4, tmp4, 1); 7605 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 7606 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 7607 __ cmp(cnt1, ch2); 7608 __ br(__ EQ, L_CMP_LOOP); 7609 __ BIND(L_CMP_LOOP_NOMATCH); 7610 // here we're not matched 7611 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 7612 __ clz(tmp4, tmp2); 7613 __ add(str2, str2, str2_chr_size); // advance pointer 7614 __ b(L_HAS_ZERO_LOOP); 7615 __ align(OptoLoopAlignment); 7616 __ BIND(L_CMP_LOOP_LAST_CMP); 7617 __ cmp(cnt1, ch2); 7618 __ br(__ NE, L_CMP_LOOP_NOMATCH); 7619 __ b(DONE); 7620 __ align(OptoLoopAlignment); 7621 __ BIND(L_CMP_LOOP_LAST_CMP2); 7622 if (str2_isL) { 7623 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 7624 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
7625 __ lslv(tmp2, tmp2, tmp4); 7626 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7627 __ add(tmp4, tmp4, 1); 7628 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7629 __ lsl(tmp2, tmp2, 1); 7630 } else { 7631 __ mov(ch2, 0xE); 7632 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7633 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7634 __ lslv(tmp2, tmp2, tmp4); 7635 __ add(tmp4, tmp4, 1); 7636 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7637 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 7638 __ lsl(tmp2, tmp2, 1); 7639 __ sub(str2, str2, str2_chr_size); 7640 } 7641 __ cmp(ch1, ch2); 7642 __ br(__ NE, L_CMP_LOOP_NOMATCH); 7643 __ b(DONE); 7644 __ align(OptoLoopAlignment); 7645 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 7646 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 7647 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 7648 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 7649 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 7650 // result by analyzed characters value, so, we can just reset lower bits 7651 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 7652 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 7653 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 7654 // index of last analyzed substring inside current octet. So, str2 in at 7655 // respective start address. We need to advance it to next octet 7656 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 7657 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 7658 __ bfm(result, zr, 0, 2 - str2_chr_shift); 7659 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 7660 __ movw(cnt2, cnt2); 7661 __ b(L_LOOP_PROCEED); 7662 __ align(OptoLoopAlignment); 7663 __ BIND(NOMATCH); 7664 __ mov(result, -1); 7665 __ BIND(DONE); 7666 __ pop(spilled_regs, sp); 7667 __ ret(lr); 7668 return entry; 7669 } 7670 7671 void generate_string_indexof_stubs() { 7672 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 7673 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 7674 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 7675 } 7676 7677 void inflate_and_store_2_fp_registers(bool generatePrfm, 7678 FloatRegister src1, FloatRegister src2) { 7679 Register dst = r1; 7680 __ zip1(v1, __ T16B, src1, v0); 7681 __ zip2(v2, __ T16B, src1, v0); 7682 if (generatePrfm) { 7683 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 7684 } 7685 __ zip1(v3, __ T16B, src2, v0); 7686 __ zip2(v4, __ T16B, src2, v0); 7687 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 7688 } 7689 7690 // R0 = src 7691 // R1 = dst 7692 // R2 = len 7693 // R3 = len >> 3 7694 // V0 = 0 7695 // v1 = loaded 8 bytes 7696 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 7697 address generate_large_byte_array_inflate() { 7698 __ align(CodeEntryAlignment); 7699 StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id; 7700 StubCodeMark mark(this, stub_id); 7701 address entry = __ pc(); 7702 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 7703 Register src = r0, dst = r1, len = r2, octetCounter = r3; 7704 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 7705 7706 // do one more 8-byte read to have address 16-byte aligned in 
most cases 7707 // also use single store instruction 7708 __ ldrd(v2, __ post(src, 8)); 7709 __ sub(octetCounter, octetCounter, 2); 7710 __ zip1(v1, __ T16B, v1, v0); 7711 __ zip1(v2, __ T16B, v2, v0); 7712 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 7713 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 7714 __ subs(rscratch1, octetCounter, large_loop_threshold); 7715 __ br(__ LE, LOOP_START); 7716 __ b(LOOP_PRFM_START); 7717 __ bind(LOOP_PRFM); 7718 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 7719 __ bind(LOOP_PRFM_START); 7720 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 7721 __ sub(octetCounter, octetCounter, 8); 7722 __ subs(rscratch1, octetCounter, large_loop_threshold); 7723 inflate_and_store_2_fp_registers(true, v3, v4); 7724 inflate_and_store_2_fp_registers(true, v5, v6); 7725 __ br(__ GT, LOOP_PRFM); 7726 __ cmp(octetCounter, (u1)8); 7727 __ br(__ LT, DONE); 7728 __ bind(LOOP); 7729 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 7730 __ bind(LOOP_START); 7731 __ sub(octetCounter, octetCounter, 8); 7732 __ cmp(octetCounter, (u1)8); 7733 inflate_and_store_2_fp_registers(false, v3, v4); 7734 inflate_and_store_2_fp_registers(false, v5, v6); 7735 __ br(__ GE, LOOP); 7736 __ bind(DONE); 7737 __ ret(lr); 7738 return entry; 7739 } 7740 7741 /** 7742 * Arguments: 7743 * 7744 * Input: 7745 * c_rarg0 - current state address 7746 * c_rarg1 - H key address 7747 * c_rarg2 - data address 7748 * c_rarg3 - number of blocks 7749 * 7750 * Output: 7751 * Updated state at c_rarg0 7752 */ 7753 address generate_ghash_processBlocks() { 7754 // Bafflingly, GCM uses little-endian for the byte order, but 7755 // big-endian for the bit order. For example, the polynomial 1 is 7756 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 7757 // 7758 // So, we must either reverse the bytes in each word and do 7759 // everything big-endian or reverse the bits in each byte and do 7760 // it little-endian. On AArch64 it's more idiomatic to reverse 7761 // the bits in each byte (we have an instruction, RBIT, to do 7762 // that) and keep the data in little-endian bit order through the 7763 // calculation, bit-reversing the inputs and outputs. 7764 7765 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id; 7766 StubCodeMark mark(this, stub_id); 7767 __ align(wordSize * 2); 7768 address p = __ pc(); 7769 __ emit_int64(0x87); // The low-order bits of the field 7770 // polynomial (i.e. 
p = z^7+z^2+z+1) 7771 // repeated in the low and high parts of a 7772 // 128-bit vector 7773 __ emit_int64(0x87); 7774 7775 __ align(CodeEntryAlignment); 7776 address start = __ pc(); 7777 7778 Register state = c_rarg0; 7779 Register subkeyH = c_rarg1; 7780 Register data = c_rarg2; 7781 Register blocks = c_rarg3; 7782 7783 FloatRegister vzr = v30; 7784 __ eor(vzr, __ T16B, vzr, vzr); // zero register 7785 7786 __ ldrq(v24, p); // The field polynomial 7787 7788 __ ldrq(v0, Address(state)); 7789 __ ldrq(v1, Address(subkeyH)); 7790 7791 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 7792 __ rbit(v0, __ T16B, v0); 7793 __ rev64(v1, __ T16B, v1); 7794 __ rbit(v1, __ T16B, v1); 7795 7796 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 7797 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 7798 7799 { 7800 Label L_ghash_loop; 7801 __ bind(L_ghash_loop); 7802 7803 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 7804 // reversing each byte 7805 __ rbit(v2, __ T16B, v2); 7806 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 7807 7808 // Multiply state in v2 by subkey in v1 7809 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 7810 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 7811 /*temps*/v6, v3, /*reuse/clobber b*/v2); 7812 // Reduce v7:v5 by the field polynomial 7813 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 7814 7815 __ sub(blocks, blocks, 1); 7816 __ cbnz(blocks, L_ghash_loop); 7817 } 7818 7819 // The bit-reversed result is at this point in v0 7820 __ rev64(v0, __ T16B, v0); 7821 __ rbit(v0, __ T16B, v0); 7822 7823 __ st1(v0, __ T16B, state); 7824 __ ret(lr); 7825 7826 return start; 7827 } 7828 7829 address generate_ghash_processBlocks_wide() { 7830 address small = generate_ghash_processBlocks(); 7831 7832 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id; 7833 StubCodeMark mark(this, stub_id); 7834 __ align(wordSize * 2); 7835 address p = __ pc(); 7836 __ emit_int64(0x87); // The low-order bits of the field 7837 // polynomial (i.e. p = z^7+z^2+z+1) 7838 // repeated in the low and high parts of a 7839 // 128-bit vector 7840 __ emit_int64(0x87); 7841 7842 __ align(CodeEntryAlignment); 7843 address start = __ pc(); 7844 7845 Register state = c_rarg0; 7846 Register subkeyH = c_rarg1; 7847 Register data = c_rarg2; 7848 Register blocks = c_rarg3; 7849 7850 const int unroll = 4; 7851 7852 __ cmp(blocks, (unsigned char)(unroll * 2)); 7853 __ br(__ LT, small); 7854 7855 if (unroll > 1) { 7856 // Save state before entering routine 7857 __ sub(sp, sp, 4 * 16); 7858 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 7859 __ sub(sp, sp, 4 * 16); 7860 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 7861 } 7862 7863 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 7864 7865 if (unroll > 1) { 7866 // And restore state 7867 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 7868 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 7869 } 7870 7871 __ cmp(blocks, (unsigned char)0); 7872 __ br(__ GT, small); 7873 7874 __ ret(lr); 7875 7876 return start; 7877 } 7878 7879 void generate_base64_encode_simdround(Register src, Register dst, 7880 FloatRegister codec, u8 size) { 7881 7882 FloatRegister in0 = v4, in1 = v5, in2 = v6; 7883 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 7884 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 7885 7886 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 7887 7888 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 7889 7890 __ ushr(ind0, arrangement, in0, 2); 7891 7892 __ ushr(ind1, arrangement, in1, 2); 7893 __ shl(in0, arrangement, in0, 6); 7894 __ orr(ind1, arrangement, ind1, in0); 7895 __ ushr(ind1, arrangement, ind1, 2); 7896 7897 __ ushr(ind2, arrangement, in2, 4); 7898 __ shl(in1, arrangement, in1, 4); 7899 __ orr(ind2, arrangement, in1, ind2); 7900 __ ushr(ind2, arrangement, ind2, 2); 7901 7902 __ shl(ind3, arrangement, in2, 2); 7903 __ ushr(ind3, arrangement, ind3, 2); 7904 7905 __ tbl(out0, arrangement, codec, 4, ind0); 7906 __ tbl(out1, arrangement, codec, 4, ind1); 7907 __ tbl(out2, arrangement, codec, 4, ind2); 7908 __ tbl(out3, arrangement, codec, 4, ind3); 7909 7910 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 7911 } 7912 7913 /** 7914 * Arguments: 7915 * 7916 * Input: 7917 * c_rarg0 - src_start 7918 * c_rarg1 - src_offset 7919 * c_rarg2 - src_length 7920 * c_rarg3 - dest_start 7921 * c_rarg4 - dest_offset 7922 * c_rarg5 - isURL 7923 * 7924 */ 7925 address generate_base64_encodeBlock() { 7926 7927 static const char toBase64[64] = { 7928 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 7929 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 7930 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 7931 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 7932 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 7933 }; 7934 7935 static const char toBase64URL[64] = { 7936 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 7937 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 7938 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 7939 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 7940 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 7941 }; 7942 7943 __ align(CodeEntryAlignment); 7944 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 7945 StubCodeMark mark(this, stub_id); 7946 address start = __ pc(); 7947 7948 Register src = c_rarg0; // source array 7949 Register soff = c_rarg1; // source start offset 7950 Register send = c_rarg2; // source end offset 7951 Register dst = c_rarg3; // dest array 7952 Register doff = c_rarg4; // position for writing to dest array 7953 Register isURL = c_rarg5; // Base64 or URL character set 7954 7955 // c_rarg6 and c_rarg7 are free to use as temps 7956 Register codec = c_rarg6; 7957 Register length = c_rarg7; 7958 7959 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 7960 7961 __ add(src, src, soff); 7962 __ add(dst, dst, doff); 7963 __ sub(length, send, soff); 7964 7965 // load the codec base address 7966 __ lea(codec, ExternalAddress((address) toBase64)); 7967 __ cbz(isURL, ProcessData); 7968 __ lea(codec, ExternalAddress((address) toBase64URL)); 7969 7970 __ BIND(ProcessData); 7971 7972 // too short to formup a SIMD loop, roll back 7973 __ cmp(length, (u1)24); 7974 __ br(Assembler::LT, Process3B); 7975 7976 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 7977 7978 __ BIND(Process48B); 7979 __ cmp(length, (u1)48); 7980 __ br(Assembler::LT, Process24B); 7981 generate_base64_encode_simdround(src, dst, v0, 16); 7982 __ sub(length, length, 48); 7983 __ b(Process48B); 7984 7985 __ BIND(Process24B); 7986 __ cmp(length, (u1)24); 7987 __ br(Assembler::LT, SIMDExit); 7988 generate_base64_encode_simdround(src, dst, v0, 8); 7989 __ sub(length, length, 24); 7990 7991 __ BIND(SIMDExit); 7992 
__ cbz(length, Exit); 7993 7994 __ BIND(Process3B); 7995 // 3 src bytes, 24 bits 7996 __ ldrb(r10, __ post(src, 1)); 7997 __ ldrb(r11, __ post(src, 1)); 7998 __ ldrb(r12, __ post(src, 1)); 7999 __ orrw(r11, r11, r10, Assembler::LSL, 8); 8000 __ orrw(r12, r12, r11, Assembler::LSL, 8); 8001 // codec index 8002 __ ubfmw(r15, r12, 18, 23); 8003 __ ubfmw(r14, r12, 12, 17); 8004 __ ubfmw(r13, r12, 6, 11); 8005 __ andw(r12, r12, 63); 8006 // get the code based on the codec 8007 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 8008 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 8009 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 8010 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 8011 __ strb(r15, __ post(dst, 1)); 8012 __ strb(r14, __ post(dst, 1)); 8013 __ strb(r13, __ post(dst, 1)); 8014 __ strb(r12, __ post(dst, 1)); 8015 __ sub(length, length, 3); 8016 __ cbnz(length, Process3B); 8017 8018 __ BIND(Exit); 8019 __ ret(lr); 8020 8021 return start; 8022 } 8023 8024 void generate_base64_decode_simdround(Register src, Register dst, 8025 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 8026 8027 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 8028 FloatRegister out0 = v20, out1 = v21, out2 = v22; 8029 8030 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 8031 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 8032 8033 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 8034 8035 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 8036 8037 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 8038 8039 // we need unsigned saturating subtract, to make sure all input values 8040 // in range [0, 63] will have 0U value in the higher half lookup 8041 __ uqsubv(decH0, __ T16B, in0, v27); 8042 __ uqsubv(decH1, __ T16B, in1, v27); 8043 __ uqsubv(decH2, __ T16B, in2, v27); 8044 __ uqsubv(decH3, __ T16B, in3, v27); 8045 8046 // lower half lookup 8047 __ tbl(decL0, arrangement, codecL, 4, in0); 8048 __ tbl(decL1, arrangement, codecL, 4, in1); 8049 __ tbl(decL2, arrangement, codecL, 4, in2); 8050 __ tbl(decL3, arrangement, codecL, 4, in3); 8051 8052 // higher half lookup 8053 __ tbx(decH0, arrangement, codecH, 4, decH0); 8054 __ tbx(decH1, arrangement, codecH, 4, decH1); 8055 __ tbx(decH2, arrangement, codecH, 4, decH2); 8056 __ tbx(decH3, arrangement, codecH, 4, decH3); 8057 8058 // combine lower and higher 8059 __ orr(decL0, arrangement, decL0, decH0); 8060 __ orr(decL1, arrangement, decL1, decH1); 8061 __ orr(decL2, arrangement, decL2, decH2); 8062 __ orr(decL3, arrangement, decL3, decH3); 8063 8064 // check illegal inputs, value larger than 63 (maximum of 6 bits) 8065 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 8066 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 8067 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 8068 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 8069 __ orr(in0, arrangement, decH0, decH1); 8070 __ orr(in1, arrangement, decH2, decH3); 8071 __ orr(in2, arrangement, in0, in1); 8072 __ umaxv(in3, arrangement, in2); 8073 __ umov(rscratch2, in3, __ B, 0); 8074 8075 // get the data to output 8076 __ shl(out0, arrangement, decL0, 2); 8077 __ ushr(out1, arrangement, decL1, 4); 8078 __ orr(out0, arrangement, out0, out1); 8079 __ shl(out1, arrangement, decL1, 4); 8080 __ ushr(out2, arrangement, decL2, 2); 8081 __ orr(out1, arrangement, out1, out2); 8082 __ shl(out2, arrangement, decL2, 6); 8083 __ orr(out2, arrangement, out2, decL3); 8084 8085 __ 
cbz(rscratch2, NoIllegalData); 8086 8087 // handle illegal input 8088 __ umov(r10, in2, __ D, 0); 8089 if (size == 16) { 8090 __ cbnz(r10, ErrorInLowerHalf); 8091 8092 // illegal input is in higher half, store the lower half now. 8093 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 8094 8095 __ umov(r10, in2, __ D, 1); 8096 __ umov(r11, out0, __ D, 1); 8097 __ umov(r12, out1, __ D, 1); 8098 __ umov(r13, out2, __ D, 1); 8099 __ b(StoreLegalData); 8100 8101 __ BIND(ErrorInLowerHalf); 8102 } 8103 __ umov(r11, out0, __ D, 0); 8104 __ umov(r12, out1, __ D, 0); 8105 __ umov(r13, out2, __ D, 0); 8106 8107 __ BIND(StoreLegalData); 8108 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 8109 __ strb(r11, __ post(dst, 1)); 8110 __ strb(r12, __ post(dst, 1)); 8111 __ strb(r13, __ post(dst, 1)); 8112 __ lsr(r10, r10, 8); 8113 __ lsr(r11, r11, 8); 8114 __ lsr(r12, r12, 8); 8115 __ lsr(r13, r13, 8); 8116 __ b(StoreLegalData); 8117 8118 __ BIND(NoIllegalData); 8119 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 8120 } 8121 8122 8123 /** 8124 * Arguments: 8125 * 8126 * Input: 8127 * c_rarg0 - src_start 8128 * c_rarg1 - src_offset 8129 * c_rarg2 - src_length 8130 * c_rarg3 - dest_start 8131 * c_rarg4 - dest_offset 8132 * c_rarg5 - isURL 8133 * c_rarg6 - isMIME 8134 * 8135 */ 8136 address generate_base64_decodeBlock() { 8137 8138 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 8139 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 8140 // titled "Base64 decoding". 8141 8142 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 8143 // except the trailing character '=' is also treated illegal value in this intrinsic. That 8144 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
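// For exposition (not part of the generated code): indexing one of these
// tables with an ASCII code yields the 6-bit value of that symbol, and 255u
// marks anything that is not a legal Base64 character. Four decoded values
// b0..b3 are packed into three output bytes in the usual way:
//   out[0] = (b0 << 2) | (b1 >> 4);
//   out[1] = (b1 << 4) | (b2 >> 2);
//   out[2] = (b2 << 6) | b3;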
8145 static const uint8_t fromBase64ForNoSIMD[256] = { 8146 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8147 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8148 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 8149 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8150 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 8151 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 8152 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 8153 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 8154 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8155 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8156 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8157 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8158 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8159 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8160 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8161 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8162 }; 8163 8164 static const uint8_t fromBase64URLForNoSIMD[256] = { 8165 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8166 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8167 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 8168 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8169 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 8170 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 8171 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 8172 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 8173 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8174 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8175 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8176 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8177 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8178 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8179 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8180 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8181 }; 8182 8183 // A legal value of base64 code is in range [0, 127]. We need two lookups 8184 // with tbl/tbx and combine them to get the decode data. The 1st table vector 8185 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 8186 // table vector lookup use tbx, out of range indices are unchanged in 8187 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 8188 // The value of index 64 is set to 0, so that we know that we already get the 8189 // decoded data with the 1st lookup. 8190 static const uint8_t fromBase64ForSIMD[128] = { 8191 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8192 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8193 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 8194 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8195 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 8196 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 8197 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 8198 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 8199 }; 8200 8201 static const uint8_t fromBase64URLForSIMD[128] = { 8202 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8203 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8204 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 8205 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8206 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 8207 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 8208 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 8209 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 8210 }; 8211 8212 __ align(CodeEntryAlignment); 8213 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 8214 StubCodeMark mark(this, stub_id); 8215 address start = __ pc(); 8216 8217 Register src = c_rarg0; // source array 8218 Register soff = c_rarg1; // source start offset 8219 Register send = c_rarg2; // source end offset 8220 Register dst = c_rarg3; // dest array 8221 Register doff = c_rarg4; // position for writing to dest array 8222 Register isURL = c_rarg5; // Base64 or URL character set 8223 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 8224 8225 Register length = send; // reuse send as length of source data to process 8226 8227 Register simd_codec = c_rarg6; 8228 Register nosimd_codec = c_rarg7; 8229 8230 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 8231 8232 __ enter(); 8233 8234 __ add(src, src, soff); 8235 __ add(dst, dst, doff); 8236 8237 __ mov(doff, dst); 8238 8239 __ sub(length, send, soff); 8240 __ bfm(length, zr, 0, 1); 8241 8242 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 8243 __ cbz(isURL, ProcessData); 8244 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 8245 8246 __ BIND(ProcessData); 8247 __ mov(rscratch1, length); 8248 __ cmp(length, (u1)144); // 144 = 80 + 64 8249 __ br(Assembler::LT, Process4B); 8250 8251 // In the MIME case, the line length cannot be more than 76 8252 // bytes (see RFC 2045). This is too short a block for SIMD 8253 // to be worthwhile, so we use non-SIMD here. 
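  // A worked sketch of the counter arithmetic (illustration only): rscratch1
  // starts at 79 and the scalar loop below subtracts 4 per iteration, so it
  // runs 79, 75, ..., 3, -1 -- twenty 4-byte groups, i.e. 80 input bytes
  // handled without SIMD. Because we only reach this point when
  // length >= 144 (= 80 + 64), at least 64 bytes are still left for the SIMD
  // loops, which is why 80 is subtracted from length further down.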
8254 __ movw(rscratch1, 79); 8255 8256 __ BIND(Process4B); 8257 __ ldrw(r14, __ post(src, 4)); 8258 __ ubfxw(r10, r14, 0, 8); 8259 __ ubfxw(r11, r14, 8, 8); 8260 __ ubfxw(r12, r14, 16, 8); 8261 __ ubfxw(r13, r14, 24, 8); 8262 // get the de-code 8263 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 8264 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 8265 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 8266 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 8267 // error detection, 255u indicates an illegal input 8268 __ orrw(r14, r10, r11); 8269 __ orrw(r15, r12, r13); 8270 __ orrw(r14, r14, r15); 8271 __ tbnz(r14, 7, Exit); 8272 // recover the data 8273 __ lslw(r14, r10, 10); 8274 __ bfiw(r14, r11, 4, 6); 8275 __ bfmw(r14, r12, 2, 5); 8276 __ rev16w(r14, r14); 8277 __ bfiw(r13, r12, 6, 2); 8278 __ strh(r14, __ post(dst, 2)); 8279 __ strb(r13, __ post(dst, 1)); 8280 // non-simd loop 8281 __ subsw(rscratch1, rscratch1, 4); 8282 __ br(Assembler::GT, Process4B); 8283 8284 // if exiting from PreProcess80B, rscratch1 == -1; 8285 // otherwise, rscratch1 == 0. 8286 __ cbzw(rscratch1, Exit); 8287 __ sub(length, length, 80); 8288 8289 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 8290 __ cbz(isURL, SIMDEnter); 8291 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 8292 8293 __ BIND(SIMDEnter); 8294 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 8295 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 8296 __ mov(rscratch1, 63); 8297 __ dup(v27, __ T16B, rscratch1); 8298 8299 __ BIND(Process64B); 8300 __ cmp(length, (u1)64); 8301 __ br(Assembler::LT, Process32B); 8302 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 8303 __ sub(length, length, 64); 8304 __ b(Process64B); 8305 8306 __ BIND(Process32B); 8307 __ cmp(length, (u1)32); 8308 __ br(Assembler::LT, SIMDExit); 8309 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 8310 __ sub(length, length, 32); 8311 __ b(Process32B); 8312 8313 __ BIND(SIMDExit); 8314 __ cbz(length, Exit); 8315 __ movw(rscratch1, length); 8316 __ b(Process4B); 8317 8318 __ BIND(Exit); 8319 __ sub(c_rarg0, dst, doff); 8320 8321 __ leave(); 8322 __ ret(lr); 8323 8324 return start; 8325 } 8326 8327 // Support for spin waits. 8328 address generate_spin_wait() { 8329 __ align(CodeEntryAlignment); 8330 StubGenStubId stub_id = StubGenStubId::spin_wait_id; 8331 StubCodeMark mark(this, stub_id); 8332 address start = __ pc(); 8333 8334 __ spin_wait(); 8335 __ ret(lr); 8336 8337 return start; 8338 } 8339 8340 void generate_lookup_secondary_supers_table_stub() { 8341 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 8342 StubCodeMark mark(this, stub_id); 8343 8344 const Register 8345 r_super_klass = r0, 8346 r_array_base = r1, 8347 r_array_length = r2, 8348 r_array_index = r3, 8349 r_sub_klass = r4, 8350 r_bitmap = rscratch2, 8351 result = r5; 8352 const FloatRegister 8353 vtemp = v0; 8354 8355 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 8356 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 8357 Label L_success; 8358 __ enter(); 8359 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 8360 r_array_base, r_array_length, r_array_index, 8361 vtemp, result, slot, 8362 /*stub_is_near*/true); 8363 __ leave(); 8364 __ ret(lr); 8365 } 8366 } 8367 8368 // Slow path implementation for UseSecondarySupersTable. 
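  // Conceptually (a rough sketch only, not the exact HotSpot logic -- see
  // MacroAssembler::lookup_secondary_supers_table_* for the real code): each
  // Klass caches a 64-bit bitmap plus a packed array of its secondary supers,
  // and the fast path generated above boils down to something like
  //
  //   slot = hash(super_klass);                        // one of SECONDARY_SUPERS_TABLE_SIZE slots
  //   if ((bitmap & ((uint64_t)1 << slot)) == 0) miss; // bit clear => definitely not a super
  //   idx = popcount(bitmap bits below slot);          // index into the packed array
  //   if (array[idx] == super_klass) hit;
  //   else take the slow path below, which keeps scanning the packed array
  //        for a colliding entry.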
8369 address generate_lookup_secondary_supers_table_slow_path_stub() { 8370 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 8371 StubCodeMark mark(this, stub_id); 8372 8373 address start = __ pc(); 8374 const Register 8375 r_super_klass = r0, // argument 8376 r_array_base = r1, // argument 8377 temp1 = r2, // temp 8378 r_array_index = r3, // argument 8379 r_bitmap = rscratch2, // argument 8380 result = r5; // argument 8381 8382 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 8383 __ ret(lr); 8384 8385 return start; 8386 } 8387 8388 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 8389 8390 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 8391 // 8392 // If LSE is in use, generate LSE versions of all the stubs. The 8393 // non-LSE versions are in atomic_aarch64.S. 8394 8395 // class AtomicStubMark records the entry point of a stub and the 8396 // stub pointer which will point to it. The stub pointer is set to 8397 // the entry point when ~AtomicStubMark() is called, which must be 8398 // after ICache::invalidate_range. This ensures safe publication of 8399 // the generated code. 8400 class AtomicStubMark { 8401 address _entry_point; 8402 aarch64_atomic_stub_t *_stub; 8403 MacroAssembler *_masm; 8404 public: 8405 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 8406 _masm = masm; 8407 __ align(32); 8408 _entry_point = __ pc(); 8409 _stub = stub; 8410 } 8411 ~AtomicStubMark() { 8412 *_stub = (aarch64_atomic_stub_t)_entry_point; 8413 } 8414 }; 8415 8416 // NB: For memory_order_conservative we need a trailing membar after 8417 // LSE atomic operations but not a leading membar. 8418 // 8419 // We don't need a leading membar because a clause in the Arm ARM 8420 // says: 8421 // 8422 // Barrier-ordered-before 8423 // 8424 // Barrier instructions order prior Memory effects before subsequent 8425 // Memory effects generated by the same Observer. A read or a write 8426 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 8427 // Observer if and only if RW1 appears in program order before RW 2 8428 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 8429 // instruction with both Acquire and Release semantics. 8430 // 8431 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 8432 // and Release semantics, therefore we don't need a leading 8433 // barrier. However, there is no corresponding Barrier-ordered-after 8434 // relationship, therefore we need a trailing membar to prevent a 8435 // later store or load from being reordered with the store in an 8436 // atomic instruction. 8437 // 8438 // This was checked by using the herd7 consistency model simulator 8439 // (http://diy.inria.fr/) with this test case: 8440 // 8441 // AArch64 LseCas 8442 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 8443 // P0 | P1; 8444 // LDR W4, [X2] | MOV W3, #0; 8445 // DMB LD | MOV W4, #1; 8446 // LDR W3, [X1] | CASAL W3, W4, [X1]; 8447 // | DMB ISH; 8448 // | STR W4, [X2]; 8449 // exists 8450 // (0:X3=0 /\ 0:X4=1) 8451 // 8452 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 8453 // with the store to x in P1. Without the DMB in P1 this may happen. 8454 // 8455 // At the time of writing we don't know of any AArch64 hardware that 8456 // reorders stores in this way, but the Reference Manual permits it. 
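  // As a rough sketch (not the exact encodings emitted below), the
  // memory_order_conservative 8-byte CAS stub therefore amounts to:
  //
  //   aarch64_atomic_cmpxchg_8_impl:
  //     mov   x3, x1          // x3 <- expected value
  //     casal x3, x2, [x0]    // LSE CAS with Acquire and Release semantics
  //     dmb   ish             // trailing barrier, as argued above
  //     mov   x0, x3          // return the previous memory value
  //     ret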
8457 8458 void gen_cas_entry(Assembler::operand_size size, 8459 atomic_memory_order order) { 8460 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 8461 exchange_val = c_rarg2; 8462 bool acquire, release; 8463 switch (order) { 8464 case memory_order_relaxed: 8465 acquire = false; 8466 release = false; 8467 break; 8468 case memory_order_release: 8469 acquire = false; 8470 release = true; 8471 break; 8472 default: 8473 acquire = true; 8474 release = true; 8475 break; 8476 } 8477 __ mov(prev, compare_val); 8478 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 8479 if (order == memory_order_conservative) { 8480 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 8481 } 8482 if (size == Assembler::xword) { 8483 __ mov(r0, prev); 8484 } else { 8485 __ movw(r0, prev); 8486 } 8487 __ ret(lr); 8488 } 8489 8490 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 8491 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 8492 // If not relaxed, then default to conservative. Relaxed is the only 8493 // case we use enough to be worth specializing. 8494 if (order == memory_order_relaxed) { 8495 __ ldadd(size, incr, prev, addr); 8496 } else { 8497 __ ldaddal(size, incr, prev, addr); 8498 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 8499 } 8500 if (size == Assembler::xword) { 8501 __ mov(r0, prev); 8502 } else { 8503 __ movw(r0, prev); 8504 } 8505 __ ret(lr); 8506 } 8507 8508 void gen_swpal_entry(Assembler::operand_size size) { 8509 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 8510 __ swpal(size, incr, prev, addr); 8511 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 8512 if (size == Assembler::xword) { 8513 __ mov(r0, prev); 8514 } else { 8515 __ movw(r0, prev); 8516 } 8517 __ ret(lr); 8518 } 8519 8520 void generate_atomic_entry_points() { 8521 if (! 
UseLSE) { 8522 return; 8523 } 8524 __ align(CodeEntryAlignment); 8525 StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id; 8526 StubCodeMark mark(this, stub_id); 8527 address first_entry = __ pc(); 8528 8529 // ADD, memory_order_conservative 8530 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 8531 gen_ldadd_entry(Assembler::word, memory_order_conservative); 8532 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 8533 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 8534 8535 // ADD, memory_order_relaxed 8536 AtomicStubMark mark_fetch_add_4_relaxed 8537 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 8538 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 8539 AtomicStubMark mark_fetch_add_8_relaxed 8540 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 8541 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 8542 8543 // XCHG, memory_order_conservative 8544 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 8545 gen_swpal_entry(Assembler::word); 8546 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 8547 gen_swpal_entry(Assembler::xword); 8548 8549 // CAS, memory_order_conservative 8550 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 8551 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 8552 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 8553 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 8554 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 8555 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 8556 8557 // CAS, memory_order_relaxed 8558 AtomicStubMark mark_cmpxchg_1_relaxed 8559 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 8560 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 8561 AtomicStubMark mark_cmpxchg_4_relaxed 8562 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 8563 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 8564 AtomicStubMark mark_cmpxchg_8_relaxed 8565 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 8566 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 8567 8568 AtomicStubMark mark_cmpxchg_4_release 8569 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 8570 gen_cas_entry(MacroAssembler::word, memory_order_release); 8571 AtomicStubMark mark_cmpxchg_8_release 8572 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 8573 gen_cas_entry(MacroAssembler::xword, memory_order_release); 8574 8575 AtomicStubMark mark_cmpxchg_4_seq_cst 8576 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 8577 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 8578 AtomicStubMark mark_cmpxchg_8_seq_cst 8579 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 8580 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 8581 8582 ICache::invalidate_range(first_entry, __ pc() - first_entry); 8583 } 8584 #endif // LINUX 8585 8586 address generate_cont_thaw(Continuation::thaw_kind kind) { 8587 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 8588 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 8589 8590 address start = __ pc(); 8591 8592 if (return_barrier) { 8593 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 8594 __ mov(sp, rscratch1); 8595 } 8596 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 8597 8598 if (return_barrier) { 8599 // preserve 
possible return value from a method returning to the return barrier 8600 __ fmovd(rscratch1, v0); 8601 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 8602 } 8603 8604 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 8605 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 8606 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 8607 8608 if (return_barrier) { 8609 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 8610 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 8611 __ fmovd(v0, rscratch1); 8612 } 8613 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 8614 8615 8616 Label thaw_success; 8617 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 8618 __ cbnz(rscratch2, thaw_success); 8619 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 8620 __ br(rscratch1); 8621 __ bind(thaw_success); 8622 8623 // make room for the thawed frames 8624 __ sub(rscratch1, sp, rscratch2); 8625 __ andr(rscratch1, rscratch1, -16); // align 8626 __ mov(sp, rscratch1); 8627 8628 if (return_barrier) { 8629 // save original return value -- again 8630 __ fmovd(rscratch1, v0); 8631 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 8632 } 8633 8634 // If we want, we can templatize thaw by kind, and have three different entries 8635 __ movw(c_rarg1, (uint32_t)kind); 8636 8637 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 8638 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 8639 8640 if (return_barrier) { 8641 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 8642 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 8643 __ fmovd(v0, rscratch1); 8644 } else { 8645 __ mov(r0, zr); // return 0 (success) from doYield 8646 } 8647 8648 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 8649 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 8650 __ mov(rfp, sp); 8651 8652 if (return_barrier_exception) { 8653 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 8654 __ authenticate_return_address(c_rarg1); 8655 __ verify_oop(r0); 8656 // save return value containing the exception oop in callee-saved R19 8657 __ mov(r19, r0); 8658 8659 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 8660 8661 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
8662 // __ reinitialize_ptrue(); 8663 8664 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 8665 8666 __ mov(r1, r0); // the exception handler 8667 __ mov(r0, r19); // restore return value containing the exception oop 8668 __ verify_oop(r0); 8669 8670 __ leave(); 8671 __ mov(r3, lr); 8672 __ br(r1); // the exception handler 8673 } else { 8674 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 8675 __ leave(); 8676 __ ret(lr); 8677 } 8678 8679 return start; 8680 } 8681 8682 address generate_cont_thaw() { 8683 if (!Continuations::enabled()) return nullptr; 8684 8685 StubGenStubId stub_id = StubGenStubId::cont_thaw_id; 8686 StubCodeMark mark(this, stub_id); 8687 address start = __ pc(); 8688 generate_cont_thaw(Continuation::thaw_top); 8689 return start; 8690 } 8691 8692 address generate_cont_returnBarrier() { 8693 if (!Continuations::enabled()) return nullptr; 8694 8695 // TODO: will probably need multiple return barriers depending on return type 8696 StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id; 8697 StubCodeMark mark(this, stub_id); 8698 address start = __ pc(); 8699 8700 generate_cont_thaw(Continuation::thaw_return_barrier); 8701 8702 return start; 8703 } 8704 8705 address generate_cont_returnBarrier_exception() { 8706 if (!Continuations::enabled()) return nullptr; 8707 8708 StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id; 8709 StubCodeMark mark(this, stub_id); 8710 address start = __ pc(); 8711 8712 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 8713 8714 return start; 8715 } 8716 8717 address generate_cont_preempt_stub() { 8718 if (!Continuations::enabled()) return nullptr; 8719 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 8720 StubCodeMark mark(this, stub_id); 8721 address start = __ pc(); 8722 8723 __ reset_last_Java_frame(true); 8724 8725 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 8726 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 8727 __ mov(sp, rscratch2); 8728 8729 Label preemption_cancelled; 8730 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 8731 __ cbnz(rscratch1, preemption_cancelled); 8732 8733 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 8734 SharedRuntime::continuation_enter_cleanup(_masm); 8735 __ leave(); 8736 __ ret(lr); 8737 8738 // We acquired the monitor after freezing the frames so call thaw to continue execution. 8739 __ bind(preemption_cancelled); 8740 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 8741 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 8742 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 8743 __ ldr(rscratch1, Address(rscratch1)); 8744 __ br(rscratch1); 8745 8746 return start; 8747 } 8748 8749 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 8750 // are represented as long[5], with BITS_PER_LIMB = 26. 8751 // Pack five 26-bit limbs into three 64-bit registers. 
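  // In C, approximately (sketch only; each jlong limb holds a 26-bit value, and
  // 5 * 26 = 130 bits are packed into 64 + 64 + 2 bits):
  //
  //   dest0 =  l[0]        | (l[1] << 26) | (l[2] << 52);   // low 64 bits
  //   dest1 = (l[2] >> 12) | (l[3] << 14) | (l[4] << 40);   // next 64 bits
  //   dest2 =  l[4] >> 24;                                  // top 2 bits (must be 0 if dest2 is not supplied)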
8752 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 8753 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 8754 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 8755 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 8756 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 8757 8758 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 8759 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 8760 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 8761 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 8762 8763 if (dest2->is_valid()) { 8764 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 8765 } else { 8766 #ifdef ASSERT 8767 Label OK; 8768 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 8769 __ br(__ EQ, OK); 8770 __ stop("high bits of Poly1305 integer should be zero"); 8771 __ should_not_reach_here(); 8772 __ bind(OK); 8773 #endif 8774 } 8775 } 8776 8777 // As above, but return only a 128-bit integer, packed into two 8778 // 64-bit registers. 8779 void pack_26(Register dest0, Register dest1, Register src) { 8780 pack_26(dest0, dest1, noreg, src); 8781 } 8782 8783 // Multiply and multiply-accumulate unsigned 64-bit registers. 8784 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 8785 __ mul(prod_lo, n, m); 8786 __ umulh(prod_hi, n, m); 8787 } 8788 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 8789 wide_mul(rscratch1, rscratch2, n, m); 8790 __ adds(sum_lo, sum_lo, rscratch1); 8791 __ adc(sum_hi, sum_hi, rscratch2); 8792 } 8793 8794 // Poly1305, RFC 7539 8795 8796 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 8797 // description of the tricks used to simplify and accelerate this 8798 // computation. 8799 8800 address generate_poly1305_processBlocks() { 8801 __ align(CodeEntryAlignment); 8802 StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id; 8803 StubCodeMark mark(this, stub_id); 8804 address start = __ pc(); 8805 Label here; 8806 __ enter(); 8807 RegSet callee_saved = RegSet::range(r19, r28); 8808 __ push(callee_saved, sp); 8809 8810 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 8811 8812 // Arguments 8813 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 8814 8815 // R_n is the 128-bit randomly-generated key, packed into two 8816 // registers. The caller passes this key to us as long[5], with 8817 // BITS_PER_LIMB = 26. 
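  // A note on the arithmetic (a sketch of the idea; the linked tutorial has the
  // full derivation): every reduction below relies on
  //
  //   2^130 == 5  (mod 2^130 - 5)
  //
  // so anything that lands at bit 130 or above can be folded back into the low
  // limbs after multiplying by 5. The precomputed RR_n = (R_n >> 2) * 5 plays
  // that role for partial products at bit 128 and above; the two low bits of
  // R_0 dropped by the shift are accounted for separately by the
  // (R_0 & 3) * S_2 term in the main loop, while Poly1305 key clamping makes
  // the two low bits of R_1 zero, so no such correction is needed for it.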
8818 const Register R_0 = *++regs, R_1 = *++regs; 8819 pack_26(R_0, R_1, r_start); 8820 8821 // RR_n is (R_n >> 2) * 5 8822 const Register RR_0 = *++regs, RR_1 = *++regs; 8823 __ lsr(RR_0, R_0, 2); 8824 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 8825 __ lsr(RR_1, R_1, 2); 8826 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 8827 8828 // U_n is the current checksum 8829 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 8830 pack_26(U_0, U_1, U_2, acc_start); 8831 8832 static constexpr int BLOCK_LENGTH = 16; 8833 Label DONE, LOOP; 8834 8835 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 8836 __ br(Assembler::LT, DONE); { 8837 __ bind(LOOP); 8838 8839 // S_n is to be the sum of U_n and the next block of data 8840 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 8841 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 8842 __ adds(S_0, U_0, S_0); 8843 __ adcs(S_1, U_1, S_1); 8844 __ adc(S_2, U_2, zr); 8845 __ add(S_2, S_2, 1); 8846 8847 const Register U_0HI = *++regs, U_1HI = *++regs; 8848 8849 // NB: this logic depends on some of the special properties of 8850 // Poly1305 keys. In particular, because we know that the top 8851 // four bits of R_0 and R_1 are zero, we can add together 8852 // partial products without any risk of needing to propagate a 8853 // carry out. 8854 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 8855 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 8856 __ andr(U_2, R_0, 3); 8857 __ mul(U_2, S_2, U_2); 8858 8859 // Recycle registers S_0, S_1, S_2 8860 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 8861 8862 // Partial reduction mod 2**130 - 5 8863 __ adds(U_1, U_0HI, U_1); 8864 __ adc(U_2, U_1HI, U_2); 8865 // Sum now in U_2:U_1:U_0. 8866 // Dead: U_0HI, U_1HI. 8867 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 8868 8869 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 8870 8871 // First, U_2:U_1:U_0 += (U_2 >> 2) 8872 __ lsr(rscratch1, U_2, 2); 8873 __ andr(U_2, U_2, (u8)3); 8874 __ adds(U_0, U_0, rscratch1); 8875 __ adcs(U_1, U_1, zr); 8876 __ adc(U_2, U_2, zr); 8877 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 8878 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 8879 __ adcs(U_1, U_1, zr); 8880 __ adc(U_2, U_2, zr); 8881 8882 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 8883 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 8884 __ br(~ Assembler::LT, LOOP); 8885 } 8886 8887 // Further reduce modulo 2^130 - 5 8888 __ lsr(rscratch1, U_2, 2); 8889 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 8890 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 8891 __ adcs(U_1, U_1, zr); 8892 __ andr(U_2, U_2, (u1)3); 8893 __ adc(U_2, U_2, zr); 8894 8895 // Unpack the sum into five 26-bit limbs and write to memory. 
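    // In C, approximately (sketch only; the inverse of pack_26 above):
    //
    //   acc[0] =  U_0        & ((1 << 26) - 1);
    //   acc[1] = (U_0 >> 26) & ((1 << 26) - 1);
    //   acc[2] = (U_0 >> 52) | ((U_1 & ((1 << 14) - 1)) << 12);
    //   acc[3] = (U_1 >> 14) & ((1 << 26) - 1);
    //   acc[4] = (U_1 >> 40) | ((U_2 & 7) << 24);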
8896 __ ubfiz(rscratch1, U_0, 0, 26); 8897 __ ubfx(rscratch2, U_0, 26, 26); 8898 __ stp(rscratch1, rscratch2, Address(acc_start)); 8899 __ ubfx(rscratch1, U_0, 52, 12); 8900 __ bfi(rscratch1, U_1, 12, 14); 8901 __ ubfx(rscratch2, U_1, 14, 26); 8902 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 8903 __ ubfx(rscratch1, U_1, 40, 24); 8904 __ bfi(rscratch1, U_2, 24, 3); 8905 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 8906 8907 __ bind(DONE); 8908 __ pop(callee_saved, sp); 8909 __ leave(); 8910 __ ret(lr); 8911 8912 return start; 8913 } 8914 8915 // exception handler for upcall stubs 8916 address generate_upcall_stub_exception_handler() { 8917 StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id; 8918 StubCodeMark mark(this, stub_id); 8919 address start = __ pc(); 8920 8921 // Native caller has no idea how to handle exceptions, 8922 // so we just crash here. Up to callee to catch exceptions. 8923 __ verify_oop(r0); 8924 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 8925 __ blr(rscratch1); 8926 __ should_not_reach_here(); 8927 8928 return start; 8929 } 8930 8931 // load Method* target of MethodHandle 8932 // j_rarg0 = jobject receiver 8933 // rmethod = result 8934 address generate_upcall_stub_load_target() { 8935 StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id; 8936 StubCodeMark mark(this, stub_id); 8937 address start = __ pc(); 8938 8939 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 8940 // Load target method from receiver 8941 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 8942 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 8943 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 8944 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 8945 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 8946 noreg, noreg); 8947 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 8948 8949 __ ret(lr); 8950 8951 return start; 8952 } 8953 8954 #undef __ 8955 #define __ masm-> 8956 8957 class MontgomeryMultiplyGenerator : public MacroAssembler { 8958 8959 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 8960 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 8961 8962 RegSet _toSave; 8963 bool _squaring; 8964 8965 public: 8966 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 8967 : MacroAssembler(as->code()), _squaring(squaring) { 8968 8969 // Register allocation 8970 8971 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 8972 Pa_base = *regs; // Argument registers 8973 if (squaring) 8974 Pb_base = Pa_base; 8975 else 8976 Pb_base = *++regs; 8977 Pn_base = *++regs; 8978 Rlen= *++regs; 8979 inv = *++regs; 8980 Pm_base = *++regs; 8981 8982 // Working registers: 8983 Ra = *++regs; // The current digit of a, b, n, and m. 8984 Rb = *++regs; 8985 Rm = *++regs; 8986 Rn = *++regs; 8987 8988 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 8989 Pb = *++regs; 8990 Pm = *++regs; 8991 Pn = *++regs; 8992 8993 t0 = *++regs; // Three registers which form a 8994 t1 = *++regs; // triple-precision accumuator. 8995 t2 = *++regs; 8996 8997 Ri = *++regs; // Inner and outer loop indexes. 
8998 Rj = *++regs; 8999 9000 Rhi_ab = *++regs; // Product registers: low and high parts 9001 Rlo_ab = *++regs; // of a*b and m*n. 9002 Rhi_mn = *++regs; 9003 Rlo_mn = *++regs; 9004 9005 // r19 and up are callee-saved. 9006 _toSave = RegSet::range(r19, *regs) + Pm_base; 9007 } 9008 9009 private: 9010 void save_regs() { 9011 push(_toSave, sp); 9012 } 9013 9014 void restore_regs() { 9015 pop(_toSave, sp); 9016 } 9017 9018 template <typename T> 9019 void unroll_2(Register count, T block) { 9020 Label loop, end, odd; 9021 tbnz(count, 0, odd); 9022 cbz(count, end); 9023 align(16); 9024 bind(loop); 9025 (this->*block)(); 9026 bind(odd); 9027 (this->*block)(); 9028 subs(count, count, 2); 9029 br(Assembler::GT, loop); 9030 bind(end); 9031 } 9032 9033 template <typename T> 9034 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 9035 Label loop, end, odd; 9036 tbnz(count, 0, odd); 9037 cbz(count, end); 9038 align(16); 9039 bind(loop); 9040 (this->*block)(d, s, tmp); 9041 bind(odd); 9042 (this->*block)(d, s, tmp); 9043 subs(count, count, 2); 9044 br(Assembler::GT, loop); 9045 bind(end); 9046 } 9047 9048 void pre1(RegisterOrConstant i) { 9049 block_comment("pre1"); 9050 // Pa = Pa_base; 9051 // Pb = Pb_base + i; 9052 // Pm = Pm_base; 9053 // Pn = Pn_base + i; 9054 // Ra = *Pa; 9055 // Rb = *Pb; 9056 // Rm = *Pm; 9057 // Rn = *Pn; 9058 ldr(Ra, Address(Pa_base)); 9059 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 9060 ldr(Rm, Address(Pm_base)); 9061 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9062 lea(Pa, Address(Pa_base)); 9063 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 9064 lea(Pm, Address(Pm_base)); 9065 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9066 9067 // Zero the m*n result. 9068 mov(Rhi_mn, zr); 9069 mov(Rlo_mn, zr); 9070 } 9071 9072 // The core multiply-accumulate step of a Montgomery 9073 // multiplication. The idea is to schedule operations as a 9074 // pipeline so that instructions with long latencies (loads and 9075 // multiplies) have time to complete before their results are 9076 // used. This most benefits in-order implementations of the 9077 // architecture but out-of-order ones also benefit. 9078 void step() { 9079 block_comment("step"); 9080 // MACC(Ra, Rb, t0, t1, t2); 9081 // Ra = *++Pa; 9082 // Rb = *--Pb; 9083 umulh(Rhi_ab, Ra, Rb); 9084 mul(Rlo_ab, Ra, Rb); 9085 ldr(Ra, pre(Pa, wordSize)); 9086 ldr(Rb, pre(Pb, -wordSize)); 9087 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 9088 // previous iteration. 
9089 // MACC(Rm, Rn, t0, t1, t2); 9090 // Rm = *++Pm; 9091 // Rn = *--Pn; 9092 umulh(Rhi_mn, Rm, Rn); 9093 mul(Rlo_mn, Rm, Rn); 9094 ldr(Rm, pre(Pm, wordSize)); 9095 ldr(Rn, pre(Pn, -wordSize)); 9096 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9097 } 9098 9099 void post1() { 9100 block_comment("post1"); 9101 9102 // MACC(Ra, Rb, t0, t1, t2); 9103 // Ra = *++Pa; 9104 // Rb = *--Pb; 9105 umulh(Rhi_ab, Ra, Rb); 9106 mul(Rlo_ab, Ra, Rb); 9107 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 9108 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9109 9110 // *Pm = Rm = t0 * inv; 9111 mul(Rm, t0, inv); 9112 str(Rm, Address(Pm)); 9113 9114 // MACC(Rm, Rn, t0, t1, t2); 9115 // t0 = t1; t1 = t2; t2 = 0; 9116 umulh(Rhi_mn, Rm, Rn); 9117 9118 #ifndef PRODUCT 9119 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 9120 { 9121 mul(Rlo_mn, Rm, Rn); 9122 add(Rlo_mn, t0, Rlo_mn); 9123 Label ok; 9124 cbz(Rlo_mn, ok); { 9125 stop("broken Montgomery multiply"); 9126 } bind(ok); 9127 } 9128 #endif 9129 // We have very carefully set things up so that 9130 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 9131 // the lower half of Rm * Rn because we know the result already: 9132 // it must be -t0. t0 + (-t0) must generate a carry iff 9133 // t0 != 0. So, rather than do a mul and an adds we just set 9134 // the carry flag iff t0 is nonzero. 9135 // 9136 // mul(Rlo_mn, Rm, Rn); 9137 // adds(zr, t0, Rlo_mn); 9138 subs(zr, t0, 1); // Set carry iff t0 is nonzero 9139 adcs(t0, t1, Rhi_mn); 9140 adc(t1, t2, zr); 9141 mov(t2, zr); 9142 } 9143 9144 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 9145 block_comment("pre2"); 9146 // Pa = Pa_base + i-len; 9147 // Pb = Pb_base + len; 9148 // Pm = Pm_base + i-len; 9149 // Pn = Pn_base + len; 9150 9151 if (i.is_register()) { 9152 sub(Rj, i.as_register(), len); 9153 } else { 9154 mov(Rj, i.as_constant()); 9155 sub(Rj, Rj, len); 9156 } 9157 // Rj == i-len 9158 9159 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 9160 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 9161 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 9162 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 9163 9164 // Ra = *++Pa; 9165 // Rb = *--Pb; 9166 // Rm = *++Pm; 9167 // Rn = *--Pn; 9168 ldr(Ra, pre(Pa, wordSize)); 9169 ldr(Rb, pre(Pb, -wordSize)); 9170 ldr(Rm, pre(Pm, wordSize)); 9171 ldr(Rn, pre(Pn, -wordSize)); 9172 9173 mov(Rhi_mn, zr); 9174 mov(Rlo_mn, zr); 9175 } 9176 9177 void post2(RegisterOrConstant i, RegisterOrConstant len) { 9178 block_comment("post2"); 9179 if (i.is_constant()) { 9180 mov(Rj, i.as_constant()-len.as_constant()); 9181 } else { 9182 sub(Rj, i.as_register(), len); 9183 } 9184 9185 adds(t0, t0, Rlo_mn); // The pending m*n, low part 9186 9187 // As soon as we know the least significant digit of our result, 9188 // store it. 9189 // Pm_base[i-len] = t0; 9190 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 9191 9192 // t0 = t1; t1 = t2; t2 = 0; 9193 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 9194 adc(t1, t2, zr); 9195 mov(t2, zr); 9196 } 9197 9198 // A carry in t0 after Montgomery multiplication means that we 9199 // should subtract multiples of n from our result in m. We'll 9200 // keep doing that until there is no carry. 
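  // In C, approximately (a sketch of the sub() referenced in normalize() below,
  // with the borrow propagation written out explicitly):
  //
  //   static julong sub(julong Pm_base[], julong Pn_base[], julong t0, int len) {
  //     julong borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       julong m = Pm_base[i], n = Pn_base[i];
  //       Pm_base[i] = m - n - borrow;                // Pm -= Pn, word by word
  //       borrow = (m < n) || (m - n < borrow);       // borrow out of this word?
  //     }
  //     return t0 - borrow;                           // fold the final borrow into t0
  //   }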
9201 void normalize(RegisterOrConstant len) { 9202 block_comment("normalize"); 9203 // while (t0) 9204 // t0 = sub(Pm_base, Pn_base, t0, len); 9205 Label loop, post, again; 9206 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 9207 cbz(t0, post); { 9208 bind(again); { 9209 mov(i, zr); 9210 mov(cnt, len); 9211 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 9212 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9213 subs(zr, zr, zr); // set carry flag, i.e. no borrow 9214 align(16); 9215 bind(loop); { 9216 sbcs(Rm, Rm, Rn); 9217 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 9218 add(i, i, 1); 9219 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 9220 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9221 sub(cnt, cnt, 1); 9222 } cbnz(cnt, loop); 9223 sbc(t0, t0, zr); 9224 } cbnz(t0, again); 9225 } bind(post); 9226 } 9227 9228 // Move memory at s to d, reversing words. 9229 // Increments d to end of copied memory 9230 // Destroys tmp1, tmp2 9231 // Preserves len 9232 // Leaves s pointing to the address which was in d at start 9233 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 9234 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 9235 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 9236 9237 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 9238 mov(tmp1, len); 9239 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 9240 sub(s, d, len, ext::uxtw, LogBytesPerWord); 9241 } 9242 // where 9243 void reverse1(Register d, Register s, Register tmp) { 9244 ldr(tmp, pre(s, -wordSize)); 9245 ror(tmp, tmp, 32); 9246 str(tmp, post(d, wordSize)); 9247 } 9248 9249 void step_squaring() { 9250 // An extra ACC 9251 step(); 9252 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9253 } 9254 9255 void last_squaring(RegisterOrConstant i) { 9256 Label dont; 9257 // if ((i & 1) == 0) { 9258 tbnz(i.as_register(), 0, dont); { 9259 // MACC(Ra, Rb, t0, t1, t2); 9260 // Ra = *++Pa; 9261 // Rb = *--Pb; 9262 umulh(Rhi_ab, Ra, Rb); 9263 mul(Rlo_ab, Ra, Rb); 9264 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9265 } bind(dont); 9266 } 9267 9268 void extra_step_squaring() { 9269 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 9270 9271 // MACC(Rm, Rn, t0, t1, t2); 9272 // Rm = *++Pm; 9273 // Rn = *--Pn; 9274 umulh(Rhi_mn, Rm, Rn); 9275 mul(Rlo_mn, Rm, Rn); 9276 ldr(Rm, pre(Pm, wordSize)); 9277 ldr(Rn, pre(Pn, -wordSize)); 9278 } 9279 9280 void post1_squaring() { 9281 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 9282 9283 // *Pm = Rm = t0 * inv; 9284 mul(Rm, t0, inv); 9285 str(Rm, Address(Pm)); 9286 9287 // MACC(Rm, Rn, t0, t1, t2); 9288 // t0 = t1; t1 = t2; t2 = 0; 9289 umulh(Rhi_mn, Rm, Rn); 9290 9291 #ifndef PRODUCT 9292 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 9293 { 9294 mul(Rlo_mn, Rm, Rn); 9295 add(Rlo_mn, t0, Rlo_mn); 9296 Label ok; 9297 cbz(Rlo_mn, ok); { 9298 stop("broken Montgomery multiply"); 9299 } bind(ok); 9300 } 9301 #endif 9302 // We have very carefully set things up so that 9303 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 9304 // the lower half of Rm * Rn because we know the result already: 9305 // it must be -t0. t0 + (-t0) must generate a carry iff 9306 // t0 != 0. So, rather than do a mul and an adds we just set 9307 // the carry flag iff t0 is nonzero. 
9308 // 9309 // mul(Rlo_mn, Rm, Rn); 9310 // adds(zr, t0, Rlo_mn); 9311 subs(zr, t0, 1); // Set carry iff t0 is nonzero 9312 adcs(t0, t1, Rhi_mn); 9313 adc(t1, t2, zr); 9314 mov(t2, zr); 9315 } 9316 9317 void acc(Register Rhi, Register Rlo, 9318 Register t0, Register t1, Register t2) { 9319 adds(t0, t0, Rlo); 9320 adcs(t1, t1, Rhi); 9321 adc(t2, t2, zr); 9322 } 9323 9324 public: 9325 /** 9326 * Fast Montgomery multiplication. The derivation of the 9327 * algorithm is in A Cryptographic Library for the Motorola 9328 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 9329 * 9330 * Arguments: 9331 * 9332 * Inputs for multiplication: 9333 * c_rarg0 - int array elements a 9334 * c_rarg1 - int array elements b 9335 * c_rarg2 - int array elements n (the modulus) 9336 * c_rarg3 - int length 9337 * c_rarg4 - int inv 9338 * c_rarg5 - int array elements m (the result) 9339 * 9340 * Inputs for squaring: 9341 * c_rarg0 - int array elements a 9342 * c_rarg1 - int array elements n (the modulus) 9343 * c_rarg2 - int length 9344 * c_rarg3 - int inv 9345 * c_rarg4 - int array elements m (the result) 9346 * 9347 */ 9348 address generate_multiply() { 9349 Label argh, nothing; 9350 bind(argh); 9351 stop("MontgomeryMultiply total_allocation must be <= 8192"); 9352 9353 align(CodeEntryAlignment); 9354 address entry = pc(); 9355 9356 cbzw(Rlen, nothing); 9357 9358 enter(); 9359 9360 // Make room. 9361 cmpw(Rlen, 512); 9362 br(Assembler::HI, argh); 9363 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 9364 andr(sp, Ra, -2 * wordSize); 9365 9366 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 9367 9368 { 9369 // Copy input args, reversing as we go. We use Ra as a 9370 // temporary variable. 9371 reverse(Ra, Pa_base, Rlen, t0, t1); 9372 if (!_squaring) 9373 reverse(Ra, Pb_base, Rlen, t0, t1); 9374 reverse(Ra, Pn_base, Rlen, t0, t1); 9375 } 9376 9377 // Push all call-saved registers and also Pm_base which we'll need 9378 // at the end. 
9379 save_regs(); 9380 9381 #ifndef PRODUCT 9382 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 9383 { 9384 ldr(Rn, Address(Pn_base, 0)); 9385 mul(Rlo_mn, Rn, inv); 9386 subs(zr, Rlo_mn, -1); 9387 Label ok; 9388 br(EQ, ok); { 9389 stop("broken inverse in Montgomery multiply"); 9390 } bind(ok); 9391 } 9392 #endif 9393 9394 mov(Pm_base, Ra); 9395 9396 mov(t0, zr); 9397 mov(t1, zr); 9398 mov(t2, zr); 9399 9400 block_comment("for (int i = 0; i < len; i++) {"); 9401 mov(Ri, zr); { 9402 Label loop, end; 9403 cmpw(Ri, Rlen); 9404 br(Assembler::GE, end); 9405 9406 bind(loop); 9407 pre1(Ri); 9408 9409 block_comment(" for (j = i; j; j--) {"); { 9410 movw(Rj, Ri); 9411 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 9412 } block_comment(" } // j"); 9413 9414 post1(); 9415 addw(Ri, Ri, 1); 9416 cmpw(Ri, Rlen); 9417 br(Assembler::LT, loop); 9418 bind(end); 9419 block_comment("} // i"); 9420 } 9421 9422 block_comment("for (int i = len; i < 2*len; i++) {"); 9423 mov(Ri, Rlen); { 9424 Label loop, end; 9425 cmpw(Ri, Rlen, Assembler::LSL, 1); 9426 br(Assembler::GE, end); 9427 9428 bind(loop); 9429 pre2(Ri, Rlen); 9430 9431 block_comment(" for (j = len*2-i-1; j; j--) {"); { 9432 lslw(Rj, Rlen, 1); 9433 subw(Rj, Rj, Ri); 9434 subw(Rj, Rj, 1); 9435 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 9436 } block_comment(" } // j"); 9437 9438 post2(Ri, Rlen); 9439 addw(Ri, Ri, 1); 9440 cmpw(Ri, Rlen, Assembler::LSL, 1); 9441 br(Assembler::LT, loop); 9442 bind(end); 9443 } 9444 block_comment("} // i"); 9445 9446 normalize(Rlen); 9447 9448 mov(Ra, Pm_base); // Save Pm_base in Ra 9449 restore_regs(); // Restore caller's Pm_base 9450 9451 // Copy our result into caller's Pm_base 9452 reverse(Pm_base, Ra, Rlen, t0, t1); 9453 9454 leave(); 9455 bind(nothing); 9456 ret(lr); 9457 9458 return entry; 9459 } 9460 // In C, approximately: 9461 9462 // void 9463 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 9464 // julong Pn_base[], julong Pm_base[], 9465 // julong inv, int len) { 9466 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 9467 // julong *Pa, *Pb, *Pn, *Pm; 9468 // julong Ra, Rb, Rn, Rm; 9469 9470 // int i; 9471 9472 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 9473 9474 // for (i = 0; i < len; i++) { 9475 // int j; 9476 9477 // Pa = Pa_base; 9478 // Pb = Pb_base + i; 9479 // Pm = Pm_base; 9480 // Pn = Pn_base + i; 9481 9482 // Ra = *Pa; 9483 // Rb = *Pb; 9484 // Rm = *Pm; 9485 // Rn = *Pn; 9486 9487 // int iters = i; 9488 // for (j = 0; iters--; j++) { 9489 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 9490 // MACC(Ra, Rb, t0, t1, t2); 9491 // Ra = *++Pa; 9492 // Rb = *--Pb; 9493 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9494 // MACC(Rm, Rn, t0, t1, t2); 9495 // Rm = *++Pm; 9496 // Rn = *--Pn; 9497 // } 9498 9499 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 9500 // MACC(Ra, Rb, t0, t1, t2); 9501 // *Pm = Rm = t0 * inv; 9502 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 9503 // MACC(Rm, Rn, t0, t1, t2); 9504 9505 // assert(t0 == 0, "broken Montgomery multiply"); 9506 9507 // t0 = t1; t1 = t2; t2 = 0; 9508 // } 9509 9510 // for (i = len; i < 2*len; i++) { 9511 // int j; 9512 9513 // Pa = Pa_base + i-len; 9514 // Pb = Pb_base + len; 9515 // Pm = Pm_base + i-len; 9516 // Pn = Pn_base + len; 9517 9518 // Ra = *++Pa; 9519 // Rb = *--Pb; 9520 // Rm = *++Pm; 9521 // Rn = *--Pn; 9522 9523 // int iters = len*2-i-1; 9524 // for (j = i-len+1; iters--; j++) { 9525 // 
assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 9526 // MACC(Ra, Rb, t0, t1, t2); 9527 // Ra = *++Pa; 9528 // Rb = *--Pb; 9529 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9530 // MACC(Rm, Rn, t0, t1, t2); 9531 // Rm = *++Pm; 9532 // Rn = *--Pn; 9533 // } 9534 9535 // Pm_base[i-len] = t0; 9536 // t0 = t1; t1 = t2; t2 = 0; 9537 // } 9538 9539 // while (t0) 9540 // t0 = sub(Pm_base, Pn_base, t0, len); 9541 // } 9542 9543 /** 9544 * Fast Montgomery squaring. This uses asymptotically 25% fewer 9545 * multiplies than Montgomery multiplication so it should be up to 9546 * 25% faster. However, its loop control is more complex and it 9547 * may actually run slower on some machines. 9548 * 9549 * Arguments: 9550 * 9551 * Inputs: 9552 * c_rarg0 - int array elements a 9553 * c_rarg1 - int array elements n (the modulus) 9554 * c_rarg2 - int length 9555 * c_rarg3 - int inv 9556 * c_rarg4 - int array elements m (the result) 9557 * 9558 */ 9559 address generate_square() { 9560 Label argh; 9561 bind(argh); 9562 stop("MontgomeryMultiply total_allocation must be <= 8192"); 9563 9564 align(CodeEntryAlignment); 9565 address entry = pc(); 9566 9567 enter(); 9568 9569 // Make room. 9570 cmpw(Rlen, 512); 9571 br(Assembler::HI, argh); 9572 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 9573 andr(sp, Ra, -2 * wordSize); 9574 9575 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 9576 9577 { 9578 // Copy input args, reversing as we go. We use Ra as a 9579 // temporary variable. 9580 reverse(Ra, Pa_base, Rlen, t0, t1); 9581 reverse(Ra, Pn_base, Rlen, t0, t1); 9582 } 9583 9584 // Push all call-saved registers and also Pm_base which we'll need 9585 // at the end. 9586 save_regs(); 9587 9588 mov(Pm_base, Ra); 9589 9590 mov(t0, zr); 9591 mov(t1, zr); 9592 mov(t2, zr); 9593 9594 block_comment("for (int i = 0; i < len; i++) {"); 9595 mov(Ri, zr); { 9596 Label loop, end; 9597 bind(loop); 9598 cmp(Ri, Rlen); 9599 br(Assembler::GE, end); 9600 9601 pre1(Ri); 9602 9603 block_comment("for (j = (i+1)/2; j; j--) {"); { 9604 add(Rj, Ri, 1); 9605 lsr(Rj, Rj, 1); 9606 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 9607 } block_comment(" } // j"); 9608 9609 last_squaring(Ri); 9610 9611 block_comment(" for (j = i/2; j; j--) {"); { 9612 lsr(Rj, Ri, 1); 9613 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 9614 } block_comment(" } // j"); 9615 9616 post1_squaring(); 9617 add(Ri, Ri, 1); 9618 cmp(Ri, Rlen); 9619 br(Assembler::LT, loop); 9620 9621 bind(end); 9622 block_comment("} // i"); 9623 } 9624 9625 block_comment("for (int i = len; i < 2*len; i++) {"); 9626 mov(Ri, Rlen); { 9627 Label loop, end; 9628 bind(loop); 9629 cmp(Ri, Rlen, Assembler::LSL, 1); 9630 br(Assembler::GE, end); 9631 9632 pre2(Ri, Rlen); 9633 9634 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 9635 lsl(Rj, Rlen, 1); 9636 sub(Rj, Rj, Ri); 9637 sub(Rj, Rj, 1); 9638 lsr(Rj, Rj, 1); 9639 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 9640 } block_comment(" } // j"); 9641 9642 last_squaring(Ri); 9643 9644 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 9645 lsl(Rj, Rlen, 1); 9646 sub(Rj, Rj, Ri); 9647 lsr(Rj, Rj, 1); 9648 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 9649 } block_comment(" } // j"); 9650 9651 post2(Ri, Rlen); 9652 add(Ri, Ri, 1); 9653 cmp(Ri, Rlen, Assembler::LSL, 1); 9654 9655 br(Assembler::LT, loop); 9656 bind(end); 9657 block_comment("} // i"); 9658 } 9659 9660 normalize(Rlen); 9661 9662 mov(Ra, Pm_base); // Save Pm_base in Ra 9663 
restore_regs(); // Restore caller's Pm_base 9664 9665 // Copy our result into caller's Pm_base 9666 reverse(Pm_base, Ra, Rlen, t0, t1); 9667 9668 leave(); 9669 ret(lr); 9670 9671 return entry; 9672 } 9673 // In C, approximately: 9674 9675 // void 9676 // montgomery_square(julong Pa_base[], julong Pn_base[], 9677 // julong Pm_base[], julong inv, int len) { 9678 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 9679 // julong *Pa, *Pb, *Pn, *Pm; 9680 // julong Ra, Rb, Rn, Rm; 9681 9682 // int i; 9683 9684 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 9685 9686 // for (i = 0; i < len; i++) { 9687 // int j; 9688 9689 // Pa = Pa_base; 9690 // Pb = Pa_base + i; 9691 // Pm = Pm_base; 9692 // Pn = Pn_base + i; 9693 9694 // Ra = *Pa; 9695 // Rb = *Pb; 9696 // Rm = *Pm; 9697 // Rn = *Pn; 9698 9699 // int iters = (i+1)/2; 9700 // for (j = 0; iters--; j++) { 9701 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 9702 // MACC2(Ra, Rb, t0, t1, t2); 9703 // Ra = *++Pa; 9704 // Rb = *--Pb; 9705 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9706 // MACC(Rm, Rn, t0, t1, t2); 9707 // Rm = *++Pm; 9708 // Rn = *--Pn; 9709 // } 9710 // if ((i & 1) == 0) { 9711 // assert(Ra == Pa_base[j], "must be"); 9712 // MACC(Ra, Ra, t0, t1, t2); 9713 // } 9714 // iters = i/2; 9715 // assert(iters == i-j, "must be"); 9716 // for (; iters--; j++) { 9717 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9718 // MACC(Rm, Rn, t0, t1, t2); 9719 // Rm = *++Pm; 9720 // Rn = *--Pn; 9721 // } 9722 9723 // *Pm = Rm = t0 * inv; 9724 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 9725 // MACC(Rm, Rn, t0, t1, t2); 9726 9727 // assert(t0 == 0, "broken Montgomery multiply"); 9728 9729 // t0 = t1; t1 = t2; t2 = 0; 9730 // } 9731 9732 // for (i = len; i < 2*len; i++) { 9733 // int start = i-len+1; 9734 // int end = start + (len - start)/2; 9735 // int j; 9736 9737 // Pa = Pa_base + i-len; 9738 // Pb = Pa_base + len; 9739 // Pm = Pm_base + i-len; 9740 // Pn = Pn_base + len; 9741 9742 // Ra = *++Pa; 9743 // Rb = *--Pb; 9744 // Rm = *++Pm; 9745 // Rn = *--Pn; 9746 9747 // int iters = (2*len-i-1)/2; 9748 // assert(iters == end-start, "must be"); 9749 // for (j = start; iters--; j++) { 9750 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 9751 // MACC2(Ra, Rb, t0, t1, t2); 9752 // Ra = *++Pa; 9753 // Rb = *--Pb; 9754 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9755 // MACC(Rm, Rn, t0, t1, t2); 9756 // Rm = *++Pm; 9757 // Rn = *--Pn; 9758 // } 9759 // if ((i & 1) == 0) { 9760 // assert(Ra == Pa_base[j], "must be"); 9761 // MACC(Ra, Ra, t0, t1, t2); 9762 // } 9763 // iters = (2*len-i)/2; 9764 // assert(iters == len-j, "must be"); 9765 // for (; iters--; j++) { 9766 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9767 // MACC(Rm, Rn, t0, t1, t2); 9768 // Rm = *++Pm; 9769 // Rn = *--Pn; 9770 // } 9771 // Pm_base[i-len] = t0; 9772 // t0 = t1; t1 = t2; t2 = 0; 9773 // } 9774 9775 // while (t0) 9776 // t0 = sub(Pm_base, Pn_base, t0, len); 9777 // } 9778 }; 9779 9780 void generate_vector_math_stubs() { 9781 // Get native vector math stub routine addresses 9782 void* libsleef = nullptr; 9783 char ebuf[1024]; 9784 char dll_name[JVM_MAXPATHLEN]; 9785 if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) { 9786 libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf); 9787 } 9788 if (libsleef == nullptr) { 9789 log_info(library)("Failed to load native vector math library, %s!", ebuf); 
9790 return; 9791 } 9792 // Method naming convention 9793 // All the methods are named as <OP><T><N>_<U><suffix> 9794 // Where: 9795 // <OP> is the operation name, e.g. sin 9796 // <T> is optional to indicate float/double 9797 // "f/d" for vector float/double operation 9798 // <N> is the number of elements in the vector 9799 // "2/4" for neon, and "x" for sve 9800 // <U> is the precision level 9801 // "u10/u05" represents 1.0/0.5 ULP error bounds 9802 // We use "u10" for all operations by default 9803 // But for those functions do not have u10 support, we use "u05" instead 9804 // <suffix> indicates neon/sve 9805 // "sve/advsimd" for sve/neon implementations 9806 // e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions 9807 // cosd2_u10advsimd is the method for computing 2 elements vector double cos using NEON instructions 9808 // 9809 log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef)); 9810 9811 // Math vector stubs implemented with SVE for scalable vector size. 9812 if (UseSVE > 0) { 9813 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 9814 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 9815 // Skip "tanh" because there is performance regression 9816 if (vop == VectorSupport::VECTOR_OP_TANH) { 9817 continue; 9818 } 9819 9820 // The native library does not support u10 level of "hypot". 9821 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 9822 9823 snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf); 9824 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 9825 9826 snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf); 9827 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 9828 } 9829 } 9830 9831 // Math vector stubs implemented with NEON for 64/128 bits vector size. 9832 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 9833 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 9834 // Skip "tanh" because there is performance regression 9835 if (vop == VectorSupport::VECTOR_OP_TANH) { 9836 continue; 9837 } 9838 9839 // The native library does not support u10 level of "hypot". 9840 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 9841 9842 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 9843 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf); 9844 9845 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 9846 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 9847 9848 snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf); 9849 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 9850 } 9851 } 9852 9853 // Initialization 9854 void generate_initial_stubs() { 9855 // Generate initial stubs and initializes the entry points 9856 9857 // entry points that exist in all platforms Note: This is code 9858 // that could be shared among different platforms - however the 9859 // benefit seems to be smaller than the disadvantage of having a 9860 // much more complicated generator structure. See also comment in 9861 // stubRoutines.hpp. 
9862 
 9863     StubRoutines::_forward_exception_entry = generate_forward_exception();
 9864 
 9865     StubRoutines::_call_stub_entry =
 9866       generate_call_stub(StubRoutines::_call_stub_return_address);
 9867 
 9868     // is referenced by megamorphic call
 9869     StubRoutines::_catch_exception_entry = generate_catch_exception();
 9870 
 9871     // Initialize table for copy memory (arraycopy) check.
 9872     if (UnsafeMemoryAccess::_table == nullptr) {
 9873       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
 9874     }
 9875 
 9876     if (UseCRC32Intrinsics) {
 9877       // set table address before stub generation which uses it
 9878       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
 9879       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
 9880     }
 9881 
 9882     if (UseCRC32CIntrinsics) {
 9883       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
 9884     }
 9885 
 9886     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
 9887       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
 9888     }
 9889 
 9890     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
 9891       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
 9892     }
 9893 
 9894     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
 9895         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
 9896       StubRoutines::_hf2f = generate_float16ToFloat();
 9897       StubRoutines::_f2hf = generate_floatToFloat16();
 9898     }
 9899   }
 9900 
 9901   void generate_continuation_stubs() {
 9902     // Continuation stubs:
 9903     StubRoutines::_cont_thaw = generate_cont_thaw();
 9904     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
 9905     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
 9906     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
 9907   }
 9908 
 9909   void generate_final_stubs() {
 9910     // support for verify_oop (must happen after universe_init)
 9911     if (VerifyOops) {
 9912       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
 9913     }
 9914 
 9915     // arraycopy stubs used by compilers
 9916     generate_arraycopy_stubs();
 9917 
 9918     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
 9919 
 9920     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
 9921 
 9922     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
 9923     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
 9924 
 9925 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
 9926 
 9927     generate_atomic_entry_points();
 9928 
 9929 #endif // LINUX
 9930 
 9931 #ifdef COMPILER2
 9932     if (UseSecondarySupersTable) {
 9933       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
 9934       if (! InlineSecondarySupersTest) {
 9935         generate_lookup_secondary_supers_table_stub();
 9936       }
 9937     }
 9938 #endif
 9939 
 9940     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
 9941   }
 9942 
 9943   void generate_compiler_stubs() {
 9944 #if COMPILER2_OR_JVMCI
 9945 
 9946     if (UseSVE == 0) {
 9947       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
 9948     }
 9949 
 9950     // array equals stub for large arrays.
 9951     if (!UseSimpleArrayEquals) {
 9952       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
 9953     }
 9954 
 9955     // arrays_hashcode stub for large arrays.
9956 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN); 9957 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE); 9958 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR); 9959 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT); 9960 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT); 9961 9962 // byte_array_inflate stub for large arrays. 9963 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 9964 9965 // countPositives stub for large arrays. 9966 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long); 9967 9968 generate_compare_long_strings(); 9969 9970 generate_string_indexof_stubs(); 9971 9972 #ifdef COMPILER2 9973 if (UseMultiplyToLenIntrinsic) { 9974 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 9975 } 9976 9977 if (UseSquareToLenIntrinsic) { 9978 StubRoutines::_squareToLen = generate_squareToLen(); 9979 } 9980 9981 if (UseMulAddIntrinsic) { 9982 StubRoutines::_mulAdd = generate_mulAdd(); 9983 } 9984 9985 if (UseSIMDForBigIntegerShiftIntrinsics) { 9986 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 9987 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 9988 } 9989 9990 if (UseMontgomeryMultiplyIntrinsic) { 9991 StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id; 9992 StubCodeMark mark(this, stub_id); 9993 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 9994 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 9995 } 9996 9997 if (UseMontgomerySquareIntrinsic) { 9998 StubGenStubId stub_id = StubGenStubId::montgomerySquare_id; 9999 StubCodeMark mark(this, stub_id); 10000 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 10001 // We use generate_multiply() rather than generate_square() 10002 // because it's faster for the sizes of modulus we care about. 
10003 StubRoutines::_montgomerySquare = g.generate_multiply(); 10004 } 10005 10006 generate_vector_math_stubs(); 10007 10008 #endif // COMPILER2 10009 10010 if (UseChaCha20Intrinsics) { 10011 StubRoutines::_chacha20Block = generate_chacha20Block_qrpar(); 10012 } 10013 10014 if (UseDilithiumIntrinsics) { 10015 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt(); 10016 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt(); 10017 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult(); 10018 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant(); 10019 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly(); 10020 } 10021 10022 if (UseBASE64Intrinsics) { 10023 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 10024 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 10025 } 10026 10027 // data cache line writeback 10028 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 10029 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 10030 10031 if (UseAESIntrinsics) { 10032 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 10033 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 10034 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 10035 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 10036 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 10037 } 10038 if (UseGHASHIntrinsics) { 10039 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 10040 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 10041 } 10042 if (UseAESIntrinsics && UseGHASHIntrinsics) { 10043 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 10044 } 10045 10046 if (UseMD5Intrinsics) { 10047 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubGenStubId::md5_implCompress_id); 10048 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id); 10049 } 10050 if (UseSHA1Intrinsics) { 10051 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id); 10052 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id); 10053 } 10054 if (UseSHA256Intrinsics) { 10055 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id); 10056 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id); 10057 } 10058 if (UseSHA512Intrinsics) { 10059 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id); 10060 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id); 10061 } 10062 if (UseSHA3Intrinsics) { 10063 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id); 10064 StubRoutines::_double_keccak = generate_double_keccak(); 10065 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id); 10066 } 10067 10068 if (UsePoly1305Intrinsics) { 10069 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 10070 } 10071 10072 // generate Adler32 intrinsics code 10073 if (UseAdler32Intrinsics) { 10074 
StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 10075 } 10076 10077 #endif // COMPILER2_OR_JVMCI 10078 } 10079 10080 public: 10081 StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) { 10082 switch(blob_id) { 10083 case initial_id: 10084 generate_initial_stubs(); 10085 break; 10086 case continuation_id: 10087 generate_continuation_stubs(); 10088 break; 10089 case compiler_id: 10090 generate_compiler_stubs(); 10091 break; 10092 case final_id: 10093 generate_final_stubs(); 10094 break; 10095 default: 10096 fatal("unexpected blob id: %d", blob_id); 10097 break; 10098 }; 10099 } 10100 }; // end class declaration 10101 10102 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) { 10103 StubGenerator g(code, blob_id); 10104 } 10105 10106 10107 #if defined (LINUX) 10108 10109 // Define pointers to atomic stubs and initialize them to point to the 10110 // code in atomic_aarch64.S. 10111 10112 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 10113 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 10114 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 10115 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 10116 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 10117 10118 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 10119 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 10120 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 10121 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 10122 DEFAULT_ATOMIC_OP(xchg, 4, ) 10123 DEFAULT_ATOMIC_OP(xchg, 8, ) 10124 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 10125 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 10126 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 10127 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 10128 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 10129 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 10130 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 10131 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 10132 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 10133 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 10134 10135 #undef DEFAULT_ATOMIC_OP 10136 10137 #endif // LINUX
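// For illustration only: each DEFAULT_ATOMIC_OP expansion above declares a
// default implementation (provided by atomic_aarch64.S) and a function pointer
// that generate_atomic_entry_points() can later repoint at a generated stub.
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) expands to roughly:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_relaxed_impl
//     = aarch64_atomic_fetch_add_4_relaxed_default_impl;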