1 /* 2 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "asm/register.hpp" 30 #include "atomic_aarch64.hpp" 31 #include "compiler/oopMap.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/barrierSetAssembler.hpp" 34 #include "gc/shared/gc_globals.hpp" 35 #include "gc/shared/tlab_globals.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "memory/universe.hpp" 38 #include "nativeInst_aarch64.hpp" 39 #include "oops/instanceOop.hpp" 40 #include "oops/method.hpp" 41 #include "oops/objArrayKlass.hpp" 42 #include "oops/oop.inline.hpp" 43 #include "prims/methodHandles.hpp" 44 #include "prims/upcallLinker.hpp" 45 #include "runtime/atomic.hpp" 46 #include "runtime/continuation.hpp" 47 #include "runtime/continuationEntry.inline.hpp" 48 #include "runtime/frame.inline.hpp" 49 #include "runtime/handles.inline.hpp" 50 #include "runtime/javaThread.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/stubCodeGenerator.hpp" 53 #include "runtime/stubRoutines.hpp" 54 #include "utilities/align.hpp" 55 #include "utilities/checkedCast.hpp" 56 #include "utilities/globalDefinitions.hpp" 57 #include "utilities/powerOfTwo.hpp" 58 #ifdef COMPILER2 59 #include "opto/runtime.hpp" 60 #endif 61 #if INCLUDE_ZGC 62 #include "gc/z/zThreadLocalData.hpp" 63 #endif 64 65 // Declaration and definition of StubGenerator (no .hpp file). 
66 // For a more detailed description of the stub routine structure 67 // see the comment in stubRoutines.hpp 68 69 #undef __ 70 #define __ _masm-> 71 72 #ifdef PRODUCT 73 #define BLOCK_COMMENT(str) /* nothing */ 74 #else 75 #define BLOCK_COMMENT(str) __ block_comment(str) 76 #endif 77 78 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 79 80 // Stub Code definitions 81 82 class StubGenerator: public StubCodeGenerator { 83 private: 84 85 #ifdef PRODUCT 86 #define inc_counter_np(counter) ((void)0) 87 #else 88 void inc_counter_np_(uint& counter) { 89 __ incrementw(ExternalAddress((address)&counter)); 90 } 91 #define inc_counter_np(counter) \ 92 BLOCK_COMMENT("inc_counter " #counter); \ 93 inc_counter_np_(counter); 94 #endif 95 96 // Call stubs are used to call Java from C 97 // 98 // Arguments: 99 // c_rarg0: call wrapper address address 100 // c_rarg1: result address 101 // c_rarg2: result type BasicType 102 // c_rarg3: method Method* 103 // c_rarg4: (interpreter) entry point address 104 // c_rarg5: parameters intptr_t* 105 // c_rarg6: parameter size (in words) int 106 // c_rarg7: thread Thread* 107 // 108 // There is no return from the stub itself as any Java result 109 // is written to result 110 // 111 // we save r30 (lr) as the return PC at the base of the frame and 112 // link r29 (fp) below it as the frame pointer installing sp (r31) 113 // into fp. 114 // 115 // we save r0-r7, which accounts for all the c arguments. 116 // 117 // TODO: strictly do we need to save them all? they are treated as 118 // volatile by C so could we omit saving the ones we are going to 119 // place in global registers (thread? method?) or those we only use 120 // during setup of the Java call? 121 // 122 // we don't need to save r8 which C uses as an indirect result location 123 // return register. 124 // 125 // we don't need to save r9-r15 which both C and Java treat as 126 // volatile 127 // 128 // we don't need to save r16-18 because Java does not use them 129 // 130 // we save r19-r28 which Java uses as scratch registers and C 131 // expects to be callee-save 132 // 133 // we save the bottom 64 bits of each value stored in v8-v15; it is 134 // the responsibility of the caller to preserve larger values. 135 // 136 // so the stub frame looks like this when we enter Java code 137 // 138 // [ return_from_Java ] <--- sp 139 // [ argument word n ] 140 // ... 
141 // -29 [ argument word 1 ] 142 // -28 [ saved Floating-point Control Register ] 143 // -26 [ saved v15 ] <--- sp_after_call 144 // -25 [ saved v14 ] 145 // -24 [ saved v13 ] 146 // -23 [ saved v12 ] 147 // -22 [ saved v11 ] 148 // -21 [ saved v10 ] 149 // -20 [ saved v9 ] 150 // -19 [ saved v8 ] 151 // -18 [ saved r28 ] 152 // -17 [ saved r27 ] 153 // -16 [ saved r26 ] 154 // -15 [ saved r25 ] 155 // -14 [ saved r24 ] 156 // -13 [ saved r23 ] 157 // -12 [ saved r22 ] 158 // -11 [ saved r21 ] 159 // -10 [ saved r20 ] 160 // -9 [ saved r19 ] 161 // -8 [ call wrapper (r0) ] 162 // -7 [ result (r1) ] 163 // -6 [ result type (r2) ] 164 // -5 [ method (r3) ] 165 // -4 [ entry point (r4) ] 166 // -3 [ parameters (r5) ] 167 // -2 [ parameter size (r6) ] 168 // -1 [ thread (r7) ] 169 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 170 // 1 [ saved lr (r30) ] 171 172 // Call stub stack layout word offsets from fp 173 enum call_stub_layout { 174 sp_after_call_off = -28, 175 176 fpcr_off = sp_after_call_off, 177 d15_off = -26, 178 d13_off = -24, 179 d11_off = -22, 180 d9_off = -20, 181 182 r28_off = -18, 183 r26_off = -16, 184 r24_off = -14, 185 r22_off = -12, 186 r20_off = -10, 187 call_wrapper_off = -8, 188 result_off = -7, 189 result_type_off = -6, 190 method_off = -5, 191 entry_point_off = -4, 192 parameter_size_off = -2, 193 thread_off = -1, 194 fp_f = 0, 195 retaddr_off = 1, 196 }; 197 198 address generate_call_stub(address& return_address) { 199 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 200 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 201 "adjust this code"); 202 203 StubCodeMark mark(this, "StubRoutines", "call_stub"); 204 address start = __ pc(); 205 206 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 207 208 const Address fpcr_save (rfp, fpcr_off * wordSize); 209 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 210 const Address result (rfp, result_off * wordSize); 211 const Address result_type (rfp, result_type_off * wordSize); 212 const Address method (rfp, method_off * wordSize); 213 const Address entry_point (rfp, entry_point_off * wordSize); 214 const Address parameter_size(rfp, parameter_size_off * wordSize); 215 216 const Address thread (rfp, thread_off * wordSize); 217 218 const Address d15_save (rfp, d15_off * wordSize); 219 const Address d13_save (rfp, d13_off * wordSize); 220 const Address d11_save (rfp, d11_off * wordSize); 221 const Address d9_save (rfp, d9_off * wordSize); 222 223 const Address r28_save (rfp, r28_off * wordSize); 224 const Address r26_save (rfp, r26_off * wordSize); 225 const Address r24_save (rfp, r24_off * wordSize); 226 const Address r22_save (rfp, r22_off * wordSize); 227 const Address r20_save (rfp, r20_off * wordSize); 228 229 // stub code 230 231 address aarch64_entry = __ pc(); 232 233 // set up frame and move sp to end of save area 234 __ enter(); 235 __ sub(sp, rfp, -sp_after_call_off * wordSize); 236 237 // save register parameters and Java scratch/global registers 238 // n.b. 
we save thread even though it gets installed in 239 // rthread because we want to sanity check rthread later 240 __ str(c_rarg7, thread); 241 __ strw(c_rarg6, parameter_size); 242 __ stp(c_rarg4, c_rarg5, entry_point); 243 __ stp(c_rarg2, c_rarg3, result_type); 244 __ stp(c_rarg0, c_rarg1, call_wrapper); 245 246 __ stp(r20, r19, r20_save); 247 __ stp(r22, r21, r22_save); 248 __ stp(r24, r23, r24_save); 249 __ stp(r26, r25, r26_save); 250 __ stp(r28, r27, r28_save); 251 252 __ stpd(v9, v8, d9_save); 253 __ stpd(v11, v10, d11_save); 254 __ stpd(v13, v12, d13_save); 255 __ stpd(v15, v14, d15_save); 256 257 __ get_fpcr(rscratch1); 258 __ str(rscratch1, fpcr_save); 259 // Set FPCR to the state we need. We do want Round to Nearest. We 260 // don't want non-IEEE rounding modes or floating-point traps. 261 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode 262 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12) 263 __ set_fpcr(rscratch1); 264 265 // install Java thread in global register now we have saved 266 // whatever value it held 267 __ mov(rthread, c_rarg7); 268 // And method 269 __ mov(rmethod, c_rarg3); 270 271 // set up the heapbase register 272 __ reinit_heapbase(); 273 274 #ifdef ASSERT 275 // make sure we have no pending exceptions 276 { 277 Label L; 278 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 279 __ cmp(rscratch1, (u1)NULL_WORD); 280 __ br(Assembler::EQ, L); 281 __ stop("StubRoutines::call_stub: entered with pending exception"); 282 __ BIND(L); 283 } 284 #endif 285 // pass parameters if any 286 __ mov(esp, sp); 287 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 288 __ andr(sp, rscratch1, -2 * wordSize); 289 290 BLOCK_COMMENT("pass parameters if any"); 291 Label parameters_done; 292 // parameter count is still in c_rarg6 293 // and parameter pointer identifying param 1 is in c_rarg5 294 __ cbzw(c_rarg6, parameters_done); 295 296 address loop = __ pc(); 297 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 298 __ subsw(c_rarg6, c_rarg6, 1); 299 __ push(rscratch1); 300 __ br(Assembler::GT, loop); 301 302 __ BIND(parameters_done); 303 304 // call Java entry -- passing methdoOop, and current sp 305 // rmethod: Method* 306 // r19_sender_sp: sender sp 307 BLOCK_COMMENT("call Java function"); 308 __ mov(r19_sender_sp, sp); 309 __ blr(c_rarg4); 310 311 // we do this here because the notify will already have been done 312 // if we get to the next instruction via an exception 313 // 314 // n.b. adding this instruction here affects the calculation of 315 // whether or not a routine returns to the call stub (used when 316 // doing stack walks) since the normal test is to check the return 317 // pc against the address saved below. so we may need to allow for 318 // this extra instruction in the check. 319 320 // save current address for use by exception handling code 321 322 return_address = __ pc(); 323 324 // store result depending on type (everything that is not 325 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 326 // n.b. 
this assumes Java returns an integral result in r0 327 // and a floating result in j_farg0 328 __ ldr(j_rarg2, result); 329 Label is_long, is_float, is_double, exit; 330 __ ldr(j_rarg1, result_type); 331 __ cmp(j_rarg1, (u1)T_OBJECT); 332 __ br(Assembler::EQ, is_long); 333 __ cmp(j_rarg1, (u1)T_LONG); 334 __ br(Assembler::EQ, is_long); 335 __ cmp(j_rarg1, (u1)T_FLOAT); 336 __ br(Assembler::EQ, is_float); 337 __ cmp(j_rarg1, (u1)T_DOUBLE); 338 __ br(Assembler::EQ, is_double); 339 340 // handle T_INT case 341 __ strw(r0, Address(j_rarg2)); 342 343 __ BIND(exit); 344 345 // pop parameters 346 __ sub(esp, rfp, -sp_after_call_off * wordSize); 347 348 #ifdef ASSERT 349 // verify that threads correspond 350 { 351 Label L, S; 352 __ ldr(rscratch1, thread); 353 __ cmp(rthread, rscratch1); 354 __ br(Assembler::NE, S); 355 __ get_thread(rscratch1); 356 __ cmp(rthread, rscratch1); 357 __ br(Assembler::EQ, L); 358 __ BIND(S); 359 __ stop("StubRoutines::call_stub: threads must correspond"); 360 __ BIND(L); 361 } 362 #endif 363 364 __ pop_cont_fastpath(rthread); 365 366 // restore callee-save registers 367 __ ldpd(v15, v14, d15_save); 368 __ ldpd(v13, v12, d13_save); 369 __ ldpd(v11, v10, d11_save); 370 __ ldpd(v9, v8, d9_save); 371 372 __ ldp(r28, r27, r28_save); 373 __ ldp(r26, r25, r26_save); 374 __ ldp(r24, r23, r24_save); 375 __ ldp(r22, r21, r22_save); 376 __ ldp(r20, r19, r20_save); 377 378 // restore fpcr 379 __ ldr(rscratch1, fpcr_save); 380 __ set_fpcr(rscratch1); 381 382 __ ldp(c_rarg0, c_rarg1, call_wrapper); 383 __ ldrw(c_rarg2, result_type); 384 __ ldr(c_rarg3, method); 385 __ ldp(c_rarg4, c_rarg5, entry_point); 386 __ ldp(c_rarg6, c_rarg7, parameter_size); 387 388 // leave frame and return to caller 389 __ leave(); 390 __ ret(lr); 391 392 // handle return types different from T_INT 393 394 __ BIND(is_long); 395 __ str(r0, Address(j_rarg2, 0)); 396 __ br(Assembler::AL, exit); 397 398 __ BIND(is_float); 399 __ strs(j_farg0, Address(j_rarg2, 0)); 400 __ br(Assembler::AL, exit); 401 402 __ BIND(is_double); 403 __ strd(j_farg0, Address(j_rarg2, 0)); 404 __ br(Assembler::AL, exit); 405 406 return start; 407 } 408 409 // Return point for a Java call if there's an exception thrown in 410 // Java code. The exception is caught and transformed into a 411 // pending exception stored in JavaThread that can be tested from 412 // within the VM. 413 // 414 // Note: Usually the parameters are removed by the callee. In case 415 // of an exception crossing an activation frame boundary, that is 416 // not the case if the callee is compiled code => need to setup the 417 // rsp. 
418 // 419 // r0: exception oop 420 421 address generate_catch_exception() { 422 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 423 address start = __ pc(); 424 425 // same as in generate_call_stub(): 426 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 427 const Address thread (rfp, thread_off * wordSize); 428 429 #ifdef ASSERT 430 // verify that threads correspond 431 { 432 Label L, S; 433 __ ldr(rscratch1, thread); 434 __ cmp(rthread, rscratch1); 435 __ br(Assembler::NE, S); 436 __ get_thread(rscratch1); 437 __ cmp(rthread, rscratch1); 438 __ br(Assembler::EQ, L); 439 __ bind(S); 440 __ stop("StubRoutines::catch_exception: threads must correspond"); 441 __ bind(L); 442 } 443 #endif 444 445 // set pending exception 446 __ verify_oop(r0); 447 448 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 449 __ mov(rscratch1, (address)__FILE__); 450 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 451 __ movw(rscratch1, (int)__LINE__); 452 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 453 454 // complete return to VM 455 assert(StubRoutines::_call_stub_return_address != nullptr, 456 "_call_stub_return_address must have been generated before"); 457 __ b(StubRoutines::_call_stub_return_address); 458 459 return start; 460 } 461 462 // Continuation point for runtime calls returning with a pending 463 // exception. The pending exception check happened in the runtime 464 // or native call stub. The pending exception in Thread is 465 // converted into a Java-level exception. 466 // 467 // Contract with Java-level exception handlers: 468 // r0: exception 469 // r3: throwing pc 470 // 471 // NOTE: At entry of this stub, exception-pc must be in LR !! 472 473 // NOTE: this is always used as a jump target within generated code 474 // so it just needs to be generated code with no x86 prolog 475 476 address generate_forward_exception() { 477 StubCodeMark mark(this, "StubRoutines", "forward exception"); 478 address start = __ pc(); 479 480 // Upon entry, LR points to the return address returning into 481 // Java (interpreted or compiled) code; i.e., the return address 482 // becomes the throwing pc. 483 // 484 // Arguments pushed before the runtime call are still on the stack 485 // but the exception handler will reset the stack pointer -> 486 // ignore them. A potential result in registers can be ignored as 487 // well. 488 489 #ifdef ASSERT 490 // make sure this code is only executed if there is a pending exception 491 { 492 Label L; 493 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 494 __ cbnz(rscratch1, L); 495 __ stop("StubRoutines::forward exception: no pending exception (1)"); 496 __ bind(L); 497 } 498 #endif 499 500 // compute exception handler into r19 501 502 // call the VM to find the handler address associated with the 503 // caller address. pass thread in r0 and caller pc (ret address) 504 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 505 // the stack. 506 __ mov(c_rarg1, lr); 507 // lr will be trashed by the VM call so we move it to R19 508 // (callee-saved) because we also need to pass it to the handler 509 // returned by this call. 
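    // Editorial aside: the next few instructions shuffle several registers.
    // The #if 0 sketch below models the intended dataflow in plain C++
    // (RegModel, its fields, and model_forward_exception are illustrative
    // names only, not HotSpot API): lr is parked in callee-saved r19 across
    // the VM call, then the Java-level handler is given the exception in r0
    // and the throwing pc in r3, and is entered via r19.
#if 0
    struct RegModel { unsigned long long r0, r3, r19, lr, pending_exception; };
    void model_forward_exception(RegModel& m,
                                 unsigned long long handler /* call_VM_leaf result, in r0 */) {
      m.r19 = m.lr;                 // park throwing pc across the VM call
      m.r0  = handler;              // exception_handler_for_return_address(...)
      m.lr  = m.r19;                // restore lr so stack walks see a sane pc
      m.r3  = m.r19;                // contract: r3 = throwing pc
      m.r19 = m.r0;                 // handler address; branched to at the end
      m.r0  = m.pending_exception;  // contract: r0 = exception oop
      m.pending_exception = 0;      // clear Thread::pending_exception
    }
#endif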
510 __ mov(r19, lr); 511 BLOCK_COMMENT("call exception_handler_for_return_address"); 512 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 513 SharedRuntime::exception_handler_for_return_address), 514 rthread, c_rarg1); 515 // Reinitialize the ptrue predicate register, in case the external runtime 516 // call clobbers ptrue reg, as we may return to SVE compiled code. 517 __ reinitialize_ptrue(); 518 519 // we should not really care that lr is no longer the callee 520 // address. we saved the value the handler needs in r19 so we can 521 // just copy it to r3. however, the C2 handler will push its own 522 // frame and then calls into the VM and the VM code asserts that 523 // the PC for the frame above the handler belongs to a compiled 524 // Java method. So, we restore lr here to satisfy that assert. 525 __ mov(lr, r19); 526 // setup r0 & r3 & clear pending exception 527 __ mov(r3, r19); 528 __ mov(r19, r0); 529 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 530 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 531 532 #ifdef ASSERT 533 // make sure exception is set 534 { 535 Label L; 536 __ cbnz(r0, L); 537 __ stop("StubRoutines::forward exception: no pending exception (2)"); 538 __ bind(L); 539 } 540 #endif 541 542 // continue at exception handler 543 // r0: exception 544 // r3: throwing pc 545 // r19: exception handler 546 __ verify_oop(r0); 547 __ br(r19); 548 549 return start; 550 } 551 552 // Non-destructive plausibility checks for oops 553 // 554 // Arguments: 555 // r0: oop to verify 556 // rscratch1: error message 557 // 558 // Stack after saving c_rarg3: 559 // [tos + 0]: saved c_rarg3 560 // [tos + 1]: saved c_rarg2 561 // [tos + 2]: saved lr 562 // [tos + 3]: saved rscratch2 563 // [tos + 4]: saved r0 564 // [tos + 5]: saved rscratch1 565 address generate_verify_oop() { 566 567 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 568 address start = __ pc(); 569 570 Label exit, error; 571 572 // save c_rarg2 and c_rarg3 573 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 574 575 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 576 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 577 __ ldr(c_rarg3, Address(c_rarg2)); 578 __ add(c_rarg3, c_rarg3, 1); 579 __ str(c_rarg3, Address(c_rarg2)); 580 581 // object is in r0 582 // make sure object is 'reasonable' 583 __ cbz(r0, exit); // if obj is null it is OK 584 585 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 586 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 587 588 // return if everything seems ok 589 __ bind(exit); 590 591 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 592 __ ret(lr); 593 594 // handle errors 595 __ bind(error); 596 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 597 598 __ push(RegSet::range(r0, r29), sp); 599 // debug(char* msg, int64_t pc, int64_t regs[]) 600 __ mov(c_rarg0, rscratch1); // pass address of error message 601 __ mov(c_rarg1, lr); // pass return address 602 __ mov(c_rarg2, sp); // pass address of regs on stack 603 #ifndef PRODUCT 604 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 605 #endif 606 BLOCK_COMMENT("call MacroAssembler::debug"); 607 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 608 __ blr(rscratch1); 609 __ hlt(0); 610 611 return start; 612 } 613 614 // Generate indices for iota vector. 
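  // The 64-bit literals emitted below pack consecutive lane indices (and, for
  // the FP variants, their floating-point equivalents) in little-endian lane
  // order. The #if 0 sketch is an editorial, plain-C++ decode of two of the
  // constants (check_iota_encoding is a made-up name; assumes a little-endian
  // host, which is what AArch64 HotSpot targets):
#if 0
  #include <cstdint>
  #include <cstring>
  #include <cassert>
  inline void check_iota_encoding() {
    // S - FP constant 0x4040000040000000 packs {2.0f, 3.0f}, lane 0 in the low bits
    uint64_t s_fp = 0x4040000040000000ull;
    float lanes[2];
    std::memcpy(lanes, &s_fp, sizeof lanes);
    assert(lanes[0] == 2.0f && lanes[1] == 3.0f);
    // B constant 0x0706050403020100 packs byte indices 0..7, one per lane
    uint64_t b = 0x0706050403020100ull;
    for (int i = 0; i < 8; i++) {
      assert(((b >> (8 * i)) & 0xff) == (uint64_t)i);
    }
  }
#endif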
615 address generate_iota_indices(const char *stub_name) { 616 __ align(CodeEntryAlignment); 617 StubCodeMark mark(this, "StubRoutines", stub_name); 618 address start = __ pc(); 619 // B 620 __ emit_data64(0x0706050403020100, relocInfo::none); 621 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 622 // H 623 __ emit_data64(0x0003000200010000, relocInfo::none); 624 __ emit_data64(0x0007000600050004, relocInfo::none); 625 // S 626 __ emit_data64(0x0000000100000000, relocInfo::none); 627 __ emit_data64(0x0000000300000002, relocInfo::none); 628 // D 629 __ emit_data64(0x0000000000000000, relocInfo::none); 630 __ emit_data64(0x0000000000000001, relocInfo::none); 631 // S - FP 632 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 633 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 634 // D - FP 635 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 636 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 637 return start; 638 } 639 640 // The inner part of zero_words(). This is the bulk operation, 641 // zeroing words in blocks, possibly using DC ZVA to do it. The 642 // caller is responsible for zeroing the last few words. 643 // 644 // Inputs: 645 // r10: the HeapWord-aligned base address of an array to zero. 646 // r11: the count in HeapWords, r11 > 0. 647 // 648 // Returns r10 and r11, adjusted for the caller to clear. 649 // r10: the base address of the tail of words left to clear. 650 // r11: the number of words in the tail. 651 // r11 < MacroAssembler::zero_words_block_size. 652 653 address generate_zero_blocks() { 654 Label done; 655 Label base_aligned; 656 657 Register base = r10, cnt = r11; 658 659 __ align(CodeEntryAlignment); 660 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 661 address start = __ pc(); 662 663 if (UseBlockZeroing) { 664 int zva_length = VM_Version::zva_length(); 665 666 // Ensure ZVA length can be divided by 16. This is required by 667 // the subsequent operations. 668 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 669 670 __ tbz(base, 3, base_aligned); 671 __ str(zr, Address(__ post(base, 8))); 672 __ sub(cnt, cnt, 1); 673 __ bind(base_aligned); 674 675 // Ensure count >= zva_length * 2 so that it still deserves a zva after 676 // alignment. 677 Label small; 678 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 679 __ subs(rscratch1, cnt, low_limit >> 3); 680 __ br(Assembler::LT, small); 681 __ zero_dcache_blocks(base, cnt); 682 __ bind(small); 683 } 684 685 { 686 // Number of stp instructions we'll unroll 687 const int unroll = 688 MacroAssembler::zero_words_block_size / 2; 689 // Clear the remaining blocks. 690 Label loop; 691 __ subs(cnt, cnt, unroll * 2); 692 __ br(Assembler::LT, done); 693 __ bind(loop); 694 for (int i = 0; i < unroll; i++) 695 __ stp(zr, zr, __ post(base, 16)); 696 __ subs(cnt, cnt, unroll * 2); 697 __ br(Assembler::GE, loop); 698 __ bind(done); 699 __ add(cnt, cnt, unroll * 2); 700 } 701 702 __ ret(lr); 703 704 return start; 705 } 706 707 708 typedef enum { 709 copy_forwards = 1, 710 copy_backwards = -1 711 } copy_direction; 712 713 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 714 // for arraycopy stubs. 
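  // The pattern it implements is "bind the invariant arguments once": the
  // decorators, element type and GC temp registers are captured in the
  // constructor so each call site only states "load/store N bytes here".
  // A generic editorial sketch of that pattern follows (BarrierOps and
  // BoundCopyOps are hypothetical stand-ins, not the HotSpot classes, which
  // forward to BarrierSetAssembler::copy_load_at / copy_store_at):
#if 0
  #include <cstring>
  struct BarrierOps {                       // stand-in for the barrier-set assembler
    void load (int size, void* dst, const void* src) { std::memcpy(dst, src, size); }
    void store(int size, void* dst, const void* src) { std::memcpy(dst, src, size); }
  };
  class BoundCopyOps {                      // stand-in for ArrayCopyBarrierSetHelper
    BarrierOps* _bs;                        // decorators/type/GC temps would be captured here too
  public:
    explicit BoundCopyOps(BarrierOps* bs) : _bs(bs) {}
    void copy_load_at_16 (void* regs, const void* mem) { _bs->load (16, regs, mem); }
    void copy_store_at_16(void* mem, const void* regs) { _bs->store(16, mem, regs); }
  };
#endif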
715 class ArrayCopyBarrierSetHelper : StackObj { 716 BarrierSetAssembler* _bs_asm; 717 MacroAssembler* _masm; 718 DecoratorSet _decorators; 719 BasicType _type; 720 Register _gct1; 721 Register _gct2; 722 Register _gct3; 723 FloatRegister _gcvt1; 724 FloatRegister _gcvt2; 725 FloatRegister _gcvt3; 726 727 public: 728 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 729 DecoratorSet decorators, 730 BasicType type, 731 Register gct1, 732 Register gct2, 733 Register gct3, 734 FloatRegister gcvt1, 735 FloatRegister gcvt2, 736 FloatRegister gcvt3) 737 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 738 _masm(masm), 739 _decorators(decorators), 740 _type(type), 741 _gct1(gct1), 742 _gct2(gct2), 743 _gct3(gct3), 744 _gcvt1(gcvt1), 745 _gcvt2(gcvt2), 746 _gcvt3(gcvt3) { 747 } 748 749 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 750 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 751 dst1, dst2, src, 752 _gct1, _gct2, _gcvt1); 753 } 754 755 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 756 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 757 dst, src1, src2, 758 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 759 } 760 761 void copy_load_at_16(Register dst1, Register dst2, Address src) { 762 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 763 dst1, dst2, src, 764 _gct1); 765 } 766 767 void copy_store_at_16(Address dst, Register src1, Register src2) { 768 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 769 dst, src1, src2, 770 _gct1, _gct2, _gct3); 771 } 772 773 void copy_load_at_8(Register dst, Address src) { 774 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 775 dst, noreg, src, 776 _gct1); 777 } 778 779 void copy_store_at_8(Address dst, Register src) { 780 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 781 dst, src, noreg, 782 _gct1, _gct2, _gct3); 783 } 784 }; 785 786 // Bulk copy of blocks of 8 words. 787 // 788 // count is a count of words. 789 // 790 // Precondition: count >= 8 791 // 792 // Postconditions: 793 // 794 // The least significant bit of count contains the remaining count 795 // of words to copy. The rest of count is trash. 796 // 797 // s and d are adjusted to point to the remaining words to copy 798 // 799 void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count, 800 copy_direction direction) { 801 int unit = wordSize * direction; 802 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize; 803 804 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 805 t4 = r7, t5 = r11, t6 = r12, t7 = r13; 806 const Register stride = r14; 807 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 808 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 809 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 810 811 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7); 812 assert_different_registers(s, d, count, rscratch1, rscratch2); 813 814 Label again, drain; 815 const char *stub_name; 816 if (direction == copy_forwards) 817 stub_name = "forward_copy_longs"; 818 else 819 stub_name = "backward_copy_longs"; 820 821 __ align(CodeEntryAlignment); 822 823 StubCodeMark mark(this, "StubRoutines", stub_name); 824 825 __ bind(start); 826 827 Label unaligned_copy_long; 828 if (AvoidUnalignedAccesses) { 829 __ tbnz(d, 3, unaligned_copy_long); 830 } 831 832 if (direction == copy_forwards) { 833 __ sub(s, s, bias); 834 __ sub(d, d, bias); 835 } 836 837 #ifdef ASSERT 838 // Make sure we are never given < 8 words 839 { 840 Label L; 841 __ cmp(count, (u1)8); 842 __ br(Assembler::GE, L); 843 __ stop("genrate_copy_longs called with < 8 words"); 844 __ bind(L); 845 } 846 #endif 847 848 // Fill 8 registers 849 if (UseSIMDForMemoryOps) { 850 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 851 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 852 } else { 853 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 854 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 855 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 856 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 857 } 858 859 __ subs(count, count, 16); 860 __ br(Assembler::LO, drain); 861 862 int prefetch = PrefetchCopyIntervalInBytes; 863 bool use_stride = false; 864 if (direction == copy_backwards) { 865 use_stride = prefetch > 256; 866 prefetch = -prefetch; 867 if (use_stride) __ mov(stride, prefetch); 868 } 869 870 __ bind(again); 871 872 if (PrefetchCopyIntervalInBytes > 0) 873 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 874 875 if (UseSIMDForMemoryOps) { 876 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 877 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 878 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 879 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 880 } else { 881 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 882 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 883 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 884 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 885 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 886 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 887 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 888 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 889 } 890 891 __ subs(count, count, 8); 892 __ br(Assembler::HS, again); 893 894 // Drain 895 __ bind(drain); 896 if (UseSIMDForMemoryOps) { 897 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 898 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 899 } else { 900 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 901 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 902 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 903 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 904 } 905 906 { 907 Label L1, L2; 908 __ tbz(count, exact_log2(4), L1); 909 if (UseSIMDForMemoryOps) { 910 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit))); 911 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1); 912 } else { 913 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 914 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 915 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 916 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3); 917 } 918 __ bind(L1); 919 920 if (direction == copy_forwards) { 921 __ add(s, s, bias); 922 __ add(d, d, bias); 923 } 924 925 __ tbz(count, 1, L2); 926 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 927 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1); 928 __ bind(L2); 929 } 930 931 __ ret(lr); 932 933 if (AvoidUnalignedAccesses) { 934 Label drain, again; 935 // Register order for storing. Order is different for backward copy. 936 937 __ bind(unaligned_copy_long); 938 939 // source address is even aligned, target odd aligned 940 // 941 // when forward copying word pairs we read long pairs at offsets 942 // {0, 2, 4, 6} (in long words). when backwards copying we read 943 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 944 // address by -2 in the forwards case so we can compute the 945 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 946 // or -1. 947 // 948 // when forward copying we need to store 1 word, 3 pairs and 949 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 950 // zero offset We adjust the destination by -1 which means we 951 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 952 // 953 // When backwards copyng we need to store 1 word, 3 pairs and 954 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 955 // offsets {1, 3, 5, 7, 8} * unit. 
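      // Editorial worked example (forwards case, unit == wordSize == 8): after
      // the -16/-8 adjustments applied just below, the load and store offsets
      // used in this block touch exactly the original 64-byte source and
      // destination blocks. The #if 0 check is plain C++ and not part of the
      // stub (check_unaligned_forward_offsets is an illustrative name only):
#if 0
      #include <cassert>
      void check_unaligned_forward_offsets() {
        const int unit  = 8;            // wordSize, forwards copy
        const int s_adj = -2 * unit;    // __ sub(s, s, 16) below
        const int d_adj = -1 * unit;    // __ sub(d, d, 8) below
        // load pairs at {2, 4, 6, 8} * unit from the adjusted s ...
        for (int i = 0; i < 4; i++) {
          assert(s_adj + (2 + 2 * i) * unit == 16 * i);   // ... hit bytes 0, 16, 32, 48
        }
        // stores of 1 word, 3 pairs, 1 word at {1, 2, 4, 6, 8} * unit from the adjusted d
        const int store_off[5] = { 1, 2, 4, 6, 8 };
        const int expect[5]    = { 0, 8, 24, 40, 56 };    // 8+16+16+16+8 == 64 bytes covered
        for (int i = 0; i < 5; i++) {
          assert(d_adj + store_off[i] * unit == expect[i]);
        }
      }
#endif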
956 957 if (direction == copy_forwards) { 958 __ sub(s, s, 16); 959 __ sub(d, d, 8); 960 } 961 962 // Fill 8 registers 963 // 964 // for forwards copy s was offset by -16 from the original input 965 // value of s so the register contents are at these offsets 966 // relative to the 64 bit block addressed by that original input 967 // and so on for each successive 64 byte block when s is updated 968 // 969 // t0 at offset 0, t1 at offset 8 970 // t2 at offset 16, t3 at offset 24 971 // t4 at offset 32, t5 at offset 40 972 // t6 at offset 48, t7 at offset 56 973 974 // for backwards copy s was not offset so the register contents 975 // are at these offsets into the preceding 64 byte block 976 // relative to that original input and so on for each successive 977 // preceding 64 byte block when s is updated. this explains the 978 // slightly counter-intuitive looking pattern of register usage 979 // in the stp instructions for backwards copy. 980 // 981 // t0 at offset -16, t1 at offset -8 982 // t2 at offset -32, t3 at offset -24 983 // t4 at offset -48, t5 at offset -40 984 // t6 at offset -64, t7 at offset -56 985 986 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 987 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 988 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 989 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 990 991 __ subs(count, count, 16); 992 __ br(Assembler::LO, drain); 993 994 int prefetch = PrefetchCopyIntervalInBytes; 995 bool use_stride = false; 996 if (direction == copy_backwards) { 997 use_stride = prefetch > 256; 998 prefetch = -prefetch; 999 if (use_stride) __ mov(stride, prefetch); 1000 } 1001 1002 __ bind(again); 1003 1004 if (PrefetchCopyIntervalInBytes > 0) 1005 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1006 1007 if (direction == copy_forwards) { 1008 // allowing for the offset of -8 the store instructions place 1009 // registers into the target 64 bit block at the following 1010 // offsets 1011 // 1012 // t0 at offset 0 1013 // t1 at offset 8, t2 at offset 16 1014 // t3 at offset 24, t4 at offset 32 1015 // t5 at offset 40, t6 at offset 48 1016 // t7 at offset 56 1017 1018 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1019 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1020 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1021 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1022 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1023 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1024 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1025 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1026 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1027 } else { 1028 // d was not offset when we started so the registers are 1029 // written into the 64 bit block preceding d with the following 1030 // offsets 1031 // 1032 // t1 at offset -8 1033 // t3 at offset -24, t0 at offset -16 1034 // t5 at offset -48, t2 at offset -32 1035 // t7 at offset -56, t4 at offset -48 1036 // t6 at offset -64 1037 // 1038 // note that this matches the offsets previously noted for the 1039 // loads 1040 1041 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1042 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1043 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1044 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1045 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1046 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1047 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1048 bs.copy_store_at_8(Address(__ pre(d, 
8 * unit)), t6); 1049 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1050 } 1051 1052 __ subs(count, count, 8); 1053 __ br(Assembler::HS, again); 1054 1055 // Drain 1056 // 1057 // this uses the same pattern of offsets and register arguments 1058 // as above 1059 __ bind(drain); 1060 if (direction == copy_forwards) { 1061 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1062 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1063 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1064 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1065 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1066 } else { 1067 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1068 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1069 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1070 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1071 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1072 } 1073 // now we need to copy any remaining part block which may 1074 // include a 4 word block subblock and/or a 2 word subblock. 1075 // bits 2 and 1 in the count are the tell-tale for whether we 1076 // have each such subblock 1077 { 1078 Label L1, L2; 1079 __ tbz(count, exact_log2(4), L1); 1080 // this is the same as above but copying only 4 longs hence 1081 // with only one intervening stp between the str instructions 1082 // but note that the offsets and registers still follow the 1083 // same pattern 1084 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1085 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 1086 if (direction == copy_forwards) { 1087 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1088 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1089 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3); 1090 } else { 1091 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1092 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1093 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2); 1094 } 1095 __ bind(L1); 1096 1097 __ tbz(count, 1, L2); 1098 // this is the same as above but copying only 2 longs hence 1099 // there is no intervening stp between the str instructions 1100 // but note that the offset and register patterns are still 1101 // the same 1102 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit))); 1103 if (direction == copy_forwards) { 1104 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1105 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1); 1106 } else { 1107 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1108 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0); 1109 } 1110 __ bind(L2); 1111 1112 // for forwards copy we need to re-adjust the offsets we 1113 // applied so that s and d are follow the last words written 1114 1115 if (direction == copy_forwards) { 1116 __ add(s, s, 16); 1117 __ add(d, d, 8); 1118 } 1119 1120 } 1121 1122 __ ret(lr); 1123 } 1124 } 1125 1126 // Small copy: less than 16 bytes. 1127 // 1128 // NB: Ignores all of the bits of count which represent more than 15 1129 // bytes, so a caller doesn't have to mask them. 1130 1131 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) { 1132 bool is_backwards = step < 0; 1133 size_t granularity = uabs(step); 1134 int direction = is_backwards ? 
-1 : 1; 1135 1136 Label Lword, Lint, Lshort, Lbyte; 1137 1138 assert(granularity 1139 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1140 1141 const Register t0 = r3; 1142 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1143 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg); 1144 1145 // ??? I don't know if this bit-test-and-branch is the right thing 1146 // to do. It does a lot of jumping, resulting in several 1147 // mispredicted branches. It might make more sense to do this 1148 // with something like Duff's device with a single computed branch. 1149 1150 __ tbz(count, 3 - exact_log2(granularity), Lword); 1151 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1152 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1153 __ bind(Lword); 1154 1155 if (granularity <= sizeof (jint)) { 1156 __ tbz(count, 2 - exact_log2(granularity), Lint); 1157 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1158 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1159 __ bind(Lint); 1160 } 1161 1162 if (granularity <= sizeof (jshort)) { 1163 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1164 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1165 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1166 __ bind(Lshort); 1167 } 1168 1169 if (granularity <= sizeof (jbyte)) { 1170 __ tbz(count, 0, Lbyte); 1171 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1172 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1173 __ bind(Lbyte); 1174 } 1175 } 1176 1177 Label copy_f, copy_b; 1178 Label copy_obj_f, copy_obj_b; 1179 Label copy_obj_uninit_f, copy_obj_uninit_b; 1180 1181 // All-singing all-dancing memory copy. 1182 // 1183 // Copy count units of memory from s to d. The size of a unit is 1184 // step, which can be positive or negative depending on the direction 1185 // of copy. If is_aligned is false, we align the source address. 1186 // 1187 1188 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1189 Register s, Register d, Register count, int step) { 1190 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1191 bool is_backwards = step < 0; 1192 unsigned int granularity = uabs(step); 1193 const Register t0 = r3, t1 = r4; 1194 1195 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1196 // load all the data before writing anything 1197 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1198 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1199 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1200 const Register send = r17, dend = r16; 1201 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1202 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1203 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1204 1205 if (PrefetchCopyIntervalInBytes > 0) 1206 __ prfm(Address(s, 0), PLDL1KEEP); 1207 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity)); 1208 __ br(Assembler::HI, copy_big); 1209 1210 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity)))); 1211 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity)))); 1212 1213 __ cmp(count, u1(16/granularity)); 1214 __ br(Assembler::LS, copy16); 1215 1216 __ cmp(count, u1(64/granularity)); 1217 __ br(Assembler::HI, copy80); 1218 1219 __ cmp(count, u1(32/granularity)); 1220 __ br(Assembler::LS, copy32); 1221 1222 // 33..64 bytes 1223 if (UseSIMDForMemoryOps) { 1224 bs.copy_load_at_32(v0, v1, Address(s, 0)); 1225 bs.copy_load_at_32(v2, v3, Address(send, -32)); 1226 bs.copy_store_at_32(Address(d, 0), v0, v1); 1227 bs.copy_store_at_32(Address(dend, -32), v2, v3); 1228 } else { 1229 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1230 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1231 bs.copy_load_at_16(t4, t5, Address(send, -32)); 1232 bs.copy_load_at_16(t6, t7, Address(send, -16)); 1233 1234 bs.copy_store_at_16(Address(d, 0), t0, t1); 1235 bs.copy_store_at_16(Address(d, 16), t2, t3); 1236 bs.copy_store_at_16(Address(dend, -32), t4, t5); 1237 bs.copy_store_at_16(Address(dend, -16), t6, t7); 1238 } 1239 __ b(finish); 1240 1241 // 17..32 bytes 1242 __ bind(copy32); 1243 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1244 bs.copy_load_at_16(t6, t7, Address(send, -16)); 1245 1246 bs.copy_store_at_16(Address(d, 0), t0, t1); 1247 bs.copy_store_at_16(Address(dend, -16), t6, t7); 1248 __ b(finish); 1249 1250 // 65..80/96 bytes 1251 // (96 bytes if SIMD because we do 32 byes per instruction) 1252 __ bind(copy80); 1253 if (UseSIMDForMemoryOps) { 1254 bs.copy_load_at_32(v0, v1, Address(s, 0)); 1255 bs.copy_load_at_32(v2, v3, Address(s, 32)); 1256 // Unaligned pointers can be an issue for copying. 1257 // The issue has more chances to happen when granularity of data is 1258 // less than 4(sizeof(jint)). Pointers for arrays of jint are at least 1259 // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned. 1260 // The most performance drop has been seen for the range 65-80 bytes. 1261 // For such cases using the pair of ldp/stp instead of the third pair of 1262 // ldpq/stpq fixes the performance issue. 
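    // Editorial summary (plain C++, not part of the stub; small_copy_path is a
    // made-up name) of the size dispatch copy_memory performs for the SIMD
    // byte-copy case (granularity == 1, UseSIMDForMemoryOps on); copy80 and
    // copy96 are the labels used in this routine:
#if 0
    const char* small_copy_path(unsigned bytes) {
      if (bytes > 96)  return "copy_big";
      if (bytes <= 16) return bytes >= 8 ? "8..16 inline" : (bytes >= 4 ? "copy8" : "copy4");
      if (bytes > 64)  return bytes > 80 ? "copy96: 3 x 32B" : "copy80: 2 x 32B + 16B tail";
      if (bytes > 32)  return "33..64: 2 x 32B with overlapping ends";
      return "copy32: 2 x 16B with overlapping ends";
    }
#endif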
1263 if (granularity < sizeof (jint)) { 1264 Label copy96; 1265 __ cmp(count, u1(80/granularity)); 1266 __ br(Assembler::HI, copy96); 1267 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1268 1269 bs.copy_store_at_32(Address(d, 0), v0, v1); 1270 bs.copy_store_at_32(Address(d, 32), v2, v3); 1271 1272 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1273 __ b(finish); 1274 1275 __ bind(copy96); 1276 } 1277 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1278 1279 bs.copy_store_at_32(Address(d, 0), v0, v1); 1280 bs.copy_store_at_32(Address(d, 32), v2, v3); 1281 1282 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1283 } else { 1284 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1285 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1286 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1287 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1288 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1289 1290 bs.copy_store_at_16(Address(d, 0), t0, t1); 1291 bs.copy_store_at_16(Address(d, 16), t2, t3); 1292 bs.copy_store_at_16(Address(d, 32), t4, t5); 1293 bs.copy_store_at_16(Address(d, 48), t6, t7); 1294 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1295 } 1296 __ b(finish); 1297 1298 // 0..16 bytes 1299 __ bind(copy16); 1300 __ cmp(count, u1(8/granularity)); 1301 __ br(Assembler::LO, copy8); 1302 1303 // 8..16 bytes 1304 bs.copy_load_at_8(t0, Address(s, 0)); 1305 bs.copy_load_at_8(t1, Address(send, -8)); 1306 bs.copy_store_at_8(Address(d, 0), t0); 1307 bs.copy_store_at_8(Address(dend, -8), t1); 1308 __ b(finish); 1309 1310 if (granularity < 8) { 1311 // 4..7 bytes 1312 __ bind(copy8); 1313 __ tbz(count, 2 - exact_log2(granularity), copy4); 1314 __ ldrw(t0, Address(s, 0)); 1315 __ ldrw(t1, Address(send, -4)); 1316 __ strw(t0, Address(d, 0)); 1317 __ strw(t1, Address(dend, -4)); 1318 __ b(finish); 1319 if (granularity < 4) { 1320 // 0..3 bytes 1321 __ bind(copy4); 1322 __ cbz(count, finish); // get rid of 0 case 1323 if (granularity == 2) { 1324 __ ldrh(t0, Address(s, 0)); 1325 __ strh(t0, Address(d, 0)); 1326 } else { // granularity == 1 1327 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1328 // the first and last byte. 1329 // Handle the 3 byte case by loading and storing base + count/2 1330 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1331 // This does means in the 1 byte case we load/store the same 1332 // byte 3 times. 1333 __ lsr(count, count, 1); 1334 __ ldrb(t0, Address(s, 0)); 1335 __ ldrb(t1, Address(send, -1)); 1336 __ ldrb(t2, Address(s, count)); 1337 __ strb(t0, Address(d, 0)); 1338 __ strb(t1, Address(dend, -1)); 1339 __ strb(t2, Address(d, count)); 1340 } 1341 __ b(finish); 1342 } 1343 } 1344 1345 __ bind(copy_big); 1346 if (is_backwards) { 1347 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1348 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1349 } 1350 1351 // Now we've got the small case out of the way we can align the 1352 // source address on a 2-word boundary. 1353 1354 // Here we will materialize a count in r15, which is used by copy_memory_small 1355 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1356 // Up until here, we have used t9, which aliases r15, but from here on, that register 1357 // can not be used as a temp register, as it contains the count. 1358 1359 Label aligned; 1360 1361 if (is_aligned) { 1362 // We may have to adjust by 1 word to get s 2-word-aligned. 
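    // Editorial sketch (plain C++, not part of the stub;
    // elements_until_16B_aligned is an illustrative name) of the alignment
    // bookkeeping done here and in the else branch below: the number of
    // leading elements that must be copied before s becomes 2-word (16-byte)
    // aligned, assuming s is already element-aligned (forwards case).
#if 0
    #include <cstdint>
    uint64_t elements_until_16B_aligned(uint64_t s, unsigned granularity) {
      uint64_t bytes = (0 - s) & (2 * 8 - 1);   // cf. __ neg + __ andr(r15, r15, 2 * wordSize - 1)
      return bytes / granularity;               // granularity is a power of two: the __ lsr above
    }
    // e.g. s == 0x1006, granularity == 2  ->  (0x10 - 0x6) / 2 == 5 elements copied first
#endif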
1363 __ tbz(s, exact_log2(wordSize), aligned); 1364 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1365 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1366 __ sub(count, count, wordSize/granularity); 1367 } else { 1368 if (is_backwards) { 1369 __ andr(r15, s, 2 * wordSize - 1); 1370 } else { 1371 __ neg(r15, s); 1372 __ andr(r15, r15, 2 * wordSize - 1); 1373 } 1374 // r15 is the byte adjustment needed to align s. 1375 __ cbz(r15, aligned); 1376 int shift = exact_log2(granularity); 1377 if (shift) __ lsr(r15, r15, shift); 1378 __ sub(count, count, r15); 1379 1380 #if 0 1381 // ?? This code is only correct for a disjoint copy. It may or 1382 // may not make sense to use it in that case. 1383 1384 // Copy the first pair; s and d may not be aligned. 1385 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1386 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1387 1388 // Align s and d, adjust count 1389 if (is_backwards) { 1390 __ sub(s, s, r15); 1391 __ sub(d, d, r15); 1392 } else { 1393 __ add(s, s, r15); 1394 __ add(d, d, r15); 1395 } 1396 #else 1397 copy_memory_small(decorators, type, s, d, r15, step); 1398 #endif 1399 } 1400 1401 __ bind(aligned); 1402 1403 // s is now 2-word-aligned. 1404 1405 // We have a count of units and some trailing bytes. Adjust the 1406 // count and do a bulk copy of words. 1407 __ lsr(r15, count, exact_log2(wordSize/granularity)); 1408 if (direction == copy_forwards) { 1409 if (type != T_OBJECT) { 1410 __ bl(copy_f); 1411 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1412 __ bl(copy_obj_uninit_f); 1413 } else { 1414 __ bl(copy_obj_f); 1415 } 1416 } else { 1417 if (type != T_OBJECT) { 1418 __ bl(copy_b); 1419 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1420 __ bl(copy_obj_uninit_b); 1421 } else { 1422 __ bl(copy_obj_b); 1423 } 1424 } 1425 1426 // And the tail. 1427 copy_memory_small(decorators, type, s, d, count, step); 1428 1429 if (granularity >= 8) __ bind(copy8); 1430 if (granularity >= 4) __ bind(copy4); 1431 __ bind(finish); 1432 } 1433 1434 1435 void clobber_registers() { 1436 #ifdef ASSERT 1437 RegSet clobbered 1438 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1439 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1440 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1441 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1442 __ mov(*it, rscratch1); 1443 } 1444 #endif 1445 1446 } 1447 1448 // Scan over array at a for count oops, verifying each one. 1449 // Preserves a and count, clobbers rscratch1 and rscratch2. 
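  // Editorial sketch of what the generated scan loop does, in plain C++ terms
  // (verify_one and verify_oop_array_model are hypothetical stand-ins for
  // verify_oop / decode_heap_oop, not HotSpot API):
#if 0
  #include <cstdint>
  #include <cstddef>
  inline void verify_one(uint64_t /*oop bits*/) { /* placeholder for the verify_oop check */ }
  inline void verify_oop_array_model(int size, const void* a, size_t count) {
    for (size_t i = 0; i < count; i++) {
      if (size == 8) {
        verify_one(((const uint64_t*)a)[i]);     // full-width oop
      } else {
        verify_one(((const uint32_t*)a)[i]);     // narrow oop: decode, then verify
      }
    }
  }
#endif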
1450 void verify_oop_array (int size, Register a, Register count, Register temp) { 1451 Label loop, end; 1452 __ mov(rscratch1, a); 1453 __ mov(rscratch2, zr); 1454 __ bind(loop); 1455 __ cmp(rscratch2, count); 1456 __ br(Assembler::HS, end); 1457 if (size == wordSize) { 1458 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1459 __ verify_oop(temp); 1460 } else { 1461 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1462 __ decode_heap_oop(temp); // calls verify_oop 1463 } 1464 __ add(rscratch2, rscratch2, 1); 1465 __ b(loop); 1466 __ bind(end); 1467 } 1468 1469 // Arguments: 1470 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1471 // ignored 1472 // is_oop - true => oop array, so generate store check code 1473 // name - stub name string 1474 // 1475 // Inputs: 1476 // c_rarg0 - source array address 1477 // c_rarg1 - destination array address 1478 // c_rarg2 - element count, treated as ssize_t, can be zero 1479 // 1480 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1481 // the hardware handle it. The two dwords within qwords that span 1482 // cache line boundaries will still be loaded and stored atomically. 1483 // 1484 // Side Effects: 1485 // disjoint_int_copy_entry is set to the no-overlap entry point 1486 // used by generate_conjoint_int_oop_copy(). 1487 // 1488 address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry, 1489 const char *name, bool dest_uninitialized = false) { 1490 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1491 RegSet saved_reg = RegSet::of(s, d, count); 1492 __ align(CodeEntryAlignment); 1493 StubCodeMark mark(this, "StubRoutines", name); 1494 address start = __ pc(); 1495 __ enter(); 1496 1497 if (entry != nullptr) { 1498 *entry = __ pc(); 1499 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1500 BLOCK_COMMENT("Entry:"); 1501 } 1502 1503 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1504 if (dest_uninitialized) { 1505 decorators |= IS_DEST_UNINITIALIZED; 1506 } 1507 if (aligned) { 1508 decorators |= ARRAYCOPY_ALIGNED; 1509 } 1510 1511 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1512 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1513 1514 if (is_oop) { 1515 // save regs before copy_memory 1516 __ push(RegSet::of(d, count), sp); 1517 } 1518 { 1519 // UnsafeMemoryAccess page error: continue after unsafe access 1520 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1521 UnsafeMemoryAccessMark umam(this, add_entry, true); 1522 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1523 } 1524 1525 if (is_oop) { 1526 __ pop(RegSet::of(d, count), sp); 1527 if (VerifyOops) 1528 verify_oop_array(size, d, count, r16); 1529 } 1530 1531 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1532 1533 __ leave(); 1534 __ mov(r0, zr); // return 0 1535 __ ret(lr); 1536 return start; 1537 } 1538 1539 // Arguments: 1540 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1541 // ignored 1542 // is_oop - true => oop array, so generate store check code 1543 // name - stub name string 1544 // 1545 // Inputs: 1546 // c_rarg0 - source array address 1547 // c_rarg1 - destination array address 1548 // c_rarg2 - element count, treated as ssize_t, can be zero 1549 // 1550 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1551 // the hardware handle it. 
The two dwords within qwords that span 1552 // cache line boundaries will still be loaded and stored atomically. 1553 // 1554 address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target, 1555 address *entry, const char *name, 1556 bool dest_uninitialized = false) { 1557 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1558 RegSet saved_regs = RegSet::of(s, d, count); 1559 StubCodeMark mark(this, "StubRoutines", name); 1560 address start = __ pc(); 1561 __ enter(); 1562 1563 if (entry != nullptr) { 1564 *entry = __ pc(); 1565 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1566 BLOCK_COMMENT("Entry:"); 1567 } 1568 1569 // use fwd copy when (d-s) above_equal (count*size) 1570 __ sub(rscratch1, d, s); 1571 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1572 __ br(Assembler::HS, nooverlap_target); 1573 1574 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1575 if (dest_uninitialized) { 1576 decorators |= IS_DEST_UNINITIALIZED; 1577 } 1578 if (aligned) { 1579 decorators |= ARRAYCOPY_ALIGNED; 1580 } 1581 1582 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1583 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1584 1585 if (is_oop) { 1586 // save regs before copy_memory 1587 __ push(RegSet::of(d, count), sp); 1588 } 1589 { 1590 // UnsafeMemoryAccess page error: continue after unsafe access 1591 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1592 UnsafeMemoryAccessMark umam(this, add_entry, true); 1593 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1594 } 1595 if (is_oop) { 1596 __ pop(RegSet::of(d, count), sp); 1597 if (VerifyOops) 1598 verify_oop_array(size, d, count, r16); 1599 } 1600 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1601 __ leave(); 1602 __ mov(r0, zr); // return 0 1603 __ ret(lr); 1604 return start; 1605 } 1606 1607 // Arguments: 1608 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1609 // ignored 1610 // name - stub name string 1611 // 1612 // Inputs: 1613 // c_rarg0 - source array address 1614 // c_rarg1 - destination array address 1615 // c_rarg2 - element count, treated as ssize_t, can be zero 1616 // 1617 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1618 // we let the hardware handle it. The one to eight bytes within words, 1619 // dwords or qwords that span cache line boundaries will still be loaded 1620 // and stored atomically. 1621 // 1622 // Side Effects: 1623 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1624 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1625 // we let the hardware handle it. The one to eight bytes within words, 1626 // dwords or qwords that span cache line boundaries will still be loaded 1627 // and stored atomically. 1628 // 1629 // Side Effects: 1630 // disjoint_byte_copy_entry is set to the no-overlap entry point 1631 // used by generate_conjoint_byte_copy(). 
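  // Editorial note: the conjoint stubs branch to these disjoint entry points
  // whenever a plain forward copy is safe. The test generate_conjoint_copy
  // emits is equivalent to the following plain C++ (forward_copy_is_safe is
  // an illustrative name; the unsigned wrap-around is intended):
#if 0
  #include <cstdint>
  inline bool forward_copy_is_safe(uint64_t s, uint64_t d, uint64_t count, unsigned elem_size) {
    // true when d is below s (the difference wraps to a huge value) or when
    // the regions do not overlap at all
    return (d - s) >= count * elem_size;
  }
#endif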
1632 // 1633 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1634 const bool not_oop = false; 1635 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1636 } 1637 1638 // Arguments: 1639 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1640 // ignored 1641 // name - stub name string 1642 // 1643 // Inputs: 1644 // c_rarg0 - source array address 1645 // c_rarg1 - destination array address 1646 // c_rarg2 - element count, treated as ssize_t, can be zero 1647 // 1648 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1649 // we let the hardware handle it. The one to eight bytes within words, 1650 // dwords or qwords that span cache line boundaries will still be loaded 1651 // and stored atomically. 1652 // 1653 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1654 address* entry, const char *name) { 1655 const bool not_oop = false; 1656 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1657 } 1658 1659 // Arguments: 1660 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1661 // ignored 1662 // name - stub name string 1663 // 1664 // Inputs: 1665 // c_rarg0 - source array address 1666 // c_rarg1 - destination array address 1667 // c_rarg2 - element count, treated as ssize_t, can be zero 1668 // 1669 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1670 // let the hardware handle it. The two or four words within dwords 1671 // or qwords that span cache line boundaries will still be loaded 1672 // and stored atomically. 1673 // 1674 // Side Effects: 1675 // disjoint_short_copy_entry is set to the no-overlap entry point 1676 // used by generate_conjoint_short_copy(). 1677 // 1678 address generate_disjoint_short_copy(bool aligned, 1679 address* entry, const char *name) { 1680 const bool not_oop = false; 1681 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1682 } 1683 1684 // Arguments: 1685 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1686 // ignored 1687 // name - stub name string 1688 // 1689 // Inputs: 1690 // c_rarg0 - source array address 1691 // c_rarg1 - destination array address 1692 // c_rarg2 - element count, treated as ssize_t, can be zero 1693 // 1694 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1695 // let the hardware handle it. The two or four words within dwords 1696 // or qwords that span cache line boundaries will still be loaded 1697 // and stored atomically. 1698 // 1699 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1700 address *entry, const char *name) { 1701 const bool not_oop = false; 1702 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1703 1704 } 1705 // Arguments: 1706 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1707 // ignored 1708 // name - stub name string 1709 // 1710 // Inputs: 1711 // c_rarg0 - source array address 1712 // c_rarg1 - destination array address 1713 // c_rarg2 - element count, treated as ssize_t, can be zero 1714 // 1715 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1716 // the hardware handle it. The two dwords within qwords that span 1717 // cache line boundaries will still be loaded and stored atomically. 
1718 // 1719 // Side Effects: 1720 // disjoint_int_copy_entry is set to the no-overlap entry point 1721 // used by generate_conjoint_int_oop_copy(). 1722 // 1723 address generate_disjoint_int_copy(bool aligned, address *entry, 1724 const char *name, bool dest_uninitialized = false) { 1725 const bool not_oop = false; 1726 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1727 } 1728 1729 // Arguments: 1730 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1731 // ignored 1732 // name - stub name string 1733 // 1734 // Inputs: 1735 // c_rarg0 - source array address 1736 // c_rarg1 - destination array address 1737 // c_rarg2 - element count, treated as ssize_t, can be zero 1738 // 1739 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1740 // the hardware handle it. The two dwords within qwords that span 1741 // cache line boundaries will still be loaded and stored atomically. 1742 // 1743 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1744 address *entry, const char *name, 1745 bool dest_uninitialized = false) { 1746 const bool not_oop = false; 1747 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1748 } 1749 1750 1751 // Arguments: 1752 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1753 // ignored 1754 // name - stub name string 1755 // 1756 // Inputs: 1757 // c_rarg0 - source array address 1758 // c_rarg1 - destination array address 1759 // c_rarg2 - element count, treated as size_t, can be zero 1760 // 1761 // Side Effects: 1762 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1763 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1764 // 1765 address generate_disjoint_long_copy(bool aligned, address *entry, 1766 const char *name, bool dest_uninitialized = false) { 1767 const bool not_oop = false; 1768 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1769 } 1770 1771 // Arguments: 1772 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1773 // ignored 1774 // name - stub name string 1775 // 1776 // Inputs: 1777 // c_rarg0 - source array address 1778 // c_rarg1 - destination array address 1779 // c_rarg2 - element count, treated as size_t, can be zero 1780 // 1781 address generate_conjoint_long_copy(bool aligned, 1782 address nooverlap_target, address *entry, 1783 const char *name, bool dest_uninitialized = false) { 1784 const bool not_oop = false; 1785 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1786 } 1787 1788 // Arguments: 1789 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1790 // ignored 1791 // name - stub name string 1792 // 1793 // Inputs: 1794 // c_rarg0 - source array address 1795 // c_rarg1 - destination array address 1796 // c_rarg2 - element count, treated as size_t, can be zero 1797 // 1798 // Side Effects: 1799 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1800 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1801 // 1802 address generate_disjoint_oop_copy(bool aligned, address *entry, 1803 const char *name, bool dest_uninitialized) { 1804 const bool is_oop = true; 1805 const int size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1806 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1807 } 1808 1809 // Arguments: 1810 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1811 // ignored 1812 // name - stub name string 1813 // 1814 // Inputs: 1815 // c_rarg0 - source array address 1816 // c_rarg1 - destination array address 1817 // c_rarg2 - element count, treated as size_t, can be zero 1818 // 1819 address generate_conjoint_oop_copy(bool aligned, 1820 address nooverlap_target, address *entry, 1821 const char *name, bool dest_uninitialized) { 1822 const bool is_oop = true; 1823 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1824 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1825 name, dest_uninitialized); 1826 } 1827 1828 1829 // Helper for generating a dynamic type check. 1830 // Smashes rscratch1, rscratch2. 1831 void generate_type_check(Register sub_klass, 1832 Register super_check_offset, 1833 Register super_klass, 1834 Label& L_success) { 1835 assert_different_registers(sub_klass, super_check_offset, super_klass); 1836 1837 BLOCK_COMMENT("type_check:"); 1838 1839 Label L_miss; 1840 1841 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1842 super_check_offset); 1843 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr); 1844 1845 // Fall through on failure! 1846 __ BIND(L_miss); 1847 } 1848 1849 // 1850 // Generate checkcasting array copy stub 1851 // 1852 // Input: 1853 // c_rarg0 - source array address 1854 // c_rarg1 - destination array address 1855 // c_rarg2 - element count, treated as ssize_t, can be zero 1856 // c_rarg3 - size_t ckoff (super_check_offset) 1857 // c_rarg4 - oop ckval (super_klass) 1858 // 1859 // Output: 1860 // r0 == 0 - success 1861 // r0 == -1^K - failure, where K is partial transfer count 1862 // 1863 address generate_checkcast_copy(const char *name, address *entry, 1864 bool dest_uninitialized = false) { 1865 1866 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1867 1868 // Input registers (after setup_arg_regs) 1869 const Register from = c_rarg0; // source array address 1870 const Register to = c_rarg1; // destination array address 1871 const Register count = c_rarg2; // elementscount 1872 const Register ckoff = c_rarg3; // super_check_offset 1873 const Register ckval = c_rarg4; // super_klass 1874 1875 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1876 RegSet wb_post_saved_regs = RegSet::of(count); 1877 1878 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1879 const Register copied_oop = r22; // actual oop copied 1880 const Register count_save = r21; // orig elementscount 1881 const Register start_to = r20; // destination array start address 1882 const Register r19_klass = r19; // oop._klass 1883 1884 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1885 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1886 1887 //--------------------------------------------------------------- 1888 // Assembler stub will be used for this call to arraycopy 1889 // if the two arrays are subtypes of Object[] but the 1890 // destination array type is not equal to or a supertype 1891 // of the source type. Each element must be separately 1892 // checked. 
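    // Illustration (comment only): a case that reaches this element-checking
    // stub is copying from an Object[] whose elements are, say, two Strings
    // followed by an Integer into a String[] destination.  Both arrays are
    // subtypes of Object[], but String[] is neither equal to nor a supertype
    // of Object[], so every element is type-checked as it is stored.  The
    // third element fails, the copy stops with K == 2 elements transferred,
    // and r0 reports -1^2 == ~2 == -3 to the caller.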
1893 1894 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1895 copied_oop, r19_klass, count_save); 1896 1897 __ align(CodeEntryAlignment); 1898 StubCodeMark mark(this, "StubRoutines", name); 1899 address start = __ pc(); 1900 1901 __ enter(); // required for proper stackwalking of RuntimeStub frame 1902 1903 #ifdef ASSERT 1904 // caller guarantees that the arrays really are different 1905 // otherwise, we would have to make conjoint checks 1906 { Label L; 1907 __ b(L); // conjoint check not yet implemented 1908 __ stop("checkcast_copy within a single array"); 1909 __ bind(L); 1910 } 1911 #endif //ASSERT 1912 1913 // Caller of this entry point must set up the argument registers. 1914 if (entry != nullptr) { 1915 *entry = __ pc(); 1916 BLOCK_COMMENT("Entry:"); 1917 } 1918 1919 // Empty array: Nothing to do. 1920 __ cbz(count, L_done); 1921 __ push(RegSet::of(r19, r20, r21, r22), sp); 1922 1923 #ifdef ASSERT 1924 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1925 // The ckoff and ckval must be mutually consistent, 1926 // even though caller generates both. 1927 { Label L; 1928 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1929 __ ldrw(start_to, Address(ckval, sco_offset)); 1930 __ cmpw(ckoff, start_to); 1931 __ br(Assembler::EQ, L); 1932 __ stop("super_check_offset inconsistent"); 1933 __ bind(L); 1934 } 1935 #endif //ASSERT 1936 1937 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1938 bool is_oop = true; 1939 int element_size = UseCompressedOops ? 4 : 8; 1940 if (dest_uninitialized) { 1941 decorators |= IS_DEST_UNINITIALIZED; 1942 } 1943 1944 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1945 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1946 1947 // save the original count 1948 __ mov(count_save, count); 1949 1950 // Copy from low to high addresses 1951 __ mov(start_to, to); // Save destination array start address 1952 __ b(L_load_element); 1953 1954 // ======== begin loop ======== 1955 // (Loop is rotated; its entry is L_load_element.) 1956 // Loop control: 1957 // for (; count != 0; count--) { 1958 // copied_oop = load_heap_oop(from++); 1959 // ... generate_type_check ...; 1960 // store_heap_oop(to++, copied_oop); 1961 // } 1962 __ align(OptoLoopAlignment); 1963 1964 __ BIND(L_store_element); 1965 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1966 __ post(to, element_size), copied_oop, noreg, 1967 gct1, gct2, gct3); 1968 __ sub(count, count, 1); 1969 __ cbz(count, L_do_card_marks); 1970 1971 // ======== loop entry is here ======== 1972 __ BIND(L_load_element); 1973 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1974 copied_oop, noreg, __ post(from, element_size), 1975 gct1); 1976 __ cbz(copied_oop, L_store_element); 1977 1978 __ load_klass(r19_klass, copied_oop);// query the object klass 1979 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1980 // ======== end loop ======== 1981 1982 // It was a real error; we must depend on the caller to finish the job. 1983 // Register count = remaining oops, count_orig = total oops. 1984 // Emit GC store barriers for the oops we have copied and report 1985 // their number to the caller. 
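    // Worked example (comment only) of the failure encoding emitted just
    // below: if count_save == 10 oops were requested and count == 4 remain
    // when the type check fails, then
    //   subs:  K  = count_save - count = 6      // oops actually copied
    //   eon :  r0 = -1 ^ K = ~6 = -7
    // The caller recovers K as ~r0; a fully successful copy returns 0.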
1986 1987 __ subs(count, count_save, count); // K = partially copied oop count 1988 __ eon(count, count, zr); // report (-1^K) to caller 1989 __ br(Assembler::EQ, L_done_pop); 1990 1991 __ BIND(L_do_card_marks); 1992 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1993 1994 __ bind(L_done_pop); 1995 __ pop(RegSet::of(r19, r20, r21, r22), sp); 1996 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1997 1998 __ bind(L_done); 1999 __ mov(r0, count); 2000 __ leave(); 2001 __ ret(lr); 2002 2003 return start; 2004 } 2005 2006 // Perform range checks on the proposed arraycopy. 2007 // Kills temp, but nothing else. 2008 // Also, clean the sign bits of src_pos and dst_pos. 2009 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2010 Register src_pos, // source position (c_rarg1) 2011 Register dst, // destination array oo (c_rarg2) 2012 Register dst_pos, // destination position (c_rarg3) 2013 Register length, 2014 Register temp, 2015 Label& L_failed) { 2016 BLOCK_COMMENT("arraycopy_range_checks:"); 2017 2018 assert_different_registers(rscratch1, temp); 2019 2020 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2021 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 2022 __ addw(temp, length, src_pos); 2023 __ cmpw(temp, rscratch1); 2024 __ br(Assembler::HI, L_failed); 2025 2026 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2027 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2028 __ addw(temp, length, dst_pos); 2029 __ cmpw(temp, rscratch1); 2030 __ br(Assembler::HI, L_failed); 2031 2032 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 2033 __ movw(src_pos, src_pos); 2034 __ movw(dst_pos, dst_pos); 2035 2036 BLOCK_COMMENT("arraycopy_range_checks done"); 2037 } 2038 2039 // These stubs get called from some dumb test routine. 2040 // I'll write them properly when they're called from 2041 // something that's actually doing something. 2042 static void fake_arraycopy_stub(address src, address dst, int count) { 2043 assert(count == 0, "huh?"); 2044 } 2045 2046 2047 // 2048 // Generate 'unsafe' array copy stub 2049 // Though just as safe as the other stubs, it takes an unscaled 2050 // size_t argument instead of an element count. 2051 // 2052 // Input: 2053 // c_rarg0 - source array address 2054 // c_rarg1 - destination array address 2055 // c_rarg2 - byte count, treated as ssize_t, can be zero 2056 // 2057 // Examines the alignment of the operands and dispatches 2058 // to a long, int, short, or byte copy loop. 
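  // Worked example (comment only) of that dispatch: with s = 0x1004,
  // d = 0x2008 and count = 20 (0x14), s|d|count == 0x301c.  Its low three
  // bits are 0b100, so the operands are not all 8-byte aligned, but the low
  // two bits are zero, so the stub shifts count right by LogBytesPerInt
  // (20 bytes -> 5 ints) and tail-calls the int copy entry.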
2059 // 2060 address generate_unsafe_copy(const char *name, 2061 address byte_copy_entry, 2062 address short_copy_entry, 2063 address int_copy_entry, 2064 address long_copy_entry) { 2065 Label L_long_aligned, L_int_aligned, L_short_aligned; 2066 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2067 2068 __ align(CodeEntryAlignment); 2069 StubCodeMark mark(this, "StubRoutines", name); 2070 address start = __ pc(); 2071 __ enter(); // required for proper stackwalking of RuntimeStub frame 2072 2073 // bump this on entry, not on exit: 2074 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2075 2076 __ orr(rscratch1, s, d); 2077 __ orr(rscratch1, rscratch1, count); 2078 2079 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2080 __ cbz(rscratch1, L_long_aligned); 2081 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2082 __ cbz(rscratch1, L_int_aligned); 2083 __ tbz(rscratch1, 0, L_short_aligned); 2084 __ b(RuntimeAddress(byte_copy_entry)); 2085 2086 __ BIND(L_short_aligned); 2087 __ lsr(count, count, LogBytesPerShort); // size => short_count 2088 __ b(RuntimeAddress(short_copy_entry)); 2089 __ BIND(L_int_aligned); 2090 __ lsr(count, count, LogBytesPerInt); // size => int_count 2091 __ b(RuntimeAddress(int_copy_entry)); 2092 __ BIND(L_long_aligned); 2093 __ lsr(count, count, LogBytesPerLong); // size => long_count 2094 __ b(RuntimeAddress(long_copy_entry)); 2095 2096 return start; 2097 } 2098 2099 // 2100 // Generate generic array copy stubs 2101 // 2102 // Input: 2103 // c_rarg0 - src oop 2104 // c_rarg1 - src_pos (32-bits) 2105 // c_rarg2 - dst oop 2106 // c_rarg3 - dst_pos (32-bits) 2107 // c_rarg4 - element count (32-bits) 2108 // 2109 // Output: 2110 // r0 == 0 - success 2111 // r0 == -1^K - failure, where K is partial transfer count 2112 // 2113 address generate_generic_copy(const char *name, 2114 address byte_copy_entry, address short_copy_entry, 2115 address int_copy_entry, address oop_copy_entry, 2116 address long_copy_entry, address checkcast_copy_entry) { 2117 2118 Label L_failed, L_objArray; 2119 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2120 2121 // Input registers 2122 const Register src = c_rarg0; // source array oop 2123 const Register src_pos = c_rarg1; // source position 2124 const Register dst = c_rarg2; // destination array oop 2125 const Register dst_pos = c_rarg3; // destination position 2126 const Register length = c_rarg4; 2127 2128 2129 // Registers used as temps 2130 const Register dst_klass = c_rarg5; 2131 2132 __ align(CodeEntryAlignment); 2133 2134 StubCodeMark mark(this, "StubRoutines", name); 2135 2136 address start = __ pc(); 2137 2138 __ enter(); // required for proper stackwalking of RuntimeStub frame 2139 2140 // bump this on entry, not on exit: 2141 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2142 2143 //----------------------------------------------------------------------- 2144 // Assembler stub will be used for this call to arraycopy 2145 // if the following conditions are met: 2146 // 2147 // (1) src and dst must not be null. 2148 // (2) src_pos must not be negative. 2149 // (3) dst_pos must not be negative. 2150 // (4) length must not be negative. 2151 // (5) src klass and dst klass should be the same and not null. 2152 // (6) src and dst should be arrays. 2153 // (7) src_pos + length must not exceed length of src. 2154 // (8) dst_pos + length must not exceed length of dst. 
2155 // 2156 2157 // if (src == nullptr) return -1; 2158 __ cbz(src, L_failed); 2159 2160 // if (src_pos < 0) return -1; 2161 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2162 2163 // if (dst == nullptr) return -1; 2164 __ cbz(dst, L_failed); 2165 2166 // if (dst_pos < 0) return -1; 2167 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2168 2169 // registers used as temp 2170 const Register scratch_length = r16; // elements count to copy 2171 const Register scratch_src_klass = r17; // array klass 2172 const Register lh = r15; // layout helper 2173 2174 // if (length < 0) return -1; 2175 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2176 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2177 2178 __ load_klass(scratch_src_klass, src); 2179 #ifdef ASSERT 2180 // assert(src->klass() != nullptr); 2181 { 2182 BLOCK_COMMENT("assert klasses not null {"); 2183 Label L1, L2; 2184 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2185 __ bind(L1); 2186 __ stop("broken null klass"); 2187 __ bind(L2); 2188 __ load_klass(rscratch1, dst); 2189 __ cbz(rscratch1, L1); // this would be broken also 2190 BLOCK_COMMENT("} assert klasses not null done"); 2191 } 2192 #endif 2193 2194 // Load layout helper (32-bits) 2195 // 2196 // |array_tag| | header_size | element_type | |log2_element_size| 2197 // 32 30 24 16 8 2 0 2198 // 2199 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2200 // 2201 2202 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2203 2204 // Handle objArrays completely differently... 2205 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2206 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2207 __ movw(rscratch1, objArray_lh); 2208 __ eorw(rscratch2, lh, rscratch1); 2209 __ cbzw(rscratch2, L_objArray); 2210 2211 // if (src->klass() != dst->klass()) return -1; 2212 __ load_klass(rscratch2, dst); 2213 __ eor(rscratch2, rscratch2, scratch_src_klass); 2214 __ cbnz(rscratch2, L_failed); 2215 2216 // if (!src->is_Array()) return -1; 2217 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2218 2219 // At this point, it is known to be a typeArray (array_tag 0x3). 
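    // Layout-helper decoding example (comment only).  For a hypothetical
    // jint array whose header occupies 16 bytes (the exact header size
    // depends on the build), the word sketched above is
    //   lh = (0x3 << 30)      // array_tag: typeArray
    //      | (16  << 16)      // header size in bytes (assumed value)
    //      | (T_INT << 8)     // element type
    //      | 2                // log2(element size) == log2(sizeof(jint))
    // The ubfx below extracts the header-size field to form the array base
    // offset, and the two low bits of lh drive the byte/short/int/long
    // dispatch.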
2220 #ifdef ASSERT 2221 { 2222 BLOCK_COMMENT("assert primitive array {"); 2223 Label L; 2224 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2225 __ cmpw(lh, rscratch2); 2226 __ br(Assembler::GE, L); 2227 __ stop("must be a primitive array"); 2228 __ bind(L); 2229 BLOCK_COMMENT("} assert primitive array done"); 2230 } 2231 #endif 2232 2233 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2234 rscratch2, L_failed); 2235 2236 // TypeArrayKlass 2237 // 2238 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2239 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2240 // 2241 2242 const Register rscratch1_offset = rscratch1; // array offset 2243 const Register r15_elsize = lh; // element size 2244 2245 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2246 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2247 __ add(src, src, rscratch1_offset); // src array offset 2248 __ add(dst, dst, rscratch1_offset); // dst array offset 2249 BLOCK_COMMENT("choose copy loop based on element size"); 2250 2251 // next registers should be set before the jump to corresponding stub 2252 const Register from = c_rarg0; // source array address 2253 const Register to = c_rarg1; // destination array address 2254 const Register count = c_rarg2; // elements count 2255 2256 // 'from', 'to', 'count' registers should be set in such order 2257 // since they are the same as 'src', 'src_pos', 'dst'. 2258 2259 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2260 2261 // The possible values of elsize are 0-3, i.e. exact_log2(element 2262 // size in bytes). We do a simple bitwise binary search. 2263 __ BIND(L_copy_bytes); 2264 __ tbnz(r15_elsize, 1, L_copy_ints); 2265 __ tbnz(r15_elsize, 0, L_copy_shorts); 2266 __ lea(from, Address(src, src_pos));// src_addr 2267 __ lea(to, Address(dst, dst_pos));// dst_addr 2268 __ movw(count, scratch_length); // length 2269 __ b(RuntimeAddress(byte_copy_entry)); 2270 2271 __ BIND(L_copy_shorts); 2272 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2273 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2274 __ movw(count, scratch_length); // length 2275 __ b(RuntimeAddress(short_copy_entry)); 2276 2277 __ BIND(L_copy_ints); 2278 __ tbnz(r15_elsize, 0, L_copy_longs); 2279 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2280 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2281 __ movw(count, scratch_length); // length 2282 __ b(RuntimeAddress(int_copy_entry)); 2283 2284 __ BIND(L_copy_longs); 2285 #ifdef ASSERT 2286 { 2287 BLOCK_COMMENT("assert long copy {"); 2288 Label L; 2289 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2290 __ cmpw(r15_elsize, LogBytesPerLong); 2291 __ br(Assembler::EQ, L); 2292 __ stop("must be long copy, but elsize is wrong"); 2293 __ bind(L); 2294 BLOCK_COMMENT("} assert long copy done"); 2295 } 2296 #endif 2297 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2298 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2299 __ movw(count, scratch_length); // length 2300 __ b(RuntimeAddress(long_copy_entry)); 2301 2302 // ObjArrayKlass 2303 __ BIND(L_objArray); 2304 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2305 2306 Label L_plain_copy, L_checkcast_copy; 2307 // test array classes for subtyping 2308 __ load_klass(r15, dst); 2309 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2310 __ br(Assembler::NE, L_checkcast_copy); 2311 2312 // Identically typed arrays can be copied without element-wise checks. 2313 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2314 rscratch2, L_failed); 2315 2316 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2317 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2318 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2319 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2320 __ movw(count, scratch_length); // length 2321 __ BIND(L_plain_copy); 2322 __ b(RuntimeAddress(oop_copy_entry)); 2323 2324 __ BIND(L_checkcast_copy); 2325 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2326 { 2327 // Before looking at dst.length, make sure dst is also an objArray. 2328 __ ldrw(rscratch1, Address(r15, lh_offset)); 2329 __ movw(rscratch2, objArray_lh); 2330 __ eorw(rscratch1, rscratch1, rscratch2); 2331 __ cbnzw(rscratch1, L_failed); 2332 2333 // It is safe to examine both src.length and dst.length. 2334 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2335 r15, L_failed); 2336 2337 __ load_klass(dst_klass, dst); // reload 2338 2339 // Marshal the base address arguments now, freeing registers. 2340 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2341 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2342 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2343 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2344 __ movw(count, length); // length (reloaded) 2345 Register sco_temp = c_rarg3; // this register is free now 2346 assert_different_registers(from, to, count, sco_temp, 2347 dst_klass, scratch_src_klass); 2348 // assert_clean_int(count, sco_temp); 2349 2350 // Generate the type check. 2351 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2352 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2353 2354 // Smashes rscratch1, rscratch2 2355 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2356 2357 // Fetch destination element klass from the ObjArrayKlass header. 2358 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2359 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2360 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2361 2362 // the checkcast_copy loop needs two extra arguments: 2363 assert(c_rarg3 == sco_temp, "#3 already in place"); 2364 // Set up arguments for checkcast_copy_entry. 2365 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2366 __ b(RuntimeAddress(checkcast_copy_entry)); 2367 } 2368 2369 __ BIND(L_failed); 2370 __ mov(r0, -1); 2371 __ leave(); // required for proper stackwalking of RuntimeStub frame 2372 __ ret(lr); 2373 2374 return start; 2375 } 2376 2377 // 2378 // Generate stub for array fill. If "aligned" is true, the 2379 // "to" address is assumed to be heapword aligned. 
2380 // 2381 // Arguments for generated stub: 2382 // to: c_rarg0 2383 // value: c_rarg1 2384 // count: c_rarg2 treated as signed 2385 // 2386 address generate_fill(BasicType t, bool aligned, const char *name) { 2387 __ align(CodeEntryAlignment); 2388 StubCodeMark mark(this, "StubRoutines", name); 2389 address start = __ pc(); 2390 2391 BLOCK_COMMENT("Entry:"); 2392 2393 const Register to = c_rarg0; // source array address 2394 const Register value = c_rarg1; // value 2395 const Register count = c_rarg2; // elements count 2396 2397 const Register bz_base = r10; // base for block_zero routine 2398 const Register cnt_words = r11; // temp register 2399 2400 __ enter(); 2401 2402 Label L_fill_elements, L_exit1; 2403 2404 int shift = -1; 2405 switch (t) { 2406 case T_BYTE: 2407 shift = 0; 2408 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2409 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2410 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2411 __ br(Assembler::LO, L_fill_elements); 2412 break; 2413 case T_SHORT: 2414 shift = 1; 2415 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2416 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2417 __ br(Assembler::LO, L_fill_elements); 2418 break; 2419 case T_INT: 2420 shift = 2; 2421 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2422 __ br(Assembler::LO, L_fill_elements); 2423 break; 2424 default: ShouldNotReachHere(); 2425 } 2426 2427 // Align source address at 8 bytes address boundary. 2428 Label L_skip_align1, L_skip_align2, L_skip_align4; 2429 if (!aligned) { 2430 switch (t) { 2431 case T_BYTE: 2432 // One byte misalignment happens only for byte arrays. 2433 __ tbz(to, 0, L_skip_align1); 2434 __ strb(value, Address(__ post(to, 1))); 2435 __ subw(count, count, 1); 2436 __ bind(L_skip_align1); 2437 // Fallthrough 2438 case T_SHORT: 2439 // Two bytes misalignment happens only for byte and short (char) arrays. 2440 __ tbz(to, 1, L_skip_align2); 2441 __ strh(value, Address(__ post(to, 2))); 2442 __ subw(count, count, 2 >> shift); 2443 __ bind(L_skip_align2); 2444 // Fallthrough 2445 case T_INT: 2446 // Align to 8 bytes, we know we are 4 byte aligned to start. 2447 __ tbz(to, 2, L_skip_align4); 2448 __ strw(value, Address(__ post(to, 4))); 2449 __ subw(count, count, 4 >> shift); 2450 __ bind(L_skip_align4); 2451 break; 2452 default: ShouldNotReachHere(); 2453 } 2454 } 2455 2456 // 2457 // Fill large chunks 2458 // 2459 __ lsrw(cnt_words, count, 3 - shift); // number of words 2460 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2461 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2462 if (UseBlockZeroing) { 2463 Label non_block_zeroing, rest; 2464 // If the fill value is zero we can use the fast zero_words(). 2465 __ cbnz(value, non_block_zeroing); 2466 __ mov(bz_base, to); 2467 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2468 address tpc = __ zero_words(bz_base, cnt_words); 2469 if (tpc == nullptr) { 2470 fatal("CodeCache is full at generate_fill"); 2471 } 2472 __ b(rest); 2473 __ bind(non_block_zeroing); 2474 __ fill_words(to, cnt_words, value); 2475 __ bind(rest); 2476 } else { 2477 __ fill_words(to, cnt_words, value); 2478 } 2479 2480 // Remaining count is less than 8 bytes. Fill it by a single store. 2481 // Note that the total length is no less than 8 bytes. 
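    // Tail-store illustration (comment only): for a byte fill with count == 13
    // the word loop above stores 8 bytes and leaves count == 5.  The code
    // below advances 'to' to the end of the region and stores one 64-bit
    // value at (end - 8), refilling 3 already-written bytes with the same
    // pattern.  The "total length >= 8" note is what keeps (end - 8) from
    // reaching back before the start of the fill region.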
2482 if (t == T_BYTE || t == T_SHORT) { 2483 Label L_exit1; 2484 __ cbzw(count, L_exit1); 2485 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2486 __ str(value, Address(to, -8)); // overwrite some elements 2487 __ bind(L_exit1); 2488 __ leave(); 2489 __ ret(lr); 2490 } 2491 2492 // Handle copies less than 8 bytes. 2493 Label L_fill_2, L_fill_4, L_exit2; 2494 __ bind(L_fill_elements); 2495 switch (t) { 2496 case T_BYTE: 2497 __ tbz(count, 0, L_fill_2); 2498 __ strb(value, Address(__ post(to, 1))); 2499 __ bind(L_fill_2); 2500 __ tbz(count, 1, L_fill_4); 2501 __ strh(value, Address(__ post(to, 2))); 2502 __ bind(L_fill_4); 2503 __ tbz(count, 2, L_exit2); 2504 __ strw(value, Address(to)); 2505 break; 2506 case T_SHORT: 2507 __ tbz(count, 0, L_fill_4); 2508 __ strh(value, Address(__ post(to, 2))); 2509 __ bind(L_fill_4); 2510 __ tbz(count, 1, L_exit2); 2511 __ strw(value, Address(to)); 2512 break; 2513 case T_INT: 2514 __ cbzw(count, L_exit2); 2515 __ strw(value, Address(to)); 2516 break; 2517 default: ShouldNotReachHere(); 2518 } 2519 __ bind(L_exit2); 2520 __ leave(); 2521 __ ret(lr); 2522 return start; 2523 } 2524 2525 address generate_data_cache_writeback() { 2526 const Register line = c_rarg0; // address of line to write back 2527 2528 __ align(CodeEntryAlignment); 2529 2530 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2531 2532 address start = __ pc(); 2533 __ enter(); 2534 __ cache_wb(Address(line, 0)); 2535 __ leave(); 2536 __ ret(lr); 2537 2538 return start; 2539 } 2540 2541 address generate_data_cache_writeback_sync() { 2542 const Register is_pre = c_rarg0; // pre or post sync 2543 2544 __ align(CodeEntryAlignment); 2545 2546 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2547 2548 // pre wbsync is a no-op 2549 // post wbsync translates to an sfence 2550 2551 Label skip; 2552 address start = __ pc(); 2553 __ enter(); 2554 __ cbnz(is_pre, skip); 2555 __ cache_wbsync(false); 2556 __ bind(skip); 2557 __ leave(); 2558 __ ret(lr); 2559 2560 return start; 2561 } 2562 2563 void generate_arraycopy_stubs() { 2564 address entry; 2565 address entry_jbyte_arraycopy; 2566 address entry_jshort_arraycopy; 2567 address entry_jint_arraycopy; 2568 address entry_oop_arraycopy; 2569 address entry_jlong_arraycopy; 2570 address entry_checkcast_arraycopy; 2571 2572 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards); 2573 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards); 2574 2575 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards); 2576 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards); 2577 2578 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards); 2579 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards); 2580 2581 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2582 2583 //*** jbyte 2584 // Always need aligned and unaligned versions 2585 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2586 "jbyte_disjoint_arraycopy"); 2587 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2588 &entry_jbyte_arraycopy, 2589 "jbyte_arraycopy"); 2590 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2591 "arrayof_jbyte_disjoint_arraycopy"); 2592 
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2593 "arrayof_jbyte_arraycopy"); 2594 2595 //*** jshort 2596 // Always need aligned and unaligned versions 2597 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2598 "jshort_disjoint_arraycopy"); 2599 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2600 &entry_jshort_arraycopy, 2601 "jshort_arraycopy"); 2602 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2603 "arrayof_jshort_disjoint_arraycopy"); 2604 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2605 "arrayof_jshort_arraycopy"); 2606 2607 //*** jint 2608 // Aligned versions 2609 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2610 "arrayof_jint_disjoint_arraycopy"); 2611 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2612 "arrayof_jint_arraycopy"); 2613 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2614 // entry_jint_arraycopy always points to the unaligned version 2615 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2616 "jint_disjoint_arraycopy"); 2617 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2618 &entry_jint_arraycopy, 2619 "jint_arraycopy"); 2620 2621 //*** jlong 2622 // It is always aligned 2623 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2624 "arrayof_jlong_disjoint_arraycopy"); 2625 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2626 "arrayof_jlong_arraycopy"); 2627 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2628 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2629 2630 //*** oops 2631 { 2632 // With compressed oops we need unaligned versions; notice that 2633 // we overwrite entry_oop_arraycopy. 
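      // Note (comment only, an interpretation): the plain _oop_arraycopy
      // entries are aliased to these arrayof versions further down, so the
      // generated stub may only assume the alignment that every oop copy
      // has.  With uncompressed oops every element address is 8-byte
      // aligned, so aligned == true is safe; with 4-byte compressed oops a
      // copy can start at a merely 4-byte-aligned element, hence
      // aligned == false below.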
2634 bool aligned = !UseCompressedOops; 2635 2636 StubRoutines::_arrayof_oop_disjoint_arraycopy 2637 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2638 /*dest_uninitialized*/false); 2639 StubRoutines::_arrayof_oop_arraycopy 2640 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2641 /*dest_uninitialized*/false); 2642 // Aligned versions without pre-barriers 2643 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2644 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2645 /*dest_uninitialized*/true); 2646 StubRoutines::_arrayof_oop_arraycopy_uninit 2647 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2648 /*dest_uninitialized*/true); 2649 } 2650 2651 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2652 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2653 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2654 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2655 2656 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2657 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2658 /*dest_uninitialized*/true); 2659 2660 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2661 entry_jbyte_arraycopy, 2662 entry_jshort_arraycopy, 2663 entry_jint_arraycopy, 2664 entry_jlong_arraycopy); 2665 2666 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2667 entry_jbyte_arraycopy, 2668 entry_jshort_arraycopy, 2669 entry_jint_arraycopy, 2670 entry_oop_arraycopy, 2671 entry_jlong_arraycopy, 2672 entry_checkcast_arraycopy); 2673 2674 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2675 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2676 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2677 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2678 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2679 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2680 } 2681 2682 void generate_math_stubs() { Unimplemented(); } 2683 2684 // Arguments: 2685 // 2686 // Inputs: 2687 // c_rarg0 - source byte array address 2688 // c_rarg1 - destination byte array address 2689 // c_rarg2 - K (key) in little endian int array 2690 // 2691 address generate_aescrypt_encryptBlock() { 2692 __ align(CodeEntryAlignment); 2693 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2694 2695 const Register from = c_rarg0; // source array address 2696 const Register to = c_rarg1; // destination array address 2697 const Register key = c_rarg2; // key array address 2698 const Register keylen = rscratch1; 2699 2700 address start = __ pc(); 2701 __ enter(); 2702 2703 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2704 2705 __ aesenc_loadkeys(key, keylen); 2706 __ aesecb_encrypt(from, to, keylen); 2707 2708 __ mov(r0, 0); 2709 2710 __ leave(); 2711 __ ret(lr); 2712 2713 return start; 2714 } 2715 2716 // Arguments: 2717 // 2718 // Inputs: 2719 // c_rarg0 - source byte array address 2720 // c_rarg1 - destination byte array address 2721 // 
c_rarg2 - K (key) in little endian int array 2722 // 2723 address generate_aescrypt_decryptBlock() { 2724 assert(UseAES, "need AES cryptographic extension support"); 2725 __ align(CodeEntryAlignment); 2726 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2727 Label L_doLast; 2728 2729 const Register from = c_rarg0; // source array address 2730 const Register to = c_rarg1; // destination array address 2731 const Register key = c_rarg2; // key array address 2732 const Register keylen = rscratch1; 2733 2734 address start = __ pc(); 2735 __ enter(); // required for proper stackwalking of RuntimeStub frame 2736 2737 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2738 2739 __ aesecb_decrypt(from, to, key, keylen); 2740 2741 __ mov(r0, 0); 2742 2743 __ leave(); 2744 __ ret(lr); 2745 2746 return start; 2747 } 2748 2749 // Arguments: 2750 // 2751 // Inputs: 2752 // c_rarg0 - source byte array address 2753 // c_rarg1 - destination byte array address 2754 // c_rarg2 - K (key) in little endian int array 2755 // c_rarg3 - r vector byte array address 2756 // c_rarg4 - input length 2757 // 2758 // Output: 2759 // x0 - input length 2760 // 2761 address generate_cipherBlockChaining_encryptAESCrypt() { 2762 assert(UseAES, "need AES cryptographic extension support"); 2763 __ align(CodeEntryAlignment); 2764 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2765 2766 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2767 2768 const Register from = c_rarg0; // source array address 2769 const Register to = c_rarg1; // destination array address 2770 const Register key = c_rarg2; // key array address 2771 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2772 // and left with the results of the last encryption block 2773 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2774 const Register keylen = rscratch1; 2775 2776 address start = __ pc(); 2777 2778 __ enter(); 2779 2780 __ movw(rscratch2, len_reg); 2781 2782 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2783 2784 __ ld1(v0, __ T16B, rvec); 2785 2786 __ cmpw(keylen, 52); 2787 __ br(Assembler::CC, L_loadkeys_44); 2788 __ br(Assembler::EQ, L_loadkeys_52); 2789 2790 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2791 __ rev32(v17, __ T16B, v17); 2792 __ rev32(v18, __ T16B, v18); 2793 __ BIND(L_loadkeys_52); 2794 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2795 __ rev32(v19, __ T16B, v19); 2796 __ rev32(v20, __ T16B, v20); 2797 __ BIND(L_loadkeys_44); 2798 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2799 __ rev32(v21, __ T16B, v21); 2800 __ rev32(v22, __ T16B, v22); 2801 __ rev32(v23, __ T16B, v23); 2802 __ rev32(v24, __ T16B, v24); 2803 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2804 __ rev32(v25, __ T16B, v25); 2805 __ rev32(v26, __ T16B, v26); 2806 __ rev32(v27, __ T16B, v27); 2807 __ rev32(v28, __ T16B, v28); 2808 __ ld1(v29, v30, v31, __ T16B, key); 2809 __ rev32(v29, __ T16B, v29); 2810 __ rev32(v30, __ T16B, v30); 2811 __ rev32(v31, __ T16B, v31); 2812 2813 __ BIND(L_aes_loop); 2814 __ ld1(v1, __ T16B, __ post(from, 16)); 2815 __ eor(v0, __ T16B, v0, v1); 2816 2817 __ br(Assembler::CC, L_rounds_44); 2818 __ br(Assembler::EQ, L_rounds_52); 2819 2820 __ aese(v0, v17); __ aesmc(v0, v0); 2821 __ aese(v0, v18); __ aesmc(v0, v0); 2822 __ BIND(L_rounds_52); 2823 __ 
aese(v0, v19); __ aesmc(v0, v0); 2824 __ aese(v0, v20); __ aesmc(v0, v0); 2825 __ BIND(L_rounds_44); 2826 __ aese(v0, v21); __ aesmc(v0, v0); 2827 __ aese(v0, v22); __ aesmc(v0, v0); 2828 __ aese(v0, v23); __ aesmc(v0, v0); 2829 __ aese(v0, v24); __ aesmc(v0, v0); 2830 __ aese(v0, v25); __ aesmc(v0, v0); 2831 __ aese(v0, v26); __ aesmc(v0, v0); 2832 __ aese(v0, v27); __ aesmc(v0, v0); 2833 __ aese(v0, v28); __ aesmc(v0, v0); 2834 __ aese(v0, v29); __ aesmc(v0, v0); 2835 __ aese(v0, v30); 2836 __ eor(v0, __ T16B, v0, v31); 2837 2838 __ st1(v0, __ T16B, __ post(to, 16)); 2839 2840 __ subw(len_reg, len_reg, 16); 2841 __ cbnzw(len_reg, L_aes_loop); 2842 2843 __ st1(v0, __ T16B, rvec); 2844 2845 __ mov(r0, rscratch2); 2846 2847 __ leave(); 2848 __ ret(lr); 2849 2850 return start; 2851 } 2852 2853 // Arguments: 2854 // 2855 // Inputs: 2856 // c_rarg0 - source byte array address 2857 // c_rarg1 - destination byte array address 2858 // c_rarg2 - K (key) in little endian int array 2859 // c_rarg3 - r vector byte array address 2860 // c_rarg4 - input length 2861 // 2862 // Output: 2863 // r0 - input length 2864 // 2865 address generate_cipherBlockChaining_decryptAESCrypt() { 2866 assert(UseAES, "need AES cryptographic extension support"); 2867 __ align(CodeEntryAlignment); 2868 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2869 2870 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2871 2872 const Register from = c_rarg0; // source array address 2873 const Register to = c_rarg1; // destination array address 2874 const Register key = c_rarg2; // key array address 2875 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2876 // and left with the results of the last encryption block 2877 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2878 const Register keylen = rscratch1; 2879 2880 address start = __ pc(); 2881 2882 __ enter(); 2883 2884 __ movw(rscratch2, len_reg); 2885 2886 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2887 2888 __ ld1(v2, __ T16B, rvec); 2889 2890 __ ld1(v31, __ T16B, __ post(key, 16)); 2891 __ rev32(v31, __ T16B, v31); 2892 2893 __ cmpw(keylen, 52); 2894 __ br(Assembler::CC, L_loadkeys_44); 2895 __ br(Assembler::EQ, L_loadkeys_52); 2896 2897 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2898 __ rev32(v17, __ T16B, v17); 2899 __ rev32(v18, __ T16B, v18); 2900 __ BIND(L_loadkeys_52); 2901 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2902 __ rev32(v19, __ T16B, v19); 2903 __ rev32(v20, __ T16B, v20); 2904 __ BIND(L_loadkeys_44); 2905 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2906 __ rev32(v21, __ T16B, v21); 2907 __ rev32(v22, __ T16B, v22); 2908 __ rev32(v23, __ T16B, v23); 2909 __ rev32(v24, __ T16B, v24); 2910 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2911 __ rev32(v25, __ T16B, v25); 2912 __ rev32(v26, __ T16B, v26); 2913 __ rev32(v27, __ T16B, v27); 2914 __ rev32(v28, __ T16B, v28); 2915 __ ld1(v29, v30, __ T16B, key); 2916 __ rev32(v29, __ T16B, v29); 2917 __ rev32(v30, __ T16B, v30); 2918 2919 __ BIND(L_aes_loop); 2920 __ ld1(v0, __ T16B, __ post(from, 16)); 2921 __ orr(v1, __ T16B, v0, v0); 2922 2923 __ br(Assembler::CC, L_rounds_44); 2924 __ br(Assembler::EQ, L_rounds_52); 2925 2926 __ aesd(v0, v17); __ aesimc(v0, v0); 2927 __ aesd(v0, v18); __ aesimc(v0, v0); 2928 __ BIND(L_rounds_52); 2929 __ aesd(v0, v19); __ aesimc(v0, v0); 2930 __ aesd(v0, v20); __ 
aesimc(v0, v0); 2931 __ BIND(L_rounds_44); 2932 __ aesd(v0, v21); __ aesimc(v0, v0); 2933 __ aesd(v0, v22); __ aesimc(v0, v0); 2934 __ aesd(v0, v23); __ aesimc(v0, v0); 2935 __ aesd(v0, v24); __ aesimc(v0, v0); 2936 __ aesd(v0, v25); __ aesimc(v0, v0); 2937 __ aesd(v0, v26); __ aesimc(v0, v0); 2938 __ aesd(v0, v27); __ aesimc(v0, v0); 2939 __ aesd(v0, v28); __ aesimc(v0, v0); 2940 __ aesd(v0, v29); __ aesimc(v0, v0); 2941 __ aesd(v0, v30); 2942 __ eor(v0, __ T16B, v0, v31); 2943 __ eor(v0, __ T16B, v0, v2); 2944 2945 __ st1(v0, __ T16B, __ post(to, 16)); 2946 __ orr(v2, __ T16B, v1, v1); 2947 2948 __ subw(len_reg, len_reg, 16); 2949 __ cbnzw(len_reg, L_aes_loop); 2950 2951 __ st1(v2, __ T16B, rvec); 2952 2953 __ mov(r0, rscratch2); 2954 2955 __ leave(); 2956 __ ret(lr); 2957 2958 return start; 2959 } 2960 2961 // Big-endian 128-bit + 64-bit -> 128-bit addition. 2962 // Inputs: 128-bits. in is preserved. 2963 // The least-significant 64-bit word is in the upper dword of each vector. 2964 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 2965 // Output: result 2966 void be_add_128_64(FloatRegister result, FloatRegister in, 2967 FloatRegister inc, FloatRegister tmp) { 2968 assert_different_registers(result, tmp, inc); 2969 2970 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 2971 // input 2972 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 2973 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 2974 // MSD == 0 (must be!) to LSD 2975 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 2976 } 2977 2978 // CTR AES crypt. 2979 // Arguments: 2980 // 2981 // Inputs: 2982 // c_rarg0 - source byte array address 2983 // c_rarg1 - destination byte array address 2984 // c_rarg2 - K (key) in little endian int array 2985 // c_rarg3 - counter vector byte array address 2986 // c_rarg4 - input length 2987 // c_rarg5 - saved encryptedCounter start 2988 // c_rarg6 - saved used length 2989 // 2990 // Output: 2991 // r0 - input length 2992 // 2993 address generate_counterMode_AESCrypt() { 2994 const Register in = c_rarg0; 2995 const Register out = c_rarg1; 2996 const Register key = c_rarg2; 2997 const Register counter = c_rarg3; 2998 const Register saved_len = c_rarg4, len = r10; 2999 const Register saved_encrypted_ctr = c_rarg5; 3000 const Register used_ptr = c_rarg6, used = r12; 3001 3002 const Register offset = r7; 3003 const Register keylen = r11; 3004 3005 const unsigned char block_size = 16; 3006 const int bulk_width = 4; 3007 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3008 // performance with larger data sizes, but it also means that the 3009 // fast path isn't used until you have at least 8 blocks, and up 3010 // to 127 bytes of data will be executed on the slow path. For 3011 // that reason, and also so as not to blow away too much icache, 4 3012 // blocks seems like a sensible compromise. 
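    // Threshold arithmetic (comment only): the bulk path below requires at
    // least bulk_width * block_size bytes, i.e. 64 bytes with bulk_width == 4,
    // so inputs of up to 63 bytes never enter CTR_large_block; with
    // bulk_width == 8 that single-block-only range grows to 127 bytes, which
    // is the trade-off described above.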
3013 3014 // Algorithm: 3015 // 3016 // if (len == 0) { 3017 // goto DONE; 3018 // } 3019 // int result = len; 3020 // do { 3021 // if (used >= blockSize) { 3022 // if (len >= bulk_width * blockSize) { 3023 // CTR_large_block(); 3024 // if (len == 0) 3025 // goto DONE; 3026 // } 3027 // for (;;) { 3028 // 16ByteVector v0 = counter; 3029 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3030 // used = 0; 3031 // if (len < blockSize) 3032 // break; /* goto NEXT */ 3033 // 16ByteVector v1 = load16Bytes(in, offset); 3034 // v1 = v1 ^ encryptedCounter; 3035 // store16Bytes(out, offset); 3036 // used = blockSize; 3037 // offset += blockSize; 3038 // len -= blockSize; 3039 // if (len == 0) 3040 // goto DONE; 3041 // } 3042 // } 3043 // NEXT: 3044 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3045 // len--; 3046 // } while (len != 0); 3047 // DONE: 3048 // return result; 3049 // 3050 // CTR_large_block() 3051 // Wide bulk encryption of whole blocks. 3052 3053 __ align(CodeEntryAlignment); 3054 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 3055 const address start = __ pc(); 3056 __ enter(); 3057 3058 Label DONE, CTR_large_block, large_block_return; 3059 __ ldrw(used, Address(used_ptr)); 3060 __ cbzw(saved_len, DONE); 3061 3062 __ mov(len, saved_len); 3063 __ mov(offset, 0); 3064 3065 // Compute #rounds for AES based on the length of the key array 3066 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3067 3068 __ aesenc_loadkeys(key, keylen); 3069 3070 { 3071 Label L_CTR_loop, NEXT; 3072 3073 __ bind(L_CTR_loop); 3074 3075 __ cmp(used, block_size); 3076 __ br(__ LO, NEXT); 3077 3078 // Maybe we have a lot of data 3079 __ subsw(rscratch1, len, bulk_width * block_size); 3080 __ br(__ HS, CTR_large_block); 3081 __ BIND(large_block_return); 3082 __ cbzw(len, DONE); 3083 3084 // Setup the counter 3085 __ movi(v4, __ T4S, 0); 3086 __ movi(v5, __ T4S, 1); 3087 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3088 3089 // 128-bit big-endian increment 3090 __ ld1(v0, __ T16B, counter); 3091 __ rev64(v16, __ T16B, v0); 3092 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3093 __ rev64(v16, __ T16B, v16); 3094 __ st1(v16, __ T16B, counter); 3095 // Previous counter value is in v0 3096 // v4 contains { 0, 1 } 3097 3098 { 3099 // We have fewer than bulk_width blocks of data left. Encrypt 3100 // them one by one until there is less than a full block 3101 // remaining, being careful to save both the encrypted counter 3102 // and the counter. 3103 3104 Label inner_loop; 3105 __ bind(inner_loop); 3106 // Counter to encrypt is in v0 3107 __ aesecb_encrypt(noreg, noreg, keylen); 3108 __ st1(v0, __ T16B, saved_encrypted_ctr); 3109 3110 // Do we have a remaining full block? 3111 3112 __ mov(used, 0); 3113 __ cmp(len, block_size); 3114 __ br(__ LO, NEXT); 3115 3116 // Yes, we have a full block 3117 __ ldrq(v1, Address(in, offset)); 3118 __ eor(v1, __ T16B, v1, v0); 3119 __ strq(v1, Address(out, offset)); 3120 __ mov(used, block_size); 3121 __ add(offset, offset, block_size); 3122 3123 __ subw(len, len, block_size); 3124 __ cbzw(len, DONE); 3125 3126 // Increment the counter, store it back 3127 __ orr(v0, __ T16B, v16, v16); 3128 __ rev64(v16, __ T16B, v16); 3129 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3130 __ rev64(v16, __ T16B, v16); 3131 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3132 3133 __ b(inner_loop); 3134 } 3135 3136 __ BIND(NEXT); 3137 3138 // Encrypt a single byte, and loop. 
3139 // We expect this to be a rare event. 3140 __ ldrb(rscratch1, Address(in, offset)); 3141 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3142 __ eor(rscratch1, rscratch1, rscratch2); 3143 __ strb(rscratch1, Address(out, offset)); 3144 __ add(offset, offset, 1); 3145 __ add(used, used, 1); 3146 __ subw(len, len,1); 3147 __ cbnzw(len, L_CTR_loop); 3148 } 3149 3150 __ bind(DONE); 3151 __ strw(used, Address(used_ptr)); 3152 __ mov(r0, saved_len); 3153 3154 __ leave(); // required for proper stackwalking of RuntimeStub frame 3155 __ ret(lr); 3156 3157 // Bulk encryption 3158 3159 __ BIND (CTR_large_block); 3160 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3161 3162 if (bulk_width == 8) { 3163 __ sub(sp, sp, 4 * 16); 3164 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3165 } 3166 __ sub(sp, sp, 4 * 16); 3167 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3168 RegSet saved_regs = (RegSet::of(in, out, offset) 3169 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3170 __ push(saved_regs, sp); 3171 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3172 __ add(in, in, offset); 3173 __ add(out, out, offset); 3174 3175 // Keys should already be loaded into the correct registers 3176 3177 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3178 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3179 3180 // AES/CTR loop 3181 { 3182 Label L_CTR_loop; 3183 __ BIND(L_CTR_loop); 3184 3185 // Setup the counters 3186 __ movi(v8, __ T4S, 0); 3187 __ movi(v9, __ T4S, 1); 3188 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3189 3190 for (int i = 0; i < bulk_width; i++) { 3191 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3192 __ rev64(v0_ofs, __ T16B, v16); 3193 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3194 } 3195 3196 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3197 3198 // Encrypt the counters 3199 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3200 3201 if (bulk_width == 8) { 3202 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3203 } 3204 3205 // XOR the encrypted counters with the inputs 3206 for (int i = 0; i < bulk_width; i++) { 3207 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3208 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3209 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3210 } 3211 3212 // Write the encrypted data 3213 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3214 if (bulk_width == 8) { 3215 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3216 } 3217 3218 __ subw(len, len, 16 * bulk_width); 3219 __ cbnzw(len, L_CTR_loop); 3220 } 3221 3222 // Save the counter back where it goes 3223 __ rev64(v16, __ T16B, v16); 3224 __ st1(v16, __ T16B, counter); 3225 3226 __ pop(saved_regs, sp); 3227 3228 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3229 if (bulk_width == 8) { 3230 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3231 } 3232 3233 __ andr(rscratch1, len, -16 * bulk_width); 3234 __ sub(len, len, rscratch1); 3235 __ add(offset, offset, rscratch1); 3236 __ mov(used, 16); 3237 __ strw(used, Address(used_ptr)); 3238 __ b(large_block_return); 3239 3240 return start; 3241 } 3242 3243 // Vector AES Galois Counter Mode implementation. 
Parameters: 3244 // 3245 // in = c_rarg0 3246 // len = c_rarg1 3247 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3248 // out = c_rarg3 3249 // key = c_rarg4 3250 // state = c_rarg5 - GHASH.state 3251 // subkeyHtbl = c_rarg6 - powers of H 3252 // counter = c_rarg7 - 16 bytes of CTR 3253 // return - number of processed bytes 3254 address generate_galoisCounterMode_AESCrypt() { 3255 address ghash_polynomial = __ pc(); 3256 __ emit_int64(0x87); // The low-order bits of the field 3257 // polynomial (i.e. p = z^7+z^2+z+1) 3258 // repeated in the low and high parts of a 3259 // 128-bit vector 3260 __ emit_int64(0x87); 3261 3262 __ align(CodeEntryAlignment); 3263 StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt"); 3264 address start = __ pc(); 3265 __ enter(); 3266 3267 const Register in = c_rarg0; 3268 const Register len = c_rarg1; 3269 const Register ct = c_rarg2; 3270 const Register out = c_rarg3; 3271 // and updated with the incremented counter in the end 3272 3273 const Register key = c_rarg4; 3274 const Register state = c_rarg5; 3275 3276 const Register subkeyHtbl = c_rarg6; 3277 3278 const Register counter = c_rarg7; 3279 3280 const Register keylen = r10; 3281 // Save state before entering routine 3282 __ sub(sp, sp, 4 * 16); 3283 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3284 __ sub(sp, sp, 4 * 16); 3285 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3286 3287 // __ andr(len, len, -512); 3288 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3289 __ str(len, __ pre(sp, -2 * wordSize)); 3290 3291 Label DONE; 3292 __ cbz(len, DONE); 3293 3294 // Compute #rounds for AES based on the length of the key array 3295 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3296 3297 __ aesenc_loadkeys(key, keylen); 3298 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3299 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3300 3301 // AES/CTR loop 3302 { 3303 Label L_CTR_loop; 3304 __ BIND(L_CTR_loop); 3305 3306 // Setup the counters 3307 __ movi(v8, __ T4S, 0); 3308 __ movi(v9, __ T4S, 1); 3309 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3310 3311 assert(v0->encoding() < v8->encoding(), ""); 3312 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3313 FloatRegister f = as_FloatRegister(i); 3314 __ rev32(f, __ T16B, v16); 3315 __ addv(v16, __ T4S, v16, v8); 3316 } 3317 3318 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3319 3320 // Encrypt the counters 3321 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3322 3323 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3324 3325 // XOR the encrypted counters with the inputs 3326 for (int i = 0; i < 8; i++) { 3327 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3328 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3329 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3330 } 3331 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3332 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3333 3334 __ subw(len, len, 16 * 8); 3335 __ cbnzw(len, L_CTR_loop); 3336 } 3337 3338 __ rev32(v16, __ T16B, v16); 3339 __ st1(v16, __ T16B, counter); 3340 3341 __ ldr(len, Address(sp)); 3342 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3343 3344 // GHASH/CTR loop 3345 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3346 len, /*unrolls*/4); 3347 3348 #ifdef ASSERT 3349 { Label L; 3350 __ 
cmp(len, (unsigned char)0); 3351 __ br(Assembler::EQ, L); 3352 __ stop("stubGenerator: abort"); 3353 __ bind(L); 3354 } 3355 #endif 3356 3357 __ bind(DONE); 3358 // Return the number of bytes processed 3359 __ ldr(r0, __ post(sp, 2 * wordSize)); 3360 3361 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3362 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3363 3364 __ leave(); // required for proper stackwalking of RuntimeStub frame 3365 __ ret(lr); 3366 return start; 3367 } 3368 3369 class Cached64Bytes { 3370 private: 3371 MacroAssembler *_masm; 3372 Register _regs[8]; 3373 3374 public: 3375 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3376 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3377 auto it = rs.begin(); 3378 for (auto &r: _regs) { 3379 r = *it; 3380 ++it; 3381 } 3382 } 3383 3384 void gen_loads(Register base) { 3385 for (int i = 0; i < 8; i += 2) { 3386 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3387 } 3388 } 3389 3390 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3391 void extract_u32(Register dest, int i) { 3392 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3393 } 3394 }; 3395 3396 // Utility routines for md5. 3397 // Clobbers r10 and r11. 3398 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3399 int k, int s, int t) { 3400 Register rscratch3 = r10; 3401 Register rscratch4 = r11; 3402 3403 __ eorw(rscratch3, r3, r4); 3404 __ movw(rscratch2, t); 3405 __ andw(rscratch3, rscratch3, r2); 3406 __ addw(rscratch4, r1, rscratch2); 3407 reg_cache.extract_u32(rscratch1, k); 3408 __ eorw(rscratch3, rscratch3, r4); 3409 __ addw(rscratch4, rscratch4, rscratch1); 3410 __ addw(rscratch3, rscratch3, rscratch4); 3411 __ rorw(rscratch2, rscratch3, 32 - s); 3412 __ addw(r1, rscratch2, r2); 3413 } 3414 3415 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3416 int k, int s, int t) { 3417 Register rscratch3 = r10; 3418 Register rscratch4 = r11; 3419 3420 reg_cache.extract_u32(rscratch1, k); 3421 __ movw(rscratch2, t); 3422 __ addw(rscratch4, r1, rscratch2); 3423 __ addw(rscratch4, rscratch4, rscratch1); 3424 __ bicw(rscratch2, r3, r4); 3425 __ andw(rscratch3, r2, r4); 3426 __ addw(rscratch2, rscratch2, rscratch4); 3427 __ addw(rscratch2, rscratch2, rscratch3); 3428 __ rorw(rscratch2, rscratch2, 32 - s); 3429 __ addw(r1, rscratch2, r2); 3430 } 3431 3432 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3433 int k, int s, int t) { 3434 Register rscratch3 = r10; 3435 Register rscratch4 = r11; 3436 3437 __ eorw(rscratch3, r3, r4); 3438 __ movw(rscratch2, t); 3439 __ addw(rscratch4, r1, rscratch2); 3440 reg_cache.extract_u32(rscratch1, k); 3441 __ eorw(rscratch3, rscratch3, r2); 3442 __ addw(rscratch4, rscratch4, rscratch1); 3443 __ addw(rscratch3, rscratch3, rscratch4); 3444 __ rorw(rscratch2, rscratch3, 32 - s); 3445 __ addw(r1, rscratch2, r2); 3446 } 3447 3448 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3449 int k, int s, int t) { 3450 Register rscratch3 = r10; 3451 Register rscratch4 = r11; 3452 3453 __ movw(rscratch3, t); 3454 __ ornw(rscratch2, r2, r4); 3455 __ addw(rscratch4, r1, rscratch3); 3456 reg_cache.extract_u32(rscratch1, k); 3457 __ eorw(rscratch3, rscratch2, r3); 3458 __ addw(rscratch4, rscratch4, rscratch1); 3459 __ addw(rscratch3, rscratch3, rscratch4); 3460 __ rorw(rscratch2, rscratch3, 32 - s); 3461 __ 
addw(r1, rscratch2, r2); 3462 } 3463 3464 // Arguments: 3465 // 3466 // Inputs: 3467 // c_rarg0 - byte[] source+offset 3468 // c_rarg1 - int[] SHA.state 3469 // c_rarg2 - int offset 3470 // c_rarg3 - int limit 3471 // 3472 address generate_md5_implCompress(bool multi_block, const char *name) { 3473 __ align(CodeEntryAlignment); 3474 StubCodeMark mark(this, "StubRoutines", name); 3475 address start = __ pc(); 3476 3477 Register buf = c_rarg0; 3478 Register state = c_rarg1; 3479 Register ofs = c_rarg2; 3480 Register limit = c_rarg3; 3481 Register a = r4; 3482 Register b = r5; 3483 Register c = r6; 3484 Register d = r7; 3485 Register rscratch3 = r10; 3486 Register rscratch4 = r11; 3487 3488 Register state_regs[2] = { r12, r13 }; 3489 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3490 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3491 3492 __ push(saved_regs, sp); 3493 3494 __ ldp(state_regs[0], state_regs[1], Address(state)); 3495 __ ubfx(a, state_regs[0], 0, 32); 3496 __ ubfx(b, state_regs[0], 32, 32); 3497 __ ubfx(c, state_regs[1], 0, 32); 3498 __ ubfx(d, state_regs[1], 32, 32); 3499 3500 Label md5_loop; 3501 __ BIND(md5_loop); 3502 3503 reg_cache.gen_loads(buf); 3504 3505 // Round 1 3506 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3507 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3508 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3509 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3510 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3511 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3512 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3513 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3514 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3515 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3516 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3517 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3518 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3519 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3520 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3521 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3522 3523 // Round 2 3524 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3525 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3526 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3527 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3528 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3529 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3530 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3531 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3532 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3533 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3534 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3535 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3536 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3537 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3538 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3539 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3540 3541 // Round 3 3542 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3543 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3544 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3545 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3546 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3547 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3548 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3549 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3550 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 
3551 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3552 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3553 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3554 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3555 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3556 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3557 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3558 3559 // Round 4 3560 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3561 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3562 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3563 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3564 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3565 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3566 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3567 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3568 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3569 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3570 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3571 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3572 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3573 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3574 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3575 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3576 3577 __ addw(a, state_regs[0], a); 3578 __ ubfx(rscratch2, state_regs[0], 32, 32); 3579 __ addw(b, rscratch2, b); 3580 __ addw(c, state_regs[1], c); 3581 __ ubfx(rscratch4, state_regs[1], 32, 32); 3582 __ addw(d, rscratch4, d); 3583 3584 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3585 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3586 3587 if (multi_block) { 3588 __ add(buf, buf, 64); 3589 __ add(ofs, ofs, 64); 3590 __ cmp(ofs, limit); 3591 __ br(Assembler::LE, md5_loop); 3592 __ mov(c_rarg0, ofs); // return ofs 3593 } 3594 3595 // write hash values back in the correct order 3596 __ stp(state_regs[0], state_regs[1], Address(state)); 3597 3598 __ pop(saved_regs, sp); 3599 3600 __ ret(lr); 3601 3602 return start; 3603 } 3604 3605 // Arguments: 3606 // 3607 // Inputs: 3608 // c_rarg0 - byte[] source+offset 3609 // c_rarg1 - int[] SHA.state 3610 // c_rarg2 - int offset 3611 // c_rarg3 - int limit 3612 // 3613 address generate_sha1_implCompress(bool multi_block, const char *name) { 3614 __ align(CodeEntryAlignment); 3615 StubCodeMark mark(this, "StubRoutines", name); 3616 address start = __ pc(); 3617 3618 Register buf = c_rarg0; 3619 Register state = c_rarg1; 3620 Register ofs = c_rarg2; 3621 Register limit = c_rarg3; 3622 3623 Label keys; 3624 Label sha1_loop; 3625 3626 // load the keys into v0..v3 3627 __ adr(rscratch1, keys); 3628 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3629 // load 5 words state into v6, v7 3630 __ ldrq(v6, Address(state, 0)); 3631 __ ldrs(v7, Address(state, 16)); 3632 3633 3634 __ BIND(sha1_loop); 3635 // load 64 bytes of data into v16..v19 3636 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3637 __ rev32(v16, __ T16B, v16); 3638 __ rev32(v17, __ T16B, v17); 3639 __ rev32(v18, __ T16B, v18); 3640 __ rev32(v19, __ T16B, v19); 3641 3642 // do the sha1 3643 __ addv(v4, __ T4S, v16, v0); 3644 __ orr(v20, __ T16B, v6, v6); 3645 3646 FloatRegister d0 = v16; 3647 FloatRegister d1 = v17; 3648 FloatRegister d2 = v18; 3649 FloatRegister d3 = v19; 3650 3651 for (int round = 0; round < 20; round++) { 3652 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3653 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3654 FloatRegister tmp3 = round ? ((round & 1) ? 
v22 : v21) : v7; 3655 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3656 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3657 3658 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3659 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3660 __ sha1h(tmp2, __ T4S, v20); 3661 if (round < 5) 3662 __ sha1c(v20, __ T4S, tmp3, tmp4); 3663 else if (round < 10 || round >= 15) 3664 __ sha1p(v20, __ T4S, tmp3, tmp4); 3665 else 3666 __ sha1m(v20, __ T4S, tmp3, tmp4); 3667 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3668 3669 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3670 } 3671 3672 __ addv(v7, __ T2S, v7, v21); 3673 __ addv(v6, __ T4S, v6, v20); 3674 3675 if (multi_block) { 3676 __ add(ofs, ofs, 64); 3677 __ cmp(ofs, limit); 3678 __ br(Assembler::LE, sha1_loop); 3679 __ mov(c_rarg0, ofs); // return ofs 3680 } 3681 3682 __ strq(v6, Address(state, 0)); 3683 __ strs(v7, Address(state, 16)); 3684 3685 __ ret(lr); 3686 3687 __ bind(keys); 3688 __ emit_int32(0x5a827999); 3689 __ emit_int32(0x6ed9eba1); 3690 __ emit_int32(0x8f1bbcdc); 3691 __ emit_int32(0xca62c1d6); 3692 3693 return start; 3694 } 3695 3696 3697 // Arguments: 3698 // 3699 // Inputs: 3700 // c_rarg0 - byte[] source+offset 3701 // c_rarg1 - int[] SHA.state 3702 // c_rarg2 - int offset 3703 // c_rarg3 - int limit 3704 // 3705 address generate_sha256_implCompress(bool multi_block, const char *name) { 3706 static const uint32_t round_consts[64] = { 3707 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3708 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3709 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3710 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3711 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3712 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3713 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3714 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3715 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3716 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3717 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3718 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3719 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3720 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3721 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3722 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3723 }; 3724 __ align(CodeEntryAlignment); 3725 StubCodeMark mark(this, "StubRoutines", name); 3726 address start = __ pc(); 3727 3728 Register buf = c_rarg0; 3729 Register state = c_rarg1; 3730 Register ofs = c_rarg2; 3731 Register limit = c_rarg3; 3732 3733 Label sha1_loop; 3734 3735 __ stpd(v8, v9, __ pre(sp, -32)); 3736 __ stpd(v10, v11, Address(sp, 16)); 3737 3738 // dga == v0 3739 // dgb == v1 3740 // dg0 == v2 3741 // dg1 == v3 3742 // dg2 == v4 3743 // t0 == v6 3744 // t1 == v7 3745 3746 // load 16 keys to v16..v31 3747 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3748 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3749 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3750 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3751 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3752 3753 // load 8 words (256 bits) state 3754 __ ldpq(v0, v1, state); 3755 3756 __ BIND(sha1_loop); 3757 // load 64 bytes of data into v8..v11 3758 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3759 __ rev32(v8, __ T16B, v8); 3760 __ rev32(v9, __ T16B, v9); 3761 __ rev32(v10, __ T16B, v10); 3762 __ rev32(v11, __ T16B, v11); 3763 3764 __ addv(v6, __ T4S, v8, v16); 3765 __ orr(v2, __ T16B, v0, v0); 3766 __ orr(v3, __ T16B, v1, v1); 3767 3768 FloatRegister d0 = v8; 3769 FloatRegister d1 = v9; 3770 FloatRegister d2 = v10; 3771 FloatRegister d3 = v11; 3772 3773 3774 for (int round = 0; round < 16; round++) { 3775 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3776 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3777 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3778 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3779 3780 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3781 __ orr(v4, __ T16B, v2, v2); 3782 if (round < 15) 3783 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3784 __ sha256h(v2, __ T4S, v3, tmp2); 3785 __ sha256h2(v3, __ T4S, v4, tmp2); 3786 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3787 3788 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3789 } 3790 3791 __ addv(v0, __ T4S, v0, v2); 3792 __ addv(v1, __ T4S, v1, v3); 3793 3794 if (multi_block) { 3795 __ add(ofs, ofs, 64); 3796 __ cmp(ofs, limit); 3797 __ br(Assembler::LE, sha1_loop); 3798 __ mov(c_rarg0, ofs); // return ofs 3799 } 3800 3801 __ ldpd(v10, v11, Address(sp, 16)); 3802 __ ldpd(v8, v9, __ post(sp, 32)); 3803 3804 __ stpq(v0, v1, state); 3805 3806 __ ret(lr); 3807 3808 return start; 3809 } 3810 3811 // Double rounds for sha512. 3812 void sha512_dround(int dr, 3813 FloatRegister vi0, FloatRegister vi1, 3814 FloatRegister vi2, FloatRegister vi3, 3815 FloatRegister vi4, FloatRegister vrc0, 3816 FloatRegister vrc1, FloatRegister vin0, 3817 FloatRegister vin1, FloatRegister vin2, 3818 FloatRegister vin3, FloatRegister vin4) { 3819 if (dr < 36) { 3820 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3821 } 3822 __ addv(v5, __ T2D, vrc0, vin0); 3823 __ ext(v6, __ T16B, vi2, vi3, 8); 3824 __ ext(v5, __ T16B, v5, v5, 8); 3825 __ ext(v7, __ T16B, vi1, vi2, 8); 3826 __ addv(vi3, __ T2D, vi3, v5); 3827 if (dr < 32) { 3828 __ ext(v5, __ T16B, vin3, vin4, 8); 3829 __ sha512su0(vin0, __ T2D, vin1); 3830 } 3831 __ sha512h(vi3, __ T2D, v6, v7); 3832 if (dr < 32) { 3833 __ sha512su1(vin0, __ T2D, vin2, v5); 3834 } 3835 __ addv(vi4, __ T2D, vi1, vi3); 3836 __ sha512h2(vi3, __ T2D, vi1, vi0); 3837 } 3838 3839 // Arguments: 3840 // 3841 // Inputs: 3842 // c_rarg0 - byte[] source+offset 3843 // c_rarg1 - int[] SHA.state 3844 // c_rarg2 - int offset 3845 // c_rarg3 - int limit 3846 // 3847 address generate_sha512_implCompress(bool multi_block, const char *name) { 3848 static const uint64_t round_consts[80] = { 3849 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3850 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3851 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3852 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3853 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3854 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3855 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3856 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3857 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3858 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3859 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3860 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3861 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3862 0x92722C851482353BL, 
0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3863 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3864 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3865 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3866 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3867 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3868 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3869 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3870 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3871 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3872 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3873 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3874 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3875 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3876 }; 3877 3878 __ align(CodeEntryAlignment); 3879 StubCodeMark mark(this, "StubRoutines", name); 3880 address start = __ pc(); 3881 3882 Register buf = c_rarg0; 3883 Register state = c_rarg1; 3884 Register ofs = c_rarg2; 3885 Register limit = c_rarg3; 3886 3887 __ stpd(v8, v9, __ pre(sp, -64)); 3888 __ stpd(v10, v11, Address(sp, 16)); 3889 __ stpd(v12, v13, Address(sp, 32)); 3890 __ stpd(v14, v15, Address(sp, 48)); 3891 3892 Label sha512_loop; 3893 3894 // load state 3895 __ ld1(v8, v9, v10, v11, __ T2D, state); 3896 3897 // load first 4 round constants 3898 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3899 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3900 3901 __ BIND(sha512_loop); 3902 // load 128B of data into v12..v19 3903 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3904 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3905 __ rev64(v12, __ T16B, v12); 3906 __ rev64(v13, __ T16B, v13); 3907 __ rev64(v14, __ T16B, v14); 3908 __ rev64(v15, __ T16B, v15); 3909 __ rev64(v16, __ T16B, v16); 3910 __ rev64(v17, __ T16B, v17); 3911 __ rev64(v18, __ T16B, v18); 3912 __ rev64(v19, __ T16B, v19); 3913 3914 __ mov(rscratch2, rscratch1); 3915 3916 __ mov(v0, __ T16B, v8); 3917 __ mov(v1, __ T16B, v9); 3918 __ mov(v2, __ T16B, v10); 3919 __ mov(v3, __ T16B, v11); 3920 3921 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 3922 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 3923 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 3924 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 3925 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 3926 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 3927 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 3928 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 3929 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 3930 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 3931 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 3932 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 3933 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 3934 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 3935 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 3936 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 3937 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, 
v13, v19, v16, v17); 3938 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 3939 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 3940 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 3941 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 3942 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 3943 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 3944 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 3945 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 3946 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 3947 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 3948 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 3949 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 3950 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 3951 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 3952 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 3953 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 3954 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 3955 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 3956 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 3957 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 3958 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 3959 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 3960 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 3961 3962 __ addv(v8, __ T2D, v8, v0); 3963 __ addv(v9, __ T2D, v9, v1); 3964 __ addv(v10, __ T2D, v10, v2); 3965 __ addv(v11, __ T2D, v11, v3); 3966 3967 if (multi_block) { 3968 __ add(ofs, ofs, 128); 3969 __ cmp(ofs, limit); 3970 __ br(Assembler::LE, sha512_loop); 3971 __ mov(c_rarg0, ofs); // return ofs 3972 } 3973 3974 __ st1(v8, v9, v10, v11, __ T2D, state); 3975 3976 __ ldpd(v14, v15, Address(sp, 48)); 3977 __ ldpd(v12, v13, Address(sp, 32)); 3978 __ ldpd(v10, v11, Address(sp, 16)); 3979 __ ldpd(v8, v9, __ post(sp, 64)); 3980 3981 __ ret(lr); 3982 3983 return start; 3984 } 3985 3986 // Arguments: 3987 // 3988 // Inputs: 3989 // c_rarg0 - byte[] source+offset 3990 // c_rarg1 - byte[] SHA.state 3991 // c_rarg2 - int block_size 3992 // c_rarg3 - int offset 3993 // c_rarg4 - int limit 3994 // 3995 address generate_sha3_implCompress(bool multi_block, const char *name) { 3996 static const uint64_t round_consts[24] = { 3997 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 3998 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 3999 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4000 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4001 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4002 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4003 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4004 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4005 }; 4006 4007 __ align(CodeEntryAlignment); 4008 StubCodeMark mark(this, "StubRoutines", name); 4009 address start = __ pc(); 4010 4011 Register buf = c_rarg0; 4012 Register state = c_rarg1; 4013 Register block_size = c_rarg2; 4014 Register ofs = c_rarg3; 4015 Register 
limit = c_rarg4;

    Label sha3_loop, rounds24_loop;
    Label sha3_512_or_sha3_384, shake128;

    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    // load state
    __ add(rscratch1, state, 32);
    __ ld1(v0, v1, v2, v3, __ T1D, state);
    __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
    __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
    __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
    __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
    __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
    __ ld1(v24, __ T1D, rscratch1);

    __ BIND(sha3_loop);

    // 24 keccak rounds
    __ movw(rscratch2, 24);

    // load round_constants base
    __ lea(rscratch1, ExternalAddress((address) round_consts));

    // load input
    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
    __ eor(v0, __ T8B, v0, v25);
    __ eor(v1, __ T8B, v1, v26);
    __ eor(v2, __ T8B, v2, v27);
    __ eor(v3, __ T8B, v3, v28);
    __ eor(v4, __ T8B, v4, v29);
    __ eor(v5, __ T8B, v5, v30);
    __ eor(v6, __ T8B, v6, v31);

    // block_size == 72, SHA3-512; block_size == 104, SHA3-384
    __ tbz(block_size, 7, sha3_512_or_sha3_384);

    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
    __ eor(v7, __ T8B, v7, v25);
    __ eor(v8, __ T8B, v8, v26);
    __ eor(v9, __ T8B, v9, v27);
    __ eor(v10, __ T8B, v10, v28);
    __ eor(v11, __ T8B, v11, v29);
    __ eor(v12, __ T8B, v12, v30);
    __ eor(v13, __ T8B, v13, v31);

    __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
    __ eor(v14, __ T8B, v14, v25);
    __ eor(v15, __ T8B, v15, v26);
    __ eor(v16, __ T8B, v16, v27);

    // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
    __ andw(c_rarg5, block_size, 48);
    __ cbzw(c_rarg5, rounds24_loop);

    __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
    __ ldrd(v28, __ post(buf, 8));
    __ eor(v17, __ T8B, v17, v28);
    __ b(rounds24_loop);

    __ BIND(shake128);
    __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
    __ eor(v17, __ T8B, v17, v28);
    __ eor(v18, __ T8B, v18, v29);
    __ eor(v19, __ T8B, v19, v30);
    __ eor(v20, __ T8B, v20, v31);
    __ b(rounds24_loop); // block_size == 168, SHAKE128

    __ BIND(sha3_512_or_sha3_384);
    __ ld1(v25, v26, __ T8B, __ post(buf, 16));
    __ eor(v7, __ T8B, v7, v25);
    __ eor(v8, __ T8B, v8, v26);
    __ tbz(block_size, 5, rounds24_loop); // SHA3-512

    // SHA3-384
    __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
    __ eor(v9, __ T8B, v9, v27);
    __ eor(v10, __ T8B, v10, v28);
    __ eor(v11, __ T8B, v11, v29);
    __ eor(v12, __ T8B, v12, v30);

    __ BIND(rounds24_loop);
    __ subw(rscratch2, rscratch2, 1);

    // theta: column parities (eor3) and rotated parity words (rax1)
    __ eor3(v29, __ T16B, v4, v9, v14);
    __ eor3(v26, __ T16B, v1, v6, v11);
    __ eor3(v28, __ T16B, v3, v8, v13);
    __ eor3(v25, __ T16B, v0, v5, v10);
    __ eor3(v27, __ T16B, v2, v7, v12);
    __ eor3(v29, __ T16B, v29, v19, v24);
    __ eor3(v26, __ T16B, v26, v16, v21);
    __ eor3(v28, __ T16B, v28, v18, v23);
    __ eor3(v25, __ T16B, v25, v15, v20);
    __ eor3(v27, __ T16B, v27, v17, v22);

    __ rax1(v30, __ T2D, v29, v26);
__ rax1(v26, __ T2D, v26, v28);
    __ rax1(v28, __ T2D, v28, v25);
    __ rax1(v25, __ T2D, v25, v27);
    __ rax1(v27, __ T2D, v27, v29);

    // theta (xor of the D words) fused with the rho rotations;
    // the pi permutation is implicit in the destination registers
    __ eor(v0, __ T16B, v0, v30);
    __ xar(v29, __ T2D, v1, v25, (64 - 1));
    __ xar(v1, __ T2D, v6, v25, (64 - 44));
    __ xar(v6, __ T2D, v9, v28, (64 - 20));
    __ xar(v9, __ T2D, v22, v26, (64 - 61));
    __ xar(v22, __ T2D, v14, v28, (64 - 39));
    __ xar(v14, __ T2D, v20, v30, (64 - 18));
    __ xar(v31, __ T2D, v2, v26, (64 - 62));
    __ xar(v2, __ T2D, v12, v26, (64 - 43));
    __ xar(v12, __ T2D, v13, v27, (64 - 25));
    __ xar(v13, __ T2D, v19, v28, (64 - 8));
    __ xar(v19, __ T2D, v23, v27, (64 - 56));
    __ xar(v23, __ T2D, v15, v30, (64 - 41));
    __ xar(v15, __ T2D, v4, v28, (64 - 27));
    __ xar(v28, __ T2D, v24, v28, (64 - 14));
    __ xar(v24, __ T2D, v21, v25, (64 - 2));
    __ xar(v8, __ T2D, v8, v27, (64 - 55));
    __ xar(v4, __ T2D, v16, v25, (64 - 45));
    __ xar(v16, __ T2D, v5, v30, (64 - 36));
    __ xar(v5, __ T2D, v3, v27, (64 - 28));
    __ xar(v27, __ T2D, v18, v27, (64 - 21));
    __ xar(v3, __ T2D, v17, v26, (64 - 15));
    __ xar(v25, __ T2D, v11, v25, (64 - 10));
    __ xar(v26, __ T2D, v7, v26, (64 - 6));
    __ xar(v30, __ T2D, v10, v30, (64 - 3));

    // chi
    __ bcax(v20, __ T16B, v31, v22, v8);
    __ bcax(v21, __ T16B, v8, v23, v22);
    __ bcax(v22, __ T16B, v22, v24, v23);
    __ bcax(v23, __ T16B, v23, v31, v24);
    __ bcax(v24, __ T16B, v24, v8, v31);

    __ ld1r(v31, __ T2D, __ post(rscratch1, 8));

    __ bcax(v17, __ T16B, v25, v19, v3);
    __ bcax(v18, __ T16B, v3, v15, v19);
    __ bcax(v19, __ T16B, v19, v16, v15);
    __ bcax(v15, __ T16B, v15, v25, v16);
    __ bcax(v16, __ T16B, v16, v3, v25);

    __ bcax(v10, __ T16B, v29, v12, v26);
    __ bcax(v11, __ T16B, v26, v13, v12);
    __ bcax(v12, __ T16B, v12, v14, v13);
    __ bcax(v13, __ T16B, v13, v29, v14);
    __ bcax(v14, __ T16B, v14, v26, v29);

    __ bcax(v7, __ T16B, v30, v9, v4);
    __ bcax(v8, __ T16B, v4, v5, v9);
    __ bcax(v9, __ T16B, v9, v6, v5);
    __ bcax(v5, __ T16B, v5, v30, v6);
    __ bcax(v6, __ T16B, v6, v4, v30);

    __ bcax(v3, __ T16B, v27, v0, v28);
    __ bcax(v4, __ T16B, v28, v1, v0);
    __ bcax(v0, __ T16B, v0, v2, v1);
    __ bcax(v1, __ T16B, v1, v27, v2);
    __ bcax(v2, __ T16B, v2, v28, v27);

    // iota: xor the round constant into lane (0, 0)
    __ eor(v0, __ T16B, v0, v31);

    __ cbnzw(rscratch2, rounds24_loop);

    if (multi_block) {
      __ add(ofs, ofs, block_size);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha3_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
    __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
    __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
    __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
    __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
    __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
    __ st1(v24, __ T1D, state);

    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ ret(lr);

    return start;
  }

  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0 - int crc
   *   c_rarg1 - byte* buf
   *   c_rarg2 - int length
   *
   * Output:
   *   r0 - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics,
"what are we doing here?"); 4223 4224 __ align(CodeEntryAlignment); 4225 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4226 4227 address start = __ pc(); 4228 4229 const Register crc = c_rarg0; // crc 4230 const Register buf = c_rarg1; // source java byte array address 4231 const Register len = c_rarg2; // length 4232 const Register table0 = c_rarg3; // crc_table address 4233 const Register table1 = c_rarg4; 4234 const Register table2 = c_rarg5; 4235 const Register table3 = c_rarg6; 4236 const Register tmp3 = c_rarg7; 4237 4238 BLOCK_COMMENT("Entry:"); 4239 __ enter(); // required for proper stackwalking of RuntimeStub frame 4240 4241 __ kernel_crc32(crc, buf, len, 4242 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4243 4244 __ leave(); // required for proper stackwalking of RuntimeStub frame 4245 __ ret(lr); 4246 4247 return start; 4248 } 4249 4250 // ChaCha20 block function. This version parallelizes by loading 4251 // individual 32-bit state elements into vectors for four blocks 4252 // (e.g. all four blocks' worth of state[0] in one register, etc.) 4253 // 4254 // state (int[16]) = c_rarg0 4255 // keystream (byte[1024]) = c_rarg1 4256 // return - number of bytes of keystream (always 256) 4257 address generate_chacha20Block_blockpar() { 4258 Label L_twoRounds, L_cc20_const; 4259 // The constant data is broken into two 128-bit segments to be loaded 4260 // onto FloatRegisters. The first 128 bits are a counter add overlay 4261 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4262 // The second 128-bits is a table constant used for 8-bit left rotations. 4263 __ BIND(L_cc20_const); 4264 __ emit_int64(0x0000000100000000UL); 4265 __ emit_int64(0x0000000300000002UL); 4266 __ emit_int64(0x0605040702010003UL); 4267 __ emit_int64(0x0E0D0C0F0A09080BUL); 4268 4269 __ align(CodeEntryAlignment); 4270 StubCodeMark mark(this, "StubRoutines", "chacha20Block"); 4271 address start = __ pc(); 4272 __ enter(); 4273 4274 int i, j; 4275 const Register state = c_rarg0; 4276 const Register keystream = c_rarg1; 4277 const Register loopCtr = r10; 4278 const Register tmpAddr = r11; 4279 4280 const FloatRegister stateFirst = v0; 4281 const FloatRegister stateSecond = v1; 4282 const FloatRegister stateThird = v2; 4283 const FloatRegister stateFourth = v3; 4284 const FloatRegister origCtrState = v28; 4285 const FloatRegister scratch = v29; 4286 const FloatRegister lrot8Tbl = v30; 4287 4288 // Organize SIMD registers in an array that facilitates 4289 // putting repetitive opcodes into loop structures. It is 4290 // important that each grouping of 4 registers is monotonically 4291 // increasing to support the requirements of multi-register 4292 // instructions (e.g. ld4r, st4, etc.) 4293 const FloatRegister workSt[16] = { 4294 v4, v5, v6, v7, v16, v17, v18, v19, 4295 v20, v21, v22, v23, v24, v25, v26, v27 4296 }; 4297 4298 // Load from memory and interlace across 16 SIMD registers, 4299 // With each word from memory being broadcast to all lanes of 4300 // each successive SIMD register. 4301 // Addr(0) -> All lanes in workSt[i] 4302 // Addr(4) -> All lanes workSt[i + 1], etc. 4303 __ mov(tmpAddr, state); 4304 for (i = 0; i < 16; i += 4) { 4305 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4306 __ post(tmpAddr, 16)); 4307 } 4308 4309 // Pull in constant data. The first 16 bytes are the add overlay 4310 // which is applied to the vector holding the counter (state[12]). 
4311 // The second 16 bytes is the index register for the 8-bit left 4312 // rotation tbl instruction. 4313 __ adr(tmpAddr, L_cc20_const); 4314 __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr)); 4315 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); 4316 4317 // Set up the 10 iteration loop and perform all 8 quarter round ops 4318 __ mov(loopCtr, 10); 4319 __ BIND(L_twoRounds); 4320 4321 __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12], 4322 scratch, lrot8Tbl); 4323 __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13], 4324 scratch, lrot8Tbl); 4325 __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14], 4326 scratch, lrot8Tbl); 4327 __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15], 4328 scratch, lrot8Tbl); 4329 4330 __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15], 4331 scratch, lrot8Tbl); 4332 __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12], 4333 scratch, lrot8Tbl); 4334 __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13], 4335 scratch, lrot8Tbl); 4336 __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14], 4337 scratch, lrot8Tbl); 4338 4339 // Decrement and iterate 4340 __ sub(loopCtr, loopCtr, 1); 4341 __ cbnz(loopCtr, L_twoRounds); 4342 4343 __ mov(tmpAddr, state); 4344 4345 // Add the starting state back to the post-loop keystream 4346 // state. We read/interlace the state array from memory into 4347 // 4 registers similar to what we did in the beginning. Then 4348 // add the counter overlay onto workSt[12] at the end. 4349 for (i = 0; i < 16; i += 4) { 4350 __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S, 4351 __ post(tmpAddr, 16)); 4352 __ addv(workSt[i], __ T4S, workSt[i], stateFirst); 4353 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond); 4354 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird); 4355 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth); 4356 } 4357 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask 4358 4359 // Write to key stream, storing the same element out of workSt[0..15] 4360 // to consecutive 4-byte offsets in the key stream buffer, then repeating 4361 // for the next element position. 
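    // A rough scalar model of the st4 store pattern generated below
    // (illustrative only, not emitted code; "ks" is a hypothetical
    // uint32_t view of the keystream buffer):
    //
    //   for (int lane = 0; lane < 4; lane++)        // one 64-byte block per lane
    //     for (int word = 0; word < 16; word++)
    //       ks[lane * 16 + word] = workSt[word][lane];
    //
    // i.e. lane n of the sixteen work vectors forms the n-th contiguous
    // 64-byte block of the 256-byte keystream output.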
4362 for (i = 0; i < 4; i++) { 4363 for (j = 0; j < 16; j += 4) { 4364 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4365 __ post(keystream, 16)); 4366 } 4367 } 4368 4369 __ mov(r0, 256); // Return length of output keystream 4370 __ leave(); 4371 __ ret(lr); 4372 4373 return start; 4374 } 4375 4376 /** 4377 * Arguments: 4378 * 4379 * Inputs: 4380 * c_rarg0 - int crc 4381 * c_rarg1 - byte* buf 4382 * c_rarg2 - int length 4383 * c_rarg3 - int* table 4384 * 4385 * Output: 4386 * r0 - int crc result 4387 */ 4388 address generate_updateBytesCRC32C() { 4389 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4390 4391 __ align(CodeEntryAlignment); 4392 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4393 4394 address start = __ pc(); 4395 4396 const Register crc = c_rarg0; // crc 4397 const Register buf = c_rarg1; // source java byte array address 4398 const Register len = c_rarg2; // length 4399 const Register table0 = c_rarg3; // crc_table address 4400 const Register table1 = c_rarg4; 4401 const Register table2 = c_rarg5; 4402 const Register table3 = c_rarg6; 4403 const Register tmp3 = c_rarg7; 4404 4405 BLOCK_COMMENT("Entry:"); 4406 __ enter(); // required for proper stackwalking of RuntimeStub frame 4407 4408 __ kernel_crc32c(crc, buf, len, 4409 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4410 4411 __ leave(); // required for proper stackwalking of RuntimeStub frame 4412 __ ret(lr); 4413 4414 return start; 4415 } 4416 4417 /*** 4418 * Arguments: 4419 * 4420 * Inputs: 4421 * c_rarg0 - int adler 4422 * c_rarg1 - byte* buff 4423 * c_rarg2 - int len 4424 * 4425 * Output: 4426 * c_rarg0 - int adler result 4427 */ 4428 address generate_updateBytesAdler32() { 4429 __ align(CodeEntryAlignment); 4430 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4431 address start = __ pc(); 4432 4433 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4434 4435 // Aliases 4436 Register adler = c_rarg0; 4437 Register s1 = c_rarg0; 4438 Register s2 = c_rarg3; 4439 Register buff = c_rarg1; 4440 Register len = c_rarg2; 4441 Register nmax = r4; 4442 Register base = r5; 4443 Register count = r6; 4444 Register temp0 = rscratch1; 4445 Register temp1 = rscratch2; 4446 FloatRegister vbytes = v0; 4447 FloatRegister vs1acc = v1; 4448 FloatRegister vs2acc = v2; 4449 FloatRegister vtable = v3; 4450 4451 // Max number of bytes we can process before having to take the mod 4452 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4453 uint64_t BASE = 0xfff1; 4454 uint64_t NMAX = 0x15B0; 4455 4456 __ mov(base, BASE); 4457 __ mov(nmax, NMAX); 4458 4459 // Load accumulation coefficients for the upper 16 bits 4460 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4461 __ ld1(vtable, __ T16B, Address(temp0)); 4462 4463 // s1 is initialized to the lower 16 bits of adler 4464 // s2 is initialized to the upper 16 bits of adler 4465 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4466 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4467 4468 // The pipelined loop needs at least 16 elements for 1 iteration 4469 // It does check this, but it is more effective to skip to the cleanup loop 4470 __ cmp(len, (u1)16); 4471 __ br(Assembler::HS, L_nmax); 4472 __ cbz(len, L_combine); 4473 4474 __ bind(L_simple_by1_loop); 4475 __ ldrb(temp0, Address(__ post(buff, 1))); 4476 __ add(s1, s1, temp0); 4477 __ add(s2, s2, s1); 4478 __ subs(len, len, 
1); 4479 __ br(Assembler::HI, L_simple_by1_loop); 4480 4481 // s1 = s1 % BASE 4482 __ subs(temp0, s1, base); 4483 __ csel(s1, temp0, s1, Assembler::HS); 4484 4485 // s2 = s2 % BASE 4486 __ lsr(temp0, s2, 16); 4487 __ lsl(temp1, temp0, 4); 4488 __ sub(temp1, temp1, temp0); 4489 __ add(s2, temp1, s2, ext::uxth); 4490 4491 __ subs(temp0, s2, base); 4492 __ csel(s2, temp0, s2, Assembler::HS); 4493 4494 __ b(L_combine); 4495 4496 __ bind(L_nmax); 4497 __ subs(len, len, nmax); 4498 __ sub(count, nmax, 16); 4499 __ br(Assembler::LO, L_by16); 4500 4501 __ bind(L_nmax_loop); 4502 4503 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4504 vbytes, vs1acc, vs2acc, vtable); 4505 4506 __ subs(count, count, 16); 4507 __ br(Assembler::HS, L_nmax_loop); 4508 4509 // s1 = s1 % BASE 4510 __ lsr(temp0, s1, 16); 4511 __ lsl(temp1, temp0, 4); 4512 __ sub(temp1, temp1, temp0); 4513 __ add(temp1, temp1, s1, ext::uxth); 4514 4515 __ lsr(temp0, temp1, 16); 4516 __ lsl(s1, temp0, 4); 4517 __ sub(s1, s1, temp0); 4518 __ add(s1, s1, temp1, ext:: uxth); 4519 4520 __ subs(temp0, s1, base); 4521 __ csel(s1, temp0, s1, Assembler::HS); 4522 4523 // s2 = s2 % BASE 4524 __ lsr(temp0, s2, 16); 4525 __ lsl(temp1, temp0, 4); 4526 __ sub(temp1, temp1, temp0); 4527 __ add(temp1, temp1, s2, ext::uxth); 4528 4529 __ lsr(temp0, temp1, 16); 4530 __ lsl(s2, temp0, 4); 4531 __ sub(s2, s2, temp0); 4532 __ add(s2, s2, temp1, ext:: uxth); 4533 4534 __ subs(temp0, s2, base); 4535 __ csel(s2, temp0, s2, Assembler::HS); 4536 4537 __ subs(len, len, nmax); 4538 __ sub(count, nmax, 16); 4539 __ br(Assembler::HS, L_nmax_loop); 4540 4541 __ bind(L_by16); 4542 __ adds(len, len, count); 4543 __ br(Assembler::LO, L_by1); 4544 4545 __ bind(L_by16_loop); 4546 4547 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4548 vbytes, vs1acc, vs2acc, vtable); 4549 4550 __ subs(len, len, 16); 4551 __ br(Assembler::HS, L_by16_loop); 4552 4553 __ bind(L_by1); 4554 __ adds(len, len, 15); 4555 __ br(Assembler::LO, L_do_mod); 4556 4557 __ bind(L_by1_loop); 4558 __ ldrb(temp0, Address(__ post(buff, 1))); 4559 __ add(s1, temp0, s1); 4560 __ add(s2, s2, s1); 4561 __ subs(len, len, 1); 4562 __ br(Assembler::HS, L_by1_loop); 4563 4564 __ bind(L_do_mod); 4565 // s1 = s1 % BASE 4566 __ lsr(temp0, s1, 16); 4567 __ lsl(temp1, temp0, 4); 4568 __ sub(temp1, temp1, temp0); 4569 __ add(temp1, temp1, s1, ext::uxth); 4570 4571 __ lsr(temp0, temp1, 16); 4572 __ lsl(s1, temp0, 4); 4573 __ sub(s1, s1, temp0); 4574 __ add(s1, s1, temp1, ext:: uxth); 4575 4576 __ subs(temp0, s1, base); 4577 __ csel(s1, temp0, s1, Assembler::HS); 4578 4579 // s2 = s2 % BASE 4580 __ lsr(temp0, s2, 16); 4581 __ lsl(temp1, temp0, 4); 4582 __ sub(temp1, temp1, temp0); 4583 __ add(temp1, temp1, s2, ext::uxth); 4584 4585 __ lsr(temp0, temp1, 16); 4586 __ lsl(s2, temp0, 4); 4587 __ sub(s2, s2, temp0); 4588 __ add(s2, s2, temp1, ext:: uxth); 4589 4590 __ subs(temp0, s2, base); 4591 __ csel(s2, temp0, s2, Assembler::HS); 4592 4593 // Combine lower bits and higher bits 4594 __ bind(L_combine); 4595 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 4596 4597 __ ret(lr); 4598 4599 return start; 4600 } 4601 4602 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 4603 Register temp0, Register temp1, FloatRegister vbytes, 4604 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 4605 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 
4606 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 4607 // In non-vectorized code, we update s1 and s2 as: 4608 // s1 <- s1 + b1 4609 // s2 <- s2 + s1 4610 // s1 <- s1 + b2 4611 // s2 <- s2 + b1 4612 // ... 4613 // s1 <- s1 + b16 4614 // s2 <- s2 + s1 4615 // Putting above assignments together, we have: 4616 // s1_new = s1 + b1 + b2 + ... + b16 4617 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 4618 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 4619 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 4620 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 4621 4622 // s2 = s2 + s1 * 16 4623 __ add(s2, s2, s1, Assembler::LSL, 4); 4624 4625 // vs1acc = b1 + b2 + b3 + ... + b16 4626 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1) 4627 __ umullv(vs2acc, __ T8B, vtable, vbytes); 4628 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 4629 __ uaddlv(vs1acc, __ T16B, vbytes); 4630 __ uaddlv(vs2acc, __ T8H, vs2acc); 4631 4632 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 4633 __ fmovd(temp0, vs1acc); 4634 __ fmovd(temp1, vs2acc); 4635 __ add(s1, s1, temp0); 4636 __ add(s2, s2, temp1); 4637 } 4638 4639 /** 4640 * Arguments: 4641 * 4642 * Input: 4643 * c_rarg0 - x address 4644 * c_rarg1 - x length 4645 * c_rarg2 - y address 4646 * c_rarg3 - y length 4647 * c_rarg4 - z address 4648 */ 4649 address generate_multiplyToLen() { 4650 __ align(CodeEntryAlignment); 4651 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 4652 4653 address start = __ pc(); 4654 const Register x = r0; 4655 const Register xlen = r1; 4656 const Register y = r2; 4657 const Register ylen = r3; 4658 const Register z = r4; 4659 4660 const Register tmp0 = r5; 4661 const Register tmp1 = r10; 4662 const Register tmp2 = r11; 4663 const Register tmp3 = r12; 4664 const Register tmp4 = r13; 4665 const Register tmp5 = r14; 4666 const Register tmp6 = r15; 4667 const Register tmp7 = r16; 4668 4669 BLOCK_COMMENT("Entry:"); 4670 __ enter(); // required for proper stackwalking of RuntimeStub frame 4671 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4672 __ leave(); // required for proper stackwalking of RuntimeStub frame 4673 __ ret(lr); 4674 4675 return start; 4676 } 4677 4678 address generate_squareToLen() { 4679 // squareToLen algorithm for sizes 1..127 described in java code works 4680 // faster than multiply_to_len on some CPUs and slower on others, but 4681 // multiply_to_len shows a bit better overall results 4682 __ align(CodeEntryAlignment); 4683 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 4684 address start = __ pc(); 4685 4686 const Register x = r0; 4687 const Register xlen = r1; 4688 const Register z = r2; 4689 const Register y = r4; // == x 4690 const Register ylen = r5; // == xlen 4691 4692 const Register tmp0 = r3; 4693 const Register tmp1 = r10; 4694 const Register tmp2 = r11; 4695 const Register tmp3 = r12; 4696 const Register tmp4 = r13; 4697 const Register tmp5 = r14; 4698 const Register tmp6 = r15; 4699 const Register tmp7 = r16; 4700 4701 RegSet spilled_regs = RegSet::of(y, ylen); 4702 BLOCK_COMMENT("Entry:"); 4703 __ enter(); 4704 __ push(spilled_regs, sp); 4705 __ mov(y, x); 4706 __ mov(ylen, xlen); 4707 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4708 __ pop(spilled_regs, sp); 4709 __ leave(); 4710 __ ret(lr); 4711 return start; 4712 } 4713 4714 address generate_mulAdd() { 4715 __ align(CodeEntryAlignment); 4716 StubCodeMark mark(this, 
"StubRoutines", "mulAdd"); 4717 4718 address start = __ pc(); 4719 4720 const Register out = r0; 4721 const Register in = r1; 4722 const Register offset = r2; 4723 const Register len = r3; 4724 const Register k = r4; 4725 4726 BLOCK_COMMENT("Entry:"); 4727 __ enter(); 4728 __ mul_add(out, in, offset, len, k); 4729 __ leave(); 4730 __ ret(lr); 4731 4732 return start; 4733 } 4734 4735 // Arguments: 4736 // 4737 // Input: 4738 // c_rarg0 - newArr address 4739 // c_rarg1 - oldArr address 4740 // c_rarg2 - newIdx 4741 // c_rarg3 - shiftCount 4742 // c_rarg4 - numIter 4743 // 4744 address generate_bigIntegerRightShift() { 4745 __ align(CodeEntryAlignment); 4746 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 4747 address start = __ pc(); 4748 4749 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4750 4751 Register newArr = c_rarg0; 4752 Register oldArr = c_rarg1; 4753 Register newIdx = c_rarg2; 4754 Register shiftCount = c_rarg3; 4755 Register numIter = c_rarg4; 4756 Register idx = numIter; 4757 4758 Register newArrCur = rscratch1; 4759 Register shiftRevCount = rscratch2; 4760 Register oldArrCur = r13; 4761 Register oldArrNext = r14; 4762 4763 FloatRegister oldElem0 = v0; 4764 FloatRegister oldElem1 = v1; 4765 FloatRegister newElem = v2; 4766 FloatRegister shiftVCount = v3; 4767 FloatRegister shiftVRevCount = v4; 4768 4769 __ cbz(idx, Exit); 4770 4771 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4772 4773 // left shift count 4774 __ movw(shiftRevCount, 32); 4775 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4776 4777 // numIter too small to allow a 4-words SIMD loop, rolling back 4778 __ cmp(numIter, (u1)4); 4779 __ br(Assembler::LT, ShiftThree); 4780 4781 __ dup(shiftVCount, __ T4S, shiftCount); 4782 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4783 __ negr(shiftVCount, __ T4S, shiftVCount); 4784 4785 __ BIND(ShiftSIMDLoop); 4786 4787 // Calculate the load addresses 4788 __ sub(idx, idx, 4); 4789 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4790 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4791 __ add(oldArrCur, oldArrNext, 4); 4792 4793 // Load 4 words and process 4794 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4795 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4796 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4797 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4798 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4799 __ st1(newElem, __ T4S, Address(newArrCur)); 4800 4801 __ cmp(idx, (u1)4); 4802 __ br(Assembler::LT, ShiftTwoLoop); 4803 __ b(ShiftSIMDLoop); 4804 4805 __ BIND(ShiftTwoLoop); 4806 __ cbz(idx, Exit); 4807 __ cmp(idx, (u1)1); 4808 __ br(Assembler::EQ, ShiftOne); 4809 4810 // Calculate the load addresses 4811 __ sub(idx, idx, 2); 4812 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4813 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4814 __ add(oldArrCur, oldArrNext, 4); 4815 4816 // Load 2 words and process 4817 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 4818 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4819 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4820 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4821 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4822 __ st1(newElem, __ T2S, Address(newArrCur)); 4823 __ b(ShiftTwoLoop); 4824 4825 __ BIND(ShiftThree); 4826 __ tbz(idx, 1, ShiftOne); 4827 __ tbz(idx, 0, ShiftTwo); 4828 __ ldrw(r10, Address(oldArr, 12)); 4829 __ ldrw(r11, Address(oldArr, 8)); 4830 __ lsrvw(r10, r10, shiftCount); 4831 __ lslvw(r11, r11, shiftRevCount); 
4832 __ orrw(r12, r10, r11); 4833 __ strw(r12, Address(newArr, 8)); 4834 4835 __ BIND(ShiftTwo); 4836 __ ldrw(r10, Address(oldArr, 8)); 4837 __ ldrw(r11, Address(oldArr, 4)); 4838 __ lsrvw(r10, r10, shiftCount); 4839 __ lslvw(r11, r11, shiftRevCount); 4840 __ orrw(r12, r10, r11); 4841 __ strw(r12, Address(newArr, 4)); 4842 4843 __ BIND(ShiftOne); 4844 __ ldrw(r10, Address(oldArr, 4)); 4845 __ ldrw(r11, Address(oldArr)); 4846 __ lsrvw(r10, r10, shiftCount); 4847 __ lslvw(r11, r11, shiftRevCount); 4848 __ orrw(r12, r10, r11); 4849 __ strw(r12, Address(newArr)); 4850 4851 __ BIND(Exit); 4852 __ ret(lr); 4853 4854 return start; 4855 } 4856 4857 // Arguments: 4858 // 4859 // Input: 4860 // c_rarg0 - newArr address 4861 // c_rarg1 - oldArr address 4862 // c_rarg2 - newIdx 4863 // c_rarg3 - shiftCount 4864 // c_rarg4 - numIter 4865 // 4866 address generate_bigIntegerLeftShift() { 4867 __ align(CodeEntryAlignment); 4868 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 4869 address start = __ pc(); 4870 4871 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4872 4873 Register newArr = c_rarg0; 4874 Register oldArr = c_rarg1; 4875 Register newIdx = c_rarg2; 4876 Register shiftCount = c_rarg3; 4877 Register numIter = c_rarg4; 4878 4879 Register shiftRevCount = rscratch1; 4880 Register oldArrNext = rscratch2; 4881 4882 FloatRegister oldElem0 = v0; 4883 FloatRegister oldElem1 = v1; 4884 FloatRegister newElem = v2; 4885 FloatRegister shiftVCount = v3; 4886 FloatRegister shiftVRevCount = v4; 4887 4888 __ cbz(numIter, Exit); 4889 4890 __ add(oldArrNext, oldArr, 4); 4891 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4892 4893 // right shift count 4894 __ movw(shiftRevCount, 32); 4895 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4896 4897 // numIter too small to allow a 4-words SIMD loop, rolling back 4898 __ cmp(numIter, (u1)4); 4899 __ br(Assembler::LT, ShiftThree); 4900 4901 __ dup(shiftVCount, __ T4S, shiftCount); 4902 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4903 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 4904 4905 __ BIND(ShiftSIMDLoop); 4906 4907 // load 4 words and process 4908 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 4909 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 4910 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4911 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4912 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4913 __ st1(newElem, __ T4S, __ post(newArr, 16)); 4914 __ sub(numIter, numIter, 4); 4915 4916 __ cmp(numIter, (u1)4); 4917 __ br(Assembler::LT, ShiftTwoLoop); 4918 __ b(ShiftSIMDLoop); 4919 4920 __ BIND(ShiftTwoLoop); 4921 __ cbz(numIter, Exit); 4922 __ cmp(numIter, (u1)1); 4923 __ br(Assembler::EQ, ShiftOne); 4924 4925 // load 2 words and process 4926 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 4927 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 4928 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4929 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4930 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4931 __ st1(newElem, __ T2S, __ post(newArr, 8)); 4932 __ sub(numIter, numIter, 2); 4933 __ b(ShiftTwoLoop); 4934 4935 __ BIND(ShiftThree); 4936 __ ldrw(r10, __ post(oldArr, 4)); 4937 __ ldrw(r11, __ post(oldArrNext, 4)); 4938 __ lslvw(r10, r10, shiftCount); 4939 __ lsrvw(r11, r11, shiftRevCount); 4940 __ orrw(r12, r10, r11); 4941 __ strw(r12, __ post(newArr, 4)); 4942 __ tbz(numIter, 1, Exit); 4943 __ tbz(numIter, 0, ShiftOne); 4944 4945 __ BIND(ShiftTwo); 4946 __ ldrw(r10, __ post(oldArr, 
4)); 4947 __ ldrw(r11, __ post(oldArrNext, 4)); 4948 __ lslvw(r10, r10, shiftCount); 4949 __ lsrvw(r11, r11, shiftRevCount); 4950 __ orrw(r12, r10, r11); 4951 __ strw(r12, __ post(newArr, 4)); 4952 4953 __ BIND(ShiftOne); 4954 __ ldrw(r10, Address(oldArr)); 4955 __ ldrw(r11, Address(oldArrNext)); 4956 __ lslvw(r10, r10, shiftCount); 4957 __ lsrvw(r11, r11, shiftRevCount); 4958 __ orrw(r12, r10, r11); 4959 __ strw(r12, Address(newArr)); 4960 4961 __ BIND(Exit); 4962 __ ret(lr); 4963 4964 return start; 4965 } 4966 4967 address generate_count_positives(address &count_positives_long) { 4968 const u1 large_loop_size = 64; 4969 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 4970 int dcache_line = VM_Version::dcache_line_size(); 4971 4972 Register ary1 = r1, len = r2, result = r0; 4973 4974 __ align(CodeEntryAlignment); 4975 4976 StubCodeMark mark(this, "StubRoutines", "count_positives"); 4977 4978 address entry = __ pc(); 4979 4980 __ enter(); 4981 // precondition: a copy of len is already in result 4982 // __ mov(result, len); 4983 4984 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 4985 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 4986 4987 __ cmp(len, (u1)15); 4988 __ br(Assembler::GT, LEN_OVER_15); 4989 // The only case when execution falls into this code is when pointer is near 4990 // the end of memory page and we have to avoid reading next page 4991 __ add(ary1, ary1, len); 4992 __ subs(len, len, 8); 4993 __ br(Assembler::GT, LEN_OVER_8); 4994 __ ldr(rscratch2, Address(ary1, -8)); 4995 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 4996 __ lsrv(rscratch2, rscratch2, rscratch1); 4997 __ tst(rscratch2, UPPER_BIT_MASK); 4998 __ csel(result, zr, result, Assembler::NE); 4999 __ leave(); 5000 __ ret(lr); 5001 __ bind(LEN_OVER_8); 5002 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 5003 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 5004 __ tst(rscratch2, UPPER_BIT_MASK); 5005 __ br(Assembler::NE, RET_NO_POP); 5006 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 5007 __ lsrv(rscratch1, rscratch1, rscratch2); 5008 __ tst(rscratch1, UPPER_BIT_MASK); 5009 __ bind(RET_NO_POP); 5010 __ csel(result, zr, result, Assembler::NE); 5011 __ leave(); 5012 __ ret(lr); 5013 5014 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 5015 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 5016 5017 count_positives_long = __ pc(); // 2nd entry point 5018 5019 __ enter(); 5020 5021 __ bind(LEN_OVER_15); 5022 __ push(spilled_regs, sp); 5023 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 5024 __ cbz(rscratch2, ALIGNED); 5025 __ ldp(tmp6, tmp1, Address(ary1)); 5026 __ mov(tmp5, 16); 5027 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 5028 __ add(ary1, ary1, rscratch1); 5029 __ orr(tmp6, tmp6, tmp1); 5030 __ tst(tmp6, UPPER_BIT_MASK); 5031 __ br(Assembler::NE, RET_ADJUST); 5032 __ sub(len, len, rscratch1); 5033 5034 __ bind(ALIGNED); 5035 __ cmp(len, large_loop_size); 5036 __ br(Assembler::LT, CHECK_16); 5037 // Perform 16-byte load as early return in pre-loop to handle situation 5038 // when initially aligned large array has negative values at starting bytes, 5039 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 5040 // slower. Cases with negative bytes further ahead won't be affected that 5041 // much. 
In fact, it'll be faster due to early loads, less instructions and 5042 // less branches in LARGE_LOOP. 5043 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 5044 __ sub(len, len, 16); 5045 __ orr(tmp6, tmp6, tmp1); 5046 __ tst(tmp6, UPPER_BIT_MASK); 5047 __ br(Assembler::NE, RET_ADJUST_16); 5048 __ cmp(len, large_loop_size); 5049 __ br(Assembler::LT, CHECK_16); 5050 5051 if (SoftwarePrefetchHintDistance >= 0 5052 && SoftwarePrefetchHintDistance >= dcache_line) { 5053 // initial prefetch 5054 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 5055 } 5056 __ bind(LARGE_LOOP); 5057 if (SoftwarePrefetchHintDistance >= 0) { 5058 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 5059 } 5060 // Issue load instructions first, since it can save few CPU/MEM cycles, also 5061 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 5062 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 5063 // instructions per cycle and have less branches, but this approach disables 5064 // early return, thus, all 64 bytes are loaded and checked every time. 5065 __ ldp(tmp2, tmp3, Address(ary1)); 5066 __ ldp(tmp4, tmp5, Address(ary1, 16)); 5067 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 5068 __ ldp(tmp6, tmp1, Address(ary1, 48)); 5069 __ add(ary1, ary1, large_loop_size); 5070 __ sub(len, len, large_loop_size); 5071 __ orr(tmp2, tmp2, tmp3); 5072 __ orr(tmp4, tmp4, tmp5); 5073 __ orr(rscratch1, rscratch1, rscratch2); 5074 __ orr(tmp6, tmp6, tmp1); 5075 __ orr(tmp2, tmp2, tmp4); 5076 __ orr(rscratch1, rscratch1, tmp6); 5077 __ orr(tmp2, tmp2, rscratch1); 5078 __ tst(tmp2, UPPER_BIT_MASK); 5079 __ br(Assembler::NE, RET_ADJUST_LONG); 5080 __ cmp(len, large_loop_size); 5081 __ br(Assembler::GE, LARGE_LOOP); 5082 5083 __ bind(CHECK_16); // small 16-byte load pre-loop 5084 __ cmp(len, (u1)16); 5085 __ br(Assembler::LT, POST_LOOP16); 5086 5087 __ bind(LOOP16); // small 16-byte load loop 5088 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 5089 __ sub(len, len, 16); 5090 __ orr(tmp2, tmp2, tmp3); 5091 __ tst(tmp2, UPPER_BIT_MASK); 5092 __ br(Assembler::NE, RET_ADJUST_16); 5093 __ cmp(len, (u1)16); 5094 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 5095 5096 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 5097 __ cmp(len, (u1)8); 5098 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 5099 __ ldr(tmp3, Address(__ post(ary1, 8))); 5100 __ tst(tmp3, UPPER_BIT_MASK); 5101 __ br(Assembler::NE, RET_ADJUST); 5102 __ sub(len, len, 8); 5103 5104 __ bind(POST_LOOP16_LOAD_TAIL); 5105 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 5106 __ ldr(tmp1, Address(ary1)); 5107 __ mov(tmp2, 64); 5108 __ sub(tmp4, tmp2, len, __ LSL, 3); 5109 __ lslv(tmp1, tmp1, tmp4); 5110 __ tst(tmp1, UPPER_BIT_MASK); 5111 __ br(Assembler::NE, RET_ADJUST); 5112 // Fallthrough 5113 5114 __ bind(RET_LEN); 5115 __ pop(spilled_regs, sp); 5116 __ leave(); 5117 __ ret(lr); 5118 5119 // difference result - len is the count of guaranteed to be 5120 // positive bytes 5121 5122 __ bind(RET_ADJUST_LONG); 5123 __ add(len, len, (u1)(large_loop_size - 16)); 5124 __ bind(RET_ADJUST_16); 5125 __ add(len, len, 16); 5126 __ bind(RET_ADJUST); 5127 __ pop(spilled_regs, sp); 5128 __ leave(); 5129 __ sub(result, result, len); 5130 __ ret(lr); 5131 5132 return entry; 5133 } 5134 5135 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 5136 bool usePrefetch, Label &NOT_EQUAL) { 5137 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5138 tmp2 = 
rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5139 tmp7 = r12, tmp8 = r13; 5140 Label LOOP; 5141 5142 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5143 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5144 __ bind(LOOP); 5145 if (usePrefetch) { 5146 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5147 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5148 } 5149 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5150 __ eor(tmp1, tmp1, tmp2); 5151 __ eor(tmp3, tmp3, tmp4); 5152 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5153 __ orr(tmp1, tmp1, tmp3); 5154 __ cbnz(tmp1, NOT_EQUAL); 5155 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5156 __ eor(tmp5, tmp5, tmp6); 5157 __ eor(tmp7, tmp7, tmp8); 5158 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5159 __ orr(tmp5, tmp5, tmp7); 5160 __ cbnz(tmp5, NOT_EQUAL); 5161 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5162 __ eor(tmp1, tmp1, tmp2); 5163 __ eor(tmp3, tmp3, tmp4); 5164 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5165 __ orr(tmp1, tmp1, tmp3); 5166 __ cbnz(tmp1, NOT_EQUAL); 5167 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5168 __ eor(tmp5, tmp5, tmp6); 5169 __ sub(cnt1, cnt1, 8 * wordSize); 5170 __ eor(tmp7, tmp7, tmp8); 5171 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5172 // tmp6 is not used. MacroAssembler::subs is used here (rather than 5173 // cmp) because subs allows an unlimited range of immediate operand. 5174 __ subs(tmp6, cnt1, loopThreshold); 5175 __ orr(tmp5, tmp5, tmp7); 5176 __ cbnz(tmp5, NOT_EQUAL); 5177 __ br(__ GE, LOOP); 5178 // post-loop 5179 __ eor(tmp1, tmp1, tmp2); 5180 __ eor(tmp3, tmp3, tmp4); 5181 __ orr(tmp1, tmp1, tmp3); 5182 __ sub(cnt1, cnt1, 2 * wordSize); 5183 __ cbnz(tmp1, NOT_EQUAL); 5184 } 5185 5186 void generate_large_array_equals_loop_simd(int loopThreshold, 5187 bool usePrefetch, Label &NOT_EQUAL) { 5188 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5189 tmp2 = rscratch2; 5190 Label LOOP; 5191 5192 __ bind(LOOP); 5193 if (usePrefetch) { 5194 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5195 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5196 } 5197 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 5198 __ sub(cnt1, cnt1, 8 * wordSize); 5199 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 5200 __ subs(tmp1, cnt1, loopThreshold); 5201 __ eor(v0, __ T16B, v0, v4); 5202 __ eor(v1, __ T16B, v1, v5); 5203 __ eor(v2, __ T16B, v2, v6); 5204 __ eor(v3, __ T16B, v3, v7); 5205 __ orr(v0, __ T16B, v0, v1); 5206 __ orr(v1, __ T16B, v2, v3); 5207 __ orr(v0, __ T16B, v0, v1); 5208 __ umov(tmp1, v0, __ D, 0); 5209 __ umov(tmp2, v0, __ D, 1); 5210 __ orr(tmp1, tmp1, tmp2); 5211 __ cbnz(tmp1, NOT_EQUAL); 5212 __ br(__ GE, LOOP); 5213 } 5214 5215 // a1 = r1 - array1 address 5216 // a2 = r2 - array2 address 5217 // result = r0 - return value. Already contains "false" 5218 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 5219 // r3-r5 are reserved temporary registers 5220 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 5221 address generate_large_array_equals() { 5222 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5223 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5224 tmp7 = r12, tmp8 = r13; 5225 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 5226 SMALL_LOOP, POST_LOOP; 5227 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 5228 // calculate if at least 32 prefetched bytes are used 5229 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 5230 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 5231 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 5232 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 5233 tmp5, tmp6, tmp7, tmp8); 5234 5235 __ align(CodeEntryAlignment); 5236 5237 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 5238 5239 address entry = __ pc(); 5240 __ enter(); 5241 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 5242 // also advance pointers to use post-increment instead of pre-increment 5243 __ add(a1, a1, wordSize); 5244 __ add(a2, a2, wordSize); 5245 if (AvoidUnalignedAccesses) { 5246 // both implementations (SIMD/nonSIMD) are using relatively large load 5247 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 5248 // on some CPUs in case of address is not at least 16-byte aligned. 5249 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 5250 // load if needed at least for 1st address and make if 16-byte aligned. 5251 Label ALIGNED16; 5252 __ tbz(a1, 3, ALIGNED16); 5253 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5254 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5255 __ sub(cnt1, cnt1, wordSize); 5256 __ eor(tmp1, tmp1, tmp2); 5257 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 5258 __ bind(ALIGNED16); 5259 } 5260 if (UseSIMDForArrayEquals) { 5261 if (SoftwarePrefetchHintDistance >= 0) { 5262 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5263 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5264 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 5265 /* prfm = */ true, NOT_EQUAL); 5266 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5267 __ br(__ LT, TAIL); 5268 } 5269 __ bind(NO_PREFETCH_LARGE_LOOP); 5270 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 5271 /* prfm = */ false, NOT_EQUAL); 5272 } else { 5273 __ push(spilled_regs, sp); 5274 if (SoftwarePrefetchHintDistance >= 0) { 5275 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5276 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5277 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 5278 /* prfm = */ true, NOT_EQUAL); 5279 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5280 __ br(__ LT, TAIL); 5281 } 5282 __ bind(NO_PREFETCH_LARGE_LOOP); 5283 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 5284 /* prfm = */ false, NOT_EQUAL); 5285 } 5286 __ bind(TAIL); 5287 __ cbz(cnt1, EQUAL); 5288 __ subs(cnt1, cnt1, wordSize); 5289 __ br(__ LE, POST_LOOP); 5290 __ bind(SMALL_LOOP); 5291 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5292 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5293 __ subs(cnt1, cnt1, wordSize); 5294 __ eor(tmp1, tmp1, tmp2); 5295 __ cbnz(tmp1, NOT_EQUAL); 5296 __ br(__ GT, SMALL_LOOP); 5297 __ bind(POST_LOOP); 5298 __ ldr(tmp1, Address(a1, cnt1)); 5299 __ ldr(tmp2, Address(a2, cnt1)); 5300 __ eor(tmp1, tmp1, tmp2); 5301 __ cbnz(tmp1, NOT_EQUAL); 5302 __ bind(EQUAL); 5303 __ mov(result, true); 5304 __ bind(NOT_EQUAL); 5305 if (!UseSIMDForArrayEquals) { 5306 __ pop(spilled_regs, sp); 5307 } 5308 __ bind(NOT_EQUAL_NO_POP); 5309 __ leave(); 5310 __ ret(lr); 5311 return entry; 5312 } 5313 5314 address generate_dsin_dcos(bool isCos) { 5315 __ align(CodeEntryAlignment); 5316 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 5317 address start = __ pc(); 5318 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5319 (address)StubRoutines::aarch64::_two_over_pi, 5320 (address)StubRoutines::aarch64::_pio2, 5321 (address)StubRoutines::aarch64::_dsin_coef, 5322 (address)StubRoutines::aarch64::_dcos_coef); 5323 return start; 5324 } 5325 5326 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5327 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5328 Label &DIFF2) { 5329 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5330 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5331 5332 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5333 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5334 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5335 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5336 5337 __ fmovd(tmpL, vtmp3); 5338 __ eor(rscratch2, tmp3, tmpL); 5339 __ cbnz(rscratch2, DIFF2); 5340 5341 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5342 __ umov(tmpL, vtmp3, __ D, 1); 5343 __ eor(rscratch2, tmpU, tmpL); 5344 __ cbnz(rscratch2, DIFF1); 5345 5346 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5347 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5348 __ fmovd(tmpL, vtmp); 5349 __ eor(rscratch2, tmp3, tmpL); 5350 __ cbnz(rscratch2, DIFF2); 5351 5352 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5353 __ umov(tmpL, vtmp, __ D, 1); 5354 __ eor(rscratch2, tmpU, tmpL); 5355 __ cbnz(rscratch2, DIFF1); 5356 } 5357 5358 // r0 = result 5359 // r1 = str1 5360 // r2 = cnt1 5361 // r3 = str2 5362 // r4 = cnt2 5363 // r10 = tmp1 5364 // r11 = tmp2 5365 address generate_compare_long_string_different_encoding(bool isLU) { 5366 __ align(CodeEntryAlignment); 5367 StubCodeMark mark(this, "StubRoutines", isLU 5368 ? "compare_long_string_different_encoding LU" 5369 : "compare_long_string_different_encoding UL"); 5370 address entry = __ pc(); 5371 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5372 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5373 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5374 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5375 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5376 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5377 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5378 5379 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5380 5381 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5382 // cnt2 == amount of characters left to compare 5383 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5384 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5385 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5386 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5387 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5388 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5389 __ eor(rscratch2, tmp1, tmp2); 5390 __ mov(rscratch1, tmp2); 5391 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5392 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 5393 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5394 __ push(spilled_regs, sp); 5395 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5396 __ mov(cnt1, isLU ? 
str2 : str1); // init the pointer to U next load 5397 5398 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5399 5400 if (SoftwarePrefetchHintDistance >= 0) { 5401 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5402 __ br(__ LT, NO_PREFETCH); 5403 __ bind(LARGE_LOOP_PREFETCH); 5404 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 5405 __ mov(tmp4, 2); 5406 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5407 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 5408 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5409 __ subs(tmp4, tmp4, 1); 5410 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 5411 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5412 __ mov(tmp4, 2); 5413 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 5414 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5415 __ subs(tmp4, tmp4, 1); 5416 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 5417 __ sub(cnt2, cnt2, 64); 5418 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5419 __ br(__ GE, LARGE_LOOP_PREFETCH); 5420 } 5421 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 5422 __ bind(NO_PREFETCH); 5423 __ subs(cnt2, cnt2, 16); 5424 __ br(__ LT, TAIL); 5425 __ align(OptoLoopAlignment); 5426 __ bind(SMALL_LOOP); // smaller loop 5427 __ subs(cnt2, cnt2, 16); 5428 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5429 __ br(__ GE, SMALL_LOOP); 5430 __ cmn(cnt2, (u1)16); 5431 __ br(__ EQ, LOAD_LAST); 5432 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 5433 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 5434 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 5435 __ ldr(tmp3, Address(cnt1, -8)); 5436 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 5437 __ b(LOAD_LAST); 5438 __ bind(DIFF2); 5439 __ mov(tmpU, tmp3); 5440 __ bind(DIFF1); 5441 __ pop(spilled_regs, sp); 5442 __ b(CALCULATE_DIFFERENCE); 5443 __ bind(LOAD_LAST); 5444 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 5445 // No need to load it again 5446 __ mov(tmpU, tmp3); 5447 __ pop(spilled_regs, sp); 5448 5449 // tmp2 points to the address of the last 4 Latin1 characters right now 5450 __ ldrs(vtmp, Address(tmp2)); 5451 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5452 __ fmovd(tmpL, vtmp); 5453 5454 __ eor(rscratch2, tmpU, tmpL); 5455 __ cbz(rscratch2, DONE); 5456 5457 // Find the first different characters in the longwords and 5458 // compute their difference. 
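// In scalar terms the computation at CALCULATE_DIFFERENCE is roughly the
// following (illustrative sketch only; tmp1 and rscratch1 hold the two
// 4-char groups being compared, rscratch2 holds their xor, and the
// byte-swap/count-leading-zeros helpers stand for the rev/clz instructions):
//
//   int sh = count_leading_zeros(byte_swap_64(rscratch2)) & ~15;
//   // sh == 16 * index of the first differing UTF-16 char, since chars are
//   // stored in ascending memory order starting at the least significant bits
//   result = (int)((tmp1 >> sh) & 0xffff) - (int)((rscratch1 >> sh) & 0xffff);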
5459 __ bind(CALCULATE_DIFFERENCE); 5460 __ rev(rscratch2, rscratch2); 5461 __ clz(rscratch2, rscratch2); 5462 __ andr(rscratch2, rscratch2, -16); 5463 __ lsrv(tmp1, tmp1, rscratch2); 5464 __ uxthw(tmp1, tmp1); 5465 __ lsrv(rscratch1, rscratch1, rscratch2); 5466 __ uxthw(rscratch1, rscratch1); 5467 __ subw(result, tmp1, rscratch1); 5468 __ bind(DONE); 5469 __ ret(lr); 5470 return entry; 5471 } 5472 5473 // r0 = input (float16) 5474 // v0 = result (float) 5475 // v1 = temporary float register 5476 address generate_float16ToFloat() { 5477 __ align(CodeEntryAlignment); 5478 StubCodeMark mark(this, "StubRoutines", "float16ToFloat"); 5479 address entry = __ pc(); 5480 BLOCK_COMMENT("Entry:"); 5481 __ flt16_to_flt(v0, r0, v1); 5482 __ ret(lr); 5483 return entry; 5484 } 5485 5486 // v0 = input (float) 5487 // r0 = result (float16) 5488 // v1 = temporary float register 5489 address generate_floatToFloat16() { 5490 __ align(CodeEntryAlignment); 5491 StubCodeMark mark(this, "StubRoutines", "floatToFloat16"); 5492 address entry = __ pc(); 5493 BLOCK_COMMENT("Entry:"); 5494 __ flt_to_flt16(r0, v0, v1); 5495 __ ret(lr); 5496 return entry; 5497 } 5498 5499 address generate_method_entry_barrier() { 5500 __ align(CodeEntryAlignment); 5501 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 5502 5503 Label deoptimize_label; 5504 5505 address start = __ pc(); 5506 5507 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 5508 5509 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 5510 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 5511 // We can get here despite the nmethod being good, if we have not 5512 // yet applied our cross modification fence (or data fence). 5513 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 5514 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 5515 __ ldrw(rscratch2, rscratch2); 5516 __ strw(rscratch2, thread_epoch_addr); 5517 __ isb(); 5518 __ membar(__ LoadLoad); 5519 } 5520 5521 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 5522 5523 __ enter(); 5524 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 5525 5526 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 5527 5528 __ push_call_clobbered_registers(); 5529 5530 __ mov(c_rarg0, rscratch2); 5531 __ call_VM_leaf 5532 (CAST_FROM_FN_PTR 5533 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 5534 5535 __ reset_last_Java_frame(true); 5536 5537 __ mov(rscratch1, r0); 5538 5539 __ pop_call_clobbered_registers(); 5540 5541 __ cbnz(rscratch1, deoptimize_label); 5542 5543 __ leave(); 5544 __ ret(lr); 5545 5546 __ BIND(deoptimize_label); 5547 5548 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 5549 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 5550 5551 __ mov(sp, rscratch1); 5552 __ br(rscratch2); 5553 5554 return start; 5555 } 5556 5557 // r0 = result 5558 // r1 = str1 5559 // r2 = cnt1 5560 // r3 = str2 5561 // r4 = cnt2 5562 // r10 = tmp1 5563 // r11 = tmp2 5564 address generate_compare_long_string_same_encoding(bool isLL) { 5565 __ align(CodeEntryAlignment); 5566 StubCodeMark mark(this, "StubRoutines", isLL 5567 ? 
"compare_long_string_same_encoding LL" 5568 : "compare_long_string_same_encoding UU"); 5569 address entry = __ pc(); 5570 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5571 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 5572 5573 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 5574 5575 // exit from large loop when less than 64 bytes left to read or we're about 5576 // to prefetch memory behind array border 5577 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 5578 5579 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 5580 __ eor(rscratch2, tmp1, tmp2); 5581 __ cbnz(rscratch2, CAL_DIFFERENCE); 5582 5583 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 5584 // update pointers, because of previous read 5585 __ add(str1, str1, wordSize); 5586 __ add(str2, str2, wordSize); 5587 if (SoftwarePrefetchHintDistance >= 0) { 5588 __ align(OptoLoopAlignment); 5589 __ bind(LARGE_LOOP_PREFETCH); 5590 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 5591 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 5592 5593 for (int i = 0; i < 4; i++) { 5594 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 5595 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 5596 __ cmp(tmp1, tmp2); 5597 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5598 __ br(Assembler::NE, DIFF); 5599 } 5600 __ sub(cnt2, cnt2, isLL ? 64 : 32); 5601 __ add(str1, str1, 64); 5602 __ add(str2, str2, 64); 5603 __ subs(rscratch2, cnt2, largeLoopExitCondition); 5604 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 5605 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 5606 } 5607 5608 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 5609 __ br(Assembler::LE, LESS16); 5610 __ align(OptoLoopAlignment); 5611 __ bind(LOOP_COMPARE16); 5612 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5613 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5614 __ cmp(tmp1, tmp2); 5615 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5616 __ br(Assembler::NE, DIFF); 5617 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5618 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5619 __ br(Assembler::LT, LESS16); 5620 5621 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5622 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5623 __ cmp(tmp1, tmp2); 5624 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5625 __ br(Assembler::NE, DIFF); 5626 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5627 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5628 __ br(Assembler::GE, LOOP_COMPARE16); 5629 __ cbz(cnt2, LENGTH_DIFF); 5630 5631 __ bind(LESS16); 5632 // each 8 compare 5633 __ subs(cnt2, cnt2, isLL ? 8 : 4); 5634 __ br(Assembler::LE, LESS8); 5635 __ ldr(tmp1, Address(__ post(str1, 8))); 5636 __ ldr(tmp2, Address(__ post(str2, 8))); 5637 __ eor(rscratch2, tmp1, tmp2); 5638 __ cbnz(rscratch2, CAL_DIFFERENCE); 5639 __ sub(cnt2, cnt2, isLL ? 8 : 4); 5640 5641 __ bind(LESS8); // directly load last 8 bytes 5642 if (!isLL) { 5643 __ add(cnt2, cnt2, cnt2); 5644 } 5645 __ ldr(tmp1, Address(str1, cnt2)); 5646 __ ldr(tmp2, Address(str2, cnt2)); 5647 __ eor(rscratch2, tmp1, tmp2); 5648 __ cbz(rscratch2, LENGTH_DIFF); 5649 __ b(CAL_DIFFERENCE); 5650 5651 __ bind(DIFF); 5652 __ cmp(tmp1, tmp2); 5653 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 5654 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 5655 // reuse rscratch2 register for the result of eor instruction 5656 __ eor(rscratch2, tmp1, tmp2); 5657 5658 __ bind(CAL_DIFFERENCE); 5659 __ rev(rscratch2, rscratch2); 5660 __ clz(rscratch2, rscratch2); 5661 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 5662 __ lsrv(tmp1, tmp1, rscratch2); 5663 __ lsrv(tmp2, tmp2, rscratch2); 5664 if (isLL) { 5665 __ uxtbw(tmp1, tmp1); 5666 __ uxtbw(tmp2, tmp2); 5667 } else { 5668 __ uxthw(tmp1, tmp1); 5669 __ uxthw(tmp2, tmp2); 5670 } 5671 __ subw(result, tmp1, tmp2); 5672 5673 __ bind(LENGTH_DIFF); 5674 __ ret(lr); 5675 return entry; 5676 } 5677 5678 enum string_compare_mode { 5679 LL, 5680 LU, 5681 UL, 5682 UU, 5683 }; 5684 5685 // The following registers are declared in aarch64.ad 5686 // r0 = result 5687 // r1 = str1 5688 // r2 = cnt1 5689 // r3 = str2 5690 // r4 = cnt2 5691 // r10 = tmp1 5692 // r11 = tmp2 5693 // z0 = ztmp1 5694 // z1 = ztmp2 5695 // p0 = pgtmp1 5696 // p1 = pgtmp2 5697 address generate_compare_long_string_sve(string_compare_mode mode) { 5698 __ align(CodeEntryAlignment); 5699 address entry = __ pc(); 5700 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5701 tmp1 = r10, tmp2 = r11; 5702 5703 Label LOOP, DONE, MISMATCH; 5704 Register vec_len = tmp1; 5705 Register idx = tmp2; 5706 // The minimum of the string lengths has been stored in cnt2. 5707 Register cnt = cnt2; 5708 FloatRegister ztmp1 = z0, ztmp2 = z1; 5709 PRegister pgtmp1 = p0, pgtmp2 = p1; 5710 5711 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 5712 switch (mode) { \ 5713 case LL: \ 5714 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 5715 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 5716 break; \ 5717 case LU: \ 5718 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 5719 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5720 break; \ 5721 case UL: \ 5722 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5723 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 5724 break; \ 5725 case UU: \ 5726 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5727 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5728 break; \ 5729 default: \ 5730 ShouldNotReachHere(); \ 5731 } 5732 5733 const char* stubname; 5734 switch (mode) { 5735 case LL: stubname = "compare_long_string_same_encoding LL"; break; 5736 case LU: stubname = "compare_long_string_different_encoding LU"; break; 5737 case UL: stubname = "compare_long_string_different_encoding UL"; break; 5738 case UU: stubname = "compare_long_string_same_encoding UU"; break; 5739 default: ShouldNotReachHere(); 5740 } 5741 5742 StubCodeMark mark(this, "StubRoutines", stubname); 5743 5744 __ mov(idx, 0); 5745 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5746 5747 if (mode == LL) { 5748 __ sve_cntb(vec_len); 5749 } else { 5750 __ sve_cnth(vec_len); 5751 } 5752 5753 __ sub(rscratch1, cnt, vec_len); 5754 5755 __ bind(LOOP); 5756 5757 // main loop 5758 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5759 __ add(idx, idx, vec_len); 5760 // Compare strings. 5761 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5762 __ br(__ NE, MISMATCH); 5763 __ cmp(idx, rscratch1); 5764 __ br(__ LT, LOOP); 5765 5766 // post loop, last iteration 5767 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5768 5769 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5770 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5771 __ br(__ EQ, DONE); 5772 5773 __ bind(MISMATCH); 5774 5775 // Crop the vector to find its location. 5776 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 5777 // Extract the first different characters of each string. 
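// sve_brkb above turns the "not equal" predicate into a predicate covering
// only the matching prefix of this chunk; sve_lasta below then reads the
// element just past that prefix, i.e. the first mismatching character of
// each string. Roughly, in scalar form (illustrative sketch only):
//
//   int i = index_of_first_mismatch(chunk1, chunk2);   // within this vector
//   first1 = chunk1[i];
//   first2 = chunk2[i];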
5778 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 5779 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 5780 5781 // Compute the difference of the first different characters. 5782 __ sub(result, rscratch1, rscratch2); 5783 5784 __ bind(DONE); 5785 __ ret(lr); 5786 #undef LOAD_PAIR 5787 return entry; 5788 } 5789 5790 void generate_compare_long_strings() { 5791 if (UseSVE == 0) { 5792 StubRoutines::aarch64::_compare_long_string_LL 5793 = generate_compare_long_string_same_encoding(true); 5794 StubRoutines::aarch64::_compare_long_string_UU 5795 = generate_compare_long_string_same_encoding(false); 5796 StubRoutines::aarch64::_compare_long_string_LU 5797 = generate_compare_long_string_different_encoding(true); 5798 StubRoutines::aarch64::_compare_long_string_UL 5799 = generate_compare_long_string_different_encoding(false); 5800 } else { 5801 StubRoutines::aarch64::_compare_long_string_LL 5802 = generate_compare_long_string_sve(LL); 5803 StubRoutines::aarch64::_compare_long_string_UU 5804 = generate_compare_long_string_sve(UU); 5805 StubRoutines::aarch64::_compare_long_string_LU 5806 = generate_compare_long_string_sve(LU); 5807 StubRoutines::aarch64::_compare_long_string_UL 5808 = generate_compare_long_string_sve(UL); 5809 } 5810 } 5811 5812 // R0 = result 5813 // R1 = str2 5814 // R2 = cnt1 5815 // R3 = str1 5816 // R4 = cnt2 5817 // Clobbers: rscratch1, rscratch2, v0, v1, rflags 5818 // 5819 // This generic linear code uses a few additional ideas which make it faster: 5820 // 1) since the pattern length is >= 8, we can safely keep at least the 1st 5821 // register of the pattern live and skip reloading it (helps on systems with 1 ld pipeline) 5822 // 2) we use a "fast" algorithm for finding a single character, which takes 5823 // one branch per loaded register instead of one branch per symbol; this is 5824 // where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f and 5825 // 0x7fff7fff...7fff come from (a scalar sketch of this test is given further 5826 // down, next to the bic/ands that implement it) 5827 // 3) after the 1st register of the source string is loaded and analyzed, it 5828 // can be reused to search for every 1st-character occurrence, saving a few 5829 // loads compared with a simpler-but-slower implementation 5830 // 4) to avoid lots of push/pop operations, the code below heavily re-uses, 5831 // re-initializes and compresses register values; this makes the code larger 5832 // and a bit less readable, but most of the extra operations are issued during loads or branches, so the penalty is minimal 5833 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 5834 const char* stubName = str1_isL 5835 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 5836 : "indexof_linear_uu"; 5837 __ align(CodeEntryAlignment); 5838 StubCodeMark mark(this, "StubRoutines", stubName); 5839 address entry = __ pc(); 5840 5841 int str1_chr_size = str1_isL ? 1 : 2; 5842 int str2_chr_size = str2_isL ? 1 : 2; 5843 int str1_chr_shift = str1_isL ? 0 : 1; 5844 int str2_chr_shift = str2_isL ?
0 : 1; 5845 bool isL = str1_isL && str2_isL; 5846 // parameters 5847 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 5848 // temporary registers 5849 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 5850 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 5851 // redefinitions 5852 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 5853 5854 __ push(spilled_regs, sp); 5855 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 5856 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 5857 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 5858 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 5859 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 5860 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 5861 // Read whole register from str1. It is safe, because length >=8 here 5862 __ ldr(ch1, Address(str1)); 5863 // Read whole register from str2. It is safe, because length >=8 here 5864 __ ldr(ch2, Address(str2)); 5865 __ sub(cnt2, cnt2, cnt1); 5866 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 5867 if (str1_isL != str2_isL) { 5868 __ eor(v0, __ T16B, v0, v0); 5869 } 5870 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 5871 __ mul(first, first, tmp1); 5872 // check if we have less than 1 register to check 5873 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 5874 if (str1_isL != str2_isL) { 5875 __ fmovd(v1, ch1); 5876 } 5877 __ br(__ LE, L_SMALL); 5878 __ eor(ch2, first, ch2); 5879 if (str1_isL != str2_isL) { 5880 __ zip1(v1, __ T16B, v1, v0); 5881 } 5882 __ sub(tmp2, ch2, tmp1); 5883 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5884 __ bics(tmp2, tmp2, ch2); 5885 if (str1_isL != str2_isL) { 5886 __ fmovd(ch1, v1); 5887 } 5888 __ br(__ NE, L_HAS_ZERO); 5889 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5890 __ add(result, result, wordSize/str2_chr_size); 5891 __ add(str2, str2, wordSize); 5892 __ br(__ LT, L_POST_LOOP); 5893 __ BIND(L_LOOP); 5894 __ ldr(ch2, Address(str2)); 5895 __ eor(ch2, first, ch2); 5896 __ sub(tmp2, ch2, tmp1); 5897 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5898 __ bics(tmp2, tmp2, ch2); 5899 __ br(__ NE, L_HAS_ZERO); 5900 __ BIND(L_LOOP_PROCEED); 5901 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5902 __ add(str2, str2, wordSize); 5903 __ add(result, result, wordSize/str2_chr_size); 5904 __ br(__ GE, L_LOOP); 5905 __ BIND(L_POST_LOOP); 5906 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 5907 __ br(__ LE, NOMATCH); 5908 __ ldr(ch2, Address(str2)); 5909 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5910 __ eor(ch2, first, ch2); 5911 __ sub(tmp2, ch2, tmp1); 5912 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5913 __ mov(tmp4, -1); // all bits set 5914 __ b(L_SMALL_PROCEED); 5915 __ align(OptoLoopAlignment); 5916 __ BIND(L_SMALL); 5917 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5918 __ eor(ch2, first, ch2); 5919 if (str1_isL != str2_isL) { 5920 __ zip1(v1, __ T16B, v1, v0); 5921 } 5922 __ sub(tmp2, ch2, tmp1); 5923 __ mov(tmp4, -1); // all bits set 5924 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5925 if (str1_isL != str2_isL) { 5926 __ fmovd(ch1, v1); // move converted 4 symbols 5927 } 5928 __ BIND(L_SMALL_PROCEED); 5929 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
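// The bic/ands that follow complete the classic SWAR "byte equals value"
// test started by the eor/sub/orr above. In scalar form (illustrative
// sketch only; for UTF-16 the same test runs on 16-bit lanes with the
// 0x0001...0001 and 0x7fff...7fff constants instead):
//
//   uint64_t x   = loaded ^ (first_char * 0x0101010101010101ULL);
//   uint64_t hit = (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
//   hit &= lane_mask;   // tmp4: ignore lanes past the end of the string
//
// The lowest 0x80 flag in "hit" always marks a lane of str2 that equals the
// first pattern character; higher flags may be spurious (borrow propagation),
// which is harmless because every candidate is re-verified by the comparison
// loop below.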
5930 __ bic(tmp2, tmp2, ch2); 5931 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 5932 __ rbit(tmp2, tmp2); 5933 __ br(__ EQ, NOMATCH); 5934 __ BIND(L_SMALL_HAS_ZERO_LOOP); 5935 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 5936 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 5937 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 5938 if (str2_isL) { // LL 5939 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5940 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5941 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5942 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5943 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5944 } else { 5945 __ mov(ch2, 0xE); // all bits in byte set except last one 5946 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5947 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5948 __ lslv(tmp2, tmp2, tmp4); 5949 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5950 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5951 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5952 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5953 } 5954 __ cmp(ch1, ch2); 5955 __ mov(tmp4, wordSize/str2_chr_size); 5956 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5957 __ BIND(L_SMALL_CMP_LOOP); 5958 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5959 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5960 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5961 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5962 __ add(tmp4, tmp4, 1); 5963 __ cmp(tmp4, cnt1); 5964 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 5965 __ cmp(first, ch2); 5966 __ br(__ EQ, L_SMALL_CMP_LOOP); 5967 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 5968 __ cbz(tmp2, NOMATCH); // no more matches. exit 5969 __ clz(tmp4, tmp2); 5970 __ add(result, result, 1); // advance index 5971 __ add(str2, str2, str2_chr_size); // advance pointer 5972 __ b(L_SMALL_HAS_ZERO_LOOP); 5973 __ align(OptoLoopAlignment); 5974 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 5975 __ cmp(first, ch2); 5976 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5977 __ b(DONE); 5978 __ align(OptoLoopAlignment); 5979 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 5980 if (str2_isL) { // LL 5981 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5982 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5983 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5984 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5985 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5986 } else { 5987 __ mov(ch2, 0xE); // all bits in byte set except last one 5988 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5989 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5990 __ lslv(tmp2, tmp2, tmp4); 5991 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5992 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5993 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5994 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5995 } 5996 __ cmp(ch1, ch2); 5997 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5998 __ b(DONE); 5999 __ align(OptoLoopAlignment); 6000 __ BIND(L_HAS_ZERO); 6001 __ rbit(tmp2, tmp2); 6002 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 6003 // Now, perform compression of counters(cnt2 and cnt1) into one register. 6004 // It's fine because both counters are 32bit and are not changed in this 6005 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 6006 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 6007 __ sub(result, result, 1); 6008 __ BIND(L_HAS_ZERO_LOOP); 6009 __ mov(cnt1, wordSize/str2_chr_size); 6010 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6011 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 6012 if (str2_isL) { 6013 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6014 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6015 __ lslv(tmp2, tmp2, tmp4); 6016 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6017 __ add(tmp4, tmp4, 1); 6018 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6019 __ lsl(tmp2, tmp2, 1); 6020 __ mov(tmp4, wordSize/str2_chr_size); 6021 } else { 6022 __ mov(ch2, 0xE); 6023 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6024 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6025 __ lslv(tmp2, tmp2, tmp4); 6026 __ add(tmp4, tmp4, 1); 6027 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6028 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6029 __ lsl(tmp2, tmp2, 1); 6030 __ mov(tmp4, wordSize/str2_chr_size); 6031 __ sub(str2, str2, str2_chr_size); 6032 } 6033 __ cmp(ch1, ch2); 6034 __ mov(tmp4, wordSize/str2_chr_size); 6035 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6036 __ BIND(L_CMP_LOOP); 6037 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 6038 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 6039 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 6040 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 6041 __ add(tmp4, tmp4, 1); 6042 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6043 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 6044 __ cmp(cnt1, ch2); 6045 __ br(__ EQ, L_CMP_LOOP); 6046 __ BIND(L_CMP_LOOP_NOMATCH); 6047 // here we're not matched 6048 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 6049 __ clz(tmp4, tmp2); 6050 __ add(str2, str2, str2_chr_size); // advance pointer 6051 __ b(L_HAS_ZERO_LOOP); 6052 __ align(OptoLoopAlignment); 6053 __ BIND(L_CMP_LOOP_LAST_CMP); 6054 __ cmp(cnt1, ch2); 6055 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6056 __ b(DONE); 6057 __ align(OptoLoopAlignment); 6058 __ BIND(L_CMP_LOOP_LAST_CMP2); 6059 if (str2_isL) { 6060 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6061 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6062 __ lslv(tmp2, tmp2, tmp4); 6063 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6064 __ add(tmp4, tmp4, 1); 6065 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6066 __ lsl(tmp2, tmp2, 1); 6067 } else { 6068 __ mov(ch2, 0xE); 6069 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6070 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
6071 __ lslv(tmp2, tmp2, tmp4); 6072 __ add(tmp4, tmp4, 1); 6073 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6074 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6075 __ lsl(tmp2, tmp2, 1); 6076 __ sub(str2, str2, str2_chr_size); 6077 } 6078 __ cmp(ch1, ch2); 6079 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6080 __ b(DONE); 6081 __ align(OptoLoopAlignment); 6082 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 6083 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 6084 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 6085 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 6086 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 6087 // result by analyzed characters value, so, we can just reset lower bits 6088 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 6089 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 6090 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 6091 // index of last analyzed substring inside current octet. So, str2 in at 6092 // respective start address. We need to advance it to next octet 6093 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 6094 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 6095 __ bfm(result, zr, 0, 2 - str2_chr_shift); 6096 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 6097 __ movw(cnt2, cnt2); 6098 __ b(L_LOOP_PROCEED); 6099 __ align(OptoLoopAlignment); 6100 __ BIND(NOMATCH); 6101 __ mov(result, -1); 6102 __ BIND(DONE); 6103 __ pop(spilled_regs, sp); 6104 __ ret(lr); 6105 return entry; 6106 } 6107 6108 void generate_string_indexof_stubs() { 6109 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 6110 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 6111 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 6112 } 6113 6114 void inflate_and_store_2_fp_registers(bool generatePrfm, 6115 FloatRegister src1, FloatRegister src2) { 6116 Register dst = r1; 6117 __ zip1(v1, __ T16B, src1, v0); 6118 __ zip2(v2, __ T16B, src1, v0); 6119 if (generatePrfm) { 6120 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 6121 } 6122 __ zip1(v3, __ T16B, src2, v0); 6123 __ zip2(v4, __ T16B, src2, v0); 6124 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 6125 } 6126 6127 // R0 = src 6128 // R1 = dst 6129 // R2 = len 6130 // R3 = len >> 3 6131 // V0 = 0 6132 // v1 = loaded 8 bytes 6133 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 6134 address generate_large_byte_array_inflate() { 6135 __ align(CodeEntryAlignment); 6136 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 6137 address entry = __ pc(); 6138 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 6139 Register src = r0, dst = r1, len = r2, octetCounter = r3; 6140 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 6141 6142 // do one more 8-byte read to have address 16-byte aligned in most cases 6143 // also use single store instruction 6144 __ ldrd(v2, __ post(src, 8)); 6145 __ sub(octetCounter, octetCounter, 2); 6146 __ zip1(v1, __ T16B, v1, v0); 6147 __ zip1(v2, __ T16B, v2, v0); 6148 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 6149 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6150 __ subs(rscratch1, octetCounter, large_loop_threshold); 6151 __ br(__ LE, LOOP_START); 6152 __ b(LOOP_PRFM_START); 6153 __ bind(LOOP_PRFM); 
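// Each iteration of this loop (and of LOOP below) inflates 64 Latin-1 bytes
// into 64 UTF-16 chars: zip1/zip2 against the all-zero vector v0 interleave
// a zero byte with every data byte, i.e. a vectorised form of (illustrative
// scalar sketch only):
//
//   for (int i = 0; i < 64; i++) {
//     dst[i] = (jchar)(src[i] & 0xff);   // zero-extend each byte to a char
//   }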
6154 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6155 __ bind(LOOP_PRFM_START); 6156 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 6157 __ sub(octetCounter, octetCounter, 8); 6158 __ subs(rscratch1, octetCounter, large_loop_threshold); 6159 inflate_and_store_2_fp_registers(true, v3, v4); 6160 inflate_and_store_2_fp_registers(true, v5, v6); 6161 __ br(__ GT, LOOP_PRFM); 6162 __ cmp(octetCounter, (u1)8); 6163 __ br(__ LT, DONE); 6164 __ bind(LOOP); 6165 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6166 __ bind(LOOP_START); 6167 __ sub(octetCounter, octetCounter, 8); 6168 __ cmp(octetCounter, (u1)8); 6169 inflate_and_store_2_fp_registers(false, v3, v4); 6170 inflate_and_store_2_fp_registers(false, v5, v6); 6171 __ br(__ GE, LOOP); 6172 __ bind(DONE); 6173 __ ret(lr); 6174 return entry; 6175 } 6176 6177 /** 6178 * Arguments: 6179 * 6180 * Input: 6181 * c_rarg0 - current state address 6182 * c_rarg1 - H key address 6183 * c_rarg2 - data address 6184 * c_rarg3 - number of blocks 6185 * 6186 * Output: 6187 * Updated state at c_rarg0 6188 */ 6189 address generate_ghash_processBlocks() { 6190 // Bafflingly, GCM uses little-endian for the byte order, but 6191 // big-endian for the bit order. For example, the polynomial 1 is 6192 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 6193 // 6194 // So, we must either reverse the bytes in each word and do 6195 // everything big-endian or reverse the bits in each byte and do 6196 // it little-endian. On AArch64 it's more idiomatic to reverse 6197 // the bits in each byte (we have an instruction, RBIT, to do 6198 // that) and keep the data in little-endian bit order through the 6199 // calculation, bit-reversing the inputs and outputs. 6200 6201 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 6202 __ align(wordSize * 2); 6203 address p = __ pc(); 6204 __ emit_int64(0x87); // The low-order bits of the field 6205 // polynomial (i.e. 
p = z^7+z^2+z+1) 6206 // repeated in the low and high parts of a 6207 // 128-bit vector 6208 __ emit_int64(0x87); 6209 6210 __ align(CodeEntryAlignment); 6211 address start = __ pc(); 6212 6213 Register state = c_rarg0; 6214 Register subkeyH = c_rarg1; 6215 Register data = c_rarg2; 6216 Register blocks = c_rarg3; 6217 6218 FloatRegister vzr = v30; 6219 __ eor(vzr, __ T16B, vzr, vzr); // zero register 6220 6221 __ ldrq(v24, p); // The field polynomial 6222 6223 __ ldrq(v0, Address(state)); 6224 __ ldrq(v1, Address(subkeyH)); 6225 6226 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 6227 __ rbit(v0, __ T16B, v0); 6228 __ rev64(v1, __ T16B, v1); 6229 __ rbit(v1, __ T16B, v1); 6230 6231 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 6232 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 6233 6234 { 6235 Label L_ghash_loop; 6236 __ bind(L_ghash_loop); 6237 6238 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 6239 // reversing each byte 6240 __ rbit(v2, __ T16B, v2); 6241 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 6242 6243 // Multiply state in v2 by subkey in v1 6244 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 6245 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 6246 /*temps*/v6, v3, /*reuse/clobber b*/v2); 6247 // Reduce v7:v5 by the field polynomial 6248 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 6249 6250 __ sub(blocks, blocks, 1); 6251 __ cbnz(blocks, L_ghash_loop); 6252 } 6253 6254 // The bit-reversed result is at this point in v0 6255 __ rev64(v0, __ T16B, v0); 6256 __ rbit(v0, __ T16B, v0); 6257 6258 __ st1(v0, __ T16B, state); 6259 __ ret(lr); 6260 6261 return start; 6262 } 6263 6264 address generate_ghash_processBlocks_wide() { 6265 address small = generate_ghash_processBlocks(); 6266 6267 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); 6268 __ align(wordSize * 2); 6269 address p = __ pc(); 6270 __ emit_int64(0x87); // The low-order bits of the field 6271 // polynomial (i.e. p = z^7+z^2+z+1) 6272 // repeated in the low and high parts of a 6273 // 128-bit vector 6274 __ emit_int64(0x87); 6275 6276 __ align(CodeEntryAlignment); 6277 address start = __ pc(); 6278 6279 Register state = c_rarg0; 6280 Register subkeyH = c_rarg1; 6281 Register data = c_rarg2; 6282 Register blocks = c_rarg3; 6283 6284 const int unroll = 4; 6285 6286 __ cmp(blocks, (unsigned char)(unroll * 2)); 6287 __ br(__ LT, small); 6288 6289 if (unroll > 1) { 6290 // Save state before entering routine 6291 __ sub(sp, sp, 4 * 16); 6292 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 6293 __ sub(sp, sp, 4 * 16); 6294 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 6295 } 6296 6297 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 6298 6299 if (unroll > 1) { 6300 // And restore state 6301 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 6302 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 6303 } 6304 6305 __ cmp(blocks, (unsigned char)0); 6306 __ br(__ GT, small); 6307 6308 __ ret(lr); 6309 6310 return start; 6311 } 6312 6313 void generate_base64_encode_simdround(Register src, Register dst, 6314 FloatRegister codec, u8 size) { 6315 6316 FloatRegister in0 = v4, in1 = v5, in2 = v6; 6317 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 6318 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 6319 6320 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 6321 6322 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 6323 6324 __ ushr(ind0, arrangement, in0, 2); 6325 6326 __ ushr(ind1, arrangement, in1, 2); 6327 __ shl(in0, arrangement, in0, 6); 6328 __ orr(ind1, arrangement, ind1, in0); 6329 __ ushr(ind1, arrangement, ind1, 2); 6330 6331 __ ushr(ind2, arrangement, in2, 4); 6332 __ shl(in1, arrangement, in1, 4); 6333 __ orr(ind2, arrangement, in1, ind2); 6334 __ ushr(ind2, arrangement, ind2, 2); 6335 6336 __ shl(ind3, arrangement, in2, 2); 6337 __ ushr(ind3, arrangement, ind3, 2); 6338 6339 __ tbl(out0, arrangement, codec, 4, ind0); 6340 __ tbl(out1, arrangement, codec, 4, ind1); 6341 __ tbl(out2, arrangement, codec, 4, ind2); 6342 __ tbl(out3, arrangement, codec, 4, ind3); 6343 6344 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 6345 } 6346 6347 /** 6348 * Arguments: 6349 * 6350 * Input: 6351 * c_rarg0 - src_start 6352 * c_rarg1 - src_offset 6353 * c_rarg2 - src_length 6354 * c_rarg3 - dest_start 6355 * c_rarg4 - dest_offset 6356 * c_rarg5 - isURL 6357 * 6358 */ 6359 address generate_base64_encodeBlock() { 6360 6361 static const char toBase64[64] = { 6362 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6363 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6364 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6365 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6366 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 6367 }; 6368 6369 static const char toBase64URL[64] = { 6370 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6371 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6372 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6373 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6374 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 6375 }; 6376 6377 __ align(CodeEntryAlignment); 6378 StubCodeMark mark(this, "StubRoutines", "encodeBlock"); 6379 address start = __ pc(); 6380 6381 Register src = c_rarg0; // source array 6382 Register soff = c_rarg1; // source start offset 6383 Register send = c_rarg2; // source end offset 6384 Register dst = c_rarg3; // dest array 6385 Register doff = c_rarg4; // position for writing to dest array 6386 Register isURL = c_rarg5; // Base64 or URL character set 6387 6388 // c_rarg6 and c_rarg7 are free to use as temps 6389 Register codec = c_rarg6; 6390 Register length = c_rarg7; 6391 6392 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 6393 6394 __ add(src, src, soff); 6395 __ add(dst, dst, doff); 6396 __ sub(length, send, soff); 6397 6398 // load the codec base address 6399 __ lea(codec, ExternalAddress((address) toBase64)); 6400 __ cbz(isURL, ProcessData); 6401 __ lea(codec, ExternalAddress((address) toBase64URL)); 6402 6403 __ BIND(ProcessData); 6404 6405 // too short to formup a SIMD loop, roll back 6406 __ cmp(length, (u1)24); 6407 __ br(Assembler::LT, Process3B); 6408 6409 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 6410 6411 __ BIND(Process48B); 6412 __ cmp(length, (u1)48); 6413 __ br(Assembler::LT, Process24B); 6414 generate_base64_encode_simdround(src, dst, v0, 16); 6415 __ sub(length, length, 48); 6416 __ b(Process48B); 6417 6418 __ BIND(Process24B); 6419 __ cmp(length, (u1)24); 6420 __ br(Assembler::LT, SIMDExit); 6421 generate_base64_encode_simdround(src, dst, v0, 8); 6422 __ sub(length, length, 24); 6423 6424 __ BIND(SIMDExit); 6425 __ cbz(length, Exit); 6426 6427 __ 
BIND(Process3B); 6428 // 3 src bytes, 24 bits 6429 __ ldrb(r10, __ post(src, 1)); 6430 __ ldrb(r11, __ post(src, 1)); 6431 __ ldrb(r12, __ post(src, 1)); 6432 __ orrw(r11, r11, r10, Assembler::LSL, 8); 6433 __ orrw(r12, r12, r11, Assembler::LSL, 8); 6434 // codec index 6435 __ ubfmw(r15, r12, 18, 23); 6436 __ ubfmw(r14, r12, 12, 17); 6437 __ ubfmw(r13, r12, 6, 11); 6438 __ andw(r12, r12, 63); 6439 // get the code based on the codec 6440 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 6441 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 6442 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 6443 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 6444 __ strb(r15, __ post(dst, 1)); 6445 __ strb(r14, __ post(dst, 1)); 6446 __ strb(r13, __ post(dst, 1)); 6447 __ strb(r12, __ post(dst, 1)); 6448 __ sub(length, length, 3); 6449 __ cbnz(length, Process3B); 6450 6451 __ BIND(Exit); 6452 __ ret(lr); 6453 6454 return start; 6455 } 6456 6457 void generate_base64_decode_simdround(Register src, Register dst, 6458 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 6459 6460 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 6461 FloatRegister out0 = v20, out1 = v21, out2 = v22; 6462 6463 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 6464 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 6465 6466 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 6467 6468 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 6469 6470 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 6471 6472 // we need unsigned saturating subtract, to make sure all input values 6473 // in range [0, 63] will have 0U value in the higher half lookup 6474 __ uqsubv(decH0, __ T16B, in0, v27); 6475 __ uqsubv(decH1, __ T16B, in1, v27); 6476 __ uqsubv(decH2, __ T16B, in2, v27); 6477 __ uqsubv(decH3, __ T16B, in3, v27); 6478 6479 // lower half lookup 6480 __ tbl(decL0, arrangement, codecL, 4, in0); 6481 __ tbl(decL1, arrangement, codecL, 4, in1); 6482 __ tbl(decL2, arrangement, codecL, 4, in2); 6483 __ tbl(decL3, arrangement, codecL, 4, in3); 6484 6485 // higher half lookup 6486 __ tbx(decH0, arrangement, codecH, 4, decH0); 6487 __ tbx(decH1, arrangement, codecH, 4, decH1); 6488 __ tbx(decH2, arrangement, codecH, 4, decH2); 6489 __ tbx(decH3, arrangement, codecH, 4, decH3); 6490 6491 // combine lower and higher 6492 __ orr(decL0, arrangement, decL0, decH0); 6493 __ orr(decL1, arrangement, decL1, decH1); 6494 __ orr(decL2, arrangement, decL2, decH2); 6495 __ orr(decL3, arrangement, decL3, decH3); 6496 6497 // check illegal inputs, value larger than 63 (maximum of 6 bits) 6498 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 6499 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 6500 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 6501 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 6502 __ orr(in0, arrangement, decH0, decH1); 6503 __ orr(in1, arrangement, decH2, decH3); 6504 __ orr(in2, arrangement, in0, in1); 6505 __ umaxv(in3, arrangement, in2); 6506 __ umov(rscratch2, in3, __ B, 0); 6507 6508 // get the data to output 6509 __ shl(out0, arrangement, decL0, 2); 6510 __ ushr(out1, arrangement, decL1, 4); 6511 __ orr(out0, arrangement, out0, out1); 6512 __ shl(out1, arrangement, decL1, 4); 6513 __ ushr(out2, arrangement, decL2, 2); 6514 __ orr(out1, arrangement, out1, out2); 6515 __ shl(out2, arrangement, decL2, 6); 6516 __ orr(out2, arrangement, out2, decL3); 6517 6518 __ cbz(rscratch2, NoIllegalData); 6519 
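// For reference, the out0/out1/out2 values computed above implement the
// standard Base64 repacking of four 6-bit values s0..s3 into three output
// bytes, applied independently in every byte lane (illustrative scalar
// sketch only):
//
//   out0 = (s0 << 2) | (s1 >> 4);
//   out1 = (s1 << 4) | (s2 >> 2);
//   out2 = (s2 << 6) |  s3;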
6520 // handle illegal input 6521 __ umov(r10, in2, __ D, 0); 6522 if (size == 16) { 6523 __ cbnz(r10, ErrorInLowerHalf); 6524 6525 // illegal input is in higher half, store the lower half now. 6526 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 6527 6528 __ umov(r10, in2, __ D, 1); 6529 __ umov(r11, out0, __ D, 1); 6530 __ umov(r12, out1, __ D, 1); 6531 __ umov(r13, out2, __ D, 1); 6532 __ b(StoreLegalData); 6533 6534 __ BIND(ErrorInLowerHalf); 6535 } 6536 __ umov(r11, out0, __ D, 0); 6537 __ umov(r12, out1, __ D, 0); 6538 __ umov(r13, out2, __ D, 0); 6539 6540 __ BIND(StoreLegalData); 6541 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 6542 __ strb(r11, __ post(dst, 1)); 6543 __ strb(r12, __ post(dst, 1)); 6544 __ strb(r13, __ post(dst, 1)); 6545 __ lsr(r10, r10, 8); 6546 __ lsr(r11, r11, 8); 6547 __ lsr(r12, r12, 8); 6548 __ lsr(r13, r13, 8); 6549 __ b(StoreLegalData); 6550 6551 __ BIND(NoIllegalData); 6552 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 6553 } 6554 6555 6556 /** 6557 * Arguments: 6558 * 6559 * Input: 6560 * c_rarg0 - src_start 6561 * c_rarg1 - src_offset 6562 * c_rarg2 - src_length 6563 * c_rarg3 - dest_start 6564 * c_rarg4 - dest_offset 6565 * c_rarg5 - isURL 6566 * c_rarg6 - isMIME 6567 * 6568 */ 6569 address generate_base64_decodeBlock() { 6570 6571 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 6572 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 6573 // titled "Base64 decoding". 6574 6575 // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64, 6576 // except that the trailing character '=' is also treated as an illegal value in this intrinsic. That 6577 // is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
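// For example (the values can be read off the table below):
//   fromBase64ForNoSIMD['A'] == 0u,  fromBase64ForNoSIMD['Z'] == 25u,
//   fromBase64ForNoSIMD['a'] == 26u, fromBase64ForNoSIMD['+'] == 62u,
//   fromBase64ForNoSIMD['/'] == 63u, and fromBase64ForNoSIMD['='] == 255u,
//   so '=' is rejected by this intrinsic rather than mapped to -2.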
6578 static const uint8_t fromBase64ForNoSIMD[256] = { 6579 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6580 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6581 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6582 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6583 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6584 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 6585 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6586 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6587 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6588 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6589 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6590 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6591 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6592 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6593 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6594 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6595 }; 6596 6597 static const uint8_t fromBase64URLForNoSIMD[256] = { 6598 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6599 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6600 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6601 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6602 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6603 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 6604 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6605 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6606 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6607 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6608 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6609 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6610 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6611 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6612 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6613 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6614 }; 6615 6616 // A legal value of base64 code is in range [0, 127]. We need two lookups 6617 // with tbl/tbx and combine them to get the decode data. The 1st table vector 6618 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 6619 // table vector lookup use tbx, out of range indices are unchanged in 6620 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 6621 // The value of index 64 is set to 0, so that we know that we already get the 6622 // decoded data with the 1st lookup. 6623 static const uint8_t fromBase64ForSIMD[128] = { 6624 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6625 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6626 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6627 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6628 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6629 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6630 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6631 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6632 }; 6633 6634 static const uint8_t fromBase64URLForSIMD[128] = { 6635 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6636 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6637 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6638 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6639 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6640 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6641 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6642 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6643 }; 6644 6645 __ align(CodeEntryAlignment); 6646 StubCodeMark mark(this, "StubRoutines", "decodeBlock"); 6647 address start = __ pc(); 6648 6649 Register src = c_rarg0; // source array 6650 Register soff = c_rarg1; // source start offset 6651 Register send = c_rarg2; // source end offset 6652 Register dst = c_rarg3; // dest array 6653 Register doff = c_rarg4; // position for writing to dest array 6654 Register isURL = c_rarg5; // Base64 or URL character set 6655 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 6656 6657 Register length = send; // reuse send as length of source data to process 6658 6659 Register simd_codec = c_rarg6; 6660 Register nosimd_codec = c_rarg7; 6661 6662 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 6663 6664 __ enter(); 6665 6666 __ add(src, src, soff); 6667 __ add(dst, dst, doff); 6668 6669 __ mov(doff, dst); 6670 6671 __ sub(length, send, soff); 6672 __ bfm(length, zr, 0, 1); 6673 6674 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 6675 __ cbz(isURL, ProcessData); 6676 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 6677 6678 __ BIND(ProcessData); 6679 __ mov(rscratch1, length); 6680 __ cmp(length, (u1)144); // 144 = 80 + 64 6681 __ br(Assembler::LT, Process4B); 6682 6683 // In the MIME case, the line length cannot be more than 76 6684 // bytes (see RFC 2045). This is too short a block for SIMD 6685 // to be worthwhile, so we use non-SIMD here. 
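    // rscratch1 is set to 79 below so that the Process4B loop consumes exactly
    // 80 bytes (20 iterations of 4) before falling through to the SIMD setup;
    // roughly, under that assumption:
    //
    //   for (int n = 79; n > 0; n -= 4) { /* decode 4 chars into 3 bytes */ }
    //   // n ends at -1 here, which is how the cbzw further down distinguishes
    //   // this pre-pass from the final tail loop, where n ends at 0.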
6686 __ movw(rscratch1, 79); 6687 6688 __ BIND(Process4B); 6689 __ ldrw(r14, __ post(src, 4)); 6690 __ ubfxw(r10, r14, 0, 8); 6691 __ ubfxw(r11, r14, 8, 8); 6692 __ ubfxw(r12, r14, 16, 8); 6693 __ ubfxw(r13, r14, 24, 8); 6694 // get the de-code 6695 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 6696 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 6697 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 6698 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 6699 // error detection, 255u indicates an illegal input 6700 __ orrw(r14, r10, r11); 6701 __ orrw(r15, r12, r13); 6702 __ orrw(r14, r14, r15); 6703 __ tbnz(r14, 7, Exit); 6704 // recover the data 6705 __ lslw(r14, r10, 10); 6706 __ bfiw(r14, r11, 4, 6); 6707 __ bfmw(r14, r12, 2, 5); 6708 __ rev16w(r14, r14); 6709 __ bfiw(r13, r12, 6, 2); 6710 __ strh(r14, __ post(dst, 2)); 6711 __ strb(r13, __ post(dst, 1)); 6712 // non-simd loop 6713 __ subsw(rscratch1, rscratch1, 4); 6714 __ br(Assembler::GT, Process4B); 6715 6716 // if exiting from PreProcess80B, rscratch1 == -1; 6717 // otherwise, rscratch1 == 0. 6718 __ cbzw(rscratch1, Exit); 6719 __ sub(length, length, 80); 6720 6721 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 6722 __ cbz(isURL, SIMDEnter); 6723 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 6724 6725 __ BIND(SIMDEnter); 6726 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 6727 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 6728 __ mov(rscratch1, 63); 6729 __ dup(v27, __ T16B, rscratch1); 6730 6731 __ BIND(Process64B); 6732 __ cmp(length, (u1)64); 6733 __ br(Assembler::LT, Process32B); 6734 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 6735 __ sub(length, length, 64); 6736 __ b(Process64B); 6737 6738 __ BIND(Process32B); 6739 __ cmp(length, (u1)32); 6740 __ br(Assembler::LT, SIMDExit); 6741 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 6742 __ sub(length, length, 32); 6743 __ b(Process32B); 6744 6745 __ BIND(SIMDExit); 6746 __ cbz(length, Exit); 6747 __ movw(rscratch1, length); 6748 __ b(Process4B); 6749 6750 __ BIND(Exit); 6751 __ sub(c_rarg0, dst, doff); 6752 6753 __ leave(); 6754 __ ret(lr); 6755 6756 return start; 6757 } 6758 6759 // Support for spin waits. 6760 address generate_spin_wait() { 6761 __ align(CodeEntryAlignment); 6762 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 6763 address start = __ pc(); 6764 6765 __ spin_wait(); 6766 __ ret(lr); 6767 6768 return start; 6769 } 6770 6771 address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) { 6772 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table"); 6773 6774 address start = __ pc(); 6775 const Register 6776 r_super_klass = r0, 6777 r_array_base = r1, 6778 r_array_length = r2, 6779 r_array_index = r3, 6780 r_sub_klass = r4, 6781 r_bitmap = rscratch2, 6782 result = r5; 6783 const FloatRegister 6784 vtemp = v0; 6785 6786 Label L_success; 6787 __ enter(); 6788 __ lookup_secondary_supers_table(r_sub_klass, r_super_klass, 6789 r_array_base, r_array_length, r_array_index, 6790 vtemp, result, super_klass_index, 6791 /*stub_is_near*/true); 6792 __ leave(); 6793 __ ret(lr); 6794 6795 return start; 6796 } 6797 6798 // Slow path implementation for UseSecondarySupersTable. 
6799 address generate_lookup_secondary_supers_table_slow_path_stub() { 6800 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path"); 6801 6802 address start = __ pc(); 6803 const Register 6804 r_super_klass = r0, // argument 6805 r_array_base = r1, // argument 6806 temp1 = r2, // temp 6807 r_array_index = r3, // argument 6808 r_bitmap = rscratch2, // argument 6809 result = r5; // argument 6810 6811 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 6812 __ ret(lr); 6813 6814 return start; 6815 } 6816 6817 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 6818 6819 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 6820 // 6821 // If LSE is in use, generate LSE versions of all the stubs. The 6822 // non-LSE versions are in atomic_aarch64.S. 6823 6824 // class AtomicStubMark records the entry point of a stub and the 6825 // stub pointer which will point to it. The stub pointer is set to 6826 // the entry point when ~AtomicStubMark() is called, which must be 6827 // after ICache::invalidate_range. This ensures safe publication of 6828 // the generated code. 6829 class AtomicStubMark { 6830 address _entry_point; 6831 aarch64_atomic_stub_t *_stub; 6832 MacroAssembler *_masm; 6833 public: 6834 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 6835 _masm = masm; 6836 __ align(32); 6837 _entry_point = __ pc(); 6838 _stub = stub; 6839 } 6840 ~AtomicStubMark() { 6841 *_stub = (aarch64_atomic_stub_t)_entry_point; 6842 } 6843 }; 6844 6845 // NB: For memory_order_conservative we need a trailing membar after 6846 // LSE atomic operations but not a leading membar. 6847 // 6848 // We don't need a leading membar because a clause in the Arm ARM 6849 // says: 6850 // 6851 // Barrier-ordered-before 6852 // 6853 // Barrier instructions order prior Memory effects before subsequent 6854 // Memory effects generated by the same Observer. A read or a write 6855 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 6856 // Observer if and only if RW1 appears in program order before RW 2 6857 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 6858 // instruction with both Acquire and Release semantics. 6859 // 6860 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 6861 // and Release semantics, therefore we don't need a leading 6862 // barrier. However, there is no corresponding Barrier-ordered-after 6863 // relationship, therefore we need a trailing membar to prevent a 6864 // later store or load from being reordered with the store in an 6865 // atomic instruction. 6866 // 6867 // This was checked by using the herd7 consistency model simulator 6868 // (http://diy.inria.fr/) with this test case: 6869 // 6870 // AArch64 LseCas 6871 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 6872 // P0 | P1; 6873 // LDR W4, [X2] | MOV W3, #0; 6874 // DMB LD | MOV W4, #1; 6875 // LDR W3, [X1] | CASAL W3, W4, [X1]; 6876 // | DMB ISH; 6877 // | STR W4, [X2]; 6878 // exists 6879 // (0:X3=0 /\ 0:X4=1) 6880 // 6881 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 6882 // with the store to x in P1. Without the DMB in P1 this may happen. 6883 // 6884 // At the time of writing we don't know of any AArch64 hardware that 6885 // reorders stores in this way, but the Reference Manual permits it. 
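  // For reference, the memory_order_conservative 8-byte CAS stub generated by
  // gen_cas_entry below reduces to approximately this instruction sequence
  // (illustrative only; the exact encoding comes from the MacroAssembler):
  //
  //   mov   x3, x1         // prev = compare_val
  //   casal x3, x2, [x0]   // acquire+release CAS on *ptr
  //   dmb   ish            // trailing full barrier, as discussed above
  //   mov   x0, x3         // return the previous value
  //   ret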
6886 6887 void gen_cas_entry(Assembler::operand_size size, 6888 atomic_memory_order order) { 6889 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 6890 exchange_val = c_rarg2; 6891 bool acquire, release; 6892 switch (order) { 6893 case memory_order_relaxed: 6894 acquire = false; 6895 release = false; 6896 break; 6897 case memory_order_release: 6898 acquire = false; 6899 release = true; 6900 break; 6901 default: 6902 acquire = true; 6903 release = true; 6904 break; 6905 } 6906 __ mov(prev, compare_val); 6907 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 6908 if (order == memory_order_conservative) { 6909 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6910 } 6911 if (size == Assembler::xword) { 6912 __ mov(r0, prev); 6913 } else { 6914 __ movw(r0, prev); 6915 } 6916 __ ret(lr); 6917 } 6918 6919 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 6920 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6921 // If not relaxed, then default to conservative. Relaxed is the only 6922 // case we use enough to be worth specializing. 6923 if (order == memory_order_relaxed) { 6924 __ ldadd(size, incr, prev, addr); 6925 } else { 6926 __ ldaddal(size, incr, prev, addr); 6927 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6928 } 6929 if (size == Assembler::xword) { 6930 __ mov(r0, prev); 6931 } else { 6932 __ movw(r0, prev); 6933 } 6934 __ ret(lr); 6935 } 6936 6937 void gen_swpal_entry(Assembler::operand_size size) { 6938 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6939 __ swpal(size, incr, prev, addr); 6940 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6941 if (size == Assembler::xword) { 6942 __ mov(r0, prev); 6943 } else { 6944 __ movw(r0, prev); 6945 } 6946 __ ret(lr); 6947 } 6948 6949 void generate_atomic_entry_points() { 6950 if (! 
UseLSE) { 6951 return; 6952 } 6953 6954 __ align(CodeEntryAlignment); 6955 StubCodeMark mark(this, "StubRoutines", "atomic entry points"); 6956 address first_entry = __ pc(); 6957 6958 // ADD, memory_order_conservative 6959 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 6960 gen_ldadd_entry(Assembler::word, memory_order_conservative); 6961 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 6962 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 6963 6964 // ADD, memory_order_relaxed 6965 AtomicStubMark mark_fetch_add_4_relaxed 6966 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 6967 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 6968 AtomicStubMark mark_fetch_add_8_relaxed 6969 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 6970 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 6971 6972 // XCHG, memory_order_conservative 6973 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 6974 gen_swpal_entry(Assembler::word); 6975 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 6976 gen_swpal_entry(Assembler::xword); 6977 6978 // CAS, memory_order_conservative 6979 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 6980 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 6981 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 6982 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 6983 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 6984 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 6985 6986 // CAS, memory_order_relaxed 6987 AtomicStubMark mark_cmpxchg_1_relaxed 6988 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 6989 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 6990 AtomicStubMark mark_cmpxchg_4_relaxed 6991 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 6992 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 6993 AtomicStubMark mark_cmpxchg_8_relaxed 6994 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 6995 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 6996 6997 AtomicStubMark mark_cmpxchg_4_release 6998 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 6999 gen_cas_entry(MacroAssembler::word, memory_order_release); 7000 AtomicStubMark mark_cmpxchg_8_release 7001 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 7002 gen_cas_entry(MacroAssembler::xword, memory_order_release); 7003 7004 AtomicStubMark mark_cmpxchg_4_seq_cst 7005 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 7006 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 7007 AtomicStubMark mark_cmpxchg_8_seq_cst 7008 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 7009 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 7010 7011 ICache::invalidate_range(first_entry, __ pc() - first_entry); 7012 } 7013 #endif // LINUX 7014 7015 address generate_cont_thaw(Continuation::thaw_kind kind) { 7016 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 7017 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 7018 7019 address start = __ pc(); 7020 7021 if (return_barrier) { 7022 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 7023 __ mov(sp, rscratch1); 7024 } 7025 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7026 7027 if (return_barrier) { 7028 // preserve possible return value from a method 
returning to the return barrier 7029 __ fmovd(rscratch1, v0); 7030 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7031 } 7032 7033 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 7034 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 7035 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 7036 7037 if (return_barrier) { 7038 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7039 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7040 __ fmovd(v0, rscratch1); 7041 } 7042 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7043 7044 7045 Label thaw_success; 7046 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 7047 __ cbnz(rscratch2, thaw_success); 7048 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 7049 __ br(rscratch1); 7050 __ bind(thaw_success); 7051 7052 // make room for the thawed frames 7053 __ sub(rscratch1, sp, rscratch2); 7054 __ andr(rscratch1, rscratch1, -16); // align 7055 __ mov(sp, rscratch1); 7056 7057 if (return_barrier) { 7058 // save original return value -- again 7059 __ fmovd(rscratch1, v0); 7060 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7061 } 7062 7063 // If we want, we can templatize thaw by kind, and have three different entries 7064 __ movw(c_rarg1, (uint32_t)kind); 7065 7066 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 7067 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 7068 7069 if (return_barrier) { 7070 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7071 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7072 __ fmovd(v0, rscratch1); 7073 } else { 7074 __ mov(r0, zr); // return 0 (success) from doYield 7075 } 7076 7077 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 7078 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 7079 __ mov(rfp, sp); 7080 7081 if (return_barrier_exception) { 7082 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 7083 __ authenticate_return_address(c_rarg1); 7084 __ verify_oop(r0); 7085 // save return value containing the exception oop in callee-saved R19 7086 __ mov(r19, r0); 7087 7088 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 7089 7090 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
7091 // __ reinitialize_ptrue(); 7092 7093 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 7094 7095 __ mov(r1, r0); // the exception handler 7096 __ mov(r0, r19); // restore return value containing the exception oop 7097 __ verify_oop(r0); 7098 7099 __ leave(); 7100 __ mov(r3, lr); 7101 __ br(r1); // the exception handler 7102 } else { 7103 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 7104 __ leave(); 7105 __ ret(lr); 7106 } 7107 7108 return start; 7109 } 7110 7111 address generate_cont_thaw() { 7112 if (!Continuations::enabled()) return nullptr; 7113 7114 StubCodeMark mark(this, "StubRoutines", "Cont thaw"); 7115 address start = __ pc(); 7116 generate_cont_thaw(Continuation::thaw_top); 7117 return start; 7118 } 7119 7120 address generate_cont_returnBarrier() { 7121 if (!Continuations::enabled()) return nullptr; 7122 7123 // TODO: will probably need multiple return barriers depending on return type 7124 StubCodeMark mark(this, "StubRoutines", "cont return barrier"); 7125 address start = __ pc(); 7126 7127 generate_cont_thaw(Continuation::thaw_return_barrier); 7128 7129 return start; 7130 } 7131 7132 address generate_cont_returnBarrier_exception() { 7133 if (!Continuations::enabled()) return nullptr; 7134 7135 StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler"); 7136 address start = __ pc(); 7137 7138 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 7139 7140 return start; 7141 } 7142 7143 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 7144 // are represented as long[5], with BITS_PER_LIMB = 26. 7145 // Pack five 26-bit limbs into three 64-bit registers. 7146 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 7147 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 7148 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 7149 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 7150 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 7151 7152 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 7153 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 7154 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 7155 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 7156 7157 if (dest2->is_valid()) { 7158 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 7159 } else { 7160 #ifdef ASSERT 7161 Label OK; 7162 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 7163 __ br(__ EQ, OK); 7164 __ stop("high bits of Poly1305 integer should be zero"); 7165 __ should_not_reach_here(); 7166 __ bind(OK); 7167 #endif 7168 } 7169 } 7170 7171 // As above, but return only a 128-bit integer, packed into two 7172 // 64-bit registers. 7173 void pack_26(Register dest0, Register dest1, Register src) { 7174 pack_26(dest0, dest1, noreg, src); 7175 } 7176 7177 // Multiply and multiply-accumulate unsigned 64-bit registers. 7178 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 7179 __ mul(prod_lo, n, m); 7180 __ umulh(prod_hi, n, m); 7181 } 7182 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 7183 wide_mul(rscratch1, rscratch2, n, m); 7184 __ adds(sum_lo, sum_lo, rscratch1); 7185 __ adc(sum_hi, sum_hi, rscratch2); 7186 } 7187 7188 // Poly1305, RFC 7539 7189 7190 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 7191 // description of the tricks used to simplify and accelerate this 7192 // computation. 
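  // In C, approximately (an illustrative sketch of the per-block step generated
  // below; S, U and R are the 130-bit values held in the register triples/pairs
  // S_2:S_1:S_0, U_2:U_1:U_0 and R_1:R_0, with RR_n = (R_n >> 2) * 5):
  //
  //   S = U + block + 2^128;                      // 16 input bytes plus the pad bit
  //   U_0HI:U_0 = S_0*R_0 + S_1*RR_1 + S_2*RR_0;
  //   U_1HI:U_1 = S_0*R_1 + S_1*R_0  + S_2*RR_1;
  //   U_2       = S_2 * (R_0 & 3);
  //   // ... followed by a partial reduction of U_2:U_1:U_0 mod 2^130 - 5.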
7193 7194 address generate_poly1305_processBlocks() { 7195 __ align(CodeEntryAlignment); 7196 StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); 7197 address start = __ pc(); 7198 Label here; 7199 __ enter(); 7200 RegSet callee_saved = RegSet::range(r19, r28); 7201 __ push(callee_saved, sp); 7202 7203 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 7204 7205 // Arguments 7206 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 7207 7208 // R_n is the 128-bit randomly-generated key, packed into two 7209 // registers. The caller passes this key to us as long[5], with 7210 // BITS_PER_LIMB = 26. 7211 const Register R_0 = *++regs, R_1 = *++regs; 7212 pack_26(R_0, R_1, r_start); 7213 7214 // RR_n is (R_n >> 2) * 5 7215 const Register RR_0 = *++regs, RR_1 = *++regs; 7216 __ lsr(RR_0, R_0, 2); 7217 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 7218 __ lsr(RR_1, R_1, 2); 7219 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 7220 7221 // U_n is the current checksum 7222 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 7223 pack_26(U_0, U_1, U_2, acc_start); 7224 7225 static constexpr int BLOCK_LENGTH = 16; 7226 Label DONE, LOOP; 7227 7228 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7229 __ br(Assembler::LT, DONE); { 7230 __ bind(LOOP); 7231 7232 // S_n is to be the sum of U_n and the next block of data 7233 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 7234 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 7235 __ adds(S_0, U_0, S_0); 7236 __ adcs(S_1, U_1, S_1); 7237 __ adc(S_2, U_2, zr); 7238 __ add(S_2, S_2, 1); 7239 7240 const Register U_0HI = *++regs, U_1HI = *++regs; 7241 7242 // NB: this logic depends on some of the special properties of 7243 // Poly1305 keys. In particular, because we know that the top 7244 // four bits of R_0 and R_1 are zero, we can add together 7245 // partial products without any risk of needing to propagate a 7246 // carry out. 7247 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 7248 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 7249 __ andr(U_2, R_0, 3); 7250 __ mul(U_2, S_2, U_2); 7251 7252 // Recycle registers S_0, S_1, S_2 7253 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 7254 7255 // Partial reduction mod 2**130 - 5 7256 __ adds(U_1, U_0HI, U_1); 7257 __ adc(U_2, U_1HI, U_2); 7258 // Sum now in U_2:U_1:U_0. 7259 // Dead: U_0HI, U_1HI. 
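    // The reduction a few lines below relies on the identity
    //   x * 2^130 + y  ==  x * 5 + y   (mod 2^130 - 5)
    // The bits of U_2 above position 1 represent multiples of 2^130, so they are
    // folded back in as (U_2 >> 2) * 5, applied in two steps: add (U_2 >> 2),
    // then add (U_2 >> 2) << 2.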
7260 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 7261 7262 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 7263 7264 // First, U_2:U_1:U_0 += (U_2 >> 2) 7265 __ lsr(rscratch1, U_2, 2); 7266 __ andr(U_2, U_2, (u8)3); 7267 __ adds(U_0, U_0, rscratch1); 7268 __ adcs(U_1, U_1, zr); 7269 __ adc(U_2, U_2, zr); 7270 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 7271 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 7272 __ adcs(U_1, U_1, zr); 7273 __ adc(U_2, U_2, zr); 7274 7275 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 7276 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7277 __ br(~ Assembler::LT, LOOP); 7278 } 7279 7280 // Further reduce modulo 2^130 - 5 7281 __ lsr(rscratch1, U_2, 2); 7282 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 7283 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 7284 __ adcs(U_1, U_1, zr); 7285 __ andr(U_2, U_2, (u1)3); 7286 __ adc(U_2, U_2, zr); 7287 7288 // Unpack the sum into five 26-bit limbs and write to memory. 7289 __ ubfiz(rscratch1, U_0, 0, 26); 7290 __ ubfx(rscratch2, U_0, 26, 26); 7291 __ stp(rscratch1, rscratch2, Address(acc_start)); 7292 __ ubfx(rscratch1, U_0, 52, 12); 7293 __ bfi(rscratch1, U_1, 12, 14); 7294 __ ubfx(rscratch2, U_1, 14, 26); 7295 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 7296 __ ubfx(rscratch1, U_1, 40, 24); 7297 __ bfi(rscratch1, U_2, 24, 3); 7298 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 7299 7300 __ bind(DONE); 7301 __ pop(callee_saved, sp); 7302 __ leave(); 7303 __ ret(lr); 7304 7305 return start; 7306 } 7307 7308 // exception handler for upcall stubs 7309 address generate_upcall_stub_exception_handler() { 7310 StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler"); 7311 address start = __ pc(); 7312 7313 // Native caller has no idea how to handle exceptions, 7314 // so we just crash here. Up to callee to catch exceptions. 
7315 __ verify_oop(r0); 7316 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 7317 __ blr(rscratch1); 7318 __ should_not_reach_here(); 7319 7320 return start; 7321 } 7322 7323 // load Method* target of MethodHandle 7324 // j_rarg0 = jobject receiver 7325 // rmethod = result 7326 address generate_upcall_stub_load_target() { 7327 StubCodeMark mark(this, "StubRoutines", "upcall_stub_load_target"); 7328 address start = __ pc(); 7329 7330 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 7331 // Load target method from receiver 7332 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 7333 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 7334 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 7335 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 7336 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 7337 noreg, noreg); 7338 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 7339 7340 __ ret(lr); 7341 7342 return start; 7343 } 7344 7345 #undef __ 7346 #define __ masm-> 7347 7348 class MontgomeryMultiplyGenerator : public MacroAssembler { 7349 7350 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7351 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7352 7353 RegSet _toSave; 7354 bool _squaring; 7355 7356 public: 7357 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7358 : MacroAssembler(as->code()), _squaring(squaring) { 7359 7360 // Register allocation 7361 7362 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7363 Pa_base = *regs; // Argument registers 7364 if (squaring) 7365 Pb_base = Pa_base; 7366 else 7367 Pb_base = *++regs; 7368 Pn_base = *++regs; 7369 Rlen= *++regs; 7370 inv = *++regs; 7371 Pm_base = *++regs; 7372 7373 // Working registers: 7374 Ra = *++regs; // The current digit of a, b, n, and m. 7375 Rb = *++regs; 7376 Rm = *++regs; 7377 Rn = *++regs; 7378 7379 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 7380 Pb = *++regs; 7381 Pm = *++regs; 7382 Pn = *++regs; 7383 7384 t0 = *++regs; // Three registers which form a 7385 t1 = *++regs; // triple-precision accumuator. 7386 t2 = *++regs; 7387 7388 Ri = *++regs; // Inner and outer loop indexes. 7389 Rj = *++regs; 7390 7391 Rhi_ab = *++regs; // Product registers: low and high parts 7392 Rlo_ab = *++regs; // of a*b and m*n. 7393 Rhi_mn = *++regs; 7394 Rlo_mn = *++regs; 7395 7396 // r19 and up are callee-saved. 
7397 _toSave = RegSet::range(r19, *regs) + Pm_base; 7398 } 7399 7400 private: 7401 void save_regs() { 7402 push(_toSave, sp); 7403 } 7404 7405 void restore_regs() { 7406 pop(_toSave, sp); 7407 } 7408 7409 template <typename T> 7410 void unroll_2(Register count, T block) { 7411 Label loop, end, odd; 7412 tbnz(count, 0, odd); 7413 cbz(count, end); 7414 align(16); 7415 bind(loop); 7416 (this->*block)(); 7417 bind(odd); 7418 (this->*block)(); 7419 subs(count, count, 2); 7420 br(Assembler::GT, loop); 7421 bind(end); 7422 } 7423 7424 template <typename T> 7425 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 7426 Label loop, end, odd; 7427 tbnz(count, 0, odd); 7428 cbz(count, end); 7429 align(16); 7430 bind(loop); 7431 (this->*block)(d, s, tmp); 7432 bind(odd); 7433 (this->*block)(d, s, tmp); 7434 subs(count, count, 2); 7435 br(Assembler::GT, loop); 7436 bind(end); 7437 } 7438 7439 void pre1(RegisterOrConstant i) { 7440 block_comment("pre1"); 7441 // Pa = Pa_base; 7442 // Pb = Pb_base + i; 7443 // Pm = Pm_base; 7444 // Pn = Pn_base + i; 7445 // Ra = *Pa; 7446 // Rb = *Pb; 7447 // Rm = *Pm; 7448 // Rn = *Pn; 7449 ldr(Ra, Address(Pa_base)); 7450 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7451 ldr(Rm, Address(Pm_base)); 7452 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7453 lea(Pa, Address(Pa_base)); 7454 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7455 lea(Pm, Address(Pm_base)); 7456 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7457 7458 // Zero the m*n result. 7459 mov(Rhi_mn, zr); 7460 mov(Rlo_mn, zr); 7461 } 7462 7463 // The core multiply-accumulate step of a Montgomery 7464 // multiplication. The idea is to schedule operations as a 7465 // pipeline so that instructions with long latencies (loads and 7466 // multiplies) have time to complete before their results are 7467 // used. This most benefits in-order implementations of the 7468 // architecture but out-of-order ones also benefit. 7469 void step() { 7470 block_comment("step"); 7471 // MACC(Ra, Rb, t0, t1, t2); 7472 // Ra = *++Pa; 7473 // Rb = *--Pb; 7474 umulh(Rhi_ab, Ra, Rb); 7475 mul(Rlo_ab, Ra, Rb); 7476 ldr(Ra, pre(Pa, wordSize)); 7477 ldr(Rb, pre(Pb, -wordSize)); 7478 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 7479 // previous iteration. 7480 // MACC(Rm, Rn, t0, t1, t2); 7481 // Rm = *++Pm; 7482 // Rn = *--Pn; 7483 umulh(Rhi_mn, Rm, Rn); 7484 mul(Rlo_mn, Rm, Rn); 7485 ldr(Rm, pre(Pm, wordSize)); 7486 ldr(Rn, pre(Pn, -wordSize)); 7487 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7488 } 7489 7490 void post1() { 7491 block_comment("post1"); 7492 7493 // MACC(Ra, Rb, t0, t1, t2); 7494 // Ra = *++Pa; 7495 // Rb = *--Pb; 7496 umulh(Rhi_ab, Ra, Rb); 7497 mul(Rlo_ab, Ra, Rb); 7498 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7499 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7500 7501 // *Pm = Rm = t0 * inv; 7502 mul(Rm, t0, inv); 7503 str(Rm, Address(Pm)); 7504 7505 // MACC(Rm, Rn, t0, t1, t2); 7506 // t0 = t1; t1 = t2; t2 = 0; 7507 umulh(Rhi_mn, Rm, Rn); 7508 7509 #ifndef PRODUCT 7510 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7511 { 7512 mul(Rlo_mn, Rm, Rn); 7513 add(Rlo_mn, t0, Rlo_mn); 7514 Label ok; 7515 cbz(Rlo_mn, ok); { 7516 stop("broken Montgomery multiply"); 7517 } bind(ok); 7518 } 7519 #endif 7520 // We have very carefully set things up so that 7521 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7522 // the lower half of Rm * Rn because we know the result already: 7523 // it must be -t0. 
t0 + (-t0) must generate a carry iff 7524 // t0 != 0. So, rather than do a mul and an adds we just set 7525 // the carry flag iff t0 is nonzero. 7526 // 7527 // mul(Rlo_mn, Rm, Rn); 7528 // adds(zr, t0, Rlo_mn); 7529 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7530 adcs(t0, t1, Rhi_mn); 7531 adc(t1, t2, zr); 7532 mov(t2, zr); 7533 } 7534 7535 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 7536 block_comment("pre2"); 7537 // Pa = Pa_base + i-len; 7538 // Pb = Pb_base + len; 7539 // Pm = Pm_base + i-len; 7540 // Pn = Pn_base + len; 7541 7542 if (i.is_register()) { 7543 sub(Rj, i.as_register(), len); 7544 } else { 7545 mov(Rj, i.as_constant()); 7546 sub(Rj, Rj, len); 7547 } 7548 // Rj == i-len 7549 7550 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 7551 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 7552 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7553 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 7554 7555 // Ra = *++Pa; 7556 // Rb = *--Pb; 7557 // Rm = *++Pm; 7558 // Rn = *--Pn; 7559 ldr(Ra, pre(Pa, wordSize)); 7560 ldr(Rb, pre(Pb, -wordSize)); 7561 ldr(Rm, pre(Pm, wordSize)); 7562 ldr(Rn, pre(Pn, -wordSize)); 7563 7564 mov(Rhi_mn, zr); 7565 mov(Rlo_mn, zr); 7566 } 7567 7568 void post2(RegisterOrConstant i, RegisterOrConstant len) { 7569 block_comment("post2"); 7570 if (i.is_constant()) { 7571 mov(Rj, i.as_constant()-len.as_constant()); 7572 } else { 7573 sub(Rj, i.as_register(), len); 7574 } 7575 7576 adds(t0, t0, Rlo_mn); // The pending m*n, low part 7577 7578 // As soon as we know the least significant digit of our result, 7579 // store it. 7580 // Pm_base[i-len] = t0; 7581 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7582 7583 // t0 = t1; t1 = t2; t2 = 0; 7584 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 7585 adc(t1, t2, zr); 7586 mov(t2, zr); 7587 } 7588 7589 // A carry in t0 after Montgomery multiplication means that we 7590 // should subtract multiples of n from our result in m. We'll 7591 // keep doing that until there is no carry. 7592 void normalize(RegisterOrConstant len) { 7593 block_comment("normalize"); 7594 // while (t0) 7595 // t0 = sub(Pm_base, Pn_base, t0, len); 7596 Label loop, post, again; 7597 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 7598 cbz(t0, post); { 7599 bind(again); { 7600 mov(i, zr); 7601 mov(cnt, len); 7602 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7603 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7604 subs(zr, zr, zr); // set carry flag, i.e. no borrow 7605 align(16); 7606 bind(loop); { 7607 sbcs(Rm, Rm, Rn); 7608 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7609 add(i, i, 1); 7610 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7611 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7612 sub(cnt, cnt, 1); 7613 } cbnz(cnt, loop); 7614 sbc(t0, t0, zr); 7615 } cbnz(t0, again); 7616 } bind(post); 7617 } 7618 7619 // Move memory at s to d, reversing words. 
7620 // Increments d to end of copied memory 7621 // Destroys tmp1, tmp2 7622 // Preserves len 7623 // Leaves s pointing to the address which was in d at start 7624 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 7625 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 7626 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 7627 7628 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 7629 mov(tmp1, len); 7630 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 7631 sub(s, d, len, ext::uxtw, LogBytesPerWord); 7632 } 7633 // where 7634 void reverse1(Register d, Register s, Register tmp) { 7635 ldr(tmp, pre(s, -wordSize)); 7636 ror(tmp, tmp, 32); 7637 str(tmp, post(d, wordSize)); 7638 } 7639 7640 void step_squaring() { 7641 // An extra ACC 7642 step(); 7643 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7644 } 7645 7646 void last_squaring(RegisterOrConstant i) { 7647 Label dont; 7648 // if ((i & 1) == 0) { 7649 tbnz(i.as_register(), 0, dont); { 7650 // MACC(Ra, Rb, t0, t1, t2); 7651 // Ra = *++Pa; 7652 // Rb = *--Pb; 7653 umulh(Rhi_ab, Ra, Rb); 7654 mul(Rlo_ab, Ra, Rb); 7655 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7656 } bind(dont); 7657 } 7658 7659 void extra_step_squaring() { 7660 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7661 7662 // MACC(Rm, Rn, t0, t1, t2); 7663 // Rm = *++Pm; 7664 // Rn = *--Pn; 7665 umulh(Rhi_mn, Rm, Rn); 7666 mul(Rlo_mn, Rm, Rn); 7667 ldr(Rm, pre(Pm, wordSize)); 7668 ldr(Rn, pre(Pn, -wordSize)); 7669 } 7670 7671 void post1_squaring() { 7672 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7673 7674 // *Pm = Rm = t0 * inv; 7675 mul(Rm, t0, inv); 7676 str(Rm, Address(Pm)); 7677 7678 // MACC(Rm, Rn, t0, t1, t2); 7679 // t0 = t1; t1 = t2; t2 = 0; 7680 umulh(Rhi_mn, Rm, Rn); 7681 7682 #ifndef PRODUCT 7683 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7684 { 7685 mul(Rlo_mn, Rm, Rn); 7686 add(Rlo_mn, t0, Rlo_mn); 7687 Label ok; 7688 cbz(Rlo_mn, ok); { 7689 stop("broken Montgomery multiply"); 7690 } bind(ok); 7691 } 7692 #endif 7693 // We have very carefully set things up so that 7694 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7695 // the lower half of Rm * Rn because we know the result already: 7696 // it must be -t0. t0 + (-t0) must generate a carry iff 7697 // t0 != 0. So, rather than do a mul and an adds we just set 7698 // the carry flag iff t0 is nonzero. 7699 // 7700 // mul(Rlo_mn, Rm, Rn); 7701 // adds(zr, t0, Rlo_mn); 7702 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7703 adcs(t0, t1, Rhi_mn); 7704 adc(t1, t2, zr); 7705 mov(t2, zr); 7706 } 7707 7708 void acc(Register Rhi, Register Rlo, 7709 Register t0, Register t1, Register t2) { 7710 adds(t0, t0, Rlo); 7711 adcs(t1, t1, Rhi); 7712 adc(t2, t2, zr); 7713 } 7714 7715 public: 7716 /** 7717 * Fast Montgomery multiplication. The derivation of the 7718 * algorithm is in A Cryptographic Library for the Motorola 7719 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
7720 * 7721 * Arguments: 7722 * 7723 * Inputs for multiplication: 7724 * c_rarg0 - int array elements a 7725 * c_rarg1 - int array elements b 7726 * c_rarg2 - int array elements n (the modulus) 7727 * c_rarg3 - int length 7728 * c_rarg4 - int inv 7729 * c_rarg5 - int array elements m (the result) 7730 * 7731 * Inputs for squaring: 7732 * c_rarg0 - int array elements a 7733 * c_rarg1 - int array elements n (the modulus) 7734 * c_rarg2 - int length 7735 * c_rarg3 - int inv 7736 * c_rarg4 - int array elements m (the result) 7737 * 7738 */ 7739 address generate_multiply() { 7740 Label argh, nothing; 7741 bind(argh); 7742 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7743 7744 align(CodeEntryAlignment); 7745 address entry = pc(); 7746 7747 cbzw(Rlen, nothing); 7748 7749 enter(); 7750 7751 // Make room. 7752 cmpw(Rlen, 512); 7753 br(Assembler::HI, argh); 7754 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7755 andr(sp, Ra, -2 * wordSize); 7756 7757 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7758 7759 { 7760 // Copy input args, reversing as we go. We use Ra as a 7761 // temporary variable. 7762 reverse(Ra, Pa_base, Rlen, t0, t1); 7763 if (!_squaring) 7764 reverse(Ra, Pb_base, Rlen, t0, t1); 7765 reverse(Ra, Pn_base, Rlen, t0, t1); 7766 } 7767 7768 // Push all call-saved registers and also Pm_base which we'll need 7769 // at the end. 7770 save_regs(); 7771 7772 #ifndef PRODUCT 7773 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 7774 { 7775 ldr(Rn, Address(Pn_base, 0)); 7776 mul(Rlo_mn, Rn, inv); 7777 subs(zr, Rlo_mn, -1); 7778 Label ok; 7779 br(EQ, ok); { 7780 stop("broken inverse in Montgomery multiply"); 7781 } bind(ok); 7782 } 7783 #endif 7784 7785 mov(Pm_base, Ra); 7786 7787 mov(t0, zr); 7788 mov(t1, zr); 7789 mov(t2, zr); 7790 7791 block_comment("for (int i = 0; i < len; i++) {"); 7792 mov(Ri, zr); { 7793 Label loop, end; 7794 cmpw(Ri, Rlen); 7795 br(Assembler::GE, end); 7796 7797 bind(loop); 7798 pre1(Ri); 7799 7800 block_comment(" for (j = i; j; j--) {"); { 7801 movw(Rj, Ri); 7802 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7803 } block_comment(" } // j"); 7804 7805 post1(); 7806 addw(Ri, Ri, 1); 7807 cmpw(Ri, Rlen); 7808 br(Assembler::LT, loop); 7809 bind(end); 7810 block_comment("} // i"); 7811 } 7812 7813 block_comment("for (int i = len; i < 2*len; i++) {"); 7814 mov(Ri, Rlen); { 7815 Label loop, end; 7816 cmpw(Ri, Rlen, Assembler::LSL, 1); 7817 br(Assembler::GE, end); 7818 7819 bind(loop); 7820 pre2(Ri, Rlen); 7821 7822 block_comment(" for (j = len*2-i-1; j; j--) {"); { 7823 lslw(Rj, Rlen, 1); 7824 subw(Rj, Rj, Ri); 7825 subw(Rj, Rj, 1); 7826 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7827 } block_comment(" } // j"); 7828 7829 post2(Ri, Rlen); 7830 addw(Ri, Ri, 1); 7831 cmpw(Ri, Rlen, Assembler::LSL, 1); 7832 br(Assembler::LT, loop); 7833 bind(end); 7834 } 7835 block_comment("} // i"); 7836 7837 normalize(Rlen); 7838 7839 mov(Ra, Pm_base); // Save Pm_base in Ra 7840 restore_regs(); // Restore caller's Pm_base 7841 7842 // Copy our result into caller's Pm_base 7843 reverse(Pm_base, Ra, Rlen, t0, t1); 7844 7845 leave(); 7846 bind(nothing); 7847 ret(lr); 7848 7849 return entry; 7850 } 7851 // In C, approximately: 7852 7853 // void 7854 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 7855 // julong Pn_base[], julong Pm_base[], 7856 // julong inv, int len) { 7857 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7858 // julong *Pa, *Pb, *Pn, *Pm; 7859 // julong Ra, Rb, Rn, Rm; 7860 7861 // 
int i; 7862 7863 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7864 7865 // for (i = 0; i < len; i++) { 7866 // int j; 7867 7868 // Pa = Pa_base; 7869 // Pb = Pb_base + i; 7870 // Pm = Pm_base; 7871 // Pn = Pn_base + i; 7872 7873 // Ra = *Pa; 7874 // Rb = *Pb; 7875 // Rm = *Pm; 7876 // Rn = *Pn; 7877 7878 // int iters = i; 7879 // for (j = 0; iters--; j++) { 7880 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7881 // MACC(Ra, Rb, t0, t1, t2); 7882 // Ra = *++Pa; 7883 // Rb = *--Pb; 7884 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7885 // MACC(Rm, Rn, t0, t1, t2); 7886 // Rm = *++Pm; 7887 // Rn = *--Pn; 7888 // } 7889 7890 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 7891 // MACC(Ra, Rb, t0, t1, t2); 7892 // *Pm = Rm = t0 * inv; 7893 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7894 // MACC(Rm, Rn, t0, t1, t2); 7895 7896 // assert(t0 == 0, "broken Montgomery multiply"); 7897 7898 // t0 = t1; t1 = t2; t2 = 0; 7899 // } 7900 7901 // for (i = len; i < 2*len; i++) { 7902 // int j; 7903 7904 // Pa = Pa_base + i-len; 7905 // Pb = Pb_base + len; 7906 // Pm = Pm_base + i-len; 7907 // Pn = Pn_base + len; 7908 7909 // Ra = *++Pa; 7910 // Rb = *--Pb; 7911 // Rm = *++Pm; 7912 // Rn = *--Pn; 7913 7914 // int iters = len*2-i-1; 7915 // for (j = i-len+1; iters--; j++) { 7916 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7917 // MACC(Ra, Rb, t0, t1, t2); 7918 // Ra = *++Pa; 7919 // Rb = *--Pb; 7920 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7921 // MACC(Rm, Rn, t0, t1, t2); 7922 // Rm = *++Pm; 7923 // Rn = *--Pn; 7924 // } 7925 7926 // Pm_base[i-len] = t0; 7927 // t0 = t1; t1 = t2; t2 = 0; 7928 // } 7929 7930 // while (t0) 7931 // t0 = sub(Pm_base, Pn_base, t0, len); 7932 // } 7933 7934 /** 7935 * Fast Montgomery squaring. This uses asymptotically 25% fewer 7936 * multiplies than Montgomery multiplication so it should be up to 7937 * 25% faster. However, its loop control is more complex and it 7938 * may actually run slower on some machines. 7939 * 7940 * Arguments: 7941 * 7942 * Inputs: 7943 * c_rarg0 - int array elements a 7944 * c_rarg1 - int array elements n (the modulus) 7945 * c_rarg2 - int length 7946 * c_rarg3 - int inv 7947 * c_rarg4 - int array elements m (the result) 7948 * 7949 */ 7950 address generate_square() { 7951 Label argh; 7952 bind(argh); 7953 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7954 7955 align(CodeEntryAlignment); 7956 address entry = pc(); 7957 7958 enter(); 7959 7960 // Make room. 7961 cmpw(Rlen, 512); 7962 br(Assembler::HI, argh); 7963 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7964 andr(sp, Ra, -2 * wordSize); 7965 7966 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7967 7968 { 7969 // Copy input args, reversing as we go. We use Ra as a 7970 // temporary variable. 7971 reverse(Ra, Pa_base, Rlen, t0, t1); 7972 reverse(Ra, Pn_base, Rlen, t0, t1); 7973 } 7974 7975 // Push all call-saved registers and also Pm_base which we'll need 7976 // at the end. 
7977 save_regs(); 7978 7979 mov(Pm_base, Ra); 7980 7981 mov(t0, zr); 7982 mov(t1, zr); 7983 mov(t2, zr); 7984 7985 block_comment("for (int i = 0; i < len; i++) {"); 7986 mov(Ri, zr); { 7987 Label loop, end; 7988 bind(loop); 7989 cmp(Ri, Rlen); 7990 br(Assembler::GE, end); 7991 7992 pre1(Ri); 7993 7994 block_comment("for (j = (i+1)/2; j; j--) {"); { 7995 add(Rj, Ri, 1); 7996 lsr(Rj, Rj, 1); 7997 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 7998 } block_comment(" } // j"); 7999 8000 last_squaring(Ri); 8001 8002 block_comment(" for (j = i/2; j; j--) {"); { 8003 lsr(Rj, Ri, 1); 8004 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8005 } block_comment(" } // j"); 8006 8007 post1_squaring(); 8008 add(Ri, Ri, 1); 8009 cmp(Ri, Rlen); 8010 br(Assembler::LT, loop); 8011 8012 bind(end); 8013 block_comment("} // i"); 8014 } 8015 8016 block_comment("for (int i = len; i < 2*len; i++) {"); 8017 mov(Ri, Rlen); { 8018 Label loop, end; 8019 bind(loop); 8020 cmp(Ri, Rlen, Assembler::LSL, 1); 8021 br(Assembler::GE, end); 8022 8023 pre2(Ri, Rlen); 8024 8025 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 8026 lsl(Rj, Rlen, 1); 8027 sub(Rj, Rj, Ri); 8028 sub(Rj, Rj, 1); 8029 lsr(Rj, Rj, 1); 8030 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8031 } block_comment(" } // j"); 8032 8033 last_squaring(Ri); 8034 8035 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 8036 lsl(Rj, Rlen, 1); 8037 sub(Rj, Rj, Ri); 8038 lsr(Rj, Rj, 1); 8039 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8040 } block_comment(" } // j"); 8041 8042 post2(Ri, Rlen); 8043 add(Ri, Ri, 1); 8044 cmp(Ri, Rlen, Assembler::LSL, 1); 8045 8046 br(Assembler::LT, loop); 8047 bind(end); 8048 block_comment("} // i"); 8049 } 8050 8051 normalize(Rlen); 8052 8053 mov(Ra, Pm_base); // Save Pm_base in Ra 8054 restore_regs(); // Restore caller's Pm_base 8055 8056 // Copy our result into caller's Pm_base 8057 reverse(Pm_base, Ra, Rlen, t0, t1); 8058 8059 leave(); 8060 ret(lr); 8061 8062 return entry; 8063 } 8064 // In C, approximately: 8065 8066 // void 8067 // montgomery_square(julong Pa_base[], julong Pn_base[], 8068 // julong Pm_base[], julong inv, int len) { 8069 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 8070 // julong *Pa, *Pb, *Pn, *Pm; 8071 // julong Ra, Rb, Rn, Rm; 8072 8073 // int i; 8074 8075 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 8076 8077 // for (i = 0; i < len; i++) { 8078 // int j; 8079 8080 // Pa = Pa_base; 8081 // Pb = Pa_base + i; 8082 // Pm = Pm_base; 8083 // Pn = Pn_base + i; 8084 8085 // Ra = *Pa; 8086 // Rb = *Pb; 8087 // Rm = *Pm; 8088 // Rn = *Pn; 8089 8090 // int iters = (i+1)/2; 8091 // for (j = 0; iters--; j++) { 8092 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8093 // MACC2(Ra, Rb, t0, t1, t2); 8094 // Ra = *++Pa; 8095 // Rb = *--Pb; 8096 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8097 // MACC(Rm, Rn, t0, t1, t2); 8098 // Rm = *++Pm; 8099 // Rn = *--Pn; 8100 // } 8101 // if ((i & 1) == 0) { 8102 // assert(Ra == Pa_base[j], "must be"); 8103 // MACC(Ra, Ra, t0, t1, t2); 8104 // } 8105 // iters = i/2; 8106 // assert(iters == i-j, "must be"); 8107 // for (; iters--; j++) { 8108 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8109 // MACC(Rm, Rn, t0, t1, t2); 8110 // Rm = *++Pm; 8111 // Rn = *--Pn; 8112 // } 8113 8114 // *Pm = Rm = t0 * inv; 8115 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8116 // MACC(Rm, Rn, t0, t1, t2); 8117 8118 // 
assert(t0 == 0, "broken Montgomery multiply"); 8119 8120 // t0 = t1; t1 = t2; t2 = 0; 8121 // } 8122 8123 // for (i = len; i < 2*len; i++) { 8124 // int start = i-len+1; 8125 // int end = start + (len - start)/2; 8126 // int j; 8127 8128 // Pa = Pa_base + i-len; 8129 // Pb = Pa_base + len; 8130 // Pm = Pm_base + i-len; 8131 // Pn = Pn_base + len; 8132 8133 // Ra = *++Pa; 8134 // Rb = *--Pb; 8135 // Rm = *++Pm; 8136 // Rn = *--Pn; 8137 8138 // int iters = (2*len-i-1)/2; 8139 // assert(iters == end-start, "must be"); 8140 // for (j = start; iters--; j++) { 8141 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8142 // MACC2(Ra, Rb, t0, t1, t2); 8143 // Ra = *++Pa; 8144 // Rb = *--Pb; 8145 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8146 // MACC(Rm, Rn, t0, t1, t2); 8147 // Rm = *++Pm; 8148 // Rn = *--Pn; 8149 // } 8150 // if ((i & 1) == 0) { 8151 // assert(Ra == Pa_base[j], "must be"); 8152 // MACC(Ra, Ra, t0, t1, t2); 8153 // } 8154 // iters = (2*len-i)/2; 8155 // assert(iters == len-j, "must be"); 8156 // for (; iters--; j++) { 8157 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8158 // MACC(Rm, Rn, t0, t1, t2); 8159 // Rm = *++Pm; 8160 // Rn = *--Pn; 8161 // } 8162 // Pm_base[i-len] = t0; 8163 // t0 = t1; t1 = t2; t2 = 0; 8164 // } 8165 8166 // while (t0) 8167 // t0 = sub(Pm_base, Pn_base, t0, len); 8168 // } 8169 }; 8170 8171 8172 // Initialization 8173 void generate_initial_stubs() { 8174 // Generate initial stubs and initialize the entry points 8175 8176 // entry points that exist in all platforms. Note: This is code 8177 // that could be shared among different platforms - however the 8178 // benefit seems to be smaller than the disadvantage of having a 8179 // much more complicated generator structure. See also comment in 8180 // stubRoutines.hpp. 8181 8182 StubRoutines::_forward_exception_entry = generate_forward_exception(); 8183 8184 StubRoutines::_call_stub_entry = 8185 generate_call_stub(StubRoutines::_call_stub_return_address); 8186 8187 // is referenced by megamorphic call 8188 StubRoutines::_catch_exception_entry = generate_catch_exception(); 8189 8190 // Initialize table for copy memory (arraycopy) check.
8191 if (UnsafeMemoryAccess::_table == nullptr) { 8192 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory 8193 } 8194 8195 if (UseCRC32Intrinsics) { 8196 // set table address before stub generation which uses it 8197 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 8198 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 8199 } 8200 8201 if (UseCRC32CIntrinsics) { 8202 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 8203 } 8204 8205 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 8206 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 8207 } 8208 8209 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 8210 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 8211 } 8212 8213 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) && 8214 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) { 8215 StubRoutines::_hf2f = generate_float16ToFloat(); 8216 StubRoutines::_f2hf = generate_floatToFloat16(); 8217 } 8218 } 8219 8220 void generate_continuation_stubs() { 8221 // Continuation stubs: 8222 StubRoutines::_cont_thaw = generate_cont_thaw(); 8223 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 8224 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 8225 } 8226 8227 void generate_final_stubs() { 8228 // support for verify_oop (must happen after universe_init) 8229 if (VerifyOops) { 8230 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 8231 } 8232 8233 // arraycopy stubs used by compilers 8234 generate_arraycopy_stubs(); 8235 8236 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8237 if (bs_nm != nullptr) { 8238 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 8239 } 8240 8241 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 8242 8243 if (UsePoly1305Intrinsics) { 8244 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 8245 } 8246 8247 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 8248 8249 generate_atomic_entry_points(); 8250 8251 #endif // LINUX 8252 8253 #ifdef COMPILER2 8254 if (UseSecondarySupersTable) { 8255 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub(); 8256 if (! InlineSecondarySupersTest) { 8257 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 8258 StubRoutines::_lookup_secondary_supers_table_stubs[slot] 8259 = generate_lookup_secondary_supers_table_stub(slot); 8260 } 8261 } 8262 } 8263 #endif 8264 8265 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 8266 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 8267 8268 StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated 8269 } 8270 8271 void generate_compiler_stubs() { 8272 #if COMPILER2_OR_JVMCI 8273 8274 if (UseSVE == 0) { 8275 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices"); 8276 } 8277 8278 // array equals stub for large arrays. 8279 if (!UseSimpleArrayEquals) { 8280 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 8281 } 8282 8283 // byte_array_inflate stub for large arrays. 8284 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 8285 8286 // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress =
        generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB =
        generate_sha512_implCompress(true, "sha512_implCompressMB");
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress");
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }
#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
    switch(kind) {
    case Initial_stubs:
      generate_initial_stubs();
      break;
    case Continuation_stubs:
      generate_continuation_stubs();
      break;
    case Compiler_stubs:
      generate_compiler_stubs();
      break;
    case Final_stubs:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected stubs kind: %d", kind);
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
  StubGenerator g(code, kind);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                         \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX
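// Illustrative sketch only (not generated code): a single invocation such as
// DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) pastes its tokens together into
// roughly
//
//   extern "C" uint64_t aarch64_atomic_cmpxchg_4_release_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_release_impl
//     = aarch64_atomic_cmpxchg_4_release_default_impl;
//
// so every stub pointer starts out referring to the hand-written assembly in
// atomic_aarch64.S; when the build is not compiled with
// __ARM_FEATURE_ATOMICS, generate_atomic_entry_points() (called from
// generate_final_stubs() above) may later repoint it at generated code.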