1 /* 2 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "asm/register.hpp" 30 #include "atomic_aarch64.hpp" 31 #include "code/SCCache.hpp" 32 #include "compiler/oopMap.hpp" 33 #include "gc/shared/barrierSet.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/gc_globals.hpp" 36 #include "gc/shared/tlab_globals.hpp" 37 #include "interpreter/interpreter.hpp" 38 #include "memory/universe.hpp" 39 #include "nativeInst_aarch64.hpp" 40 #include "oops/instanceOop.hpp" 41 #include "oops/method.hpp" 42 #include "oops/objArrayKlass.hpp" 43 #include "oops/oop.inline.hpp" 44 #include "prims/methodHandles.hpp" 45 #include "prims/upcallLinker.hpp" 46 #include "runtime/atomic.hpp" 47 #include "runtime/continuation.hpp" 48 #include "runtime/continuationEntry.inline.hpp" 49 #include "runtime/frame.inline.hpp" 50 #include "runtime/handles.inline.hpp" 51 #include "runtime/javaThread.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/stubCodeGenerator.hpp" 54 #include "runtime/stubRoutines.hpp" 55 #include "utilities/align.hpp" 56 #include "utilities/checkedCast.hpp" 57 #include "utilities/globalDefinitions.hpp" 58 #include "utilities/powerOfTwo.hpp" 59 #ifdef COMPILER2 60 #include "opto/runtime.hpp" 61 #endif 62 #if INCLUDE_ZGC 63 #include "gc/z/zThreadLocalData.hpp" 64 #endif 65 66 // Declaration and definition of StubGenerator (no .hpp file). 
67 // For a more detailed description of the stub routine structure 68 // see the comment in stubRoutines.hpp 69 70 #undef __ 71 #define __ _masm-> 72 73 #ifdef PRODUCT 74 #define BLOCK_COMMENT(str) /* nothing */ 75 #else 76 #define BLOCK_COMMENT(str) __ block_comment(str) 77 #endif 78 79 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 80 81 // Stub Code definitions 82 83 class StubGenerator: public StubCodeGenerator { 84 private: 85 86 #ifdef PRODUCT 87 #define inc_counter_np(counter) ((void)0) 88 #else 89 void inc_counter_np_(uint& counter) { 90 __ incrementw(ExternalAddress((address)&counter)); 91 } 92 #define inc_counter_np(counter) \ 93 BLOCK_COMMENT("inc_counter " #counter); \ 94 inc_counter_np_(counter); 95 #endif 96 97 // Call stubs are used to call Java from C 98 // 99 // Arguments: 100 // c_rarg0: call wrapper address address 101 // c_rarg1: result address 102 // c_rarg2: result type BasicType 103 // c_rarg3: method Method* 104 // c_rarg4: (interpreter) entry point address 105 // c_rarg5: parameters intptr_t* 106 // c_rarg6: parameter size (in words) int 107 // c_rarg7: thread Thread* 108 // 109 // There is no return from the stub itself as any Java result 110 // is written to result 111 // 112 // we save r30 (lr) as the return PC at the base of the frame and 113 // link r29 (fp) below it as the frame pointer installing sp (r31) 114 // into fp. 115 // 116 // we save r0-r7, which accounts for all the c arguments. 117 // 118 // TODO: strictly do we need to save them all? they are treated as 119 // volatile by C so could we omit saving the ones we are going to 120 // place in global registers (thread? method?) or those we only use 121 // during setup of the Java call? 122 // 123 // we don't need to save r8 which C uses as an indirect result location 124 // return register. 125 // 126 // we don't need to save r9-r15 which both C and Java treat as 127 // volatile 128 // 129 // we don't need to save r16-18 because Java does not use them 130 // 131 // we save r19-r28 which Java uses as scratch registers and C 132 // expects to be callee-save 133 // 134 // we save the bottom 64 bits of each value stored in v8-v15; it is 135 // the responsibility of the caller to preserve larger values. 136 // 137 // so the stub frame looks like this when we enter Java code 138 // 139 // [ return_from_Java ] <--- sp 140 // [ argument word n ] 141 // ... 
142 // -29 [ argument word 1 ] 143 // -28 [ saved Floating-point Control Register ] 144 // -26 [ saved v15 ] <--- sp_after_call 145 // -25 [ saved v14 ] 146 // -24 [ saved v13 ] 147 // -23 [ saved v12 ] 148 // -22 [ saved v11 ] 149 // -21 [ saved v10 ] 150 // -20 [ saved v9 ] 151 // -19 [ saved v8 ] 152 // -18 [ saved r28 ] 153 // -17 [ saved r27 ] 154 // -16 [ saved r26 ] 155 // -15 [ saved r25 ] 156 // -14 [ saved r24 ] 157 // -13 [ saved r23 ] 158 // -12 [ saved r22 ] 159 // -11 [ saved r21 ] 160 // -10 [ saved r20 ] 161 // -9 [ saved r19 ] 162 // -8 [ call wrapper (r0) ] 163 // -7 [ result (r1) ] 164 // -6 [ result type (r2) ] 165 // -5 [ method (r3) ] 166 // -4 [ entry point (r4) ] 167 // -3 [ parameters (r5) ] 168 // -2 [ parameter size (r6) ] 169 // -1 [ thread (r7) ] 170 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 171 // 1 [ saved lr (r30) ] 172 173 // Call stub stack layout word offsets from fp 174 enum call_stub_layout { 175 sp_after_call_off = -28, 176 177 fpcr_off = sp_after_call_off, 178 d15_off = -26, 179 d13_off = -24, 180 d11_off = -22, 181 d9_off = -20, 182 183 r28_off = -18, 184 r26_off = -16, 185 r24_off = -14, 186 r22_off = -12, 187 r20_off = -10, 188 call_wrapper_off = -8, 189 result_off = -7, 190 result_type_off = -6, 191 method_off = -5, 192 entry_point_off = -4, 193 parameter_size_off = -2, 194 thread_off = -1, 195 fp_f = 0, 196 retaddr_off = 1, 197 }; 198 199 address generate_call_stub(address& return_address) { 200 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 201 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 202 "adjust this code"); 203 204 StubCodeMark mark(this, "StubRoutines", "call_stub"); 205 address start = __ pc(); 206 207 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 208 209 const Address fpcr_save (rfp, fpcr_off * wordSize); 210 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 211 const Address result (rfp, result_off * wordSize); 212 const Address result_type (rfp, result_type_off * wordSize); 213 const Address method (rfp, method_off * wordSize); 214 const Address entry_point (rfp, entry_point_off * wordSize); 215 const Address parameter_size(rfp, parameter_size_off * wordSize); 216 217 const Address thread (rfp, thread_off * wordSize); 218 219 const Address d15_save (rfp, d15_off * wordSize); 220 const Address d13_save (rfp, d13_off * wordSize); 221 const Address d11_save (rfp, d11_off * wordSize); 222 const Address d9_save (rfp, d9_off * wordSize); 223 224 const Address r28_save (rfp, r28_off * wordSize); 225 const Address r26_save (rfp, r26_off * wordSize); 226 const Address r24_save (rfp, r24_off * wordSize); 227 const Address r22_save (rfp, r22_off * wordSize); 228 const Address r20_save (rfp, r20_off * wordSize); 229 230 // stub code 231 232 address aarch64_entry = __ pc(); 233 234 // set up frame and move sp to end of save area 235 __ enter(); 236 __ sub(sp, rfp, -sp_after_call_off * wordSize); 237 238 // save register parameters and Java scratch/global registers 239 // n.b. 
we save thread even though it gets installed in 240 // rthread because we want to sanity check rthread later 241 __ str(c_rarg7, thread); 242 __ strw(c_rarg6, parameter_size); 243 __ stp(c_rarg4, c_rarg5, entry_point); 244 __ stp(c_rarg2, c_rarg3, result_type); 245 __ stp(c_rarg0, c_rarg1, call_wrapper); 246 247 __ stp(r20, r19, r20_save); 248 __ stp(r22, r21, r22_save); 249 __ stp(r24, r23, r24_save); 250 __ stp(r26, r25, r26_save); 251 __ stp(r28, r27, r28_save); 252 253 __ stpd(v9, v8, d9_save); 254 __ stpd(v11, v10, d11_save); 255 __ stpd(v13, v12, d13_save); 256 __ stpd(v15, v14, d15_save); 257 258 __ get_fpcr(rscratch1); 259 __ str(rscratch1, fpcr_save); 260 // Set FPCR to the state we need. We do want Round to Nearest. We 261 // don't want non-IEEE rounding modes or floating-point traps. 262 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode 263 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12) 264 __ set_fpcr(rscratch1); 265 266 // install Java thread in global register now we have saved 267 // whatever value it held 268 __ mov(rthread, c_rarg7); 269 // And method 270 __ mov(rmethod, c_rarg3); 271 272 // set up the heapbase register 273 __ reinit_heapbase(); 274 275 #ifdef ASSERT 276 // make sure we have no pending exceptions 277 { 278 Label L; 279 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 280 __ cmp(rscratch1, (u1)NULL_WORD); 281 __ br(Assembler::EQ, L); 282 __ stop("StubRoutines::call_stub: entered with pending exception"); 283 __ BIND(L); 284 } 285 #endif 286 // pass parameters if any 287 __ mov(esp, sp); 288 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 289 __ andr(sp, rscratch1, -2 * wordSize); 290 291 BLOCK_COMMENT("pass parameters if any"); 292 Label parameters_done; 293 // parameter count is still in c_rarg6 294 // and parameter pointer identifying param 1 is in c_rarg5 295 __ cbzw(c_rarg6, parameters_done); 296 297 address loop = __ pc(); 298 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 299 __ subsw(c_rarg6, c_rarg6, 1); 300 __ push(rscratch1); 301 __ br(Assembler::GT, loop); 302 303 __ BIND(parameters_done); 304 305 // call Java entry -- passing methdoOop, and current sp 306 // rmethod: Method* 307 // r19_sender_sp: sender sp 308 BLOCK_COMMENT("call Java function"); 309 __ mov(r19_sender_sp, sp); 310 __ blr(c_rarg4); 311 312 // we do this here because the notify will already have been done 313 // if we get to the next instruction via an exception 314 // 315 // n.b. adding this instruction here affects the calculation of 316 // whether or not a routine returns to the call stub (used when 317 // doing stack walks) since the normal test is to check the return 318 // pc against the address saved below. so we may need to allow for 319 // this extra instruction in the check. 320 321 // save current address for use by exception handling code 322 323 return_address = __ pc(); 324 325 // store result depending on type (everything that is not 326 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 327 // n.b. 
this assumes Java returns an integral result in r0 328 // and a floating result in j_farg0 329 __ ldr(j_rarg2, result); 330 Label is_long, is_float, is_double, exit; 331 __ ldr(j_rarg1, result_type); 332 __ cmp(j_rarg1, (u1)T_OBJECT); 333 __ br(Assembler::EQ, is_long); 334 __ cmp(j_rarg1, (u1)T_LONG); 335 __ br(Assembler::EQ, is_long); 336 __ cmp(j_rarg1, (u1)T_FLOAT); 337 __ br(Assembler::EQ, is_float); 338 __ cmp(j_rarg1, (u1)T_DOUBLE); 339 __ br(Assembler::EQ, is_double); 340 341 // handle T_INT case 342 __ strw(r0, Address(j_rarg2)); 343 344 __ BIND(exit); 345 346 // pop parameters 347 __ sub(esp, rfp, -sp_after_call_off * wordSize); 348 349 #ifdef ASSERT 350 // verify that threads correspond 351 { 352 Label L, S; 353 __ ldr(rscratch1, thread); 354 __ cmp(rthread, rscratch1); 355 __ br(Assembler::NE, S); 356 __ get_thread(rscratch1); 357 __ cmp(rthread, rscratch1); 358 __ br(Assembler::EQ, L); 359 __ BIND(S); 360 __ stop("StubRoutines::call_stub: threads must correspond"); 361 __ BIND(L); 362 } 363 #endif 364 365 __ pop_cont_fastpath(rthread); 366 367 // restore callee-save registers 368 __ ldpd(v15, v14, d15_save); 369 __ ldpd(v13, v12, d13_save); 370 __ ldpd(v11, v10, d11_save); 371 __ ldpd(v9, v8, d9_save); 372 373 __ ldp(r28, r27, r28_save); 374 __ ldp(r26, r25, r26_save); 375 __ ldp(r24, r23, r24_save); 376 __ ldp(r22, r21, r22_save); 377 __ ldp(r20, r19, r20_save); 378 379 // restore fpcr 380 __ ldr(rscratch1, fpcr_save); 381 __ set_fpcr(rscratch1); 382 383 __ ldp(c_rarg0, c_rarg1, call_wrapper); 384 __ ldrw(c_rarg2, result_type); 385 __ ldr(c_rarg3, method); 386 __ ldp(c_rarg4, c_rarg5, entry_point); 387 __ ldp(c_rarg6, c_rarg7, parameter_size); 388 389 // leave frame and return to caller 390 __ leave(); 391 __ ret(lr); 392 393 // handle return types different from T_INT 394 395 __ BIND(is_long); 396 __ str(r0, Address(j_rarg2, 0)); 397 __ br(Assembler::AL, exit); 398 399 __ BIND(is_float); 400 __ strs(j_farg0, Address(j_rarg2, 0)); 401 __ br(Assembler::AL, exit); 402 403 __ BIND(is_double); 404 __ strd(j_farg0, Address(j_rarg2, 0)); 405 __ br(Assembler::AL, exit); 406 407 return start; 408 } 409 410 // Return point for a Java call if there's an exception thrown in 411 // Java code. The exception is caught and transformed into a 412 // pending exception stored in JavaThread that can be tested from 413 // within the VM. 414 // 415 // Note: Usually the parameters are removed by the callee. In case 416 // of an exception crossing an activation frame boundary, that is 417 // not the case if the callee is compiled code => need to setup the 418 // rsp. 
419 // 420 // r0: exception oop 421 422 address generate_catch_exception() { 423 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 424 address start = __ pc(); 425 426 // same as in generate_call_stub(): 427 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 428 const Address thread (rfp, thread_off * wordSize); 429 430 #ifdef ASSERT 431 // verify that threads correspond 432 { 433 Label L, S; 434 __ ldr(rscratch1, thread); 435 __ cmp(rthread, rscratch1); 436 __ br(Assembler::NE, S); 437 __ get_thread(rscratch1); 438 __ cmp(rthread, rscratch1); 439 __ br(Assembler::EQ, L); 440 __ bind(S); 441 __ stop("StubRoutines::catch_exception: threads must correspond"); 442 __ bind(L); 443 } 444 #endif 445 446 // set pending exception 447 __ verify_oop(r0); 448 449 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 450 __ mov(rscratch1, (address)__FILE__); 451 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 452 __ movw(rscratch1, (int)__LINE__); 453 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 454 455 // complete return to VM 456 assert(StubRoutines::_call_stub_return_address != nullptr, 457 "_call_stub_return_address must have been generated before"); 458 __ b(StubRoutines::_call_stub_return_address); 459 460 return start; 461 } 462 463 // Continuation point for runtime calls returning with a pending 464 // exception. The pending exception check happened in the runtime 465 // or native call stub. The pending exception in Thread is 466 // converted into a Java-level exception. 467 // 468 // Contract with Java-level exception handlers: 469 // r0: exception 470 // r3: throwing pc 471 // 472 // NOTE: At entry of this stub, exception-pc must be in LR !! 473 474 // NOTE: this is always used as a jump target within generated code 475 // so it just needs to be generated code with no x86 prolog 476 477 address generate_forward_exception() { 478 StubCodeMark mark(this, "StubRoutines", "forward exception"); 479 address start = __ pc(); 480 481 // Upon entry, LR points to the return address returning into 482 // Java (interpreted or compiled) code; i.e., the return address 483 // becomes the throwing pc. 484 // 485 // Arguments pushed before the runtime call are still on the stack 486 // but the exception handler will reset the stack pointer -> 487 // ignore them. A potential result in registers can be ignored as 488 // well. 489 490 #ifdef ASSERT 491 // make sure this code is only executed if there is a pending exception 492 { 493 Label L; 494 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 495 __ cbnz(rscratch1, L); 496 __ stop("StubRoutines::forward exception: no pending exception (1)"); 497 __ bind(L); 498 } 499 #endif 500 501 // compute exception handler into r19 502 503 // call the VM to find the handler address associated with the 504 // caller address. pass thread in r0 and caller pc (ret address) 505 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 506 // the stack. 507 __ mov(c_rarg1, lr); 508 // lr will be trashed by the VM call so we move it to R19 509 // (callee-saved) because we also need to pass it to the handler 510 // returned by this call. 
511 __ mov(r19, lr); 512 BLOCK_COMMENT("call exception_handler_for_return_address"); 513 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 514 SharedRuntime::exception_handler_for_return_address), 515 rthread, c_rarg1); 516 // Reinitialize the ptrue predicate register, in case the external runtime 517 // call clobbers ptrue reg, as we may return to SVE compiled code. 518 __ reinitialize_ptrue(); 519 520 // we should not really care that lr is no longer the callee 521 // address. we saved the value the handler needs in r19 so we can 522 // just copy it to r3. however, the C2 handler will push its own 523 // frame and then calls into the VM and the VM code asserts that 524 // the PC for the frame above the handler belongs to a compiled 525 // Java method. So, we restore lr here to satisfy that assert. 526 __ mov(lr, r19); 527 // setup r0 & r3 & clear pending exception 528 __ mov(r3, r19); 529 __ mov(r19, r0); 530 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 531 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 532 533 #ifdef ASSERT 534 // make sure exception is set 535 { 536 Label L; 537 __ cbnz(r0, L); 538 __ stop("StubRoutines::forward exception: no pending exception (2)"); 539 __ bind(L); 540 } 541 #endif 542 543 // continue at exception handler 544 // r0: exception 545 // r3: throwing pc 546 // r19: exception handler 547 __ verify_oop(r0); 548 __ br(r19); 549 550 return start; 551 } 552 553 // Non-destructive plausibility checks for oops 554 // 555 // Arguments: 556 // r0: oop to verify 557 // rscratch1: error message 558 // 559 // Stack after saving c_rarg3: 560 // [tos + 0]: saved c_rarg3 561 // [tos + 1]: saved c_rarg2 562 // [tos + 2]: saved lr 563 // [tos + 3]: saved rscratch2 564 // [tos + 4]: saved r0 565 // [tos + 5]: saved rscratch1 566 address generate_verify_oop() { 567 568 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 569 address start = __ pc(); 570 571 Label exit, error; 572 573 // save c_rarg2 and c_rarg3 574 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 575 576 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 577 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 578 __ ldr(c_rarg3, Address(c_rarg2)); 579 __ add(c_rarg3, c_rarg3, 1); 580 __ str(c_rarg3, Address(c_rarg2)); 581 582 // object is in r0 583 // make sure object is 'reasonable' 584 __ cbz(r0, exit); // if obj is null it is OK 585 586 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 587 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 588 589 // return if everything seems ok 590 __ bind(exit); 591 592 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 593 __ ret(lr); 594 595 // handle errors 596 __ bind(error); 597 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 598 599 __ push(RegSet::range(r0, r29), sp); 600 // debug(char* msg, int64_t pc, int64_t regs[]) 601 __ mov(c_rarg0, rscratch1); // pass address of error message 602 __ mov(c_rarg1, lr); // pass return address 603 __ mov(c_rarg2, sp); // pass address of regs on stack 604 #ifndef PRODUCT 605 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 606 #endif 607 BLOCK_COMMENT("call MacroAssembler::debug"); 608 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 609 __ blr(rscratch1); 610 __ hlt(0); 611 612 return start; 613 } 614 615 // Generate indices for iota vector. 
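  // The table emitted below holds, for each element size, the ascending
  // lane indices {0, 1, 2, ...} packed into little-endian 64-bit chunks:
  // byte (B), halfword (H), word (S) and doubleword (D) rows, followed by
  // the same sequence as float and double constants. For example the B row,
  // read byte by byte, is {0, 1, 2, ..., 15}, and the "S - FP" row is
  // {0.0f, 1.0f, 2.0f, 3.0f}.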
616 address generate_iota_indices(const char *stub_name) { 617 __ align(CodeEntryAlignment); 618 StubCodeMark mark(this, "StubRoutines", stub_name); 619 address start = __ pc(); 620 // B 621 __ emit_data64(0x0706050403020100, relocInfo::none); 622 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 623 // H 624 __ emit_data64(0x0003000200010000, relocInfo::none); 625 __ emit_data64(0x0007000600050004, relocInfo::none); 626 // S 627 __ emit_data64(0x0000000100000000, relocInfo::none); 628 __ emit_data64(0x0000000300000002, relocInfo::none); 629 // D 630 __ emit_data64(0x0000000000000000, relocInfo::none); 631 __ emit_data64(0x0000000000000001, relocInfo::none); 632 // S - FP 633 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 634 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 635 // D - FP 636 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 637 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 638 return start; 639 } 640 641 // The inner part of zero_words(). This is the bulk operation, 642 // zeroing words in blocks, possibly using DC ZVA to do it. The 643 // caller is responsible for zeroing the last few words. 644 // 645 // Inputs: 646 // r10: the HeapWord-aligned base address of an array to zero. 647 // r11: the count in HeapWords, r11 > 0. 648 // 649 // Returns r10 and r11, adjusted for the caller to clear. 650 // r10: the base address of the tail of words left to clear. 651 // r11: the number of words in the tail. 652 // r11 < MacroAssembler::zero_words_block_size. 653 654 address generate_zero_blocks() { 655 Label done; 656 Label base_aligned; 657 658 Register base = r10, cnt = r11; 659 660 __ align(CodeEntryAlignment); 661 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 662 address start = __ pc(); 663 664 if (UseBlockZeroing) { 665 int zva_length = VM_Version::zva_length(); 666 667 // Ensure ZVA length can be divided by 16. This is required by 668 // the subsequent operations. 669 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 670 671 __ tbz(base, 3, base_aligned); 672 __ str(zr, Address(__ post(base, 8))); 673 __ sub(cnt, cnt, 1); 674 __ bind(base_aligned); 675 676 // Ensure count >= zva_length * 2 so that it still deserves a zva after 677 // alignment. 678 Label small; 679 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 680 __ subs(rscratch1, cnt, low_limit >> 3); 681 __ br(Assembler::LT, small); 682 __ zero_dcache_blocks(base, cnt); 683 __ bind(small); 684 } 685 686 { 687 // Number of stp instructions we'll unroll 688 const int unroll = 689 MacroAssembler::zero_words_block_size / 2; 690 // Clear the remaining blocks. 691 Label loop; 692 __ subs(cnt, cnt, unroll * 2); 693 __ br(Assembler::LT, done); 694 __ bind(loop); 695 for (int i = 0; i < unroll; i++) 696 __ stp(zr, zr, __ post(base, 16)); 697 __ subs(cnt, cnt, unroll * 2); 698 __ br(Assembler::GE, loop); 699 __ bind(done); 700 __ add(cnt, cnt, unroll * 2); 701 } 702 703 __ ret(lr); 704 705 return start; 706 } 707 708 709 typedef enum { 710 copy_forwards = 1, 711 copy_backwards = -1 712 } copy_direction; 713 714 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 715 // for arraycopy stubs. 
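  //
  // A helper is constructed once per stub with the decorators, the element
  // type and the scratch registers a barrier may need; every load and store
  // in the copy loops then goes through it, for example (as used further
  // down in generate_copy_longs):
  //
  //   ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  //   bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  //   bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  //
  // so that collectors whose barriers must see every access can interpose
  // on each load and store of the copy.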
716 class ArrayCopyBarrierSetHelper : StackObj { 717 BarrierSetAssembler* _bs_asm; 718 MacroAssembler* _masm; 719 DecoratorSet _decorators; 720 BasicType _type; 721 Register _gct1; 722 Register _gct2; 723 Register _gct3; 724 FloatRegister _gcvt1; 725 FloatRegister _gcvt2; 726 FloatRegister _gcvt3; 727 728 public: 729 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 730 DecoratorSet decorators, 731 BasicType type, 732 Register gct1, 733 Register gct2, 734 Register gct3, 735 FloatRegister gcvt1, 736 FloatRegister gcvt2, 737 FloatRegister gcvt3) 738 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 739 _masm(masm), 740 _decorators(decorators), 741 _type(type), 742 _gct1(gct1), 743 _gct2(gct2), 744 _gct3(gct3), 745 _gcvt1(gcvt1), 746 _gcvt2(gcvt2), 747 _gcvt3(gcvt3) { 748 } 749 750 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 751 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 752 dst1, dst2, src, 753 _gct1, _gct2, _gcvt1); 754 } 755 756 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 757 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 758 dst, src1, src2, 759 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 760 } 761 762 void copy_load_at_16(Register dst1, Register dst2, Address src) { 763 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 764 dst1, dst2, src, 765 _gct1); 766 } 767 768 void copy_store_at_16(Address dst, Register src1, Register src2) { 769 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 770 dst, src1, src2, 771 _gct1, _gct2, _gct3); 772 } 773 774 void copy_load_at_8(Register dst, Address src) { 775 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 776 dst, noreg, src, 777 _gct1); 778 } 779 780 void copy_store_at_8(Address dst, Register src) { 781 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 782 dst, src, noreg, 783 _gct1, _gct2, _gct3); 784 } 785 }; 786 787 // Bulk copy of blocks of 8 words. 788 // 789 // count is a count of words. 790 // 791 // Precondition: count >= 8 792 // 793 // Postconditions: 794 // 795 // The least significant bit of count contains the remaining count 796 // of words to copy. The rest of count is trash. 797 // 798 // s and d are adjusted to point to the remaining words to copy 799 // 800 void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count, 801 copy_direction direction) { 802 int unit = wordSize * direction; 803 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
                   t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ?
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 875 876 if (UseSIMDForMemoryOps) { 877 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 878 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 879 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 880 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 881 } else { 882 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 883 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 884 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 885 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 886 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 887 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 888 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 889 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 890 } 891 892 __ subs(count, count, 8); 893 __ br(Assembler::HS, again); 894 895 // Drain 896 __ bind(drain); 897 if (UseSIMDForMemoryOps) { 898 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 899 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 900 } else { 901 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 902 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 903 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 904 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 905 } 906 907 { 908 Label L1, L2; 909 __ tbz(count, exact_log2(4), L1); 910 if (UseSIMDForMemoryOps) { 911 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit))); 912 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1); 913 } else { 914 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 915 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 916 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 917 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3); 918 } 919 __ bind(L1); 920 921 if (direction == copy_forwards) { 922 __ add(s, s, bias); 923 __ add(d, d, bias); 924 } 925 926 __ tbz(count, 1, L2); 927 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 928 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1); 929 __ bind(L2); 930 } 931 932 __ ret(lr); 933 934 if (AvoidUnalignedAccesses) { 935 Label drain, again; 936 // Register order for storing. Order is different for backward copy. 937 938 __ bind(unaligned_copy_long); 939 940 // source address is even aligned, target odd aligned 941 // 942 // when forward copying word pairs we read long pairs at offsets 943 // {0, 2, 4, 6} (in long words). when backwards copying we read 944 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 945 // address by -2 in the forwards case so we can compute the 946 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 947 // or -1. 948 // 949 // when forward copying we need to store 1 word, 3 pairs and 950 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 951 // zero offset We adjust the destination by -1 which means we 952 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 953 // 954 // When backwards copyng we need to store 1 word, 3 pairs and 955 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 956 // offsets {1, 3, 5, 7, 8} * unit. 
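      //
      // For example, in the forward case (unit == 8), after the biasing
      // below (s -= 16, d -= 8) the initial fill's loads at offsets
      // {2, 4, 6, 8} * unit read the original source bytes 0..63, and the
      // first round of stores at {1, 2, 4, 6, 8} * unit writes the original
      // destination bytes 0..63; each pre-indexed access advances s or d by 64.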
957 958 if (direction == copy_forwards) { 959 __ sub(s, s, 16); 960 __ sub(d, d, 8); 961 } 962 963 // Fill 8 registers 964 // 965 // for forwards copy s was offset by -16 from the original input 966 // value of s so the register contents are at these offsets 967 // relative to the 64 bit block addressed by that original input 968 // and so on for each successive 64 byte block when s is updated 969 // 970 // t0 at offset 0, t1 at offset 8 971 // t2 at offset 16, t3 at offset 24 972 // t4 at offset 32, t5 at offset 40 973 // t6 at offset 48, t7 at offset 56 974 975 // for backwards copy s was not offset so the register contents 976 // are at these offsets into the preceding 64 byte block 977 // relative to that original input and so on for each successive 978 // preceding 64 byte block when s is updated. this explains the 979 // slightly counter-intuitive looking pattern of register usage 980 // in the stp instructions for backwards copy. 981 // 982 // t0 at offset -16, t1 at offset -8 983 // t2 at offset -32, t3 at offset -24 984 // t4 at offset -48, t5 at offset -40 985 // t6 at offset -64, t7 at offset -56 986 987 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 988 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 989 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 990 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 991 992 __ subs(count, count, 16); 993 __ br(Assembler::LO, drain); 994 995 int prefetch = PrefetchCopyIntervalInBytes; 996 bool use_stride = false; 997 if (direction == copy_backwards) { 998 use_stride = prefetch > 256; 999 prefetch = -prefetch; 1000 if (use_stride) __ mov(stride, prefetch); 1001 } 1002 1003 __ bind(again); 1004 1005 if (PrefetchCopyIntervalInBytes > 0) 1006 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1007 1008 if (direction == copy_forwards) { 1009 // allowing for the offset of -8 the store instructions place 1010 // registers into the target 64 bit block at the following 1011 // offsets 1012 // 1013 // t0 at offset 0 1014 // t1 at offset 8, t2 at offset 16 1015 // t3 at offset 24, t4 at offset 32 1016 // t5 at offset 40, t6 at offset 48 1017 // t7 at offset 56 1018 1019 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1020 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1021 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1022 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1023 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1024 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1025 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1026 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1027 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1028 } else { 1029 // d was not offset when we started so the registers are 1030 // written into the 64 bit block preceding d with the following 1031 // offsets 1032 // 1033 // t1 at offset -8 1034 // t3 at offset -24, t0 at offset -16 1035 // t5 at offset -48, t2 at offset -32 1036 // t7 at offset -56, t4 at offset -48 1037 // t6 at offset -64 1038 // 1039 // note that this matches the offsets previously noted for the 1040 // loads 1041 1042 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1043 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1044 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1045 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1046 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1047 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1048 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1049 bs.copy_store_at_8(Address(__ pre(d, 
8 * unit)), t6); 1050 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1051 } 1052 1053 __ subs(count, count, 8); 1054 __ br(Assembler::HS, again); 1055 1056 // Drain 1057 // 1058 // this uses the same pattern of offsets and register arguments 1059 // as above 1060 __ bind(drain); 1061 if (direction == copy_forwards) { 1062 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1063 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1064 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1065 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1066 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1067 } else { 1068 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1069 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1070 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1071 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1072 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1073 } 1074 // now we need to copy any remaining part block which may 1075 // include a 4 word block subblock and/or a 2 word subblock. 1076 // bits 2 and 1 in the count are the tell-tale for whether we 1077 // have each such subblock 1078 { 1079 Label L1, L2; 1080 __ tbz(count, exact_log2(4), L1); 1081 // this is the same as above but copying only 4 longs hence 1082 // with only one intervening stp between the str instructions 1083 // but note that the offsets and registers still follow the 1084 // same pattern 1085 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1086 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 1087 if (direction == copy_forwards) { 1088 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1089 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1090 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3); 1091 } else { 1092 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1093 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1094 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2); 1095 } 1096 __ bind(L1); 1097 1098 __ tbz(count, 1, L2); 1099 // this is the same as above but copying only 2 longs hence 1100 // there is no intervening stp between the str instructions 1101 // but note that the offset and register patterns are still 1102 // the same 1103 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit))); 1104 if (direction == copy_forwards) { 1105 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1106 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1); 1107 } else { 1108 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1109 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0); 1110 } 1111 __ bind(L2); 1112 1113 // for forwards copy we need to re-adjust the offsets we 1114 // applied so that s and d are follow the last words written 1115 1116 if (direction == copy_forwards) { 1117 __ add(s, s, 16); 1118 __ add(d, d, 8); 1119 } 1120 1121 } 1122 1123 __ ret(lr); 1124 } 1125 } 1126 1127 // Small copy: less than 16 bytes. 1128 // 1129 // NB: Ignores all of the bits of count which represent more than 15 1130 // bytes, so a caller doesn't have to mask them. 1131 1132 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) { 1133 bool is_backwards = step < 0; 1134 size_t granularity = uabs(step); 1135 int direction = is_backwards ? 
-1 : 1; 1136 1137 Label Lword, Lint, Lshort, Lbyte; 1138 1139 assert(granularity 1140 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1141 1142 const Register t0 = r3; 1143 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1144 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg); 1145 1146 // ??? I don't know if this bit-test-and-branch is the right thing 1147 // to do. It does a lot of jumping, resulting in several 1148 // mispredicted branches. It might make more sense to do this 1149 // with something like Duff's device with a single computed branch. 1150 1151 __ tbz(count, 3 - exact_log2(granularity), Lword); 1152 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1153 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1154 __ bind(Lword); 1155 1156 if (granularity <= sizeof (jint)) { 1157 __ tbz(count, 2 - exact_log2(granularity), Lint); 1158 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1159 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1160 __ bind(Lint); 1161 } 1162 1163 if (granularity <= sizeof (jshort)) { 1164 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1165 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1166 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1167 __ bind(Lshort); 1168 } 1169 1170 if (granularity <= sizeof (jbyte)) { 1171 __ tbz(count, 0, Lbyte); 1172 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1173 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1174 __ bind(Lbyte); 1175 } 1176 } 1177 1178 Label copy_f, copy_b; 1179 Label copy_obj_f, copy_obj_b; 1180 Label copy_obj_uninit_f, copy_obj_uninit_b; 1181 1182 // All-singing all-dancing memory copy. 1183 // 1184 // Copy count units of memory from s to d. The size of a unit is 1185 // step, which can be positive or negative depending on the direction 1186 // of copy. If is_aligned is false, we align the source address. 1187 // 1188 1189 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1190 Register s, Register d, Register count, int step) { 1191 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1192 bool is_backwards = step < 0; 1193 unsigned int granularity = uabs(step); 1194 const Register t0 = r3, t1 = r4; 1195 1196 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1197 // load all the data before writing anything 1198 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1199 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1200 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1201 const Register send = r17, dend = r16; 1202 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1203 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1204 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1205 1206 if (PrefetchCopyIntervalInBytes > 0) 1207 __ prfm(Address(s, 0), PLDL1KEEP); 1208 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The problem is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)): pointers to arrays of jint are at least
      // 4 byte aligned and pointers to arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the 65-80 byte range.
      // For such cases, using a pair of ldp/stp instead of a third pair of
      // ldpq/stpq fixes the performance issue.
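      // For example, a 72-byte jbyte copy (count == 72, granularity == 1)
      // is not sent to copy96 below: the two 32-byte stores cover d[0..63]
      // and the trailing 16-byte ldp/stp covers d[56..71], overlapping the
      // already-written tail.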
1264 if (granularity < sizeof (jint)) { 1265 Label copy96; 1266 __ cmp(count, u1(80/granularity)); 1267 __ br(Assembler::HI, copy96); 1268 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1269 1270 bs.copy_store_at_32(Address(d, 0), v0, v1); 1271 bs.copy_store_at_32(Address(d, 32), v2, v3); 1272 1273 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1274 __ b(finish); 1275 1276 __ bind(copy96); 1277 } 1278 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1279 1280 bs.copy_store_at_32(Address(d, 0), v0, v1); 1281 bs.copy_store_at_32(Address(d, 32), v2, v3); 1282 1283 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1284 } else { 1285 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1286 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1287 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1288 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1289 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1290 1291 bs.copy_store_at_16(Address(d, 0), t0, t1); 1292 bs.copy_store_at_16(Address(d, 16), t2, t3); 1293 bs.copy_store_at_16(Address(d, 32), t4, t5); 1294 bs.copy_store_at_16(Address(d, 48), t6, t7); 1295 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1296 } 1297 __ b(finish); 1298 1299 // 0..16 bytes 1300 __ bind(copy16); 1301 __ cmp(count, u1(8/granularity)); 1302 __ br(Assembler::LO, copy8); 1303 1304 // 8..16 bytes 1305 bs.copy_load_at_8(t0, Address(s, 0)); 1306 bs.copy_load_at_8(t1, Address(send, -8)); 1307 bs.copy_store_at_8(Address(d, 0), t0); 1308 bs.copy_store_at_8(Address(dend, -8), t1); 1309 __ b(finish); 1310 1311 if (granularity < 8) { 1312 // 4..7 bytes 1313 __ bind(copy8); 1314 __ tbz(count, 2 - exact_log2(granularity), copy4); 1315 __ ldrw(t0, Address(s, 0)); 1316 __ ldrw(t1, Address(send, -4)); 1317 __ strw(t0, Address(d, 0)); 1318 __ strw(t1, Address(dend, -4)); 1319 __ b(finish); 1320 if (granularity < 4) { 1321 // 0..3 bytes 1322 __ bind(copy4); 1323 __ cbz(count, finish); // get rid of 0 case 1324 if (granularity == 2) { 1325 __ ldrh(t0, Address(s, 0)); 1326 __ strh(t0, Address(d, 0)); 1327 } else { // granularity == 1 1328 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1329 // the first and last byte. 1330 // Handle the 3 byte case by loading and storing base + count/2 1331 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1332 // This does means in the 1 byte case we load/store the same 1333 // byte 3 times. 1334 __ lsr(count, count, 1); 1335 __ ldrb(t0, Address(s, 0)); 1336 __ ldrb(t1, Address(send, -1)); 1337 __ ldrb(t2, Address(s, count)); 1338 __ strb(t0, Address(d, 0)); 1339 __ strb(t1, Address(dend, -1)); 1340 __ strb(t2, Address(d, count)); 1341 } 1342 __ b(finish); 1343 } 1344 } 1345 1346 __ bind(copy_big); 1347 if (is_backwards) { 1348 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1349 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1350 } 1351 1352 // Now we've got the small case out of the way we can align the 1353 // source address on a 2-word boundary. 1354 1355 // Here we will materialize a count in r15, which is used by copy_memory_small 1356 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1357 // Up until here, we have used t9, which aliases r15, but from here on, that register 1358 // can not be used as a temp register, as it contains the count. 1359 1360 Label aligned; 1361 1362 if (is_aligned) { 1363 // We may have to adjust by 1 word to get s 2-word-aligned. 
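      // (The bulk copy stubs called below copy in 16-byte ldp/stp or
      // 32-byte SIMD chunks, so the source is first brought to a 2-word,
      // i.e. 16-byte, boundary; any odd leading word is copied here on
      // its own.)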
1364 __ tbz(s, exact_log2(wordSize), aligned); 1365 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1366 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1367 __ sub(count, count, wordSize/granularity); 1368 } else { 1369 if (is_backwards) { 1370 __ andr(r15, s, 2 * wordSize - 1); 1371 } else { 1372 __ neg(r15, s); 1373 __ andr(r15, r15, 2 * wordSize - 1); 1374 } 1375 // r15 is the byte adjustment needed to align s. 1376 __ cbz(r15, aligned); 1377 int shift = exact_log2(granularity); 1378 if (shift) __ lsr(r15, r15, shift); 1379 __ sub(count, count, r15); 1380 1381 #if 0 1382 // ?? This code is only correct for a disjoint copy. It may or 1383 // may not make sense to use it in that case. 1384 1385 // Copy the first pair; s and d may not be aligned. 1386 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1387 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1388 1389 // Align s and d, adjust count 1390 if (is_backwards) { 1391 __ sub(s, s, r15); 1392 __ sub(d, d, r15); 1393 } else { 1394 __ add(s, s, r15); 1395 __ add(d, d, r15); 1396 } 1397 #else 1398 copy_memory_small(decorators, type, s, d, r15, step); 1399 #endif 1400 } 1401 1402 __ bind(aligned); 1403 1404 // s is now 2-word-aligned. 1405 1406 // We have a count of units and some trailing bytes. Adjust the 1407 // count and do a bulk copy of words. 1408 __ lsr(r15, count, exact_log2(wordSize/granularity)); 1409 if (direction == copy_forwards) { 1410 if (type != T_OBJECT) { 1411 __ bl(copy_f); 1412 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1413 __ bl(copy_obj_uninit_f); 1414 } else { 1415 __ bl(copy_obj_f); 1416 } 1417 } else { 1418 if (type != T_OBJECT) { 1419 __ bl(copy_b); 1420 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1421 __ bl(copy_obj_uninit_b); 1422 } else { 1423 __ bl(copy_obj_b); 1424 } 1425 } 1426 1427 // And the tail. 1428 copy_memory_small(decorators, type, s, d, count, step); 1429 1430 if (granularity >= 8) __ bind(copy8); 1431 if (granularity >= 4) __ bind(copy4); 1432 __ bind(finish); 1433 } 1434 1435 1436 void clobber_registers() { 1437 #ifdef ASSERT 1438 RegSet clobbered 1439 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1440 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1441 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1442 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1443 __ mov(*it, rscratch1); 1444 } 1445 #endif 1446 1447 } 1448 1449 // Scan over array at a for count oops, verifying each one. 1450 // Preserves a and count, clobbers rscratch1 and rscratch2. 
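  // 'size' is the element size in bytes: wordSize for uncompressed oops,
  // 4 (sizeof(jint)) for compressed oops; narrow oops are decoded via
  // decode_heap_oop, which performs the verification.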
1451 void verify_oop_array (int size, Register a, Register count, Register temp) { 1452 Label loop, end; 1453 __ mov(rscratch1, a); 1454 __ mov(rscratch2, zr); 1455 __ bind(loop); 1456 __ cmp(rscratch2, count); 1457 __ br(Assembler::HS, end); 1458 if (size == wordSize) { 1459 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1460 __ verify_oop(temp); 1461 } else { 1462 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1463 __ decode_heap_oop(temp); // calls verify_oop 1464 } 1465 __ add(rscratch2, rscratch2, 1); 1466 __ b(loop); 1467 __ bind(end); 1468 } 1469 1470 // Arguments: 1471 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1472 // ignored 1473 // is_oop - true => oop array, so generate store check code 1474 // name - stub name string 1475 // 1476 // Inputs: 1477 // c_rarg0 - source array address 1478 // c_rarg1 - destination array address 1479 // c_rarg2 - element count, treated as ssize_t, can be zero 1480 // 1481 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1482 // the hardware handle it. The two dwords within qwords that span 1483 // cache line boundaries will still be loaded and stored atomically. 1484 // 1485 // Side Effects: 1486 // disjoint_int_copy_entry is set to the no-overlap entry point 1487 // used by generate_conjoint_int_oop_copy(). 1488 // 1489 address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry, 1490 const char *name, bool dest_uninitialized = false) { 1491 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1492 RegSet saved_reg = RegSet::of(s, d, count); 1493 __ align(CodeEntryAlignment); 1494 StubCodeMark mark(this, "StubRoutines", name); 1495 address start = __ pc(); 1496 __ enter(); 1497 1498 if (entry != nullptr) { 1499 *entry = __ pc(); 1500 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1501 BLOCK_COMMENT("Entry:"); 1502 } 1503 1504 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1505 if (dest_uninitialized) { 1506 decorators |= IS_DEST_UNINITIALIZED; 1507 } 1508 if (aligned) { 1509 decorators |= ARRAYCOPY_ALIGNED; 1510 } 1511 1512 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1513 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1514 1515 if (is_oop) { 1516 // save regs before copy_memory 1517 __ push(RegSet::of(d, count), sp); 1518 } 1519 { 1520 // UnsafeMemoryAccess page error: continue after unsafe access 1521 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1522 UnsafeMemoryAccessMark umam(this, add_entry, true); 1523 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1524 } 1525 1526 if (is_oop) { 1527 __ pop(RegSet::of(d, count), sp); 1528 if (VerifyOops) 1529 verify_oop_array(size, d, count, r16); 1530 } 1531 1532 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1533 1534 __ leave(); 1535 __ mov(r0, zr); // return 0 1536 __ ret(lr); 1537 return start; 1538 } 1539 1540 // Arguments: 1541 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1542 // ignored 1543 // is_oop - true => oop array, so generate store check code 1544 // name - stub name string 1545 // 1546 // Inputs: 1547 // c_rarg0 - source array address 1548 // c_rarg1 - destination array address 1549 // c_rarg2 - element count, treated as ssize_t, can be zero 1550 // 1551 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1552 // the hardware handle it. 
The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
1633 // 1634 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1635 const bool not_oop = false; 1636 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1637 } 1638 1639 // Arguments: 1640 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1641 // ignored 1642 // name - stub name string 1643 // 1644 // Inputs: 1645 // c_rarg0 - source array address 1646 // c_rarg1 - destination array address 1647 // c_rarg2 - element count, treated as ssize_t, can be zero 1648 // 1649 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1650 // we let the hardware handle it. The one to eight bytes within words, 1651 // dwords or qwords that span cache line boundaries will still be loaded 1652 // and stored atomically. 1653 // 1654 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1655 address* entry, const char *name) { 1656 const bool not_oop = false; 1657 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1658 } 1659 1660 // Arguments: 1661 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1662 // ignored 1663 // name - stub name string 1664 // 1665 // Inputs: 1666 // c_rarg0 - source array address 1667 // c_rarg1 - destination array address 1668 // c_rarg2 - element count, treated as ssize_t, can be zero 1669 // 1670 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1671 // let the hardware handle it. The two or four words within dwords 1672 // or qwords that span cache line boundaries will still be loaded 1673 // and stored atomically. 1674 // 1675 // Side Effects: 1676 // disjoint_short_copy_entry is set to the no-overlap entry point 1677 // used by generate_conjoint_short_copy(). 1678 // 1679 address generate_disjoint_short_copy(bool aligned, 1680 address* entry, const char *name) { 1681 const bool not_oop = false; 1682 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1683 } 1684 1685 // Arguments: 1686 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1687 // ignored 1688 // name - stub name string 1689 // 1690 // Inputs: 1691 // c_rarg0 - source array address 1692 // c_rarg1 - destination array address 1693 // c_rarg2 - element count, treated as ssize_t, can be zero 1694 // 1695 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1696 // let the hardware handle it. The two or four words within dwords 1697 // or qwords that span cache line boundaries will still be loaded 1698 // and stored atomically. 1699 // 1700 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1701 address *entry, const char *name) { 1702 const bool not_oop = false; 1703 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1704 1705 } 1706 // Arguments: 1707 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1708 // ignored 1709 // name - stub name string 1710 // 1711 // Inputs: 1712 // c_rarg0 - source array address 1713 // c_rarg1 - destination array address 1714 // c_rarg2 - element count, treated as ssize_t, can be zero 1715 // 1716 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1717 // the hardware handle it. The two dwords within qwords that span 1718 // cache line boundaries will still be loaded and stored atomically. 
1719 // 1720 // Side Effects: 1721 // disjoint_int_copy_entry is set to the no-overlap entry point 1722 // used by generate_conjoint_int_oop_copy(). 1723 // 1724 address generate_disjoint_int_copy(bool aligned, address *entry, 1725 const char *name, bool dest_uninitialized = false) { 1726 const bool not_oop = false; 1727 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1728 } 1729 1730 // Arguments: 1731 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1732 // ignored 1733 // name - stub name string 1734 // 1735 // Inputs: 1736 // c_rarg0 - source array address 1737 // c_rarg1 - destination array address 1738 // c_rarg2 - element count, treated as ssize_t, can be zero 1739 // 1740 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1741 // the hardware handle it. The two dwords within qwords that span 1742 // cache line boundaries will still be loaded and stored atomically. 1743 // 1744 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1745 address *entry, const char *name, 1746 bool dest_uninitialized = false) { 1747 const bool not_oop = false; 1748 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1749 } 1750 1751 1752 // Arguments: 1753 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1754 // ignored 1755 // name - stub name string 1756 // 1757 // Inputs: 1758 // c_rarg0 - source array address 1759 // c_rarg1 - destination array address 1760 // c_rarg2 - element count, treated as size_t, can be zero 1761 // 1762 // Side Effects: 1763 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1764 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1765 // 1766 address generate_disjoint_long_copy(bool aligned, address *entry, 1767 const char *name, bool dest_uninitialized = false) { 1768 const bool not_oop = false; 1769 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1770 } 1771 1772 // Arguments: 1773 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1774 // ignored 1775 // name - stub name string 1776 // 1777 // Inputs: 1778 // c_rarg0 - source array address 1779 // c_rarg1 - destination array address 1780 // c_rarg2 - element count, treated as size_t, can be zero 1781 // 1782 address generate_conjoint_long_copy(bool aligned, 1783 address nooverlap_target, address *entry, 1784 const char *name, bool dest_uninitialized = false) { 1785 const bool not_oop = false; 1786 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1787 } 1788 1789 // Arguments: 1790 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1791 // ignored 1792 // name - stub name string 1793 // 1794 // Inputs: 1795 // c_rarg0 - source array address 1796 // c_rarg1 - destination array address 1797 // c_rarg2 - element count, treated as size_t, can be zero 1798 // 1799 // Side Effects: 1800 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1801 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1802 // 1803 address generate_disjoint_oop_copy(bool aligned, address *entry, 1804 const char *name, bool dest_uninitialized) { 1805 const bool is_oop = true; 1806 const int size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1807 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1808 } 1809 1810 // Arguments: 1811 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1812 // ignored 1813 // name - stub name string 1814 // 1815 // Inputs: 1816 // c_rarg0 - source array address 1817 // c_rarg1 - destination array address 1818 // c_rarg2 - element count, treated as size_t, can be zero 1819 // 1820 address generate_conjoint_oop_copy(bool aligned, 1821 address nooverlap_target, address *entry, 1822 const char *name, bool dest_uninitialized) { 1823 const bool is_oop = true; 1824 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1825 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1826 name, dest_uninitialized); 1827 } 1828 1829 1830 // Helper for generating a dynamic type check. 1831 // Smashes rscratch1, rscratch2. 1832 void generate_type_check(Register sub_klass, 1833 Register super_check_offset, 1834 Register super_klass, 1835 Label& L_success) { 1836 assert_different_registers(sub_klass, super_check_offset, super_klass); 1837 1838 BLOCK_COMMENT("type_check:"); 1839 1840 Label L_miss; 1841 1842 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1843 super_check_offset); 1844 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr); 1845 1846 // Fall through on failure! 1847 __ BIND(L_miss); 1848 } 1849 1850 // 1851 // Generate checkcasting array copy stub 1852 // 1853 // Input: 1854 // c_rarg0 - source array address 1855 // c_rarg1 - destination array address 1856 // c_rarg2 - element count, treated as ssize_t, can be zero 1857 // c_rarg3 - size_t ckoff (super_check_offset) 1858 // c_rarg4 - oop ckval (super_klass) 1859 // 1860 // Output: 1861 // r0 == 0 - success 1862 // r0 == -1^K - failure, where K is partial transfer count 1863 // 1864 address generate_checkcast_copy(const char *name, address *entry, 1865 bool dest_uninitialized = false) { 1866 1867 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1868 1869 // Input registers (after setup_arg_regs) 1870 const Register from = c_rarg0; // source array address 1871 const Register to = c_rarg1; // destination array address 1872 const Register count = c_rarg2; // elementscount 1873 const Register ckoff = c_rarg3; // super_check_offset 1874 const Register ckval = c_rarg4; // super_klass 1875 1876 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1877 RegSet wb_post_saved_regs = RegSet::of(count); 1878 1879 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1880 const Register copied_oop = r22; // actual oop copied 1881 const Register count_save = r21; // orig elementscount 1882 const Register start_to = r20; // destination array start address 1883 const Register r19_klass = r19; // oop._klass 1884 1885 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1886 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1887 1888 //--------------------------------------------------------------- 1889 // Assembler stub will be used for this call to arraycopy 1890 // if the two arrays are subtypes of Object[] but the 1891 // destination array type is not equal to or a supertype 1892 // of the source type. Each element must be separately 1893 // checked. 
1894 1895 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1896 copied_oop, r19_klass, count_save); 1897 1898 __ align(CodeEntryAlignment); 1899 StubCodeMark mark(this, "StubRoutines", name); 1900 address start = __ pc(); 1901 1902 __ enter(); // required for proper stackwalking of RuntimeStub frame 1903 1904 #ifdef ASSERT 1905 // caller guarantees that the arrays really are different 1906 // otherwise, we would have to make conjoint checks 1907 { Label L; 1908 __ b(L); // conjoint check not yet implemented 1909 __ stop("checkcast_copy within a single array"); 1910 __ bind(L); 1911 } 1912 #endif //ASSERT 1913 1914 // Caller of this entry point must set up the argument registers. 1915 if (entry != nullptr) { 1916 *entry = __ pc(); 1917 BLOCK_COMMENT("Entry:"); 1918 } 1919 1920 // Empty array: Nothing to do. 1921 __ cbz(count, L_done); 1922 __ push(RegSet::of(r19, r20, r21, r22), sp); 1923 1924 #ifdef ASSERT 1925 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1926 // The ckoff and ckval must be mutually consistent, 1927 // even though caller generates both. 1928 { Label L; 1929 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1930 __ ldrw(start_to, Address(ckval, sco_offset)); 1931 __ cmpw(ckoff, start_to); 1932 __ br(Assembler::EQ, L); 1933 __ stop("super_check_offset inconsistent"); 1934 __ bind(L); 1935 } 1936 #endif //ASSERT 1937 1938 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1939 bool is_oop = true; 1940 int element_size = UseCompressedOops ? 4 : 8; 1941 if (dest_uninitialized) { 1942 decorators |= IS_DEST_UNINITIALIZED; 1943 } 1944 1945 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1946 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1947 1948 // save the original count 1949 __ mov(count_save, count); 1950 1951 // Copy from low to high addresses 1952 __ mov(start_to, to); // Save destination array start address 1953 __ b(L_load_element); 1954 1955 // ======== begin loop ======== 1956 // (Loop is rotated; its entry is L_load_element.) 1957 // Loop control: 1958 // for (; count != 0; count--) { 1959 // copied_oop = load_heap_oop(from++); 1960 // ... generate_type_check ...; 1961 // store_heap_oop(to++, copied_oop); 1962 // } 1963 __ align(OptoLoopAlignment); 1964 1965 __ BIND(L_store_element); 1966 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1967 __ post(to, element_size), copied_oop, noreg, 1968 gct1, gct2, gct3); 1969 __ sub(count, count, 1); 1970 __ cbz(count, L_do_card_marks); 1971 1972 // ======== loop entry is here ======== 1973 __ BIND(L_load_element); 1974 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1975 copied_oop, noreg, __ post(from, element_size), 1976 gct1); 1977 __ cbz(copied_oop, L_store_element); 1978 1979 __ load_klass(r19_klass, copied_oop);// query the object klass 1980 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1981 // ======== end loop ======== 1982 1983 // It was a real error; we must depend on the caller to finish the job. 1984 // Register count = remaining oops, count_orig = total oops. 1985 // Emit GC store barriers for the oops we have copied and report 1986 // their number to the caller. 
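    // Worked example of the reporting convention (illustration only, the
    // numbers are invented): with count_save == 10 and 4 oops copied before
    // a failing type check, count (remaining) == 6, so
    //   subs gives count = count_save - count = 4     (= K)
    //   eon  gives count = -1 ^ K              = -5
    // and the caller recovers K as ~r0.  A fully successful copy instead
    // returns r0 == 0 via L_done.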
    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);              // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r19, r20, r21, r22), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length()) FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
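  //
  // Roughly, the dispatch below is (a sketch for illustration only, not the
  // emitted code; 'bits' is a name invented here):
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if ((bits & (BytesPerLong - 1)) == 0) { count >>= LogBytesPerLong;  goto long_copy;  }
  //   if ((bits & (BytesPerInt - 1))  == 0) { count >>= LogBytesPerInt;   goto int_copy;   }
  //   if ((bits & 1) == 0)                  { count >>= LogBytesPerShort; goto short_copy; }
  //   goto byte_copy;   // byte count used unscaled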
2060 // 2061 address generate_unsafe_copy(const char *name, 2062 address byte_copy_entry, 2063 address short_copy_entry, 2064 address int_copy_entry, 2065 address long_copy_entry) { 2066 Label L_long_aligned, L_int_aligned, L_short_aligned; 2067 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2068 2069 __ align(CodeEntryAlignment); 2070 StubCodeMark mark(this, "StubRoutines", name); 2071 address start = __ pc(); 2072 __ enter(); // required for proper stackwalking of RuntimeStub frame 2073 2074 // bump this on entry, not on exit: 2075 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2076 2077 __ orr(rscratch1, s, d); 2078 __ orr(rscratch1, rscratch1, count); 2079 2080 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2081 __ cbz(rscratch1, L_long_aligned); 2082 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2083 __ cbz(rscratch1, L_int_aligned); 2084 __ tbz(rscratch1, 0, L_short_aligned); 2085 __ b(RuntimeAddress(byte_copy_entry)); 2086 2087 __ BIND(L_short_aligned); 2088 __ lsr(count, count, LogBytesPerShort); // size => short_count 2089 __ b(RuntimeAddress(short_copy_entry)); 2090 __ BIND(L_int_aligned); 2091 __ lsr(count, count, LogBytesPerInt); // size => int_count 2092 __ b(RuntimeAddress(int_copy_entry)); 2093 __ BIND(L_long_aligned); 2094 __ lsr(count, count, LogBytesPerLong); // size => long_count 2095 __ b(RuntimeAddress(long_copy_entry)); 2096 2097 return start; 2098 } 2099 2100 // 2101 // Generate generic array copy stubs 2102 // 2103 // Input: 2104 // c_rarg0 - src oop 2105 // c_rarg1 - src_pos (32-bits) 2106 // c_rarg2 - dst oop 2107 // c_rarg3 - dst_pos (32-bits) 2108 // c_rarg4 - element count (32-bits) 2109 // 2110 // Output: 2111 // r0 == 0 - success 2112 // r0 == -1^K - failure, where K is partial transfer count 2113 // 2114 address generate_generic_copy(const char *name, 2115 address byte_copy_entry, address short_copy_entry, 2116 address int_copy_entry, address oop_copy_entry, 2117 address long_copy_entry, address checkcast_copy_entry) { 2118 2119 Label L_failed, L_objArray; 2120 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2121 2122 // Input registers 2123 const Register src = c_rarg0; // source array oop 2124 const Register src_pos = c_rarg1; // source position 2125 const Register dst = c_rarg2; // destination array oop 2126 const Register dst_pos = c_rarg3; // destination position 2127 const Register length = c_rarg4; 2128 2129 2130 // Registers used as temps 2131 const Register dst_klass = c_rarg5; 2132 2133 __ align(CodeEntryAlignment); 2134 2135 StubCodeMark mark(this, "StubRoutines", name); 2136 2137 address start = __ pc(); 2138 2139 __ enter(); // required for proper stackwalking of RuntimeStub frame 2140 2141 // bump this on entry, not on exit: 2142 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2143 2144 //----------------------------------------------------------------------- 2145 // Assembler stub will be used for this call to arraycopy 2146 // if the following conditions are met: 2147 // 2148 // (1) src and dst must not be null. 2149 // (2) src_pos must not be negative. 2150 // (3) dst_pos must not be negative. 2151 // (4) length must not be negative. 2152 // (5) src klass and dst klass should be the same and not null. 2153 // (6) src and dst should be arrays. 2154 // (7) src_pos + length must not exceed length of src. 2155 // (8) dst_pos + length must not exceed length of dst. 
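    //
    // Taken together, the null, sign and bounds checks emitted below amount
    // to roughly (a sketch for illustration only):
    //
    //   if (src == nullptr || dst == nullptr)          return -1;
    //   if (src_pos < 0 || dst_pos < 0 || length < 0)  return -1;
    //   if (src_pos + length > src->length() ||
    //       dst_pos + length > dst->length())          return -1;
    //
    // with the klass and array-type checks routing objArrays to the plain
    // or checkcast oop copy and rejecting mismatched typeArrays.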
2156 // 2157 2158 // if (src == nullptr) return -1; 2159 __ cbz(src, L_failed); 2160 2161 // if (src_pos < 0) return -1; 2162 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2163 2164 // if (dst == nullptr) return -1; 2165 __ cbz(dst, L_failed); 2166 2167 // if (dst_pos < 0) return -1; 2168 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2169 2170 // registers used as temp 2171 const Register scratch_length = r16; // elements count to copy 2172 const Register scratch_src_klass = r17; // array klass 2173 const Register lh = r15; // layout helper 2174 2175 // if (length < 0) return -1; 2176 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2177 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2178 2179 __ load_klass(scratch_src_klass, src); 2180 #ifdef ASSERT 2181 // assert(src->klass() != nullptr); 2182 { 2183 BLOCK_COMMENT("assert klasses not null {"); 2184 Label L1, L2; 2185 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2186 __ bind(L1); 2187 __ stop("broken null klass"); 2188 __ bind(L2); 2189 __ load_klass(rscratch1, dst); 2190 __ cbz(rscratch1, L1); // this would be broken also 2191 BLOCK_COMMENT("} assert klasses not null done"); 2192 } 2193 #endif 2194 2195 // Load layout helper (32-bits) 2196 // 2197 // |array_tag| | header_size | element_type | |log2_element_size| 2198 // 32 30 24 16 8 2 0 2199 // 2200 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2201 // 2202 2203 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2204 2205 // Handle objArrays completely differently... 2206 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2207 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2208 __ movw(rscratch1, objArray_lh); 2209 __ eorw(rscratch2, lh, rscratch1); 2210 __ cbzw(rscratch2, L_objArray); 2211 2212 // if (src->klass() != dst->klass()) return -1; 2213 __ load_klass(rscratch2, dst); 2214 __ eor(rscratch2, rscratch2, scratch_src_klass); 2215 __ cbnz(rscratch2, L_failed); 2216 2217 // if (!src->is_Array()) return -1; 2218 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2219 2220 // At this point, it is known to be a typeArray (array_tag 0x3). 
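    // The TypeArrayKlass path below then dispatches on log2(element size)
    // taken from the layout helper, roughly (a sketch for illustration only):
    //
    //   switch (lh & Klass::_lh_log2_element_size_mask) {   // values 0..3
    //     case 0: goto byte_copy;
    //     case 1: goto short_copy;
    //     case 2: goto int_copy;
    //     case 3: goto long_copy;
    //   }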
2221 #ifdef ASSERT 2222 { 2223 BLOCK_COMMENT("assert primitive array {"); 2224 Label L; 2225 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2226 __ cmpw(lh, rscratch2); 2227 __ br(Assembler::GE, L); 2228 __ stop("must be a primitive array"); 2229 __ bind(L); 2230 BLOCK_COMMENT("} assert primitive array done"); 2231 } 2232 #endif 2233 2234 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2235 rscratch2, L_failed); 2236 2237 // TypeArrayKlass 2238 // 2239 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2240 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2241 // 2242 2243 const Register rscratch1_offset = rscratch1; // array offset 2244 const Register r15_elsize = lh; // element size 2245 2246 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2247 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2248 __ add(src, src, rscratch1_offset); // src array offset 2249 __ add(dst, dst, rscratch1_offset); // dst array offset 2250 BLOCK_COMMENT("choose copy loop based on element size"); 2251 2252 // next registers should be set before the jump to corresponding stub 2253 const Register from = c_rarg0; // source array address 2254 const Register to = c_rarg1; // destination array address 2255 const Register count = c_rarg2; // elements count 2256 2257 // 'from', 'to', 'count' registers should be set in such order 2258 // since they are the same as 'src', 'src_pos', 'dst'. 2259 2260 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2261 2262 // The possible values of elsize are 0-3, i.e. exact_log2(element 2263 // size in bytes). We do a simple bitwise binary search. 2264 __ BIND(L_copy_bytes); 2265 __ tbnz(r15_elsize, 1, L_copy_ints); 2266 __ tbnz(r15_elsize, 0, L_copy_shorts); 2267 __ lea(from, Address(src, src_pos));// src_addr 2268 __ lea(to, Address(dst, dst_pos));// dst_addr 2269 __ movw(count, scratch_length); // length 2270 __ b(RuntimeAddress(byte_copy_entry)); 2271 2272 __ BIND(L_copy_shorts); 2273 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2274 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2275 __ movw(count, scratch_length); // length 2276 __ b(RuntimeAddress(short_copy_entry)); 2277 2278 __ BIND(L_copy_ints); 2279 __ tbnz(r15_elsize, 0, L_copy_longs); 2280 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2281 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2282 __ movw(count, scratch_length); // length 2283 __ b(RuntimeAddress(int_copy_entry)); 2284 2285 __ BIND(L_copy_longs); 2286 #ifdef ASSERT 2287 { 2288 BLOCK_COMMENT("assert long copy {"); 2289 Label L; 2290 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2291 __ cmpw(r15_elsize, LogBytesPerLong); 2292 __ br(Assembler::EQ, L); 2293 __ stop("must be long copy, but elsize is wrong"); 2294 __ bind(L); 2295 BLOCK_COMMENT("} assert long copy done"); 2296 } 2297 #endif 2298 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2299 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2300 __ movw(count, scratch_length); // length 2301 __ b(RuntimeAddress(long_copy_entry)); 2302 2303 // ObjArrayKlass 2304 __ BIND(L_objArray); 2305 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2306 2307 Label L_plain_copy, L_checkcast_copy; 2308 // test array classes for subtyping 2309 __ load_klass(r15, dst); 2310 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2311 __ br(Assembler::NE, L_checkcast_copy); 2312 2313 // Identically typed arrays can be copied without element-wise checks. 2314 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2315 rscratch2, L_failed); 2316 2317 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2318 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2319 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2320 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2321 __ movw(count, scratch_length); // length 2322 __ BIND(L_plain_copy); 2323 __ b(RuntimeAddress(oop_copy_entry)); 2324 2325 __ BIND(L_checkcast_copy); 2326 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2327 { 2328 // Before looking at dst.length, make sure dst is also an objArray. 2329 __ ldrw(rscratch1, Address(r15, lh_offset)); 2330 __ movw(rscratch2, objArray_lh); 2331 __ eorw(rscratch1, rscratch1, rscratch2); 2332 __ cbnzw(rscratch1, L_failed); 2333 2334 // It is safe to examine both src.length and dst.length. 2335 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2336 r15, L_failed); 2337 2338 __ load_klass(dst_klass, dst); // reload 2339 2340 // Marshal the base address arguments now, freeing registers. 2341 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2342 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2343 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2344 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2345 __ movw(count, length); // length (reloaded) 2346 Register sco_temp = c_rarg3; // this register is free now 2347 assert_different_registers(from, to, count, sco_temp, 2348 dst_klass, scratch_src_klass); 2349 // assert_clean_int(count, sco_temp); 2350 2351 // Generate the type check. 2352 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2353 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2354 2355 // Smashes rscratch1, rscratch2 2356 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2357 2358 // Fetch destination element klass from the ObjArrayKlass header. 2359 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2360 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2361 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2362 2363 // the checkcast_copy loop needs two extra arguments: 2364 assert(c_rarg3 == sco_temp, "#3 already in place"); 2365 // Set up arguments for checkcast_copy_entry. 2366 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2367 __ b(RuntimeAddress(checkcast_copy_entry)); 2368 } 2369 2370 __ BIND(L_failed); 2371 __ mov(r0, -1); 2372 __ leave(); // required for proper stackwalking of RuntimeStub frame 2373 __ ret(lr); 2374 2375 return start; 2376 } 2377 2378 // 2379 // Generate stub for array fill. If "aligned" is true, the 2380 // "to" address is assumed to be heapword aligned. 
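  //
  // Overall shape of the generated code (a sketch for illustration only):
  //
  //   replicate 'value' up to 64 bits;
  //   if (count is small)                 // fewer than 8 bytes of elements
  //     store the elements one by one and return;
  //   store single elements until 'to' is 8-byte aligned;
  //   fill whole 8-byte words in bulk (zero_words() when the value is zero
  //   and UseBlockZeroing is enabled, fill_words() otherwise);
  //   finish with one possibly-overlapping 8-byte store for the tail.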
2381 // 2382 // Arguments for generated stub: 2383 // to: c_rarg0 2384 // value: c_rarg1 2385 // count: c_rarg2 treated as signed 2386 // 2387 address generate_fill(BasicType t, bool aligned, const char *name) { 2388 __ align(CodeEntryAlignment); 2389 StubCodeMark mark(this, "StubRoutines", name); 2390 address start = __ pc(); 2391 2392 BLOCK_COMMENT("Entry:"); 2393 2394 const Register to = c_rarg0; // source array address 2395 const Register value = c_rarg1; // value 2396 const Register count = c_rarg2; // elements count 2397 2398 const Register bz_base = r10; // base for block_zero routine 2399 const Register cnt_words = r11; // temp register 2400 2401 __ enter(); 2402 2403 Label L_fill_elements, L_exit1; 2404 2405 int shift = -1; 2406 switch (t) { 2407 case T_BYTE: 2408 shift = 0; 2409 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2410 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2411 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2412 __ br(Assembler::LO, L_fill_elements); 2413 break; 2414 case T_SHORT: 2415 shift = 1; 2416 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2417 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2418 __ br(Assembler::LO, L_fill_elements); 2419 break; 2420 case T_INT: 2421 shift = 2; 2422 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2423 __ br(Assembler::LO, L_fill_elements); 2424 break; 2425 default: ShouldNotReachHere(); 2426 } 2427 2428 // Align source address at 8 bytes address boundary. 2429 Label L_skip_align1, L_skip_align2, L_skip_align4; 2430 if (!aligned) { 2431 switch (t) { 2432 case T_BYTE: 2433 // One byte misalignment happens only for byte arrays. 2434 __ tbz(to, 0, L_skip_align1); 2435 __ strb(value, Address(__ post(to, 1))); 2436 __ subw(count, count, 1); 2437 __ bind(L_skip_align1); 2438 // Fallthrough 2439 case T_SHORT: 2440 // Two bytes misalignment happens only for byte and short (char) arrays. 2441 __ tbz(to, 1, L_skip_align2); 2442 __ strh(value, Address(__ post(to, 2))); 2443 __ subw(count, count, 2 >> shift); 2444 __ bind(L_skip_align2); 2445 // Fallthrough 2446 case T_INT: 2447 // Align to 8 bytes, we know we are 4 byte aligned to start. 2448 __ tbz(to, 2, L_skip_align4); 2449 __ strw(value, Address(__ post(to, 4))); 2450 __ subw(count, count, 4 >> shift); 2451 __ bind(L_skip_align4); 2452 break; 2453 default: ShouldNotReachHere(); 2454 } 2455 } 2456 2457 // 2458 // Fill large chunks 2459 // 2460 __ lsrw(cnt_words, count, 3 - shift); // number of words 2461 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2462 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2463 if (UseBlockZeroing) { 2464 Label non_block_zeroing, rest; 2465 // If the fill value is zero we can use the fast zero_words(). 2466 __ cbnz(value, non_block_zeroing); 2467 __ mov(bz_base, to); 2468 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2469 address tpc = __ zero_words(bz_base, cnt_words); 2470 if (tpc == nullptr) { 2471 fatal("CodeCache is full at generate_fill"); 2472 } 2473 __ b(rest); 2474 __ bind(non_block_zeroing); 2475 __ fill_words(to, cnt_words, value); 2476 __ bind(rest); 2477 } else { 2478 __ fill_words(to, cnt_words, value); 2479 } 2480 2481 // Remaining count is less than 8 bytes. Fill it by a single store. 2482 // Note that the total length is no less than 8 bytes. 
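    // For illustration (a hypothetical byte fill): if 3 bytes remain, 'to'
    // is advanced to the end of the array and the 8-byte store at
    // Address(to, -8) rewrites those final 3 bytes plus 5 bytes that were
    // already filled with the same value, so the result is unchanged.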
2483 if (t == T_BYTE || t == T_SHORT) { 2484 Label L_exit1; 2485 __ cbzw(count, L_exit1); 2486 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2487 __ str(value, Address(to, -8)); // overwrite some elements 2488 __ bind(L_exit1); 2489 __ leave(); 2490 __ ret(lr); 2491 } 2492 2493 // Handle copies less than 8 bytes. 2494 Label L_fill_2, L_fill_4, L_exit2; 2495 __ bind(L_fill_elements); 2496 switch (t) { 2497 case T_BYTE: 2498 __ tbz(count, 0, L_fill_2); 2499 __ strb(value, Address(__ post(to, 1))); 2500 __ bind(L_fill_2); 2501 __ tbz(count, 1, L_fill_4); 2502 __ strh(value, Address(__ post(to, 2))); 2503 __ bind(L_fill_4); 2504 __ tbz(count, 2, L_exit2); 2505 __ strw(value, Address(to)); 2506 break; 2507 case T_SHORT: 2508 __ tbz(count, 0, L_fill_4); 2509 __ strh(value, Address(__ post(to, 2))); 2510 __ bind(L_fill_4); 2511 __ tbz(count, 1, L_exit2); 2512 __ strw(value, Address(to)); 2513 break; 2514 case T_INT: 2515 __ cbzw(count, L_exit2); 2516 __ strw(value, Address(to)); 2517 break; 2518 default: ShouldNotReachHere(); 2519 } 2520 __ bind(L_exit2); 2521 __ leave(); 2522 __ ret(lr); 2523 return start; 2524 } 2525 2526 address generate_data_cache_writeback() { 2527 const Register line = c_rarg0; // address of line to write back 2528 2529 __ align(CodeEntryAlignment); 2530 2531 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2532 2533 address start = __ pc(); 2534 __ enter(); 2535 __ cache_wb(Address(line, 0)); 2536 __ leave(); 2537 __ ret(lr); 2538 2539 return start; 2540 } 2541 2542 address generate_data_cache_writeback_sync() { 2543 const Register is_pre = c_rarg0; // pre or post sync 2544 2545 __ align(CodeEntryAlignment); 2546 2547 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2548 2549 // pre wbsync is a no-op 2550 // post wbsync translates to an sfence 2551 2552 Label skip; 2553 address start = __ pc(); 2554 __ enter(); 2555 __ cbnz(is_pre, skip); 2556 __ cache_wbsync(false); 2557 __ bind(skip); 2558 __ leave(); 2559 __ ret(lr); 2560 2561 return start; 2562 } 2563 2564 void generate_arraycopy_stubs() { 2565 address entry; 2566 address entry_jbyte_arraycopy; 2567 address entry_jshort_arraycopy; 2568 address entry_jint_arraycopy; 2569 address entry_oop_arraycopy; 2570 address entry_jlong_arraycopy; 2571 address entry_checkcast_arraycopy; 2572 2573 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards); 2574 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards); 2575 2576 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards); 2577 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards); 2578 2579 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards); 2580 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards); 2581 2582 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2583 2584 //*** jbyte 2585 // Always need aligned and unaligned versions 2586 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2587 "jbyte_disjoint_arraycopy"); 2588 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2589 &entry_jbyte_arraycopy, 2590 "jbyte_arraycopy"); 2591 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2592 "arrayof_jbyte_disjoint_arraycopy"); 2593 
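    // The '&entry' out-parameter filled in by each disjoint stub is the
    // address passed to the matching conjoint stub as its nooverlap_target,
    // i.e. the entry the conjoint code branches to when a forward copy is
    // safe (see generate_conjoint_copy() above).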
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2594 "arrayof_jbyte_arraycopy"); 2595 2596 //*** jshort 2597 // Always need aligned and unaligned versions 2598 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2599 "jshort_disjoint_arraycopy"); 2600 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2601 &entry_jshort_arraycopy, 2602 "jshort_arraycopy"); 2603 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2604 "arrayof_jshort_disjoint_arraycopy"); 2605 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2606 "arrayof_jshort_arraycopy"); 2607 2608 //*** jint 2609 // Aligned versions 2610 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2611 "arrayof_jint_disjoint_arraycopy"); 2612 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2613 "arrayof_jint_arraycopy"); 2614 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2615 // entry_jint_arraycopy always points to the unaligned version 2616 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2617 "jint_disjoint_arraycopy"); 2618 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2619 &entry_jint_arraycopy, 2620 "jint_arraycopy"); 2621 2622 //*** jlong 2623 // It is always aligned 2624 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2625 "arrayof_jlong_disjoint_arraycopy"); 2626 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2627 "arrayof_jlong_arraycopy"); 2628 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2629 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2630 2631 //*** oops 2632 { 2633 // With compressed oops we need unaligned versions; notice that 2634 // we overwrite entry_oop_arraycopy. 
2635 bool aligned = !UseCompressedOops; 2636 2637 StubRoutines::_arrayof_oop_disjoint_arraycopy 2638 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2639 /*dest_uninitialized*/false); 2640 StubRoutines::_arrayof_oop_arraycopy 2641 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2642 /*dest_uninitialized*/false); 2643 // Aligned versions without pre-barriers 2644 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2645 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2646 /*dest_uninitialized*/true); 2647 StubRoutines::_arrayof_oop_arraycopy_uninit 2648 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2649 /*dest_uninitialized*/true); 2650 } 2651 2652 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2653 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2654 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2655 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2656 2657 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2658 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2659 /*dest_uninitialized*/true); 2660 2661 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2662 entry_jbyte_arraycopy, 2663 entry_jshort_arraycopy, 2664 entry_jint_arraycopy, 2665 entry_jlong_arraycopy); 2666 2667 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2668 entry_jbyte_arraycopy, 2669 entry_jshort_arraycopy, 2670 entry_jint_arraycopy, 2671 entry_oop_arraycopy, 2672 entry_jlong_arraycopy, 2673 entry_checkcast_arraycopy); 2674 2675 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2676 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2677 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2678 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2679 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2680 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2681 } 2682 2683 void generate_math_stubs() { Unimplemented(); } 2684 2685 // Arguments: 2686 // 2687 // Inputs: 2688 // c_rarg0 - source byte array address 2689 // c_rarg1 - destination byte array address 2690 // c_rarg2 - K (key) in little endian int array 2691 // 2692 address generate_aescrypt_encryptBlock() { 2693 __ align(CodeEntryAlignment); 2694 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2695 2696 const Register from = c_rarg0; // source array address 2697 const Register to = c_rarg1; // destination array address 2698 const Register key = c_rarg2; // key array address 2699 const Register keylen = rscratch1; 2700 2701 address start = __ pc(); 2702 __ enter(); 2703 2704 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2705 2706 __ aesenc_loadkeys(key, keylen); 2707 __ aesecb_encrypt(from, to, keylen); 2708 2709 __ mov(r0, 0); 2710 2711 __ leave(); 2712 __ ret(lr); 2713 2714 return start; 2715 } 2716 2717 // Arguments: 2718 // 2719 // Inputs: 2720 // c_rarg0 - source byte array address 2721 // c_rarg1 - destination byte array address 2722 // 
c_rarg2 - K (key) in little endian int array 2723 // 2724 address generate_aescrypt_decryptBlock() { 2725 assert(UseAES, "need AES cryptographic extension support"); 2726 __ align(CodeEntryAlignment); 2727 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2728 Label L_doLast; 2729 2730 const Register from = c_rarg0; // source array address 2731 const Register to = c_rarg1; // destination array address 2732 const Register key = c_rarg2; // key array address 2733 const Register keylen = rscratch1; 2734 2735 address start = __ pc(); 2736 __ enter(); // required for proper stackwalking of RuntimeStub frame 2737 2738 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2739 2740 __ aesecb_decrypt(from, to, key, keylen); 2741 2742 __ mov(r0, 0); 2743 2744 __ leave(); 2745 __ ret(lr); 2746 2747 return start; 2748 } 2749 2750 // Arguments: 2751 // 2752 // Inputs: 2753 // c_rarg0 - source byte array address 2754 // c_rarg1 - destination byte array address 2755 // c_rarg2 - K (key) in little endian int array 2756 // c_rarg3 - r vector byte array address 2757 // c_rarg4 - input length 2758 // 2759 // Output: 2760 // x0 - input length 2761 // 2762 address generate_cipherBlockChaining_encryptAESCrypt() { 2763 assert(UseAES, "need AES cryptographic extension support"); 2764 __ align(CodeEntryAlignment); 2765 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2766 2767 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2768 2769 const Register from = c_rarg0; // source array address 2770 const Register to = c_rarg1; // destination array address 2771 const Register key = c_rarg2; // key array address 2772 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2773 // and left with the results of the last encryption block 2774 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2775 const Register keylen = rscratch1; 2776 2777 address start = __ pc(); 2778 2779 __ enter(); 2780 2781 __ movw(rscratch2, len_reg); 2782 2783 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2784 2785 __ ld1(v0, __ T16B, rvec); 2786 2787 __ cmpw(keylen, 52); 2788 __ br(Assembler::CC, L_loadkeys_44); 2789 __ br(Assembler::EQ, L_loadkeys_52); 2790 2791 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2792 __ rev32(v17, __ T16B, v17); 2793 __ rev32(v18, __ T16B, v18); 2794 __ BIND(L_loadkeys_52); 2795 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2796 __ rev32(v19, __ T16B, v19); 2797 __ rev32(v20, __ T16B, v20); 2798 __ BIND(L_loadkeys_44); 2799 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2800 __ rev32(v21, __ T16B, v21); 2801 __ rev32(v22, __ T16B, v22); 2802 __ rev32(v23, __ T16B, v23); 2803 __ rev32(v24, __ T16B, v24); 2804 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2805 __ rev32(v25, __ T16B, v25); 2806 __ rev32(v26, __ T16B, v26); 2807 __ rev32(v27, __ T16B, v27); 2808 __ rev32(v28, __ T16B, v28); 2809 __ ld1(v29, v30, v31, __ T16B, key); 2810 __ rev32(v29, __ T16B, v29); 2811 __ rev32(v30, __ T16B, v30); 2812 __ rev32(v31, __ T16B, v31); 2813 2814 __ BIND(L_aes_loop); 2815 __ ld1(v1, __ T16B, __ post(from, 16)); 2816 __ eor(v0, __ T16B, v0, v1); 2817 2818 __ br(Assembler::CC, L_rounds_44); 2819 __ br(Assembler::EQ, L_rounds_52); 2820 2821 __ aese(v0, v17); __ aesmc(v0, v0); 2822 __ aese(v0, v18); __ aesmc(v0, v0); 2823 __ BIND(L_rounds_52); 2824 __ 
aese(v0, v19); __ aesmc(v0, v0); 2825 __ aese(v0, v20); __ aesmc(v0, v0); 2826 __ BIND(L_rounds_44); 2827 __ aese(v0, v21); __ aesmc(v0, v0); 2828 __ aese(v0, v22); __ aesmc(v0, v0); 2829 __ aese(v0, v23); __ aesmc(v0, v0); 2830 __ aese(v0, v24); __ aesmc(v0, v0); 2831 __ aese(v0, v25); __ aesmc(v0, v0); 2832 __ aese(v0, v26); __ aesmc(v0, v0); 2833 __ aese(v0, v27); __ aesmc(v0, v0); 2834 __ aese(v0, v28); __ aesmc(v0, v0); 2835 __ aese(v0, v29); __ aesmc(v0, v0); 2836 __ aese(v0, v30); 2837 __ eor(v0, __ T16B, v0, v31); 2838 2839 __ st1(v0, __ T16B, __ post(to, 16)); 2840 2841 __ subw(len_reg, len_reg, 16); 2842 __ cbnzw(len_reg, L_aes_loop); 2843 2844 __ st1(v0, __ T16B, rvec); 2845 2846 __ mov(r0, rscratch2); 2847 2848 __ leave(); 2849 __ ret(lr); 2850 2851 return start; 2852 } 2853 2854 // Arguments: 2855 // 2856 // Inputs: 2857 // c_rarg0 - source byte array address 2858 // c_rarg1 - destination byte array address 2859 // c_rarg2 - K (key) in little endian int array 2860 // c_rarg3 - r vector byte array address 2861 // c_rarg4 - input length 2862 // 2863 // Output: 2864 // r0 - input length 2865 // 2866 address generate_cipherBlockChaining_decryptAESCrypt() { 2867 assert(UseAES, "need AES cryptographic extension support"); 2868 __ align(CodeEntryAlignment); 2869 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2870 2871 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2872 2873 const Register from = c_rarg0; // source array address 2874 const Register to = c_rarg1; // destination array address 2875 const Register key = c_rarg2; // key array address 2876 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2877 // and left with the results of the last encryption block 2878 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2879 const Register keylen = rscratch1; 2880 2881 address start = __ pc(); 2882 2883 __ enter(); 2884 2885 __ movw(rscratch2, len_reg); 2886 2887 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2888 2889 __ ld1(v2, __ T16B, rvec); 2890 2891 __ ld1(v31, __ T16B, __ post(key, 16)); 2892 __ rev32(v31, __ T16B, v31); 2893 2894 __ cmpw(keylen, 52); 2895 __ br(Assembler::CC, L_loadkeys_44); 2896 __ br(Assembler::EQ, L_loadkeys_52); 2897 2898 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2899 __ rev32(v17, __ T16B, v17); 2900 __ rev32(v18, __ T16B, v18); 2901 __ BIND(L_loadkeys_52); 2902 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2903 __ rev32(v19, __ T16B, v19); 2904 __ rev32(v20, __ T16B, v20); 2905 __ BIND(L_loadkeys_44); 2906 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2907 __ rev32(v21, __ T16B, v21); 2908 __ rev32(v22, __ T16B, v22); 2909 __ rev32(v23, __ T16B, v23); 2910 __ rev32(v24, __ T16B, v24); 2911 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2912 __ rev32(v25, __ T16B, v25); 2913 __ rev32(v26, __ T16B, v26); 2914 __ rev32(v27, __ T16B, v27); 2915 __ rev32(v28, __ T16B, v28); 2916 __ ld1(v29, v30, __ T16B, key); 2917 __ rev32(v29, __ T16B, v29); 2918 __ rev32(v30, __ T16B, v30); 2919 2920 __ BIND(L_aes_loop); 2921 __ ld1(v0, __ T16B, __ post(from, 16)); 2922 __ orr(v1, __ T16B, v0, v0); 2923 2924 __ br(Assembler::CC, L_rounds_44); 2925 __ br(Assembler::EQ, L_rounds_52); 2926 2927 __ aesd(v0, v17); __ aesimc(v0, v0); 2928 __ aesd(v0, v18); __ aesimc(v0, v0); 2929 __ BIND(L_rounds_52); 2930 __ aesd(v0, v19); __ aesimc(v0, v0); 2931 __ aesd(v0, v20); __ 
aesimc(v0, v0); 2932 __ BIND(L_rounds_44); 2933 __ aesd(v0, v21); __ aesimc(v0, v0); 2934 __ aesd(v0, v22); __ aesimc(v0, v0); 2935 __ aesd(v0, v23); __ aesimc(v0, v0); 2936 __ aesd(v0, v24); __ aesimc(v0, v0); 2937 __ aesd(v0, v25); __ aesimc(v0, v0); 2938 __ aesd(v0, v26); __ aesimc(v0, v0); 2939 __ aesd(v0, v27); __ aesimc(v0, v0); 2940 __ aesd(v0, v28); __ aesimc(v0, v0); 2941 __ aesd(v0, v29); __ aesimc(v0, v0); 2942 __ aesd(v0, v30); 2943 __ eor(v0, __ T16B, v0, v31); 2944 __ eor(v0, __ T16B, v0, v2); 2945 2946 __ st1(v0, __ T16B, __ post(to, 16)); 2947 __ orr(v2, __ T16B, v1, v1); 2948 2949 __ subw(len_reg, len_reg, 16); 2950 __ cbnzw(len_reg, L_aes_loop); 2951 2952 __ st1(v2, __ T16B, rvec); 2953 2954 __ mov(r0, rscratch2); 2955 2956 __ leave(); 2957 __ ret(lr); 2958 2959 return start; 2960 } 2961 2962 // Big-endian 128-bit + 64-bit -> 128-bit addition. 2963 // Inputs: 128-bits. in is preserved. 2964 // The least-significant 64-bit word is in the upper dword of each vector. 2965 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 2966 // Output: result 2967 void be_add_128_64(FloatRegister result, FloatRegister in, 2968 FloatRegister inc, FloatRegister tmp) { 2969 assert_different_registers(result, tmp, inc); 2970 2971 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 2972 // input 2973 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 2974 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 2975 // MSD == 0 (must be!) to LSD 2976 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 2977 } 2978 2979 // CTR AES crypt. 2980 // Arguments: 2981 // 2982 // Inputs: 2983 // c_rarg0 - source byte array address 2984 // c_rarg1 - destination byte array address 2985 // c_rarg2 - K (key) in little endian int array 2986 // c_rarg3 - counter vector byte array address 2987 // c_rarg4 - input length 2988 // c_rarg5 - saved encryptedCounter start 2989 // c_rarg6 - saved used length 2990 // 2991 // Output: 2992 // r0 - input length 2993 // 2994 address generate_counterMode_AESCrypt() { 2995 const Register in = c_rarg0; 2996 const Register out = c_rarg1; 2997 const Register key = c_rarg2; 2998 const Register counter = c_rarg3; 2999 const Register saved_len = c_rarg4, len = r10; 3000 const Register saved_encrypted_ctr = c_rarg5; 3001 const Register used_ptr = c_rarg6, used = r12; 3002 3003 const Register offset = r7; 3004 const Register keylen = r11; 3005 3006 const unsigned char block_size = 16; 3007 const int bulk_width = 4; 3008 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3009 // performance with larger data sizes, but it also means that the 3010 // fast path isn't used until you have at least 8 blocks, and up 3011 // to 127 bytes of data will be executed on the slow path. For 3012 // that reason, and also so as not to blow away too much icache, 4 3013 // blocks seems like a sensible compromise. 
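    // With bulk_width == 4 the bulk path therefore needs at least
    // 4 * 16 == 64 bytes, so at most 63 bytes of a request are handled by
    // the block-at-a-time code below; the 8-block/127-byte figures above
    // apply to bulk_width == 8.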
3014 3015 // Algorithm: 3016 // 3017 // if (len == 0) { 3018 // goto DONE; 3019 // } 3020 // int result = len; 3021 // do { 3022 // if (used >= blockSize) { 3023 // if (len >= bulk_width * blockSize) { 3024 // CTR_large_block(); 3025 // if (len == 0) 3026 // goto DONE; 3027 // } 3028 // for (;;) { 3029 // 16ByteVector v0 = counter; 3030 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3031 // used = 0; 3032 // if (len < blockSize) 3033 // break; /* goto NEXT */ 3034 // 16ByteVector v1 = load16Bytes(in, offset); 3035 // v1 = v1 ^ encryptedCounter; 3036 // store16Bytes(out, offset); 3037 // used = blockSize; 3038 // offset += blockSize; 3039 // len -= blockSize; 3040 // if (len == 0) 3041 // goto DONE; 3042 // } 3043 // } 3044 // NEXT: 3045 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3046 // len--; 3047 // } while (len != 0); 3048 // DONE: 3049 // return result; 3050 // 3051 // CTR_large_block() 3052 // Wide bulk encryption of whole blocks. 3053 3054 __ align(CodeEntryAlignment); 3055 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 3056 const address start = __ pc(); 3057 __ enter(); 3058 3059 Label DONE, CTR_large_block, large_block_return; 3060 __ ldrw(used, Address(used_ptr)); 3061 __ cbzw(saved_len, DONE); 3062 3063 __ mov(len, saved_len); 3064 __ mov(offset, 0); 3065 3066 // Compute #rounds for AES based on the length of the key array 3067 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3068 3069 __ aesenc_loadkeys(key, keylen); 3070 3071 { 3072 Label L_CTR_loop, NEXT; 3073 3074 __ bind(L_CTR_loop); 3075 3076 __ cmp(used, block_size); 3077 __ br(__ LO, NEXT); 3078 3079 // Maybe we have a lot of data 3080 __ subsw(rscratch1, len, bulk_width * block_size); 3081 __ br(__ HS, CTR_large_block); 3082 __ BIND(large_block_return); 3083 __ cbzw(len, DONE); 3084 3085 // Setup the counter 3086 __ movi(v4, __ T4S, 0); 3087 __ movi(v5, __ T4S, 1); 3088 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3089 3090 // 128-bit big-endian increment 3091 __ ld1(v0, __ T16B, counter); 3092 __ rev64(v16, __ T16B, v0); 3093 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3094 __ rev64(v16, __ T16B, v16); 3095 __ st1(v16, __ T16B, counter); 3096 // Previous counter value is in v0 3097 // v4 contains { 0, 1 } 3098 3099 { 3100 // We have fewer than bulk_width blocks of data left. Encrypt 3101 // them one by one until there is less than a full block 3102 // remaining, being careful to save both the encrypted counter 3103 // and the counter. 3104 3105 Label inner_loop; 3106 __ bind(inner_loop); 3107 // Counter to encrypt is in v0 3108 __ aesecb_encrypt(noreg, noreg, keylen); 3109 __ st1(v0, __ T16B, saved_encrypted_ctr); 3110 3111 // Do we have a remaining full block? 3112 3113 __ mov(used, 0); 3114 __ cmp(len, block_size); 3115 __ br(__ LO, NEXT); 3116 3117 // Yes, we have a full block 3118 __ ldrq(v1, Address(in, offset)); 3119 __ eor(v1, __ T16B, v1, v0); 3120 __ strq(v1, Address(out, offset)); 3121 __ mov(used, block_size); 3122 __ add(offset, offset, block_size); 3123 3124 __ subw(len, len, block_size); 3125 __ cbzw(len, DONE); 3126 3127 // Increment the counter, store it back 3128 __ orr(v0, __ T16B, v16, v16); 3129 __ rev64(v16, __ T16B, v16); 3130 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3131 __ rev64(v16, __ T16B, v16); 3132 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3133 3134 __ b(inner_loop); 3135 } 3136 3137 __ BIND(NEXT); 3138 3139 // Encrypt a single byte, and loop. 
3140 // We expect this to be a rare event. 3141 __ ldrb(rscratch1, Address(in, offset)); 3142 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3143 __ eor(rscratch1, rscratch1, rscratch2); 3144 __ strb(rscratch1, Address(out, offset)); 3145 __ add(offset, offset, 1); 3146 __ add(used, used, 1); 3147 __ subw(len, len,1); 3148 __ cbnzw(len, L_CTR_loop); 3149 } 3150 3151 __ bind(DONE); 3152 __ strw(used, Address(used_ptr)); 3153 __ mov(r0, saved_len); 3154 3155 __ leave(); // required for proper stackwalking of RuntimeStub frame 3156 __ ret(lr); 3157 3158 // Bulk encryption 3159 3160 __ BIND (CTR_large_block); 3161 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3162 3163 if (bulk_width == 8) { 3164 __ sub(sp, sp, 4 * 16); 3165 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3166 } 3167 __ sub(sp, sp, 4 * 16); 3168 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3169 RegSet saved_regs = (RegSet::of(in, out, offset) 3170 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3171 __ push(saved_regs, sp); 3172 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3173 __ add(in, in, offset); 3174 __ add(out, out, offset); 3175 3176 // Keys should already be loaded into the correct registers 3177 3178 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3179 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3180 3181 // AES/CTR loop 3182 { 3183 Label L_CTR_loop; 3184 __ BIND(L_CTR_loop); 3185 3186 // Setup the counters 3187 __ movi(v8, __ T4S, 0); 3188 __ movi(v9, __ T4S, 1); 3189 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3190 3191 for (int i = 0; i < bulk_width; i++) { 3192 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3193 __ rev64(v0_ofs, __ T16B, v16); 3194 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3195 } 3196 3197 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3198 3199 // Encrypt the counters 3200 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3201 3202 if (bulk_width == 8) { 3203 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3204 } 3205 3206 // XOR the encrypted counters with the inputs 3207 for (int i = 0; i < bulk_width; i++) { 3208 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3209 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3210 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3211 } 3212 3213 // Write the encrypted data 3214 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3215 if (bulk_width == 8) { 3216 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3217 } 3218 3219 __ subw(len, len, 16 * bulk_width); 3220 __ cbnzw(len, L_CTR_loop); 3221 } 3222 3223 // Save the counter back where it goes 3224 __ rev64(v16, __ T16B, v16); 3225 __ st1(v16, __ T16B, counter); 3226 3227 __ pop(saved_regs, sp); 3228 3229 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3230 if (bulk_width == 8) { 3231 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3232 } 3233 3234 __ andr(rscratch1, len, -16 * bulk_width); 3235 __ sub(len, len, rscratch1); 3236 __ add(offset, offset, rscratch1); 3237 __ mov(used, 16); 3238 __ strw(used, Address(used_ptr)); 3239 __ b(large_block_return); 3240 3241 return start; 3242 } 3243 3244 // Vector AES Galois Counter Mode implementation. 
Parameters: 3245 // 3246 // in = c_rarg0 3247 // len = c_rarg1 3248 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3249 // out = c_rarg3 3250 // key = c_rarg4 3251 // state = c_rarg5 - GHASH.state 3252 // subkeyHtbl = c_rarg6 - powers of H 3253 // counter = c_rarg7 - 16 bytes of CTR 3254 // return - number of processed bytes 3255 address generate_galoisCounterMode_AESCrypt() { 3256 address ghash_polynomial = __ pc(); 3257 __ emit_int64(0x87); // The low-order bits of the field 3258 // polynomial (i.e. p = z^7+z^2+z+1) 3259 // repeated in the low and high parts of a 3260 // 128-bit vector 3261 __ emit_int64(0x87); 3262 3263 __ align(CodeEntryAlignment); 3264 StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt"); 3265 address start = __ pc(); 3266 __ enter(); 3267 3268 const Register in = c_rarg0; 3269 const Register len = c_rarg1; 3270 const Register ct = c_rarg2; 3271 const Register out = c_rarg3; 3272 // and updated with the incremented counter in the end 3273 3274 const Register key = c_rarg4; 3275 const Register state = c_rarg5; 3276 3277 const Register subkeyHtbl = c_rarg6; 3278 3279 const Register counter = c_rarg7; 3280 3281 const Register keylen = r10; 3282 // Save state before entering routine 3283 __ sub(sp, sp, 4 * 16); 3284 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3285 __ sub(sp, sp, 4 * 16); 3286 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3287 3288 // __ andr(len, len, -512); 3289 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3290 __ str(len, __ pre(sp, -2 * wordSize)); 3291 3292 Label DONE; 3293 __ cbz(len, DONE); 3294 3295 // Compute #rounds for AES based on the length of the key array 3296 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3297 3298 __ aesenc_loadkeys(key, keylen); 3299 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3300 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3301 3302 // AES/CTR loop 3303 { 3304 Label L_CTR_loop; 3305 __ BIND(L_CTR_loop); 3306 3307 // Setup the counters 3308 __ movi(v8, __ T4S, 0); 3309 __ movi(v9, __ T4S, 1); 3310 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3311 3312 assert(v0->encoding() < v8->encoding(), ""); 3313 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3314 FloatRegister f = as_FloatRegister(i); 3315 __ rev32(f, __ T16B, v16); 3316 __ addv(v16, __ T4S, v16, v8); 3317 } 3318 3319 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3320 3321 // Encrypt the counters 3322 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3323 3324 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3325 3326 // XOR the encrypted counters with the inputs 3327 for (int i = 0; i < 8; i++) { 3328 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3329 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3330 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3331 } 3332 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3333 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3334 3335 __ subw(len, len, 16 * 8); 3336 __ cbnzw(len, L_CTR_loop); 3337 } 3338 3339 __ rev32(v16, __ T16B, v16); 3340 __ st1(v16, __ T16B, counter); 3341 3342 __ ldr(len, Address(sp)); 3343 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3344 3345 // GHASH/CTR loop 3346 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3347 len, /*unrolls*/4); 3348 3349 #ifdef ASSERT 3350 { Label L; 3351 __ 
cmp(len, (unsigned char)0); 3352 __ br(Assembler::EQ, L); 3353 __ stop("stubGenerator: abort"); 3354 __ bind(L); 3355 } 3356 #endif 3357 3358 __ bind(DONE); 3359 // Return the number of bytes processed 3360 __ ldr(r0, __ post(sp, 2 * wordSize)); 3361 3362 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3363 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3364 3365 __ leave(); // required for proper stackwalking of RuntimeStub frame 3366 __ ret(lr); 3367 return start; 3368 } 3369 3370 class Cached64Bytes { 3371 private: 3372 MacroAssembler *_masm; 3373 Register _regs[8]; 3374 3375 public: 3376 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3377 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3378 auto it = rs.begin(); 3379 for (auto &r: _regs) { 3380 r = *it; 3381 ++it; 3382 } 3383 } 3384 3385 void gen_loads(Register base) { 3386 for (int i = 0; i < 8; i += 2) { 3387 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3388 } 3389 } 3390 3391 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3392 void extract_u32(Register dest, int i) { 3393 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3394 } 3395 }; 3396 3397 // Utility routines for md5. 3398 // Clobbers r10 and r11. 3399 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3400 int k, int s, int t) { 3401 Register rscratch3 = r10; 3402 Register rscratch4 = r11; 3403 3404 __ eorw(rscratch3, r3, r4); 3405 __ movw(rscratch2, t); 3406 __ andw(rscratch3, rscratch3, r2); 3407 __ addw(rscratch4, r1, rscratch2); 3408 reg_cache.extract_u32(rscratch1, k); 3409 __ eorw(rscratch3, rscratch3, r4); 3410 __ addw(rscratch4, rscratch4, rscratch1); 3411 __ addw(rscratch3, rscratch3, rscratch4); 3412 __ rorw(rscratch2, rscratch3, 32 - s); 3413 __ addw(r1, rscratch2, r2); 3414 } 3415 3416 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3417 int k, int s, int t) { 3418 Register rscratch3 = r10; 3419 Register rscratch4 = r11; 3420 3421 reg_cache.extract_u32(rscratch1, k); 3422 __ movw(rscratch2, t); 3423 __ addw(rscratch4, r1, rscratch2); 3424 __ addw(rscratch4, rscratch4, rscratch1); 3425 __ bicw(rscratch2, r3, r4); 3426 __ andw(rscratch3, r2, r4); 3427 __ addw(rscratch2, rscratch2, rscratch4); 3428 __ addw(rscratch2, rscratch2, rscratch3); 3429 __ rorw(rscratch2, rscratch2, 32 - s); 3430 __ addw(r1, rscratch2, r2); 3431 } 3432 3433 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3434 int k, int s, int t) { 3435 Register rscratch3 = r10; 3436 Register rscratch4 = r11; 3437 3438 __ eorw(rscratch3, r3, r4); 3439 __ movw(rscratch2, t); 3440 __ addw(rscratch4, r1, rscratch2); 3441 reg_cache.extract_u32(rscratch1, k); 3442 __ eorw(rscratch3, rscratch3, r2); 3443 __ addw(rscratch4, rscratch4, rscratch1); 3444 __ addw(rscratch3, rscratch3, rscratch4); 3445 __ rorw(rscratch2, rscratch3, 32 - s); 3446 __ addw(r1, rscratch2, r2); 3447 } 3448 3449 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3450 int k, int s, int t) { 3451 Register rscratch3 = r10; 3452 Register rscratch4 = r11; 3453 3454 __ movw(rscratch3, t); 3455 __ ornw(rscratch2, r2, r4); 3456 __ addw(rscratch4, r1, rscratch3); 3457 reg_cache.extract_u32(rscratch1, k); 3458 __ eorw(rscratch3, rscratch2, r3); 3459 __ addw(rscratch4, rscratch4, rscratch1); 3460 __ addw(rscratch3, rscratch3, rscratch4); 3461 __ rorw(rscratch2, rscratch3, 32 - s); 3462 __ 
addw(r1, rscratch2, r2); 3463 } 3464 3465 // Arguments: 3466 // 3467 // Inputs: 3468 // c_rarg0 - byte[] source+offset 3469 // c_rarg1 - int[] SHA.state 3470 // c_rarg2 - int offset 3471 // c_rarg3 - int limit 3472 // 3473 address generate_md5_implCompress(bool multi_block, const char *name) { 3474 __ align(CodeEntryAlignment); 3475 StubCodeMark mark(this, "StubRoutines", name); 3476 address start = __ pc(); 3477 3478 Register buf = c_rarg0; 3479 Register state = c_rarg1; 3480 Register ofs = c_rarg2; 3481 Register limit = c_rarg3; 3482 Register a = r4; 3483 Register b = r5; 3484 Register c = r6; 3485 Register d = r7; 3486 Register rscratch3 = r10; 3487 Register rscratch4 = r11; 3488 3489 Register state_regs[2] = { r12, r13 }; 3490 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3491 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3492 3493 __ push(saved_regs, sp); 3494 3495 __ ldp(state_regs[0], state_regs[1], Address(state)); 3496 __ ubfx(a, state_regs[0], 0, 32); 3497 __ ubfx(b, state_regs[0], 32, 32); 3498 __ ubfx(c, state_regs[1], 0, 32); 3499 __ ubfx(d, state_regs[1], 32, 32); 3500 3501 Label md5_loop; 3502 __ BIND(md5_loop); 3503 3504 reg_cache.gen_loads(buf); 3505 3506 // Round 1 3507 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3508 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3509 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3510 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3511 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3512 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3513 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3514 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3515 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3516 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3517 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3518 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3519 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3520 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3521 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3522 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3523 3524 // Round 2 3525 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3526 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3527 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3528 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3529 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3530 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3531 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3532 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3533 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3534 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3535 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3536 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3537 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3538 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3539 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3540 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3541 3542 // Round 3 3543 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3544 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3545 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3546 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3547 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3548 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3549 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3550 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3551 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 
3552 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3553 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3554 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3555 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3556 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3557 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3558 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3559 3560 // Round 4 3561 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3562 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3563 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3564 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3565 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3566 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3567 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3568 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3569 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3570 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3571 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3572 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3573 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3574 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3575 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3576 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3577 3578 __ addw(a, state_regs[0], a); 3579 __ ubfx(rscratch2, state_regs[0], 32, 32); 3580 __ addw(b, rscratch2, b); 3581 __ addw(c, state_regs[1], c); 3582 __ ubfx(rscratch4, state_regs[1], 32, 32); 3583 __ addw(d, rscratch4, d); 3584 3585 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3586 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3587 3588 if (multi_block) { 3589 __ add(buf, buf, 64); 3590 __ add(ofs, ofs, 64); 3591 __ cmp(ofs, limit); 3592 __ br(Assembler::LE, md5_loop); 3593 __ mov(c_rarg0, ofs); // return ofs 3594 } 3595 3596 // write hash values back in the correct order 3597 __ stp(state_regs[0], state_regs[1], Address(state)); 3598 3599 __ pop(saved_regs, sp); 3600 3601 __ ret(lr); 3602 3603 return start; 3604 } 3605 3606 // Arguments: 3607 // 3608 // Inputs: 3609 // c_rarg0 - byte[] source+offset 3610 // c_rarg1 - int[] SHA.state 3611 // c_rarg2 - int offset 3612 // c_rarg3 - int limit 3613 // 3614 address generate_sha1_implCompress(bool multi_block, const char *name) { 3615 __ align(CodeEntryAlignment); 3616 StubCodeMark mark(this, "StubRoutines", name); 3617 address start = __ pc(); 3618 3619 Register buf = c_rarg0; 3620 Register state = c_rarg1; 3621 Register ofs = c_rarg2; 3622 Register limit = c_rarg3; 3623 3624 Label keys; 3625 Label sha1_loop; 3626 3627 // load the keys into v0..v3 3628 __ adr(rscratch1, keys); 3629 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3630 // load 5 words state into v6, v7 3631 __ ldrq(v6, Address(state, 0)); 3632 __ ldrs(v7, Address(state, 16)); 3633 3634 3635 __ BIND(sha1_loop); 3636 // load 64 bytes of data into v16..v19 3637 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3638 __ rev32(v16, __ T16B, v16); 3639 __ rev32(v17, __ T16B, v17); 3640 __ rev32(v18, __ T16B, v18); 3641 __ rev32(v19, __ T16B, v19); 3642 3643 // do the sha1 3644 __ addv(v4, __ T4S, v16, v0); 3645 __ orr(v20, __ T16B, v6, v6); 3646 3647 FloatRegister d0 = v16; 3648 FloatRegister d1 = v17; 3649 FloatRegister d2 = v18; 3650 FloatRegister d3 = v19; 3651 3652 for (int round = 0; round < 20; round++) { 3653 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3654 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3655 FloatRegister tmp3 = round ? ((round & 1) ? 
v22 : v21) : v7; 3656 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3657 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3658 3659 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3660 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3661 __ sha1h(tmp2, __ T4S, v20); 3662 if (round < 5) 3663 __ sha1c(v20, __ T4S, tmp3, tmp4); 3664 else if (round < 10 || round >= 15) 3665 __ sha1p(v20, __ T4S, tmp3, tmp4); 3666 else 3667 __ sha1m(v20, __ T4S, tmp3, tmp4); 3668 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3669 3670 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3671 } 3672 3673 __ addv(v7, __ T2S, v7, v21); 3674 __ addv(v6, __ T4S, v6, v20); 3675 3676 if (multi_block) { 3677 __ add(ofs, ofs, 64); 3678 __ cmp(ofs, limit); 3679 __ br(Assembler::LE, sha1_loop); 3680 __ mov(c_rarg0, ofs); // return ofs 3681 } 3682 3683 __ strq(v6, Address(state, 0)); 3684 __ strs(v7, Address(state, 16)); 3685 3686 __ ret(lr); 3687 3688 __ bind(keys); 3689 __ emit_int32(0x5a827999); 3690 __ emit_int32(0x6ed9eba1); 3691 __ emit_int32(0x8f1bbcdc); 3692 __ emit_int32(0xca62c1d6); 3693 3694 return start; 3695 } 3696 3697 3698 // Arguments: 3699 // 3700 // Inputs: 3701 // c_rarg0 - byte[] source+offset 3702 // c_rarg1 - int[] SHA.state 3703 // c_rarg2 - int offset 3704 // c_rarg3 - int limit 3705 // 3706 address generate_sha256_implCompress(bool multi_block, const char *name) { 3707 static const uint32_t round_consts[64] = { 3708 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3709 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3710 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3711 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3712 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3713 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3714 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3715 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3716 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3717 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3718 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3719 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3720 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3721 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3722 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3723 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3724 }; 3725 __ align(CodeEntryAlignment); 3726 StubCodeMark mark(this, "StubRoutines", name); 3727 address start = __ pc(); 3728 3729 Register buf = c_rarg0; 3730 Register state = c_rarg1; 3731 Register ofs = c_rarg2; 3732 Register limit = c_rarg3; 3733 3734 Label sha1_loop; 3735 3736 __ stpd(v8, v9, __ pre(sp, -32)); 3737 __ stpd(v10, v11, Address(sp, 16)); 3738 3739 // dga == v0 3740 // dgb == v1 3741 // dg0 == v2 3742 // dg1 == v3 3743 // dg2 == v4 3744 // t0 == v6 3745 // t1 == v7 3746 3747 // load 16 keys to v16..v31 3748 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3749 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3750 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3751 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3752 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3753 3754 // load 8 words (256 bits) state 3755 __ ldpq(v0, v1, state); 3756 3757 __ BIND(sha1_loop); 3758 // load 64 bytes of data into v8..v11 3759 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3760 __ rev32(v8, __ T16B, v8); 3761 __ rev32(v9, __ T16B, v9); 3762 __ rev32(v10, __ T16B, v10); 3763 __ rev32(v11, __ T16B, v11); 3764 3765 __ addv(v6, __ T4S, v8, v16); 3766 __ orr(v2, __ T16B, v0, v0); 3767 __ orr(v3, __ T16B, v1, v1); 3768 3769 FloatRegister d0 = v8; 3770 FloatRegister d1 = v9; 3771 FloatRegister d2 = v10; 3772 FloatRegister d3 = v11; 3773 3774 3775 for (int round = 0; round < 16; round++) { 3776 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3777 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3778 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3779 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3780 3781 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3782 __ orr(v4, __ T16B, v2, v2); 3783 if (round < 15) 3784 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3785 __ sha256h(v2, __ T4S, v3, tmp2); 3786 __ sha256h2(v3, __ T4S, v4, tmp2); 3787 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3788 3789 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3790 } 3791 3792 __ addv(v0, __ T4S, v0, v2); 3793 __ addv(v1, __ T4S, v1, v3); 3794 3795 if (multi_block) { 3796 __ add(ofs, ofs, 64); 3797 __ cmp(ofs, limit); 3798 __ br(Assembler::LE, sha1_loop); 3799 __ mov(c_rarg0, ofs); // return ofs 3800 } 3801 3802 __ ldpd(v10, v11, Address(sp, 16)); 3803 __ ldpd(v8, v9, __ post(sp, 32)); 3804 3805 __ stpq(v0, v1, state); 3806 3807 __ ret(lr); 3808 3809 return start; 3810 } 3811 3812 // Double rounds for sha512. 3813 void sha512_dround(int dr, 3814 FloatRegister vi0, FloatRegister vi1, 3815 FloatRegister vi2, FloatRegister vi3, 3816 FloatRegister vi4, FloatRegister vrc0, 3817 FloatRegister vrc1, FloatRegister vin0, 3818 FloatRegister vin1, FloatRegister vin2, 3819 FloatRegister vin3, FloatRegister vin4) { 3820 if (dr < 36) { 3821 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3822 } 3823 __ addv(v5, __ T2D, vrc0, vin0); 3824 __ ext(v6, __ T16B, vi2, vi3, 8); 3825 __ ext(v5, __ T16B, v5, v5, 8); 3826 __ ext(v7, __ T16B, vi1, vi2, 8); 3827 __ addv(vi3, __ T2D, vi3, v5); 3828 if (dr < 32) { 3829 __ ext(v5, __ T16B, vin3, vin4, 8); 3830 __ sha512su0(vin0, __ T2D, vin1); 3831 } 3832 __ sha512h(vi3, __ T2D, v6, v7); 3833 if (dr < 32) { 3834 __ sha512su1(vin0, __ T2D, vin2, v5); 3835 } 3836 __ addv(vi4, __ T2D, vi1, vi3); 3837 __ sha512h2(vi3, __ T2D, vi1, vi0); 3838 } 3839 3840 // Arguments: 3841 // 3842 // Inputs: 3843 // c_rarg0 - byte[] source+offset 3844 // c_rarg1 - int[] SHA.state 3845 // c_rarg2 - int offset 3846 // c_rarg3 - int limit 3847 // 3848 address generate_sha512_implCompress(bool multi_block, const char *name) { 3849 static const uint64_t round_consts[80] = { 3850 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3851 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3852 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3853 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3854 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3855 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3856 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3857 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3858 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3859 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3860 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3861 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3862 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3863 0x92722C851482353BL, 
0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3864 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3865 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3866 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3867 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3868 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3869 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3870 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3871 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3872 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3873 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3874 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3875 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3876 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3877 }; 3878 3879 __ align(CodeEntryAlignment); 3880 StubCodeMark mark(this, "StubRoutines", name); 3881 address start = __ pc(); 3882 3883 Register buf = c_rarg0; 3884 Register state = c_rarg1; 3885 Register ofs = c_rarg2; 3886 Register limit = c_rarg3; 3887 3888 __ stpd(v8, v9, __ pre(sp, -64)); 3889 __ stpd(v10, v11, Address(sp, 16)); 3890 __ stpd(v12, v13, Address(sp, 32)); 3891 __ stpd(v14, v15, Address(sp, 48)); 3892 3893 Label sha512_loop; 3894 3895 // load state 3896 __ ld1(v8, v9, v10, v11, __ T2D, state); 3897 3898 // load first 4 round constants 3899 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3900 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3901 3902 __ BIND(sha512_loop); 3903 // load 128B of data into v12..v19 3904 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3905 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3906 __ rev64(v12, __ T16B, v12); 3907 __ rev64(v13, __ T16B, v13); 3908 __ rev64(v14, __ T16B, v14); 3909 __ rev64(v15, __ T16B, v15); 3910 __ rev64(v16, __ T16B, v16); 3911 __ rev64(v17, __ T16B, v17); 3912 __ rev64(v18, __ T16B, v18); 3913 __ rev64(v19, __ T16B, v19); 3914 3915 __ mov(rscratch2, rscratch1); 3916 3917 __ mov(v0, __ T16B, v8); 3918 __ mov(v1, __ T16B, v9); 3919 __ mov(v2, __ T16B, v10); 3920 __ mov(v3, __ T16B, v11); 3921 3922 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 3923 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 3924 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 3925 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 3926 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 3927 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 3928 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 3929 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 3930 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 3931 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 3932 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 3933 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 3934 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 3935 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 3936 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 3937 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 3938 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, 
v13, v19, v16, v17); 3939 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 3940 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 3941 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 3942 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 3943 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 3944 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 3945 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 3946 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 3947 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 3948 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 3949 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 3950 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 3951 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 3952 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 3953 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 3954 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 3955 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 3956 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 3957 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 3958 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 3959 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 3960 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 3961 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 3962 3963 __ addv(v8, __ T2D, v8, v0); 3964 __ addv(v9, __ T2D, v9, v1); 3965 __ addv(v10, __ T2D, v10, v2); 3966 __ addv(v11, __ T2D, v11, v3); 3967 3968 if (multi_block) { 3969 __ add(ofs, ofs, 128); 3970 __ cmp(ofs, limit); 3971 __ br(Assembler::LE, sha512_loop); 3972 __ mov(c_rarg0, ofs); // return ofs 3973 } 3974 3975 __ st1(v8, v9, v10, v11, __ T2D, state); 3976 3977 __ ldpd(v14, v15, Address(sp, 48)); 3978 __ ldpd(v12, v13, Address(sp, 32)); 3979 __ ldpd(v10, v11, Address(sp, 16)); 3980 __ ldpd(v8, v9, __ post(sp, 64)); 3981 3982 __ ret(lr); 3983 3984 return start; 3985 } 3986 3987 // Arguments: 3988 // 3989 // Inputs: 3990 // c_rarg0 - byte[] source+offset 3991 // c_rarg1 - byte[] SHA.state 3992 // c_rarg2 - int block_size 3993 // c_rarg3 - int offset 3994 // c_rarg4 - int limit 3995 // 3996 address generate_sha3_implCompress(bool multi_block, const char *name) { 3997 static const uint64_t round_consts[24] = { 3998 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 3999 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4000 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4001 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4002 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4003 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4004 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4005 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4006 }; 4007 4008 __ align(CodeEntryAlignment); 4009 StubCodeMark mark(this, "StubRoutines", name); 4010 address start = __ pc(); 4011 4012 Register buf = c_rarg0; 4013 Register state = c_rarg1; 4014 Register block_size = c_rarg2; 4015 Register ofs = c_rarg3; 4016 Register 
limit = c_rarg4; 4017 4018 Label sha3_loop, rounds24_loop; 4019 Label sha3_512_or_sha3_384, shake128; 4020 4021 __ stpd(v8, v9, __ pre(sp, -64)); 4022 __ stpd(v10, v11, Address(sp, 16)); 4023 __ stpd(v12, v13, Address(sp, 32)); 4024 __ stpd(v14, v15, Address(sp, 48)); 4025 4026 // load state 4027 __ add(rscratch1, state, 32); 4028 __ ld1(v0, v1, v2, v3, __ T1D, state); 4029 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4030 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4031 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4032 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4033 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4034 __ ld1(v24, __ T1D, rscratch1); 4035 4036 __ BIND(sha3_loop); 4037 4038 // 24 keccak rounds 4039 __ movw(rscratch2, 24); 4040 4041 // load round_constants base 4042 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4043 4044 // load input 4045 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4046 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4047 __ eor(v0, __ T8B, v0, v25); 4048 __ eor(v1, __ T8B, v1, v26); 4049 __ eor(v2, __ T8B, v2, v27); 4050 __ eor(v3, __ T8B, v3, v28); 4051 __ eor(v4, __ T8B, v4, v29); 4052 __ eor(v5, __ T8B, v5, v30); 4053 __ eor(v6, __ T8B, v6, v31); 4054 4055 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4056 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4057 4058 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4059 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4060 __ eor(v7, __ T8B, v7, v25); 4061 __ eor(v8, __ T8B, v8, v26); 4062 __ eor(v9, __ T8B, v9, v27); 4063 __ eor(v10, __ T8B, v10, v28); 4064 __ eor(v11, __ T8B, v11, v29); 4065 __ eor(v12, __ T8B, v12, v30); 4066 __ eor(v13, __ T8B, v13, v31); 4067 4068 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4069 __ eor(v14, __ T8B, v14, v25); 4070 __ eor(v15, __ T8B, v15, v26); 4071 __ eor(v16, __ T8B, v16, v27); 4072 4073 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4074 __ andw(c_rarg5, block_size, 48); 4075 __ cbzw(c_rarg5, rounds24_loop); 4076 4077 __ tbnz(block_size, 5, shake128); 4078 // block_size == 144, bit5 == 0, SHA3-244 4079 __ ldrd(v28, __ post(buf, 8)); 4080 __ eor(v17, __ T8B, v17, v28); 4081 __ b(rounds24_loop); 4082 4083 __ BIND(shake128); 4084 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4085 __ eor(v17, __ T8B, v17, v28); 4086 __ eor(v18, __ T8B, v18, v29); 4087 __ eor(v19, __ T8B, v19, v30); 4088 __ eor(v20, __ T8B, v20, v31); 4089 __ b(rounds24_loop); // block_size == 168, SHAKE128 4090 4091 __ BIND(sha3_512_or_sha3_384); 4092 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4093 __ eor(v7, __ T8B, v7, v25); 4094 __ eor(v8, __ T8B, v8, v26); 4095 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4096 4097 // SHA3-384 4098 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4099 __ eor(v9, __ T8B, v9, v27); 4100 __ eor(v10, __ T8B, v10, v28); 4101 __ eor(v11, __ T8B, v11, v29); 4102 __ eor(v12, __ T8B, v12, v30); 4103 4104 __ BIND(rounds24_loop); 4105 __ subw(rscratch2, rscratch2, 1); 4106 4107 __ eor3(v29, __ T16B, v4, v9, v14); 4108 __ eor3(v26, __ T16B, v1, v6, v11); 4109 __ eor3(v28, __ T16B, v3, v8, v13); 4110 __ eor3(v25, __ T16B, v0, v5, v10); 4111 __ eor3(v27, __ T16B, v2, v7, v12); 4112 __ eor3(v29, __ T16B, v29, v19, v24); 4113 __ eor3(v26, __ T16B, v26, v16, v21); 4114 __ eor3(v28, __ T16B, v28, v18, v23); 4115 __ eor3(v25, __ T16B, v25, v15, v20); 4116 __ eor3(v27, __ T16B, v27, v17, v22); 4117 4118 __ rax1(v30, __ T2D, v29, v26); 
4119 __ rax1(v26, __ T2D, v26, v28); 4120 __ rax1(v28, __ T2D, v28, v25); 4121 __ rax1(v25, __ T2D, v25, v27); 4122 __ rax1(v27, __ T2D, v27, v29); 4123 4124 __ eor(v0, __ T16B, v0, v30); 4125 __ xar(v29, __ T2D, v1, v25, (64 - 1)); 4126 __ xar(v1, __ T2D, v6, v25, (64 - 44)); 4127 __ xar(v6, __ T2D, v9, v28, (64 - 20)); 4128 __ xar(v9, __ T2D, v22, v26, (64 - 61)); 4129 __ xar(v22, __ T2D, v14, v28, (64 - 39)); 4130 __ xar(v14, __ T2D, v20, v30, (64 - 18)); 4131 __ xar(v31, __ T2D, v2, v26, (64 - 62)); 4132 __ xar(v2, __ T2D, v12, v26, (64 - 43)); 4133 __ xar(v12, __ T2D, v13, v27, (64 - 25)); 4134 __ xar(v13, __ T2D, v19, v28, (64 - 8)); 4135 __ xar(v19, __ T2D, v23, v27, (64 - 56)); 4136 __ xar(v23, __ T2D, v15, v30, (64 - 41)); 4137 __ xar(v15, __ T2D, v4, v28, (64 - 27)); 4138 __ xar(v28, __ T2D, v24, v28, (64 - 14)); 4139 __ xar(v24, __ T2D, v21, v25, (64 - 2)); 4140 __ xar(v8, __ T2D, v8, v27, (64 - 55)); 4141 __ xar(v4, __ T2D, v16, v25, (64 - 45)); 4142 __ xar(v16, __ T2D, v5, v30, (64 - 36)); 4143 __ xar(v5, __ T2D, v3, v27, (64 - 28)); 4144 __ xar(v27, __ T2D, v18, v27, (64 - 21)); 4145 __ xar(v3, __ T2D, v17, v26, (64 - 15)); 4146 __ xar(v25, __ T2D, v11, v25, (64 - 10)); 4147 __ xar(v26, __ T2D, v7, v26, (64 - 6)); 4148 __ xar(v30, __ T2D, v10, v30, (64 - 3)); 4149 4150 __ bcax(v20, __ T16B, v31, v22, v8); 4151 __ bcax(v21, __ T16B, v8, v23, v22); 4152 __ bcax(v22, __ T16B, v22, v24, v23); 4153 __ bcax(v23, __ T16B, v23, v31, v24); 4154 __ bcax(v24, __ T16B, v24, v8, v31); 4155 4156 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); 4157 4158 __ bcax(v17, __ T16B, v25, v19, v3); 4159 __ bcax(v18, __ T16B, v3, v15, v19); 4160 __ bcax(v19, __ T16B, v19, v16, v15); 4161 __ bcax(v15, __ T16B, v15, v25, v16); 4162 __ bcax(v16, __ T16B, v16, v3, v25); 4163 4164 __ bcax(v10, __ T16B, v29, v12, v26); 4165 __ bcax(v11, __ T16B, v26, v13, v12); 4166 __ bcax(v12, __ T16B, v12, v14, v13); 4167 __ bcax(v13, __ T16B, v13, v29, v14); 4168 __ bcax(v14, __ T16B, v14, v26, v29); 4169 4170 __ bcax(v7, __ T16B, v30, v9, v4); 4171 __ bcax(v8, __ T16B, v4, v5, v9); 4172 __ bcax(v9, __ T16B, v9, v6, v5); 4173 __ bcax(v5, __ T16B, v5, v30, v6); 4174 __ bcax(v6, __ T16B, v6, v4, v30); 4175 4176 __ bcax(v3, __ T16B, v27, v0, v28); 4177 __ bcax(v4, __ T16B, v28, v1, v0); 4178 __ bcax(v0, __ T16B, v0, v2, v1); 4179 __ bcax(v1, __ T16B, v1, v27, v2); 4180 __ bcax(v2, __ T16B, v2, v28, v27); 4181 4182 __ eor(v0, __ T16B, v0, v31); 4183 4184 __ cbnzw(rscratch2, rounds24_loop); 4185 4186 if (multi_block) { 4187 __ add(ofs, ofs, block_size); 4188 __ cmp(ofs, limit); 4189 __ br(Assembler::LE, sha3_loop); 4190 __ mov(c_rarg0, ofs); // return ofs 4191 } 4192 4193 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4194 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4195 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4196 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4197 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4198 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4199 __ st1(v24, __ T1D, state); 4200 4201 __ ldpd(v14, v15, Address(sp, 48)); 4202 __ ldpd(v12, v13, Address(sp, 32)); 4203 __ ldpd(v10, v11, Address(sp, 16)); 4204 __ ldpd(v8, v9, __ post(sp, 64)); 4205 4206 __ ret(lr); 4207 4208 return start; 4209 } 4210 4211 /** 4212 * Arguments: 4213 * 4214 * Inputs: 4215 * c_rarg0 - int crc 4216 * c_rarg1 - byte* buf 4217 * c_rarg2 - int length 4218 * 4219 * Output: 4220 * rax - int crc result 4221 */ 4222 address generate_updateBytesCRC32() { 4223 assert(UseCRC32Intrinsics, 
"what are we doing here?"); 4224 4225 __ align(CodeEntryAlignment); 4226 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4227 4228 address start = __ pc(); 4229 4230 const Register crc = c_rarg0; // crc 4231 const Register buf = c_rarg1; // source java byte array address 4232 const Register len = c_rarg2; // length 4233 const Register table0 = c_rarg3; // crc_table address 4234 const Register table1 = c_rarg4; 4235 const Register table2 = c_rarg5; 4236 const Register table3 = c_rarg6; 4237 const Register tmp3 = c_rarg7; 4238 4239 BLOCK_COMMENT("Entry:"); 4240 __ enter(); // required for proper stackwalking of RuntimeStub frame 4241 4242 __ kernel_crc32(crc, buf, len, 4243 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4244 4245 __ leave(); // required for proper stackwalking of RuntimeStub frame 4246 __ ret(lr); 4247 4248 return start; 4249 } 4250 4251 // ChaCha20 block function. This version parallelizes by loading 4252 // individual 32-bit state elements into vectors for four blocks 4253 // (e.g. all four blocks' worth of state[0] in one register, etc.) 4254 // 4255 // state (int[16]) = c_rarg0 4256 // keystream (byte[1024]) = c_rarg1 4257 // return - number of bytes of keystream (always 256) 4258 address generate_chacha20Block_blockpar() { 4259 Label L_twoRounds, L_cc20_const; 4260 // The constant data is broken into two 128-bit segments to be loaded 4261 // onto FloatRegisters. The first 128 bits are a counter add overlay 4262 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4263 // The second 128-bits is a table constant used for 8-bit left rotations. 4264 __ BIND(L_cc20_const); 4265 __ emit_int64(0x0000000100000000UL); 4266 __ emit_int64(0x0000000300000002UL); 4267 __ emit_int64(0x0605040702010003UL); 4268 __ emit_int64(0x0E0D0C0F0A09080BUL); 4269 4270 __ align(CodeEntryAlignment); 4271 StubCodeMark mark(this, "StubRoutines", "chacha20Block"); 4272 address start = __ pc(); 4273 __ enter(); 4274 4275 int i, j; 4276 const Register state = c_rarg0; 4277 const Register keystream = c_rarg1; 4278 const Register loopCtr = r10; 4279 const Register tmpAddr = r11; 4280 4281 const FloatRegister stateFirst = v0; 4282 const FloatRegister stateSecond = v1; 4283 const FloatRegister stateThird = v2; 4284 const FloatRegister stateFourth = v3; 4285 const FloatRegister origCtrState = v28; 4286 const FloatRegister scratch = v29; 4287 const FloatRegister lrot8Tbl = v30; 4288 4289 // Organize SIMD registers in an array that facilitates 4290 // putting repetitive opcodes into loop structures. It is 4291 // important that each grouping of 4 registers is monotonically 4292 // increasing to support the requirements of multi-register 4293 // instructions (e.g. ld4r, st4, etc.) 4294 const FloatRegister workSt[16] = { 4295 v4, v5, v6, v7, v16, v17, v18, v19, 4296 v20, v21, v22, v23, v24, v25, v26, v27 4297 }; 4298 4299 // Load from memory and interlace across 16 SIMD registers, 4300 // With each word from memory being broadcast to all lanes of 4301 // each successive SIMD register. 4302 // Addr(0) -> All lanes in workSt[i] 4303 // Addr(4) -> All lanes workSt[i + 1], etc. 4304 __ mov(tmpAddr, state); 4305 for (i = 0; i < 16; i += 4) { 4306 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4307 __ post(tmpAddr, 16)); 4308 } 4309 4310 // Pull in constant data. The first 16 bytes are the add overlay 4311 // which is applied to the vector holding the counter (state[12]). 
4312 // The second 16 bytes is the index register for the 8-bit left 4313 // rotation tbl instruction. 4314 __ adr(tmpAddr, L_cc20_const); 4315 __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr)); 4316 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); 4317 4318 // Set up the 10 iteration loop and perform all 8 quarter round ops 4319 __ mov(loopCtr, 10); 4320 __ BIND(L_twoRounds); 4321 4322 __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12], 4323 scratch, lrot8Tbl); 4324 __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13], 4325 scratch, lrot8Tbl); 4326 __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14], 4327 scratch, lrot8Tbl); 4328 __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15], 4329 scratch, lrot8Tbl); 4330 4331 __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15], 4332 scratch, lrot8Tbl); 4333 __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12], 4334 scratch, lrot8Tbl); 4335 __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13], 4336 scratch, lrot8Tbl); 4337 __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14], 4338 scratch, lrot8Tbl); 4339 4340 // Decrement and iterate 4341 __ sub(loopCtr, loopCtr, 1); 4342 __ cbnz(loopCtr, L_twoRounds); 4343 4344 __ mov(tmpAddr, state); 4345 4346 // Add the starting state back to the post-loop keystream 4347 // state. We read/interlace the state array from memory into 4348 // 4 registers similar to what we did in the beginning. Then 4349 // add the counter overlay onto workSt[12] at the end. 4350 for (i = 0; i < 16; i += 4) { 4351 __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S, 4352 __ post(tmpAddr, 16)); 4353 __ addv(workSt[i], __ T4S, workSt[i], stateFirst); 4354 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond); 4355 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird); 4356 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth); 4357 } 4358 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask 4359 4360 // Write to key stream, storing the same element out of workSt[0..15] 4361 // to consecutive 4-byte offsets in the key stream buffer, then repeating 4362 // for the next element position. 
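//
// As a scalar sketch (illustration only, not the generated code), with the
// keystream buffer viewed as uint32_t words:
//
//   for (block = 0; block < 4; block++)      // lane index i below
//     for (word = 0; word < 16; word++)      // state element j
//       ks32[block * 16 + word] = lane(workSt[word], block);
//
// so lane 0 of all 16 state vectors forms the first 64-byte keystream
// block, lane 1 the second block, and so on.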
4363 for (i = 0; i < 4; i++) { 4364 for (j = 0; j < 16; j += 4) { 4365 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4366 __ post(keystream, 16)); 4367 } 4368 } 4369 4370 __ mov(r0, 256); // Return length of output keystream 4371 __ leave(); 4372 __ ret(lr); 4373 4374 return start; 4375 } 4376 4377 /** 4378 * Arguments: 4379 * 4380 * Inputs: 4381 * c_rarg0 - int crc 4382 * c_rarg1 - byte* buf 4383 * c_rarg2 - int length 4384 * c_rarg3 - int* table 4385 * 4386 * Output: 4387 * r0 - int crc result 4388 */ 4389 address generate_updateBytesCRC32C() { 4390 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4391 4392 __ align(CodeEntryAlignment); 4393 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4394 4395 address start = __ pc(); 4396 4397 const Register crc = c_rarg0; // crc 4398 const Register buf = c_rarg1; // source java byte array address 4399 const Register len = c_rarg2; // length 4400 const Register table0 = c_rarg3; // crc_table address 4401 const Register table1 = c_rarg4; 4402 const Register table2 = c_rarg5; 4403 const Register table3 = c_rarg6; 4404 const Register tmp3 = c_rarg7; 4405 4406 BLOCK_COMMENT("Entry:"); 4407 __ enter(); // required for proper stackwalking of RuntimeStub frame 4408 4409 __ kernel_crc32c(crc, buf, len, 4410 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4411 4412 __ leave(); // required for proper stackwalking of RuntimeStub frame 4413 __ ret(lr); 4414 4415 return start; 4416 } 4417 4418 /*** 4419 * Arguments: 4420 * 4421 * Inputs: 4422 * c_rarg0 - int adler 4423 * c_rarg1 - byte* buff 4424 * c_rarg2 - int len 4425 * 4426 * Output: 4427 * c_rarg0 - int adler result 4428 */ 4429 address generate_updateBytesAdler32() { 4430 __ align(CodeEntryAlignment); 4431 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4432 address start = __ pc(); 4433 4434 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4435 4436 // Aliases 4437 Register adler = c_rarg0; 4438 Register s1 = c_rarg0; 4439 Register s2 = c_rarg3; 4440 Register buff = c_rarg1; 4441 Register len = c_rarg2; 4442 Register nmax = r4; 4443 Register base = r5; 4444 Register count = r6; 4445 Register temp0 = rscratch1; 4446 Register temp1 = rscratch2; 4447 FloatRegister vbytes = v0; 4448 FloatRegister vs1acc = v1; 4449 FloatRegister vs2acc = v2; 4450 FloatRegister vtable = v3; 4451 4452 // Max number of bytes we can process before having to take the mod 4453 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4454 uint64_t BASE = 0xfff1; 4455 uint64_t NMAX = 0x15B0; 4456 4457 __ mov(base, BASE); 4458 __ mov(nmax, NMAX); 4459 4460 // Load accumulation coefficients for the upper 16 bits 4461 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4462 __ ld1(vtable, __ T16B, Address(temp0)); 4463 4464 // s1 is initialized to the lower 16 bits of adler 4465 // s2 is initialized to the upper 16 bits of adler 4466 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4467 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4468 4469 // The pipelined loop needs at least 16 elements for 1 iteration 4470 // It does check this, but it is more effective to skip to the cleanup loop 4471 __ cmp(len, (u1)16); 4472 __ br(Assembler::HS, L_nmax); 4473 __ cbz(len, L_combine); 4474 4475 __ bind(L_simple_by1_loop); 4476 __ ldrb(temp0, Address(__ post(buff, 1))); 4477 __ add(s1, s1, temp0); 4478 __ add(s2, s2, s1); 4479 __ subs(len, len, 
1); 4480 __ br(Assembler::HI, L_simple_by1_loop); 4481 4482 // s1 = s1 % BASE 4483 __ subs(temp0, s1, base); 4484 __ csel(s1, temp0, s1, Assembler::HS); 4485 4486 // s2 = s2 % BASE 4487 __ lsr(temp0, s2, 16); 4488 __ lsl(temp1, temp0, 4); 4489 __ sub(temp1, temp1, temp0); 4490 __ add(s2, temp1, s2, ext::uxth); 4491 4492 __ subs(temp0, s2, base); 4493 __ csel(s2, temp0, s2, Assembler::HS); 4494 4495 __ b(L_combine); 4496 4497 __ bind(L_nmax); 4498 __ subs(len, len, nmax); 4499 __ sub(count, nmax, 16); 4500 __ br(Assembler::LO, L_by16); 4501 4502 __ bind(L_nmax_loop); 4503 4504 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4505 vbytes, vs1acc, vs2acc, vtable); 4506 4507 __ subs(count, count, 16); 4508 __ br(Assembler::HS, L_nmax_loop); 4509 4510 // s1 = s1 % BASE 4511 __ lsr(temp0, s1, 16); 4512 __ lsl(temp1, temp0, 4); 4513 __ sub(temp1, temp1, temp0); 4514 __ add(temp1, temp1, s1, ext::uxth); 4515 4516 __ lsr(temp0, temp1, 16); 4517 __ lsl(s1, temp0, 4); 4518 __ sub(s1, s1, temp0); 4519 __ add(s1, s1, temp1, ext:: uxth); 4520 4521 __ subs(temp0, s1, base); 4522 __ csel(s1, temp0, s1, Assembler::HS); 4523 4524 // s2 = s2 % BASE 4525 __ lsr(temp0, s2, 16); 4526 __ lsl(temp1, temp0, 4); 4527 __ sub(temp1, temp1, temp0); 4528 __ add(temp1, temp1, s2, ext::uxth); 4529 4530 __ lsr(temp0, temp1, 16); 4531 __ lsl(s2, temp0, 4); 4532 __ sub(s2, s2, temp0); 4533 __ add(s2, s2, temp1, ext:: uxth); 4534 4535 __ subs(temp0, s2, base); 4536 __ csel(s2, temp0, s2, Assembler::HS); 4537 4538 __ subs(len, len, nmax); 4539 __ sub(count, nmax, 16); 4540 __ br(Assembler::HS, L_nmax_loop); 4541 4542 __ bind(L_by16); 4543 __ adds(len, len, count); 4544 __ br(Assembler::LO, L_by1); 4545 4546 __ bind(L_by16_loop); 4547 4548 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4549 vbytes, vs1acc, vs2acc, vtable); 4550 4551 __ subs(len, len, 16); 4552 __ br(Assembler::HS, L_by16_loop); 4553 4554 __ bind(L_by1); 4555 __ adds(len, len, 15); 4556 __ br(Assembler::LO, L_do_mod); 4557 4558 __ bind(L_by1_loop); 4559 __ ldrb(temp0, Address(__ post(buff, 1))); 4560 __ add(s1, temp0, s1); 4561 __ add(s2, s2, s1); 4562 __ subs(len, len, 1); 4563 __ br(Assembler::HS, L_by1_loop); 4564 4565 __ bind(L_do_mod); 4566 // s1 = s1 % BASE 4567 __ lsr(temp0, s1, 16); 4568 __ lsl(temp1, temp0, 4); 4569 __ sub(temp1, temp1, temp0); 4570 __ add(temp1, temp1, s1, ext::uxth); 4571 4572 __ lsr(temp0, temp1, 16); 4573 __ lsl(s1, temp0, 4); 4574 __ sub(s1, s1, temp0); 4575 __ add(s1, s1, temp1, ext:: uxth); 4576 4577 __ subs(temp0, s1, base); 4578 __ csel(s1, temp0, s1, Assembler::HS); 4579 4580 // s2 = s2 % BASE 4581 __ lsr(temp0, s2, 16); 4582 __ lsl(temp1, temp0, 4); 4583 __ sub(temp1, temp1, temp0); 4584 __ add(temp1, temp1, s2, ext::uxth); 4585 4586 __ lsr(temp0, temp1, 16); 4587 __ lsl(s2, temp0, 4); 4588 __ sub(s2, s2, temp0); 4589 __ add(s2, s2, temp1, ext:: uxth); 4590 4591 __ subs(temp0, s2, base); 4592 __ csel(s2, temp0, s2, Assembler::HS); 4593 4594 // Combine lower bits and higher bits 4595 __ bind(L_combine); 4596 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 4597 4598 __ ret(lr); 4599 4600 return start; 4601 } 4602 4603 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 4604 Register temp0, Register temp1, FloatRegister vbytes, 4605 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 4606 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 
4607 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 4608 // In non-vectorized code, we update s1 and s2 as: 4609 // s1 <- s1 + b1 4610 // s2 <- s2 + s1 4611 // s1 <- s1 + b2 4612 // s2 <- s2 + b1 4613 // ... 4614 // s1 <- s1 + b16 4615 // s2 <- s2 + s1 4616 // Putting above assignments together, we have: 4617 // s1_new = s1 + b1 + b2 + ... + b16 4618 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 4619 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 4620 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 4621 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 4622 4623 // s2 = s2 + s1 * 16 4624 __ add(s2, s2, s1, Assembler::LSL, 4); 4625 4626 // vs1acc = b1 + b2 + b3 + ... + b16 4627 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1) 4628 __ umullv(vs2acc, __ T8B, vtable, vbytes); 4629 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 4630 __ uaddlv(vs1acc, __ T16B, vbytes); 4631 __ uaddlv(vs2acc, __ T8H, vs2acc); 4632 4633 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 4634 __ fmovd(temp0, vs1acc); 4635 __ fmovd(temp1, vs2acc); 4636 __ add(s1, s1, temp0); 4637 __ add(s2, s2, temp1); 4638 } 4639 4640 /** 4641 * Arguments: 4642 * 4643 * Input: 4644 * c_rarg0 - x address 4645 * c_rarg1 - x length 4646 * c_rarg2 - y address 4647 * c_rarg3 - y length 4648 * c_rarg4 - z address 4649 */ 4650 address generate_multiplyToLen() { 4651 __ align(CodeEntryAlignment); 4652 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 4653 4654 address start = __ pc(); 4655 4656 if (SCCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) { 4657 return start; 4658 } 4659 const Register x = r0; 4660 const Register xlen = r1; 4661 const Register y = r2; 4662 const Register ylen = r3; 4663 const Register z = r4; 4664 4665 const Register tmp0 = r5; 4666 const Register tmp1 = r10; 4667 const Register tmp2 = r11; 4668 const Register tmp3 = r12; 4669 const Register tmp4 = r13; 4670 const Register tmp5 = r14; 4671 const Register tmp6 = r15; 4672 const Register tmp7 = r16; 4673 4674 BLOCK_COMMENT("Entry:"); 4675 __ enter(); // required for proper stackwalking of RuntimeStub frame 4676 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4677 __ leave(); // required for proper stackwalking of RuntimeStub frame 4678 __ ret(lr); 4679 4680 SCCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start); 4681 return start; 4682 } 4683 4684 address generate_squareToLen() { 4685 // squareToLen algorithm for sizes 1..127 described in java code works 4686 // faster than multiply_to_len on some CPUs and slower on others, but 4687 // multiply_to_len shows a bit better overall results 4688 __ align(CodeEntryAlignment); 4689 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 4690 address start = __ pc(); 4691 4692 if (SCCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) { 4693 return start; 4694 } 4695 const Register x = r0; 4696 const Register xlen = r1; 4697 const Register z = r2; 4698 const Register y = r4; // == x 4699 const Register ylen = r5; // == xlen 4700 4701 const Register tmp0 = r3; 4702 const Register tmp1 = r10; 4703 const Register tmp2 = r11; 4704 const Register tmp3 = r12; 4705 const Register tmp4 = r13; 4706 const Register tmp5 = r14; 4707 const Register tmp6 = r15; 4708 const Register tmp7 = r16; 4709 4710 RegSet spilled_regs = RegSet::of(y, ylen); 4711 BLOCK_COMMENT("Entry:"); 4712 __ enter(); 4713 __ push(spilled_regs, sp); 4714 __ 
mov(y, x); 4715 __ mov(ylen, xlen); 4716 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4717 __ pop(spilled_regs, sp); 4718 __ leave(); 4719 __ ret(lr); 4720 4721 SCCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start); 4722 return start; 4723 } 4724 4725 address generate_mulAdd() { 4726 __ align(CodeEntryAlignment); 4727 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 4728 4729 address start = __ pc(); 4730 4731 if (SCCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) { 4732 return start; 4733 } 4734 const Register out = r0; 4735 const Register in = r1; 4736 const Register offset = r2; 4737 const Register len = r3; 4738 const Register k = r4; 4739 4740 BLOCK_COMMENT("Entry:"); 4741 __ enter(); 4742 __ mul_add(out, in, offset, len, k); 4743 __ leave(); 4744 __ ret(lr); 4745 4746 SCCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start); 4747 return start; 4748 } 4749 4750 // Arguments: 4751 // 4752 // Input: 4753 // c_rarg0 - newArr address 4754 // c_rarg1 - oldArr address 4755 // c_rarg2 - newIdx 4756 // c_rarg3 - shiftCount 4757 // c_rarg4 - numIter 4758 // 4759 address generate_bigIntegerRightShift() { 4760 __ align(CodeEntryAlignment); 4761 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 4762 address start = __ pc(); 4763 4764 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4765 4766 Register newArr = c_rarg0; 4767 Register oldArr = c_rarg1; 4768 Register newIdx = c_rarg2; 4769 Register shiftCount = c_rarg3; 4770 Register numIter = c_rarg4; 4771 Register idx = numIter; 4772 4773 Register newArrCur = rscratch1; 4774 Register shiftRevCount = rscratch2; 4775 Register oldArrCur = r13; 4776 Register oldArrNext = r14; 4777 4778 FloatRegister oldElem0 = v0; 4779 FloatRegister oldElem1 = v1; 4780 FloatRegister newElem = v2; 4781 FloatRegister shiftVCount = v3; 4782 FloatRegister shiftVRevCount = v4; 4783 4784 __ cbz(idx, Exit); 4785 4786 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4787 4788 // left shift count 4789 __ movw(shiftRevCount, 32); 4790 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4791 4792 // numIter too small to allow a 4-words SIMD loop, rolling back 4793 __ cmp(numIter, (u1)4); 4794 __ br(Assembler::LT, ShiftThree); 4795 4796 __ dup(shiftVCount, __ T4S, shiftCount); 4797 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4798 __ negr(shiftVCount, __ T4S, shiftVCount); 4799 4800 __ BIND(ShiftSIMDLoop); 4801 4802 // Calculate the load addresses 4803 __ sub(idx, idx, 4); 4804 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4805 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4806 __ add(oldArrCur, oldArrNext, 4); 4807 4808 // Load 4 words and process 4809 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4810 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4811 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4812 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4813 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4814 __ st1(newElem, __ T4S, Address(newArrCur)); 4815 4816 __ cmp(idx, (u1)4); 4817 __ br(Assembler::LT, ShiftTwoLoop); 4818 __ b(ShiftSIMDLoop); 4819 4820 __ BIND(ShiftTwoLoop); 4821 __ cbz(idx, Exit); 4822 __ cmp(idx, (u1)1); 4823 __ br(Assembler::EQ, ShiftOne); 4824 4825 // Calculate the load addresses 4826 __ sub(idx, idx, 2); 4827 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4828 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4829 __ add(oldArrCur, oldArrNext, 4); 4830 4831 // Load 2 words and process 4832 __ 
ld1(oldElem0, __ T2S, Address(oldArrCur)); 4833 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4834 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4835 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4836 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4837 __ st1(newElem, __ T2S, Address(newArrCur)); 4838 __ b(ShiftTwoLoop); 4839 4840 __ BIND(ShiftThree); 4841 __ tbz(idx, 1, ShiftOne); 4842 __ tbz(idx, 0, ShiftTwo); 4843 __ ldrw(r10, Address(oldArr, 12)); 4844 __ ldrw(r11, Address(oldArr, 8)); 4845 __ lsrvw(r10, r10, shiftCount); 4846 __ lslvw(r11, r11, shiftRevCount); 4847 __ orrw(r12, r10, r11); 4848 __ strw(r12, Address(newArr, 8)); 4849 4850 __ BIND(ShiftTwo); 4851 __ ldrw(r10, Address(oldArr, 8)); 4852 __ ldrw(r11, Address(oldArr, 4)); 4853 __ lsrvw(r10, r10, shiftCount); 4854 __ lslvw(r11, r11, shiftRevCount); 4855 __ orrw(r12, r10, r11); 4856 __ strw(r12, Address(newArr, 4)); 4857 4858 __ BIND(ShiftOne); 4859 __ ldrw(r10, Address(oldArr, 4)); 4860 __ ldrw(r11, Address(oldArr)); 4861 __ lsrvw(r10, r10, shiftCount); 4862 __ lslvw(r11, r11, shiftRevCount); 4863 __ orrw(r12, r10, r11); 4864 __ strw(r12, Address(newArr)); 4865 4866 __ BIND(Exit); 4867 __ ret(lr); 4868 4869 return start; 4870 } 4871 4872 // Arguments: 4873 // 4874 // Input: 4875 // c_rarg0 - newArr address 4876 // c_rarg1 - oldArr address 4877 // c_rarg2 - newIdx 4878 // c_rarg3 - shiftCount 4879 // c_rarg4 - numIter 4880 // 4881 address generate_bigIntegerLeftShift() { 4882 __ align(CodeEntryAlignment); 4883 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 4884 address start = __ pc(); 4885 4886 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4887 4888 Register newArr = c_rarg0; 4889 Register oldArr = c_rarg1; 4890 Register newIdx = c_rarg2; 4891 Register shiftCount = c_rarg3; 4892 Register numIter = c_rarg4; 4893 4894 Register shiftRevCount = rscratch1; 4895 Register oldArrNext = rscratch2; 4896 4897 FloatRegister oldElem0 = v0; 4898 FloatRegister oldElem1 = v1; 4899 FloatRegister newElem = v2; 4900 FloatRegister shiftVCount = v3; 4901 FloatRegister shiftVRevCount = v4; 4902 4903 __ cbz(numIter, Exit); 4904 4905 __ add(oldArrNext, oldArr, 4); 4906 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4907 4908 // right shift count 4909 __ movw(shiftRevCount, 32); 4910 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4911 4912 // numIter too small to allow a 4-words SIMD loop, rolling back 4913 __ cmp(numIter, (u1)4); 4914 __ br(Assembler::LT, ShiftThree); 4915 4916 __ dup(shiftVCount, __ T4S, shiftCount); 4917 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4918 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 4919 4920 __ BIND(ShiftSIMDLoop); 4921 4922 // load 4 words and process 4923 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 4924 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 4925 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4926 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4927 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4928 __ st1(newElem, __ T4S, __ post(newArr, 16)); 4929 __ sub(numIter, numIter, 4); 4930 4931 __ cmp(numIter, (u1)4); 4932 __ br(Assembler::LT, ShiftTwoLoop); 4933 __ b(ShiftSIMDLoop); 4934 4935 __ BIND(ShiftTwoLoop); 4936 __ cbz(numIter, Exit); 4937 __ cmp(numIter, (u1)1); 4938 __ br(Assembler::EQ, ShiftOne); 4939 4940 // load 2 words and process 4941 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 4942 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 4943 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4944 __ 
ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4945 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4946 __ st1(newElem, __ T2S, __ post(newArr, 8)); 4947 __ sub(numIter, numIter, 2); 4948 __ b(ShiftTwoLoop); 4949 4950 __ BIND(ShiftThree); 4951 __ ldrw(r10, __ post(oldArr, 4)); 4952 __ ldrw(r11, __ post(oldArrNext, 4)); 4953 __ lslvw(r10, r10, shiftCount); 4954 __ lsrvw(r11, r11, shiftRevCount); 4955 __ orrw(r12, r10, r11); 4956 __ strw(r12, __ post(newArr, 4)); 4957 __ tbz(numIter, 1, Exit); 4958 __ tbz(numIter, 0, ShiftOne); 4959 4960 __ BIND(ShiftTwo); 4961 __ ldrw(r10, __ post(oldArr, 4)); 4962 __ ldrw(r11, __ post(oldArrNext, 4)); 4963 __ lslvw(r10, r10, shiftCount); 4964 __ lsrvw(r11, r11, shiftRevCount); 4965 __ orrw(r12, r10, r11); 4966 __ strw(r12, __ post(newArr, 4)); 4967 4968 __ BIND(ShiftOne); 4969 __ ldrw(r10, Address(oldArr)); 4970 __ ldrw(r11, Address(oldArrNext)); 4971 __ lslvw(r10, r10, shiftCount); 4972 __ lsrvw(r11, r11, shiftRevCount); 4973 __ orrw(r12, r10, r11); 4974 __ strw(r12, Address(newArr)); 4975 4976 __ BIND(Exit); 4977 __ ret(lr); 4978 4979 return start; 4980 } 4981 4982 address generate_count_positives(address &count_positives_long) { 4983 const u1 large_loop_size = 64; 4984 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 4985 int dcache_line = VM_Version::dcache_line_size(); 4986 4987 Register ary1 = r1, len = r2, result = r0; 4988 4989 __ align(CodeEntryAlignment); 4990 4991 StubCodeMark mark(this, "StubRoutines", "count_positives"); 4992 4993 address entry = __ pc(); 4994 4995 __ enter(); 4996 // precondition: a copy of len is already in result 4997 // __ mov(result, len); 4998 4999 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 5000 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 5001 5002 __ cmp(len, (u1)15); 5003 __ br(Assembler::GT, LEN_OVER_15); 5004 // The only case when execution falls into this code is when pointer is near 5005 // the end of memory page and we have to avoid reading next page 5006 __ add(ary1, ary1, len); 5007 __ subs(len, len, 8); 5008 __ br(Assembler::GT, LEN_OVER_8); 5009 __ ldr(rscratch2, Address(ary1, -8)); 5010 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
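// At this point len holds (length - 8) <= 0, so rscratch1 is the number of
// bits in the loaded doubleword that lie before the start of the range and
// must be discarded. Roughly, the scalar equivalent of the shift-and-test
// below (assuming a little-endian load and a length of 1..8) is:
//
//   uint64_t w = last8 >> ((8 - length) * 8);    // drop bytes before the range
//   if (w & 0x8080808080808080ULL) result = 0;   // a negative byte was found
//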
    __ lsrv(rscratch2, rscratch2, rscratch1);
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ csel(result, zr, result, Assembler::NE);
    __ leave();
    __ ret(lr);
    __ bind(LEN_OVER_8);
    __ ldp(rscratch1, rscratch2, Address(ary1, -16));
    __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_NO_POP);
    __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 converts bytes to bits
    __ lsrv(rscratch1, rscratch1, rscratch2);
    __ tst(rscratch1, UPPER_BIT_MASK);
    __ bind(RET_NO_POP);
    __ csel(result, zr, result, Assembler::NE);
    __ leave();
    __ ret(lr);

    Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
    const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;

    count_positives_long = __ pc(); // 2nd entry point

    __ enter();

    __ bind(LEN_OVER_15);
    __ push(spilled_regs, sp);
    __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
    __ cbz(rscratch2, ALIGNED);
    __ ldp(tmp6, tmp1, Address(ary1));
    __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
    __ add(ary1, ary1, rscratch1);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    __ sub(len, len, rscratch1);

    __ bind(ALIGNED);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load with an early return in this pre-loop to handle the
    // case where an initially aligned large array has negative values in its
    // leading bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
    // worst case, which is slower. Negative bytes further ahead are not affected
    // much; in fact this is faster due to the early loads, fewer instructions and
    // fewer branches in LARGE_LOOP.
    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_16);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);

    if (SoftwarePrefetchHintDistance >= 0
        && SoftwarePrefetchHintDistance >= dcache_line) {
      // initial prefetch
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
    }
    __ bind(LARGE_LOOP);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
    }
    // Issue the load instructions first, since that can save a few CPU/MEM cycles.
    // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp)
    // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
    // 3 instructions and has fewer branches; the trade-off is that this disables the
    // early return, so all 64 bytes are loaded and checked every time.
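    //
    // Rough scalar sketch of one LARGE_LOOP iteration (illustrative only, not
    // the generated code; `p` and `n` are our stand-ins for ary1 and len):
    //
    //   uint64_t w[8];
    //   memcpy(w, p, 64);                  // the four ldp pairs below
    //   uint64_t acc = 0;
    //   for (int i = 0; i < 8; i++) {
    //     acc |= w[i];                     // the orr reduction tree
    //   }
    //   p += 64; n -= 64;
    //   if (acc & UPPER_BIT_MASK) goto RET_ADJUST_LONG;  // some byte has bit 7 set
    //   if (n >= large_loop_size) goto LARGE_LOOP;
    //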
5080 __ ldp(tmp2, tmp3, Address(ary1)); 5081 __ ldp(tmp4, tmp5, Address(ary1, 16)); 5082 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 5083 __ ldp(tmp6, tmp1, Address(ary1, 48)); 5084 __ add(ary1, ary1, large_loop_size); 5085 __ sub(len, len, large_loop_size); 5086 __ orr(tmp2, tmp2, tmp3); 5087 __ orr(tmp4, tmp4, tmp5); 5088 __ orr(rscratch1, rscratch1, rscratch2); 5089 __ orr(tmp6, tmp6, tmp1); 5090 __ orr(tmp2, tmp2, tmp4); 5091 __ orr(rscratch1, rscratch1, tmp6); 5092 __ orr(tmp2, tmp2, rscratch1); 5093 __ tst(tmp2, UPPER_BIT_MASK); 5094 __ br(Assembler::NE, RET_ADJUST_LONG); 5095 __ cmp(len, large_loop_size); 5096 __ br(Assembler::GE, LARGE_LOOP); 5097 5098 __ bind(CHECK_16); // small 16-byte load pre-loop 5099 __ cmp(len, (u1)16); 5100 __ br(Assembler::LT, POST_LOOP16); 5101 5102 __ bind(LOOP16); // small 16-byte load loop 5103 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 5104 __ sub(len, len, 16); 5105 __ orr(tmp2, tmp2, tmp3); 5106 __ tst(tmp2, UPPER_BIT_MASK); 5107 __ br(Assembler::NE, RET_ADJUST_16); 5108 __ cmp(len, (u1)16); 5109 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 5110 5111 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 5112 __ cmp(len, (u1)8); 5113 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 5114 __ ldr(tmp3, Address(__ post(ary1, 8))); 5115 __ tst(tmp3, UPPER_BIT_MASK); 5116 __ br(Assembler::NE, RET_ADJUST); 5117 __ sub(len, len, 8); 5118 5119 __ bind(POST_LOOP16_LOAD_TAIL); 5120 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 5121 __ ldr(tmp1, Address(ary1)); 5122 __ mov(tmp2, 64); 5123 __ sub(tmp4, tmp2, len, __ LSL, 3); 5124 __ lslv(tmp1, tmp1, tmp4); 5125 __ tst(tmp1, UPPER_BIT_MASK); 5126 __ br(Assembler::NE, RET_ADJUST); 5127 // Fallthrough 5128 5129 __ bind(RET_LEN); 5130 __ pop(spilled_regs, sp); 5131 __ leave(); 5132 __ ret(lr); 5133 5134 // difference result - len is the count of guaranteed to be 5135 // positive bytes 5136 5137 __ bind(RET_ADJUST_LONG); 5138 __ add(len, len, (u1)(large_loop_size - 16)); 5139 __ bind(RET_ADJUST_16); 5140 __ add(len, len, 16); 5141 __ bind(RET_ADJUST); 5142 __ pop(spilled_regs, sp); 5143 __ leave(); 5144 __ sub(result, result, len); 5145 __ ret(lr); 5146 5147 return entry; 5148 } 5149 5150 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 5151 bool usePrefetch, Label &NOT_EQUAL) { 5152 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5153 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5154 tmp7 = r12, tmp8 = r13; 5155 Label LOOP; 5156 5157 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5158 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5159 __ bind(LOOP); 5160 if (usePrefetch) { 5161 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5162 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5163 } 5164 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5165 __ eor(tmp1, tmp1, tmp2); 5166 __ eor(tmp3, tmp3, tmp4); 5167 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5168 __ orr(tmp1, tmp1, tmp3); 5169 __ cbnz(tmp1, NOT_EQUAL); 5170 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5171 __ eor(tmp5, tmp5, tmp6); 5172 __ eor(tmp7, tmp7, tmp8); 5173 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5174 __ orr(tmp5, tmp5, tmp7); 5175 __ cbnz(tmp5, NOT_EQUAL); 5176 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5177 __ eor(tmp1, tmp1, tmp2); 5178 __ eor(tmp3, tmp3, tmp4); 5179 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5180 __ orr(tmp1, tmp1, tmp3); 5181 __ 
cbnz(tmp1, NOT_EQUAL); 5182 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5183 __ eor(tmp5, tmp5, tmp6); 5184 __ sub(cnt1, cnt1, 8 * wordSize); 5185 __ eor(tmp7, tmp7, tmp8); 5186 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5187 // tmp6 is not used. MacroAssembler::subs is used here (rather than 5188 // cmp) because subs allows an unlimited range of immediate operand. 5189 __ subs(tmp6, cnt1, loopThreshold); 5190 __ orr(tmp5, tmp5, tmp7); 5191 __ cbnz(tmp5, NOT_EQUAL); 5192 __ br(__ GE, LOOP); 5193 // post-loop 5194 __ eor(tmp1, tmp1, tmp2); 5195 __ eor(tmp3, tmp3, tmp4); 5196 __ orr(tmp1, tmp1, tmp3); 5197 __ sub(cnt1, cnt1, 2 * wordSize); 5198 __ cbnz(tmp1, NOT_EQUAL); 5199 } 5200 5201 void generate_large_array_equals_loop_simd(int loopThreshold, 5202 bool usePrefetch, Label &NOT_EQUAL) { 5203 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5204 tmp2 = rscratch2; 5205 Label LOOP; 5206 5207 __ bind(LOOP); 5208 if (usePrefetch) { 5209 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5210 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5211 } 5212 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 5213 __ sub(cnt1, cnt1, 8 * wordSize); 5214 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 5215 __ subs(tmp1, cnt1, loopThreshold); 5216 __ eor(v0, __ T16B, v0, v4); 5217 __ eor(v1, __ T16B, v1, v5); 5218 __ eor(v2, __ T16B, v2, v6); 5219 __ eor(v3, __ T16B, v3, v7); 5220 __ orr(v0, __ T16B, v0, v1); 5221 __ orr(v1, __ T16B, v2, v3); 5222 __ orr(v0, __ T16B, v0, v1); 5223 __ umov(tmp1, v0, __ D, 0); 5224 __ umov(tmp2, v0, __ D, 1); 5225 __ orr(tmp1, tmp1, tmp2); 5226 __ cbnz(tmp1, NOT_EQUAL); 5227 __ br(__ GE, LOOP); 5228 } 5229 5230 // a1 = r1 - array1 address 5231 // a2 = r2 - array2 address 5232 // result = r0 - return value. Already contains "false" 5233 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 5234 // r3-r5 are reserved temporary registers 5235 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 5236 address generate_large_array_equals() { 5237 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5238 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5239 tmp7 = r12, tmp8 = r13; 5240 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 5241 SMALL_LOOP, POST_LOOP; 5242 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 5243 // calculate if at least 32 prefetched bytes are used 5244 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 5245 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 5246 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 5247 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 5248 tmp5, tmp6, tmp7, tmp8); 5249 5250 __ align(CodeEntryAlignment); 5251 5252 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 5253 5254 address entry = __ pc(); 5255 __ enter(); 5256 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 5257 // also advance pointers to use post-increment instead of pre-increment 5258 __ add(a1, a1, wordSize); 5259 __ add(a2, a2, wordSize); 5260 if (AvoidUnalignedAccesses) { 5261 // both implementations (SIMD/nonSIMD) are using relatively large load 5262 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 5263 // on some CPUs in case of address is not at least 16-byte aligned. 
5264 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 5265 // load if needed at least for 1st address and make if 16-byte aligned. 5266 Label ALIGNED16; 5267 __ tbz(a1, 3, ALIGNED16); 5268 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5269 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5270 __ sub(cnt1, cnt1, wordSize); 5271 __ eor(tmp1, tmp1, tmp2); 5272 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 5273 __ bind(ALIGNED16); 5274 } 5275 if (UseSIMDForArrayEquals) { 5276 if (SoftwarePrefetchHintDistance >= 0) { 5277 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5278 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5279 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 5280 /* prfm = */ true, NOT_EQUAL); 5281 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5282 __ br(__ LT, TAIL); 5283 } 5284 __ bind(NO_PREFETCH_LARGE_LOOP); 5285 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 5286 /* prfm = */ false, NOT_EQUAL); 5287 } else { 5288 __ push(spilled_regs, sp); 5289 if (SoftwarePrefetchHintDistance >= 0) { 5290 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5291 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5292 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 5293 /* prfm = */ true, NOT_EQUAL); 5294 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5295 __ br(__ LT, TAIL); 5296 } 5297 __ bind(NO_PREFETCH_LARGE_LOOP); 5298 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 5299 /* prfm = */ false, NOT_EQUAL); 5300 } 5301 __ bind(TAIL); 5302 __ cbz(cnt1, EQUAL); 5303 __ subs(cnt1, cnt1, wordSize); 5304 __ br(__ LE, POST_LOOP); 5305 __ bind(SMALL_LOOP); 5306 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5307 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5308 __ subs(cnt1, cnt1, wordSize); 5309 __ eor(tmp1, tmp1, tmp2); 5310 __ cbnz(tmp1, NOT_EQUAL); 5311 __ br(__ GT, SMALL_LOOP); 5312 __ bind(POST_LOOP); 5313 __ ldr(tmp1, Address(a1, cnt1)); 5314 __ ldr(tmp2, Address(a2, cnt1)); 5315 __ eor(tmp1, tmp1, tmp2); 5316 __ cbnz(tmp1, NOT_EQUAL); 5317 __ bind(EQUAL); 5318 __ mov(result, true); 5319 __ bind(NOT_EQUAL); 5320 if (!UseSIMDForArrayEquals) { 5321 __ pop(spilled_regs, sp); 5322 } 5323 __ bind(NOT_EQUAL_NO_POP); 5324 __ leave(); 5325 __ ret(lr); 5326 return entry; 5327 } 5328 5329 address generate_dsin_dcos(bool isCos) { 5330 __ align(CodeEntryAlignment); 5331 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 5332 address start = __ pc(); 5333 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5334 (address)StubRoutines::aarch64::_two_over_pi, 5335 (address)StubRoutines::aarch64::_pio2, 5336 (address)StubRoutines::aarch64::_dsin_coef, 5337 (address)StubRoutines::aarch64::_dcos_coef); 5338 return start; 5339 } 5340 5341 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5342 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5343 Label &DIFF2) { 5344 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5345 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5346 5347 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5348 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5349 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5350 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5351 5352 __ fmovd(tmpL, vtmp3); 5353 __ eor(rscratch2, tmp3, tmpL); 5354 __ cbnz(rscratch2, DIFF2); 5355 5356 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5357 __ umov(tmpL, vtmp3, __ D, 1); 5358 __ eor(rscratch2, tmpU, tmpL); 5359 __ cbnz(rscratch2, DIFF1); 5360 5361 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5362 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5363 __ fmovd(tmpL, vtmp); 5364 __ eor(rscratch2, tmp3, tmpL); 5365 __ cbnz(rscratch2, DIFF2); 5366 5367 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5368 __ umov(tmpL, vtmp, __ D, 1); 5369 __ eor(rscratch2, tmpU, tmpL); 5370 __ cbnz(rscratch2, DIFF1); 5371 } 5372 5373 // r0 = result 5374 // r1 = str1 5375 // r2 = cnt1 5376 // r3 = str2 5377 // r4 = cnt2 5378 // r10 = tmp1 5379 // r11 = tmp2 5380 address generate_compare_long_string_different_encoding(bool isLU) { 5381 __ align(CodeEntryAlignment); 5382 StubCodeMark mark(this, "StubRoutines", isLU 5383 ? "compare_long_string_different_encoding LU" 5384 : "compare_long_string_different_encoding UL"); 5385 address entry = __ pc(); 5386 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5387 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5388 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5389 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5390 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5391 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5392 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5393 5394 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5395 5396 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5397 // cnt2 == amount of characters left to compare 5398 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5399 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5400 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5401 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5402 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5403 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5404 __ eor(rscratch2, tmp1, tmp2); 5405 __ mov(rscratch1, tmp2); 5406 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5407 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 5408 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5409 __ push(spilled_regs, sp); 5410 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5411 __ mov(cnt1, isLU ? 
str2 : str1); // init the pointer to U next load 5412 5413 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5414 5415 if (SoftwarePrefetchHintDistance >= 0) { 5416 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5417 __ br(__ LT, NO_PREFETCH); 5418 __ bind(LARGE_LOOP_PREFETCH); 5419 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 5420 __ mov(tmp4, 2); 5421 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5422 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 5423 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5424 __ subs(tmp4, tmp4, 1); 5425 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 5426 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5427 __ mov(tmp4, 2); 5428 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 5429 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5430 __ subs(tmp4, tmp4, 1); 5431 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 5432 __ sub(cnt2, cnt2, 64); 5433 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5434 __ br(__ GE, LARGE_LOOP_PREFETCH); 5435 } 5436 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 5437 __ bind(NO_PREFETCH); 5438 __ subs(cnt2, cnt2, 16); 5439 __ br(__ LT, TAIL); 5440 __ align(OptoLoopAlignment); 5441 __ bind(SMALL_LOOP); // smaller loop 5442 __ subs(cnt2, cnt2, 16); 5443 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5444 __ br(__ GE, SMALL_LOOP); 5445 __ cmn(cnt2, (u1)16); 5446 __ br(__ EQ, LOAD_LAST); 5447 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 5448 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 5449 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 5450 __ ldr(tmp3, Address(cnt1, -8)); 5451 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 5452 __ b(LOAD_LAST); 5453 __ bind(DIFF2); 5454 __ mov(tmpU, tmp3); 5455 __ bind(DIFF1); 5456 __ pop(spilled_regs, sp); 5457 __ b(CALCULATE_DIFFERENCE); 5458 __ bind(LOAD_LAST); 5459 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 5460 // No need to load it again 5461 __ mov(tmpU, tmp3); 5462 __ pop(spilled_regs, sp); 5463 5464 // tmp2 points to the address of the last 4 Latin1 characters right now 5465 __ ldrs(vtmp, Address(tmp2)); 5466 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5467 __ fmovd(tmpL, vtmp); 5468 5469 __ eor(rscratch2, tmpU, tmpL); 5470 __ cbz(rscratch2, DONE); 5471 5472 // Find the first different characters in the longwords and 5473 // compute their difference. 
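    // Illustrative C sketch of the computation below (ours, not the generated
    // code, using GCC builtins purely for illustration): tmp1 and rscratch1
    // each hold 4 chars widened to UTF-16, and x = tmp1 ^ rscratch1 (already in
    // rscratch2) is known to be non-zero here, so roughly
    //
    //   int shift = __builtin_clzll(__builtin_bswap64(x)) & -16; // bit offset of first differing char
    //   return (int)(uint16_t)(tmp1 >> shift) - (int)(uint16_t)(rscratch1 >> shift);
    //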
5474 __ bind(CALCULATE_DIFFERENCE); 5475 __ rev(rscratch2, rscratch2); 5476 __ clz(rscratch2, rscratch2); 5477 __ andr(rscratch2, rscratch2, -16); 5478 __ lsrv(tmp1, tmp1, rscratch2); 5479 __ uxthw(tmp1, tmp1); 5480 __ lsrv(rscratch1, rscratch1, rscratch2); 5481 __ uxthw(rscratch1, rscratch1); 5482 __ subw(result, tmp1, rscratch1); 5483 __ bind(DONE); 5484 __ ret(lr); 5485 return entry; 5486 } 5487 5488 // r0 = input (float16) 5489 // v0 = result (float) 5490 // v1 = temporary float register 5491 address generate_float16ToFloat() { 5492 __ align(CodeEntryAlignment); 5493 StubCodeMark mark(this, "StubRoutines", "float16ToFloat"); 5494 address entry = __ pc(); 5495 BLOCK_COMMENT("Entry:"); 5496 __ flt16_to_flt(v0, r0, v1); 5497 __ ret(lr); 5498 return entry; 5499 } 5500 5501 // v0 = input (float) 5502 // r0 = result (float16) 5503 // v1 = temporary float register 5504 address generate_floatToFloat16() { 5505 __ align(CodeEntryAlignment); 5506 StubCodeMark mark(this, "StubRoutines", "floatToFloat16"); 5507 address entry = __ pc(); 5508 BLOCK_COMMENT("Entry:"); 5509 __ flt_to_flt16(r0, v0, v1); 5510 __ ret(lr); 5511 return entry; 5512 } 5513 5514 address generate_method_entry_barrier() { 5515 __ align(CodeEntryAlignment); 5516 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 5517 5518 Label deoptimize_label; 5519 5520 address start = __ pc(); 5521 5522 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 5523 5524 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 5525 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 5526 // We can get here despite the nmethod being good, if we have not 5527 // yet applied our cross modification fence (or data fence). 5528 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 5529 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 5530 __ ldrw(rscratch2, rscratch2); 5531 __ strw(rscratch2, thread_epoch_addr); 5532 __ isb(); 5533 __ membar(__ LoadLoad); 5534 } 5535 5536 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 5537 5538 __ enter(); 5539 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 5540 5541 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 5542 5543 __ push_call_clobbered_registers(); 5544 5545 __ mov(c_rarg0, rscratch2); 5546 __ call_VM_leaf 5547 (CAST_FROM_FN_PTR 5548 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 5549 5550 __ reset_last_Java_frame(true); 5551 5552 __ mov(rscratch1, r0); 5553 5554 __ pop_call_clobbered_registers(); 5555 5556 __ cbnz(rscratch1, deoptimize_label); 5557 5558 __ leave(); 5559 __ ret(lr); 5560 5561 __ BIND(deoptimize_label); 5562 5563 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 5564 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 5565 5566 __ mov(sp, rscratch1); 5567 __ br(rscratch2); 5568 5569 return start; 5570 } 5571 5572 // r0 = result 5573 // r1 = str1 5574 // r2 = cnt1 5575 // r3 = str2 5576 // r4 = cnt2 5577 // r10 = tmp1 5578 // r11 = tmp2 5579 address generate_compare_long_string_same_encoding(bool isLL) { 5580 __ align(CodeEntryAlignment); 5581 StubCodeMark mark(this, "StubRoutines", isLL 5582 ? 
"compare_long_string_same_encoding LL" 5583 : "compare_long_string_same_encoding UU"); 5584 address entry = __ pc(); 5585 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5586 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 5587 5588 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 5589 5590 // exit from large loop when less than 64 bytes left to read or we're about 5591 // to prefetch memory behind array border 5592 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 5593 5594 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 5595 __ eor(rscratch2, tmp1, tmp2); 5596 __ cbnz(rscratch2, CAL_DIFFERENCE); 5597 5598 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 5599 // update pointers, because of previous read 5600 __ add(str1, str1, wordSize); 5601 __ add(str2, str2, wordSize); 5602 if (SoftwarePrefetchHintDistance >= 0) { 5603 __ align(OptoLoopAlignment); 5604 __ bind(LARGE_LOOP_PREFETCH); 5605 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 5606 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 5607 5608 for (int i = 0; i < 4; i++) { 5609 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 5610 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 5611 __ cmp(tmp1, tmp2); 5612 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5613 __ br(Assembler::NE, DIFF); 5614 } 5615 __ sub(cnt2, cnt2, isLL ? 64 : 32); 5616 __ add(str1, str1, 64); 5617 __ add(str2, str2, 64); 5618 __ subs(rscratch2, cnt2, largeLoopExitCondition); 5619 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 5620 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 5621 } 5622 5623 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 5624 __ br(Assembler::LE, LESS16); 5625 __ align(OptoLoopAlignment); 5626 __ bind(LOOP_COMPARE16); 5627 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5628 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5629 __ cmp(tmp1, tmp2); 5630 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5631 __ br(Assembler::NE, DIFF); 5632 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5633 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5634 __ br(Assembler::LT, LESS16); 5635 5636 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5637 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5638 __ cmp(tmp1, tmp2); 5639 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5640 __ br(Assembler::NE, DIFF); 5641 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5642 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5643 __ br(Assembler::GE, LOOP_COMPARE16); 5644 __ cbz(cnt2, LENGTH_DIFF); 5645 5646 __ bind(LESS16); 5647 // each 8 compare 5648 __ subs(cnt2, cnt2, isLL ? 8 : 4); 5649 __ br(Assembler::LE, LESS8); 5650 __ ldr(tmp1, Address(__ post(str1, 8))); 5651 __ ldr(tmp2, Address(__ post(str2, 8))); 5652 __ eor(rscratch2, tmp1, tmp2); 5653 __ cbnz(rscratch2, CAL_DIFFERENCE); 5654 __ sub(cnt2, cnt2, isLL ? 8 : 4); 5655 5656 __ bind(LESS8); // directly load last 8 bytes 5657 if (!isLL) { 5658 __ add(cnt2, cnt2, cnt2); 5659 } 5660 __ ldr(tmp1, Address(str1, cnt2)); 5661 __ ldr(tmp2, Address(str2, cnt2)); 5662 __ eor(rscratch2, tmp1, tmp2); 5663 __ cbz(rscratch2, LENGTH_DIFF); 5664 __ b(CAL_DIFFERENCE); 5665 5666 __ bind(DIFF); 5667 __ cmp(tmp1, tmp2); 5668 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 5669 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 5670 // reuse rscratch2 register for the result of eor instruction 5671 __ eor(rscratch2, tmp1, tmp2); 5672 5673 __ bind(CAL_DIFFERENCE); 5674 __ rev(rscratch2, rscratch2); 5675 __ clz(rscratch2, rscratch2); 5676 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 5677 __ lsrv(tmp1, tmp1, rscratch2); 5678 __ lsrv(tmp2, tmp2, rscratch2); 5679 if (isLL) { 5680 __ uxtbw(tmp1, tmp1); 5681 __ uxtbw(tmp2, tmp2); 5682 } else { 5683 __ uxthw(tmp1, tmp1); 5684 __ uxthw(tmp2, tmp2); 5685 } 5686 __ subw(result, tmp1, tmp2); 5687 5688 __ bind(LENGTH_DIFF); 5689 __ ret(lr); 5690 return entry; 5691 } 5692 5693 enum string_compare_mode { 5694 LL, 5695 LU, 5696 UL, 5697 UU, 5698 }; 5699 5700 // The following registers are declared in aarch64.ad 5701 // r0 = result 5702 // r1 = str1 5703 // r2 = cnt1 5704 // r3 = str2 5705 // r4 = cnt2 5706 // r10 = tmp1 5707 // r11 = tmp2 5708 // z0 = ztmp1 5709 // z1 = ztmp2 5710 // p0 = pgtmp1 5711 // p1 = pgtmp2 5712 address generate_compare_long_string_sve(string_compare_mode mode) { 5713 __ align(CodeEntryAlignment); 5714 address entry = __ pc(); 5715 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5716 tmp1 = r10, tmp2 = r11; 5717 5718 Label LOOP, DONE, MISMATCH; 5719 Register vec_len = tmp1; 5720 Register idx = tmp2; 5721 // The minimum of the string lengths has been stored in cnt2. 5722 Register cnt = cnt2; 5723 FloatRegister ztmp1 = z0, ztmp2 = z1; 5724 PRegister pgtmp1 = p0, pgtmp2 = p1; 5725 5726 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 5727 switch (mode) { \ 5728 case LL: \ 5729 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 5730 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 5731 break; \ 5732 case LU: \ 5733 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 5734 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5735 break; \ 5736 case UL: \ 5737 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5738 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 5739 break; \ 5740 case UU: \ 5741 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5742 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5743 break; \ 5744 default: \ 5745 ShouldNotReachHere(); \ 5746 } 5747 5748 const char* stubname; 5749 switch (mode) { 5750 case LL: stubname = "compare_long_string_same_encoding LL"; break; 5751 case LU: stubname = "compare_long_string_different_encoding LU"; break; 5752 case UL: stubname = "compare_long_string_different_encoding UL"; break; 5753 case UU: stubname = "compare_long_string_same_encoding UU"; break; 5754 default: ShouldNotReachHere(); 5755 } 5756 5757 StubCodeMark mark(this, "StubRoutines", stubname); 5758 5759 __ mov(idx, 0); 5760 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5761 5762 if (mode == LL) { 5763 __ sve_cntb(vec_len); 5764 } else { 5765 __ sve_cnth(vec_len); 5766 } 5767 5768 __ sub(rscratch1, cnt, vec_len); 5769 5770 __ bind(LOOP); 5771 5772 // main loop 5773 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5774 __ add(idx, idx, vec_len); 5775 // Compare strings. 5776 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5777 __ br(__ NE, MISMATCH); 5778 __ cmp(idx, rscratch1); 5779 __ br(__ LT, LOOP); 5780 5781 // post loop, last iteration 5782 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5783 5784 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5785 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5786 __ br(__ EQ, DONE); 5787 5788 __ bind(MISMATCH); 5789 5790 // Crop the vector to find its location. 5791 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 5792 // Extract the first different characters of each string. 
    __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
    __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);

    // Compute the difference of the first different characters.
    __ sub(result, rscratch1, rscratch2);

    __ bind(DONE);
    __ ret(lr);
#undef LOAD_PAIR
    return entry;
  }

  void generate_compare_long_strings() {
    if (UseSVE == 0) {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_same_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_same_encoding(false);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_different_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_different_encoding(false);
    } else {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_sve(LL);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_sve(UU);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_sve(LU);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_sve(UL);
    }
  }

  // R0 = result
  // R1 = str2
  // R2 = cnt1
  // R3 = str1
  // R4 = cnt2
  // Clobbers: rscratch1, rscratch2, v0, v1, rflags
  //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  //    length >= 8), in order to skip the initial load (helps on systems with
  //    a single load pipeline)
  // 2) we can use a "fast" single-character search algorithm to find the first
  //    symbol of the pattern with fewer branches (1 branch per loaded register
  //    instead of one branch per symbol); this is where constants like
  //    0x0101...01, 0x00010001...0001, 0x7f7f...7f and 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it
  //    can be reused to search for every occurrence of the 1st character,
  //    saving a few loads compared with a simpler-but-slower implementation
  // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
  //    re-initializes and compresses register values, which makes the code
  //    larger and a bit less readable; however, most of the extra operations
  //    are issued during loads or branches, so the penalty is minimal
  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
    const char* stubName = str1_isL
        ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
        : "indexof_linear_uu";
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stubName);
    address entry = __ pc();

    int str1_chr_size = str1_isL ? 1 : 2;
    int str2_chr_size = str2_isL ? 1 : 2;
    int str1_chr_shift = str1_isL ? 0 : 1;
    int str2_chr_shift = str2_isL ?
0 : 1; 5860 bool isL = str1_isL && str2_isL; 5861 // parameters 5862 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 5863 // temporary registers 5864 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 5865 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 5866 // redefinitions 5867 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 5868 5869 __ push(spilled_regs, sp); 5870 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 5871 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 5872 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 5873 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 5874 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 5875 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 5876 // Read whole register from str1. It is safe, because length >=8 here 5877 __ ldr(ch1, Address(str1)); 5878 // Read whole register from str2. It is safe, because length >=8 here 5879 __ ldr(ch2, Address(str2)); 5880 __ sub(cnt2, cnt2, cnt1); 5881 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 5882 if (str1_isL != str2_isL) { 5883 __ eor(v0, __ T16B, v0, v0); 5884 } 5885 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 5886 __ mul(first, first, tmp1); 5887 // check if we have less than 1 register to check 5888 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 5889 if (str1_isL != str2_isL) { 5890 __ fmovd(v1, ch1); 5891 } 5892 __ br(__ LE, L_SMALL); 5893 __ eor(ch2, first, ch2); 5894 if (str1_isL != str2_isL) { 5895 __ zip1(v1, __ T16B, v1, v0); 5896 } 5897 __ sub(tmp2, ch2, tmp1); 5898 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5899 __ bics(tmp2, tmp2, ch2); 5900 if (str1_isL != str2_isL) { 5901 __ fmovd(ch1, v1); 5902 } 5903 __ br(__ NE, L_HAS_ZERO); 5904 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5905 __ add(result, result, wordSize/str2_chr_size); 5906 __ add(str2, str2, wordSize); 5907 __ br(__ LT, L_POST_LOOP); 5908 __ BIND(L_LOOP); 5909 __ ldr(ch2, Address(str2)); 5910 __ eor(ch2, first, ch2); 5911 __ sub(tmp2, ch2, tmp1); 5912 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5913 __ bics(tmp2, tmp2, ch2); 5914 __ br(__ NE, L_HAS_ZERO); 5915 __ BIND(L_LOOP_PROCEED); 5916 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5917 __ add(str2, str2, wordSize); 5918 __ add(result, result, wordSize/str2_chr_size); 5919 __ br(__ GE, L_LOOP); 5920 __ BIND(L_POST_LOOP); 5921 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 5922 __ br(__ LE, NOMATCH); 5923 __ ldr(ch2, Address(str2)); 5924 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5925 __ eor(ch2, first, ch2); 5926 __ sub(tmp2, ch2, tmp1); 5927 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5928 __ mov(tmp4, -1); // all bits set 5929 __ b(L_SMALL_PROCEED); 5930 __ align(OptoLoopAlignment); 5931 __ BIND(L_SMALL); 5932 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5933 __ eor(ch2, first, ch2); 5934 if (str1_isL != str2_isL) { 5935 __ zip1(v1, __ T16B, v1, v0); 5936 } 5937 __ sub(tmp2, ch2, tmp1); 5938 __ mov(tmp4, -1); // all bits set 5939 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5940 if (str1_isL != str2_isL) { 5941 __ fmovd(ch1, v1); // move converted 4 symbols 5942 } 5943 __ BIND(L_SMALL_PROCEED); 5944 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
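    // Descriptive note (ours, not from the original source): at this point
    // ch2 = (chunk ^ first) | 0x7f..7f and tmp2 = (chunk ^ first) - 0x01..01
    // (byte-sized constants for Latin-1 str2, halfword-sized for UTF-16), so
    // the bic/ands below evaluate the classic SWAR zero-byte test, roughly:
    //
    //   uint64_t v = chunk ^ first_repeated;   // zero lane <=> first pattern char matched
    //   uint64_t hits = (v - 0x0101010101010101) & ~v & 0x8080808080808080;
    //   hits &= valid_mask;                    // tmp4 masks off lanes past the end of str2
    //
    // A set high bit in `hits` marks a candidate match of the first pattern
    // character; candidates are then verified by the compare loops below.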
5945 __ bic(tmp2, tmp2, ch2); 5946 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 5947 __ rbit(tmp2, tmp2); 5948 __ br(__ EQ, NOMATCH); 5949 __ BIND(L_SMALL_HAS_ZERO_LOOP); 5950 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 5951 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 5952 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 5953 if (str2_isL) { // LL 5954 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5955 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5956 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5957 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5958 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5959 } else { 5960 __ mov(ch2, 0xE); // all bits in byte set except last one 5961 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5962 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5963 __ lslv(tmp2, tmp2, tmp4); 5964 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5965 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5966 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5967 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5968 } 5969 __ cmp(ch1, ch2); 5970 __ mov(tmp4, wordSize/str2_chr_size); 5971 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5972 __ BIND(L_SMALL_CMP_LOOP); 5973 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5974 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5975 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5976 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5977 __ add(tmp4, tmp4, 1); 5978 __ cmp(tmp4, cnt1); 5979 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 5980 __ cmp(first, ch2); 5981 __ br(__ EQ, L_SMALL_CMP_LOOP); 5982 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 5983 __ cbz(tmp2, NOMATCH); // no more matches. exit 5984 __ clz(tmp4, tmp2); 5985 __ add(result, result, 1); // advance index 5986 __ add(str2, str2, str2_chr_size); // advance pointer 5987 __ b(L_SMALL_HAS_ZERO_LOOP); 5988 __ align(OptoLoopAlignment); 5989 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 5990 __ cmp(first, ch2); 5991 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5992 __ b(DONE); 5993 __ align(OptoLoopAlignment); 5994 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 5995 if (str2_isL) { // LL 5996 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5997 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5998 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5999 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 6000 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6001 } else { 6002 __ mov(ch2, 0xE); // all bits in byte set except last one 6003 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6004 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6005 __ lslv(tmp2, tmp2, tmp4); 6006 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6007 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6008 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6009 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6010 } 6011 __ cmp(ch1, ch2); 6012 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6013 __ b(DONE); 6014 __ align(OptoLoopAlignment); 6015 __ BIND(L_HAS_ZERO); 6016 __ rbit(tmp2, tmp2); 6017 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 6018 // Now, perform compression of counters(cnt2 and cnt1) into one register. 6019 // It's fine because both counters are 32bit and are not changed in this 6020 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 6021 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 6022 __ sub(result, result, 1); 6023 __ BIND(L_HAS_ZERO_LOOP); 6024 __ mov(cnt1, wordSize/str2_chr_size); 6025 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6026 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 6027 if (str2_isL) { 6028 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6029 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6030 __ lslv(tmp2, tmp2, tmp4); 6031 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6032 __ add(tmp4, tmp4, 1); 6033 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6034 __ lsl(tmp2, tmp2, 1); 6035 __ mov(tmp4, wordSize/str2_chr_size); 6036 } else { 6037 __ mov(ch2, 0xE); 6038 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6039 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6040 __ lslv(tmp2, tmp2, tmp4); 6041 __ add(tmp4, tmp4, 1); 6042 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6043 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6044 __ lsl(tmp2, tmp2, 1); 6045 __ mov(tmp4, wordSize/str2_chr_size); 6046 __ sub(str2, str2, str2_chr_size); 6047 } 6048 __ cmp(ch1, ch2); 6049 __ mov(tmp4, wordSize/str2_chr_size); 6050 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6051 __ BIND(L_CMP_LOOP); 6052 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 6053 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 6054 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 6055 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 6056 __ add(tmp4, tmp4, 1); 6057 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6058 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 6059 __ cmp(cnt1, ch2); 6060 __ br(__ EQ, L_CMP_LOOP); 6061 __ BIND(L_CMP_LOOP_NOMATCH); 6062 // here we're not matched 6063 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 6064 __ clz(tmp4, tmp2); 6065 __ add(str2, str2, str2_chr_size); // advance pointer 6066 __ b(L_HAS_ZERO_LOOP); 6067 __ align(OptoLoopAlignment); 6068 __ BIND(L_CMP_LOOP_LAST_CMP); 6069 __ cmp(cnt1, ch2); 6070 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6071 __ b(DONE); 6072 __ align(OptoLoopAlignment); 6073 __ BIND(L_CMP_LOOP_LAST_CMP2); 6074 if (str2_isL) { 6075 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6076 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6077 __ lslv(tmp2, tmp2, tmp4); 6078 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6079 __ add(tmp4, tmp4, 1); 6080 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6081 __ lsl(tmp2, tmp2, 1); 6082 } else { 6083 __ mov(ch2, 0xE); 6084 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6085 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
6086 __ lslv(tmp2, tmp2, tmp4); 6087 __ add(tmp4, tmp4, 1); 6088 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6089 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6090 __ lsl(tmp2, tmp2, 1); 6091 __ sub(str2, str2, str2_chr_size); 6092 } 6093 __ cmp(ch1, ch2); 6094 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6095 __ b(DONE); 6096 __ align(OptoLoopAlignment); 6097 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 6098 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 6099 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 6100 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 6101 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 6102 // result by analyzed characters value, so, we can just reset lower bits 6103 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 6104 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 6105 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 6106 // index of last analyzed substring inside current octet. So, str2 in at 6107 // respective start address. We need to advance it to next octet 6108 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 6109 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 6110 __ bfm(result, zr, 0, 2 - str2_chr_shift); 6111 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 6112 __ movw(cnt2, cnt2); 6113 __ b(L_LOOP_PROCEED); 6114 __ align(OptoLoopAlignment); 6115 __ BIND(NOMATCH); 6116 __ mov(result, -1); 6117 __ BIND(DONE); 6118 __ pop(spilled_regs, sp); 6119 __ ret(lr); 6120 return entry; 6121 } 6122 6123 void generate_string_indexof_stubs() { 6124 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 6125 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 6126 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 6127 } 6128 6129 void inflate_and_store_2_fp_registers(bool generatePrfm, 6130 FloatRegister src1, FloatRegister src2) { 6131 Register dst = r1; 6132 __ zip1(v1, __ T16B, src1, v0); 6133 __ zip2(v2, __ T16B, src1, v0); 6134 if (generatePrfm) { 6135 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 6136 } 6137 __ zip1(v3, __ T16B, src2, v0); 6138 __ zip2(v4, __ T16B, src2, v0); 6139 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 6140 } 6141 6142 // R0 = src 6143 // R1 = dst 6144 // R2 = len 6145 // R3 = len >> 3 6146 // V0 = 0 6147 // v1 = loaded 8 bytes 6148 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 6149 address generate_large_byte_array_inflate() { 6150 __ align(CodeEntryAlignment); 6151 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 6152 address entry = __ pc(); 6153 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 6154 Register src = r0, dst = r1, len = r2, octetCounter = r3; 6155 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 6156 6157 // do one more 8-byte read to have address 16-byte aligned in most cases 6158 // also use single store instruction 6159 __ ldrd(v2, __ post(src, 8)); 6160 __ sub(octetCounter, octetCounter, 2); 6161 __ zip1(v1, __ T16B, v1, v0); 6162 __ zip1(v2, __ T16B, v2, v0); 6163 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 6164 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6165 __ subs(rscratch1, octetCounter, large_loop_threshold); 6166 __ br(__ LE, LOOP_START); 6167 __ b(LOOP_PRFM_START); 6168 __ bind(LOOP_PRFM); 
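    // Each iteration inflates 64 Latin-1 bytes into 64 UTF-16 chars: zip1/zip2
    // with the zero register v0 interleaves a zero high byte after every source
    // byte (little-endian), i.e. roughly (our sketch, not the generated code):
    //
    //   for (int i = 0; i < 64; i++) {
    //     dst[i] = (jchar)(src[i] & 0xff);
    //   }
    //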
6169 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6170 __ bind(LOOP_PRFM_START); 6171 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 6172 __ sub(octetCounter, octetCounter, 8); 6173 __ subs(rscratch1, octetCounter, large_loop_threshold); 6174 inflate_and_store_2_fp_registers(true, v3, v4); 6175 inflate_and_store_2_fp_registers(true, v5, v6); 6176 __ br(__ GT, LOOP_PRFM); 6177 __ cmp(octetCounter, (u1)8); 6178 __ br(__ LT, DONE); 6179 __ bind(LOOP); 6180 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6181 __ bind(LOOP_START); 6182 __ sub(octetCounter, octetCounter, 8); 6183 __ cmp(octetCounter, (u1)8); 6184 inflate_and_store_2_fp_registers(false, v3, v4); 6185 inflate_and_store_2_fp_registers(false, v5, v6); 6186 __ br(__ GE, LOOP); 6187 __ bind(DONE); 6188 __ ret(lr); 6189 return entry; 6190 } 6191 6192 /** 6193 * Arguments: 6194 * 6195 * Input: 6196 * c_rarg0 - current state address 6197 * c_rarg1 - H key address 6198 * c_rarg2 - data address 6199 * c_rarg3 - number of blocks 6200 * 6201 * Output: 6202 * Updated state at c_rarg0 6203 */ 6204 address generate_ghash_processBlocks() { 6205 // Bafflingly, GCM uses little-endian for the byte order, but 6206 // big-endian for the bit order. For example, the polynomial 1 is 6207 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 6208 // 6209 // So, we must either reverse the bytes in each word and do 6210 // everything big-endian or reverse the bits in each byte and do 6211 // it little-endian. On AArch64 it's more idiomatic to reverse 6212 // the bits in each byte (we have an instruction, RBIT, to do 6213 // that) and keep the data in little-endian bit order through the 6214 // calculation, bit-reversing the inputs and outputs. 6215 6216 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 6217 __ align(wordSize * 2); 6218 address p = __ pc(); 6219 __ emit_int64(0x87); // The low-order bits of the field 6220 // polynomial (i.e. 
p = z^7+z^2+z+1) 6221 // repeated in the low and high parts of a 6222 // 128-bit vector 6223 __ emit_int64(0x87); 6224 6225 __ align(CodeEntryAlignment); 6226 address start = __ pc(); 6227 6228 Register state = c_rarg0; 6229 Register subkeyH = c_rarg1; 6230 Register data = c_rarg2; 6231 Register blocks = c_rarg3; 6232 6233 FloatRegister vzr = v30; 6234 __ eor(vzr, __ T16B, vzr, vzr); // zero register 6235 6236 __ ldrq(v24, p); // The field polynomial 6237 6238 __ ldrq(v0, Address(state)); 6239 __ ldrq(v1, Address(subkeyH)); 6240 6241 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 6242 __ rbit(v0, __ T16B, v0); 6243 __ rev64(v1, __ T16B, v1); 6244 __ rbit(v1, __ T16B, v1); 6245 6246 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 6247 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 6248 6249 { 6250 Label L_ghash_loop; 6251 __ bind(L_ghash_loop); 6252 6253 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 6254 // reversing each byte 6255 __ rbit(v2, __ T16B, v2); 6256 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 6257 6258 // Multiply state in v2 by subkey in v1 6259 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 6260 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 6261 /*temps*/v6, v3, /*reuse/clobber b*/v2); 6262 // Reduce v7:v5 by the field polynomial 6263 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 6264 6265 __ sub(blocks, blocks, 1); 6266 __ cbnz(blocks, L_ghash_loop); 6267 } 6268 6269 // The bit-reversed result is at this point in v0 6270 __ rev64(v0, __ T16B, v0); 6271 __ rbit(v0, __ T16B, v0); 6272 6273 __ st1(v0, __ T16B, state); 6274 __ ret(lr); 6275 6276 return start; 6277 } 6278 6279 address generate_ghash_processBlocks_wide() { 6280 address small = generate_ghash_processBlocks(); 6281 6282 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); 6283 __ align(wordSize * 2); 6284 address p = __ pc(); 6285 __ emit_int64(0x87); // The low-order bits of the field 6286 // polynomial (i.e. p = z^7+z^2+z+1) 6287 // repeated in the low and high parts of a 6288 // 128-bit vector 6289 __ emit_int64(0x87); 6290 6291 __ align(CodeEntryAlignment); 6292 address start = __ pc(); 6293 6294 Register state = c_rarg0; 6295 Register subkeyH = c_rarg1; 6296 Register data = c_rarg2; 6297 Register blocks = c_rarg3; 6298 6299 const int unroll = 4; 6300 6301 __ cmp(blocks, (unsigned char)(unroll * 2)); 6302 __ br(__ LT, small); 6303 6304 if (unroll > 1) { 6305 // Save state before entering routine 6306 __ sub(sp, sp, 4 * 16); 6307 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 6308 __ sub(sp, sp, 4 * 16); 6309 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 6310 } 6311 6312 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 6313 6314 if (unroll > 1) { 6315 // And restore state 6316 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 6317 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 6318 } 6319 6320 __ cmp(blocks, (unsigned char)0); 6321 __ br(__ GT, small); 6322 6323 __ ret(lr); 6324 6325 return start; 6326 } 6327 6328 void generate_base64_encode_simdround(Register src, Register dst, 6329 FloatRegister codec, u8 size) { 6330 6331 FloatRegister in0 = v4, in1 = v5, in2 = v6; 6332 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 6333 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 6334 6335 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 6336 6337 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 6338 6339 __ ushr(ind0, arrangement, in0, 2); 6340 6341 __ ushr(ind1, arrangement, in1, 2); 6342 __ shl(in0, arrangement, in0, 6); 6343 __ orr(ind1, arrangement, ind1, in0); 6344 __ ushr(ind1, arrangement, ind1, 2); 6345 6346 __ ushr(ind2, arrangement, in2, 4); 6347 __ shl(in1, arrangement, in1, 4); 6348 __ orr(ind2, arrangement, in1, ind2); 6349 __ ushr(ind2, arrangement, ind2, 2); 6350 6351 __ shl(ind3, arrangement, in2, 2); 6352 __ ushr(ind3, arrangement, ind3, 2); 6353 6354 __ tbl(out0, arrangement, codec, 4, ind0); 6355 __ tbl(out1, arrangement, codec, 4, ind1); 6356 __ tbl(out2, arrangement, codec, 4, ind2); 6357 __ tbl(out3, arrangement, codec, 4, ind3); 6358 6359 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 6360 } 6361 6362 /** 6363 * Arguments: 6364 * 6365 * Input: 6366 * c_rarg0 - src_start 6367 * c_rarg1 - src_offset 6368 * c_rarg2 - src_length 6369 * c_rarg3 - dest_start 6370 * c_rarg4 - dest_offset 6371 * c_rarg5 - isURL 6372 * 6373 */ 6374 address generate_base64_encodeBlock() { 6375 6376 static const char toBase64[64] = { 6377 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6378 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6379 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6380 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6381 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 6382 }; 6383 6384 static const char toBase64URL[64] = { 6385 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6386 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6387 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6388 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6389 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 6390 }; 6391 6392 __ align(CodeEntryAlignment); 6393 StubCodeMark mark(this, "StubRoutines", "encodeBlock"); 6394 address start = __ pc(); 6395 6396 Register src = c_rarg0; // source array 6397 Register soff = c_rarg1; // source start offset 6398 Register send = c_rarg2; // source end offset 6399 Register dst = c_rarg3; // dest array 6400 Register doff = c_rarg4; // position for writing to dest array 6401 Register isURL = c_rarg5; // Base64 or URL character set 6402 6403 // c_rarg6 and c_rarg7 are free to use as temps 6404 Register codec = c_rarg6; 6405 Register length = c_rarg7; 6406 6407 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 6408 6409 __ add(src, src, soff); 6410 __ add(dst, dst, doff); 6411 __ sub(length, send, soff); 6412 6413 // load the codec base address 6414 __ lea(codec, ExternalAddress((address) toBase64)); 6415 __ cbz(isURL, ProcessData); 6416 __ lea(codec, ExternalAddress((address) toBase64URL)); 6417 6418 __ BIND(ProcessData); 6419 6420 // too short to formup a SIMD loop, roll back 6421 __ cmp(length, (u1)24); 6422 __ br(Assembler::LT, Process3B); 6423 6424 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 6425 6426 __ BIND(Process48B); 6427 __ cmp(length, (u1)48); 6428 __ br(Assembler::LT, Process24B); 6429 generate_base64_encode_simdround(src, dst, v0, 16); 6430 __ sub(length, length, 48); 6431 __ b(Process48B); 6432 6433 __ BIND(Process24B); 6434 __ cmp(length, (u1)24); 6435 __ br(Assembler::LT, SIMDExit); 6436 generate_base64_encode_simdround(src, dst, v0, 8); 6437 __ sub(length, length, 24); 6438 6439 __ BIND(SIMDExit); 6440 __ cbz(length, Exit); 6441 6442 __ 
BIND(Process3B); 6443 // 3 src bytes, 24 bits 6444 __ ldrb(r10, __ post(src, 1)); 6445 __ ldrb(r11, __ post(src, 1)); 6446 __ ldrb(r12, __ post(src, 1)); 6447 __ orrw(r11, r11, r10, Assembler::LSL, 8); 6448 __ orrw(r12, r12, r11, Assembler::LSL, 8); 6449 // codec index 6450 __ ubfmw(r15, r12, 18, 23); 6451 __ ubfmw(r14, r12, 12, 17); 6452 __ ubfmw(r13, r12, 6, 11); 6453 __ andw(r12, r12, 63); 6454 // get the code based on the codec 6455 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 6456 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 6457 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 6458 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 6459 __ strb(r15, __ post(dst, 1)); 6460 __ strb(r14, __ post(dst, 1)); 6461 __ strb(r13, __ post(dst, 1)); 6462 __ strb(r12, __ post(dst, 1)); 6463 __ sub(length, length, 3); 6464 __ cbnz(length, Process3B); 6465 6466 __ BIND(Exit); 6467 __ ret(lr); 6468 6469 return start; 6470 } 6471 6472 void generate_base64_decode_simdround(Register src, Register dst, 6473 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 6474 6475 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 6476 FloatRegister out0 = v20, out1 = v21, out2 = v22; 6477 6478 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 6479 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 6480 6481 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 6482 6483 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 6484 6485 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 6486 6487 // we need unsigned saturating subtract, to make sure all input values 6488 // in range [0, 63] will have 0U value in the higher half lookup 6489 __ uqsubv(decH0, __ T16B, in0, v27); 6490 __ uqsubv(decH1, __ T16B, in1, v27); 6491 __ uqsubv(decH2, __ T16B, in2, v27); 6492 __ uqsubv(decH3, __ T16B, in3, v27); 6493 6494 // lower half lookup 6495 __ tbl(decL0, arrangement, codecL, 4, in0); 6496 __ tbl(decL1, arrangement, codecL, 4, in1); 6497 __ tbl(decL2, arrangement, codecL, 4, in2); 6498 __ tbl(decL3, arrangement, codecL, 4, in3); 6499 6500 // higher half lookup 6501 __ tbx(decH0, arrangement, codecH, 4, decH0); 6502 __ tbx(decH1, arrangement, codecH, 4, decH1); 6503 __ tbx(decH2, arrangement, codecH, 4, decH2); 6504 __ tbx(decH3, arrangement, codecH, 4, decH3); 6505 6506 // combine lower and higher 6507 __ orr(decL0, arrangement, decL0, decH0); 6508 __ orr(decL1, arrangement, decL1, decH1); 6509 __ orr(decL2, arrangement, decL2, decH2); 6510 __ orr(decL3, arrangement, decL3, decH3); 6511 6512 // check illegal inputs, value larger than 63 (maximum of 6 bits) 6513 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 6514 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 6515 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 6516 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 6517 __ orr(in0, arrangement, decH0, decH1); 6518 __ orr(in1, arrangement, decH2, decH3); 6519 __ orr(in2, arrangement, in0, in1); 6520 __ umaxv(in3, arrangement, in2); 6521 __ umov(rscratch2, in3, __ B, 0); 6522 6523 // get the data to output 6524 __ shl(out0, arrangement, decL0, 2); 6525 __ ushr(out1, arrangement, decL1, 4); 6526 __ orr(out0, arrangement, out0, out1); 6527 __ shl(out1, arrangement, decL1, 4); 6528 __ ushr(out2, arrangement, decL2, 2); 6529 __ orr(out1, arrangement, out1, out2); 6530 __ shl(out2, arrangement, decL2, 6); 6531 __ orr(out2, arrangement, out2, decL3); 6532 6533 __ cbz(rscratch2, NoIllegalData); 6534 
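    // Recap of the per-lane recombination above (our sketch, not the generated
    // code): each group of four decoded sextets dec0..dec3 (8-bit lanes, values
    // 0..63) contributes 6 bits to each of 3 output bytes:
    //
    //   out0 = (uint8_t)((dec0 << 2) | (dec1 >> 4));
    //   out1 = (uint8_t)((dec1 << 4) | (dec2 >> 2));
    //   out2 = (uint8_t)((dec2 << 6) |  dec3);
    //
    // rscratch2 is non-zero here iff some lane's combined lookup produced a
    // value larger than 63 (see the cm/umaxv sequence above), i.e. an illegal
    // Base64 character; that case is handled below.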
6535 // handle illegal input 6536 __ umov(r10, in2, __ D, 0); 6537 if (size == 16) { 6538 __ cbnz(r10, ErrorInLowerHalf); 6539 6540 // illegal input is in higher half, store the lower half now. 6541 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 6542 6543 __ umov(r10, in2, __ D, 1); 6544 __ umov(r11, out0, __ D, 1); 6545 __ umov(r12, out1, __ D, 1); 6546 __ umov(r13, out2, __ D, 1); 6547 __ b(StoreLegalData); 6548 6549 __ BIND(ErrorInLowerHalf); 6550 } 6551 __ umov(r11, out0, __ D, 0); 6552 __ umov(r12, out1, __ D, 0); 6553 __ umov(r13, out2, __ D, 0); 6554 6555 __ BIND(StoreLegalData); 6556 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 6557 __ strb(r11, __ post(dst, 1)); 6558 __ strb(r12, __ post(dst, 1)); 6559 __ strb(r13, __ post(dst, 1)); 6560 __ lsr(r10, r10, 8); 6561 __ lsr(r11, r11, 8); 6562 __ lsr(r12, r12, 8); 6563 __ lsr(r13, r13, 8); 6564 __ b(StoreLegalData); 6565 6566 __ BIND(NoIllegalData); 6567 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 6568 } 6569 6570 6571 /** 6572 * Arguments: 6573 * 6574 * Input: 6575 * c_rarg0 - src_start 6576 * c_rarg1 - src_offset 6577 * c_rarg2 - src_length 6578 * c_rarg3 - dest_start 6579 * c_rarg4 - dest_offset 6580 * c_rarg5 - isURL 6581 * c_rarg6 - isMIME 6582 * 6583 */ 6584 address generate_base64_decodeBlock() { 6585 6586 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 6587 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 6588 // titled "Base64 decoding". 6589 6590 // Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64, 6591 // except the trailing character '=' is also treated as an illegal value in this intrinsic. That 6592 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
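    // In C, each iteration of the non-SIMD loop (Process4B below) uses one of
    // these tables to decode four characters into three bytes, approximately
    // (variable names here are illustrative only):
    //
    //   int c0 = codec[src[0]], c1 = codec[src[1]],
    //       c2 = codec[src[2]], c3 = codec[src[3]];
    //   if ((c0 | c1 | c2 | c3) & 0x80) goto Exit;  // 255 marks illegal input
    //   dst[0] = (c0 << 2) | (c1 >> 4);
    //   dst[1] = (c1 << 4) | (c2 >> 2);
    //   dst[2] = (c2 << 6) |  c3;
    //   src += 4; dst += 3;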
6593 static const uint8_t fromBase64ForNoSIMD[256] = { 6594 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6595 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6596 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6597 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6598 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6599 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 6600 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6601 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6602 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6603 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6604 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6605 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6606 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6607 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6608 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6609 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6610 }; 6611 6612 static const uint8_t fromBase64URLForNoSIMD[256] = { 6613 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6614 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6615 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6616 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6617 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6618 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 6619 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6620 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6621 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6622 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6623 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6624 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6625 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6626 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6627 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6628 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6629 }; 6630 6631 // A legal value of base64 code is in range [0, 127]. We need two lookups 6632 // with tbl/tbx and combine them to get the decode data. The 1st table vector 6633 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 6634 // table vector lookup use tbx, out of range indices are unchanged in 6635 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 6636 // The value of index 64 is set to 0, so that we know that we already get the 6637 // decoded data with the 1st lookup. 6638 static const uint8_t fromBase64ForSIMD[128] = { 6639 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6640 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6641 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6642 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6643 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6644 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6645 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6646 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6647 }; 6648 6649 static const uint8_t fromBase64URLForSIMD[128] = { 6650 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6651 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6652 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6653 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6654 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6655 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6656 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6657 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6658 }; 6659 6660 __ align(CodeEntryAlignment); 6661 StubCodeMark mark(this, "StubRoutines", "decodeBlock"); 6662 address start = __ pc(); 6663 6664 Register src = c_rarg0; // source array 6665 Register soff = c_rarg1; // source start offset 6666 Register send = c_rarg2; // source end offset 6667 Register dst = c_rarg3; // dest array 6668 Register doff = c_rarg4; // position for writing to dest array 6669 Register isURL = c_rarg5; // Base64 or URL character set 6670 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 6671 6672 Register length = send; // reuse send as length of source data to process 6673 6674 Register simd_codec = c_rarg6; 6675 Register nosimd_codec = c_rarg7; 6676 6677 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 6678 6679 __ enter(); 6680 6681 __ add(src, src, soff); 6682 __ add(dst, dst, doff); 6683 6684 __ mov(doff, dst); 6685 6686 __ sub(length, send, soff); 6687 __ bfm(length, zr, 0, 1); 6688 6689 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 6690 __ cbz(isURL, ProcessData); 6691 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 6692 6693 __ BIND(ProcessData); 6694 __ mov(rscratch1, length); 6695 __ cmp(length, (u1)144); // 144 = 80 + 64 6696 __ br(Assembler::LT, Process4B); 6697 6698 // In the MIME case, the line length cannot be more than 76 6699 // bytes (see RFC 2045). This is too short a block for SIMD 6700 // to be worthwhile, so we use non-SIMD here. 
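    // Cap the scalar pre-pass at 80 bytes: the counter below starts at 79 and
    // is decremented by 4 per iteration, so it leaves the Process4B loop at -1
    // after 20 iterations. The remaining length (a multiple of 4 and at least
    // 64 bytes at this point) is then handled by the SIMD loops further down,
    // with any sub-32-byte tail coming back to Process4B.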
6701 __ movw(rscratch1, 79); 6702 6703 __ BIND(Process4B); 6704 __ ldrw(r14, __ post(src, 4)); 6705 __ ubfxw(r10, r14, 0, 8); 6706 __ ubfxw(r11, r14, 8, 8); 6707 __ ubfxw(r12, r14, 16, 8); 6708 __ ubfxw(r13, r14, 24, 8); 6709 // get the de-code 6710 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 6711 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 6712 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 6713 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 6714 // error detection, 255u indicates an illegal input 6715 __ orrw(r14, r10, r11); 6716 __ orrw(r15, r12, r13); 6717 __ orrw(r14, r14, r15); 6718 __ tbnz(r14, 7, Exit); 6719 // recover the data 6720 __ lslw(r14, r10, 10); 6721 __ bfiw(r14, r11, 4, 6); 6722 __ bfmw(r14, r12, 2, 5); 6723 __ rev16w(r14, r14); 6724 __ bfiw(r13, r12, 6, 2); 6725 __ strh(r14, __ post(dst, 2)); 6726 __ strb(r13, __ post(dst, 1)); 6727 // non-simd loop 6728 __ subsw(rscratch1, rscratch1, 4); 6729 __ br(Assembler::GT, Process4B); 6730 6731 // if exiting from PreProcess80B, rscratch1 == -1; 6732 // otherwise, rscratch1 == 0. 6733 __ cbzw(rscratch1, Exit); 6734 __ sub(length, length, 80); 6735 6736 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 6737 __ cbz(isURL, SIMDEnter); 6738 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 6739 6740 __ BIND(SIMDEnter); 6741 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 6742 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 6743 __ mov(rscratch1, 63); 6744 __ dup(v27, __ T16B, rscratch1); 6745 6746 __ BIND(Process64B); 6747 __ cmp(length, (u1)64); 6748 __ br(Assembler::LT, Process32B); 6749 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 6750 __ sub(length, length, 64); 6751 __ b(Process64B); 6752 6753 __ BIND(Process32B); 6754 __ cmp(length, (u1)32); 6755 __ br(Assembler::LT, SIMDExit); 6756 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 6757 __ sub(length, length, 32); 6758 __ b(Process32B); 6759 6760 __ BIND(SIMDExit); 6761 __ cbz(length, Exit); 6762 __ movw(rscratch1, length); 6763 __ b(Process4B); 6764 6765 __ BIND(Exit); 6766 __ sub(c_rarg0, dst, doff); 6767 6768 __ leave(); 6769 __ ret(lr); 6770 6771 return start; 6772 } 6773 6774 // Support for spin waits. 6775 address generate_spin_wait() { 6776 __ align(CodeEntryAlignment); 6777 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 6778 address start = __ pc(); 6779 6780 __ spin_wait(); 6781 __ ret(lr); 6782 6783 return start; 6784 } 6785 6786 address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) { 6787 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table"); 6788 6789 address start = __ pc(); 6790 const Register 6791 r_super_klass = r0, 6792 r_array_base = r1, 6793 r_array_length = r2, 6794 r_array_index = r3, 6795 r_sub_klass = r4, 6796 r_bitmap = rscratch2, 6797 result = r5; 6798 const FloatRegister 6799 vtemp = v0; 6800 6801 Label L_success; 6802 __ enter(); 6803 __ lookup_secondary_supers_table(r_sub_klass, r_super_klass, 6804 r_array_base, r_array_length, r_array_index, 6805 vtemp, result, super_klass_index, 6806 /*stub_is_near*/true); 6807 __ leave(); 6808 __ ret(lr); 6809 6810 return start; 6811 } 6812 6813 // Slow path implementation for UseSecondarySupersTable. 
6814 address generate_lookup_secondary_supers_table_slow_path_stub() { 6815 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path"); 6816 6817 address start = __ pc(); 6818 const Register 6819 r_super_klass = r0, // argument 6820 r_array_base = r1, // argument 6821 temp1 = r2, // temp 6822 r_array_index = r3, // argument 6823 r_bitmap = rscratch2, // argument 6824 result = r5; // argument 6825 6826 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 6827 __ ret(lr); 6828 6829 return start; 6830 } 6831 6832 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 6833 6834 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 6835 // 6836 // If LSE is in use, generate LSE versions of all the stubs. The 6837 // non-LSE versions are in atomic_aarch64.S. 6838 6839 // class AtomicStubMark records the entry point of a stub and the 6840 // stub pointer which will point to it. The stub pointer is set to 6841 // the entry point when ~AtomicStubMark() is called, which must be 6842 // after ICache::invalidate_range. This ensures safe publication of 6843 // the generated code. 6844 class AtomicStubMark { 6845 address _entry_point; 6846 aarch64_atomic_stub_t *_stub; 6847 MacroAssembler *_masm; 6848 public: 6849 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 6850 _masm = masm; 6851 __ align(32); 6852 _entry_point = __ pc(); 6853 _stub = stub; 6854 } 6855 ~AtomicStubMark() { 6856 *_stub = (aarch64_atomic_stub_t)_entry_point; 6857 } 6858 }; 6859 6860 // NB: For memory_order_conservative we need a trailing membar after 6861 // LSE atomic operations but not a leading membar. 6862 // 6863 // We don't need a leading membar because a clause in the Arm ARM 6864 // says: 6865 // 6866 // Barrier-ordered-before 6867 // 6868 // Barrier instructions order prior Memory effects before subsequent 6869 // Memory effects generated by the same Observer. A read or a write 6870 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 6871 // Observer if and only if RW1 appears in program order before RW 2 6872 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 6873 // instruction with both Acquire and Release semantics. 6874 // 6875 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 6876 // and Release semantics, therefore we don't need a leading 6877 // barrier. However, there is no corresponding Barrier-ordered-after 6878 // relationship, therefore we need a trailing membar to prevent a 6879 // later store or load from being reordered with the store in an 6880 // atomic instruction. 6881 // 6882 // This was checked by using the herd7 consistency model simulator 6883 // (http://diy.inria.fr/) with this test case: 6884 // 6885 // AArch64 LseCas 6886 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 6887 // P0 | P1; 6888 // LDR W4, [X2] | MOV W3, #0; 6889 // DMB LD | MOV W4, #1; 6890 // LDR W3, [X1] | CASAL W3, W4, [X1]; 6891 // | DMB ISH; 6892 // | STR W4, [X2]; 6893 // exists 6894 // (0:X3=0 /\ 0:X4=1) 6895 // 6896 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 6897 // with the store to x in P1. Without the DMB in P1 this may happen. 6898 // 6899 // At the time of writing we don't know of any AArch64 hardware that 6900 // reorders stores in this way, but the Reference Manual permits it. 
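  // For example, the memory_order_conservative 8-byte CAS stub emitted by
  // gen_cas_entry below is roughly (the barrier encoding is approximate):
  //
  //   mov   x3, x1            // prev = compare_val
  //   casal x3, x2, [x0]      // CAS with Acquire and Release semantics
  //   dmb   ish               // trailing full barrier, as discussed above
  //   mov   x0, x3            // return the previous value
  //   ret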
6901 6902 void gen_cas_entry(Assembler::operand_size size, 6903 atomic_memory_order order) { 6904 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 6905 exchange_val = c_rarg2; 6906 bool acquire, release; 6907 switch (order) { 6908 case memory_order_relaxed: 6909 acquire = false; 6910 release = false; 6911 break; 6912 case memory_order_release: 6913 acquire = false; 6914 release = true; 6915 break; 6916 default: 6917 acquire = true; 6918 release = true; 6919 break; 6920 } 6921 __ mov(prev, compare_val); 6922 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 6923 if (order == memory_order_conservative) { 6924 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6925 } 6926 if (size == Assembler::xword) { 6927 __ mov(r0, prev); 6928 } else { 6929 __ movw(r0, prev); 6930 } 6931 __ ret(lr); 6932 } 6933 6934 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 6935 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6936 // If not relaxed, then default to conservative. Relaxed is the only 6937 // case we use enough to be worth specializing. 6938 if (order == memory_order_relaxed) { 6939 __ ldadd(size, incr, prev, addr); 6940 } else { 6941 __ ldaddal(size, incr, prev, addr); 6942 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6943 } 6944 if (size == Assembler::xword) { 6945 __ mov(r0, prev); 6946 } else { 6947 __ movw(r0, prev); 6948 } 6949 __ ret(lr); 6950 } 6951 6952 void gen_swpal_entry(Assembler::operand_size size) { 6953 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6954 __ swpal(size, incr, prev, addr); 6955 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6956 if (size == Assembler::xword) { 6957 __ mov(r0, prev); 6958 } else { 6959 __ movw(r0, prev); 6960 } 6961 __ ret(lr); 6962 } 6963 6964 void generate_atomic_entry_points() { 6965 if (! 
UseLSE) { 6966 return; 6967 } 6968 6969 __ align(CodeEntryAlignment); 6970 StubCodeMark mark(this, "StubRoutines", "atomic entry points"); 6971 address first_entry = __ pc(); 6972 6973 // ADD, memory_order_conservative 6974 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 6975 gen_ldadd_entry(Assembler::word, memory_order_conservative); 6976 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 6977 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 6978 6979 // ADD, memory_order_relaxed 6980 AtomicStubMark mark_fetch_add_4_relaxed 6981 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 6982 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 6983 AtomicStubMark mark_fetch_add_8_relaxed 6984 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 6985 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 6986 6987 // XCHG, memory_order_conservative 6988 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 6989 gen_swpal_entry(Assembler::word); 6990 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 6991 gen_swpal_entry(Assembler::xword); 6992 6993 // CAS, memory_order_conservative 6994 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 6995 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 6996 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 6997 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 6998 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 6999 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 7000 7001 // CAS, memory_order_relaxed 7002 AtomicStubMark mark_cmpxchg_1_relaxed 7003 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 7004 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 7005 AtomicStubMark mark_cmpxchg_4_relaxed 7006 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 7007 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 7008 AtomicStubMark mark_cmpxchg_8_relaxed 7009 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 7010 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 7011 7012 AtomicStubMark mark_cmpxchg_4_release 7013 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 7014 gen_cas_entry(MacroAssembler::word, memory_order_release); 7015 AtomicStubMark mark_cmpxchg_8_release 7016 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 7017 gen_cas_entry(MacroAssembler::xword, memory_order_release); 7018 7019 AtomicStubMark mark_cmpxchg_4_seq_cst 7020 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 7021 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 7022 AtomicStubMark mark_cmpxchg_8_seq_cst 7023 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 7024 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 7025 7026 ICache::invalidate_range(first_entry, __ pc() - first_entry); 7027 } 7028 #endif // LINUX 7029 7030 address generate_cont_thaw(Continuation::thaw_kind kind) { 7031 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 7032 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 7033 7034 address start = __ pc(); 7035 7036 if (return_barrier) { 7037 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 7038 __ mov(sp, rscratch1); 7039 } 7040 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7041 7042 if (return_barrier) { 7043 // preserve possible return value from a method 
returning to the return barrier 7044 __ fmovd(rscratch1, v0); 7045 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7046 } 7047 7048 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 7049 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 7050 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 7051 7052 if (return_barrier) { 7053 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7054 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7055 __ fmovd(v0, rscratch1); 7056 } 7057 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7058 7059 7060 Label thaw_success; 7061 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 7062 __ cbnz(rscratch2, thaw_success); 7063 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 7064 __ br(rscratch1); 7065 __ bind(thaw_success); 7066 7067 // make room for the thawed frames 7068 __ sub(rscratch1, sp, rscratch2); 7069 __ andr(rscratch1, rscratch1, -16); // align 7070 __ mov(sp, rscratch1); 7071 7072 if (return_barrier) { 7073 // save original return value -- again 7074 __ fmovd(rscratch1, v0); 7075 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7076 } 7077 7078 // If we want, we can templatize thaw by kind, and have three different entries 7079 __ movw(c_rarg1, (uint32_t)kind); 7080 7081 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 7082 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 7083 7084 if (return_barrier) { 7085 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7086 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7087 __ fmovd(v0, rscratch1); 7088 } else { 7089 __ mov(r0, zr); // return 0 (success) from doYield 7090 } 7091 7092 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 7093 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 7094 __ mov(rfp, sp); 7095 7096 if (return_barrier_exception) { 7097 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 7098 __ authenticate_return_address(c_rarg1); 7099 __ verify_oop(r0); 7100 // save return value containing the exception oop in callee-saved R19 7101 __ mov(r19, r0); 7102 7103 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 7104 7105 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
7106 // __ reinitialize_ptrue(); 7107 7108 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 7109 7110 __ mov(r1, r0); // the exception handler 7111 __ mov(r0, r19); // restore return value containing the exception oop 7112 __ verify_oop(r0); 7113 7114 __ leave(); 7115 __ mov(r3, lr); 7116 __ br(r1); // the exception handler 7117 } else { 7118 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 7119 __ leave(); 7120 __ ret(lr); 7121 } 7122 7123 return start; 7124 } 7125 7126 address generate_cont_thaw() { 7127 if (!Continuations::enabled()) return nullptr; 7128 7129 StubCodeMark mark(this, "StubRoutines", "Cont thaw"); 7130 address start = __ pc(); 7131 generate_cont_thaw(Continuation::thaw_top); 7132 return start; 7133 } 7134 7135 address generate_cont_returnBarrier() { 7136 if (!Continuations::enabled()) return nullptr; 7137 7138 // TODO: will probably need multiple return barriers depending on return type 7139 StubCodeMark mark(this, "StubRoutines", "cont return barrier"); 7140 address start = __ pc(); 7141 7142 generate_cont_thaw(Continuation::thaw_return_barrier); 7143 7144 return start; 7145 } 7146 7147 address generate_cont_returnBarrier_exception() { 7148 if (!Continuations::enabled()) return nullptr; 7149 7150 StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler"); 7151 address start = __ pc(); 7152 7153 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 7154 7155 return start; 7156 } 7157 7158 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 7159 // are represented as long[5], with BITS_PER_LIMB = 26. 7160 // Pack five 26-bit limbs into three 64-bit registers. 7161 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 7162 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 7163 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 7164 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 7165 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 7166 7167 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 7168 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 7169 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 7170 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 7171 7172 if (dest2->is_valid()) { 7173 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 7174 } else { 7175 #ifdef ASSERT 7176 Label OK; 7177 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 7178 __ br(__ EQ, OK); 7179 __ stop("high bits of Poly1305 integer should be zero"); 7180 __ should_not_reach_here(); 7181 __ bind(OK); 7182 #endif 7183 } 7184 } 7185 7186 // As above, but return only a 128-bit integer, packed into two 7187 // 64-bit registers. 7188 void pack_26(Register dest0, Register dest1, Register src) { 7189 pack_26(dest0, dest1, noreg, src); 7190 } 7191 7192 // Multiply and multiply-accumulate unsigned 64-bit registers. 7193 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 7194 __ mul(prod_lo, n, m); 7195 __ umulh(prod_hi, n, m); 7196 } 7197 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 7198 wide_mul(rscratch1, rscratch2, n, m); 7199 __ adds(sum_lo, sum_lo, rscratch1); 7200 __ adc(sum_hi, sum_hi, rscratch2); 7201 } 7202 7203 // Poly1305, RFC 7539 7204 7205 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 7206 // description of the tricks used to simplify and accelerate this 7207 // computation. 
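  // In outline, for each 16-byte block B of the input (with a 1 appended just
  // above the block, i.e. at bit 128), the loop below computes
  //
  //   U = ((U + B) * R) mod (2^130 - 5)
  //
  // keeping the accumulator U in three 64-bit registers (U_2:U_1:U_0) and the
  // key R in two (R_1:R_0), and using 2^130 == 5 (mod 2^130 - 5) to fold high
  // bits back into the low limbs.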
7208 7209 address generate_poly1305_processBlocks() { 7210 __ align(CodeEntryAlignment); 7211 StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); 7212 address start = __ pc(); 7213 Label here; 7214 __ enter(); 7215 RegSet callee_saved = RegSet::range(r19, r28); 7216 __ push(callee_saved, sp); 7217 7218 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 7219 7220 // Arguments 7221 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 7222 7223 // R_n is the 128-bit randomly-generated key, packed into two 7224 // registers. The caller passes this key to us as long[5], with 7225 // BITS_PER_LIMB = 26. 7226 const Register R_0 = *++regs, R_1 = *++regs; 7227 pack_26(R_0, R_1, r_start); 7228 7229 // RR_n is (R_n >> 2) * 5 7230 const Register RR_0 = *++regs, RR_1 = *++regs; 7231 __ lsr(RR_0, R_0, 2); 7232 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 7233 __ lsr(RR_1, R_1, 2); 7234 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 7235 7236 // U_n is the current checksum 7237 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 7238 pack_26(U_0, U_1, U_2, acc_start); 7239 7240 static constexpr int BLOCK_LENGTH = 16; 7241 Label DONE, LOOP; 7242 7243 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7244 __ br(Assembler::LT, DONE); { 7245 __ bind(LOOP); 7246 7247 // S_n is to be the sum of U_n and the next block of data 7248 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 7249 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 7250 __ adds(S_0, U_0, S_0); 7251 __ adcs(S_1, U_1, S_1); 7252 __ adc(S_2, U_2, zr); 7253 __ add(S_2, S_2, 1); 7254 7255 const Register U_0HI = *++regs, U_1HI = *++regs; 7256 7257 // NB: this logic depends on some of the special properties of 7258 // Poly1305 keys. In particular, because we know that the top 7259 // four bits of R_0 and R_1 are zero, we can add together 7260 // partial products without any risk of needing to propagate a 7261 // carry out. 7262 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 7263 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 7264 __ andr(U_2, R_0, 3); 7265 __ mul(U_2, S_2, U_2); 7266 7267 // Recycle registers S_0, S_1, S_2 7268 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 7269 7270 // Partial reduction mod 2**130 - 5 7271 __ adds(U_1, U_0HI, U_1); 7272 __ adc(U_2, U_1HI, U_2); 7273 // Sum now in U_2:U_1:U_0. 7274 // Dead: U_0HI, U_1HI. 
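    // The partial reduction below relies on 2^130 == 5 (mod 2^130 - 5): the
    // bits of U_2 above bit 1 represent multiples of 2^130, so their
    // contribution can be folded back in as (U_2 >> 2) * 5, computed as
    // (U_2 >> 2) + ((U_2 >> 2) << 2).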
7275 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 7276 7277 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 7278 7279 // First, U_2:U_1:U_0 += (U_2 >> 2) 7280 __ lsr(rscratch1, U_2, 2); 7281 __ andr(U_2, U_2, (u8)3); 7282 __ adds(U_0, U_0, rscratch1); 7283 __ adcs(U_1, U_1, zr); 7284 __ adc(U_2, U_2, zr); 7285 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 7286 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 7287 __ adcs(U_1, U_1, zr); 7288 __ adc(U_2, U_2, zr); 7289 7290 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 7291 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7292 __ br(~ Assembler::LT, LOOP); 7293 } 7294 7295 // Further reduce modulo 2^130 - 5 7296 __ lsr(rscratch1, U_2, 2); 7297 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 7298 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 7299 __ adcs(U_1, U_1, zr); 7300 __ andr(U_2, U_2, (u1)3); 7301 __ adc(U_2, U_2, zr); 7302 7303 // Unpack the sum into five 26-bit limbs and write to memory. 7304 __ ubfiz(rscratch1, U_0, 0, 26); 7305 __ ubfx(rscratch2, U_0, 26, 26); 7306 __ stp(rscratch1, rscratch2, Address(acc_start)); 7307 __ ubfx(rscratch1, U_0, 52, 12); 7308 __ bfi(rscratch1, U_1, 12, 14); 7309 __ ubfx(rscratch2, U_1, 14, 26); 7310 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 7311 __ ubfx(rscratch1, U_1, 40, 24); 7312 __ bfi(rscratch1, U_2, 24, 3); 7313 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 7314 7315 __ bind(DONE); 7316 __ pop(callee_saved, sp); 7317 __ leave(); 7318 __ ret(lr); 7319 7320 return start; 7321 } 7322 7323 // exception handler for upcall stubs 7324 address generate_upcall_stub_exception_handler() { 7325 StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler"); 7326 address start = __ pc(); 7327 7328 // Native caller has no idea how to handle exceptions, 7329 // so we just crash here. Up to callee to catch exceptions. 
7330 __ verify_oop(r0); 7331 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 7332 __ blr(rscratch1); 7333 __ should_not_reach_here(); 7334 7335 return start; 7336 } 7337 7338 // load Method* target of MethodHandle 7339 // j_rarg0 = jobject receiver 7340 // rmethod = result 7341 address generate_upcall_stub_load_target() { 7342 StubCodeMark mark(this, "StubRoutines", "upcall_stub_load_target"); 7343 address start = __ pc(); 7344 7345 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 7346 // Load target method from receiver 7347 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 7348 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 7349 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 7350 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 7351 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 7352 noreg, noreg); 7353 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 7354 7355 __ ret(lr); 7356 7357 return start; 7358 } 7359 7360 #undef __ 7361 #define __ masm-> 7362 7363 class MontgomeryMultiplyGenerator : public MacroAssembler { 7364 7365 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7366 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7367 7368 RegSet _toSave; 7369 bool _squaring; 7370 7371 public: 7372 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7373 : MacroAssembler(as->code()), _squaring(squaring) { 7374 7375 // Register allocation 7376 7377 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7378 Pa_base = *regs; // Argument registers 7379 if (squaring) 7380 Pb_base = Pa_base; 7381 else 7382 Pb_base = *++regs; 7383 Pn_base = *++regs; 7384 Rlen= *++regs; 7385 inv = *++regs; 7386 Pm_base = *++regs; 7387 7388 // Working registers: 7389 Ra = *++regs; // The current digit of a, b, n, and m. 7390 Rb = *++regs; 7391 Rm = *++regs; 7392 Rn = *++regs; 7393 7394 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 7395 Pb = *++regs; 7396 Pm = *++regs; 7397 Pn = *++regs; 7398 7399 t0 = *++regs; // Three registers which form a 7400 t1 = *++regs; // triple-precision accumuator. 7401 t2 = *++regs; 7402 7403 Ri = *++regs; // Inner and outer loop indexes. 7404 Rj = *++regs; 7405 7406 Rhi_ab = *++regs; // Product registers: low and high parts 7407 Rlo_ab = *++regs; // of a*b and m*n. 7408 Rhi_mn = *++regs; 7409 Rlo_mn = *++regs; 7410 7411 // r19 and up are callee-saved. 
7412 _toSave = RegSet::range(r19, *regs) + Pm_base; 7413 } 7414 7415 private: 7416 void save_regs() { 7417 push(_toSave, sp); 7418 } 7419 7420 void restore_regs() { 7421 pop(_toSave, sp); 7422 } 7423 7424 template <typename T> 7425 void unroll_2(Register count, T block) { 7426 Label loop, end, odd; 7427 tbnz(count, 0, odd); 7428 cbz(count, end); 7429 align(16); 7430 bind(loop); 7431 (this->*block)(); 7432 bind(odd); 7433 (this->*block)(); 7434 subs(count, count, 2); 7435 br(Assembler::GT, loop); 7436 bind(end); 7437 } 7438 7439 template <typename T> 7440 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 7441 Label loop, end, odd; 7442 tbnz(count, 0, odd); 7443 cbz(count, end); 7444 align(16); 7445 bind(loop); 7446 (this->*block)(d, s, tmp); 7447 bind(odd); 7448 (this->*block)(d, s, tmp); 7449 subs(count, count, 2); 7450 br(Assembler::GT, loop); 7451 bind(end); 7452 } 7453 7454 void pre1(RegisterOrConstant i) { 7455 block_comment("pre1"); 7456 // Pa = Pa_base; 7457 // Pb = Pb_base + i; 7458 // Pm = Pm_base; 7459 // Pn = Pn_base + i; 7460 // Ra = *Pa; 7461 // Rb = *Pb; 7462 // Rm = *Pm; 7463 // Rn = *Pn; 7464 ldr(Ra, Address(Pa_base)); 7465 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7466 ldr(Rm, Address(Pm_base)); 7467 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7468 lea(Pa, Address(Pa_base)); 7469 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7470 lea(Pm, Address(Pm_base)); 7471 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7472 7473 // Zero the m*n result. 7474 mov(Rhi_mn, zr); 7475 mov(Rlo_mn, zr); 7476 } 7477 7478 // The core multiply-accumulate step of a Montgomery 7479 // multiplication. The idea is to schedule operations as a 7480 // pipeline so that instructions with long latencies (loads and 7481 // multiplies) have time to complete before their results are 7482 // used. This most benefits in-order implementations of the 7483 // architecture but out-of-order ones also benefit. 7484 void step() { 7485 block_comment("step"); 7486 // MACC(Ra, Rb, t0, t1, t2); 7487 // Ra = *++Pa; 7488 // Rb = *--Pb; 7489 umulh(Rhi_ab, Ra, Rb); 7490 mul(Rlo_ab, Ra, Rb); 7491 ldr(Ra, pre(Pa, wordSize)); 7492 ldr(Rb, pre(Pb, -wordSize)); 7493 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 7494 // previous iteration. 7495 // MACC(Rm, Rn, t0, t1, t2); 7496 // Rm = *++Pm; 7497 // Rn = *--Pn; 7498 umulh(Rhi_mn, Rm, Rn); 7499 mul(Rlo_mn, Rm, Rn); 7500 ldr(Rm, pre(Pm, wordSize)); 7501 ldr(Rn, pre(Pn, -wordSize)); 7502 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7503 } 7504 7505 void post1() { 7506 block_comment("post1"); 7507 7508 // MACC(Ra, Rb, t0, t1, t2); 7509 // Ra = *++Pa; 7510 // Rb = *--Pb; 7511 umulh(Rhi_ab, Ra, Rb); 7512 mul(Rlo_ab, Ra, Rb); 7513 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7514 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7515 7516 // *Pm = Rm = t0 * inv; 7517 mul(Rm, t0, inv); 7518 str(Rm, Address(Pm)); 7519 7520 // MACC(Rm, Rn, t0, t1, t2); 7521 // t0 = t1; t1 = t2; t2 = 0; 7522 umulh(Rhi_mn, Rm, Rn); 7523 7524 #ifndef PRODUCT 7525 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7526 { 7527 mul(Rlo_mn, Rm, Rn); 7528 add(Rlo_mn, t0, Rlo_mn); 7529 Label ok; 7530 cbz(Rlo_mn, ok); { 7531 stop("broken Montgomery multiply"); 7532 } bind(ok); 7533 } 7534 #endif 7535 // We have very carefully set things up so that 7536 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7537 // the lower half of Rm * Rn because we know the result already: 7538 // it must be -t0. 
t0 + (-t0) must generate a carry iff 7539 // t0 != 0. So, rather than do a mul and an adds we just set 7540 // the carry flag iff t0 is nonzero. 7541 // 7542 // mul(Rlo_mn, Rm, Rn); 7543 // adds(zr, t0, Rlo_mn); 7544 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7545 adcs(t0, t1, Rhi_mn); 7546 adc(t1, t2, zr); 7547 mov(t2, zr); 7548 } 7549 7550 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 7551 block_comment("pre2"); 7552 // Pa = Pa_base + i-len; 7553 // Pb = Pb_base + len; 7554 // Pm = Pm_base + i-len; 7555 // Pn = Pn_base + len; 7556 7557 if (i.is_register()) { 7558 sub(Rj, i.as_register(), len); 7559 } else { 7560 mov(Rj, i.as_constant()); 7561 sub(Rj, Rj, len); 7562 } 7563 // Rj == i-len 7564 7565 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 7566 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 7567 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7568 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 7569 7570 // Ra = *++Pa; 7571 // Rb = *--Pb; 7572 // Rm = *++Pm; 7573 // Rn = *--Pn; 7574 ldr(Ra, pre(Pa, wordSize)); 7575 ldr(Rb, pre(Pb, -wordSize)); 7576 ldr(Rm, pre(Pm, wordSize)); 7577 ldr(Rn, pre(Pn, -wordSize)); 7578 7579 mov(Rhi_mn, zr); 7580 mov(Rlo_mn, zr); 7581 } 7582 7583 void post2(RegisterOrConstant i, RegisterOrConstant len) { 7584 block_comment("post2"); 7585 if (i.is_constant()) { 7586 mov(Rj, i.as_constant()-len.as_constant()); 7587 } else { 7588 sub(Rj, i.as_register(), len); 7589 } 7590 7591 adds(t0, t0, Rlo_mn); // The pending m*n, low part 7592 7593 // As soon as we know the least significant digit of our result, 7594 // store it. 7595 // Pm_base[i-len] = t0; 7596 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7597 7598 // t0 = t1; t1 = t2; t2 = 0; 7599 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 7600 adc(t1, t2, zr); 7601 mov(t2, zr); 7602 } 7603 7604 // A carry in t0 after Montgomery multiplication means that we 7605 // should subtract multiples of n from our result in m. We'll 7606 // keep doing that until there is no carry. 7607 void normalize(RegisterOrConstant len) { 7608 block_comment("normalize"); 7609 // while (t0) 7610 // t0 = sub(Pm_base, Pn_base, t0, len); 7611 Label loop, post, again; 7612 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 7613 cbz(t0, post); { 7614 bind(again); { 7615 mov(i, zr); 7616 mov(cnt, len); 7617 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7618 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7619 subs(zr, zr, zr); // set carry flag, i.e. no borrow 7620 align(16); 7621 bind(loop); { 7622 sbcs(Rm, Rm, Rn); 7623 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7624 add(i, i, 1); 7625 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7626 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7627 sub(cnt, cnt, 1); 7628 } cbnz(cnt, loop); 7629 sbc(t0, t0, zr); 7630 } cbnz(t0, again); 7631 } bind(post); 7632 } 7633 7634 // Move memory at s to d, reversing words. 
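  // In addition to reversing the order of the 64-bit words, each copied word
  // has its two 32-bit halves swapped (the ror by 32 in reverse1 below), since
  // the caller passes jint arrays while the multiply works on julong digits.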
7635 // Increments d to end of copied memory 7636 // Destroys tmp1, tmp2 7637 // Preserves len 7638 // Leaves s pointing to the address which was in d at start 7639 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 7640 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 7641 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 7642 7643 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 7644 mov(tmp1, len); 7645 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 7646 sub(s, d, len, ext::uxtw, LogBytesPerWord); 7647 } 7648 // where 7649 void reverse1(Register d, Register s, Register tmp) { 7650 ldr(tmp, pre(s, -wordSize)); 7651 ror(tmp, tmp, 32); 7652 str(tmp, post(d, wordSize)); 7653 } 7654 7655 void step_squaring() { 7656 // An extra ACC 7657 step(); 7658 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7659 } 7660 7661 void last_squaring(RegisterOrConstant i) { 7662 Label dont; 7663 // if ((i & 1) == 0) { 7664 tbnz(i.as_register(), 0, dont); { 7665 // MACC(Ra, Rb, t0, t1, t2); 7666 // Ra = *++Pa; 7667 // Rb = *--Pb; 7668 umulh(Rhi_ab, Ra, Rb); 7669 mul(Rlo_ab, Ra, Rb); 7670 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7671 } bind(dont); 7672 } 7673 7674 void extra_step_squaring() { 7675 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7676 7677 // MACC(Rm, Rn, t0, t1, t2); 7678 // Rm = *++Pm; 7679 // Rn = *--Pn; 7680 umulh(Rhi_mn, Rm, Rn); 7681 mul(Rlo_mn, Rm, Rn); 7682 ldr(Rm, pre(Pm, wordSize)); 7683 ldr(Rn, pre(Pn, -wordSize)); 7684 } 7685 7686 void post1_squaring() { 7687 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7688 7689 // *Pm = Rm = t0 * inv; 7690 mul(Rm, t0, inv); 7691 str(Rm, Address(Pm)); 7692 7693 // MACC(Rm, Rn, t0, t1, t2); 7694 // t0 = t1; t1 = t2; t2 = 0; 7695 umulh(Rhi_mn, Rm, Rn); 7696 7697 #ifndef PRODUCT 7698 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7699 { 7700 mul(Rlo_mn, Rm, Rn); 7701 add(Rlo_mn, t0, Rlo_mn); 7702 Label ok; 7703 cbz(Rlo_mn, ok); { 7704 stop("broken Montgomery multiply"); 7705 } bind(ok); 7706 } 7707 #endif 7708 // We have very carefully set things up so that 7709 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7710 // the lower half of Rm * Rn because we know the result already: 7711 // it must be -t0. t0 + (-t0) must generate a carry iff 7712 // t0 != 0. So, rather than do a mul and an adds we just set 7713 // the carry flag iff t0 is nonzero. 7714 // 7715 // mul(Rlo_mn, Rm, Rn); 7716 // adds(zr, t0, Rlo_mn); 7717 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7718 adcs(t0, t1, Rhi_mn); 7719 adc(t1, t2, zr); 7720 mov(t2, zr); 7721 } 7722 7723 void acc(Register Rhi, Register Rlo, 7724 Register t0, Register t1, Register t2) { 7725 adds(t0, t0, Rlo); 7726 adcs(t1, t1, Rhi); 7727 adc(t2, t2, zr); 7728 } 7729 7730 public: 7731 /** 7732 * Fast Montgomery multiplication. The derivation of the 7733 * algorithm is in A Cryptographic Library for the Motorola 7734 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
7735 * 7736 * Arguments: 7737 * 7738 * Inputs for multiplication: 7739 * c_rarg0 - int array elements a 7740 * c_rarg1 - int array elements b 7741 * c_rarg2 - int array elements n (the modulus) 7742 * c_rarg3 - int length 7743 * c_rarg4 - int inv 7744 * c_rarg5 - int array elements m (the result) 7745 * 7746 * Inputs for squaring: 7747 * c_rarg0 - int array elements a 7748 * c_rarg1 - int array elements n (the modulus) 7749 * c_rarg2 - int length 7750 * c_rarg3 - int inv 7751 * c_rarg4 - int array elements m (the result) 7752 * 7753 */ 7754 address generate_multiply() { 7755 Label argh, nothing; 7756 bind(argh); 7757 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7758 7759 align(CodeEntryAlignment); 7760 address entry = pc(); 7761 7762 cbzw(Rlen, nothing); 7763 7764 enter(); 7765 7766 // Make room. 7767 cmpw(Rlen, 512); 7768 br(Assembler::HI, argh); 7769 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7770 andr(sp, Ra, -2 * wordSize); 7771 7772 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7773 7774 { 7775 // Copy input args, reversing as we go. We use Ra as a 7776 // temporary variable. 7777 reverse(Ra, Pa_base, Rlen, t0, t1); 7778 if (!_squaring) 7779 reverse(Ra, Pb_base, Rlen, t0, t1); 7780 reverse(Ra, Pn_base, Rlen, t0, t1); 7781 } 7782 7783 // Push all call-saved registers and also Pm_base which we'll need 7784 // at the end. 7785 save_regs(); 7786 7787 #ifndef PRODUCT 7788 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 7789 { 7790 ldr(Rn, Address(Pn_base, 0)); 7791 mul(Rlo_mn, Rn, inv); 7792 subs(zr, Rlo_mn, -1); 7793 Label ok; 7794 br(EQ, ok); { 7795 stop("broken inverse in Montgomery multiply"); 7796 } bind(ok); 7797 } 7798 #endif 7799 7800 mov(Pm_base, Ra); 7801 7802 mov(t0, zr); 7803 mov(t1, zr); 7804 mov(t2, zr); 7805 7806 block_comment("for (int i = 0; i < len; i++) {"); 7807 mov(Ri, zr); { 7808 Label loop, end; 7809 cmpw(Ri, Rlen); 7810 br(Assembler::GE, end); 7811 7812 bind(loop); 7813 pre1(Ri); 7814 7815 block_comment(" for (j = i; j; j--) {"); { 7816 movw(Rj, Ri); 7817 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7818 } block_comment(" } // j"); 7819 7820 post1(); 7821 addw(Ri, Ri, 1); 7822 cmpw(Ri, Rlen); 7823 br(Assembler::LT, loop); 7824 bind(end); 7825 block_comment("} // i"); 7826 } 7827 7828 block_comment("for (int i = len; i < 2*len; i++) {"); 7829 mov(Ri, Rlen); { 7830 Label loop, end; 7831 cmpw(Ri, Rlen, Assembler::LSL, 1); 7832 br(Assembler::GE, end); 7833 7834 bind(loop); 7835 pre2(Ri, Rlen); 7836 7837 block_comment(" for (j = len*2-i-1; j; j--) {"); { 7838 lslw(Rj, Rlen, 1); 7839 subw(Rj, Rj, Ri); 7840 subw(Rj, Rj, 1); 7841 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7842 } block_comment(" } // j"); 7843 7844 post2(Ri, Rlen); 7845 addw(Ri, Ri, 1); 7846 cmpw(Ri, Rlen, Assembler::LSL, 1); 7847 br(Assembler::LT, loop); 7848 bind(end); 7849 } 7850 block_comment("} // i"); 7851 7852 normalize(Rlen); 7853 7854 mov(Ra, Pm_base); // Save Pm_base in Ra 7855 restore_regs(); // Restore caller's Pm_base 7856 7857 // Copy our result into caller's Pm_base 7858 reverse(Pm_base, Ra, Rlen, t0, t1); 7859 7860 leave(); 7861 bind(nothing); 7862 ret(lr); 7863 7864 return entry; 7865 } 7866 // In C, approximately: 7867 7868 // void 7869 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 7870 // julong Pn_base[], julong Pm_base[], 7871 // julong inv, int len) { 7872 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7873 // julong *Pa, *Pb, *Pn, *Pm; 7874 // julong Ra, Rb, Rn, Rm; 7875 7876 // 
int i; 7877 7878 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7879 7880 // for (i = 0; i < len; i++) { 7881 // int j; 7882 7883 // Pa = Pa_base; 7884 // Pb = Pb_base + i; 7885 // Pm = Pm_base; 7886 // Pn = Pn_base + i; 7887 7888 // Ra = *Pa; 7889 // Rb = *Pb; 7890 // Rm = *Pm; 7891 // Rn = *Pn; 7892 7893 // int iters = i; 7894 // for (j = 0; iters--; j++) { 7895 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7896 // MACC(Ra, Rb, t0, t1, t2); 7897 // Ra = *++Pa; 7898 // Rb = *--Pb; 7899 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7900 // MACC(Rm, Rn, t0, t1, t2); 7901 // Rm = *++Pm; 7902 // Rn = *--Pn; 7903 // } 7904 7905 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 7906 // MACC(Ra, Rb, t0, t1, t2); 7907 // *Pm = Rm = t0 * inv; 7908 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7909 // MACC(Rm, Rn, t0, t1, t2); 7910 7911 // assert(t0 == 0, "broken Montgomery multiply"); 7912 7913 // t0 = t1; t1 = t2; t2 = 0; 7914 // } 7915 7916 // for (i = len; i < 2*len; i++) { 7917 // int j; 7918 7919 // Pa = Pa_base + i-len; 7920 // Pb = Pb_base + len; 7921 // Pm = Pm_base + i-len; 7922 // Pn = Pn_base + len; 7923 7924 // Ra = *++Pa; 7925 // Rb = *--Pb; 7926 // Rm = *++Pm; 7927 // Rn = *--Pn; 7928 7929 // int iters = len*2-i-1; 7930 // for (j = i-len+1; iters--; j++) { 7931 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7932 // MACC(Ra, Rb, t0, t1, t2); 7933 // Ra = *++Pa; 7934 // Rb = *--Pb; 7935 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7936 // MACC(Rm, Rn, t0, t1, t2); 7937 // Rm = *++Pm; 7938 // Rn = *--Pn; 7939 // } 7940 7941 // Pm_base[i-len] = t0; 7942 // t0 = t1; t1 = t2; t2 = 0; 7943 // } 7944 7945 // while (t0) 7946 // t0 = sub(Pm_base, Pn_base, t0, len); 7947 // } 7948 7949 /** 7950 * Fast Montgomery squaring. This uses asymptotically 25% fewer 7951 * multiplies than Montgomery multiplication so it should be up to 7952 * 25% faster. However, its loop control is more complex and it 7953 * may actually run slower on some machines. 7954 * 7955 * Arguments: 7956 * 7957 * Inputs: 7958 * c_rarg0 - int array elements a 7959 * c_rarg1 - int array elements n (the modulus) 7960 * c_rarg2 - int length 7961 * c_rarg3 - int inv 7962 * c_rarg4 - int array elements m (the result) 7963 * 7964 */ 7965 address generate_square() { 7966 Label argh; 7967 bind(argh); 7968 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7969 7970 align(CodeEntryAlignment); 7971 address entry = pc(); 7972 7973 enter(); 7974 7975 // Make room. 7976 cmpw(Rlen, 512); 7977 br(Assembler::HI, argh); 7978 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7979 andr(sp, Ra, -2 * wordSize); 7980 7981 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7982 7983 { 7984 // Copy input args, reversing as we go. We use Ra as a 7985 // temporary variable. 7986 reverse(Ra, Pa_base, Rlen, t0, t1); 7987 reverse(Ra, Pn_base, Rlen, t0, t1); 7988 } 7989 7990 // Push all call-saved registers and also Pm_base which we'll need 7991 // at the end. 
7992 save_regs(); 7993 7994 mov(Pm_base, Ra); 7995 7996 mov(t0, zr); 7997 mov(t1, zr); 7998 mov(t2, zr); 7999 8000 block_comment("for (int i = 0; i < len; i++) {"); 8001 mov(Ri, zr); { 8002 Label loop, end; 8003 bind(loop); 8004 cmp(Ri, Rlen); 8005 br(Assembler::GE, end); 8006 8007 pre1(Ri); 8008 8009 block_comment("for (j = (i+1)/2; j; j--) {"); { 8010 add(Rj, Ri, 1); 8011 lsr(Rj, Rj, 1); 8012 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8013 } block_comment(" } // j"); 8014 8015 last_squaring(Ri); 8016 8017 block_comment(" for (j = i/2; j; j--) {"); { 8018 lsr(Rj, Ri, 1); 8019 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8020 } block_comment(" } // j"); 8021 8022 post1_squaring(); 8023 add(Ri, Ri, 1); 8024 cmp(Ri, Rlen); 8025 br(Assembler::LT, loop); 8026 8027 bind(end); 8028 block_comment("} // i"); 8029 } 8030 8031 block_comment("for (int i = len; i < 2*len; i++) {"); 8032 mov(Ri, Rlen); { 8033 Label loop, end; 8034 bind(loop); 8035 cmp(Ri, Rlen, Assembler::LSL, 1); 8036 br(Assembler::GE, end); 8037 8038 pre2(Ri, Rlen); 8039 8040 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 8041 lsl(Rj, Rlen, 1); 8042 sub(Rj, Rj, Ri); 8043 sub(Rj, Rj, 1); 8044 lsr(Rj, Rj, 1); 8045 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8046 } block_comment(" } // j"); 8047 8048 last_squaring(Ri); 8049 8050 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 8051 lsl(Rj, Rlen, 1); 8052 sub(Rj, Rj, Ri); 8053 lsr(Rj, Rj, 1); 8054 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8055 } block_comment(" } // j"); 8056 8057 post2(Ri, Rlen); 8058 add(Ri, Ri, 1); 8059 cmp(Ri, Rlen, Assembler::LSL, 1); 8060 8061 br(Assembler::LT, loop); 8062 bind(end); 8063 block_comment("} // i"); 8064 } 8065 8066 normalize(Rlen); 8067 8068 mov(Ra, Pm_base); // Save Pm_base in Ra 8069 restore_regs(); // Restore caller's Pm_base 8070 8071 // Copy our result into caller's Pm_base 8072 reverse(Pm_base, Ra, Rlen, t0, t1); 8073 8074 leave(); 8075 ret(lr); 8076 8077 return entry; 8078 } 8079 // In C, approximately: 8080 8081 // void 8082 // montgomery_square(julong Pa_base[], julong Pn_base[], 8083 // julong Pm_base[], julong inv, int len) { 8084 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 8085 // julong *Pa, *Pb, *Pn, *Pm; 8086 // julong Ra, Rb, Rn, Rm; 8087 8088 // int i; 8089 8090 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 8091 8092 // for (i = 0; i < len; i++) { 8093 // int j; 8094 8095 // Pa = Pa_base; 8096 // Pb = Pa_base + i; 8097 // Pm = Pm_base; 8098 // Pn = Pn_base + i; 8099 8100 // Ra = *Pa; 8101 // Rb = *Pb; 8102 // Rm = *Pm; 8103 // Rn = *Pn; 8104 8105 // int iters = (i+1)/2; 8106 // for (j = 0; iters--; j++) { 8107 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8108 // MACC2(Ra, Rb, t0, t1, t2); 8109 // Ra = *++Pa; 8110 // Rb = *--Pb; 8111 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8112 // MACC(Rm, Rn, t0, t1, t2); 8113 // Rm = *++Pm; 8114 // Rn = *--Pn; 8115 // } 8116 // if ((i & 1) == 0) { 8117 // assert(Ra == Pa_base[j], "must be"); 8118 // MACC(Ra, Ra, t0, t1, t2); 8119 // } 8120 // iters = i/2; 8121 // assert(iters == i-j, "must be"); 8122 // for (; iters--; j++) { 8123 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8124 // MACC(Rm, Rn, t0, t1, t2); 8125 // Rm = *++Pm; 8126 // Rn = *--Pn; 8127 // } 8128 8129 // *Pm = Rm = t0 * inv; 8130 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8131 // MACC(Rm, Rn, t0, t1, t2); 8132 8133 // 
assert(t0 == 0, "broken Montgomery multiply"); 8134 8135 // t0 = t1; t1 = t2; t2 = 0; 8136 // } 8137 8138 // for (i = len; i < 2*len; i++) { 8139 // int start = i-len+1; 8140 // int end = start + (len - start)/2; 8141 // int j; 8142 8143 // Pa = Pa_base + i-len; 8144 // Pb = Pa_base + len; 8145 // Pm = Pm_base + i-len; 8146 // Pn = Pn_base + len; 8147 8148 // Ra = *++Pa; 8149 // Rb = *--Pb; 8150 // Rm = *++Pm; 8151 // Rn = *--Pn; 8152 8153 // int iters = (2*len-i-1)/2; 8154 // assert(iters == end-start, "must be"); 8155 // for (j = start; iters--; j++) { 8156 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8157 // MACC2(Ra, Rb, t0, t1, t2); 8158 // Ra = *++Pa; 8159 // Rb = *--Pb; 8160 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8161 // MACC(Rm, Rn, t0, t1, t2); 8162 // Rm = *++Pm; 8163 // Rn = *--Pn; 8164 // } 8165 // if ((i & 1) == 0) { 8166 // assert(Ra == Pa_base[j], "must be"); 8167 // MACC(Ra, Ra, t0, t1, t2); 8168 // } 8169 // iters = (2*len-i)/2; 8170 // assert(iters == len-j, "must be"); 8171 // for (; iters--; j++) { 8172 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8173 // MACC(Rm, Rn, t0, t1, t2); 8174 // Rm = *++Pm; 8175 // Rn = *--Pn; 8176 // } 8177 // Pm_base[i-len] = t0; 8178 // t0 = t1; t1 = t2; t2 = 0; 8179 // } 8180 8181 // while (t0) 8182 // t0 = sub(Pm_base, Pn_base, t0, len); 8183 // } 8184 }; 8185 8186 8187 // Initialization 8188 void generate_initial_stubs() { 8189 // Generate initial stubs and initializes the entry points 8190 8191 // entry points that exist in all platforms Note: This is code 8192 // that could be shared among different platforms - however the 8193 // benefit seems to be smaller than the disadvantage of having a 8194 // much more complicated generator structure. See also comment in 8195 // stubRoutines.hpp. 8196 8197 StubRoutines::_forward_exception_entry = generate_forward_exception(); 8198 8199 StubRoutines::_call_stub_entry = 8200 generate_call_stub(StubRoutines::_call_stub_return_address); 8201 8202 // is referenced by megamorphic call 8203 StubRoutines::_catch_exception_entry = generate_catch_exception(); 8204 8205 // Initialize table for copy memory (arraycopy) check. 
8206 if (UnsafeMemoryAccess::_table == nullptr) { 8207 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory 8208 } 8209 8210 if (UseCRC32Intrinsics) { 8211 // set table address before stub generation which use it 8212 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 8213 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 8214 } 8215 8216 if (UseCRC32CIntrinsics) { 8217 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 8218 } 8219 8220 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 8221 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 8222 } 8223 8224 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 8225 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 8226 } 8227 8228 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) && 8229 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) { 8230 StubRoutines::_hf2f = generate_float16ToFloat(); 8231 StubRoutines::_f2hf = generate_floatToFloat16(); 8232 } 8233 } 8234 8235 void generate_continuation_stubs() { 8236 // Continuation stubs: 8237 StubRoutines::_cont_thaw = generate_cont_thaw(); 8238 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 8239 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 8240 } 8241 8242 void generate_final_stubs() { 8243 // support for verify_oop (must happen after universe_init) 8244 if (VerifyOops) { 8245 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 8246 } 8247 8248 // arraycopy stubs used by compilers 8249 generate_arraycopy_stubs(); 8250 8251 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8252 if (bs_nm != nullptr) { 8253 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 8254 } 8255 8256 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 8257 8258 if (UsePoly1305Intrinsics) { 8259 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 8260 } 8261 8262 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 8263 8264 generate_atomic_entry_points(); 8265 8266 #endif // LINUX 8267 8268 #ifdef COMPILER2 8269 if (UseSecondarySupersTable) { 8270 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub(); 8271 if (! InlineSecondarySupersTest) { 8272 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 8273 StubRoutines::_lookup_secondary_supers_table_stubs[slot] 8274 = generate_lookup_secondary_supers_table_stub(slot); 8275 } 8276 } 8277 } 8278 #endif 8279 8280 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 8281 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 8282 8283 StubRoutines::aarch64::set_completed(); // Inidicate that arraycopy and zero_blocks stubs are generated 8284 } 8285 8286 void generate_compiler_stubs() { 8287 #if COMPILER2_OR_JVMCI 8288 8289 if (UseSVE == 0) { 8290 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices"); 8291 } 8292 8293 // array equals stub for large arrays. 8294 if (!UseSimpleArrayEquals) { 8295 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 8296 } 8297 8298 // byte_array_inflate stub for large arrays. 8299 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 8300 8301 // countPositives stub for large arrays. 
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
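
    // Aside (an explanatory note, not generated code): in terms of the
    // reference implementation quoted in MontgomeryMultiplyGenerator
    // above, a Montgomery square is simply a Montgomery multiply whose
    // two multiplicand arrays are the same, roughly:
    //
    //   // montgomery_square(Pa, ...) == montgomery_multiply with Pb == Pa
    //
    // A dedicated squaring loop only saves work on the symmetric cross
    // terms (the MACC2 calls in the reference code), and, per the
    // comment above, that saving is not a win for the modulus sizes
    // that matter in practice, so generate_multiply() is reused here.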
#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress   = generate_md5_implCompress(false, "md5_implCompress");
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true,  "md5_implCompressMB");
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true,  "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress   = generate_sha3_implCompress(false, "sha3_implCompress");
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true,  "sha3_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }
#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
    switch(kind) {
    case Initial_stubs:
      generate_initial_stubs();
      break;
    case Continuation_stubs:
      generate_continuation_stubs();
      break;
    case Compiler_stubs:
      generate_compiler_stubs();
      break;
    case Final_stubs:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected stubs kind: %d", kind);
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
  StubGenerator g(code, kind);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                          \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl  \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                                   \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl        \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX
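
// For illustration only (this expansion is implied by the macro above, it is
// not additional code): the first instantiation, DEFAULT_ATOMIC_OP(fetch_add, 4, ),
// expands to roughly
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// so every aarch64_atomic_*_impl function pointer starts out referring to the
// hand-written default in atomic_aarch64.S. generate_atomic_entry_points(),
// called from generate_final_stubs() above on Linux builds without
// __ARM_FEATURE_ATOMICS, is then expected to repoint these at generated stubs.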