/*
 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/atomic.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -28,

    fpcr_off           = sp_after_call_off,
    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   = -8,
    result_off         = -7,
    result_type_off    = -6,
    method_off         = -5,
    entry_point_off    = -4,
    parameter_size_off = -2,
    thread_off         = -1,
    fp_f               = 0,
    retaddr_off        = 1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call (rfp, sp_after_call_off * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing the Method*, and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    // restore fpcr
    __ ldr(rscratch1,  fpcr_save);
    __ set_fpcr(rscratch1);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
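    // In C terms the leaf call below behaves roughly like
    //   handler = SharedRuntime::exception_handler_for_return_address(rthread, lr);
    // (a sketch of the contract only; lr itself is clobbered by the
    // call, hence the copy into the callee-saved r19 first).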
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  // Generate indices for iota vector.
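  //
  // Each pair of emit_data64 calls below lays down one 16-byte vector
  // constant holding the lane indices 0, 1, 2, ... for a given element
  // size; e.g. the two S words encode the 32-bit lanes {0, 1, 2, 3}.
  // Vector code loads the constant that matches its element size.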
  address generate_iota_indices(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  // for arraycopy stubs.
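  //
  // Typical use in the copy stubs below (a sketch of the pattern):
  //
  //   ArrayCopyBarrierSetHelper bs(_masm, decorators, type,
  //                                gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  //   bs.copy_load_at_16(t0, t1, Address(s, 0));  // barrier-aware 16-byte load
  //   bs.copy_store_at_16(Address(d, 0), t0, t1); // barrier-aware 16-byte store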
  class ArrayCopyBarrierSetHelper : StackObj {
    BarrierSetAssembler* _bs_asm;
    MacroAssembler* _masm;
    DecoratorSet _decorators;
    BasicType _type;
    Register _gct1;
    Register _gct2;
    Register _gct3;
    FloatRegister _gcvt1;
    FloatRegister _gcvt2;
    FloatRegister _gcvt3;

  public:
    ArrayCopyBarrierSetHelper(MacroAssembler* masm,
                              DecoratorSet decorators,
                              BasicType type,
                              Register gct1,
                              Register gct2,
                              Register gct3,
                              FloatRegister gcvt1,
                              FloatRegister gcvt2,
                              FloatRegister gcvt3)
      : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
        _masm(masm),
        _decorators(decorators),
        _type(type),
        _gct1(gct1),
        _gct2(gct2),
        _gct3(gct3),
        _gcvt1(gcvt1),
        _gcvt2(gcvt2),
        _gcvt3(gcvt3) {
    }

    void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
                            dst1, dst2, src,
                            _gct1, _gct2, _gcvt1);
    }

    void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
    }

    void copy_load_at_16(Register dst1, Register dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
                            dst1, dst2, src,
                            _gct1);
    }

    void copy_store_at_16(Address dst, Register src1, Register src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3);
    }

    void copy_load_at_8(Register dst, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
                            dst, noreg, src,
                            _gct1);
    }

    void copy_store_at_8(Address dst, Register src) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
                             dst, src, noreg,
                             _gct1, _gct2, _gct3);
    }
  };

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
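      //
      // as a concrete check, for a forwards copy (unit == 1 word of
      // 8 bytes) the first iteration below stores t0 at d + 8, the
      // three pairs at d + 16, d + 32 and d + 48, and t7 at d + 64
      // via pre-increment; allowing for the d -= 8 adjustment this is
      // exactly the word-offset pattern {0, 1&2, 3&4, 5&6, 7} above.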
      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
    bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;
  Label copy_obj_f, copy_obj_b;
  Label copy_obj_uninit_f, copy_obj_uninit_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
    const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
    const Register send = r17, dend = r16;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80) / granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
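      // n.b. this split only matters for byte and short copies: jint
      // and jlong array payloads are at least 4-byte aligned, so for
      // granularity >= 4 the guard below is skipped and the three
      // 32-byte pairs are used unconditionally.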
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes.  Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the
          // same byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // cannot be used as a temp register, as it contains the count.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
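      // (is_aligned guarantees 8-byte alignment already, so testing
      // bit 3 of s below suffices: e.g. an s ending in 0x8 copies one
      // word and lands on a 16-byte boundary.)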
      __ tbz(s, exact_log2(wordSize), aligned);
      bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
      bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(r15, s, 2 * wordSize - 1);
      } else {
        __ neg(r15, s);
        __ andr(r15, r15, 2 * wordSize - 1);
      }
      // r15 is the byte adjustment needed to align s.
      __ cbz(r15, aligned);
      int shift = exact_log2(granularity);
      if (shift) __ lsr(r15, r15, shift);
      __ sub(count, count, r15);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, r15);
        __ sub(d, d, r15);
      } else {
        __ add(s, s, r15);
        __ add(d, d, r15);
      }
#else
      copy_memory_small(decorators, type, s, d, r15, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(r15, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards) {
      if (type != T_OBJECT) {
        __ bl(copy_f);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_f);
      } else {
        __ bl(copy_obj_f);
      }
    } else {
      if (type != T_OBJECT) {
        __ bl(copy_b);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_b);
      } else {
        __ bl(copy_obj_b);
      }
    }

    // And the tail.
    copy_memory_small(decorators, type, s, d, count, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    RegSet clobbered
      = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
      __ mov(*it, rscratch1);
    }
#endif

  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
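  // Roughly, in C terms (a sketch, not the generated code):
  //   for (i = 0; i < count; i++)
  //     verify_oop(size == wordSize ? a[i] : decode_heap_oop(narrow_a[i]));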
  void verify_oop_array (int size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, 1);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
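  //
  // n.b. all of the per-type entry points below are thin wrappers
  // which pick an element size and forward to generate_disjoint_copy()
  // or generate_conjoint_copy(); only the oop variants change anything
  // else (is_oop and, under compressed oops, the element size).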
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it. The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_long_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_long_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1, rscratch2.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  // Generate checkcasting array copy stub
  //
  // Input:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //   c_rarg3 - size_t ckoff (super_check_offset)
  //   c_rarg4 - oop ckval (super_klass)
  //
  // Output:
  //   r0 == 0    - success
  //   r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs  = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
    const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register r19_klass   = r19;       // oop._klass

    // Registers used as gc temps (r5, r6, r7 are save-on-call)
    const Register gct1 = r5, gct2 = r6, gct3 = r7;

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.
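    //
    // For example (an illustrative scenario, not taken from the source):
    // copying the contents of an Object[] into an Integer[] must go
    // through this stub, because the static source type guarantees
    // nothing about individual elements; each copied oop is checked
    // against the destination element klass, and the copy stops at the
    // first element that fails the check.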
1896 1897 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1898 copied_oop, r19_klass, count_save); 1899 1900 __ align(CodeEntryAlignment); 1901 StubCodeMark mark(this, "StubRoutines", name); 1902 address start = __ pc(); 1903 1904 __ enter(); // required for proper stackwalking of RuntimeStub frame 1905 1906 #ifdef ASSERT 1907 // caller guarantees that the arrays really are different 1908 // otherwise, we would have to make conjoint checks 1909 { Label L; 1910 __ b(L); // conjoint check not yet implemented 1911 __ stop("checkcast_copy within a single array"); 1912 __ bind(L); 1913 } 1914 #endif //ASSERT 1915 1916 // Caller of this entry point must set up the argument registers. 1917 if (entry != nullptr) { 1918 *entry = __ pc(); 1919 BLOCK_COMMENT("Entry:"); 1920 } 1921 1922 // Empty array: Nothing to do. 1923 __ cbz(count, L_done); 1924 __ push(RegSet::of(r19, r20, r21, r22), sp); 1925 1926 #ifdef ASSERT 1927 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1928 // The ckoff and ckval must be mutually consistent, 1929 // even though caller generates both. 1930 { Label L; 1931 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1932 __ ldrw(start_to, Address(ckval, sco_offset)); 1933 __ cmpw(ckoff, start_to); 1934 __ br(Assembler::EQ, L); 1935 __ stop("super_check_offset inconsistent"); 1936 __ bind(L); 1937 } 1938 #endif //ASSERT 1939 1940 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1941 bool is_oop = true; 1942 int element_size = UseCompressedOops ? 4 : 8; 1943 if (dest_uninitialized) { 1944 decorators |= IS_DEST_UNINITIALIZED; 1945 } 1946 1947 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1948 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1949 1950 // save the original count 1951 __ mov(count_save, count); 1952 1953 // Copy from low to high addresses 1954 __ mov(start_to, to); // Save destination array start address 1955 __ b(L_load_element); 1956 1957 // ======== begin loop ======== 1958 // (Loop is rotated; its entry is L_load_element.) 1959 // Loop control: 1960 // for (; count != 0; count--) { 1961 // copied_oop = load_heap_oop(from++); 1962 // ... generate_type_check ...; 1963 // store_heap_oop(to++, copied_oop); 1964 // } 1965 __ align(OptoLoopAlignment); 1966 1967 __ BIND(L_store_element); 1968 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1969 __ post(to, element_size), copied_oop, noreg, 1970 gct1, gct2, gct3); 1971 __ sub(count, count, 1); 1972 __ cbz(count, L_do_card_marks); 1973 1974 // ======== loop entry is here ======== 1975 __ BIND(L_load_element); 1976 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1977 copied_oop, noreg, __ post(from, element_size), 1978 gct1); 1979 __ cbz(copied_oop, L_store_element); 1980 1981 __ load_klass(r19_klass, copied_oop);// query the object klass 1982 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1983 // ======== end loop ======== 1984 1985 // It was a real error; we must depend on the caller to finish the job. 1986 // Register count = remaining oops, count_orig = total oops. 1987 // Emit GC store barriers for the oops we have copied and report 1988 // their number to the caller. 
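    // In C terms (explanatory note): 'count' still holds the number of
    // oops left uncopied, so K = count_save - count oops were transferred
    // and the stub returns ~K (== -1 ^ K). The caller recovers K as ~r0;
    // r0 == -1 therefore means the very first element failed the check.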

    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);              // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r19, r20, r21, r22), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
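  //
  // A rough C-like sketch of the dispatch below (an illustrative
  // assumption about intent, not generated code):
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if      ((bits & (BytesPerLong - 1)) == 0) goto long_copy;  // all 8-byte aligned
  //   else if ((bits & (BytesPerInt - 1))  == 0) goto int_copy;
  //   else if ((bits & 1) == 0)                  goto short_copy;
  //   else                                       goto byte_copy;
  //
  // with the byte count scaled down to an element count before each
  // tail call (except for the byte copy, where they coincide).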
2062 // 2063 address generate_unsafe_copy(const char *name, 2064 address byte_copy_entry, 2065 address short_copy_entry, 2066 address int_copy_entry, 2067 address long_copy_entry) { 2068 Label L_long_aligned, L_int_aligned, L_short_aligned; 2069 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2070 2071 __ align(CodeEntryAlignment); 2072 StubCodeMark mark(this, "StubRoutines", name); 2073 address start = __ pc(); 2074 __ enter(); // required for proper stackwalking of RuntimeStub frame 2075 2076 // bump this on entry, not on exit: 2077 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2078 2079 __ orr(rscratch1, s, d); 2080 __ orr(rscratch1, rscratch1, count); 2081 2082 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2083 __ cbz(rscratch1, L_long_aligned); 2084 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2085 __ cbz(rscratch1, L_int_aligned); 2086 __ tbz(rscratch1, 0, L_short_aligned); 2087 __ b(RuntimeAddress(byte_copy_entry)); 2088 2089 __ BIND(L_short_aligned); 2090 __ lsr(count, count, LogBytesPerShort); // size => short_count 2091 __ b(RuntimeAddress(short_copy_entry)); 2092 __ BIND(L_int_aligned); 2093 __ lsr(count, count, LogBytesPerInt); // size => int_count 2094 __ b(RuntimeAddress(int_copy_entry)); 2095 __ BIND(L_long_aligned); 2096 __ lsr(count, count, LogBytesPerLong); // size => long_count 2097 __ b(RuntimeAddress(long_copy_entry)); 2098 2099 return start; 2100 } 2101 2102 // 2103 // Generate generic array copy stubs 2104 // 2105 // Input: 2106 // c_rarg0 - src oop 2107 // c_rarg1 - src_pos (32-bits) 2108 // c_rarg2 - dst oop 2109 // c_rarg3 - dst_pos (32-bits) 2110 // c_rarg4 - element count (32-bits) 2111 // 2112 // Output: 2113 // r0 == 0 - success 2114 // r0 == -1^K - failure, where K is partial transfer count 2115 // 2116 address generate_generic_copy(const char *name, 2117 address byte_copy_entry, address short_copy_entry, 2118 address int_copy_entry, address oop_copy_entry, 2119 address long_copy_entry, address checkcast_copy_entry) { 2120 2121 Label L_failed, L_objArray; 2122 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2123 2124 // Input registers 2125 const Register src = c_rarg0; // source array oop 2126 const Register src_pos = c_rarg1; // source position 2127 const Register dst = c_rarg2; // destination array oop 2128 const Register dst_pos = c_rarg3; // destination position 2129 const Register length = c_rarg4; 2130 2131 2132 // Registers used as temps 2133 const Register dst_klass = c_rarg5; 2134 2135 __ align(CodeEntryAlignment); 2136 2137 StubCodeMark mark(this, "StubRoutines", name); 2138 2139 address start = __ pc(); 2140 2141 __ enter(); // required for proper stackwalking of RuntimeStub frame 2142 2143 // bump this on entry, not on exit: 2144 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2145 2146 //----------------------------------------------------------------------- 2147 // Assembler stub will be used for this call to arraycopy 2148 // if the following conditions are met: 2149 // 2150 // (1) src and dst must not be null. 2151 // (2) src_pos must not be negative. 2152 // (3) dst_pos must not be negative. 2153 // (4) length must not be negative. 2154 // (5) src klass and dst klass should be the same and not null. 2155 // (6) src and dst should be arrays. 2156 // (7) src_pos + length must not exceed length of src. 2157 // (8) dst_pos + length must not exceed length of dst. 
2158 // 2159 2160 // if (src == nullptr) return -1; 2161 __ cbz(src, L_failed); 2162 2163 // if (src_pos < 0) return -1; 2164 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2165 2166 // if (dst == nullptr) return -1; 2167 __ cbz(dst, L_failed); 2168 2169 // if (dst_pos < 0) return -1; 2170 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2171 2172 // registers used as temp 2173 const Register scratch_length = r16; // elements count to copy 2174 const Register scratch_src_klass = r17; // array klass 2175 const Register lh = r15; // layout helper 2176 2177 // if (length < 0) return -1; 2178 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2179 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2180 2181 __ load_klass(scratch_src_klass, src); 2182 #ifdef ASSERT 2183 // assert(src->klass() != nullptr); 2184 { 2185 BLOCK_COMMENT("assert klasses not null {"); 2186 Label L1, L2; 2187 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2188 __ bind(L1); 2189 __ stop("broken null klass"); 2190 __ bind(L2); 2191 __ load_klass(rscratch1, dst); 2192 __ cbz(rscratch1, L1); // this would be broken also 2193 BLOCK_COMMENT("} assert klasses not null done"); 2194 } 2195 #endif 2196 2197 // Load layout helper (32-bits) 2198 // 2199 // |array_tag| | header_size | element_type | |log2_element_size| 2200 // 32 30 24 16 8 2 0 2201 // 2202 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2203 // 2204 2205 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2206 2207 // Handle objArrays completely differently... 2208 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2209 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2210 __ movw(rscratch1, objArray_lh); 2211 __ eorw(rscratch2, lh, rscratch1); 2212 __ cbzw(rscratch2, L_objArray); 2213 2214 // if (src->klass() != dst->klass()) return -1; 2215 __ load_klass(rscratch2, dst); 2216 __ eor(rscratch2, rscratch2, scratch_src_klass); 2217 __ cbnz(rscratch2, L_failed); 2218 2219 // if (!src->is_Array()) return -1; 2220 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2221 2222 // At this point, it is known to be a typeArray (array_tag 0x3). 
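    // In C terms (explanatory note): the layout helper of any array klass
    // is negative, so the tbz test above reads
    //
    //   if (lh >= 0) return -1;   // not an array
    //
    // and, with objArray layouts already dispatched to L_objArray, only
    // primitive (typeArray) layouts can reach this point.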
2223 #ifdef ASSERT 2224 { 2225 BLOCK_COMMENT("assert primitive array {"); 2226 Label L; 2227 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2228 __ cmpw(lh, rscratch2); 2229 __ br(Assembler::GE, L); 2230 __ stop("must be a primitive array"); 2231 __ bind(L); 2232 BLOCK_COMMENT("} assert primitive array done"); 2233 } 2234 #endif 2235 2236 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2237 rscratch2, L_failed); 2238 2239 // TypeArrayKlass 2240 // 2241 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2242 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2243 // 2244 2245 const Register rscratch1_offset = rscratch1; // array offset 2246 const Register r15_elsize = lh; // element size 2247 2248 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2249 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2250 __ add(src, src, rscratch1_offset); // src array offset 2251 __ add(dst, dst, rscratch1_offset); // dst array offset 2252 BLOCK_COMMENT("choose copy loop based on element size"); 2253 2254 // next registers should be set before the jump to corresponding stub 2255 const Register from = c_rarg0; // source array address 2256 const Register to = c_rarg1; // destination array address 2257 const Register count = c_rarg2; // elements count 2258 2259 // 'from', 'to', 'count' registers should be set in such order 2260 // since they are the same as 'src', 'src_pos', 'dst'. 2261 2262 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2263 2264 // The possible values of elsize are 0-3, i.e. exact_log2(element 2265 // size in bytes). We do a simple bitwise binary search. 2266 __ BIND(L_copy_bytes); 2267 __ tbnz(r15_elsize, 1, L_copy_ints); 2268 __ tbnz(r15_elsize, 0, L_copy_shorts); 2269 __ lea(from, Address(src, src_pos));// src_addr 2270 __ lea(to, Address(dst, dst_pos));// dst_addr 2271 __ movw(count, scratch_length); // length 2272 __ b(RuntimeAddress(byte_copy_entry)); 2273 2274 __ BIND(L_copy_shorts); 2275 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2276 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2277 __ movw(count, scratch_length); // length 2278 __ b(RuntimeAddress(short_copy_entry)); 2279 2280 __ BIND(L_copy_ints); 2281 __ tbnz(r15_elsize, 0, L_copy_longs); 2282 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2283 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2284 __ movw(count, scratch_length); // length 2285 __ b(RuntimeAddress(int_copy_entry)); 2286 2287 __ BIND(L_copy_longs); 2288 #ifdef ASSERT 2289 { 2290 BLOCK_COMMENT("assert long copy {"); 2291 Label L; 2292 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2293 __ cmpw(r15_elsize, LogBytesPerLong); 2294 __ br(Assembler::EQ, L); 2295 __ stop("must be long copy, but elsize is wrong"); 2296 __ bind(L); 2297 BLOCK_COMMENT("} assert long copy done"); 2298 } 2299 #endif 2300 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2301 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2302 __ movw(count, scratch_length); // length 2303 __ b(RuntimeAddress(long_copy_entry)); 2304 2305 // ObjArrayKlass 2306 __ BIND(L_objArray); 2307 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2308 2309 Label L_plain_copy, L_checkcast_copy; 2310 // test array classes for subtyping 2311 __ load_klass(r15, dst); 2312 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2313 __ br(Assembler::NE, L_checkcast_copy); 2314 2315 // Identically typed arrays can be copied without element-wise checks. 2316 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2317 rscratch2, L_failed); 2318 2319 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2320 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2321 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2322 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2323 __ movw(count, scratch_length); // length 2324 __ BIND(L_plain_copy); 2325 __ b(RuntimeAddress(oop_copy_entry)); 2326 2327 __ BIND(L_checkcast_copy); 2328 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2329 { 2330 // Before looking at dst.length, make sure dst is also an objArray. 2331 __ ldrw(rscratch1, Address(r15, lh_offset)); 2332 __ movw(rscratch2, objArray_lh); 2333 __ eorw(rscratch1, rscratch1, rscratch2); 2334 __ cbnzw(rscratch1, L_failed); 2335 2336 // It is safe to examine both src.length and dst.length. 2337 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2338 r15, L_failed); 2339 2340 __ load_klass(dst_klass, dst); // reload 2341 2342 // Marshal the base address arguments now, freeing registers. 2343 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2344 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2345 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2346 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2347 __ movw(count, length); // length (reloaded) 2348 Register sco_temp = c_rarg3; // this register is free now 2349 assert_different_registers(from, to, count, sco_temp, 2350 dst_klass, scratch_src_klass); 2351 // assert_clean_int(count, sco_temp); 2352 2353 // Generate the type check. 2354 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2355 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2356 2357 // Smashes rscratch1, rscratch2 2358 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2359 2360 // Fetch destination element klass from the ObjArrayKlass header. 2361 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2362 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2363 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2364 2365 // the checkcast_copy loop needs two extra arguments: 2366 assert(c_rarg3 == sco_temp, "#3 already in place"); 2367 // Set up arguments for checkcast_copy_entry. 2368 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2369 __ b(RuntimeAddress(checkcast_copy_entry)); 2370 } 2371 2372 __ BIND(L_failed); 2373 __ mov(r0, -1); 2374 __ leave(); // required for proper stackwalking of RuntimeStub frame 2375 __ ret(lr); 2376 2377 return start; 2378 } 2379 2380 // 2381 // Generate stub for array fill. If "aligned" is true, the 2382 // "to" address is assumed to be heapword aligned. 
  //
  // Arguments for generated stub:
  //   to:    c_rarg0
  //   value: c_rarg1
  //   count: c_rarg2 treated as signed
  //
  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    const Register to        = c_rarg0;  // destination array address
    const Register value     = c_rarg1;  // fill value
    const Register count     = c_rarg2;  // elements count

    const Register bz_base   = r10;      // base for block_zero routine
    const Register cnt_words = r11;      // temp register

    __ enter();

    Label L_fill_elements, L_exit1;

    int shift = -1;
    switch (t) {
      case T_BYTE:
        shift = 0;
        __ cmpw(count, 8 >> shift);   // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_SHORT:
        shift = 1;
        __ cmpw(count, 8 >> shift);   // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_INT:
        shift = 2;
        __ cmpw(count, 8 >> shift);   // Short arrays (< 8 bytes) fill by element
        __ br(Assembler::LO, L_fill_elements);
        break;
      default: ShouldNotReachHere();
    }

    // Align destination address at 8-byte address boundary.
    Label L_skip_align1, L_skip_align2, L_skip_align4;
    if (!aligned) {
      switch (t) {
        case T_BYTE:
          // One byte misalignment happens only for byte arrays.
          __ tbz(to, 0, L_skip_align1);
          __ strb(value, Address(__ post(to, 1)));
          __ subw(count, count, 1);
          __ bind(L_skip_align1);
          // Fallthrough
        case T_SHORT:
          // Two bytes misalignment happens only for byte and short (char) arrays.
          __ tbz(to, 1, L_skip_align2);
          __ strh(value, Address(__ post(to, 2)));
          __ subw(count, count, 2 >> shift);
          __ bind(L_skip_align2);
          // Fallthrough
        case T_INT:
          // Align to 8 bytes, we know we are 4 byte aligned to start.
          __ tbz(to, 2, L_skip_align4);
          __ strw(value, Address(__ post(to, 4)));
          __ subw(count, count, 4 >> shift);
          __ bind(L_skip_align4);
          break;
        default: ShouldNotReachHere();
      }
    }

    //
    //  Fill large chunks
    //
    __ lsrw(cnt_words, count, 3 - shift); // number of words
    __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
    __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
    if (UseBlockZeroing) {
      Label non_block_zeroing, rest;
      // If the fill value is zero we can use the fast zero_words().
      __ cbnz(value, non_block_zeroing);
      __ mov(bz_base, to);
      __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
      address tpc = __ zero_words(bz_base, cnt_words);
      if (tpc == nullptr) {
        fatal("CodeCache is full at generate_fill");
      }
      __ b(rest);
      __ bind(non_block_zeroing);
      __ fill_words(to, cnt_words, value);
      __ bind(rest);
    } else {
      __ fill_words(to, cnt_words, value);
    }

    // Remaining count is less than 8 bytes. Fill it by a single store.
    // Note that the total length is no less than 8 bytes.
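    // An illustrative C sketch of the overlapping tail store below (an
    // assumption about intent, not generated code). Since 'value' has
    // been replicated across all 8 bytes, rewriting bytes that
    // fill_words already filled is harmless:
    //
    //   if (count != 0) {                       // 1..7 element-bytes remain
    //     u_char *end = to + (count << shift);  // one past the last element
    //     *(uint64_t *)(end - 8) = value;       // may rewrite filled bytes
    //   }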
2485 if (t == T_BYTE || t == T_SHORT) { 2486 Label L_exit1; 2487 __ cbzw(count, L_exit1); 2488 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2489 __ str(value, Address(to, -8)); // overwrite some elements 2490 __ bind(L_exit1); 2491 __ leave(); 2492 __ ret(lr); 2493 } 2494 2495 // Handle copies less than 8 bytes. 2496 Label L_fill_2, L_fill_4, L_exit2; 2497 __ bind(L_fill_elements); 2498 switch (t) { 2499 case T_BYTE: 2500 __ tbz(count, 0, L_fill_2); 2501 __ strb(value, Address(__ post(to, 1))); 2502 __ bind(L_fill_2); 2503 __ tbz(count, 1, L_fill_4); 2504 __ strh(value, Address(__ post(to, 2))); 2505 __ bind(L_fill_4); 2506 __ tbz(count, 2, L_exit2); 2507 __ strw(value, Address(to)); 2508 break; 2509 case T_SHORT: 2510 __ tbz(count, 0, L_fill_4); 2511 __ strh(value, Address(__ post(to, 2))); 2512 __ bind(L_fill_4); 2513 __ tbz(count, 1, L_exit2); 2514 __ strw(value, Address(to)); 2515 break; 2516 case T_INT: 2517 __ cbzw(count, L_exit2); 2518 __ strw(value, Address(to)); 2519 break; 2520 default: ShouldNotReachHere(); 2521 } 2522 __ bind(L_exit2); 2523 __ leave(); 2524 __ ret(lr); 2525 return start; 2526 } 2527 2528 address generate_data_cache_writeback() { 2529 const Register line = c_rarg0; // address of line to write back 2530 2531 __ align(CodeEntryAlignment); 2532 2533 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2534 2535 address start = __ pc(); 2536 __ enter(); 2537 __ cache_wb(Address(line, 0)); 2538 __ leave(); 2539 __ ret(lr); 2540 2541 return start; 2542 } 2543 2544 address generate_data_cache_writeback_sync() { 2545 const Register is_pre = c_rarg0; // pre or post sync 2546 2547 __ align(CodeEntryAlignment); 2548 2549 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2550 2551 // pre wbsync is a no-op 2552 // post wbsync translates to an sfence 2553 2554 Label skip; 2555 address start = __ pc(); 2556 __ enter(); 2557 __ cbnz(is_pre, skip); 2558 __ cache_wbsync(false); 2559 __ bind(skip); 2560 __ leave(); 2561 __ ret(lr); 2562 2563 return start; 2564 } 2565 2566 void generate_arraycopy_stubs() { 2567 address entry; 2568 address entry_jbyte_arraycopy; 2569 address entry_jshort_arraycopy; 2570 address entry_jint_arraycopy; 2571 address entry_oop_arraycopy; 2572 address entry_jlong_arraycopy; 2573 address entry_checkcast_arraycopy; 2574 2575 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards); 2576 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards); 2577 2578 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards); 2579 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards); 2580 2581 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards); 2582 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards); 2583 2584 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2585 2586 //*** jbyte 2587 // Always need aligned and unaligned versions 2588 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2589 "jbyte_disjoint_arraycopy"); 2590 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2591 &entry_jbyte_arraycopy, 2592 "jbyte_arraycopy"); 2593 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2594 "arrayof_jbyte_disjoint_arraycopy"); 2595 
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2596 "arrayof_jbyte_arraycopy"); 2597 2598 //*** jshort 2599 // Always need aligned and unaligned versions 2600 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2601 "jshort_disjoint_arraycopy"); 2602 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2603 &entry_jshort_arraycopy, 2604 "jshort_arraycopy"); 2605 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2606 "arrayof_jshort_disjoint_arraycopy"); 2607 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2608 "arrayof_jshort_arraycopy"); 2609 2610 //*** jint 2611 // Aligned versions 2612 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2613 "arrayof_jint_disjoint_arraycopy"); 2614 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2615 "arrayof_jint_arraycopy"); 2616 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2617 // entry_jint_arraycopy always points to the unaligned version 2618 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2619 "jint_disjoint_arraycopy"); 2620 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2621 &entry_jint_arraycopy, 2622 "jint_arraycopy"); 2623 2624 //*** jlong 2625 // It is always aligned 2626 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2627 "arrayof_jlong_disjoint_arraycopy"); 2628 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2629 "arrayof_jlong_arraycopy"); 2630 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2631 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2632 2633 //*** oops 2634 { 2635 // With compressed oops we need unaligned versions; notice that 2636 // we overwrite entry_oop_arraycopy. 
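      // (With compressed oops each element is a 4-byte narrowOop, so a
      // HeapWord-aligned array body gives no 8-byte guarantee per
      // element; hence aligned is false in that case.)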
2637 bool aligned = !UseCompressedOops; 2638 2639 StubRoutines::_arrayof_oop_disjoint_arraycopy 2640 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2641 /*dest_uninitialized*/false); 2642 StubRoutines::_arrayof_oop_arraycopy 2643 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2644 /*dest_uninitialized*/false); 2645 // Aligned versions without pre-barriers 2646 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2647 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2648 /*dest_uninitialized*/true); 2649 StubRoutines::_arrayof_oop_arraycopy_uninit 2650 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2651 /*dest_uninitialized*/true); 2652 } 2653 2654 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2655 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2656 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2657 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2658 2659 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2660 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2661 /*dest_uninitialized*/true); 2662 2663 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2664 entry_jbyte_arraycopy, 2665 entry_jshort_arraycopy, 2666 entry_jint_arraycopy, 2667 entry_jlong_arraycopy); 2668 2669 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2670 entry_jbyte_arraycopy, 2671 entry_jshort_arraycopy, 2672 entry_jint_arraycopy, 2673 entry_oop_arraycopy, 2674 entry_jlong_arraycopy, 2675 entry_checkcast_arraycopy); 2676 2677 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2678 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2679 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2680 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2681 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2682 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2683 } 2684 2685 void generate_math_stubs() { Unimplemented(); } 2686 2687 // Arguments: 2688 // 2689 // Inputs: 2690 // c_rarg0 - source byte array address 2691 // c_rarg1 - destination byte array address 2692 // c_rarg2 - K (key) in little endian int array 2693 // 2694 address generate_aescrypt_encryptBlock() { 2695 __ align(CodeEntryAlignment); 2696 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2697 2698 const Register from = c_rarg0; // source array address 2699 const Register to = c_rarg1; // destination array address 2700 const Register key = c_rarg2; // key array address 2701 const Register keylen = rscratch1; 2702 2703 address start = __ pc(); 2704 __ enter(); 2705 2706 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2707 2708 __ aesenc_loadkeys(key, keylen); 2709 __ aesecb_encrypt(from, to, keylen); 2710 2711 __ mov(r0, 0); 2712 2713 __ leave(); 2714 __ ret(lr); 2715 2716 return start; 2717 } 2718 2719 // Arguments: 2720 // 2721 // Inputs: 2722 // c_rarg0 - source byte array address 2723 // c_rarg1 - destination byte array address 2724 // 
c_rarg2 - K (key) in little endian int array 2725 // 2726 address generate_aescrypt_decryptBlock() { 2727 assert(UseAES, "need AES cryptographic extension support"); 2728 __ align(CodeEntryAlignment); 2729 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2730 Label L_doLast; 2731 2732 const Register from = c_rarg0; // source array address 2733 const Register to = c_rarg1; // destination array address 2734 const Register key = c_rarg2; // key array address 2735 const Register keylen = rscratch1; 2736 2737 address start = __ pc(); 2738 __ enter(); // required for proper stackwalking of RuntimeStub frame 2739 2740 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2741 2742 __ aesecb_decrypt(from, to, key, keylen); 2743 2744 __ mov(r0, 0); 2745 2746 __ leave(); 2747 __ ret(lr); 2748 2749 return start; 2750 } 2751 2752 // Arguments: 2753 // 2754 // Inputs: 2755 // c_rarg0 - source byte array address 2756 // c_rarg1 - destination byte array address 2757 // c_rarg2 - K (key) in little endian int array 2758 // c_rarg3 - r vector byte array address 2759 // c_rarg4 - input length 2760 // 2761 // Output: 2762 // x0 - input length 2763 // 2764 address generate_cipherBlockChaining_encryptAESCrypt() { 2765 assert(UseAES, "need AES cryptographic extension support"); 2766 __ align(CodeEntryAlignment); 2767 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2768 2769 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2770 2771 const Register from = c_rarg0; // source array address 2772 const Register to = c_rarg1; // destination array address 2773 const Register key = c_rarg2; // key array address 2774 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2775 // and left with the results of the last encryption block 2776 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2777 const Register keylen = rscratch1; 2778 2779 address start = __ pc(); 2780 2781 __ enter(); 2782 2783 __ movw(rscratch2, len_reg); 2784 2785 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2786 2787 __ ld1(v0, __ T16B, rvec); 2788 2789 __ cmpw(keylen, 52); 2790 __ br(Assembler::CC, L_loadkeys_44); 2791 __ br(Assembler::EQ, L_loadkeys_52); 2792 2793 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2794 __ rev32(v17, __ T16B, v17); 2795 __ rev32(v18, __ T16B, v18); 2796 __ BIND(L_loadkeys_52); 2797 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2798 __ rev32(v19, __ T16B, v19); 2799 __ rev32(v20, __ T16B, v20); 2800 __ BIND(L_loadkeys_44); 2801 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2802 __ rev32(v21, __ T16B, v21); 2803 __ rev32(v22, __ T16B, v22); 2804 __ rev32(v23, __ T16B, v23); 2805 __ rev32(v24, __ T16B, v24); 2806 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2807 __ rev32(v25, __ T16B, v25); 2808 __ rev32(v26, __ T16B, v26); 2809 __ rev32(v27, __ T16B, v27); 2810 __ rev32(v28, __ T16B, v28); 2811 __ ld1(v29, v30, v31, __ T16B, key); 2812 __ rev32(v29, __ T16B, v29); 2813 __ rev32(v30, __ T16B, v30); 2814 __ rev32(v31, __ T16B, v31); 2815 2816 __ BIND(L_aes_loop); 2817 __ ld1(v1, __ T16B, __ post(from, 16)); 2818 __ eor(v0, __ T16B, v0, v1); 2819 2820 __ br(Assembler::CC, L_rounds_44); 2821 __ br(Assembler::EQ, L_rounds_52); 2822 2823 __ aese(v0, v17); __ aesmc(v0, v0); 2824 __ aese(v0, v18); __ aesmc(v0, v0); 2825 __ BIND(L_rounds_52); 2826 __ 
aese(v0, v19); __ aesmc(v0, v0); 2827 __ aese(v0, v20); __ aesmc(v0, v0); 2828 __ BIND(L_rounds_44); 2829 __ aese(v0, v21); __ aesmc(v0, v0); 2830 __ aese(v0, v22); __ aesmc(v0, v0); 2831 __ aese(v0, v23); __ aesmc(v0, v0); 2832 __ aese(v0, v24); __ aesmc(v0, v0); 2833 __ aese(v0, v25); __ aesmc(v0, v0); 2834 __ aese(v0, v26); __ aesmc(v0, v0); 2835 __ aese(v0, v27); __ aesmc(v0, v0); 2836 __ aese(v0, v28); __ aesmc(v0, v0); 2837 __ aese(v0, v29); __ aesmc(v0, v0); 2838 __ aese(v0, v30); 2839 __ eor(v0, __ T16B, v0, v31); 2840 2841 __ st1(v0, __ T16B, __ post(to, 16)); 2842 2843 __ subw(len_reg, len_reg, 16); 2844 __ cbnzw(len_reg, L_aes_loop); 2845 2846 __ st1(v0, __ T16B, rvec); 2847 2848 __ mov(r0, rscratch2); 2849 2850 __ leave(); 2851 __ ret(lr); 2852 2853 return start; 2854 } 2855 2856 // Arguments: 2857 // 2858 // Inputs: 2859 // c_rarg0 - source byte array address 2860 // c_rarg1 - destination byte array address 2861 // c_rarg2 - K (key) in little endian int array 2862 // c_rarg3 - r vector byte array address 2863 // c_rarg4 - input length 2864 // 2865 // Output: 2866 // r0 - input length 2867 // 2868 address generate_cipherBlockChaining_decryptAESCrypt() { 2869 assert(UseAES, "need AES cryptographic extension support"); 2870 __ align(CodeEntryAlignment); 2871 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2872 2873 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2874 2875 const Register from = c_rarg0; // source array address 2876 const Register to = c_rarg1; // destination array address 2877 const Register key = c_rarg2; // key array address 2878 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2879 // and left with the results of the last encryption block 2880 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2881 const Register keylen = rscratch1; 2882 2883 address start = __ pc(); 2884 2885 __ enter(); 2886 2887 __ movw(rscratch2, len_reg); 2888 2889 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2890 2891 __ ld1(v2, __ T16B, rvec); 2892 2893 __ ld1(v31, __ T16B, __ post(key, 16)); 2894 __ rev32(v31, __ T16B, v31); 2895 2896 __ cmpw(keylen, 52); 2897 __ br(Assembler::CC, L_loadkeys_44); 2898 __ br(Assembler::EQ, L_loadkeys_52); 2899 2900 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2901 __ rev32(v17, __ T16B, v17); 2902 __ rev32(v18, __ T16B, v18); 2903 __ BIND(L_loadkeys_52); 2904 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2905 __ rev32(v19, __ T16B, v19); 2906 __ rev32(v20, __ T16B, v20); 2907 __ BIND(L_loadkeys_44); 2908 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2909 __ rev32(v21, __ T16B, v21); 2910 __ rev32(v22, __ T16B, v22); 2911 __ rev32(v23, __ T16B, v23); 2912 __ rev32(v24, __ T16B, v24); 2913 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2914 __ rev32(v25, __ T16B, v25); 2915 __ rev32(v26, __ T16B, v26); 2916 __ rev32(v27, __ T16B, v27); 2917 __ rev32(v28, __ T16B, v28); 2918 __ ld1(v29, v30, __ T16B, key); 2919 __ rev32(v29, __ T16B, v29); 2920 __ rev32(v30, __ T16B, v30); 2921 2922 __ BIND(L_aes_loop); 2923 __ ld1(v0, __ T16B, __ post(from, 16)); 2924 __ orr(v1, __ T16B, v0, v0); 2925 2926 __ br(Assembler::CC, L_rounds_44); 2927 __ br(Assembler::EQ, L_rounds_52); 2928 2929 __ aesd(v0, v17); __ aesimc(v0, v0); 2930 __ aesd(v0, v18); __ aesimc(v0, v0); 2931 __ BIND(L_rounds_52); 2932 __ aesd(v0, v19); __ aesimc(v0, v0); 2933 __ aesd(v0, v20); __ 
aesimc(v0, v0); 2934 __ BIND(L_rounds_44); 2935 __ aesd(v0, v21); __ aesimc(v0, v0); 2936 __ aesd(v0, v22); __ aesimc(v0, v0); 2937 __ aesd(v0, v23); __ aesimc(v0, v0); 2938 __ aesd(v0, v24); __ aesimc(v0, v0); 2939 __ aesd(v0, v25); __ aesimc(v0, v0); 2940 __ aesd(v0, v26); __ aesimc(v0, v0); 2941 __ aesd(v0, v27); __ aesimc(v0, v0); 2942 __ aesd(v0, v28); __ aesimc(v0, v0); 2943 __ aesd(v0, v29); __ aesimc(v0, v0); 2944 __ aesd(v0, v30); 2945 __ eor(v0, __ T16B, v0, v31); 2946 __ eor(v0, __ T16B, v0, v2); 2947 2948 __ st1(v0, __ T16B, __ post(to, 16)); 2949 __ orr(v2, __ T16B, v1, v1); 2950 2951 __ subw(len_reg, len_reg, 16); 2952 __ cbnzw(len_reg, L_aes_loop); 2953 2954 __ st1(v2, __ T16B, rvec); 2955 2956 __ mov(r0, rscratch2); 2957 2958 __ leave(); 2959 __ ret(lr); 2960 2961 return start; 2962 } 2963 2964 // Big-endian 128-bit + 64-bit -> 128-bit addition. 2965 // Inputs: 128-bits. in is preserved. 2966 // The least-significant 64-bit word is in the upper dword of each vector. 2967 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 2968 // Output: result 2969 void be_add_128_64(FloatRegister result, FloatRegister in, 2970 FloatRegister inc, FloatRegister tmp) { 2971 assert_different_registers(result, tmp, inc); 2972 2973 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 2974 // input 2975 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 2976 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 2977 // MSD == 0 (must be!) to LSD 2978 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 2979 } 2980 2981 // CTR AES crypt. 2982 // Arguments: 2983 // 2984 // Inputs: 2985 // c_rarg0 - source byte array address 2986 // c_rarg1 - destination byte array address 2987 // c_rarg2 - K (key) in little endian int array 2988 // c_rarg3 - counter vector byte array address 2989 // c_rarg4 - input length 2990 // c_rarg5 - saved encryptedCounter start 2991 // c_rarg6 - saved used length 2992 // 2993 // Output: 2994 // r0 - input length 2995 // 2996 address generate_counterMode_AESCrypt() { 2997 const Register in = c_rarg0; 2998 const Register out = c_rarg1; 2999 const Register key = c_rarg2; 3000 const Register counter = c_rarg3; 3001 const Register saved_len = c_rarg4, len = r10; 3002 const Register saved_encrypted_ctr = c_rarg5; 3003 const Register used_ptr = c_rarg6, used = r12; 3004 3005 const Register offset = r7; 3006 const Register keylen = r11; 3007 3008 const unsigned char block_size = 16; 3009 const int bulk_width = 4; 3010 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3011 // performance with larger data sizes, but it also means that the 3012 // fast path isn't used until you have at least 8 blocks, and up 3013 // to 127 bytes of data will be executed on the slow path. For 3014 // that reason, and also so as not to blow away too much icache, 4 3015 // blocks seems like a sensible compromise. 
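    // (Concretely, with bulk_width == 4 the wide path below engages once
    // len >= 4 * 16 == 64 bytes, and at most 63 trailing bytes are
    // handled one block or one byte at a time.)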
3016 3017 // Algorithm: 3018 // 3019 // if (len == 0) { 3020 // goto DONE; 3021 // } 3022 // int result = len; 3023 // do { 3024 // if (used >= blockSize) { 3025 // if (len >= bulk_width * blockSize) { 3026 // CTR_large_block(); 3027 // if (len == 0) 3028 // goto DONE; 3029 // } 3030 // for (;;) { 3031 // 16ByteVector v0 = counter; 3032 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3033 // used = 0; 3034 // if (len < blockSize) 3035 // break; /* goto NEXT */ 3036 // 16ByteVector v1 = load16Bytes(in, offset); 3037 // v1 = v1 ^ encryptedCounter; 3038 // store16Bytes(out, offset); 3039 // used = blockSize; 3040 // offset += blockSize; 3041 // len -= blockSize; 3042 // if (len == 0) 3043 // goto DONE; 3044 // } 3045 // } 3046 // NEXT: 3047 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3048 // len--; 3049 // } while (len != 0); 3050 // DONE: 3051 // return result; 3052 // 3053 // CTR_large_block() 3054 // Wide bulk encryption of whole blocks. 3055 3056 __ align(CodeEntryAlignment); 3057 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 3058 const address start = __ pc(); 3059 __ enter(); 3060 3061 Label DONE, CTR_large_block, large_block_return; 3062 __ ldrw(used, Address(used_ptr)); 3063 __ cbzw(saved_len, DONE); 3064 3065 __ mov(len, saved_len); 3066 __ mov(offset, 0); 3067 3068 // Compute #rounds for AES based on the length of the key array 3069 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3070 3071 __ aesenc_loadkeys(key, keylen); 3072 3073 { 3074 Label L_CTR_loop, NEXT; 3075 3076 __ bind(L_CTR_loop); 3077 3078 __ cmp(used, block_size); 3079 __ br(__ LO, NEXT); 3080 3081 // Maybe we have a lot of data 3082 __ subsw(rscratch1, len, bulk_width * block_size); 3083 __ br(__ HS, CTR_large_block); 3084 __ BIND(large_block_return); 3085 __ cbzw(len, DONE); 3086 3087 // Setup the counter 3088 __ movi(v4, __ T4S, 0); 3089 __ movi(v5, __ T4S, 1); 3090 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3091 3092 // 128-bit big-endian increment 3093 __ ld1(v0, __ T16B, counter); 3094 __ rev64(v16, __ T16B, v0); 3095 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3096 __ rev64(v16, __ T16B, v16); 3097 __ st1(v16, __ T16B, counter); 3098 // Previous counter value is in v0 3099 // v4 contains { 0, 1 } 3100 3101 { 3102 // We have fewer than bulk_width blocks of data left. Encrypt 3103 // them one by one until there is less than a full block 3104 // remaining, being careful to save both the encrypted counter 3105 // and the counter. 3106 3107 Label inner_loop; 3108 __ bind(inner_loop); 3109 // Counter to encrypt is in v0 3110 __ aesecb_encrypt(noreg, noreg, keylen); 3111 __ st1(v0, __ T16B, saved_encrypted_ctr); 3112 3113 // Do we have a remaining full block? 3114 3115 __ mov(used, 0); 3116 __ cmp(len, block_size); 3117 __ br(__ LO, NEXT); 3118 3119 // Yes, we have a full block 3120 __ ldrq(v1, Address(in, offset)); 3121 __ eor(v1, __ T16B, v1, v0); 3122 __ strq(v1, Address(out, offset)); 3123 __ mov(used, block_size); 3124 __ add(offset, offset, block_size); 3125 3126 __ subw(len, len, block_size); 3127 __ cbzw(len, DONE); 3128 3129 // Increment the counter, store it back 3130 __ orr(v0, __ T16B, v16, v16); 3131 __ rev64(v16, __ T16B, v16); 3132 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3133 __ rev64(v16, __ T16B, v16); 3134 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3135 3136 __ b(inner_loop); 3137 } 3138 3139 __ BIND(NEXT); 3140 3141 // Encrypt a single byte, and loop. 
3142 // We expect this to be a rare event. 3143 __ ldrb(rscratch1, Address(in, offset)); 3144 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3145 __ eor(rscratch1, rscratch1, rscratch2); 3146 __ strb(rscratch1, Address(out, offset)); 3147 __ add(offset, offset, 1); 3148 __ add(used, used, 1); 3149 __ subw(len, len,1); 3150 __ cbnzw(len, L_CTR_loop); 3151 } 3152 3153 __ bind(DONE); 3154 __ strw(used, Address(used_ptr)); 3155 __ mov(r0, saved_len); 3156 3157 __ leave(); // required for proper stackwalking of RuntimeStub frame 3158 __ ret(lr); 3159 3160 // Bulk encryption 3161 3162 __ BIND (CTR_large_block); 3163 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3164 3165 if (bulk_width == 8) { 3166 __ sub(sp, sp, 4 * 16); 3167 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3168 } 3169 __ sub(sp, sp, 4 * 16); 3170 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3171 RegSet saved_regs = (RegSet::of(in, out, offset) 3172 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3173 __ push(saved_regs, sp); 3174 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3175 __ add(in, in, offset); 3176 __ add(out, out, offset); 3177 3178 // Keys should already be loaded into the correct registers 3179 3180 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3181 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3182 3183 // AES/CTR loop 3184 { 3185 Label L_CTR_loop; 3186 __ BIND(L_CTR_loop); 3187 3188 // Setup the counters 3189 __ movi(v8, __ T4S, 0); 3190 __ movi(v9, __ T4S, 1); 3191 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3192 3193 for (int i = 0; i < bulk_width; i++) { 3194 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3195 __ rev64(v0_ofs, __ T16B, v16); 3196 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3197 } 3198 3199 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3200 3201 // Encrypt the counters 3202 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3203 3204 if (bulk_width == 8) { 3205 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3206 } 3207 3208 // XOR the encrypted counters with the inputs 3209 for (int i = 0; i < bulk_width; i++) { 3210 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3211 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3212 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3213 } 3214 3215 // Write the encrypted data 3216 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3217 if (bulk_width == 8) { 3218 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3219 } 3220 3221 __ subw(len, len, 16 * bulk_width); 3222 __ cbnzw(len, L_CTR_loop); 3223 } 3224 3225 // Save the counter back where it goes 3226 __ rev64(v16, __ T16B, v16); 3227 __ st1(v16, __ T16B, counter); 3228 3229 __ pop(saved_regs, sp); 3230 3231 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3232 if (bulk_width == 8) { 3233 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3234 } 3235 3236 __ andr(rscratch1, len, -16 * bulk_width); 3237 __ sub(len, len, rscratch1); 3238 __ add(offset, offset, rscratch1); 3239 __ mov(used, 16); 3240 __ strw(used, Address(used_ptr)); 3241 __ b(large_block_return); 3242 3243 return start; 3244 } 3245 3246 // Vector AES Galois Counter Mode implementation. 
Parameters: 3247 // 3248 // in = c_rarg0 3249 // len = c_rarg1 3250 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3251 // out = c_rarg3 3252 // key = c_rarg4 3253 // state = c_rarg5 - GHASH.state 3254 // subkeyHtbl = c_rarg6 - powers of H 3255 // counter = c_rarg7 - 16 bytes of CTR 3256 // return - number of processed bytes 3257 address generate_galoisCounterMode_AESCrypt() { 3258 address ghash_polynomial = __ pc(); 3259 __ emit_int64(0x87); // The low-order bits of the field 3260 // polynomial (i.e. p = z^7+z^2+z+1) 3261 // repeated in the low and high parts of a 3262 // 128-bit vector 3263 __ emit_int64(0x87); 3264 3265 __ align(CodeEntryAlignment); 3266 StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt"); 3267 address start = __ pc(); 3268 __ enter(); 3269 3270 const Register in = c_rarg0; 3271 const Register len = c_rarg1; 3272 const Register ct = c_rarg2; 3273 const Register out = c_rarg3; 3274 // and updated with the incremented counter in the end 3275 3276 const Register key = c_rarg4; 3277 const Register state = c_rarg5; 3278 3279 const Register subkeyHtbl = c_rarg6; 3280 3281 const Register counter = c_rarg7; 3282 3283 const Register keylen = r10; 3284 // Save state before entering routine 3285 __ sub(sp, sp, 4 * 16); 3286 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3287 __ sub(sp, sp, 4 * 16); 3288 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3289 3290 // __ andr(len, len, -512); 3291 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3292 __ str(len, __ pre(sp, -2 * wordSize)); 3293 3294 Label DONE; 3295 __ cbz(len, DONE); 3296 3297 // Compute #rounds for AES based on the length of the key array 3298 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3299 3300 __ aesenc_loadkeys(key, keylen); 3301 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3302 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3303 3304 // AES/CTR loop 3305 { 3306 Label L_CTR_loop; 3307 __ BIND(L_CTR_loop); 3308 3309 // Setup the counters 3310 __ movi(v8, __ T4S, 0); 3311 __ movi(v9, __ T4S, 1); 3312 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3313 3314 assert(v0->encoding() < v8->encoding(), ""); 3315 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3316 FloatRegister f = as_FloatRegister(i); 3317 __ rev32(f, __ T16B, v16); 3318 __ addv(v16, __ T4S, v16, v8); 3319 } 3320 3321 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3322 3323 // Encrypt the counters 3324 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3325 3326 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3327 3328 // XOR the encrypted counters with the inputs 3329 for (int i = 0; i < 8; i++) { 3330 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3331 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3332 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3333 } 3334 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3335 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3336 3337 __ subw(len, len, 16 * 8); 3338 __ cbnzw(len, L_CTR_loop); 3339 } 3340 3341 __ rev32(v16, __ T16B, v16); 3342 __ st1(v16, __ T16B, counter); 3343 3344 __ ldr(len, Address(sp)); 3345 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3346 3347 // GHASH/CTR loop 3348 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3349 len, /*unrolls*/4); 3350 3351 #ifdef ASSERT 3352 { Label L; 3353 __ 
cmp(len, (unsigned char)0); 3354 __ br(Assembler::EQ, L); 3355 __ stop("stubGenerator: abort"); 3356 __ bind(L); 3357 } 3358 #endif 3359 3360 __ bind(DONE); 3361 // Return the number of bytes processed 3362 __ ldr(r0, __ post(sp, 2 * wordSize)); 3363 3364 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3365 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3366 3367 __ leave(); // required for proper stackwalking of RuntimeStub frame 3368 __ ret(lr); 3369 return start; 3370 } 3371 3372 class Cached64Bytes { 3373 private: 3374 MacroAssembler *_masm; 3375 Register _regs[8]; 3376 3377 public: 3378 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3379 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3380 auto it = rs.begin(); 3381 for (auto &r: _regs) { 3382 r = *it; 3383 ++it; 3384 } 3385 } 3386 3387 void gen_loads(Register base) { 3388 for (int i = 0; i < 8; i += 2) { 3389 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3390 } 3391 } 3392 3393 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3394 void extract_u32(Register dest, int i) { 3395 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3396 } 3397 }; 3398 3399 // Utility routines for md5. 3400 // Clobbers r10 and r11. 3401 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3402 int k, int s, int t) { 3403 Register rscratch3 = r10; 3404 Register rscratch4 = r11; 3405 3406 __ eorw(rscratch3, r3, r4); 3407 __ movw(rscratch2, t); 3408 __ andw(rscratch3, rscratch3, r2); 3409 __ addw(rscratch4, r1, rscratch2); 3410 reg_cache.extract_u32(rscratch1, k); 3411 __ eorw(rscratch3, rscratch3, r4); 3412 __ addw(rscratch4, rscratch4, rscratch1); 3413 __ addw(rscratch3, rscratch3, rscratch4); 3414 __ rorw(rscratch2, rscratch3, 32 - s); 3415 __ addw(r1, rscratch2, r2); 3416 } 3417 3418 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3419 int k, int s, int t) { 3420 Register rscratch3 = r10; 3421 Register rscratch4 = r11; 3422 3423 __ andw(rscratch3, r2, r4); 3424 __ bicw(rscratch4, r3, r4); 3425 reg_cache.extract_u32(rscratch1, k); 3426 __ movw(rscratch2, t); 3427 __ orrw(rscratch3, rscratch3, rscratch4); 3428 __ addw(rscratch4, r1, rscratch2); 3429 __ addw(rscratch4, rscratch4, rscratch1); 3430 __ addw(rscratch3, rscratch3, rscratch4); 3431 __ rorw(rscratch2, rscratch3, 32 - s); 3432 __ addw(r1, rscratch2, r2); 3433 } 3434 3435 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3436 int k, int s, int t) { 3437 Register rscratch3 = r10; 3438 Register rscratch4 = r11; 3439 3440 __ eorw(rscratch3, r3, r4); 3441 __ movw(rscratch2, t); 3442 __ addw(rscratch4, r1, rscratch2); 3443 reg_cache.extract_u32(rscratch1, k); 3444 __ eorw(rscratch3, rscratch3, r2); 3445 __ addw(rscratch4, rscratch4, rscratch1); 3446 __ addw(rscratch3, rscratch3, rscratch4); 3447 __ rorw(rscratch2, rscratch3, 32 - s); 3448 __ addw(r1, rscratch2, r2); 3449 } 3450 3451 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3452 int k, int s, int t) { 3453 Register rscratch3 = r10; 3454 Register rscratch4 = r11; 3455 3456 __ movw(rscratch3, t); 3457 __ ornw(rscratch2, r2, r4); 3458 __ addw(rscratch4, r1, rscratch3); 3459 reg_cache.extract_u32(rscratch1, k); 3460 __ eorw(rscratch3, rscratch2, r3); 3461 __ addw(rscratch4, rscratch4, rscratch1); 3462 __ addw(rscratch3, rscratch3, rscratch4); 3463 __ rorw(rscratch2, rscratch3, 32 - s); 3464 __ 
addw(r1, rscratch2, r2);
3465 }
3466
3467 // Arguments:
3468 //
3469 // Inputs:
3470 // c_rarg0 - byte[] source+offset
3471 // c_rarg1 - int[] MD5.state
3472 // c_rarg2 - int offset
3473 // c_rarg3 - int limit
3474 //
3475 address generate_md5_implCompress(bool multi_block, const char *name) {
3476 __ align(CodeEntryAlignment);
3477 StubCodeMark mark(this, "StubRoutines", name);
3478 address start = __ pc();
3479
3480 Register buf = c_rarg0;
3481 Register state = c_rarg1;
3482 Register ofs = c_rarg2;
3483 Register limit = c_rarg3;
3484 Register a = r4;
3485 Register b = r5;
3486 Register c = r6;
3487 Register d = r7;
3488 Register rscratch3 = r10;
3489 Register rscratch4 = r11;
3490
3491 Register state_regs[2] = { r12, r13 };
3492 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3493 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers
3494
3495 __ push(saved_regs, sp);
3496
3497 __ ldp(state_regs[0], state_regs[1], Address(state));
3498 __ ubfx(a, state_regs[0], 0, 32);
3499 __ ubfx(b, state_regs[0], 32, 32);
3500 __ ubfx(c, state_regs[1], 0, 32);
3501 __ ubfx(d, state_regs[1], 32, 32);
3502
3503 Label md5_loop;
3504 __ BIND(md5_loop);
3505
3506 reg_cache.gen_loads(buf);
3507
3508 // Round 1
3509 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478);
3510 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756);
3511 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db);
3512 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee);
3513 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf);
3514 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a);
3515 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613);
3516 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501);
3517 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8);
3518 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af);
3519 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3520 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3521 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122);
3522 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3523 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3524 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3525
3526 // Round 2
3527 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562);
3528 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340);
3529 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3530 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa);
3531 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d);
3532 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453);
3533 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3534 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8);
3535 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6);
3536 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6);
3537 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87);
3538 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed);
3539 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905);
3540 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8);
3541 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9);
3542 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3543
3544 // Round 3
3545 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942);
3546 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681);
3547 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3548 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3549 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44);
3550 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9);
3551 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60);
3552 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3553 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6);
3554 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3555 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3556 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3557 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3558 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3559 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3560 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3561 3562 // Round 4 3563 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3564 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3565 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3566 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3567 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3568 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3569 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3570 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3571 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3572 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3573 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3574 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3575 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3576 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3577 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3578 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3579 3580 __ addw(a, state_regs[0], a); 3581 __ ubfx(rscratch2, state_regs[0], 32, 32); 3582 __ addw(b, rscratch2, b); 3583 __ addw(c, state_regs[1], c); 3584 __ ubfx(rscratch4, state_regs[1], 32, 32); 3585 __ addw(d, rscratch4, d); 3586 3587 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3588 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3589 3590 if (multi_block) { 3591 __ add(buf, buf, 64); 3592 __ add(ofs, ofs, 64); 3593 __ cmp(ofs, limit); 3594 __ br(Assembler::LE, md5_loop); 3595 __ mov(c_rarg0, ofs); // return ofs 3596 } 3597 3598 // write hash values back in the correct order 3599 __ stp(state_regs[0], state_regs[1], Address(state)); 3600 3601 __ pop(saved_regs, sp); 3602 3603 __ ret(lr); 3604 3605 return start; 3606 } 3607 3608 // Arguments: 3609 // 3610 // Inputs: 3611 // c_rarg0 - byte[] source+offset 3612 // c_rarg1 - int[] SHA.state 3613 // c_rarg2 - int offset 3614 // c_rarg3 - int limit 3615 // 3616 address generate_sha1_implCompress(bool multi_block, const char *name) { 3617 __ align(CodeEntryAlignment); 3618 StubCodeMark mark(this, "StubRoutines", name); 3619 address start = __ pc(); 3620 3621 Register buf = c_rarg0; 3622 Register state = c_rarg1; 3623 Register ofs = c_rarg2; 3624 Register limit = c_rarg3; 3625 3626 Label keys; 3627 Label sha1_loop; 3628 3629 // load the keys into v0..v3 3630 __ adr(rscratch1, keys); 3631 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3632 // load 5 words state into v6, v7 3633 __ ldrq(v6, Address(state, 0)); 3634 __ ldrs(v7, Address(state, 16)); 3635 3636 3637 __ BIND(sha1_loop); 3638 // load 64 bytes of data into v16..v19 3639 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3640 __ rev32(v16, __ T16B, v16); 3641 __ rev32(v17, __ T16B, v17); 3642 __ rev32(v18, __ T16B, v18); 3643 __ rev32(v19, __ T16B, v19); 3644 3645 // do the sha1 3646 __ addv(v4, __ T4S, v16, v0); 3647 __ orr(v20, __ T16B, v6, v6); 3648 3649 FloatRegister d0 = v16; 3650 FloatRegister d1 = v17; 3651 FloatRegister d2 = v18; 3652 FloatRegister d3 = v19; 3653 3654 for (int round = 0; round < 20; round++) { 3655 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3656 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3657 FloatRegister tmp3 = round ? ((round & 1) ? 
v22 : v21) : v7; 3658 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3659 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3660 3661 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3662 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3663 __ sha1h(tmp2, __ T4S, v20); 3664 if (round < 5) 3665 __ sha1c(v20, __ T4S, tmp3, tmp4); 3666 else if (round < 10 || round >= 15) 3667 __ sha1p(v20, __ T4S, tmp3, tmp4); 3668 else 3669 __ sha1m(v20, __ T4S, tmp3, tmp4); 3670 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3671 3672 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3673 } 3674 3675 __ addv(v7, __ T2S, v7, v21); 3676 __ addv(v6, __ T4S, v6, v20); 3677 3678 if (multi_block) { 3679 __ add(ofs, ofs, 64); 3680 __ cmp(ofs, limit); 3681 __ br(Assembler::LE, sha1_loop); 3682 __ mov(c_rarg0, ofs); // return ofs 3683 } 3684 3685 __ strq(v6, Address(state, 0)); 3686 __ strs(v7, Address(state, 16)); 3687 3688 __ ret(lr); 3689 3690 __ bind(keys); 3691 __ emit_int32(0x5a827999); 3692 __ emit_int32(0x6ed9eba1); 3693 __ emit_int32(0x8f1bbcdc); 3694 __ emit_int32(0xca62c1d6); 3695 3696 return start; 3697 } 3698 3699 3700 // Arguments: 3701 // 3702 // Inputs: 3703 // c_rarg0 - byte[] source+offset 3704 // c_rarg1 - int[] SHA.state 3705 // c_rarg2 - int offset 3706 // c_rarg3 - int limit 3707 // 3708 address generate_sha256_implCompress(bool multi_block, const char *name) { 3709 static const uint32_t round_consts[64] = { 3710 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3711 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3712 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3713 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3714 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3715 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3716 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3717 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3718 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3719 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3720 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3721 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3722 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3723 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3724 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3725 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3726 }; 3727 __ align(CodeEntryAlignment); 3728 StubCodeMark mark(this, "StubRoutines", name); 3729 address start = __ pc(); 3730 3731 Register buf = c_rarg0; 3732 Register state = c_rarg1; 3733 Register ofs = c_rarg2; 3734 Register limit = c_rarg3; 3735 3736 Label sha1_loop; 3737 3738 __ stpd(v8, v9, __ pre(sp, -32)); 3739 __ stpd(v10, v11, Address(sp, 16)); 3740 3741 // dga == v0 3742 // dgb == v1 3743 // dg0 == v2 3744 // dg1 == v3 3745 // dg2 == v4 3746 // t0 == v6 3747 // t1 == v7 3748 3749 // load 16 keys to v16..v31 3750 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3751 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3752 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3753 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3754 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3755 3756 // load 8 words (256 bits) state 3757 __ ldpq(v0, v1, state); 3758 3759 __ BIND(sha1_loop); 3760 // load 64 bytes of data into v8..v11 3761 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3762 __ rev32(v8, __ T16B, v8); 3763 __ rev32(v9, __ T16B, v9); 3764 __ rev32(v10, __ T16B, v10); 3765 __ rev32(v11, __ T16B, v11); 3766 3767 __ addv(v6, __ T4S, v8, v16); 3768 __ orr(v2, __ T16B, v0, v0); 3769 __ orr(v3, __ T16B, v1, v1); 3770 3771 FloatRegister d0 = v8; 3772 FloatRegister d1 = v9; 3773 FloatRegister d2 = v10; 3774 FloatRegister d3 = v11; 3775 3776 3777 for (int round = 0; round < 16; round++) { 3778 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3779 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3780 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3781 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3782 3783 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3784 __ orr(v4, __ T16B, v2, v2); 3785 if (round < 15) 3786 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3787 __ sha256h(v2, __ T4S, v3, tmp2); 3788 __ sha256h2(v3, __ T4S, v4, tmp2); 3789 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3790 3791 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3792 } 3793 3794 __ addv(v0, __ T4S, v0, v2); 3795 __ addv(v1, __ T4S, v1, v3); 3796 3797 if (multi_block) { 3798 __ add(ofs, ofs, 64); 3799 __ cmp(ofs, limit); 3800 __ br(Assembler::LE, sha1_loop); 3801 __ mov(c_rarg0, ofs); // return ofs 3802 } 3803 3804 __ ldpd(v10, v11, Address(sp, 16)); 3805 __ ldpd(v8, v9, __ post(sp, 32)); 3806 3807 __ stpq(v0, v1, state); 3808 3809 __ ret(lr); 3810 3811 return start; 3812 } 3813 3814 // Double rounds for sha512. 3815 void sha512_dround(int dr, 3816 FloatRegister vi0, FloatRegister vi1, 3817 FloatRegister vi2, FloatRegister vi3, 3818 FloatRegister vi4, FloatRegister vrc0, 3819 FloatRegister vrc1, FloatRegister vin0, 3820 FloatRegister vin1, FloatRegister vin2, 3821 FloatRegister vin3, FloatRegister vin4) { 3822 if (dr < 36) { 3823 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3824 } 3825 __ addv(v5, __ T2D, vrc0, vin0); 3826 __ ext(v6, __ T16B, vi2, vi3, 8); 3827 __ ext(v5, __ T16B, v5, v5, 8); 3828 __ ext(v7, __ T16B, vi1, vi2, 8); 3829 __ addv(vi3, __ T2D, vi3, v5); 3830 if (dr < 32) { 3831 __ ext(v5, __ T16B, vin3, vin4, 8); 3832 __ sha512su0(vin0, __ T2D, vin1); 3833 } 3834 __ sha512h(vi3, __ T2D, v6, v7); 3835 if (dr < 32) { 3836 __ sha512su1(vin0, __ T2D, vin2, v5); 3837 } 3838 __ addv(vi4, __ T2D, vi1, vi3); 3839 __ sha512h2(vi3, __ T2D, vi1, vi0); 3840 } 3841 3842 // Arguments: 3843 // 3844 // Inputs: 3845 // c_rarg0 - byte[] source+offset 3846 // c_rarg1 - int[] SHA.state 3847 // c_rarg2 - int offset 3848 // c_rarg3 - int limit 3849 // 3850 address generate_sha512_implCompress(bool multi_block, const char *name) { 3851 static const uint64_t round_consts[80] = { 3852 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3853 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3854 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3855 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3856 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3857 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3858 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3859 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3860 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3861 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3862 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3863 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3864 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3865 0x92722C851482353BL, 
0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3866 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3867 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3868 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3869 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3870 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3871 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3872 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3873 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3874 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3875 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3876 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3877 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3878 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3879 }; 3880 3881 __ align(CodeEntryAlignment); 3882 StubCodeMark mark(this, "StubRoutines", name); 3883 address start = __ pc(); 3884 3885 Register buf = c_rarg0; 3886 Register state = c_rarg1; 3887 Register ofs = c_rarg2; 3888 Register limit = c_rarg3; 3889 3890 __ stpd(v8, v9, __ pre(sp, -64)); 3891 __ stpd(v10, v11, Address(sp, 16)); 3892 __ stpd(v12, v13, Address(sp, 32)); 3893 __ stpd(v14, v15, Address(sp, 48)); 3894 3895 Label sha512_loop; 3896 3897 // load state 3898 __ ld1(v8, v9, v10, v11, __ T2D, state); 3899 3900 // load first 4 round constants 3901 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3902 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3903 3904 __ BIND(sha512_loop); 3905 // load 128B of data into v12..v19 3906 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3907 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3908 __ rev64(v12, __ T16B, v12); 3909 __ rev64(v13, __ T16B, v13); 3910 __ rev64(v14, __ T16B, v14); 3911 __ rev64(v15, __ T16B, v15); 3912 __ rev64(v16, __ T16B, v16); 3913 __ rev64(v17, __ T16B, v17); 3914 __ rev64(v18, __ T16B, v18); 3915 __ rev64(v19, __ T16B, v19); 3916 3917 __ mov(rscratch2, rscratch1); 3918 3919 __ mov(v0, __ T16B, v8); 3920 __ mov(v1, __ T16B, v9); 3921 __ mov(v2, __ T16B, v10); 3922 __ mov(v3, __ T16B, v11); 3923 3924 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 3925 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 3926 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 3927 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 3928 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 3929 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 3930 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 3931 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 3932 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 3933 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 3934 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 3935 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 3936 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 3937 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 3938 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 3939 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 3940 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, 
v13, v19, v16, v17); 3941 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 3942 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 3943 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 3944 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 3945 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 3946 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 3947 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 3948 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 3949 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 3950 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 3951 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 3952 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 3953 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 3954 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 3955 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 3956 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 3957 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 3958 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 3959 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 3960 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 3961 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 3962 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 3963 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 3964 3965 __ addv(v8, __ T2D, v8, v0); 3966 __ addv(v9, __ T2D, v9, v1); 3967 __ addv(v10, __ T2D, v10, v2); 3968 __ addv(v11, __ T2D, v11, v3); 3969 3970 if (multi_block) { 3971 __ add(ofs, ofs, 128); 3972 __ cmp(ofs, limit); 3973 __ br(Assembler::LE, sha512_loop); 3974 __ mov(c_rarg0, ofs); // return ofs 3975 } 3976 3977 __ st1(v8, v9, v10, v11, __ T2D, state); 3978 3979 __ ldpd(v14, v15, Address(sp, 48)); 3980 __ ldpd(v12, v13, Address(sp, 32)); 3981 __ ldpd(v10, v11, Address(sp, 16)); 3982 __ ldpd(v8, v9, __ post(sp, 64)); 3983 3984 __ ret(lr); 3985 3986 return start; 3987 } 3988 3989 // Arguments: 3990 // 3991 // Inputs: 3992 // c_rarg0 - byte[] source+offset 3993 // c_rarg1 - byte[] SHA.state 3994 // c_rarg2 - int block_size 3995 // c_rarg3 - int offset 3996 // c_rarg4 - int limit 3997 // 3998 address generate_sha3_implCompress(bool multi_block, const char *name) { 3999 static const uint64_t round_consts[24] = { 4000 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4001 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4002 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4003 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4004 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4005 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4006 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4007 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4008 }; 4009 4010 __ align(CodeEntryAlignment); 4011 StubCodeMark mark(this, "StubRoutines", name); 4012 address start = __ pc(); 4013 4014 Register buf = c_rarg0; 4015 Register state = c_rarg1; 4016 Register block_size = c_rarg2; 4017 Register ofs = c_rarg3; 4018 Register 
limit = c_rarg4;
4019
4020 Label sha3_loop, rounds24_loop;
4021 Label sha3_512_or_sha3_384, shake128;
4022
4023 __ stpd(v8, v9, __ pre(sp, -64));
4024 __ stpd(v10, v11, Address(sp, 16));
4025 __ stpd(v12, v13, Address(sp, 32));
4026 __ stpd(v14, v15, Address(sp, 48));
4027
4028 // load state
4029 __ add(rscratch1, state, 32);
4030 __ ld1(v0, v1, v2, v3, __ T1D, state);
4031 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
4032 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4033 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4034 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4035 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4036 __ ld1(v24, __ T1D, rscratch1);
4037
4038 __ BIND(sha3_loop);
4039
4040 // 24 keccak rounds
4041 __ movw(rscratch2, 24);
4042
4043 // load round_constants base
4044 __ lea(rscratch1, ExternalAddress((address) round_consts));
4045
4046 // load input
4047 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4048 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4049 __ eor(v0, __ T8B, v0, v25);
4050 __ eor(v1, __ T8B, v1, v26);
4051 __ eor(v2, __ T8B, v2, v27);
4052 __ eor(v3, __ T8B, v3, v28);
4053 __ eor(v4, __ T8B, v4, v29);
4054 __ eor(v5, __ T8B, v5, v30);
4055 __ eor(v6, __ T8B, v6, v31);
4056
4057 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4058 __ tbz(block_size, 7, sha3_512_or_sha3_384);
4059
4060 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4061 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4062 __ eor(v7, __ T8B, v7, v25);
4063 __ eor(v8, __ T8B, v8, v26);
4064 __ eor(v9, __ T8B, v9, v27);
4065 __ eor(v10, __ T8B, v10, v28);
4066 __ eor(v11, __ T8B, v11, v29);
4067 __ eor(v12, __ T8B, v12, v30);
4068 __ eor(v13, __ T8B, v13, v31);
4069
4070 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
4071 __ eor(v14, __ T8B, v14, v25);
4072 __ eor(v15, __ T8B, v15, v26);
4073 __ eor(v16, __ T8B, v16, v27);
4074
4075 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4076 __ andw(c_rarg5, block_size, 48);
4077 __ cbzw(c_rarg5, rounds24_loop);
4078
4079 __ tbnz(block_size, 5, shake128);
4080 // block_size == 144, bit5 == 0, SHA3-224
4081 __ ldrd(v28, __ post(buf, 8));
4082 __ eor(v17, __ T8B, v17, v28);
4083 __ b(rounds24_loop);
4084
4085 __ BIND(shake128);
4086 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4087 __ eor(v17, __ T8B, v17, v28);
4088 __ eor(v18, __ T8B, v18, v29);
4089 __ eor(v19, __ T8B, v19, v30);
4090 __ eor(v20, __ T8B, v20, v31);
4091 __ b(rounds24_loop); // block_size == 168, SHAKE128
4092
4093 __ BIND(sha3_512_or_sha3_384);
4094 __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4095 __ eor(v7, __ T8B, v7, v25);
4096 __ eor(v8, __ T8B, v8, v26);
4097 __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4098
4099 // SHA3-384
4100 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4101 __ eor(v9, __ T8B, v9, v27);
4102 __ eor(v10, __ T8B, v10, v28);
4103 __ eor(v11, __ T8B, v11, v29);
4104 __ eor(v12, __ T8B, v12, v30);
4105
4106 __ BIND(rounds24_loop);
4107 __ subw(rscratch2, rscratch2, 1);
4108
4109 __ eor3(v29, __ T16B, v4, v9, v14);
4110 __ eor3(v26, __ T16B, v1, v6, v11);
4111 __ eor3(v28, __ T16B, v3, v8, v13);
4112 __ eor3(v25, __ T16B, v0, v5, v10);
4113 __ eor3(v27, __ T16B, v2, v7, v12);
4114 __ eor3(v29, __ T16B, v29, v19, v24);
4115 __ eor3(v26, __ T16B, v26, v16, v21);
4116 __ eor3(v28, __ T16B, v28, v18, v23);
4117 __ eor3(v25, __ T16B, v25, v15, v20);
4118 __ eor3(v27, __ T16B, v27, v17, v22);
4119
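// Theta, continued (editorial note; the lane-to-register mapping below is
// an assumed reading of this code, not an authoritative spec): the eor3
// sequence above folded the 25 state lanes into the five column parities
// C[0]..C[4], held in v25..v29. Each rax1 (rotate-and-xor) computes one
// theta D-value, D[x] = C[(x+4)%5] ^ rotl64(C[(x+1)%5], 1); e.g. the
// first one yields v30 = C[4] ^ rotl64(C[1], 1) = D[0].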
4120 __ rax1(v30, __ T2D, v29, v26);
4121 __ rax1(v26, __ T2D, v26, v28);
4122 __ rax1(v28, __ T2D, v28, v25);
4123 __ rax1(v25, __ T2D, v25, v27);
4124 __ rax1(v27, __ T2D, v27, v29);
4125
// Rho/pi: xar(d, n, m, 64 - r) computes d = rotl64(n ^ m, r), i.e. the
// theta xor with a D-value followed by the rho rotation.
4126 __ eor(v0, __ T16B, v0, v30);
4127 __ xar(v29, __ T2D, v1, v25, (64 - 1));
4128 __ xar(v1, __ T2D, v6, v25, (64 - 44));
4129 __ xar(v6, __ T2D, v9, v28, (64 - 20));
4130 __ xar(v9, __ T2D, v22, v26, (64 - 61));
4131 __ xar(v22, __ T2D, v14, v28, (64 - 39));
4132 __ xar(v14, __ T2D, v20, v30, (64 - 18));
4133 __ xar(v31, __ T2D, v2, v26, (64 - 62));
4134 __ xar(v2, __ T2D, v12, v26, (64 - 43));
4135 __ xar(v12, __ T2D, v13, v27, (64 - 25));
4136 __ xar(v13, __ T2D, v19, v28, (64 - 8));
4137 __ xar(v19, __ T2D, v23, v27, (64 - 56));
4138 __ xar(v23, __ T2D, v15, v30, (64 - 41));
4139 __ xar(v15, __ T2D, v4, v28, (64 - 27));
4140 __ xar(v28, __ T2D, v24, v28, (64 - 14));
4141 __ xar(v24, __ T2D, v21, v25, (64 - 2));
4142 __ xar(v8, __ T2D, v8, v27, (64 - 55));
4143 __ xar(v4, __ T2D, v16, v25, (64 - 45));
4144 __ xar(v16, __ T2D, v5, v30, (64 - 36));
4145 __ xar(v5, __ T2D, v3, v27, (64 - 28));
4146 __ xar(v27, __ T2D, v18, v27, (64 - 21));
4147 __ xar(v3, __ T2D, v17, v26, (64 - 15));
4148 __ xar(v25, __ T2D, v11, v25, (64 - 10));
4149 __ xar(v26, __ T2D, v7, v26, (64 - 6));
4150 __ xar(v30, __ T2D, v10, v30, (64 - 3));
4151
// Chi: bcax(d, n, m, a) computes d = n ^ (m & ~a).
4152 __ bcax(v20, __ T16B, v31, v22, v8);
4153 __ bcax(v21, __ T16B, v8, v23, v22);
4154 __ bcax(v22, __ T16B, v22, v24, v23);
4155 __ bcax(v23, __ T16B, v23, v31, v24);
4156 __ bcax(v24, __ T16B, v24, v8, v31);
4157
4158 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // iota round constant
4159
4160 __ bcax(v17, __ T16B, v25, v19, v3);
4161 __ bcax(v18, __ T16B, v3, v15, v19);
4162 __ bcax(v19, __ T16B, v19, v16, v15);
4163 __ bcax(v15, __ T16B, v15, v25, v16);
4164 __ bcax(v16, __ T16B, v16, v3, v25);
4165
4166 __ bcax(v10, __ T16B, v29, v12, v26);
4167 __ bcax(v11, __ T16B, v26, v13, v12);
4168 __ bcax(v12, __ T16B, v12, v14, v13);
4169 __ bcax(v13, __ T16B, v13, v29, v14);
4170 __ bcax(v14, __ T16B, v14, v26, v29);
4171
4172 __ bcax(v7, __ T16B, v30, v9, v4);
4173 __ bcax(v8, __ T16B, v4, v5, v9);
4174 __ bcax(v9, __ T16B, v9, v6, v5);
4175 __ bcax(v5, __ T16B, v5, v30, v6);
4176 __ bcax(v6, __ T16B, v6, v4, v30);
4177
4178 __ bcax(v3, __ T16B, v27, v0, v28);
4179 __ bcax(v4, __ T16B, v28, v1, v0);
4180 __ bcax(v0, __ T16B, v0, v2, v1);
4181 __ bcax(v1, __ T16B, v1, v27, v2);
4182 __ bcax(v2, __ T16B, v2, v28, v27);
4183
4184 __ eor(v0, __ T16B, v0, v31); // iota: fold the round constant into lane (0,0)
4185
4186 __ cbnzw(rscratch2, rounds24_loop);
4187
4188 if (multi_block) {
4189 __ add(ofs, ofs, block_size);
4190 __ cmp(ofs, limit);
4191 __ br(Assembler::LE, sha3_loop);
4192 __ mov(c_rarg0, ofs); // return ofs
4193 }
4194
4195 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
4196 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
4197 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4198 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4199 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4200 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4201 __ st1(v24, __ T1D, state);
4202
4203 __ ldpd(v14, v15, Address(sp, 48));
4204 __ ldpd(v12, v13, Address(sp, 32));
4205 __ ldpd(v10, v11, Address(sp, 16));
4206 __ ldpd(v8, v9, __ post(sp, 64));
4207
4208 __ ret(lr);
4209
4210 return start;
4211 }
4212
4213 /**
4214 * Arguments:
4215 *
4216 * Inputs:
4217 * c_rarg0 - int crc
4218 * c_rarg1 - byte* buf
4219 * c_rarg2 - int length
4220 *
4221 * Output:
4222 * r0 - int crc result
4223 */
4224 address generate_updateBytesCRC32() {
4225 assert(UseCRC32Intrinsics,
"what are we doing here?"); 4226 4227 __ align(CodeEntryAlignment); 4228 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4229 4230 address start = __ pc(); 4231 4232 const Register crc = c_rarg0; // crc 4233 const Register buf = c_rarg1; // source java byte array address 4234 const Register len = c_rarg2; // length 4235 const Register table0 = c_rarg3; // crc_table address 4236 const Register table1 = c_rarg4; 4237 const Register table2 = c_rarg5; 4238 const Register table3 = c_rarg6; 4239 const Register tmp3 = c_rarg7; 4240 4241 BLOCK_COMMENT("Entry:"); 4242 __ enter(); // required for proper stackwalking of RuntimeStub frame 4243 4244 __ kernel_crc32(crc, buf, len, 4245 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4246 4247 __ leave(); // required for proper stackwalking of RuntimeStub frame 4248 __ ret(lr); 4249 4250 return start; 4251 } 4252 4253 // ChaCha20 block function. This version parallelizes by loading 4254 // individual 32-bit state elements into vectors for four blocks 4255 // (e.g. all four blocks' worth of state[0] in one register, etc.) 4256 // 4257 // state (int[16]) = c_rarg0 4258 // keystream (byte[1024]) = c_rarg1 4259 // return - number of bytes of keystream (always 256) 4260 address generate_chacha20Block_blockpar() { 4261 Label L_twoRounds, L_cc20_const; 4262 // The constant data is broken into two 128-bit segments to be loaded 4263 // onto FloatRegisters. The first 128 bits are a counter add overlay 4264 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4265 // The second 128-bits is a table constant used for 8-bit left rotations. 4266 __ BIND(L_cc20_const); 4267 __ emit_int64(0x0000000100000000UL); 4268 __ emit_int64(0x0000000300000002UL); 4269 __ emit_int64(0x0605040702010003UL); 4270 __ emit_int64(0x0E0D0C0F0A09080BUL); 4271 4272 __ align(CodeEntryAlignment); 4273 StubCodeMark mark(this, "StubRoutines", "chacha20Block"); 4274 address start = __ pc(); 4275 __ enter(); 4276 4277 int i, j; 4278 const Register state = c_rarg0; 4279 const Register keystream = c_rarg1; 4280 const Register loopCtr = r10; 4281 const Register tmpAddr = r11; 4282 4283 const FloatRegister stateFirst = v0; 4284 const FloatRegister stateSecond = v1; 4285 const FloatRegister stateThird = v2; 4286 const FloatRegister stateFourth = v3; 4287 const FloatRegister origCtrState = v28; 4288 const FloatRegister scratch = v29; 4289 const FloatRegister lrot8Tbl = v30; 4290 4291 // Organize SIMD registers in an array that facilitates 4292 // putting repetitive opcodes into loop structures. It is 4293 // important that each grouping of 4 registers is monotonically 4294 // increasing to support the requirements of multi-register 4295 // instructions (e.g. ld4r, st4, etc.) 4296 const FloatRegister workSt[16] = { 4297 v4, v5, v6, v7, v16, v17, v18, v19, 4298 v20, v21, v22, v23, v24, v25, v26, v27 4299 }; 4300 4301 // Load from memory and interlace across 16 SIMD registers, 4302 // With each word from memory being broadcast to all lanes of 4303 // each successive SIMD register. 4304 // Addr(0) -> All lanes in workSt[i] 4305 // Addr(4) -> All lanes workSt[i + 1], etc. 4306 __ mov(tmpAddr, state); 4307 for (i = 0; i < 16; i += 4) { 4308 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4309 __ post(tmpAddr, 16)); 4310 } 4311 4312 // Pull in constant data. The first 16 bytes are the add overlay 4313 // which is applied to the vector holding the counter (state[12]). 
4314 // The second 16 bytes form the index vector for the 8-bit left
4315 // rotation tbl instruction.
4316 __ adr(tmpAddr, L_cc20_const);
4317 __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
4318 __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
4319
4320 // Set up the 10 iteration loop and perform all 8 quarter round ops
4321 __ mov(loopCtr, 10);
4322 __ BIND(L_twoRounds);
4323
4324 __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
4325 scratch, lrot8Tbl);
4326 __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
4327 scratch, lrot8Tbl);
4328 __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
4329 scratch, lrot8Tbl);
4330 __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
4331 scratch, lrot8Tbl);
4332
4333 __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
4334 scratch, lrot8Tbl);
4335 __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
4336 scratch, lrot8Tbl);
4337 __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
4338 scratch, lrot8Tbl);
4339 __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
4340 scratch, lrot8Tbl);
4341
4342 // Decrement and iterate
4343 __ sub(loopCtr, loopCtr, 1);
4344 __ cbnz(loopCtr, L_twoRounds);
4345
4346 __ mov(tmpAddr, state);
4347
4348 // Add the starting state back to the post-loop keystream
4349 // state. We read/interlace the state array from memory into
4350 // 4 registers similar to what we did in the beginning. Then
4351 // add the counter overlay onto workSt[12] at the end.
4352 for (i = 0; i < 16; i += 4) {
4353 __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4354 __ post(tmpAddr, 16));
4355 __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
4356 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
4357 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
4358 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
4359 }
4360 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask
4361
4362 // Write to key stream, storing the same element out of workSt[0..15]
4363 // to consecutive 4-byte offsets in the key stream buffer, then repeating
4364 // for the next element position.
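// As a scalar sketch (editorial; store_le32 and the indexing notation are
// illustrative, assuming little-endian 4-byte stores), the loops below lay
// out four consecutive 64-byte keystream blocks:
//   for (int lane = 0; lane < 4; lane++)      // block within the batch
//     for (int word = 0; word < 16; word++)   // ChaCha20 state word
//       store_le32(keystream + (lane * 16 + word) * 4, workSt[word][lane]);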
4365 for (i = 0; i < 4; i++) { 4366 for (j = 0; j < 16; j += 4) { 4367 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4368 __ post(keystream, 16)); 4369 } 4370 } 4371 4372 __ mov(r0, 256); // Return length of output keystream 4373 __ leave(); 4374 __ ret(lr); 4375 4376 return start; 4377 } 4378 4379 /** 4380 * Arguments: 4381 * 4382 * Inputs: 4383 * c_rarg0 - int crc 4384 * c_rarg1 - byte* buf 4385 * c_rarg2 - int length 4386 * c_rarg3 - int* table 4387 * 4388 * Output: 4389 * r0 - int crc result 4390 */ 4391 address generate_updateBytesCRC32C() { 4392 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4393 4394 __ align(CodeEntryAlignment); 4395 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4396 4397 address start = __ pc(); 4398 4399 const Register crc = c_rarg0; // crc 4400 const Register buf = c_rarg1; // source java byte array address 4401 const Register len = c_rarg2; // length 4402 const Register table0 = c_rarg3; // crc_table address 4403 const Register table1 = c_rarg4; 4404 const Register table2 = c_rarg5; 4405 const Register table3 = c_rarg6; 4406 const Register tmp3 = c_rarg7; 4407 4408 BLOCK_COMMENT("Entry:"); 4409 __ enter(); // required for proper stackwalking of RuntimeStub frame 4410 4411 __ kernel_crc32c(crc, buf, len, 4412 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4413 4414 __ leave(); // required for proper stackwalking of RuntimeStub frame 4415 __ ret(lr); 4416 4417 return start; 4418 } 4419 4420 /*** 4421 * Arguments: 4422 * 4423 * Inputs: 4424 * c_rarg0 - int adler 4425 * c_rarg1 - byte* buff 4426 * c_rarg2 - int len 4427 * 4428 * Output: 4429 * c_rarg0 - int adler result 4430 */ 4431 address generate_updateBytesAdler32() { 4432 __ align(CodeEntryAlignment); 4433 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4434 address start = __ pc(); 4435 4436 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4437 4438 // Aliases 4439 Register adler = c_rarg0; 4440 Register s1 = c_rarg0; 4441 Register s2 = c_rarg3; 4442 Register buff = c_rarg1; 4443 Register len = c_rarg2; 4444 Register nmax = r4; 4445 Register base = r5; 4446 Register count = r6; 4447 Register temp0 = rscratch1; 4448 Register temp1 = rscratch2; 4449 FloatRegister vbytes = v0; 4450 FloatRegister vs1acc = v1; 4451 FloatRegister vs2acc = v2; 4452 FloatRegister vtable = v3; 4453 4454 // Max number of bytes we can process before having to take the mod 4455 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4456 uint64_t BASE = 0xfff1; 4457 uint64_t NMAX = 0x15B0; 4458 4459 __ mov(base, BASE); 4460 __ mov(nmax, NMAX); 4461 4462 // Load accumulation coefficients for the upper 16 bits 4463 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4464 __ ld1(vtable, __ T16B, Address(temp0)); 4465 4466 // s1 is initialized to the lower 16 bits of adler 4467 // s2 is initialized to the upper 16 bits of adler 4468 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4469 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4470 4471 // The pipelined loop needs at least 16 elements for 1 iteration 4472 // It does check this, but it is more effective to skip to the cleanup loop 4473 __ cmp(len, (u1)16); 4474 __ br(Assembler::HS, L_nmax); 4475 __ cbz(len, L_combine); 4476 4477 __ bind(L_simple_by1_loop); 4478 __ ldrb(temp0, Address(__ post(buff, 1))); 4479 __ add(s1, s1, temp0); 4480 __ add(s2, s2, s1); 4481 __ subs(len, len, 
1);
4482 __ br(Assembler::HI, L_simple_by1_loop);
4483
4484 // s1 = s1 % BASE
4485 __ subs(temp0, s1, base);
4486 __ csel(s1, temp0, s1, Assembler::HS);
4487
4488 // s2 = s2 % BASE
4489 __ lsr(temp0, s2, 16);
4490 __ lsl(temp1, temp0, 4);
4491 __ sub(temp1, temp1, temp0);
4492 __ add(s2, temp1, s2, ext::uxth);
4493
4494 __ subs(temp0, s2, base);
4495 __ csel(s2, temp0, s2, Assembler::HS);
4496
4497 __ b(L_combine);
4498
4499 __ bind(L_nmax);
4500 __ subs(len, len, nmax);
4501 __ sub(count, nmax, 16);
4502 __ br(Assembler::LO, L_by16);
4503
4504 __ bind(L_nmax_loop);
4505
4506 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4507 vbytes, vs1acc, vs2acc, vtable);
4508
4509 __ subs(count, count, 16);
4510 __ br(Assembler::HS, L_nmax_loop);
4511
4512 // s1 = s1 % BASE
// (folding trick: 2^16 == 15 (mod 65521), so x is reduced by repeatedly
// replacing it with (x >> 16) * 15 + (x & 0xffff), here done twice,
// followed by one conditional subtract of BASE)
4513 __ lsr(temp0, s1, 16);
4514 __ lsl(temp1, temp0, 4);
4515 __ sub(temp1, temp1, temp0);
4516 __ add(temp1, temp1, s1, ext::uxth);
4517
4518 __ lsr(temp0, temp1, 16);
4519 __ lsl(s1, temp0, 4);
4520 __ sub(s1, s1, temp0);
4521 __ add(s1, s1, temp1, ext::uxth);
4522
4523 __ subs(temp0, s1, base);
4524 __ csel(s1, temp0, s1, Assembler::HS);
4525
4526 // s2 = s2 % BASE
4527 __ lsr(temp0, s2, 16);
4528 __ lsl(temp1, temp0, 4);
4529 __ sub(temp1, temp1, temp0);
4530 __ add(temp1, temp1, s2, ext::uxth);
4531
4532 __ lsr(temp0, temp1, 16);
4533 __ lsl(s2, temp0, 4);
4534 __ sub(s2, s2, temp0);
4535 __ add(s2, s2, temp1, ext::uxth);
4536
4537 __ subs(temp0, s2, base);
4538 __ csel(s2, temp0, s2, Assembler::HS);
4539
4540 __ subs(len, len, nmax);
4541 __ sub(count, nmax, 16);
4542 __ br(Assembler::HS, L_nmax_loop);
4543
4544 __ bind(L_by16);
4545 __ adds(len, len, count);
4546 __ br(Assembler::LO, L_by1);
4547
4548 __ bind(L_by16_loop);
4549
4550 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4551 vbytes, vs1acc, vs2acc, vtable);
4552
4553 __ subs(len, len, 16);
4554 __ br(Assembler::HS, L_by16_loop);
4555
4556 __ bind(L_by1);
4557 __ adds(len, len, 15);
4558 __ br(Assembler::LO, L_do_mod);
4559
4560 __ bind(L_by1_loop);
4561 __ ldrb(temp0, Address(__ post(buff, 1)));
4562 __ add(s1, temp0, s1);
4563 __ add(s2, s2, s1);
4564 __ subs(len, len, 1);
4565 __ br(Assembler::HS, L_by1_loop);
4566
4567 __ bind(L_do_mod);
4568 // s1 = s1 % BASE (same folding as above)
4569 __ lsr(temp0, s1, 16);
4570 __ lsl(temp1, temp0, 4);
4571 __ sub(temp1, temp1, temp0);
4572 __ add(temp1, temp1, s1, ext::uxth);
4573
4574 __ lsr(temp0, temp1, 16);
4575 __ lsl(s1, temp0, 4);
4576 __ sub(s1, s1, temp0);
4577 __ add(s1, s1, temp1, ext::uxth);
4578
4579 __ subs(temp0, s1, base);
4580 __ csel(s1, temp0, s1, Assembler::HS);
4581
4582 // s2 = s2 % BASE
4583 __ lsr(temp0, s2, 16);
4584 __ lsl(temp1, temp0, 4);
4585 __ sub(temp1, temp1, temp0);
4586 __ add(temp1, temp1, s2, ext::uxth);
4587
4588 __ lsr(temp0, temp1, 16);
4589 __ lsl(s2, temp0, 4);
4590 __ sub(s2, s2, temp0);
4591 __ add(s2, s2, temp1, ext::uxth);
4592
4593 __ subs(temp0, s2, base);
4594 __ csel(s2, temp0, s2, Assembler::HS);
4595
4596 // Combine lower bits and higher bits
4597 __ bind(L_combine);
4598 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4599
4600 __ ret(lr);
4601
4602 return start;
4603 }
4604
4605 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4606 Register temp0, Register temp1, FloatRegister vbytes,
4607 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4608 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
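// (Recall the Adler-32 recurrence over the input bytes: s1 is the running
// byte sum and s2 the running sum of s1 values, both taken mod BASE = 65521.)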
4609 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4610 // In non-vectorized code, we update s1 and s2 as:
4611 // s1 <- s1 + b1
4612 // s2 <- s2 + s1
4613 // s1 <- s1 + b2
4614 // s2 <- s2 + s1
4615 // ...
4616 // s1 <- s1 + b16
4617 // s2 <- s2 + s1
4618 // Putting above assignments together, we have:
4619 // s1_new = s1 + b1 + b2 + ... + b16
4620 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4621 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4622 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
4623 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4624
4625 // s2 = s2 + s1 * 16
4626 __ add(s2, s2, s1, Assembler::LSL, 4);
4627
4628 // vs1acc = b1 + b2 + b3 + ... + b16
4629 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4630 __ umullv(vs2acc, __ T8B, vtable, vbytes);
4631 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4632 __ uaddlv(vs1acc, __ T16B, vbytes);
4633 __ uaddlv(vs2acc, __ T8H, vs2acc);
4634
4635 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4636 __ fmovd(temp0, vs1acc);
4637 __ fmovd(temp1, vs2acc);
4638 __ add(s1, s1, temp0);
4639 __ add(s2, s2, temp1);
4640 }
4641
4642 /**
4643 * Arguments:
4644 *
4645 * Input:
4646 * c_rarg0 - x address
4647 * c_rarg1 - x length
4648 * c_rarg2 - y address
4649 * c_rarg3 - y length
4650 * c_rarg4 - z address
4651 * c_rarg5 - z length
4652 */
4653 address generate_multiplyToLen() {
4654 __ align(CodeEntryAlignment);
4655 StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4656
4657 address start = __ pc();
4658 const Register x = r0;
4659 const Register xlen = r1;
4660 const Register y = r2;
4661 const Register ylen = r3;
4662 const Register z = r4;
4663 const Register zlen = r5;
4664
4665 const Register tmp1 = r10;
4666 const Register tmp2 = r11;
4667 const Register tmp3 = r12;
4668 const Register tmp4 = r13;
4669 const Register tmp5 = r14;
4670 const Register tmp6 = r15;
4671 const Register tmp7 = r16;
4672
4673 BLOCK_COMMENT("Entry:");
4674 __ enter(); // required for proper stackwalking of RuntimeStub frame
4675 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4676 __ leave(); // required for proper stackwalking of RuntimeStub frame
4677 __ ret(lr);
4678
4679 return start;
4680 }
4681
4682 address generate_squareToLen() {
4683 // The dedicated squareToLen algorithm for sizes 1..127 (described in the
4684 // Java code) is faster than multiply_to_len on some CPUs and slower on
4685 // others; overall, multiply_to_len gives slightly better results.
4686 __ align(CodeEntryAlignment);
4687 StubCodeMark mark(this, "StubRoutines", "squareToLen");
4688 address start = __ pc();
4689
4690 const Register x = r0;
4691 const Register xlen = r1;
4692 const Register z = r2;
4693 const Register zlen = r3;
4694 const Register y = r4; // == x
4695 const Register ylen = r5; // == xlen
4696
4697 const Register tmp1 = r10;
4698 const Register tmp2 = r11;
4699 const Register tmp3 = r12;
4700 const Register tmp4 = r13;
4701 const Register tmp5 = r14;
4702 const Register tmp6 = r15;
4703 const Register tmp7 = r16;
4704
4705 RegSet spilled_regs = RegSet::of(y, ylen);
4706 BLOCK_COMMENT("Entry:");
4707 __ enter();
4708 __ push(spilled_regs, sp);
4709 __ mov(y, x);
4710 __ mov(ylen, xlen);
4711 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4712 __ pop(spilled_regs, sp);
4713 __ leave();
4714 __ ret(lr);
4715 return start;
4716 }
4717
4718 address generate_mulAdd() {
4719 __ align(CodeEntryAlignment);
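// (Editorial note, assuming the java.math.BigInteger.implMulAdd contract:
// this stub multiplies in[0..len) by the int k, adds the product into
// out[] at the given offset, and returns the carry in r0.)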
StubCodeMark mark(this, "StubRoutines", "mulAdd"); 4721 4722 address start = __ pc(); 4723 4724 const Register out = r0; 4725 const Register in = r1; 4726 const Register offset = r2; 4727 const Register len = r3; 4728 const Register k = r4; 4729 4730 BLOCK_COMMENT("Entry:"); 4731 __ enter(); 4732 __ mul_add(out, in, offset, len, k); 4733 __ leave(); 4734 __ ret(lr); 4735 4736 return start; 4737 } 4738 4739 // Arguments: 4740 // 4741 // Input: 4742 // c_rarg0 - newArr address 4743 // c_rarg1 - oldArr address 4744 // c_rarg2 - newIdx 4745 // c_rarg3 - shiftCount 4746 // c_rarg4 - numIter 4747 // 4748 address generate_bigIntegerRightShift() { 4749 __ align(CodeEntryAlignment); 4750 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 4751 address start = __ pc(); 4752 4753 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4754 4755 Register newArr = c_rarg0; 4756 Register oldArr = c_rarg1; 4757 Register newIdx = c_rarg2; 4758 Register shiftCount = c_rarg3; 4759 Register numIter = c_rarg4; 4760 Register idx = numIter; 4761 4762 Register newArrCur = rscratch1; 4763 Register shiftRevCount = rscratch2; 4764 Register oldArrCur = r13; 4765 Register oldArrNext = r14; 4766 4767 FloatRegister oldElem0 = v0; 4768 FloatRegister oldElem1 = v1; 4769 FloatRegister newElem = v2; 4770 FloatRegister shiftVCount = v3; 4771 FloatRegister shiftVRevCount = v4; 4772 4773 __ cbz(idx, Exit); 4774 4775 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4776 4777 // left shift count 4778 __ movw(shiftRevCount, 32); 4779 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4780 4781 // numIter too small to allow a 4-words SIMD loop, rolling back 4782 __ cmp(numIter, (u1)4); 4783 __ br(Assembler::LT, ShiftThree); 4784 4785 __ dup(shiftVCount, __ T4S, shiftCount); 4786 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4787 __ negr(shiftVCount, __ T4S, shiftVCount); 4788 4789 __ BIND(ShiftSIMDLoop); 4790 4791 // Calculate the load addresses 4792 __ sub(idx, idx, 4); 4793 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4794 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4795 __ add(oldArrCur, oldArrNext, 4); 4796 4797 // Load 4 words and process 4798 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4799 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4800 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4801 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4802 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4803 __ st1(newElem, __ T4S, Address(newArrCur)); 4804 4805 __ cmp(idx, (u1)4); 4806 __ br(Assembler::LT, ShiftTwoLoop); 4807 __ b(ShiftSIMDLoop); 4808 4809 __ BIND(ShiftTwoLoop); 4810 __ cbz(idx, Exit); 4811 __ cmp(idx, (u1)1); 4812 __ br(Assembler::EQ, ShiftOne); 4813 4814 // Calculate the load addresses 4815 __ sub(idx, idx, 2); 4816 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4817 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4818 __ add(oldArrCur, oldArrNext, 4); 4819 4820 // Load 2 words and process 4821 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 4822 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4823 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4824 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4825 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4826 __ st1(newElem, __ T2S, Address(newArrCur)); 4827 __ b(ShiftTwoLoop); 4828 4829 __ BIND(ShiftThree); 4830 __ tbz(idx, 1, ShiftOne); 4831 __ tbz(idx, 0, ShiftTwo); 4832 __ ldrw(r10, Address(oldArr, 12)); 4833 __ ldrw(r11, Address(oldArr, 8)); 4834 __ lsrvw(r10, r10, shiftCount); 4835 __ lslvw(r11, 
r11, shiftRevCount); 4836 __ orrw(r12, r10, r11); 4837 __ strw(r12, Address(newArr, 8)); 4838 4839 __ BIND(ShiftTwo); 4840 __ ldrw(r10, Address(oldArr, 8)); 4841 __ ldrw(r11, Address(oldArr, 4)); 4842 __ lsrvw(r10, r10, shiftCount); 4843 __ lslvw(r11, r11, shiftRevCount); 4844 __ orrw(r12, r10, r11); 4845 __ strw(r12, Address(newArr, 4)); 4846 4847 __ BIND(ShiftOne); 4848 __ ldrw(r10, Address(oldArr, 4)); 4849 __ ldrw(r11, Address(oldArr)); 4850 __ lsrvw(r10, r10, shiftCount); 4851 __ lslvw(r11, r11, shiftRevCount); 4852 __ orrw(r12, r10, r11); 4853 __ strw(r12, Address(newArr)); 4854 4855 __ BIND(Exit); 4856 __ ret(lr); 4857 4858 return start; 4859 } 4860 4861 // Arguments: 4862 // 4863 // Input: 4864 // c_rarg0 - newArr address 4865 // c_rarg1 - oldArr address 4866 // c_rarg2 - newIdx 4867 // c_rarg3 - shiftCount 4868 // c_rarg4 - numIter 4869 // 4870 address generate_bigIntegerLeftShift() { 4871 __ align(CodeEntryAlignment); 4872 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 4873 address start = __ pc(); 4874 4875 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4876 4877 Register newArr = c_rarg0; 4878 Register oldArr = c_rarg1; 4879 Register newIdx = c_rarg2; 4880 Register shiftCount = c_rarg3; 4881 Register numIter = c_rarg4; 4882 4883 Register shiftRevCount = rscratch1; 4884 Register oldArrNext = rscratch2; 4885 4886 FloatRegister oldElem0 = v0; 4887 FloatRegister oldElem1 = v1; 4888 FloatRegister newElem = v2; 4889 FloatRegister shiftVCount = v3; 4890 FloatRegister shiftVRevCount = v4; 4891 4892 __ cbz(numIter, Exit); 4893 4894 __ add(oldArrNext, oldArr, 4); 4895 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4896 4897 // right shift count 4898 __ movw(shiftRevCount, 32); 4899 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4900 4901 // numIter too small to allow a 4-words SIMD loop, rolling back 4902 __ cmp(numIter, (u1)4); 4903 __ br(Assembler::LT, ShiftThree); 4904 4905 __ dup(shiftVCount, __ T4S, shiftCount); 4906 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4907 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 4908 4909 __ BIND(ShiftSIMDLoop); 4910 4911 // load 4 words and process 4912 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 4913 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 4914 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4915 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4916 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4917 __ st1(newElem, __ T4S, __ post(newArr, 16)); 4918 __ sub(numIter, numIter, 4); 4919 4920 __ cmp(numIter, (u1)4); 4921 __ br(Assembler::LT, ShiftTwoLoop); 4922 __ b(ShiftSIMDLoop); 4923 4924 __ BIND(ShiftTwoLoop); 4925 __ cbz(numIter, Exit); 4926 __ cmp(numIter, (u1)1); 4927 __ br(Assembler::EQ, ShiftOne); 4928 4929 // load 2 words and process 4930 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 4931 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 4932 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4933 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4934 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4935 __ st1(newElem, __ T2S, __ post(newArr, 8)); 4936 __ sub(numIter, numIter, 2); 4937 __ b(ShiftTwoLoop); 4938 4939 __ BIND(ShiftThree); 4940 __ ldrw(r10, __ post(oldArr, 4)); 4941 __ ldrw(r11, __ post(oldArrNext, 4)); 4942 __ lslvw(r10, r10, shiftCount); 4943 __ lsrvw(r11, r11, shiftRevCount); 4944 __ orrw(r12, r10, r11); 4945 __ strw(r12, __ post(newArr, 4)); 4946 __ tbz(numIter, 1, Exit); 4947 __ tbz(numIter, 0, ShiftOne); 4948 4949 __ BIND(ShiftTwo); 4950 __ 
ldrw(r10, __ post(oldArr, 4)); 4951 __ ldrw(r11, __ post(oldArrNext, 4)); 4952 __ lslvw(r10, r10, shiftCount); 4953 __ lsrvw(r11, r11, shiftRevCount); 4954 __ orrw(r12, r10, r11); 4955 __ strw(r12, __ post(newArr, 4)); 4956 4957 __ BIND(ShiftOne); 4958 __ ldrw(r10, Address(oldArr)); 4959 __ ldrw(r11, Address(oldArrNext)); 4960 __ lslvw(r10, r10, shiftCount); 4961 __ lsrvw(r11, r11, shiftRevCount); 4962 __ orrw(r12, r10, r11); 4963 __ strw(r12, Address(newArr)); 4964 4965 __ BIND(Exit); 4966 __ ret(lr); 4967 4968 return start; 4969 } 4970 4971 address generate_count_positives(address &count_positives_long) { 4972 const u1 large_loop_size = 64; 4973 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 4974 int dcache_line = VM_Version::dcache_line_size(); 4975 4976 Register ary1 = r1, len = r2, result = r0; 4977 4978 __ align(CodeEntryAlignment); 4979 4980 StubCodeMark mark(this, "StubRoutines", "count_positives"); 4981 4982 address entry = __ pc(); 4983 4984 __ enter(); 4985 // precondition: a copy of len is already in result 4986 // __ mov(result, len); 4987 4988 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 4989 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 4990 4991 __ cmp(len, (u1)15); 4992 __ br(Assembler::GT, LEN_OVER_15); 4993 // The only case when execution falls into this code is when pointer is near 4994 // the end of memory page and we have to avoid reading next page 4995 __ add(ary1, ary1, len); 4996 __ subs(len, len, 8); 4997 __ br(Assembler::GT, LEN_OVER_8); 4998 __ ldr(rscratch2, Address(ary1, -8)); 4999 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 5000 __ lsrv(rscratch2, rscratch2, rscratch1); 5001 __ tst(rscratch2, UPPER_BIT_MASK); 5002 __ csel(result, zr, result, Assembler::NE); 5003 __ leave(); 5004 __ ret(lr); 5005 __ bind(LEN_OVER_8); 5006 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 5007 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 5008 __ tst(rscratch2, UPPER_BIT_MASK); 5009 __ br(Assembler::NE, RET_NO_POP); 5010 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 5011 __ lsrv(rscratch1, rscratch1, rscratch2); 5012 __ tst(rscratch1, UPPER_BIT_MASK); 5013 __ bind(RET_NO_POP); 5014 __ csel(result, zr, result, Assembler::NE); 5015 __ leave(); 5016 __ ret(lr); 5017 5018 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 5019 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 5020 5021 count_positives_long = __ pc(); // 2nd entry point 5022 5023 __ enter(); 5024 5025 __ bind(LEN_OVER_15); 5026 __ push(spilled_regs, sp); 5027 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 5028 __ cbz(rscratch2, ALIGNED); 5029 __ ldp(tmp6, tmp1, Address(ary1)); 5030 __ mov(tmp5, 16); 5031 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 5032 __ add(ary1, ary1, rscratch1); 5033 __ orr(tmp6, tmp6, tmp1); 5034 __ tst(tmp6, UPPER_BIT_MASK); 5035 __ br(Assembler::NE, RET_ADJUST); 5036 __ sub(len, len, rscratch1); 5037 5038 __ bind(ALIGNED); 5039 __ cmp(len, large_loop_size); 5040 __ br(Assembler::LT, CHECK_16); 5041 // Perform 16-byte load as early return in pre-loop to handle situation 5042 // when initially aligned large array has negative values at starting bytes, 5043 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 5044 // slower. Cases with negative bytes further ahead won't be affected that 5045 // much. 
  address generate_count_positives(address &count_positives_long) {
    const u1 large_loop_size = 64;
    const uint64_t UPPER_BIT_MASK=0x8080808080808080;
    int dcache_line = VM_Version::dcache_line_size();

    Register ary1 = r1, len = r2, result = r0;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "count_positives");

    address entry = __ pc();

    __ enter();
    // precondition: a copy of len is already in result
    // __ mov(result, len);

    Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
        LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;

    __ cmp(len, (u1)15);
    __ br(Assembler::GT, LEN_OVER_15);
    // The only case when execution falls into this code is when the pointer is
    // near the end of a memory page and we have to avoid reading the next page
    __ add(ary1, ary1, len);
    __ subs(len, len, 8);
    __ br(Assembler::GT, LEN_OVER_8);
    __ ldr(rscratch2, Address(ary1, -8));
    __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
    __ lsrv(rscratch2, rscratch2, rscratch1);
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ csel(result, zr, result, Assembler::NE);
    __ leave();
    __ ret(lr);
    __ bind(LEN_OVER_8);
    __ ldp(rscratch1, rscratch2, Address(ary1, -16));
    __ sub(len, len, 8); // no data dependency, so sub can be executed while loading
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_NO_POP);
    __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
    __ lsrv(rscratch1, rscratch1, rscratch2);
    __ tst(rscratch1, UPPER_BIT_MASK);
    __ bind(RET_NO_POP);
    __ csel(result, zr, result, Assembler::NE);
    __ leave();
    __ ret(lr);

    Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
    const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;

    count_positives_long = __ pc(); // 2nd entry point

    __ enter();

    __ bind(LEN_OVER_15);
    __ push(spilled_regs, sp);
    __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
    __ cbz(rscratch2, ALIGNED);
    __ ldp(tmp6, tmp1, Address(ary1));
    __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
    __ add(ary1, ary1, rscratch1);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    __ sub(len, len, rscratch1);

    __ bind(ALIGNED);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load with an early return in the pre-loop to handle the
    // situation when an initially aligned large array has negative values at the
    // starting bytes, where LARGE_LOOP would do 4 reads instead of 1 (in the
    // worst case), which is slower. Cases with negative bytes further ahead
    // won't be affected that much. In fact, it'll be faster due to early loads,
    // fewer instructions and fewer branches in LARGE_LOOP.
    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_16);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);

    if (SoftwarePrefetchHintDistance >= 0
        && SoftwarePrefetchHintDistance >= dcache_line) {
      // initial prefetch
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
    }
    __ bind(LARGE_LOOP);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
    }
    // Issue the load instructions first, since this can save a few CPU/memory
    // cycles. Also, instead of 4 triples of "orr(...); addr(...); cbnz(...);"
    // (one per ldp), it is better to generate 7 * orr(...) + 1 andr(...) +
    // 1 cbnz(...), which saves 3 instructions per iteration and has fewer
    // branches; however, this approach disables the early return, so all
    // 64 bytes are loaded and checked every time.
    __ ldp(tmp2, tmp3, Address(ary1));
    __ ldp(tmp4, tmp5, Address(ary1, 16));
    __ ldp(rscratch1, rscratch2, Address(ary1, 32));
    __ ldp(tmp6, tmp1, Address(ary1, 48));
    __ add(ary1, ary1, large_loop_size);
    __ sub(len, len, large_loop_size);
    __ orr(tmp2, tmp2, tmp3);
    __ orr(tmp4, tmp4, tmp5);
    __ orr(rscratch1, rscratch1, rscratch2);
    __ orr(tmp6, tmp6, tmp1);
    __ orr(tmp2, tmp2, tmp4);
    __ orr(rscratch1, rscratch1, tmp6);
    __ orr(tmp2, tmp2, rscratch1);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_LONG);
    __ cmp(len, large_loop_size);
    __ br(Assembler::GE, LARGE_LOOP);

    __ bind(CHECK_16); // small 16-byte load pre-loop
    __ cmp(len, (u1)16);
    __ br(Assembler::LT, POST_LOOP16);

    __ bind(LOOP16); // small 16-byte load loop
    __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp2, tmp2, tmp3);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_16);
    __ cmp(len, (u1)16);
    __ br(Assembler::GE, LOOP16); // 16-byte load loop end

    __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
    __ cmp(len, (u1)8);
    __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
    __ ldr(tmp3, Address(__ post(ary1, 8)));
    __ tst(tmp3, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    __ sub(len, len, 8);

    __ bind(POST_LOOP16_LOAD_TAIL);
    __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
    __ ldr(tmp1, Address(ary1));
    __ mov(tmp2, 64);
    __ sub(tmp4, tmp2, len, __ LSL, 3);
    __ lslv(tmp1, tmp1, tmp4);
    __ tst(tmp1, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    // Fallthrough

    __ bind(RET_LEN);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);

    // The difference result - len is the count of bytes that are guaranteed
    // to be positive.

    __ bind(RET_ADJUST_LONG);
    __ add(len, len, (u1)(large_loop_size - 16));
    __ bind(RET_ADJUST_16);
    __ add(len, len, 16);
    __ bind(RET_ADJUST);
    __ pop(spilled_regs, sp);
    __ leave();
    __ sub(result, result, len);
    __ ret(lr);

    return entry;
  }
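
  // A minimal scalar sketch of the specification this stub implements
  // conservatively (illustrative only; names are ours). A byte is negative
  // iff its top bit is set, so a 64-bit word contains a negative byte iff
  // (word & UPPER_BIT_MASK) != 0. The stub may return any count no larger
  // than this exact value once a negative byte is present:
  //
  //   static size_t count_positives_scalar(const int8_t* ary, size_t len) {
  //     for (size_t i = 0; i < len; i++) {
  //       if (ary[i] < 0) return i;   // first negative byte
  //     }
  //     return len;                   // all bytes non-negative
  //   }
  //
  // The stub keeps the remaining byte count in len and returns result - len
  // on the first word or block that fails the UPPER_BIT_MASK test.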
  void generate_large_array_equals_loop_nonsimd(int loopThreshold,
                                                bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label LOOP;

    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp1, tmp1, tmp3);
    __ cbnz(tmp1, NOT_EQUAL);
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp5, tmp5, tmp6);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp1, tmp1, tmp3);
    __ cbnz(tmp1, NOT_EQUAL);
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp5, tmp5, tmp6);
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operand.
    __ subs(tmp6, cnt1, loopThreshold);
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ br(__ GE, LOOP);
    // post-loop
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ orr(tmp1, tmp1, tmp3);
    __ sub(cnt1, cnt1, 2 * wordSize);
    __ cbnz(tmp1, NOT_EQUAL);
  }

  void generate_large_array_equals_loop_simd(int loopThreshold,
                                             bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2;
    Label LOOP;

    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
    __ subs(tmp1, cnt1, loopThreshold);
    __ eor(v0, __ T16B, v0, v4);
    __ eor(v1, __ T16B, v1, v5);
    __ eor(v2, __ T16B, v2, v6);
    __ eor(v3, __ T16B, v3, v7);
    __ orr(v0, __ T16B, v0, v1);
    __ orr(v1, __ T16B, v2, v3);
    __ orr(v0, __ T16B, v0, v1);
    __ umov(tmp1, v0, __ D, 0);
    __ umov(tmp2, v0, __ D, 1);
    __ orr(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ br(__ GE, LOOP);
  }
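
  // The comparison trick used by both loops above, in scalar form
  // (illustrative only): the XOR of equal words is zero, so several words
  // can be checked with a single branch by OR-ing their XORs together.
  //
  //   static bool words_equal(const uint64_t* a, const uint64_t* b, size_t cnt) {
  //     uint64_t acc = 0;
  //     for (size_t i = 0; i < cnt; i++) {
  //       acc |= a[i] ^ b[i];   // accumulate differences
  //     }
  //     return acc == 0;        // branch once instead of once per word
  //   }
  //
  // The generated loops do the same per unrolled block, which still permits
  // an early exit between blocks.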
  // a1 = r1 - array1 address
  // a2 = r2 - array2 address
  // result = r0 - return value. Already contains "false"
  // cnt1 = r10 - number of elements left to check, reduced by wordSize
  // r3-r5 are reserved temporary registers
  // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
  address generate_large_array_equals() {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
        SMALL_LOOP, POST_LOOP;
    const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // calculate if at least 32 prefetched bytes are used
    int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
    int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
    RegSet spilled_regs = RegSet::range(tmp6, tmp8);
    assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
                               tmp5, tmp6, tmp7, tmp8);

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "large_array_equals");

    address entry = __ pc();
    __ enter();
    __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
    // also advance pointers to use post-increment instead of pre-increment
    __ add(a1, a1, wordSize);
    __ add(a2, a2, wordSize);
    if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD/non-SIMD) use relatively large load
      // instructions (ld1/ldp), which incur a huge penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
      // load, if needed, for at least the first address, to make it 16-byte
      // aligned.
      Label ALIGNED16;
      __ tbz(a1, 3, ALIGNED16);
      __ ldr(tmp1, Address(__ post(a1, wordSize)));
      __ ldr(tmp2, Address(__ post(a2, wordSize)));
      __ sub(cnt1, cnt1, wordSize);
      __ eor(tmp1, tmp1, tmp2);
      __ cbnz(tmp1, NOT_EQUAL_NO_POP);
      __ bind(ALIGNED16);
    }
    if (UseSIMDForArrayEquals) {
      if (SoftwarePrefetchHintDistance >= 0) {
        __ subs(tmp1, cnt1, prefetchLoopThreshold);
        __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
        generate_large_array_equals_loop_simd(prefetchLoopThreshold,
                                              /* prfm = */ true, NOT_EQUAL);
        __ subs(zr, cnt1, nonPrefetchLoopThreshold);
        __ br(__ LT, TAIL);
      }
      __ bind(NO_PREFETCH_LARGE_LOOP);
      generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
                                            /* prfm = */ false, NOT_EQUAL);
    } else {
      __ push(spilled_regs, sp);
      if (SoftwarePrefetchHintDistance >= 0) {
        __ subs(tmp1, cnt1, prefetchLoopThreshold);
        __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
        generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
                                                 /* prfm = */ true, NOT_EQUAL);
        __ subs(zr, cnt1, nonPrefetchLoopThreshold);
        __ br(__ LT, TAIL);
      }
      __ bind(NO_PREFETCH_LARGE_LOOP);
      generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
                                               /* prfm = */ false, NOT_EQUAL);
    }
    __ bind(TAIL);
    __ cbz(cnt1, EQUAL);
    __ subs(cnt1, cnt1, wordSize);
    __ br(__ LE, POST_LOOP);
    __ bind(SMALL_LOOP);
    __ ldr(tmp1, Address(__ post(a1, wordSize)));
    __ ldr(tmp2, Address(__ post(a2, wordSize)));
    __ subs(cnt1, cnt1, wordSize);
    __ eor(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ br(__ GT, SMALL_LOOP);
    __ bind(POST_LOOP);
    __ ldr(tmp1, Address(a1, cnt1));
    __ ldr(tmp2, Address(a2, cnt1));
    __ eor(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ bind(EQUAL);
    __ mov(result, true);
    __ bind(NOT_EQUAL);
    if (!UseSIMDForArrayEquals) {
      __ pop(spilled_regs, sp);
    }
    __ bind(NOT_EQUAL_NO_POP);
    __ leave();
    __ ret(lr);
    return entry;
  }

  address generate_dsin_dcos(bool isCos) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
    address start = __ pc();
    __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
                          (address)StubRoutines::aarch64::_two_over_pi,
                          (address)StubRoutines::aarch64::_pio2,
                          (address)StubRoutines::aarch64::_dsin_coef,
                          (address)StubRoutines::aarch64::_dcos_coef);
    return start;
  }

  // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
  void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
                              Label &DIFF2) {
    Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
    FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;

    __ ldrq(vtmp, Address(__ post(tmp2, 16)));
    __ ldr(tmpU, Address(__ post(cnt1, 8)));
    __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
    // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3

    __ fmovd(tmpL, vtmp3);
    __ eor(rscratch2, tmp3, tmpL);
    __ cbnz(rscratch2, DIFF2);

    __ ldr(tmp3, Address(__ post(cnt1, 8)));
    __ umov(tmpL, vtmp3, __ D, 1);
    __ eor(rscratch2, tmpU, tmpL);
    __ cbnz(rscratch2, DIFF1);

    __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
    __ ldr(tmpU, Address(__ post(cnt1, 8)));
    __ fmovd(tmpL, vtmp);
    __ eor(rscratch2, tmp3, tmpL);
    __ cbnz(rscratch2, DIFF2);

    __ ldr(tmp3, Address(__ post(cnt1, 8)));
    __ umov(tmpL, vtmp, __ D, 1);
    __ eor(rscratch2, tmpU, tmpL);
    __ cbnz(rscratch2, DIFF1);
  }
  // r0 = result
  // r1 = str1
  // r2 = cnt1
  // r3 = str2
  // r4 = cnt2
  // r10 = tmp1
  // r11 = tmp2
  address generate_compare_long_string_different_encoding(bool isLU) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", isLU
        ? "compare_long_string_different_encoding LU"
        : "compare_long_string_different_encoding UL");
    address entry = __ pc();
    Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
        DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
        LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
    FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
    RegSet spilled_regs = RegSet::of(tmp3, tmp4);

    int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);

    __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the first 4 symbols, which are already loaded (vtmp and tmp2(LU)/tmp1(UL))
    __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
    __ add(str1, str1, isLU ? wordSize/2 : wordSize);
    __ add(str2, str2, isLU ? wordSize : wordSize/2);
    __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
    __ eor(rscratch2, tmp1, tmp2);
    __ mov(rscratch1, tmp2);
    __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
    Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
             tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
    __ push(spilled_regs, sp);
    __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
    __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load

    __ ldr(tmp3, Address(__ post(cnt1, 8)));

    if (SoftwarePrefetchHintDistance >= 0) {
      __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
      __ br(__ LT, NO_PREFETCH);
      __ bind(LARGE_LOOP_PREFETCH);
      __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
      __ mov(tmp4, 2);
      __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
      __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
      __ subs(tmp4, tmp4, 1);
      __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
      __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
      __ mov(tmp4, 2);
      __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
      __ subs(tmp4, tmp4, 1);
      __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
      __ sub(cnt2, cnt2, 64);
      __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
      __ br(__ GE, LARGE_LOOP_PREFETCH);
    }
    __ cbz(cnt2, LOAD_LAST); // no characters left except last load
    __ bind(NO_PREFETCH);
    __ subs(cnt2, cnt2, 16);
    __ br(__ LT, TAIL);
    __ align(OptoLoopAlignment);
    __ bind(SMALL_LOOP); // smaller loop
    __ subs(cnt2, cnt2, 16);
    compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
    __ br(__ GE, SMALL_LOOP);
    __ cmn(cnt2, (u1)16);
    __ br(__ EQ, LOAD_LAST);
    __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
    __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
    __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
    __ ldr(tmp3, Address(cnt1, -8));
    compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
    __ b(LOAD_LAST);
    __ bind(DIFF2);
    __ mov(tmpU, tmp3);
    __ bind(DIFF1);
    __ pop(spilled_regs, sp);
    __ b(CALCULATE_DIFFERENCE);
    __ bind(LOAD_LAST);
    // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
    // compare_string_16_x_LU; no need to load them again.
    __ mov(tmpU, tmp3);
    __ pop(spilled_regs, sp);

    // tmp2 points to the address of the last 4 Latin1 characters right now
    __ ldrs(vtmp, Address(tmp2));
    __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
    __ fmovd(tmpL, vtmp);

    __ eor(rscratch2, tmpU, tmpL);
    __ cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    __ bind(CALCULATE_DIFFERENCE);
    __ rev(rscratch2, rscratch2);
    __ clz(rscratch2, rscratch2);
    __ andr(rscratch2, rscratch2, -16);
    __ lsrv(tmp1, tmp1, rscratch2);
    __ uxthw(tmp1, tmp1);
    __ lsrv(rscratch1, rscratch1, rscratch2);
    __ uxthw(rscratch1, rscratch1);
    __ subw(result, tmp1, rscratch1);
    __ bind(DONE);
    __ ret(lr);
    return entry;
  }

  // r0 = input (float16)
  // v0 = result (float)
  // v1 = temporary float register
  address generate_float16ToFloat() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
    address entry = __ pc();
    BLOCK_COMMENT("Entry:");
    __ flt16_to_flt(v0, r0, v1);
    __ ret(lr);
    return entry;
  }

  // v0 = input (float)
  // r0 = result (float16)
  // v1 = temporary float register
  address generate_floatToFloat16() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
    address entry = __ pc();
    BLOCK_COMMENT("Entry:");
    __ flt_to_flt16(r0, v0, v1);
    __ ret(lr);
    return entry;
  }

  address generate_method_entry_barrier() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");

    Label deoptimize_label;

    address start = __ pc();

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();

    if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
      BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
      // We can get here despite the nmethod being good, if we have not
      // yet applied our cross modification fence (or data fence).
      Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
      __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
      __ ldrw(rscratch2, rscratch2);
      __ strw(rscratch2, thread_epoch_addr);
      __ isb();
      __ membar(__ LoadLoad);
    }

    __ set_last_Java_frame(sp, rfp, lr, rscratch1);

    __ enter();
    __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr

    __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}

    __ push_call_clobbered_registers();

    __ mov(c_rarg0, rscratch2);
    __ call_VM_leaf
         (CAST_FROM_FN_PTR
          (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);

    __ reset_last_Java_frame(true);

    __ mov(rscratch1, r0);

    __ pop_call_clobbered_registers();

    __ cbnz(rscratch1, deoptimize_label);

    __ leave();
    __ ret(lr);

    __ BIND(deoptimize_label);

    __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
    __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));

    __ mov(sp, rscratch1);
    __ br(rscratch2);

    return start;
  }

  // r0 = result
  // r1 = str1
  // r2 = cnt1
  // r3 = str2
  // r4 = cnt2
  // r10 = tmp1
  // r11 = tmp2
  address generate_compare_long_string_same_encoding(bool isLL) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", isLL
        ? "compare_long_string_same_encoding LL"
        : "compare_long_string_same_encoding UU");
    address entry = __ pc();
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;

    Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;

    // exit from the large loop when less than 64 bytes are left to read or
    // we're about to prefetch memory beyond the array border
    int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);

    // 8 bytes were already pre-loaded before jumping to the stub, so compare them directly
    __ eor(rscratch2, tmp1, tmp2);
    __ cbnz(rscratch2, CAL_DIFFERENCE);

    __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
    // update pointers, because of previous read
    __ add(str1, str1, wordSize);
    __ add(str2, str2, wordSize);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ align(OptoLoopAlignment);
      __ bind(LARGE_LOOP_PREFETCH);
      __ prfm(Address(str1, SoftwarePrefetchHintDistance));
      __ prfm(Address(str2, SoftwarePrefetchHintDistance));

      for (int i = 0; i < 4; i++) {
        __ ldp(tmp1, tmp1h, Address(str1, i * 16));
        __ ldp(tmp2, tmp2h, Address(str2, i * 16));
        __ cmp(tmp1, tmp2);
        __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
        __ br(Assembler::NE, DIFF);
      }
      __ sub(cnt2, cnt2, isLL ? 64 : 32);
      __ add(str1, str1, 64);
      __ add(str2, str2, 64);
      __ subs(rscratch2, cnt2, largeLoopExitCondition);
      __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
      __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
    }

    __ subs(rscratch1, cnt2, isLL ? 16 : 8);
    __ br(Assembler::LE, LESS16);
    __ align(OptoLoopAlignment);
    __ bind(LOOP_COMPARE16);
    __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
    __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
    __ cmp(tmp1, tmp2);
    __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
    __ br(Assembler::NE, DIFF);
    __ sub(cnt2, cnt2, isLL ? 16 : 8);
    __ subs(rscratch2, cnt2, isLL ? 16 : 8);
    __ br(Assembler::LT, LESS16);

    __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
    __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
    __ cmp(tmp1, tmp2);
    __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
    __ br(Assembler::NE, DIFF);
    __ sub(cnt2, cnt2, isLL ? 16 : 8);
    __ subs(rscratch2, cnt2, isLL ? 16 : 8);
    __ br(Assembler::GE, LOOP_COMPARE16);
    __ cbz(cnt2, LENGTH_DIFF);

    __ bind(LESS16);
    // compare 8 bytes at a time
    __ subs(cnt2, cnt2, isLL ? 8 : 4);
    __ br(Assembler::LE, LESS8);
    __ ldr(tmp1, Address(__ post(str1, 8)));
    __ ldr(tmp2, Address(__ post(str2, 8)));
    __ eor(rscratch2, tmp1, tmp2);
    __ cbnz(rscratch2, CAL_DIFFERENCE);
    __ sub(cnt2, cnt2, isLL ? 8 : 4);

    __ bind(LESS8); // directly load last 8 bytes
    if (!isLL) {
      __ add(cnt2, cnt2, cnt2);
    }
    __ ldr(tmp1, Address(str1, cnt2));
    __ ldr(tmp2, Address(str2, cnt2));
    __ eor(rscratch2, tmp1, tmp2);
    __ cbz(rscratch2, LENGTH_DIFF);
    __ b(CAL_DIFFERENCE);

    __ bind(DIFF);
    __ cmp(tmp1, tmp2);
    __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
    __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
    // reuse rscratch2 register for the result of eor instruction
    __ eor(rscratch2, tmp1, tmp2);

    __ bind(CAL_DIFFERENCE);
    __ rev(rscratch2, rscratch2);
    __ clz(rscratch2, rscratch2);
    __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
    __ lsrv(tmp1, tmp1, rscratch2);
    __ lsrv(tmp2, tmp2, rscratch2);
    if (isLL) {
      __ uxtbw(tmp1, tmp1);
      __ uxtbw(tmp2, tmp2);
    } else {
      __ uxthw(tmp1, tmp1);
      __ uxthw(tmp2, tmp2);
    }
    __ subw(result, tmp1, tmp2);

    __ bind(LENGTH_DIFF);
    __ ret(lr);
    return entry;
  }
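
  // The rev/clz/andr sequence in CAL_DIFFERENCE above locates the first
  // differing character inside a 64-bit chunk. A scalar sketch of the same
  // computation (illustrative only; __builtin_ctzll is the GCC/Clang builtin,
  // assumed here just for exposition, little-endian data):
  //
  //   static int first_diff_byte(uint64_t a, uint64_t b) {
  //     uint64_t x = a ^ b;              // known non-zero at this point
  //     return __builtin_ctzll(x) >> 3;  // lowest set bit -> byte index
  //   }
  //
  // AArch64 has no ctz, so the stub byte-reverses the XOR with rev and uses
  // clz instead, then rounds the bit index down to an 8-bit (LL) or 16-bit
  // (UU) boundary with andr before shifting both characters into place.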
  enum string_compare_mode {
    LL,
    LU,
    UL,
    UU,
  };

  // The following registers are declared in aarch64.ad
  // r0 = result
  // r1 = str1
  // r2 = cnt1
  // r3 = str2
  // r4 = cnt2
  // r10 = tmp1
  // r11 = tmp2
  // z0 = ztmp1
  // z1 = ztmp2
  // p0 = pgtmp1
  // p1 = pgtmp2
  address generate_compare_long_string_sve(string_compare_mode mode) {
    __ align(CodeEntryAlignment);
    address entry = __ pc();
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11;

    Label LOOP, DONE, MISMATCH;
    Register vec_len = tmp1;
    Register idx = tmp2;
    // The minimum of the string lengths has been stored in cnt2.
    Register cnt = cnt2;
    FloatRegister ztmp1 = z0, ztmp2 = z1;
    PRegister pgtmp1 = p0, pgtmp2 = p1;

#define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
    switch (mode) {                                                            \
      case LL:                                                                 \
        __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
        __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
        break;                                                                 \
      case LU:                                                                 \
        __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
        __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
        break;                                                                 \
      case UL:                                                                 \
        __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
        __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
        break;                                                                 \
      case UU:                                                                 \
        __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
        __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
        break;                                                                 \
      default:                                                                 \
        ShouldNotReachHere();                                                  \
    }

    const char* stubname;
    switch (mode) {
      case LL: stubname = "compare_long_string_same_encoding LL";      break;
      case LU: stubname = "compare_long_string_different_encoding LU"; break;
      case UL: stubname = "compare_long_string_different_encoding UL"; break;
      case UU: stubname = "compare_long_string_same_encoding UU";      break;
      default: ShouldNotReachHere();
    }

    StubCodeMark mark(this, "StubRoutines", stubname);

    __ mov(idx, 0);
    __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);

    if (mode == LL) {
      __ sve_cntb(vec_len);
    } else {
      __ sve_cnth(vec_len);
    }

    __ sub(rscratch1, cnt, vec_len);

    __ bind(LOOP);

    // main loop
    LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
    __ add(idx, idx, vec_len);
    // Compare strings.
    __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
    __ br(__ NE, MISMATCH);
    __ cmp(idx, rscratch1);
    __ br(__ LT, LOOP);

    // post loop, last iteration
    __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);

    LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
    __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
    __ br(__ EQ, DONE);

    __ bind(MISMATCH);

    // Crop the vector to find its location.
    __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);

    // Extract the first different characters of each string.
    __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
    __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);

    // Compute the difference of the first different characters.
    __ sub(result, rscratch1, rscratch2);

    __ bind(DONE);
    __ ret(lr);
#undef LOAD_PAIR
    return entry;
  }

  void generate_compare_long_strings() {
    if (UseSVE == 0) {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_same_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_same_encoding(false);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_different_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_different_encoding(false);
    } else {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_sve(LL);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_sve(UU);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_sve(LU);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_sve(UL);
    }
  }

  // R0 = result
  // R1 = str2
  // R2 = cnt1
  // R3 = str1
  // R4 = cnt2
  // Clobbers: rscratch1, rscratch2, v0, v1, rflags
  //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  //    length >= 8) in order to skip the initial load (helps on systems with
  //    one load pipeline)
  // 2) we can use the "fast" algorithm for finding a single character, to
  //    search for the first symbol with fewer branches (one branch per loaded
  //    register instead of one per symbol); this is where constants like
  //    0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  //    (see the scalar sketch below)
  // 3) after loading and analyzing the 1st register of the source string, it
  //    can be reused to search for every occurrence of the 1st character,
  //    saving a few loads compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  //    reuses/reinitializes/compresses register values, which makes the code
  //    larger and a bit less readable; however, most of the extra operations
  //    are issued during loads or branches, so the penalty is minimal
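  //
  // Idea (2) in scalar form (an illustrative sketch, not the stub itself): to
  // test whether any byte of a word equals the first pattern character c, XOR
  // the word with c replicated into every byte; a zero byte then marks a
  // match, and zero bytes are detected with the classic SWAR trick:
  //
  //   // Non-zero iff some byte of x is zero.
  //   static uint64_t zero_byte_mask(uint64_t x) {
  //     return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
  //   }
  //   // Non-zero iff some byte of word equals c.
  //   static uint64_t has_byte(uint64_t word, uint8_t c) {
  //     return zero_byte_mask(word ^ (0x0101010101010101ULL * c));
  //   }
  //
  // The generated code uses the equivalent (x - 0x01..01) & ~(x | 0x7f..7f)
  // form via orr/bics, and the 0x0001...0001/0x7fff...7fff constants are the
  // same masks widened to 16-bit characters.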
  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
    const char* stubName = str1_isL
        ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
        : "indexof_linear_uu";
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stubName);
    address entry = __ pc();

    int str1_chr_size = str1_isL ? 1 : 2;
    int str2_chr_size = str2_isL ? 1 : 2;
    int str1_chr_shift = str1_isL ? 0 : 1;
    int str2_chr_shift = str2_isL ? 0 : 1;
    bool isL = str1_isL && str2_isL;
    // parameters
    Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
    // temporary registers
    Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
    RegSet spilled_regs = RegSet::range(tmp1, tmp4);
    // redefinitions
    Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;

    __ push(spilled_regs, sp);
    Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
        L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
        L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
        L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
        L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
        L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1. It is safe, because length >= 8 here
    __ ldr(ch1, Address(str1));
    // Read a whole register from str2. It is safe, because length >= 8 here
    __ ldr(ch2, Address(str2));
    __ sub(cnt2, cnt2, cnt1);
    __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
    if (str1_isL != str2_isL) {
      __ eor(v0, __ T16B, v0, v0);
    }
    __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
    __ mul(first, first, tmp1);
    // check if we have less than 1 register to check
    __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
    if (str1_isL != str2_isL) {
      __ fmovd(v1, ch1);
    }
    __ br(__ LE, L_SMALL);
    __ eor(ch2, first, ch2);
    if (str1_isL != str2_isL) {
      __ zip1(v1, __ T16B, v1, v0);
    }
    __ sub(tmp2, ch2, tmp1);
    __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
    __ bics(tmp2, tmp2, ch2);
    if (str1_isL != str2_isL) {
      __ fmovd(ch1, v1);
    }
    __ br(__ NE, L_HAS_ZERO);
    __ subs(cnt2, cnt2, wordSize/str2_chr_size);
    __ add(result, result, wordSize/str2_chr_size);
    __ add(str2, str2, wordSize);
    __ br(__ LT, L_POST_LOOP);
    __ BIND(L_LOOP);
    __ ldr(ch2, Address(str2));
    __ eor(ch2, first, ch2);
    __ sub(tmp2, ch2, tmp1);
    __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
    __ bics(tmp2, tmp2, ch2);
    __ br(__ NE, L_HAS_ZERO);
    __ BIND(L_LOOP_PROCEED);
    __ subs(cnt2, cnt2, wordSize/str2_chr_size);
    __ add(str2, str2, wordSize);
    __ add(result, result, wordSize/str2_chr_size);
    __ br(__ GE, L_LOOP);
    __ BIND(L_POST_LOOP);
    __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
    __ br(__ LE, NOMATCH);
    __ ldr(ch2, Address(str2));
    __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
    __ eor(ch2, first, ch2);
    __ sub(tmp2, ch2, tmp1);
    __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
    __ mov(tmp4, -1); // all bits set
    __ b(L_SMALL_PROCEED);
    __ align(OptoLoopAlignment);
    __ BIND(L_SMALL);
    __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
    __ eor(ch2, first, ch2);
    if (str1_isL != str2_isL) {
      __ zip1(v1, __ T16B, v1, v0);
    }
    __ sub(tmp2, ch2, tmp1);
    __ mov(tmp4, -1); // all bits set
    __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
    if (str1_isL != str2_isL) {
      __ fmovd(ch1, v1); // move converted 4 symbols
    }
    __ BIND(L_SMALL_PROCEED);
    __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
    __ bic(tmp2, tmp2, ch2);
    __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
    __ rbit(tmp2, tmp2);
    __ br(__ EQ, NOMATCH);
    __ BIND(L_SMALL_HAS_ZERO_LOOP);
    __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
    __ cmp(cnt1, u1(wordSize/str2_chr_size));
    __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
    if (str2_isL) { // LL
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
      __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
      __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
    } else {
      __ mov(ch2, 0xE); // all bits in byte set except last one
      __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
      __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4);
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
    }
    __ cmp(ch1, ch2);
    __ mov(tmp4, wordSize/str2_chr_size);
    __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
    __ BIND(L_SMALL_CMP_LOOP);
    str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
             : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
    str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
             : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
    __ add(tmp4, tmp4, 1);
    __ cmp(tmp4, cnt1);
    __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
    __ cmp(first, ch2);
    __ br(__ EQ, L_SMALL_CMP_LOOP);
    __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
    __ cbz(tmp2, NOMATCH); // no more matches. exit
    __ clz(tmp4, tmp2);
    __ add(result, result, 1); // advance index
    __ add(str2, str2, str2_chr_size); // advance pointer
    __ b(L_SMALL_HAS_ZERO_LOOP);
    __ align(OptoLoopAlignment);
    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
    __ cmp(first, ch2);
    __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
    __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
    if (str2_isL) { // LL
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
      __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
      __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
    } else {
      __ mov(ch2, 0xE); // all bits in byte set except last one
      __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
      __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4);
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
    }
    __ cmp(ch1, ch2);
    __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
    __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_HAS_ZERO);
    __ rbit(tmp2, tmp2);
    __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
    // Now, compress the counters (cnt2 and cnt1) into one register. This is
    // fine because both counters are 32-bit and are not changed in this loop.
    // Just restore them on exit, so cnt1 can be reused in this loop.
    __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
    __ sub(result, result, 1);
    __ BIND(L_HAS_ZERO_LOOP);
    __ mov(cnt1, wordSize/str2_chr_size);
    __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
    __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
    if (str2_isL) {
      __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
      __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4);
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ add(tmp4, tmp4, 1);
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ lsl(tmp2, tmp2, 1);
      __ mov(tmp4, wordSize/str2_chr_size);
    } else {
      __ mov(ch2, 0xE);
      __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
      __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4);
      __ add(tmp4, tmp4, 1);
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
      __ lsl(tmp2, tmp2, 1);
      __ mov(tmp4, wordSize/str2_chr_size);
      __ sub(str2, str2, str2_chr_size);
    }
    __ cmp(ch1, ch2);
    __ mov(tmp4, wordSize/str2_chr_size);
    __ br(__ NE, L_CMP_LOOP_NOMATCH);
    __ BIND(L_CMP_LOOP);
    str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
             : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
    str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
             : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
    __ add(tmp4, tmp4, 1);
    __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
    __ br(__ GE, L_CMP_LOOP_LAST_CMP);
    __ cmp(cnt1, ch2);
    __ br(__ EQ, L_CMP_LOOP);
    __ BIND(L_CMP_LOOP_NOMATCH);
    // here we're not matched
    __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
    __ clz(tmp4, tmp2);
    __ add(str2, str2, str2_chr_size); // advance pointer
    __ b(L_HAS_ZERO_LOOP);
    __ align(OptoLoopAlignment);
    __ BIND(L_CMP_LOOP_LAST_CMP);
    __ cmp(cnt1, ch2);
    __ br(__ NE, L_CMP_LOOP_NOMATCH);
    __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_CMP_LOOP_LAST_CMP2);
    if (str2_isL) {
      __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
      __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4);
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ add(tmp4, tmp4, 1);
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ lsl(tmp2, tmp2, 1);
    } else {
      __ mov(ch2, 0xE);
      __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
      __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
      __ lslv(tmp2, tmp2, tmp4);
      __ add(tmp4, tmp4, 1);
      __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
      __ lsl(tmp2, tmp2, 1);
      __ sub(str2, str2, str2_chr_size);
    }
    __ cmp(ch1, ch2);
    __ br(__ NE, L_CMP_LOOP_NOMATCH);
    __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
    // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
    //    until the L_HAS_ZERO block. The byte octet was analyzed in
    //    L_HAS_ZERO_LOOP, so result was increased by at most
    //    wordSize/str2_chr_size - 1, and the respective high bit wasn't
    //    changed. L_LOOP_PROCEED will increase result by the number of
    //    analyzed characters, so we can just reset the lower bits of result
    //    here. Clear the 2 lower bits for UU/UL and 3 bits for LL.
    // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
    // 3) Advance str2 to the next str2 octet. result & 7/3 is the index of the
    //    last analyzed substring inside the current octet, so str2 is at the
    //    respective start address; we need to advance it to the next octet.
    __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
    __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
    __ bfm(result, zr, 0, 2 - str2_chr_shift);
    __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
    __ movw(cnt2, cnt2);
    __ b(L_LOOP_PROCEED);
    __ align(OptoLoopAlignment);
    __ BIND(NOMATCH);
    __ mov(result, -1);
    __ BIND(DONE);
    __ pop(spilled_regs, sp);
    __ ret(lr);
    return entry;
  }

  void generate_string_indexof_stubs() {
    StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
    StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
    StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
  }

  void inflate_and_store_2_fp_registers(bool generatePrfm,
                                        FloatRegister src1, FloatRegister src2) {
    Register dst = r1;
    __ zip1(v1, __ T16B, src1, v0);
    __ zip2(v2, __ T16B, src1, v0);
    if (generatePrfm) {
      __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
    }
    __ zip1(v3, __ T16B, src2, v0);
    __ zip2(v4, __ T16B, src2, v0);
    __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
  }
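
  // In the helper above, zip1/zip2 with the zero vector (v0) split each
  // 16-byte vector of Latin1 bytes into the low and high halves of the
  // inflated char data. A scalar sketch of the overall effect (illustrative
  // only): zip1 produces dst[0..7], zip2 produces dst[8..15]:
  //
  //   static void inflate16(const uint8_t* src, uint16_t* dst) {
  //     for (int i = 0; i < 16; i++) {
  //       dst[i] = src[i];
  //     }
  //   }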
  // R0 = src
  // R1 = dst
  // R2 = len
  // R3 = len >> 3
  // V0 = 0
  // v1 = loaded 8 bytes
  // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
  address generate_large_byte_array_inflate() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
    address entry = __ pc();
    Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
    Register src = r0, dst = r1, len = r2, octetCounter = r3;
    const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;

    // do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction
    __ ldrd(v2, __ post(src, 8));
    __ sub(octetCounter, octetCounter, 2);
    __ zip1(v1, __ T16B, v1, v0);
    __ zip1(v2, __ T16B, v2, v0);
    __ st1(v1, v2, __ T16B, __ post(dst, 32));
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ subs(rscratch1, octetCounter, large_loop_threshold);
    __ br(__ LE, LOOP_START);
    __ b(LOOP_PRFM_START);
    __ bind(LOOP_PRFM);
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ bind(LOOP_PRFM_START);
    __ prfm(Address(src, SoftwarePrefetchHintDistance));
    __ sub(octetCounter, octetCounter, 8);
    __ subs(rscratch1, octetCounter, large_loop_threshold);
    inflate_and_store_2_fp_registers(true, v3, v4);
    inflate_and_store_2_fp_registers(true, v5, v6);
    __ br(__ GT, LOOP_PRFM);
    __ cmp(octetCounter, (u1)8);
    __ br(__ LT, DONE);
    __ bind(LOOP);
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ bind(LOOP_START);
    __ sub(octetCounter, octetCounter, 8);
    __ cmp(octetCounter, (u1)8);
    inflate_and_store_2_fp_registers(false, v3, v4);
    inflate_and_store_2_fp_registers(false, v5, v6);
    __ br(__ GE, LOOP);
    __ bind(DONE);
    __ ret(lr);
    return entry;
  }
  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0 - current state address
   *   c_rarg1 - H key address
   *   c_rarg2 - data address
   *   c_rarg3 - number of blocks
   *
   * Output:
   *   Updated state at c_rarg0
   */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order. For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian. On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order through the
    // calculation, bit-reversing the inputs and outputs.

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87); // The low-order bits of the field
                         // polynomial (i.e. p = z^7+z^2+z+1)
                         // repeated in the low and high parts of a
                         // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    __ ldrq(v24, p); // The field polynomial

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
    __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                        /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
                        /*temps*/v6, v3, /*reuse/clobber b*/v2);
      // Reduce v7:v5 by the field polynomial
      __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);

      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0
    __ rev64(v0, __ T16B, v0);
    __ rbit(v0, __ T16B, v0);

    __ st1(v0, __ T16B, state);
    __ ret(lr);

    return start;
  }

  address generate_ghash_processBlocks_wide() {
    address small = generate_ghash_processBlocks();

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87); // The low-order bits of the field
                         // polynomial (i.e. p = z^7+z^2+z+1)
                         // repeated in the low and high parts of a
                         // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    const int unroll = 4;

    __ cmp(blocks, (unsigned char)(unroll * 2));
    __ br(__ LT, small);

    if (unroll > 1) {
      // Save state before entering routine
      __ sub(sp, sp, 4 * 16);
      __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
      __ sub(sp, sp, 4 * 16);
      __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
    }

    __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);

    if (unroll > 1) {
      // And restore state
      __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
      __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
    }

    __ cmp(blocks, (unsigned char)0);
    __ br(__ GT, small);

    __ ret(lr);

    return start;
  }
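
  // The per-byte bit reversal that RBIT performs on each vector byte, in
  // scalar form (illustrative only):
  //
  //   static uint8_t bit_reverse(uint8_t b) {
  //     b = (uint8_t)(((b & 0x55) << 1) | ((b >> 1) & 0x55)); // swap adjacent bits
  //     b = (uint8_t)(((b & 0x33) << 2) | ((b >> 2) & 0x33)); // swap bit pairs
  //     return (uint8_t)((b << 4) | (b >> 4));                // swap nibbles
  //   }
  //
  // With inputs and outputs bit-reflected this way, the carry-less multiply
  // and the reduction by p = z^7 + z^2 + z + 1 (the 0x87 constant above) can
  // be carried out in ordinary little-endian bit order.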
  void generate_base64_encode_simdround(Register src, Register dst,
                                        FloatRegister codec, u8 size) {

    FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
    FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
    FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;

    Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

    __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));

    __ ushr(ind0, arrangement, in0, 2);

    __ ushr(ind1, arrangement, in1, 2);
    __ shl(in0, arrangement, in0, 6);
    __ orr(ind1, arrangement, ind1, in0);
    __ ushr(ind1, arrangement, ind1, 2);

    __ ushr(ind2, arrangement, in2, 4);
    __ shl(in1, arrangement, in1, 4);
    __ orr(ind2, arrangement, in1, ind2);
    __ ushr(ind2, arrangement, ind2, 2);

    __ shl(ind3, arrangement, in2, 2);
    __ ushr(ind3, arrangement, ind3, 2);

    __ tbl(out0, arrangement, codec, 4, ind0);
    __ tbl(out1, arrangement, codec, 4, ind1);
    __ tbl(out2, arrangement, codec, 4, ind2);
    __ tbl(out3, arrangement, codec, 4, ind3);

    __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
  }

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0 - src_start
   *   c_rarg1 - src_offset
   *   c_rarg2 - src_length
   *   c_rarg3 - dest_start
   *   c_rarg4 - dest_offset
   *   c_rarg5 - isURL
   *
   */
  address generate_base64_encodeBlock() {

    static const char toBase64[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
    };

    static const char toBase64URL[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "encodeBlock");
    address start = __ pc();

    Register src    = c_rarg0;  // source array
    Register soff   = c_rarg1;  // source start offset
    Register send   = c_rarg2;  // source end offset
    Register dst    = c_rarg3;  // dest array
    Register doff   = c_rarg4;  // position for writing to dest array
    Register isURL  = c_rarg5;  // Base64 or URL character set

    // c_rarg6 and c_rarg7 are free to use as temps
    Register codec  = c_rarg6;
    Register length = c_rarg7;

    Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;

    __ add(src, src, soff);
    __ add(dst, dst, doff);
    __ sub(length, send, soff);

    // load the codec base address
    __ lea(codec, ExternalAddress((address) toBase64));
    __ cbz(isURL, ProcessData);
    __ lea(codec, ExternalAddress((address) toBase64URL));

    __ BIND(ProcessData);

    // too short to form a SIMD loop; fall back to the scalar tail
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, Process3B);

    __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));

    __ BIND(Process48B);
    __ cmp(length, (u1)48);
    __ br(Assembler::LT, Process24B);
    generate_base64_encode_simdround(src, dst, v0, 16);
    __ sub(length, length, 48);
    __ b(Process48B);

    __ BIND(Process24B);
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, SIMDExit);
    generate_base64_encode_simdround(src, dst, v0, 8);
    __ sub(length, length, 24);

    __ BIND(SIMDExit);
    __ cbz(length, Exit);
    __ BIND(Process3B);
    // 3 src bytes, 24 bits
    __ ldrb(r10, __ post(src, 1));
    __ ldrb(r11, __ post(src, 1));
    __ ldrb(r12, __ post(src, 1));
    __ orrw(r11, r11, r10, Assembler::LSL, 8);
    __ orrw(r12, r12, r11, Assembler::LSL, 8);
    // codec index
    __ ubfmw(r15, r12, 18, 23);
    __ ubfmw(r14, r12, 12, 17);
    __ ubfmw(r13, r12, 6, 11);
    __ andw(r12, r12, 63);
    // get the code based on the codec
    __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
    __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
    __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
    __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
    __ strb(r15, __ post(dst, 1));
    __ strb(r14, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ sub(length, length, 3);
    __ cbnz(length, Process3B);

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }
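
  // The Process3B tail above, as plain C++ (illustrative only): three input
  // bytes form a 24-bit group that is split into four 6-bit indices into the
  // codec table; ubfmw extracts exactly the bit ranges [18,23], [12,17] and
  // [6,11], and andw takes the low six bits.
  //
  //   static void encode3(const uint8_t* src, uint8_t* dst, const char* codec) {
  //     uint32_t bits = ((uint32_t)src[0] << 16) | ((uint32_t)src[1] << 8) | src[2];
  //     dst[0] = codec[(bits >> 18) & 63];
  //     dst[1] = codec[(bits >> 12) & 63];
  //     dst[2] = codec[(bits >> 6) & 63];
  //     dst[3] = codec[bits & 63];
  //   }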
  void generate_base64_decode_simdround(Register src, Register dst,
                                        FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {

    FloatRegister in0  = v16, in1  = v17, in2  = v18, in3  = v19;
    FloatRegister out0 = v20, out1 = v21, out2 = v22;

    FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
    FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;

    Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;

    Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

    __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));

    // we need an unsigned saturating subtract, to make sure all input values
    // in range [0, 63] will have a 0U value in the higher-half lookup
    __ uqsubv(decH0, __ T16B, in0, v27);
    __ uqsubv(decH1, __ T16B, in1, v27);
    __ uqsubv(decH2, __ T16B, in2, v27);
    __ uqsubv(decH3, __ T16B, in3, v27);

    // lower half lookup
    __ tbl(decL0, arrangement, codecL, 4, in0);
    __ tbl(decL1, arrangement, codecL, 4, in1);
    __ tbl(decL2, arrangement, codecL, 4, in2);
    __ tbl(decL3, arrangement, codecL, 4, in3);

    // higher half lookup
    __ tbx(decH0, arrangement, codecH, 4, decH0);
    __ tbx(decH1, arrangement, codecH, 4, decH1);
    __ tbx(decH2, arrangement, codecH, 4, decH2);
    __ tbx(decH3, arrangement, codecH, 4, decH3);

    // combine lower and higher
    __ orr(decL0, arrangement, decL0, decH0);
    __ orr(decL1, arrangement, decL1, decH1);
    __ orr(decL2, arrangement, decL2, decH2);
    __ orr(decL3, arrangement, decL3, decH3);

    // check illegal inputs, value larger than 63 (maximum of 6 bits)
    __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
    __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
    __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
    __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
    __ orr(in0, arrangement, decH0, decH1);
    __ orr(in1, arrangement, decH2, decH3);
    __ orr(in2, arrangement, in0, in1);
    __ umaxv(in3, arrangement, in2);
    __ umov(rscratch2, in3, __ B, 0);

    // get the data to output
    __ shl(out0, arrangement, decL0, 2);
    __ ushr(out1, arrangement, decL1, 4);
    __ orr(out0, arrangement, out0, out1);
    __ shl(out1, arrangement, decL1, 4);
    __ ushr(out2, arrangement, decL2, 2);
    __ orr(out1, arrangement, out1, out2);
    __ shl(out2, arrangement, decL2, 6);
    __ orr(out2, arrangement, out2, decL3);

    __ cbz(rscratch2, NoIllegalData);

    // handle illegal input
    __ umov(r10, in2, __ D, 0);
    if (size == 16) {
      __ cbnz(r10, ErrorInLowerHalf);

      // the illegal input is in the higher half, store the lower half now.
      __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));

      __ umov(r10, in2, __ D, 1);
      __ umov(r11, out0, __ D, 1);
      __ umov(r12, out1, __ D, 1);
      __ umov(r13, out2, __ D, 1);
      __ b(StoreLegalData);

      __ BIND(ErrorInLowerHalf);
    }
    __ umov(r11, out0, __ D, 0);
    __ umov(r12, out1, __ D, 0);
    __ umov(r13, out2, __ D, 0);

    __ BIND(StoreLegalData);
    __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
    __ strb(r11, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ lsr(r10, r10, 8);
    __ lsr(r11, r11, 8);
    __ lsr(r12, r12, 8);
    __ lsr(r13, r13, 8);
    __ b(StoreLegalData);

    __ BIND(NoIllegalData);
    __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
  }


  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0 - src_start
   *   c_rarg1 - src_offset
   *   c_rarg2 - src_length
   *   c_rarg3 - dest_start
   *   c_rarg4 - dest_offset
   *   c_rarg5 - isURL
   *   c_rarg6 - isMIME
   *
   */
  address generate_base64_decodeBlock() {

    // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
    // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the
    // section titled "Base64 decoding".

    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic. That is, java.util.Base64.fromBase64['='] = -2,
    // while fromBase(URL)64ForNoSIMD['='] = 255 here.
    static const uint8_t fromBase64ForNoSIMD[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
      52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
      15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
      255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
      41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    static const uint8_t fromBase64URLForNoSIMD[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
      52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
      15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
      255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
      41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    // A legal value of a base64 code is in the range [0, 127]. We need two
    // lookups with tbl/tbx and combine them to get the decoded data. The 1st
    // table vector lookup uses tbl; out-of-range indices are set to 0 in the
    // destination. The 2nd table vector lookup uses tbx; out-of-range indices
    // are unchanged in the destination. Input [64..126] is mapped to index
    // [65, 127] in the second lookup. The value at index 64 is set to 0, so
    // that we know we already got the decoded data with the 1st lookup.
    static const uint8_t fromBase64ForSIMD[128] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
      52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
      0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
      14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
      255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
      40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
    };

    static const uint8_t fromBase64URLForSIMD[128] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
      52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
      0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
      14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
      63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
      40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "decodeBlock");
    address start = __ pc();

    Register src    = c_rarg0;  // source array
    Register soff   = c_rarg1;  // source start offset
    Register send   = c_rarg2;  // source end offset
    Register dst    = c_rarg3;  // dest array
    Register doff   = c_rarg4;  // position for writing to dest array
    Register isURL  = c_rarg5;  // Base64 or URL character set
    Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation

    Register length = send;     // reuse send as length of source data to process

    Register simd_codec   = c_rarg6;
    Register nosimd_codec = c_rarg7;

    Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;

    __ enter();

    __ add(src, src, soff);
    __ add(dst, dst, doff);

    __ mov(doff, dst);

    __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1);

    __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
    __ cbz(isURL, ProcessData);
    __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));

    __ BIND(ProcessData);
    __ mov(rscratch1, length);
    __ cmp(length, (u1)144); // 144 = 80 + 64
    __ br(Assembler::LT, Process4B);

    // In the MIME case, the line length cannot be more than 76
    // bytes (see RFC 2045). This is too short a block for SIMD
    // to be worthwhile, so we use non-SIMD here.
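    // In C, the length dispatch around this point is approximately
    // (scalar_decode/simd_decode are illustrative names, not real helpers):
    //
    //   if (length < 144) { scalar_decode(length); return; }
    //   scalar_decode(80);         // rscratch1 = 79: exactly 20 rounds of 4 bytes
    //   simd_decode(length - 80);  // 64B/32B rounds; any tail re-enters the
    //                              // scalar loop via SIMDExit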
6690 __ movw(rscratch1, 79); 6691 6692 __ BIND(Process4B); 6693 __ ldrw(r14, __ post(src, 4)); 6694 __ ubfxw(r10, r14, 0, 8); 6695 __ ubfxw(r11, r14, 8, 8); 6696 __ ubfxw(r12, r14, 16, 8); 6697 __ ubfxw(r13, r14, 24, 8); 6698 // get the de-code 6699 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 6700 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 6701 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 6702 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 6703 // error detection, 255u indicates an illegal input 6704 __ orrw(r14, r10, r11); 6705 __ orrw(r15, r12, r13); 6706 __ orrw(r14, r14, r15); 6707 __ tbnz(r14, 7, Exit); 6708 // recover the data 6709 __ lslw(r14, r10, 10); 6710 __ bfiw(r14, r11, 4, 6); 6711 __ bfmw(r14, r12, 2, 5); 6712 __ rev16w(r14, r14); 6713 __ bfiw(r13, r12, 6, 2); 6714 __ strh(r14, __ post(dst, 2)); 6715 __ strb(r13, __ post(dst, 1)); 6716 // non-simd loop 6717 __ subsw(rscratch1, rscratch1, 4); 6718 __ br(Assembler::GT, Process4B); 6719 6720 // if exiting from PreProcess80B, rscratch1 == -1; 6721 // otherwise, rscratch1 == 0. 6722 __ cbzw(rscratch1, Exit); 6723 __ sub(length, length, 80); 6724 6725 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 6726 __ cbz(isURL, SIMDEnter); 6727 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 6728 6729 __ BIND(SIMDEnter); 6730 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 6731 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 6732 __ mov(rscratch1, 63); 6733 __ dup(v27, __ T16B, rscratch1); 6734 6735 __ BIND(Process64B); 6736 __ cmp(length, (u1)64); 6737 __ br(Assembler::LT, Process32B); 6738 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 6739 __ sub(length, length, 64); 6740 __ b(Process64B); 6741 6742 __ BIND(Process32B); 6743 __ cmp(length, (u1)32); 6744 __ br(Assembler::LT, SIMDExit); 6745 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 6746 __ sub(length, length, 32); 6747 __ b(Process32B); 6748 6749 __ BIND(SIMDExit); 6750 __ cbz(length, Exit); 6751 __ movw(rscratch1, length); 6752 __ b(Process4B); 6753 6754 __ BIND(Exit); 6755 __ sub(c_rarg0, dst, doff); 6756 6757 __ leave(); 6758 __ ret(lr); 6759 6760 return start; 6761 } 6762 6763 // Support for spin waits. 6764 address generate_spin_wait() { 6765 __ align(CodeEntryAlignment); 6766 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 6767 address start = __ pc(); 6768 6769 __ spin_wait(); 6770 __ ret(lr); 6771 6772 return start; 6773 } 6774 6775 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 6776 6777 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 6778 // 6779 // If LSE is in use, generate LSE versions of all the stubs. The 6780 // non-LSE versions are in atomic_aarch64.S. 6781 6782 // class AtomicStubMark records the entry point of a stub and the 6783 // stub pointer which will point to it. The stub pointer is set to 6784 // the entry point when ~AtomicStubMark() is called, which must be 6785 // after ICache::invalidate_range. This ensures safe publication of 6786 // the generated code. 
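  // A sketch of the intended use, following generate_atomic_entry_points()
  // below:
  //
  //   address first_entry = __ pc();
  //   AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
  //   gen_ldadd_entry(Assembler::word, memory_order_conservative);
  //   ...
  //   ICache::invalidate_range(first_entry, __ pc() - first_entry);
  //   // The AtomicStubMark destructors run after the invalidate, so other
  //   // threads never observe a stub pointer to un-flushed code.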
6787 class AtomicStubMark { 6788 address _entry_point; 6789 aarch64_atomic_stub_t *_stub; 6790 MacroAssembler *_masm; 6791 public: 6792 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 6793 _masm = masm; 6794 __ align(32); 6795 _entry_point = __ pc(); 6796 _stub = stub; 6797 } 6798 ~AtomicStubMark() { 6799 *_stub = (aarch64_atomic_stub_t)_entry_point; 6800 } 6801 }; 6802 6803 // NB: For memory_order_conservative we need a trailing membar after 6804 // LSE atomic operations but not a leading membar. 6805 // 6806 // We don't need a leading membar because a clause in the Arm ARM 6807 // says: 6808 // 6809 // Barrier-ordered-before 6810 // 6811 // Barrier instructions order prior Memory effects before subsequent 6812 // Memory effects generated by the same Observer. A read or a write 6813 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 6814 // Observer if and only if RW1 appears in program order before RW 2 6815 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 6816 // instruction with both Acquire and Release semantics. 6817 // 6818 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 6819 // and Release semantics, therefore we don't need a leading 6820 // barrier. However, there is no corresponding Barrier-ordered-after 6821 // relationship, therefore we need a trailing membar to prevent a 6822 // later store or load from being reordered with the store in an 6823 // atomic instruction. 6824 // 6825 // This was checked by using the herd7 consistency model simulator 6826 // (http://diy.inria.fr/) with this test case: 6827 // 6828 // AArch64 LseCas 6829 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 6830 // P0 | P1; 6831 // LDR W4, [X2] | MOV W3, #0; 6832 // DMB LD | MOV W4, #1; 6833 // LDR W3, [X1] | CASAL W3, W4, [X1]; 6834 // | DMB ISH; 6835 // | STR W4, [X2]; 6836 // exists 6837 // (0:X3=0 /\ 0:X4=1) 6838 // 6839 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 6840 // with the store to x in P1. Without the DMB in P1 this may happen. 6841 // 6842 // At the time of writing we don't know of any AArch64 hardware that 6843 // reorders stores in this way, but the Reference Manual permits it. 6844 6845 void gen_cas_entry(Assembler::operand_size size, 6846 atomic_memory_order order) { 6847 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 6848 exchange_val = c_rarg2; 6849 bool acquire, release; 6850 switch (order) { 6851 case memory_order_relaxed: 6852 acquire = false; 6853 release = false; 6854 break; 6855 case memory_order_release: 6856 acquire = false; 6857 release = true; 6858 break; 6859 default: 6860 acquire = true; 6861 release = true; 6862 break; 6863 } 6864 __ mov(prev, compare_val); 6865 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 6866 if (order == memory_order_conservative) { 6867 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6868 } 6869 if (size == Assembler::xword) { 6870 __ mov(r0, prev); 6871 } else { 6872 __ movw(r0, prev); 6873 } 6874 __ ret(lr); 6875 } 6876 6877 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 6878 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6879 // If not relaxed, then default to conservative. Relaxed is the only 6880 // case we use enough to be worth specializing. 
6881 if (order == memory_order_relaxed) { 6882 __ ldadd(size, incr, prev, addr); 6883 } else { 6884 __ ldaddal(size, incr, prev, addr); 6885 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6886 } 6887 if (size == Assembler::xword) { 6888 __ mov(r0, prev); 6889 } else { 6890 __ movw(r0, prev); 6891 } 6892 __ ret(lr); 6893 } 6894 6895 void gen_swpal_entry(Assembler::operand_size size) { 6896 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6897 __ swpal(size, incr, prev, addr); 6898 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6899 if (size == Assembler::xword) { 6900 __ mov(r0, prev); 6901 } else { 6902 __ movw(r0, prev); 6903 } 6904 __ ret(lr); 6905 } 6906 6907 void generate_atomic_entry_points() { 6908 if (! UseLSE) { 6909 return; 6910 } 6911 6912 __ align(CodeEntryAlignment); 6913 StubCodeMark mark(this, "StubRoutines", "atomic entry points"); 6914 address first_entry = __ pc(); 6915 6916 // ADD, memory_order_conservative 6917 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 6918 gen_ldadd_entry(Assembler::word, memory_order_conservative); 6919 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 6920 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 6921 6922 // ADD, memory_order_relaxed 6923 AtomicStubMark mark_fetch_add_4_relaxed 6924 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 6925 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 6926 AtomicStubMark mark_fetch_add_8_relaxed 6927 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 6928 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 6929 6930 // XCHG, memory_order_conservative 6931 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 6932 gen_swpal_entry(Assembler::word); 6933 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 6934 gen_swpal_entry(Assembler::xword); 6935 6936 // CAS, memory_order_conservative 6937 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 6938 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 6939 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 6940 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 6941 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 6942 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 6943 6944 // CAS, memory_order_relaxed 6945 AtomicStubMark mark_cmpxchg_1_relaxed 6946 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 6947 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 6948 AtomicStubMark mark_cmpxchg_4_relaxed 6949 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 6950 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 6951 AtomicStubMark mark_cmpxchg_8_relaxed 6952 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 6953 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 6954 6955 AtomicStubMark mark_cmpxchg_4_release 6956 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 6957 gen_cas_entry(MacroAssembler::word, memory_order_release); 6958 AtomicStubMark mark_cmpxchg_8_release 6959 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 6960 gen_cas_entry(MacroAssembler::xword, memory_order_release); 6961 6962 AtomicStubMark mark_cmpxchg_4_seq_cst 6963 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 6964 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 6965 AtomicStubMark mark_cmpxchg_8_seq_cst 6966 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 6967 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 6968 
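    // The entry points generated above are reached through the
    // aarch64_atomic_*_impl function pointers; assuming the
    // aarch64_atomic_stub_t typedef in atomic_aarch64.hpp, a call site
    // looks roughly like:
    //
    //   // uint64_t prev = aarch64_atomic_cmpxchg_4_impl(dest, compare_value, exchange_value);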
6969 ICache::invalidate_range(first_entry, __ pc() - first_entry); 6970 } 6971 #endif // LINUX 6972 6973 address generate_cont_thaw(Continuation::thaw_kind kind) { 6974 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 6975 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 6976 6977 address start = __ pc(); 6978 6979 if (return_barrier) { 6980 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 6981 __ mov(sp, rscratch1); 6982 } 6983 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 6984 6985 if (return_barrier) { 6986 // preserve possible return value from a method returning to the return barrier 6987 __ fmovd(rscratch1, v0); 6988 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 6989 } 6990 6991 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 6992 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 6993 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 6994 6995 if (return_barrier) { 6996 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 6997 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 6998 __ fmovd(v0, rscratch1); 6999 } 7000 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7001 7002 7003 Label thaw_success; 7004 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 7005 __ cbnz(rscratch2, thaw_success); 7006 __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry())); 7007 __ br(rscratch1); 7008 __ bind(thaw_success); 7009 7010 // make room for the thawed frames 7011 __ sub(rscratch1, sp, rscratch2); 7012 __ andr(rscratch1, rscratch1, -16); // align 7013 __ mov(sp, rscratch1); 7014 7015 if (return_barrier) { 7016 // save original return value -- again 7017 __ fmovd(rscratch1, v0); 7018 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7019 } 7020 7021 // If we want, we can templatize thaw by kind, and have three different entries 7022 __ movw(c_rarg1, (uint32_t)kind); 7023 7024 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 7025 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 7026 7027 if (return_barrier) { 7028 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7029 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7030 __ fmovd(v0, rscratch1); 7031 } else { 7032 __ mov(r0, zr); // return 0 (success) from doYield 7033 } 7034 7035 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 7036 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 7037 __ mov(rfp, sp); 7038 7039 if (return_barrier_exception) { 7040 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 7041 __ authenticate_return_address(c_rarg1); 7042 __ verify_oop(r0); 7043 // save return value containing the exception oop in callee-saved R19 7044 __ mov(r19, r0); 7045 7046 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 7047 7048 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
7049 // __ reinitialize_ptrue(); 7050 7051 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 7052 7053 __ mov(r1, r0); // the exception handler 7054 __ mov(r0, r19); // restore return value containing the exception oop 7055 __ verify_oop(r0); 7056 7057 __ leave(); 7058 __ mov(r3, lr); 7059 __ br(r1); // the exception handler 7060 } else { 7061 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 7062 __ leave(); 7063 __ ret(lr); 7064 } 7065 7066 return start; 7067 } 7068 7069 address generate_cont_thaw() { 7070 if (!Continuations::enabled()) return nullptr; 7071 7072 StubCodeMark mark(this, "StubRoutines", "Cont thaw"); 7073 address start = __ pc(); 7074 generate_cont_thaw(Continuation::thaw_top); 7075 return start; 7076 } 7077 7078 address generate_cont_returnBarrier() { 7079 if (!Continuations::enabled()) return nullptr; 7080 7081 // TODO: will probably need multiple return barriers depending on return type 7082 StubCodeMark mark(this, "StubRoutines", "cont return barrier"); 7083 address start = __ pc(); 7084 7085 generate_cont_thaw(Continuation::thaw_return_barrier); 7086 7087 return start; 7088 } 7089 7090 address generate_cont_returnBarrier_exception() { 7091 if (!Continuations::enabled()) return nullptr; 7092 7093 StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler"); 7094 address start = __ pc(); 7095 7096 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 7097 7098 return start; 7099 } 7100 7101 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 7102 // are represented as long[5], with BITS_PER_LIMB = 26. 7103 // Pack five 26-bit limbs into three 64-bit registers. 7104 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 7105 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 7106 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 7107 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 7108 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 7109 7110 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 7111 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 7112 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 7113 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 7114 7115 if (dest2->is_valid()) { 7116 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 7117 } else { 7118 #ifdef ASSERT 7119 Label OK; 7120 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 7121 __ br(__ EQ, OK); 7122 __ stop("high bits of Poly1305 integer should be zero"); 7123 __ should_not_reach_here(); 7124 __ bind(OK); 7125 #endif 7126 } 7127 } 7128 7129 // As above, but return only a 128-bit integer, packed into two 7130 // 64-bit registers. 7131 void pack_26(Register dest0, Register dest1, Register src) { 7132 pack_26(dest0, dest1, noreg, src); 7133 } 7134 7135 // Multiply and multiply-accumulate unsigned 64-bit registers. 7136 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 7137 __ mul(prod_lo, n, m); 7138 __ umulh(prod_hi, n, m); 7139 } 7140 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 7141 wide_mul(rscratch1, rscratch2, n, m); 7142 __ adds(sum_lo, sum_lo, rscratch1); 7143 __ adc(sum_hi, sum_hi, rscratch2); 7144 } 7145 7146 // Poly1305, RFC 7539 7147 7148 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 7149 // description of the tricks used to simplify and accelerate this 7150 // computation. 
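  // In C-like pseudocode, one round of the block loop in the stub below
  // computes (a sketch of the math on the 130-bit values that the limbs
  // above represent; not a drop-in implementation):
  //
  //   u = u + block + 2^128;   // S_2:S_1:S_0; the 2^128 term is the pad bit
  //   u = u * r;               // schoolbook multiply; products of high limbs
  //                            // are folded back through RR_n = (R_n >> 2) * 5,
  //                            // using 2^130 == 5 (mod 2^130 - 5)
  //   u = partial_reduce(u);   // u stays a little over 130 bits
  //
  // A final exact reduction modulo 2^130 - 5 happens once, after the loop.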
7151 7152 address generate_poly1305_processBlocks() { 7153 __ align(CodeEntryAlignment); 7154 StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); 7155 address start = __ pc(); 7156 Label here; 7157 __ enter(); 7158 RegSet callee_saved = RegSet::range(r19, r28); 7159 __ push(callee_saved, sp); 7160 7161 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 7162 7163 // Arguments 7164 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 7165 7166 // R_n is the 128-bit randomly-generated key, packed into two 7167 // registers. The caller passes this key to us as long[5], with 7168 // BITS_PER_LIMB = 26. 7169 const Register R_0 = *++regs, R_1 = *++regs; 7170 pack_26(R_0, R_1, r_start); 7171 7172 // RR_n is (R_n >> 2) * 5 7173 const Register RR_0 = *++regs, RR_1 = *++regs; 7174 __ lsr(RR_0, R_0, 2); 7175 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 7176 __ lsr(RR_1, R_1, 2); 7177 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 7178 7179 // U_n is the current checksum 7180 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 7181 pack_26(U_0, U_1, U_2, acc_start); 7182 7183 static constexpr int BLOCK_LENGTH = 16; 7184 Label DONE, LOOP; 7185 7186 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7187 __ br(Assembler::LT, DONE); { 7188 __ bind(LOOP); 7189 7190 // S_n is to be the sum of U_n and the next block of data 7191 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 7192 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 7193 __ adds(S_0, U_0, S_0); 7194 __ adcs(S_1, U_1, S_1); 7195 __ adc(S_2, U_2, zr); 7196 __ add(S_2, S_2, 1); 7197 7198 const Register U_0HI = *++regs, U_1HI = *++regs; 7199 7200 // NB: this logic depends on some of the special properties of 7201 // Poly1305 keys. In particular, because we know that the top 7202 // four bits of R_0 and R_1 are zero, we can add together 7203 // partial products without any risk of needing to propagate a 7204 // carry out. 7205 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 7206 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 7207 __ andr(U_2, R_0, 3); 7208 __ mul(U_2, S_2, U_2); 7209 7210 // Recycle registers S_0, S_1, S_2 7211 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 7212 7213 // Partial reduction mod 2**130 - 5 7214 __ adds(U_1, U_0HI, U_1); 7215 __ adc(U_2, U_1HI, U_2); 7216 // Sum now in U_2:U_1:U_0. 7217 // Dead: U_0HI, U_1HI. 
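      // Why "+= (U_2 >> 2) * 5" reduces: the accumulator represents
      // U_2 * 2^128 + U_1 * 2^64 + U_0, and 2^130 == 5 (mod 2^130 - 5),
      // so the bits of U_2 above the low two satisfy
      //   (U_2 >> 2) * 2^130 == (U_2 >> 2) * 5  (mod 2^130 - 5).
      // The multiple of 5 is added below in two steps, x and x << 2,
      // where x = U_2 >> 2.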
7218 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 7219 7220 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 7221 7222 // First, U_2:U_1:U_0 += (U_2 >> 2) 7223 __ lsr(rscratch1, U_2, 2); 7224 __ andr(U_2, U_2, (u8)3); 7225 __ adds(U_0, U_0, rscratch1); 7226 __ adcs(U_1, U_1, zr); 7227 __ adc(U_2, U_2, zr); 7228 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 7229 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 7230 __ adcs(U_1, U_1, zr); 7231 __ adc(U_2, U_2, zr); 7232 7233 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 7234 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7235 __ br(~ Assembler::LT, LOOP); 7236 } 7237 7238 // Further reduce modulo 2^130 - 5 7239 __ lsr(rscratch1, U_2, 2); 7240 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 7241 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 7242 __ adcs(U_1, U_1, zr); 7243 __ andr(U_2, U_2, (u1)3); 7244 __ adc(U_2, U_2, zr); 7245 7246 // Unpack the sum into five 26-bit limbs and write to memory. 7247 __ ubfiz(rscratch1, U_0, 0, 26); 7248 __ ubfx(rscratch2, U_0, 26, 26); 7249 __ stp(rscratch1, rscratch2, Address(acc_start)); 7250 __ ubfx(rscratch1, U_0, 52, 12); 7251 __ bfi(rscratch1, U_1, 12, 14); 7252 __ ubfx(rscratch2, U_1, 14, 26); 7253 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 7254 __ ubfx(rscratch1, U_1, 40, 24); 7255 __ bfi(rscratch1, U_2, 24, 3); 7256 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 7257 7258 __ bind(DONE); 7259 __ pop(callee_saved, sp); 7260 __ leave(); 7261 __ ret(lr); 7262 7263 return start; 7264 } 7265 7266 #if INCLUDE_JFR 7267 7268 static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) { 7269 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 7270 __ mov(c_rarg0, thread); 7271 } 7272 7273 // The handle is dereferenced through a load barrier. 7274 static void jfr_epilogue(MacroAssembler* _masm) { 7275 __ reset_last_Java_frame(true); 7276 } 7277 7278 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint. 7279 // It returns a jobject handle to the event writer. 7280 // The handle is dereferenced and the return value is the event writer oop. 7281 static RuntimeStub* generate_jfr_write_checkpoint() { 7282 enum layout { 7283 rbp_off, 7284 rbpH_off, 7285 return_off, 7286 return_off2, 7287 framesize // inclusive of return address 7288 }; 7289 7290 int insts_size = 1024; 7291 int locs_size = 64; 7292 CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size); 7293 OopMapSet* oop_maps = new OopMapSet(); 7294 MacroAssembler* masm = new MacroAssembler(&code); 7295 MacroAssembler* _masm = masm; 7296 7297 address start = __ pc(); 7298 __ enter(); 7299 int frame_complete = __ pc() - start; 7300 address the_pc = __ pc(); 7301 jfr_prologue(the_pc, _masm, rthread); 7302 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1); 7303 jfr_epilogue(_masm); 7304 __ resolve_global_jobject(r0, rscratch1, rscratch2); 7305 __ leave(); 7306 __ ret(lr); 7307 7308 OopMap* map = new OopMap(framesize, 1); // rfp 7309 oop_maps->add_gc_map(the_pc - start, map); 7310 7311 RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size) 7312 RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete, 7313 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7314 oop_maps, false); 7315 return stub; 7316 } 7317 7318 // For c2: call to return a leased buffer. 
7319 static RuntimeStub* generate_jfr_return_lease() { 7320 enum layout { 7321 rbp_off, 7322 rbpH_off, 7323 return_off, 7324 return_off2, 7325 framesize // inclusive of return address 7326 }; 7327 7328 int insts_size = 1024; 7329 int locs_size = 64; 7330 CodeBuffer code("jfr_return_lease", insts_size, locs_size); 7331 OopMapSet* oop_maps = new OopMapSet(); 7332 MacroAssembler* masm = new MacroAssembler(&code); 7333 MacroAssembler* _masm = masm; 7334 7335 address start = __ pc(); 7336 __ enter(); 7337 int frame_complete = __ pc() - start; 7338 address the_pc = __ pc(); 7339 jfr_prologue(the_pc, _masm, rthread); 7340 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1); 7341 jfr_epilogue(_masm); 7342 7343 __ leave(); 7344 __ ret(lr); 7345 7346 OopMap* map = new OopMap(framesize, 1); // rfp 7347 oop_maps->add_gc_map(the_pc - start, map); 7348 7349 RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size) 7350 RuntimeStub::new_runtime_stub("jfr_return_lease", &code, frame_complete, 7351 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7352 oop_maps, false); 7353 return stub; 7354 } 7355 7356 #endif // INCLUDE_JFR 7357 7358 // exception handler for upcall stubs 7359 address generate_upcall_stub_exception_handler() { 7360 StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler"); 7361 address start = __ pc(); 7362 7363 // Native caller has no idea how to handle exceptions, 7364 // so we just crash here. Up to callee to catch exceptions. 7365 __ verify_oop(r0); 7366 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 7367 __ blr(rscratch1); 7368 __ should_not_reach_here(); 7369 7370 return start; 7371 } 7372 7373 // Continuation point for throwing of implicit exceptions that are 7374 // not handled in the current activation. Fabricates an exception 7375 // oop and initiates normal exception dispatching in this 7376 // frame. Since we need to preserve callee-saved values (currently 7377 // only for C2, but done for C1 as well) we need a callee-saved oop 7378 // map and therefore have to make these stubs into RuntimeStubs 7379 // rather than BufferBlobs. If the compiler needs all registers to 7380 // be preserved between the fault point and the exception handler 7381 // then it must assume responsibility for that in 7382 // AbstractCompiler::continuation_for_implicit_null_exception or 7383 // continuation_for_implicit_division_by_zero_exception. All other 7384 // implicit exceptions (e.g., NullPointerException or 7385 // AbstractMethodError on entry) are either at call sites or 7386 // otherwise assume that stack unwinding will be initiated, so 7387 // caller saved registers were assumed volatile in the compiler. 7388 7389 #undef __ 7390 #define __ masm-> 7391 7392 address generate_throw_exception(const char* name, 7393 address runtime_entry, 7394 Register arg1 = noreg, 7395 Register arg2 = noreg) { 7396 // Information about frame layout at time of blocking runtime call. 7397 // Note that we only have to preserve callee-saved registers since 7398 // the compilers are responsible for supplying a continuation point 7399 // if they expect all registers to be preserved. 7400 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0 7401 enum layout { 7402 rfp_off = 0, 7403 rfp_off2, 7404 return_off, 7405 return_off2, 7406 framesize // inclusive of return address 7407 }; 7408 7409 int insts_size = 512; 7410 int locs_size = 64; 7411 7412 CodeBuffer code(name, insts_size, locs_size); 7413 OopMapSet* oop_maps = new OopMapSet(); 7414 MacroAssembler* masm = new MacroAssembler(&code); 7415 7416 address start = __ pc(); 7417 7418 // This is an inlined and slightly modified version of call_VM 7419 // which has the ability to fetch the return PC out of 7420 // thread-local storage and also sets up last_Java_sp slightly 7421 // differently than the real call_VM 7422 7423 __ enter(); // Save FP and LR before call 7424 7425 assert(is_even(framesize/2), "sp not 16-byte aligned"); 7426 7427 // lr and fp are already in place 7428 __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog 7429 7430 int frame_complete = __ pc() - start; 7431 7432 // Set up last_Java_sp and last_Java_fp 7433 address the_pc = __ pc(); 7434 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 7435 7436 // Call runtime 7437 if (arg1 != noreg) { 7438 assert(arg2 != c_rarg1, "clobbered"); 7439 __ mov(c_rarg1, arg1); 7440 } 7441 if (arg2 != noreg) { 7442 __ mov(c_rarg2, arg2); 7443 } 7444 __ mov(c_rarg0, rthread); 7445 BLOCK_COMMENT("call runtime_entry"); 7446 __ mov(rscratch1, runtime_entry); 7447 __ blr(rscratch1); 7448 7449 // Generate oop map 7450 OopMap* map = new OopMap(framesize, 0); 7451 7452 oop_maps->add_gc_map(the_pc - start, map); 7453 7454 __ reset_last_Java_frame(true); 7455 7456 // Reinitialize the ptrue predicate register, in case the external runtime 7457 // call clobbers ptrue reg, as we may return to SVE compiled code. 7458 __ reinitialize_ptrue(); 7459 7460 __ leave(); 7461 7462 // check for pending exceptions 7463 #ifdef ASSERT 7464 Label L; 7465 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 7466 __ cbnz(rscratch1, L); 7467 __ should_not_reach_here(); 7468 __ bind(L); 7469 #endif // ASSERT 7470 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 7471 7472 // codeBlob framesize is in words (not VMRegImpl::slot_size) 7473 RuntimeStub* stub = 7474 RuntimeStub::new_runtime_stub(name, 7475 &code, 7476 frame_complete, 7477 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7478 oop_maps, false); 7479 return stub->entry_point(); 7480 } 7481 7482 class MontgomeryMultiplyGenerator : public MacroAssembler { 7483 7484 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7485 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7486 7487 RegSet _toSave; 7488 bool _squaring; 7489 7490 public: 7491 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7492 : MacroAssembler(as->code()), _squaring(squaring) { 7493 7494 // Register allocation 7495 7496 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7497 Pa_base = *regs; // Argument registers 7498 if (squaring) 7499 Pb_base = Pa_base; 7500 else 7501 Pb_base = *++regs; 7502 Pn_base = *++regs; 7503 Rlen= *++regs; 7504 inv = *++regs; 7505 Pm_base = *++regs; 7506 7507 // Working registers: 7508 Ra = *++regs; // The current digit of a, b, n, and m. 7509 Rb = *++regs; 7510 Rm = *++regs; 7511 Rn = *++regs; 7512 7513 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 
    Pb = *++regs;
    Pm = *++regs;
    Pn = *++regs;

    t0 = *++regs; // Three registers which form a
    t1 = *++regs; // triple-precision accumulator.
    t2 = *++regs;

    Ri = *++regs; // Inner and outer loop indexes.
    Rj = *++regs;

    Rhi_ab = *++regs; // Product registers: low and high parts
    Rlo_ab = *++regs; // of a*b and m*n.
    Rhi_mn = *++regs;
    Rlo_mn = *++regs;

    // r19 and up are callee-saved.
    _toSave = RegSet::range(r19, *regs) + Pm_base;
  }

 private:
  void save_regs() {
    push(_toSave, sp);
  }

  void restore_regs() {
    pop(_toSave, sp);
  }

  template <typename T>
  void unroll_2(Register count, T block) {
    Label loop, end, odd;
    tbnz(count, 0, odd);
    cbz(count, end);
    align(16);
    bind(loop);
    (this->*block)();
    bind(odd);
    (this->*block)();
    subs(count, count, 2);
    br(Assembler::GT, loop);
    bind(end);
  }

  template <typename T>
  void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
    Label loop, end, odd;
    tbnz(count, 0, odd);
    cbz(count, end);
    align(16);
    bind(loop);
    (this->*block)(d, s, tmp);
    bind(odd);
    (this->*block)(d, s, tmp);
    subs(count, count, 2);
    br(Assembler::GT, loop);
    bind(end);
  }

  void pre1(RegisterOrConstant i) {
    block_comment("pre1");
    // Pa = Pa_base;
    // Pb = Pb_base + i;
    // Pm = Pm_base;
    // Pn = Pn_base + i;
    // Ra = *Pa;
    // Rb = *Pb;
    // Rm = *Pm;
    // Rn = *Pn;
    ldr(Ra, Address(Pa_base));
    ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
    ldr(Rm, Address(Pm_base));
    ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
    lea(Pa, Address(Pa_base));
    lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
    lea(Pm, Address(Pm_base));
    lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

    // Zero the m*n result.
    mov(Rhi_mn, zr);
    mov(Rlo_mn, zr);
  }

  // The core multiply-accumulate step of a Montgomery
  // multiplication. The idea is to schedule operations as a
  // pipeline so that instructions with long latencies (loads and
  // multiplies) have time to complete before their results are
  // used. This most benefits in-order implementations of the
  // architecture but out-of-order ones also benefit.
  void step() {
    block_comment("step");
    // MACC(Ra, Rb, t0, t1, t2);
    // Ra = *++Pa;
    // Rb = *--Pb;
    umulh(Rhi_ab, Ra, Rb);
    mul(Rlo_ab, Ra, Rb);
    ldr(Ra, pre(Pa, wordSize));
    ldr(Rb, pre(Pb, -wordSize));
    acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                     // previous iteration.
7614 // MACC(Rm, Rn, t0, t1, t2); 7615 // Rm = *++Pm; 7616 // Rn = *--Pn; 7617 umulh(Rhi_mn, Rm, Rn); 7618 mul(Rlo_mn, Rm, Rn); 7619 ldr(Rm, pre(Pm, wordSize)); 7620 ldr(Rn, pre(Pn, -wordSize)); 7621 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7622 } 7623 7624 void post1() { 7625 block_comment("post1"); 7626 7627 // MACC(Ra, Rb, t0, t1, t2); 7628 // Ra = *++Pa; 7629 // Rb = *--Pb; 7630 umulh(Rhi_ab, Ra, Rb); 7631 mul(Rlo_ab, Ra, Rb); 7632 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7633 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7634 7635 // *Pm = Rm = t0 * inv; 7636 mul(Rm, t0, inv); 7637 str(Rm, Address(Pm)); 7638 7639 // MACC(Rm, Rn, t0, t1, t2); 7640 // t0 = t1; t1 = t2; t2 = 0; 7641 umulh(Rhi_mn, Rm, Rn); 7642 7643 #ifndef PRODUCT 7644 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7645 { 7646 mul(Rlo_mn, Rm, Rn); 7647 add(Rlo_mn, t0, Rlo_mn); 7648 Label ok; 7649 cbz(Rlo_mn, ok); { 7650 stop("broken Montgomery multiply"); 7651 } bind(ok); 7652 } 7653 #endif 7654 // We have very carefully set things up so that 7655 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7656 // the lower half of Rm * Rn because we know the result already: 7657 // it must be -t0. t0 + (-t0) must generate a carry iff 7658 // t0 != 0. So, rather than do a mul and an adds we just set 7659 // the carry flag iff t0 is nonzero. 7660 // 7661 // mul(Rlo_mn, Rm, Rn); 7662 // adds(zr, t0, Rlo_mn); 7663 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7664 adcs(t0, t1, Rhi_mn); 7665 adc(t1, t2, zr); 7666 mov(t2, zr); 7667 } 7668 7669 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 7670 block_comment("pre2"); 7671 // Pa = Pa_base + i-len; 7672 // Pb = Pb_base + len; 7673 // Pm = Pm_base + i-len; 7674 // Pn = Pn_base + len; 7675 7676 if (i.is_register()) { 7677 sub(Rj, i.as_register(), len); 7678 } else { 7679 mov(Rj, i.as_constant()); 7680 sub(Rj, Rj, len); 7681 } 7682 // Rj == i-len 7683 7684 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 7685 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 7686 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7687 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 7688 7689 // Ra = *++Pa; 7690 // Rb = *--Pb; 7691 // Rm = *++Pm; 7692 // Rn = *--Pn; 7693 ldr(Ra, pre(Pa, wordSize)); 7694 ldr(Rb, pre(Pb, -wordSize)); 7695 ldr(Rm, pre(Pm, wordSize)); 7696 ldr(Rn, pre(Pn, -wordSize)); 7697 7698 mov(Rhi_mn, zr); 7699 mov(Rlo_mn, zr); 7700 } 7701 7702 void post2(RegisterOrConstant i, RegisterOrConstant len) { 7703 block_comment("post2"); 7704 if (i.is_constant()) { 7705 mov(Rj, i.as_constant()-len.as_constant()); 7706 } else { 7707 sub(Rj, i.as_register(), len); 7708 } 7709 7710 adds(t0, t0, Rlo_mn); // The pending m*n, low part 7711 7712 // As soon as we know the least significant digit of our result, 7713 // store it. 7714 // Pm_base[i-len] = t0; 7715 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7716 7717 // t0 = t1; t1 = t2; t2 = 0; 7718 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 7719 adc(t1, t2, zr); 7720 mov(t2, zr); 7721 } 7722 7723 // A carry in t0 after Montgomery multiplication means that we 7724 // should subtract multiples of n from our result in m. We'll 7725 // keep doing that until there is no carry. 
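  // In C, approximately (matching the commented C code later in this
  // file; sub() is the word-wise subtract-with-borrow that the loop
  // below inlines):
  //
  //   julong sub(julong Pm_base[], julong Pn_base[], julong t0, int len) {
  //     julong borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       julong a = Pm_base[i], b = Pn_base[i];
  //       Pm_base[i] = a - b - borrow;
  //       borrow = (a < b) || (borrow && a == b);
  //     }
  //     return t0 - borrow;   // one fewer pending carry once borrow == 1
  //   }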
7726 void normalize(RegisterOrConstant len) { 7727 block_comment("normalize"); 7728 // while (t0) 7729 // t0 = sub(Pm_base, Pn_base, t0, len); 7730 Label loop, post, again; 7731 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 7732 cbz(t0, post); { 7733 bind(again); { 7734 mov(i, zr); 7735 mov(cnt, len); 7736 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7737 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7738 subs(zr, zr, zr); // set carry flag, i.e. no borrow 7739 align(16); 7740 bind(loop); { 7741 sbcs(Rm, Rm, Rn); 7742 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7743 add(i, i, 1); 7744 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7745 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7746 sub(cnt, cnt, 1); 7747 } cbnz(cnt, loop); 7748 sbc(t0, t0, zr); 7749 } cbnz(t0, again); 7750 } bind(post); 7751 } 7752 7753 // Move memory at s to d, reversing words. 7754 // Increments d to end of copied memory 7755 // Destroys tmp1, tmp2 7756 // Preserves len 7757 // Leaves s pointing to the address which was in d at start 7758 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 7759 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 7760 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 7761 7762 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 7763 mov(tmp1, len); 7764 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 7765 sub(s, d, len, ext::uxtw, LogBytesPerWord); 7766 } 7767 // where 7768 void reverse1(Register d, Register s, Register tmp) { 7769 ldr(tmp, pre(s, -wordSize)); 7770 ror(tmp, tmp, 32); 7771 str(tmp, post(d, wordSize)); 7772 } 7773 7774 void step_squaring() { 7775 // An extra ACC 7776 step(); 7777 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7778 } 7779 7780 void last_squaring(RegisterOrConstant i) { 7781 Label dont; 7782 // if ((i & 1) == 0) { 7783 tbnz(i.as_register(), 0, dont); { 7784 // MACC(Ra, Rb, t0, t1, t2); 7785 // Ra = *++Pa; 7786 // Rb = *--Pb; 7787 umulh(Rhi_ab, Ra, Rb); 7788 mul(Rlo_ab, Ra, Rb); 7789 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7790 } bind(dont); 7791 } 7792 7793 void extra_step_squaring() { 7794 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7795 7796 // MACC(Rm, Rn, t0, t1, t2); 7797 // Rm = *++Pm; 7798 // Rn = *--Pn; 7799 umulh(Rhi_mn, Rm, Rn); 7800 mul(Rlo_mn, Rm, Rn); 7801 ldr(Rm, pre(Pm, wordSize)); 7802 ldr(Rn, pre(Pn, -wordSize)); 7803 } 7804 7805 void post1_squaring() { 7806 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7807 7808 // *Pm = Rm = t0 * inv; 7809 mul(Rm, t0, inv); 7810 str(Rm, Address(Pm)); 7811 7812 // MACC(Rm, Rn, t0, t1, t2); 7813 // t0 = t1; t1 = t2; t2 = 0; 7814 umulh(Rhi_mn, Rm, Rn); 7815 7816 #ifndef PRODUCT 7817 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7818 { 7819 mul(Rlo_mn, Rm, Rn); 7820 add(Rlo_mn, t0, Rlo_mn); 7821 Label ok; 7822 cbz(Rlo_mn, ok); { 7823 stop("broken Montgomery multiply"); 7824 } bind(ok); 7825 } 7826 #endif 7827 // We have very carefully set things up so that 7828 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7829 // the lower half of Rm * Rn because we know the result already: 7830 // it must be -t0. t0 + (-t0) must generate a carry iff 7831 // t0 != 0. So, rather than do a mul and an adds we just set 7832 // the carry flag iff t0 is nonzero. 
7833 // 7834 // mul(Rlo_mn, Rm, Rn); 7835 // adds(zr, t0, Rlo_mn); 7836 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7837 adcs(t0, t1, Rhi_mn); 7838 adc(t1, t2, zr); 7839 mov(t2, zr); 7840 } 7841 7842 void acc(Register Rhi, Register Rlo, 7843 Register t0, Register t1, Register t2) { 7844 adds(t0, t0, Rlo); 7845 adcs(t1, t1, Rhi); 7846 adc(t2, t2, zr); 7847 } 7848 7849 public: 7850 /** 7851 * Fast Montgomery multiplication. The derivation of the 7852 * algorithm is in A Cryptographic Library for the Motorola 7853 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 7854 * 7855 * Arguments: 7856 * 7857 * Inputs for multiplication: 7858 * c_rarg0 - int array elements a 7859 * c_rarg1 - int array elements b 7860 * c_rarg2 - int array elements n (the modulus) 7861 * c_rarg3 - int length 7862 * c_rarg4 - int inv 7863 * c_rarg5 - int array elements m (the result) 7864 * 7865 * Inputs for squaring: 7866 * c_rarg0 - int array elements a 7867 * c_rarg1 - int array elements n (the modulus) 7868 * c_rarg2 - int length 7869 * c_rarg3 - int inv 7870 * c_rarg4 - int array elements m (the result) 7871 * 7872 */ 7873 address generate_multiply() { 7874 Label argh, nothing; 7875 bind(argh); 7876 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7877 7878 align(CodeEntryAlignment); 7879 address entry = pc(); 7880 7881 cbzw(Rlen, nothing); 7882 7883 enter(); 7884 7885 // Make room. 7886 cmpw(Rlen, 512); 7887 br(Assembler::HI, argh); 7888 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7889 andr(sp, Ra, -2 * wordSize); 7890 7891 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7892 7893 { 7894 // Copy input args, reversing as we go. We use Ra as a 7895 // temporary variable. 7896 reverse(Ra, Pa_base, Rlen, t0, t1); 7897 if (!_squaring) 7898 reverse(Ra, Pb_base, Rlen, t0, t1); 7899 reverse(Ra, Pn_base, Rlen, t0, t1); 7900 } 7901 7902 // Push all call-saved registers and also Pm_base which we'll need 7903 // at the end. 
7904 save_regs(); 7905 7906 #ifndef PRODUCT 7907 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 7908 { 7909 ldr(Rn, Address(Pn_base, 0)); 7910 mul(Rlo_mn, Rn, inv); 7911 subs(zr, Rlo_mn, -1); 7912 Label ok; 7913 br(EQ, ok); { 7914 stop("broken inverse in Montgomery multiply"); 7915 } bind(ok); 7916 } 7917 #endif 7918 7919 mov(Pm_base, Ra); 7920 7921 mov(t0, zr); 7922 mov(t1, zr); 7923 mov(t2, zr); 7924 7925 block_comment("for (int i = 0; i < len; i++) {"); 7926 mov(Ri, zr); { 7927 Label loop, end; 7928 cmpw(Ri, Rlen); 7929 br(Assembler::GE, end); 7930 7931 bind(loop); 7932 pre1(Ri); 7933 7934 block_comment(" for (j = i; j; j--) {"); { 7935 movw(Rj, Ri); 7936 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7937 } block_comment(" } // j"); 7938 7939 post1(); 7940 addw(Ri, Ri, 1); 7941 cmpw(Ri, Rlen); 7942 br(Assembler::LT, loop); 7943 bind(end); 7944 block_comment("} // i"); 7945 } 7946 7947 block_comment("for (int i = len; i < 2*len; i++) {"); 7948 mov(Ri, Rlen); { 7949 Label loop, end; 7950 cmpw(Ri, Rlen, Assembler::LSL, 1); 7951 br(Assembler::GE, end); 7952 7953 bind(loop); 7954 pre2(Ri, Rlen); 7955 7956 block_comment(" for (j = len*2-i-1; j; j--) {"); { 7957 lslw(Rj, Rlen, 1); 7958 subw(Rj, Rj, Ri); 7959 subw(Rj, Rj, 1); 7960 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7961 } block_comment(" } // j"); 7962 7963 post2(Ri, Rlen); 7964 addw(Ri, Ri, 1); 7965 cmpw(Ri, Rlen, Assembler::LSL, 1); 7966 br(Assembler::LT, loop); 7967 bind(end); 7968 } 7969 block_comment("} // i"); 7970 7971 normalize(Rlen); 7972 7973 mov(Ra, Pm_base); // Save Pm_base in Ra 7974 restore_regs(); // Restore caller's Pm_base 7975 7976 // Copy our result into caller's Pm_base 7977 reverse(Pm_base, Ra, Rlen, t0, t1); 7978 7979 leave(); 7980 bind(nothing); 7981 ret(lr); 7982 7983 return entry; 7984 } 7985 // In C, approximately: 7986 7987 // void 7988 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 7989 // julong Pn_base[], julong Pm_base[], 7990 // julong inv, int len) { 7991 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7992 // julong *Pa, *Pb, *Pn, *Pm; 7993 // julong Ra, Rb, Rn, Rm; 7994 7995 // int i; 7996 7997 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7998 7999 // for (i = 0; i < len; i++) { 8000 // int j; 8001 8002 // Pa = Pa_base; 8003 // Pb = Pb_base + i; 8004 // Pm = Pm_base; 8005 // Pn = Pn_base + i; 8006 8007 // Ra = *Pa; 8008 // Rb = *Pb; 8009 // Rm = *Pm; 8010 // Rn = *Pn; 8011 8012 // int iters = i; 8013 // for (j = 0; iters--; j++) { 8014 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 8015 // MACC(Ra, Rb, t0, t1, t2); 8016 // Ra = *++Pa; 8017 // Rb = *--Pb; 8018 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8019 // MACC(Rm, Rn, t0, t1, t2); 8020 // Rm = *++Pm; 8021 // Rn = *--Pn; 8022 // } 8023 8024 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 8025 // MACC(Ra, Rb, t0, t1, t2); 8026 // *Pm = Rm = t0 * inv; 8027 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8028 // MACC(Rm, Rn, t0, t1, t2); 8029 8030 // assert(t0 == 0, "broken Montgomery multiply"); 8031 8032 // t0 = t1; t1 = t2; t2 = 0; 8033 // } 8034 8035 // for (i = len; i < 2*len; i++) { 8036 // int j; 8037 8038 // Pa = Pa_base + i-len; 8039 // Pb = Pb_base + len; 8040 // Pm = Pm_base + i-len; 8041 // Pn = Pn_base + len; 8042 8043 // Ra = *++Pa; 8044 // Rb = *--Pb; 8045 // Rm = *++Pm; 8046 // Rn = *--Pn; 8047 8048 // int iters = len*2-i-1; 8049 // for (j = i-len+1; iters--; j++) { 8050 // 
assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 8051 // MACC(Ra, Rb, t0, t1, t2); 8052 // Ra = *++Pa; 8053 // Rb = *--Pb; 8054 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8055 // MACC(Rm, Rn, t0, t1, t2); 8056 // Rm = *++Pm; 8057 // Rn = *--Pn; 8058 // } 8059 8060 // Pm_base[i-len] = t0; 8061 // t0 = t1; t1 = t2; t2 = 0; 8062 // } 8063 8064 // while (t0) 8065 // t0 = sub(Pm_base, Pn_base, t0, len); 8066 // } 8067 8068 /** 8069 * Fast Montgomery squaring. This uses asymptotically 25% fewer 8070 * multiplies than Montgomery multiplication so it should be up to 8071 * 25% faster. However, its loop control is more complex and it 8072 * may actually run slower on some machines. 8073 * 8074 * Arguments: 8075 * 8076 * Inputs: 8077 * c_rarg0 - int array elements a 8078 * c_rarg1 - int array elements n (the modulus) 8079 * c_rarg2 - int length 8080 * c_rarg3 - int inv 8081 * c_rarg4 - int array elements m (the result) 8082 * 8083 */ 8084 address generate_square() { 8085 Label argh; 8086 bind(argh); 8087 stop("MontgomeryMultiply total_allocation must be <= 8192"); 8088 8089 align(CodeEntryAlignment); 8090 address entry = pc(); 8091 8092 enter(); 8093 8094 // Make room. 8095 cmpw(Rlen, 512); 8096 br(Assembler::HI, argh); 8097 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 8098 andr(sp, Ra, -2 * wordSize); 8099 8100 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 8101 8102 { 8103 // Copy input args, reversing as we go. We use Ra as a 8104 // temporary variable. 8105 reverse(Ra, Pa_base, Rlen, t0, t1); 8106 reverse(Ra, Pn_base, Rlen, t0, t1); 8107 } 8108 8109 // Push all call-saved registers and also Pm_base which we'll need 8110 // at the end. 8111 save_regs(); 8112 8113 mov(Pm_base, Ra); 8114 8115 mov(t0, zr); 8116 mov(t1, zr); 8117 mov(t2, zr); 8118 8119 block_comment("for (int i = 0; i < len; i++) {"); 8120 mov(Ri, zr); { 8121 Label loop, end; 8122 bind(loop); 8123 cmp(Ri, Rlen); 8124 br(Assembler::GE, end); 8125 8126 pre1(Ri); 8127 8128 block_comment("for (j = (i+1)/2; j; j--) {"); { 8129 add(Rj, Ri, 1); 8130 lsr(Rj, Rj, 1); 8131 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8132 } block_comment(" } // j"); 8133 8134 last_squaring(Ri); 8135 8136 block_comment(" for (j = i/2; j; j--) {"); { 8137 lsr(Rj, Ri, 1); 8138 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8139 } block_comment(" } // j"); 8140 8141 post1_squaring(); 8142 add(Ri, Ri, 1); 8143 cmp(Ri, Rlen); 8144 br(Assembler::LT, loop); 8145 8146 bind(end); 8147 block_comment("} // i"); 8148 } 8149 8150 block_comment("for (int i = len; i < 2*len; i++) {"); 8151 mov(Ri, Rlen); { 8152 Label loop, end; 8153 bind(loop); 8154 cmp(Ri, Rlen, Assembler::LSL, 1); 8155 br(Assembler::GE, end); 8156 8157 pre2(Ri, Rlen); 8158 8159 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 8160 lsl(Rj, Rlen, 1); 8161 sub(Rj, Rj, Ri); 8162 sub(Rj, Rj, 1); 8163 lsr(Rj, Rj, 1); 8164 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8165 } block_comment(" } // j"); 8166 8167 last_squaring(Ri); 8168 8169 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 8170 lsl(Rj, Rlen, 1); 8171 sub(Rj, Rj, Ri); 8172 lsr(Rj, Rj, 1); 8173 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8174 } block_comment(" } // j"); 8175 8176 post2(Ri, Rlen); 8177 add(Ri, Ri, 1); 8178 cmp(Ri, Rlen, Assembler::LSL, 1); 8179 8180 br(Assembler::LT, loop); 8181 bind(end); 8182 block_comment("} // i"); 8183 } 8184 8185 normalize(Rlen); 8186 8187 mov(Ra, Pm_base); // Save Pm_base in Ra 8188 
restore_regs(); // Restore caller's Pm_base 8189 8190 // Copy our result into caller's Pm_base 8191 reverse(Pm_base, Ra, Rlen, t0, t1); 8192 8193 leave(); 8194 ret(lr); 8195 8196 return entry; 8197 } 8198 // In C, approximately: 8199 8200 // void 8201 // montgomery_square(julong Pa_base[], julong Pn_base[], 8202 // julong Pm_base[], julong inv, int len) { 8203 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 8204 // julong *Pa, *Pb, *Pn, *Pm; 8205 // julong Ra, Rb, Rn, Rm; 8206 8207 // int i; 8208 8209 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 8210 8211 // for (i = 0; i < len; i++) { 8212 // int j; 8213 8214 // Pa = Pa_base; 8215 // Pb = Pa_base + i; 8216 // Pm = Pm_base; 8217 // Pn = Pn_base + i; 8218 8219 // Ra = *Pa; 8220 // Rb = *Pb; 8221 // Rm = *Pm; 8222 // Rn = *Pn; 8223 8224 // int iters = (i+1)/2; 8225 // for (j = 0; iters--; j++) { 8226 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8227 // MACC2(Ra, Rb, t0, t1, t2); 8228 // Ra = *++Pa; 8229 // Rb = *--Pb; 8230 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8231 // MACC(Rm, Rn, t0, t1, t2); 8232 // Rm = *++Pm; 8233 // Rn = *--Pn; 8234 // } 8235 // if ((i & 1) == 0) { 8236 // assert(Ra == Pa_base[j], "must be"); 8237 // MACC(Ra, Ra, t0, t1, t2); 8238 // } 8239 // iters = i/2; 8240 // assert(iters == i-j, "must be"); 8241 // for (; iters--; j++) { 8242 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8243 // MACC(Rm, Rn, t0, t1, t2); 8244 // Rm = *++Pm; 8245 // Rn = *--Pn; 8246 // } 8247 8248 // *Pm = Rm = t0 * inv; 8249 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8250 // MACC(Rm, Rn, t0, t1, t2); 8251 8252 // assert(t0 == 0, "broken Montgomery multiply"); 8253 8254 // t0 = t1; t1 = t2; t2 = 0; 8255 // } 8256 8257 // for (i = len; i < 2*len; i++) { 8258 // int start = i-len+1; 8259 // int end = start + (len - start)/2; 8260 // int j; 8261 8262 // Pa = Pa_base + i-len; 8263 // Pb = Pa_base + len; 8264 // Pm = Pm_base + i-len; 8265 // Pn = Pn_base + len; 8266 8267 // Ra = *++Pa; 8268 // Rb = *--Pb; 8269 // Rm = *++Pm; 8270 // Rn = *--Pn; 8271 8272 // int iters = (2*len-i-1)/2; 8273 // assert(iters == end-start, "must be"); 8274 // for (j = start; iters--; j++) { 8275 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8276 // MACC2(Ra, Rb, t0, t1, t2); 8277 // Ra = *++Pa; 8278 // Rb = *--Pb; 8279 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8280 // MACC(Rm, Rn, t0, t1, t2); 8281 // Rm = *++Pm; 8282 // Rn = *--Pn; 8283 // } 8284 // if ((i & 1) == 0) { 8285 // assert(Ra == Pa_base[j], "must be"); 8286 // MACC(Ra, Ra, t0, t1, t2); 8287 // } 8288 // iters = (2*len-i)/2; 8289 // assert(iters == len-j, "must be"); 8290 // for (; iters--; j++) { 8291 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8292 // MACC(Rm, Rn, t0, t1, t2); 8293 // Rm = *++Pm; 8294 // Rn = *--Pn; 8295 // } 8296 // Pm_base[i-len] = t0; 8297 // t0 = t1; t1 = t2; t2 = 0; 8298 // } 8299 8300 // while (t0) 8301 // t0 = sub(Pm_base, Pn_base, t0, len); 8302 // } 8303 }; 8304 8305 8306 // Initialization 8307 void generate_initial_stubs() { 8308 // Generate initial stubs and initializes the entry points 8309 8310 // entry points that exist in all platforms Note: This is code 8311 // that could be shared among different platforms - however the 8312 // benefit seems to be smaller than the disadvantage of having a 8313 // much more complicated generator structure. See also comment in 8314 // stubRoutines.hpp. 
  };


  // Initialization
  void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points.

    // Entry points that exist in all platforms.  Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure.  See also the comment
    // in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // Referenced by megamorphic calls.
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));

    // Initialize table for copy memory (arraycopy) check.
    if (UnsafeCopyMemory::_table == nullptr) {
      UnsafeCopyMemory::create_table(8);
    }

    if (UseCRC32Intrinsics) {
      // Set the table address before generating stubs that use it.
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }
  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw             = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();

    JFR_ONLY(generate_jfr_stubs();)
  }

#if INCLUDE_JFR
  void generate_jfr_stubs() {
    StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();
    StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();
    StubRoutines::_jfr_return_lease_stub = generate_jfr_return_lease();
    StubRoutines::_jfr_return_lease = StubRoutines::_jfr_return_lease_stub->entry_point();
  }
#endif // INCLUDE_JFR

  void generate_final_stubs() {
    // Support for verify_oop (must happen after universe_init).
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));
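    // Each of the three throw stubs above hands control to a SharedRuntime
    // entry that constructs and throws the exception; the stubs themselves
    // are not expected to return normally.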

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != nullptr) {
      StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
    }

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX && !__ARM_FEATURE_ATOMICS

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();

    StubRoutines::aarch64::set_completed();  // Indicate that arraycopy and zero_blocks stubs are generated.
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
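      // (generate_square() saves multiplies but needs more complex loop
      // control, which does not pay off at these sizes.)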
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback      = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress   = generate_md5_implCompress(false, "md5_implCompress");
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true,  "md5_implCompressMB");
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true,  "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress   = generate_sha3_implCompress(false, "sha3_implCompress");
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true,  "sha3_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }
#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
    switch (kind) {
    case Initial_stubs:
      generate_initial_stubs();
      break;
    case Continuation_stubs:
      generate_continuation_stubs();
      break;
    case Compiler_stubs:
      generate_compiler_stubs();
      break;
    case Final_stubs:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected stubs kind: %d", kind);
      break;
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
  StubGenerator g(code, kind);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                        \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
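// For example, DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) expands to
//
//   extern "C" uint64_t aarch64_atomic_cmpxchg_4_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_relaxed_impl
//     = aarch64_atomic_cmpxchg_4_relaxed_default_impl;
//
// that is, a declaration of the assembly fallback plus a function pointer
// initially bound to it, which generate_atomic_entry_points() can later
// repoint at a generated stub.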

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX