1 /* 2 * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "asm/register.hpp" 30 #include "atomic_aarch64.hpp" 31 #include "compiler/oopMap.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/barrierSetAssembler.hpp" 34 #include "gc/shared/gc_globals.hpp" 35 #include "gc/shared/tlab_globals.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "memory/universe.hpp" 38 #include "nativeInst_aarch64.hpp" 39 #include "oops/instanceOop.hpp" 40 #include "oops/method.hpp" 41 #include "oops/objArrayKlass.hpp" 42 #include "oops/oop.inline.hpp" 43 #include "prims/methodHandles.hpp" 44 #include "prims/upcallLinker.hpp" 45 #include "runtime/atomic.hpp" 46 #include "runtime/continuation.hpp" 47 #include "runtime/continuationEntry.inline.hpp" 48 #include "runtime/frame.inline.hpp" 49 #include "runtime/handles.inline.hpp" 50 #include "runtime/javaThread.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/stubCodeGenerator.hpp" 53 #include "runtime/stubRoutines.hpp" 54 #include "utilities/align.hpp" 55 #include "utilities/checkedCast.hpp" 56 #include "utilities/globalDefinitions.hpp" 57 #include "utilities/powerOfTwo.hpp" 58 #ifdef COMPILER2 59 #include "opto/runtime.hpp" 60 #endif 61 #if INCLUDE_ZGC 62 #include "gc/z/zThreadLocalData.hpp" 63 #endif 64 65 // Declaration and definition of StubGenerator (no .hpp file). 
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  // Emit code that increments the 32-bit counter at the given address.
  // Clobbers rscratch1 and rscratch2.  The load/add/store sequence is not
  // atomic, so concurrent updates may be lost; these are debug-build
  // statistics counters only.
  void inc_counter_np_(uint& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread (r7)          ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -28,

    fpcr_off           = sp_after_call_off,
    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  // Generate the call stub (see the contract comment above): saves the C
  // callee-saved state, installs rthread/rmethod, copies the Java arguments
  // onto the stack, calls the (interpreter) entry point, stores the result
  // according to its BasicType, and restores the saved state.
  // return_address receives the pc following the Java call so that stack
  // walking code can recognize returns into this stub.
  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call (rfp, sp_after_call_off * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    // reserve space for the parameters and keep sp 16-byte aligned
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    // copy each parameter word from the argument array onto the stack
    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methdoOop, and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    // restore fpcr
    __ ldr(rscratch1,  fpcr_save);
    __ set_fpcr(rscratch1);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning
  // with a pending
  // exception. The pending exception check happened in the runtime
  // or native call stub. The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // bump the verify_oop invocation counter (non-atomic, debug statistic)
    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    // delegate the actual plausibility checks to the active GC's barrier set
    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  // Generate indices for iota vector.
  // Emit a constant table of per-lane index values (0, 1, 2, ...) for each
  // vector element size (byte/half/word/doubleword), followed by
  // floating-point variants.  Vector code loads slices of this table to
  // build iota vectors.
  address generate_iota_indices(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      // zero one leading word if needed so that base is 16-byte aligned
      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      // undo the final over-subtraction so cnt holds the tail word count
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  // Direction of an arraycopy: +1 walks addresses upward, -1 downward.
  // Used as a multiplier on word offsets in the copy stubs.
  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  // for arraycopy stubs.
718 class ArrayCopyBarrierSetHelper : StackObj { 719 BarrierSetAssembler* _bs_asm; 720 MacroAssembler* _masm; 721 DecoratorSet _decorators; 722 BasicType _type; 723 Register _gct1; 724 Register _gct2; 725 Register _gct3; 726 FloatRegister _gcvt1; 727 FloatRegister _gcvt2; 728 FloatRegister _gcvt3; 729 730 public: 731 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 732 DecoratorSet decorators, 733 BasicType type, 734 Register gct1, 735 Register gct2, 736 Register gct3, 737 FloatRegister gcvt1, 738 FloatRegister gcvt2, 739 FloatRegister gcvt3) 740 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 741 _masm(masm), 742 _decorators(decorators), 743 _type(type), 744 _gct1(gct1), 745 _gct2(gct2), 746 _gct3(gct3), 747 _gcvt1(gcvt1), 748 _gcvt2(gcvt2), 749 _gcvt3(gcvt3) { 750 } 751 752 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 753 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 754 dst1, dst2, src, 755 _gct1, _gct2, _gcvt1); 756 } 757 758 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 759 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 760 dst, src1, src2, 761 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 762 } 763 764 void copy_load_at_16(Register dst1, Register dst2, Address src) { 765 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 766 dst1, dst2, src, 767 _gct1); 768 } 769 770 void copy_store_at_16(Address dst, Register src1, Register src2) { 771 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 772 dst, src1, src2, 773 _gct1, _gct2, _gct3); 774 } 775 776 void copy_load_at_8(Register dst, Address src) { 777 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 778 dst, noreg, src, 779 _gct1); 780 } 781 782 void copy_store_at_8(Address dst, Register src) { 783 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 784 dst, src, noreg, 785 _gct1, _gct2, _gct3); 786 } 787 }; 788 789 // Bulk copy of blocks of 8 words. 790 // 791 // count is a count of words. 
792 // 793 // Precondition: count >= 8 794 // 795 // Postconditions: 796 // 797 // The least significant bit of count contains the remaining count 798 // of words to copy. The rest of count is trash. 799 // 800 // s and d are adjusted to point to the remaining words to copy 801 // 802 void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count, 803 copy_direction direction) { 804 int unit = wordSize * direction; 805 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize; 806 807 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 808 t4 = r7, t5 = r11, t6 = r12, t7 = r13; 809 const Register stride = r14; 810 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 811 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 812 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 813 814 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7); 815 assert_different_registers(s, d, count, rscratch1, rscratch2); 816 817 Label again, drain; 818 const char *stub_name; 819 if (direction == copy_forwards) 820 stub_name = "forward_copy_longs"; 821 else 822 stub_name = "backward_copy_longs"; 823 824 __ align(CodeEntryAlignment); 825 826 StubCodeMark mark(this, "StubRoutines", stub_name); 827 828 __ bind(start); 829 830 Label unaligned_copy_long; 831 if (AvoidUnalignedAccesses) { 832 __ tbnz(d, 3, unaligned_copy_long); 833 } 834 835 if (direction == copy_forwards) { 836 __ sub(s, s, bias); 837 __ sub(d, d, bias); 838 } 839 840 #ifdef ASSERT 841 // Make sure we are never given < 8 words 842 { 843 Label L; 844 __ cmp(count, (u1)8); 845 __ br(Assembler::GE, L); 846 __ stop("genrate_copy_longs called with < 8 words"); 847 __ bind(L); 848 } 849 #endif 850 851 // Fill 8 registers 852 if (UseSIMDForMemoryOps) { 853 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 854 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * 
unit))); 855 } else { 856 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 857 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 858 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 859 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 860 } 861 862 __ subs(count, count, 16); 863 __ br(Assembler::LO, drain); 864 865 int prefetch = PrefetchCopyIntervalInBytes; 866 bool use_stride = false; 867 if (direction == copy_backwards) { 868 use_stride = prefetch > 256; 869 prefetch = -prefetch; 870 if (use_stride) __ mov(stride, prefetch); 871 } 872 873 __ bind(again); 874 875 if (PrefetchCopyIntervalInBytes > 0) 876 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 877 878 if (UseSIMDForMemoryOps) { 879 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 880 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 881 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 882 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 883 } else { 884 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 885 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 886 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 887 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 888 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 889 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 890 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 891 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 892 } 893 894 __ subs(count, count, 8); 895 __ br(Assembler::HS, again); 896 897 // Drain 898 __ bind(drain); 899 if (UseSIMDForMemoryOps) { 900 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 901 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 902 } else { 903 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 904 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 905 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 906 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 907 } 908 909 { 910 Label L1, L2; 911 __ tbz(count, 
exact_log2(4), L1); 912 if (UseSIMDForMemoryOps) { 913 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit))); 914 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1); 915 } else { 916 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 917 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 918 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 919 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3); 920 } 921 __ bind(L1); 922 923 if (direction == copy_forwards) { 924 __ add(s, s, bias); 925 __ add(d, d, bias); 926 } 927 928 __ tbz(count, 1, L2); 929 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 930 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1); 931 __ bind(L2); 932 } 933 934 __ ret(lr); 935 936 if (AvoidUnalignedAccesses) { 937 Label drain, again; 938 // Register order for storing. Order is different for backward copy. 939 940 __ bind(unaligned_copy_long); 941 942 // source address is even aligned, target odd aligned 943 // 944 // when forward copying word pairs we read long pairs at offsets 945 // {0, 2, 4, 6} (in long words). when backwards copying we read 946 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 947 // address by -2 in the forwards case so we can compute the 948 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 949 // or -1. 950 // 951 // when forward copying we need to store 1 word, 3 pairs and 952 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 953 // zero offset We adjust the destination by -1 which means we 954 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 955 // 956 // When backwards copyng we need to store 1 word, 3 pairs and 957 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 958 // offsets {1, 3, 5, 7, 8} * unit. 
959 960 if (direction == copy_forwards) { 961 __ sub(s, s, 16); 962 __ sub(d, d, 8); 963 } 964 965 // Fill 8 registers 966 // 967 // for forwards copy s was offset by -16 from the original input 968 // value of s so the register contents are at these offsets 969 // relative to the 64 bit block addressed by that original input 970 // and so on for each successive 64 byte block when s is updated 971 // 972 // t0 at offset 0, t1 at offset 8 973 // t2 at offset 16, t3 at offset 24 974 // t4 at offset 32, t5 at offset 40 975 // t6 at offset 48, t7 at offset 56 976 977 // for backwards copy s was not offset so the register contents 978 // are at these offsets into the preceding 64 byte block 979 // relative to that original input and so on for each successive 980 // preceding 64 byte block when s is updated. this explains the 981 // slightly counter-intuitive looking pattern of register usage 982 // in the stp instructions for backwards copy. 983 // 984 // t0 at offset -16, t1 at offset -8 985 // t2 at offset -32, t3 at offset -24 986 // t4 at offset -48, t5 at offset -40 987 // t6 at offset -64, t7 at offset -56 988 989 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 990 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 991 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 992 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 993 994 __ subs(count, count, 16); 995 __ br(Assembler::LO, drain); 996 997 int prefetch = PrefetchCopyIntervalInBytes; 998 bool use_stride = false; 999 if (direction == copy_backwards) { 1000 use_stride = prefetch > 256; 1001 prefetch = -prefetch; 1002 if (use_stride) __ mov(stride, prefetch); 1003 } 1004 1005 __ bind(again); 1006 1007 if (PrefetchCopyIntervalInBytes > 0) 1008 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1009 1010 if (direction == copy_forwards) { 1011 // allowing for the offset of -8 the store instructions place 1012 // registers into the target 64 bit block at the following 1013 // offsets 1014 // 1015 // t0 at offset 0 1016 // t1 at offset 8, t2 at offset 16 1017 // t3 at offset 24, t4 at offset 32 1018 // t5 at offset 40, t6 at offset 48 1019 // t7 at offset 56 1020 1021 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1022 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1023 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1024 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1025 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1026 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1027 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1028 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1029 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1030 } else { 1031 // d was not offset when we started so the registers are 1032 // written into the 64 bit block preceding d with the following 1033 // offsets 1034 // 1035 // t1 at offset -8 1036 // t3 at offset -24, t0 at offset -16 1037 // t5 at offset -48, t2 at offset -32 1038 // t7 at offset -56, t4 at offset -48 1039 // t6 at offset -64 1040 // 1041 // note that this matches the offsets previously noted for the 1042 // loads 1043 1044 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1045 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1046 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1047 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1048 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1049 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1050 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1051 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1052 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1053 } 1054 1055 __ subs(count, count, 8); 1056 __ br(Assembler::HS, again); 1057 1058 // Drain 1059 // 1060 // this 
uses the same pattern of offsets and register arguments 1061 // as above 1062 __ bind(drain); 1063 if (direction == copy_forwards) { 1064 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1065 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1066 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1067 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1068 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1069 } else { 1070 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1071 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1072 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1073 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1074 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1075 } 1076 // now we need to copy any remaining part block which may 1077 // include a 4 word block subblock and/or a 2 word subblock. 1078 // bits 2 and 1 in the count are the tell-tale for whether we 1079 // have each such subblock 1080 { 1081 Label L1, L2; 1082 __ tbz(count, exact_log2(4), L1); 1083 // this is the same as above but copying only 4 longs hence 1084 // with only one intervening stp between the str instructions 1085 // but note that the offsets and registers still follow the 1086 // same pattern 1087 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1088 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 1089 if (direction == copy_forwards) { 1090 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1091 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1092 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3); 1093 } else { 1094 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1095 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1096 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2); 1097 } 1098 __ bind(L1); 1099 1100 __ tbz(count, 1, L2); 1101 // this is the same as above but copying only 2 longs hence 1102 // there is no intervening stp between the str instructions 1103 // but note that the offset and register patterns are still 1104 // the same 1105 
        // Copy the final 2-long sub-block (bit 1 of count), if present.
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.
  //
  // decorators/type - select the GC barrier behavior via ArrayCopyBarrierSetHelper
  // s, d            - source/destination addresses
  // count           - element count; only the low bits encoding < 16 bytes are used
  // step            - signed element size; negative means copy backwards
  //
  // NOTE(review): __ adjust appears to yield a pre/post-indexed address that
  // advances s and d as the copy proceeds — confirm against MacroAssembler.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    // Test each size bit of count, from word down to byte; each taken bit
    // copies one element of that size.  Sub-word sizes are emitted only
    // when the granularity makes them possible.
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
    bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  // Entry points of the shared bulk-copy stubs, branch-and-linked to by
  // copy_memory() below.
  Label copy_f, copy_b;
  Label copy_obj_f, copy_obj_b;
  Label copy_obj_uninit_f, copy_obj_uninit_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline.
    // Direction doesn't matter because we always load all the data
    // before writing anything.
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
    const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
    const Register send = r17, dend = r16;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    // send/dend point one-past-the-end of source and destination.
    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    // Overlapping front and back halves cover the whole range without
    // needing an exact length.
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue has more chances to happen when granularity of data is
      // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The most performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does means in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    // For a backwards copy, point s and d one-past-the-end so the copy
    // can walk down with negative offsets.
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // can not be used as a temp register, as it contains the count.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
      bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(r15, s, 2 * wordSize - 1);
      } else {
        __ neg(r15, s);
        __ andr(r15, r15, 2 * wordSize - 1);
      }
      // r15 is the byte adjustment needed to align s.
      __ cbz(r15, aligned);
      int shift = exact_log2(granularity);
      if (shift) __ lsr(r15, r15, shift);
      __ sub(count, count, r15);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, r15);
        __ sub(d, d, r15);
      } else {
        __ add(s, s, r15);
        __ add(d, d, r15);
      }
#else
      copy_memory_small(decorators, type, s, d, r15, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.  The bulk copy stubs take the
    // word count in r15.
    __ lsr(r15, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards) {
      if (type != T_OBJECT) {
        __ bl(copy_f);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_f);
      } else {
        __ bl(copy_obj_f);
      }
    } else {
      if (type != T_OBJECT) {
        __ bl(copy_b);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_b);
      } else {
        __ bl(copy_obj_b);
      }
    }

    // And the tail.
    copy_memory_small(decorators, type, s, d, count, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  // Debug-only helper: fill every call-clobbered general-purpose register
  // (except rscratch1, which holds the fill pattern) with 0xdeadbeefdeadbeef
  // so stale values are caught early.  No-op in product builds.
  void clobber_registers() {
#ifdef ASSERT
    RegSet clobbered
      = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
      __ mov(*it, rscratch1);
    }
#endif

  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
1453 void verify_oop_array (int size, Register a, Register count, Register temp) { 1454 Label loop, end; 1455 __ mov(rscratch1, a); 1456 __ mov(rscratch2, zr); 1457 __ bind(loop); 1458 __ cmp(rscratch2, count); 1459 __ br(Assembler::HS, end); 1460 if (size == wordSize) { 1461 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1462 __ verify_oop(temp); 1463 } else { 1464 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1465 __ decode_heap_oop(temp); // calls verify_oop 1466 } 1467 __ add(rscratch2, rscratch2, 1); 1468 __ b(loop); 1469 __ bind(end); 1470 } 1471 1472 // Arguments: 1473 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1474 // ignored 1475 // is_oop - true => oop array, so generate store check code 1476 // name - stub name string 1477 // 1478 // Inputs: 1479 // c_rarg0 - source array address 1480 // c_rarg1 - destination array address 1481 // c_rarg2 - element count, treated as ssize_t, can be zero 1482 // 1483 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1484 // the hardware handle it. The two dwords within qwords that span 1485 // cache line boundaries will still be loaded and stored atomically. 1486 // 1487 // Side Effects: 1488 // disjoint_int_copy_entry is set to the no-overlap entry point 1489 // used by generate_conjoint_int_oop_copy(). 
  //
  address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
1635 // 1636 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1637 const bool not_oop = false; 1638 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1639 } 1640 1641 // Arguments: 1642 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1643 // ignored 1644 // name - stub name string 1645 // 1646 // Inputs: 1647 // c_rarg0 - source array address 1648 // c_rarg1 - destination array address 1649 // c_rarg2 - element count, treated as ssize_t, can be zero 1650 // 1651 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1652 // we let the hardware handle it. The one to eight bytes within words, 1653 // dwords or qwords that span cache line boundaries will still be loaded 1654 // and stored atomically. 1655 // 1656 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1657 address* entry, const char *name) { 1658 const bool not_oop = false; 1659 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1660 } 1661 1662 // Arguments: 1663 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1664 // ignored 1665 // name - stub name string 1666 // 1667 // Inputs: 1668 // c_rarg0 - source array address 1669 // c_rarg1 - destination array address 1670 // c_rarg2 - element count, treated as ssize_t, can be zero 1671 // 1672 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1673 // let the hardware handle it. The two or four words within dwords 1674 // or qwords that span cache line boundaries will still be loaded 1675 // and stored atomically. 1676 // 1677 // Side Effects: 1678 // disjoint_short_copy_entry is set to the no-overlap entry point 1679 // used by generate_conjoint_short_copy(). 
1680 // 1681 address generate_disjoint_short_copy(bool aligned, 1682 address* entry, const char *name) { 1683 const bool not_oop = false; 1684 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1685 } 1686 1687 // Arguments: 1688 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1689 // ignored 1690 // name - stub name string 1691 // 1692 // Inputs: 1693 // c_rarg0 - source array address 1694 // c_rarg1 - destination array address 1695 // c_rarg2 - element count, treated as ssize_t, can be zero 1696 // 1697 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1698 // let the hardware handle it. The two or four words within dwords 1699 // or qwords that span cache line boundaries will still be loaded 1700 // and stored atomically. 1701 // 1702 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1703 address *entry, const char *name) { 1704 const bool not_oop = false; 1705 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1706 1707 } 1708 // Arguments: 1709 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1710 // ignored 1711 // name - stub name string 1712 // 1713 // Inputs: 1714 // c_rarg0 - source array address 1715 // c_rarg1 - destination array address 1716 // c_rarg2 - element count, treated as ssize_t, can be zero 1717 // 1718 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1719 // the hardware handle it. The two dwords within qwords that span 1720 // cache line boundaries will still be loaded and stored atomically. 1721 // 1722 // Side Effects: 1723 // disjoint_int_copy_entry is set to the no-overlap entry point 1724 // used by generate_conjoint_int_oop_copy(). 
1725 // 1726 address generate_disjoint_int_copy(bool aligned, address *entry, 1727 const char *name, bool dest_uninitialized = false) { 1728 const bool not_oop = false; 1729 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1730 } 1731 1732 // Arguments: 1733 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1734 // ignored 1735 // name - stub name string 1736 // 1737 // Inputs: 1738 // c_rarg0 - source array address 1739 // c_rarg1 - destination array address 1740 // c_rarg2 - element count, treated as ssize_t, can be zero 1741 // 1742 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1743 // the hardware handle it. The two dwords within qwords that span 1744 // cache line boundaries will still be loaded and stored atomically. 1745 // 1746 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1747 address *entry, const char *name, 1748 bool dest_uninitialized = false) { 1749 const bool not_oop = false; 1750 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1751 } 1752 1753 1754 // Arguments: 1755 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1756 // ignored 1757 // name - stub name string 1758 // 1759 // Inputs: 1760 // c_rarg0 - source array address 1761 // c_rarg1 - destination array address 1762 // c_rarg2 - element count, treated as size_t, can be zero 1763 // 1764 // Side Effects: 1765 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1766 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    // Element width follows the oop encoding in use.
    const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    // Element width follows the oop encoding in use.
    const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1, rscratch2.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    // Fast path first; the slow path is only reached if the fast path
    // can neither prove nor disprove the subtype relation.
    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  // Generate checkcasting array copy stub
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //   c_rarg3   - size_t ckoff (super_check_offset)
  //   c_rarg4   - oop ckval (super_klass)
  //
  // Output:
  //   r0 ==  0  -  success
  //   r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
    const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register r19_klass   = r19;       // oop._klass

    // Registers used as gc temps (r5, r6, r7 are save-on-call)
    const Register gct1 = r5, gct2 = r6, gct3 = r7;

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.
1896 1897 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1898 copied_oop, r19_klass, count_save); 1899 1900 __ align(CodeEntryAlignment); 1901 StubCodeMark mark(this, "StubRoutines", name); 1902 address start = __ pc(); 1903 1904 __ enter(); // required for proper stackwalking of RuntimeStub frame 1905 1906 #ifdef ASSERT 1907 // caller guarantees that the arrays really are different 1908 // otherwise, we would have to make conjoint checks 1909 { Label L; 1910 __ b(L); // conjoint check not yet implemented 1911 __ stop("checkcast_copy within a single array"); 1912 __ bind(L); 1913 } 1914 #endif //ASSERT 1915 1916 // Caller of this entry point must set up the argument registers. 1917 if (entry != nullptr) { 1918 *entry = __ pc(); 1919 BLOCK_COMMENT("Entry:"); 1920 } 1921 1922 // Empty array: Nothing to do. 1923 __ cbz(count, L_done); 1924 __ push(RegSet::of(r19, r20, r21, r22), sp); 1925 1926 #ifdef ASSERT 1927 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1928 // The ckoff and ckval must be mutually consistent, 1929 // even though caller generates both. 1930 { Label L; 1931 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1932 __ ldrw(start_to, Address(ckval, sco_offset)); 1933 __ cmpw(ckoff, start_to); 1934 __ br(Assembler::EQ, L); 1935 __ stop("super_check_offset inconsistent"); 1936 __ bind(L); 1937 } 1938 #endif //ASSERT 1939 1940 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1941 bool is_oop = true; 1942 int element_size = UseCompressedOops ? 
4 : 8; 1943 if (dest_uninitialized) { 1944 decorators |= IS_DEST_UNINITIALIZED; 1945 } 1946 1947 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1948 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1949 1950 // save the original count 1951 __ mov(count_save, count); 1952 1953 // Copy from low to high addresses 1954 __ mov(start_to, to); // Save destination array start address 1955 __ b(L_load_element); 1956 1957 // ======== begin loop ======== 1958 // (Loop is rotated; its entry is L_load_element.) 1959 // Loop control: 1960 // for (; count != 0; count--) { 1961 // copied_oop = load_heap_oop(from++); 1962 // ... generate_type_check ...; 1963 // store_heap_oop(to++, copied_oop); 1964 // } 1965 __ align(OptoLoopAlignment); 1966 1967 __ BIND(L_store_element); 1968 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1969 __ post(to, element_size), copied_oop, noreg, 1970 gct1, gct2, gct3); 1971 __ sub(count, count, 1); 1972 __ cbz(count, L_do_card_marks); 1973 1974 // ======== loop entry is here ======== 1975 __ BIND(L_load_element); 1976 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1977 copied_oop, noreg, __ post(from, element_size), 1978 gct1); 1979 __ cbz(copied_oop, L_store_element); 1980 1981 __ load_klass(r19_klass, copied_oop);// query the object klass 1982 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1983 // ======== end loop ======== 1984 1985 // It was a real error; we must depend on the caller to finish the job. 1986 // Register count = remaining oops, count_orig = total oops. 1987 // Emit GC store barriers for the oops we have copied and report 1988 // their number to the caller. 
1989 1990 __ subs(count, count_save, count); // K = partially copied oop count 1991 __ eon(count, count, zr); // report (-1^K) to caller 1992 __ br(Assembler::EQ, L_done_pop); 1993 1994 __ BIND(L_do_card_marks); 1995 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1996 1997 __ bind(L_done_pop); 1998 __ pop(RegSet::of(r19, r20, r21, r22), sp); 1999 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2000 2001 __ bind(L_done); 2002 __ mov(r0, count); 2003 __ leave(); 2004 __ ret(lr); 2005 2006 return start; 2007 } 2008 2009 // Perform range checks on the proposed arraycopy. 2010 // Kills temp, but nothing else. 2011 // Also, clean the sign bits of src_pos and dst_pos. 2012 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2013 Register src_pos, // source position (c_rarg1) 2014 Register dst, // destination array oo (c_rarg2) 2015 Register dst_pos, // destination position (c_rarg3) 2016 Register length, 2017 Register temp, 2018 Label& L_failed) { 2019 BLOCK_COMMENT("arraycopy_range_checks:"); 2020 2021 assert_different_registers(rscratch1, temp); 2022 2023 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2024 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 2025 __ addw(temp, length, src_pos); 2026 __ cmpw(temp, rscratch1); 2027 __ br(Assembler::HI, L_failed); 2028 2029 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2030 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2031 __ addw(temp, length, dst_pos); 2032 __ cmpw(temp, rscratch1); 2033 __ br(Assembler::HI, L_failed); 2034 2035 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 2036 __ movw(src_pos, src_pos); 2037 __ movw(dst_pos, dst_pos); 2038 2039 BLOCK_COMMENT("arraycopy_range_checks done"); 2040 } 2041 2042 // These stubs get called from some dumb test routine. 
2043 // I'll write them properly when they're called from 2044 // something that's actually doing something. 2045 static void fake_arraycopy_stub(address src, address dst, int count) { 2046 assert(count == 0, "huh?"); 2047 } 2048 2049 2050 // 2051 // Generate 'unsafe' array copy stub 2052 // Though just as safe as the other stubs, it takes an unscaled 2053 // size_t argument instead of an element count. 2054 // 2055 // Input: 2056 // c_rarg0 - source array address 2057 // c_rarg1 - destination array address 2058 // c_rarg2 - byte count, treated as ssize_t, can be zero 2059 // 2060 // Examines the alignment of the operands and dispatches 2061 // to a long, int, short, or byte copy loop. 2062 // 2063 address generate_unsafe_copy(const char *name, 2064 address byte_copy_entry, 2065 address short_copy_entry, 2066 address int_copy_entry, 2067 address long_copy_entry) { 2068 Label L_long_aligned, L_int_aligned, L_short_aligned; 2069 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2070 2071 __ align(CodeEntryAlignment); 2072 StubCodeMark mark(this, "StubRoutines", name); 2073 address start = __ pc(); 2074 __ enter(); // required for proper stackwalking of RuntimeStub frame 2075 2076 // bump this on entry, not on exit: 2077 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2078 2079 __ orr(rscratch1, s, d); 2080 __ orr(rscratch1, rscratch1, count); 2081 2082 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2083 __ cbz(rscratch1, L_long_aligned); 2084 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2085 __ cbz(rscratch1, L_int_aligned); 2086 __ tbz(rscratch1, 0, L_short_aligned); 2087 __ b(RuntimeAddress(byte_copy_entry)); 2088 2089 __ BIND(L_short_aligned); 2090 __ lsr(count, count, LogBytesPerShort); // size => short_count 2091 __ b(RuntimeAddress(short_copy_entry)); 2092 __ BIND(L_int_aligned); 2093 __ lsr(count, count, LogBytesPerInt); // size => int_count 2094 __ b(RuntimeAddress(int_copy_entry)); 2095 __ BIND(L_long_aligned); 2096 __ lsr(count, count, 
LogBytesPerLong); // size => long_count 2097 __ b(RuntimeAddress(long_copy_entry)); 2098 2099 return start; 2100 } 2101 2102 // 2103 // Generate generic array copy stubs 2104 // 2105 // Input: 2106 // c_rarg0 - src oop 2107 // c_rarg1 - src_pos (32-bits) 2108 // c_rarg2 - dst oop 2109 // c_rarg3 - dst_pos (32-bits) 2110 // c_rarg4 - element count (32-bits) 2111 // 2112 // Output: 2113 // r0 == 0 - success 2114 // r0 == -1^K - failure, where K is partial transfer count 2115 // 2116 address generate_generic_copy(const char *name, 2117 address byte_copy_entry, address short_copy_entry, 2118 address int_copy_entry, address oop_copy_entry, 2119 address long_copy_entry, address checkcast_copy_entry) { 2120 2121 Label L_failed, L_objArray; 2122 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2123 2124 // Input registers 2125 const Register src = c_rarg0; // source array oop 2126 const Register src_pos = c_rarg1; // source position 2127 const Register dst = c_rarg2; // destination array oop 2128 const Register dst_pos = c_rarg3; // destination position 2129 const Register length = c_rarg4; 2130 2131 2132 // Registers used as temps 2133 const Register dst_klass = c_rarg5; 2134 2135 __ align(CodeEntryAlignment); 2136 2137 StubCodeMark mark(this, "StubRoutines", name); 2138 2139 address start = __ pc(); 2140 2141 __ enter(); // required for proper stackwalking of RuntimeStub frame 2142 2143 // bump this on entry, not on exit: 2144 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2145 2146 //----------------------------------------------------------------------- 2147 // Assembler stub will be used for this call to arraycopy 2148 // if the following conditions are met: 2149 // 2150 // (1) src and dst must not be null. 2151 // (2) src_pos must not be negative. 2152 // (3) dst_pos must not be negative. 2153 // (4) length must not be negative. 2154 // (5) src klass and dst klass should be the same and not null. 2155 // (6) src and dst should be arrays. 
2156 // (7) src_pos + length must not exceed length of src. 2157 // (8) dst_pos + length must not exceed length of dst. 2158 // 2159 2160 // if (src == nullptr) return -1; 2161 __ cbz(src, L_failed); 2162 2163 // if (src_pos < 0) return -1; 2164 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2165 2166 // if (dst == nullptr) return -1; 2167 __ cbz(dst, L_failed); 2168 2169 // if (dst_pos < 0) return -1; 2170 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2171 2172 // registers used as temp 2173 const Register scratch_length = r16; // elements count to copy 2174 const Register scratch_src_klass = r17; // array klass 2175 const Register lh = r15; // layout helper 2176 2177 // if (length < 0) return -1; 2178 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2179 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2180 2181 __ load_klass(scratch_src_klass, src); 2182 #ifdef ASSERT 2183 // assert(src->klass() != nullptr); 2184 { 2185 BLOCK_COMMENT("assert klasses not null {"); 2186 Label L1, L2; 2187 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2188 __ bind(L1); 2189 __ stop("broken null klass"); 2190 __ bind(L2); 2191 __ load_klass(rscratch1, dst); 2192 __ cbz(rscratch1, L1); // this would be broken also 2193 BLOCK_COMMENT("} assert klasses not null done"); 2194 } 2195 #endif 2196 2197 // Load layout helper (32-bits) 2198 // 2199 // |array_tag| | header_size | element_type | |log2_element_size| 2200 // 32 30 24 16 8 2 0 2201 // 2202 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2203 // 2204 2205 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2206 2207 // Handle objArrays completely differently... 
2208 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2209 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2210 __ movw(rscratch1, objArray_lh); 2211 __ eorw(rscratch2, lh, rscratch1); 2212 __ cbzw(rscratch2, L_objArray); 2213 2214 // if (src->klass() != dst->klass()) return -1; 2215 __ load_klass(rscratch2, dst); 2216 __ eor(rscratch2, rscratch2, scratch_src_klass); 2217 __ cbnz(rscratch2, L_failed); 2218 2219 // if (!src->is_Array()) return -1; 2220 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2221 2222 // At this point, it is known to be a typeArray (array_tag 0x3). 2223 #ifdef ASSERT 2224 { 2225 BLOCK_COMMENT("assert primitive array {"); 2226 Label L; 2227 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2228 __ cmpw(lh, rscratch2); 2229 __ br(Assembler::GE, L); 2230 __ stop("must be a primitive array"); 2231 __ bind(L); 2232 BLOCK_COMMENT("} assert primitive array done"); 2233 } 2234 #endif 2235 2236 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2237 rscratch2, L_failed); 2238 2239 // TypeArrayKlass 2240 // 2241 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2242 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2243 // 2244 2245 const Register rscratch1_offset = rscratch1; // array offset 2246 const Register r15_elsize = lh; // element size 2247 2248 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2249 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2250 __ add(src, src, rscratch1_offset); // src array offset 2251 __ add(dst, dst, rscratch1_offset); // dst array offset 2252 BLOCK_COMMENT("choose copy loop based on element size"); 2253 2254 // next registers should be set before the jump to corresponding stub 2255 const Register from = c_rarg0; // source array address 2256 const Register to = c_rarg1; // destination array address 2257 const Register count = c_rarg2; // elements count 2258 2259 // 'from', 'to', 
'count' registers should be set in such order 2260 // since they are the same as 'src', 'src_pos', 'dst'. 2261 2262 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2263 2264 // The possible values of elsize are 0-3, i.e. exact_log2(element 2265 // size in bytes). We do a simple bitwise binary search. 2266 __ BIND(L_copy_bytes); 2267 __ tbnz(r15_elsize, 1, L_copy_ints); 2268 __ tbnz(r15_elsize, 0, L_copy_shorts); 2269 __ lea(from, Address(src, src_pos));// src_addr 2270 __ lea(to, Address(dst, dst_pos));// dst_addr 2271 __ movw(count, scratch_length); // length 2272 __ b(RuntimeAddress(byte_copy_entry)); 2273 2274 __ BIND(L_copy_shorts); 2275 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2276 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2277 __ movw(count, scratch_length); // length 2278 __ b(RuntimeAddress(short_copy_entry)); 2279 2280 __ BIND(L_copy_ints); 2281 __ tbnz(r15_elsize, 0, L_copy_longs); 2282 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2283 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2284 __ movw(count, scratch_length); // length 2285 __ b(RuntimeAddress(int_copy_entry)); 2286 2287 __ BIND(L_copy_longs); 2288 #ifdef ASSERT 2289 { 2290 BLOCK_COMMENT("assert long copy {"); 2291 Label L; 2292 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2293 __ cmpw(r15_elsize, LogBytesPerLong); 2294 __ br(Assembler::EQ, L); 2295 __ stop("must be long copy, but elsize is wrong"); 2296 __ bind(L); 2297 BLOCK_COMMENT("} assert long copy done"); 2298 } 2299 #endif 2300 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2301 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2302 __ movw(count, scratch_length); // length 2303 __ b(RuntimeAddress(long_copy_entry)); 2304 2305 // ObjArrayKlass 2306 __ BIND(L_objArray); 2307 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2308 2309 Label L_plain_copy, 
L_checkcast_copy; 2310 // test array classes for subtyping 2311 __ load_klass(r15, dst); 2312 __ cmp(scratch_src_klass, r15); // usual case is exact equality 2313 __ br(Assembler::NE, L_checkcast_copy); 2314 2315 // Identically typed arrays can be copied without element-wise checks. 2316 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2317 rscratch2, L_failed); 2318 2319 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2320 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2321 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2322 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2323 __ movw(count, scratch_length); // length 2324 __ BIND(L_plain_copy); 2325 __ b(RuntimeAddress(oop_copy_entry)); 2326 2327 __ BIND(L_checkcast_copy); 2328 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2329 { 2330 // Before looking at dst.length, make sure dst is also an objArray. 2331 __ ldrw(rscratch1, Address(r15, lh_offset)); 2332 __ movw(rscratch2, objArray_lh); 2333 __ eorw(rscratch1, rscratch1, rscratch2); 2334 __ cbnzw(rscratch1, L_failed); 2335 2336 // It is safe to examine both src.length and dst.length. 2337 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2338 r15, L_failed); 2339 2340 __ load_klass(dst_klass, dst); // reload 2341 2342 // Marshal the base address arguments now, freeing registers. 
2343 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2344 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2345 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2346 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2347 __ movw(count, length); // length (reloaded) 2348 Register sco_temp = c_rarg3; // this register is free now 2349 assert_different_registers(from, to, count, sco_temp, 2350 dst_klass, scratch_src_klass); 2351 // assert_clean_int(count, sco_temp); 2352 2353 // Generate the type check. 2354 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2355 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2356 2357 // Smashes rscratch1, rscratch2 2358 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2359 2360 // Fetch destination element klass from the ObjArrayKlass header. 2361 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2362 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2363 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2364 2365 // the checkcast_copy loop needs two extra arguments: 2366 assert(c_rarg3 == sco_temp, "#3 already in place"); 2367 // Set up arguments for checkcast_copy_entry. 2368 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2369 __ b(RuntimeAddress(checkcast_copy_entry)); 2370 } 2371 2372 __ BIND(L_failed); 2373 __ mov(r0, -1); 2374 __ leave(); // required for proper stackwalking of RuntimeStub frame 2375 __ ret(lr); 2376 2377 return start; 2378 } 2379 2380 // 2381 // Generate stub for array fill. If "aligned" is true, the 2382 // "to" address is assumed to be heapword aligned. 
2383 // 2384 // Arguments for generated stub: 2385 // to: c_rarg0 2386 // value: c_rarg1 2387 // count: c_rarg2 treated as signed 2388 // 2389 address generate_fill(BasicType t, bool aligned, const char *name) { 2390 __ align(CodeEntryAlignment); 2391 StubCodeMark mark(this, "StubRoutines", name); 2392 address start = __ pc(); 2393 2394 BLOCK_COMMENT("Entry:"); 2395 2396 const Register to = c_rarg0; // source array address 2397 const Register value = c_rarg1; // value 2398 const Register count = c_rarg2; // elements count 2399 2400 const Register bz_base = r10; // base for block_zero routine 2401 const Register cnt_words = r11; // temp register 2402 2403 __ enter(); 2404 2405 Label L_fill_elements, L_exit1; 2406 2407 int shift = -1; 2408 switch (t) { 2409 case T_BYTE: 2410 shift = 0; 2411 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2412 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2413 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2414 __ br(Assembler::LO, L_fill_elements); 2415 break; 2416 case T_SHORT: 2417 shift = 1; 2418 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2419 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2420 __ br(Assembler::LO, L_fill_elements); 2421 break; 2422 case T_INT: 2423 shift = 2; 2424 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2425 __ br(Assembler::LO, L_fill_elements); 2426 break; 2427 default: ShouldNotReachHere(); 2428 } 2429 2430 // Align source address at 8 bytes address boundary. 2431 Label L_skip_align1, L_skip_align2, L_skip_align4; 2432 if (!aligned) { 2433 switch (t) { 2434 case T_BYTE: 2435 // One byte misalignment happens only for byte arrays. 2436 __ tbz(to, 0, L_skip_align1); 2437 __ strb(value, Address(__ post(to, 1))); 2438 __ subw(count, count, 1); 2439 __ bind(L_skip_align1); 2440 // Fallthrough 2441 case T_SHORT: 2442 // Two bytes misalignment happens only for byte and short (char) arrays. 
2443 __ tbz(to, 1, L_skip_align2); 2444 __ strh(value, Address(__ post(to, 2))); 2445 __ subw(count, count, 2 >> shift); 2446 __ bind(L_skip_align2); 2447 // Fallthrough 2448 case T_INT: 2449 // Align to 8 bytes, we know we are 4 byte aligned to start. 2450 __ tbz(to, 2, L_skip_align4); 2451 __ strw(value, Address(__ post(to, 4))); 2452 __ subw(count, count, 4 >> shift); 2453 __ bind(L_skip_align4); 2454 break; 2455 default: ShouldNotReachHere(); 2456 } 2457 } 2458 2459 // 2460 // Fill large chunks 2461 // 2462 __ lsrw(cnt_words, count, 3 - shift); // number of words 2463 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2464 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2465 if (UseBlockZeroing) { 2466 Label non_block_zeroing, rest; 2467 // If the fill value is zero we can use the fast zero_words(). 2468 __ cbnz(value, non_block_zeroing); 2469 __ mov(bz_base, to); 2470 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2471 address tpc = __ zero_words(bz_base, cnt_words); 2472 if (tpc == nullptr) { 2473 fatal("CodeCache is full at generate_fill"); 2474 } 2475 __ b(rest); 2476 __ bind(non_block_zeroing); 2477 __ fill_words(to, cnt_words, value); 2478 __ bind(rest); 2479 } else { 2480 __ fill_words(to, cnt_words, value); 2481 } 2482 2483 // Remaining count is less than 8 bytes. Fill it by a single store. 2484 // Note that the total length is no less than 8 bytes. 2485 if (t == T_BYTE || t == T_SHORT) { 2486 Label L_exit1; 2487 __ cbzw(count, L_exit1); 2488 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2489 __ str(value, Address(to, -8)); // overwrite some elements 2490 __ bind(L_exit1); 2491 __ leave(); 2492 __ ret(lr); 2493 } 2494 2495 // Handle copies less than 8 bytes. 
2496 Label L_fill_2, L_fill_4, L_exit2; 2497 __ bind(L_fill_elements); 2498 switch (t) { 2499 case T_BYTE: 2500 __ tbz(count, 0, L_fill_2); 2501 __ strb(value, Address(__ post(to, 1))); 2502 __ bind(L_fill_2); 2503 __ tbz(count, 1, L_fill_4); 2504 __ strh(value, Address(__ post(to, 2))); 2505 __ bind(L_fill_4); 2506 __ tbz(count, 2, L_exit2); 2507 __ strw(value, Address(to)); 2508 break; 2509 case T_SHORT: 2510 __ tbz(count, 0, L_fill_4); 2511 __ strh(value, Address(__ post(to, 2))); 2512 __ bind(L_fill_4); 2513 __ tbz(count, 1, L_exit2); 2514 __ strw(value, Address(to)); 2515 break; 2516 case T_INT: 2517 __ cbzw(count, L_exit2); 2518 __ strw(value, Address(to)); 2519 break; 2520 default: ShouldNotReachHere(); 2521 } 2522 __ bind(L_exit2); 2523 __ leave(); 2524 __ ret(lr); 2525 return start; 2526 } 2527 2528 address generate_data_cache_writeback() { 2529 const Register line = c_rarg0; // address of line to write back 2530 2531 __ align(CodeEntryAlignment); 2532 2533 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2534 2535 address start = __ pc(); 2536 __ enter(); 2537 __ cache_wb(Address(line, 0)); 2538 __ leave(); 2539 __ ret(lr); 2540 2541 return start; 2542 } 2543 2544 address generate_data_cache_writeback_sync() { 2545 const Register is_pre = c_rarg0; // pre or post sync 2546 2547 __ align(CodeEntryAlignment); 2548 2549 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2550 2551 // pre wbsync is a no-op 2552 // post wbsync translates to an sfence 2553 2554 Label skip; 2555 address start = __ pc(); 2556 __ enter(); 2557 __ cbnz(is_pre, skip); 2558 __ cache_wbsync(false); 2559 __ bind(skip); 2560 __ leave(); 2561 __ ret(lr); 2562 2563 return start; 2564 } 2565 2566 void generate_arraycopy_stubs() { 2567 address entry; 2568 address entry_jbyte_arraycopy; 2569 address entry_jshort_arraycopy; 2570 address entry_jint_arraycopy; 2571 address entry_oop_arraycopy; 2572 address entry_jlong_arraycopy; 2573 address 
entry_checkcast_arraycopy; 2574 2575 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards); 2576 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards); 2577 2578 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards); 2579 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards); 2580 2581 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards); 2582 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards); 2583 2584 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2585 2586 //*** jbyte 2587 // Always need aligned and unaligned versions 2588 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2589 "jbyte_disjoint_arraycopy"); 2590 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2591 &entry_jbyte_arraycopy, 2592 "jbyte_arraycopy"); 2593 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2594 "arrayof_jbyte_disjoint_arraycopy"); 2595 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2596 "arrayof_jbyte_arraycopy"); 2597 2598 //*** jshort 2599 // Always need aligned and unaligned versions 2600 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2601 "jshort_disjoint_arraycopy"); 2602 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2603 &entry_jshort_arraycopy, 2604 "jshort_arraycopy"); 2605 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2606 "arrayof_jshort_disjoint_arraycopy"); 2607 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2608 "arrayof_jshort_arraycopy"); 2609 2610 //*** jint 2611 // Aligned 
versions 2612 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2613 "arrayof_jint_disjoint_arraycopy"); 2614 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2615 "arrayof_jint_arraycopy"); 2616 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2617 // entry_jint_arraycopy always points to the unaligned version 2618 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2619 "jint_disjoint_arraycopy"); 2620 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2621 &entry_jint_arraycopy, 2622 "jint_arraycopy"); 2623 2624 //*** jlong 2625 // It is always aligned 2626 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2627 "arrayof_jlong_disjoint_arraycopy"); 2628 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2629 "arrayof_jlong_arraycopy"); 2630 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2631 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2632 2633 //*** oops 2634 { 2635 // With compressed oops we need unaligned versions; notice that 2636 // we overwrite entry_oop_arraycopy. 
2637 bool aligned = !UseCompressedOops; 2638 2639 StubRoutines::_arrayof_oop_disjoint_arraycopy 2640 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2641 /*dest_uninitialized*/false); 2642 StubRoutines::_arrayof_oop_arraycopy 2643 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2644 /*dest_uninitialized*/false); 2645 // Aligned versions without pre-barriers 2646 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2647 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2648 /*dest_uninitialized*/true); 2649 StubRoutines::_arrayof_oop_arraycopy_uninit 2650 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2651 /*dest_uninitialized*/true); 2652 } 2653 2654 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2655 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2656 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2657 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2658 2659 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2660 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2661 /*dest_uninitialized*/true); 2662 2663 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2664 entry_jbyte_arraycopy, 2665 entry_jshort_arraycopy, 2666 entry_jint_arraycopy, 2667 entry_jlong_arraycopy); 2668 2669 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2670 entry_jbyte_arraycopy, 2671 entry_jshort_arraycopy, 2672 entry_jint_arraycopy, 2673 entry_oop_arraycopy, 2674 entry_jlong_arraycopy, 2675 entry_checkcast_arraycopy); 2676 2677 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2678 
StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2679 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2680 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2681 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2682 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2683 } 2684 2685 void generate_math_stubs() { Unimplemented(); } 2686 2687 // Arguments: 2688 // 2689 // Inputs: 2690 // c_rarg0 - source byte array address 2691 // c_rarg1 - destination byte array address 2692 // c_rarg2 - K (key) in little endian int array 2693 // 2694 address generate_aescrypt_encryptBlock() { 2695 __ align(CodeEntryAlignment); 2696 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2697 2698 const Register from = c_rarg0; // source array address 2699 const Register to = c_rarg1; // destination array address 2700 const Register key = c_rarg2; // key array address 2701 const Register keylen = rscratch1; 2702 2703 address start = __ pc(); 2704 __ enter(); 2705 2706 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2707 2708 __ aesenc_loadkeys(key, keylen); 2709 __ aesecb_encrypt(from, to, keylen); 2710 2711 __ mov(r0, 0); 2712 2713 __ leave(); 2714 __ ret(lr); 2715 2716 return start; 2717 } 2718 2719 // Arguments: 2720 // 2721 // Inputs: 2722 // c_rarg0 - source byte array address 2723 // c_rarg1 - destination byte array address 2724 // c_rarg2 - K (key) in little endian int array 2725 // 2726 address generate_aescrypt_decryptBlock() { 2727 assert(UseAES, "need AES cryptographic extension support"); 2728 __ align(CodeEntryAlignment); 2729 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2730 Label L_doLast; 2731 2732 const Register from = c_rarg0; // source array address 2733 const Register to = c_rarg1; // destination array address 
2734 const Register key = c_rarg2; // key array address 2735 const Register keylen = rscratch1; 2736 2737 address start = __ pc(); 2738 __ enter(); // required for proper stackwalking of RuntimeStub frame 2739 2740 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2741 2742 __ aesecb_decrypt(from, to, key, keylen); 2743 2744 __ mov(r0, 0); 2745 2746 __ leave(); 2747 __ ret(lr); 2748 2749 return start; 2750 } 2751 2752 // Arguments: 2753 // 2754 // Inputs: 2755 // c_rarg0 - source byte array address 2756 // c_rarg1 - destination byte array address 2757 // c_rarg2 - K (key) in little endian int array 2758 // c_rarg3 - r vector byte array address 2759 // c_rarg4 - input length 2760 // 2761 // Output: 2762 // x0 - input length 2763 // 2764 address generate_cipherBlockChaining_encryptAESCrypt() { 2765 assert(UseAES, "need AES cryptographic extension support"); 2766 __ align(CodeEntryAlignment); 2767 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2768 2769 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2770 2771 const Register from = c_rarg0; // source array address 2772 const Register to = c_rarg1; // destination array address 2773 const Register key = c_rarg2; // key array address 2774 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2775 // and left with the results of the last encryption block 2776 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2777 const Register keylen = rscratch1; 2778 2779 address start = __ pc(); 2780 2781 __ enter(); 2782 2783 __ movw(rscratch2, len_reg); 2784 2785 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2786 2787 __ ld1(v0, __ T16B, rvec); 2788 2789 __ cmpw(keylen, 52); 2790 __ br(Assembler::CC, L_loadkeys_44); 2791 __ br(Assembler::EQ, L_loadkeys_52); 2792 2793 __ ld1(v17, 
v18, __ T16B, __ post(key, 32)); 2794 __ rev32(v17, __ T16B, v17); 2795 __ rev32(v18, __ T16B, v18); 2796 __ BIND(L_loadkeys_52); 2797 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2798 __ rev32(v19, __ T16B, v19); 2799 __ rev32(v20, __ T16B, v20); 2800 __ BIND(L_loadkeys_44); 2801 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2802 __ rev32(v21, __ T16B, v21); 2803 __ rev32(v22, __ T16B, v22); 2804 __ rev32(v23, __ T16B, v23); 2805 __ rev32(v24, __ T16B, v24); 2806 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2807 __ rev32(v25, __ T16B, v25); 2808 __ rev32(v26, __ T16B, v26); 2809 __ rev32(v27, __ T16B, v27); 2810 __ rev32(v28, __ T16B, v28); 2811 __ ld1(v29, v30, v31, __ T16B, key); 2812 __ rev32(v29, __ T16B, v29); 2813 __ rev32(v30, __ T16B, v30); 2814 __ rev32(v31, __ T16B, v31); 2815 2816 __ BIND(L_aes_loop); 2817 __ ld1(v1, __ T16B, __ post(from, 16)); 2818 __ eor(v0, __ T16B, v0, v1); 2819 2820 __ br(Assembler::CC, L_rounds_44); 2821 __ br(Assembler::EQ, L_rounds_52); 2822 2823 __ aese(v0, v17); __ aesmc(v0, v0); 2824 __ aese(v0, v18); __ aesmc(v0, v0); 2825 __ BIND(L_rounds_52); 2826 __ aese(v0, v19); __ aesmc(v0, v0); 2827 __ aese(v0, v20); __ aesmc(v0, v0); 2828 __ BIND(L_rounds_44); 2829 __ aese(v0, v21); __ aesmc(v0, v0); 2830 __ aese(v0, v22); __ aesmc(v0, v0); 2831 __ aese(v0, v23); __ aesmc(v0, v0); 2832 __ aese(v0, v24); __ aesmc(v0, v0); 2833 __ aese(v0, v25); __ aesmc(v0, v0); 2834 __ aese(v0, v26); __ aesmc(v0, v0); 2835 __ aese(v0, v27); __ aesmc(v0, v0); 2836 __ aese(v0, v28); __ aesmc(v0, v0); 2837 __ aese(v0, v29); __ aesmc(v0, v0); 2838 __ aese(v0, v30); 2839 __ eor(v0, __ T16B, v0, v31); 2840 2841 __ st1(v0, __ T16B, __ post(to, 16)); 2842 2843 __ subw(len_reg, len_reg, 16); 2844 __ cbnzw(len_reg, L_aes_loop); 2845 2846 __ st1(v0, __ T16B, rvec); 2847 2848 __ mov(r0, rscratch2); 2849 2850 __ leave(); 2851 __ ret(lr); 2852 2853 return start; 2854 } 2855 2856 // Arguments: 2857 // 2858 // Inputs: 2859 // c_rarg0 - source 
  //               byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from    = c_rarg0;  // source array address
    const Register to      = c_rarg1;  // destination array address
    const Register key     = c_rarg2;  // key array address
    const Register rvec    = c_rarg3;  // r byte array initialized from initvector array address
                                       // and left with the results of the last encryption block
    const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen  = rscratch1;

    address start = __ pc();

    __ enter();

    // Stash the input length; len_reg is consumed by the loop below.
    __ movw(rscratch2, len_reg);

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    // v2 holds the previous ciphertext block (initially the IV).
    __ ld1(v2, __ T16B, rvec);

    __ ld1(v31, __ T16B, __ post(key, 16));
    __ rev32(v31, __ T16B, v31);

    // NB: the condition flags set here are re-tested inside L_aes_loop
    // (no intervening flag-setting instruction), selecting the round
    // count for 128/192/256-bit keys.
    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);

    __ BIND(L_aes_loop);
    __ ld1(v0, __ T16B, __ post(from, 16));
    __ orr(v1, __ T16B, v0, v0);        // keep a copy of this ciphertext block in v1

    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aesd(v0, v17); __ aesimc(v0, v0);
    __ aesd(v0, v18); __ aesimc(v0, v0);
    __ BIND(L_rounds_52);
    __ aesd(v0, v19); __ aesimc(v0, v0);
    __ aesd(v0, v20); __ aesimc(v0, v0);
    __ BIND(L_rounds_44);
    __ aesd(v0, v21); __ aesimc(v0, v0);
    __ aesd(v0, v22); __ aesimc(v0, v0);
    __ aesd(v0, v23); __ aesimc(v0, v0);
    __ aesd(v0, v24); __ aesimc(v0, v0);
    __ aesd(v0, v25); __ aesimc(v0, v0);
    __ aesd(v0, v26); __ aesimc(v0, v0);
    __ aesd(v0, v27); __ aesimc(v0, v0);
    __ aesd(v0, v28); __ aesimc(v0, v0);
    __ aesd(v0, v29); __ aesimc(v0, v0);
    __ aesd(v0, v30);
    __ eor(v0, __ T16B, v0, v31);       // final round: AddRoundKey only
    __ eor(v0, __ T16B, v0, v2);        // CBC: XOR with previous ciphertext block

    __ st1(v0, __ T16B, __ post(to, 16));
    __ orr(v2, __ T16B, v1, v1);        // this ciphertext block becomes the next "previous"

    __ subw(len_reg, len_reg, 16);
    __ cbnzw(len_reg, L_aes_loop);

    __ st1(v2, __ T16B, rvec);          // save last ciphertext block as next IV

    __ mov(r0, rscratch2);              // return original input length

    __ leave();
    __ ret(lr);

    return start;
  }

  // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: 128-bits. in is preserved.
  // The least-significant 64-bit word is in the upper dword of each vector.
  // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
  // Output: result
  void be_add_128_64(FloatRegister result, FloatRegister in,
                     FloatRegister inc, FloatRegister tmp) {
    assert_different_registers(result, tmp, inc);

    __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
                                           // input
    __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
    __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
                                           // MSD == 0 (must be!) to LSD
    __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
  }

  // CTR AES crypt.
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - counter vector byte array address
  //   c_rarg4   - input length
  //   c_rarg5   - saved encryptedCounter start
  //   c_rarg6   - saved used length
  //
  // Output:
  //   r0        - input length
  //
  address generate_counterMode_AESCrypt() {
    const Register in                 = c_rarg0;
    const Register out                = c_rarg1;
    const Register key                = c_rarg2;
    const Register counter            = c_rarg3;
    const Register saved_len          = c_rarg4, len = r10;
    const Register saved_encrypted_ctr = c_rarg5;
    const Register used_ptr           = c_rarg6, used = r12;

    const Register offset             = r7;
    const Register keylen             = r11;

    const unsigned char block_size = 16;
    const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8. 8 gives slightly faster
    // performance with larger data sizes, but it also means that the
    // fast path isn't used until you have at least 8 blocks, and up
    // to 127 bytes of data will be executed on the slow path. For
    // that reason, and also so as not to blow away too much icache, 4
    // blocks seems like a sensible compromise.

    // Algorithm:
    //
    //    if (len == 0) {
    //      goto DONE;
    //    }
    //    int result = len;
    //    do {
    //      if (used >= blockSize) {
    //        if (len >= bulk_width * blockSize) {
    //          CTR_large_block();
    //          if (len == 0)
    //            goto DONE;
    //        }
    //        for (;;) {
    //          16ByteVector v0 = counter;
    //          embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
    //          used = 0;
    //          if (len < blockSize)
    //            break;    /* goto NEXT */
    //          16ByteVector v1 = load16Bytes(in, offset);
    //          v1 = v1 ^ encryptedCounter;
    //          store16Bytes(out, offset);
    //          used = blockSize;
    //          offset += blockSize;
    //          len -= blockSize;
    //          if (len == 0)
    //            goto DONE;
    //        }
    //      }
    //  NEXT:
    //      out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
    //      len--;
    //    } while (len != 0);
    //  DONE:
    //    return result;
    //
    // CTR_large_block()
    //    Wide bulk encryption of whole blocks.

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
    const address start = __ pc();
    __ enter();

    Label DONE, CTR_large_block, large_block_return;
    __ ldrw(used, Address(used_ptr));
    __ cbzw(saved_len, DONE);

    __ mov(len, saved_len);
    __ mov(offset, 0);

    // Compute #rounds for AES based on the length of the key array
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ aesenc_loadkeys(key, keylen);

    {
      Label L_CTR_loop, NEXT;

      __ bind(L_CTR_loop);

      __ cmp(used, block_size);
      __ br(__ LO, NEXT);

      // Maybe we have a lot of data
      __ subsw(rscratch1, len, bulk_width * block_size);
      __ br(__ HS, CTR_large_block);
      __ BIND(large_block_return);
      __ cbzw(len, DONE);

      // Setup the counter
      __ movi(v4, __ T4S, 0);
      __ movi(v5, __ T4S, 1);
      __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }

      // 128-bit big-endian increment
      __ ld1(v0, __ T16B, counter);
      __ rev64(v16, __ T16B, v0);
      be_add_128_64(v16, v16, v4, /*tmp*/v5);
      __ rev64(v16, __ T16B, v16);
      __ st1(v16, __ T16B, counter);
      // Previous counter value is in v0
      // v4 contains { 0, 1 }

      {
        // We have fewer than bulk_width blocks of data left. Encrypt
        // them one by one until there is less than a full block
        // remaining, being careful to save both the encrypted counter
        // and the counter.

        Label inner_loop;
        __ bind(inner_loop);
        // Counter to encrypt is in v0
        __ aesecb_encrypt(noreg, noreg, keylen);
        __ st1(v0, __ T16B, saved_encrypted_ctr);

        // Do we have a remaining full block?

        __ mov(used, 0);
        __ cmp(len, block_size);
        __ br(__ LO, NEXT);

        // Yes, we have a full block
        __ ldrq(v1, Address(in, offset));
        __ eor(v1, __ T16B, v1, v0);
        __ strq(v1, Address(out, offset));
        __ mov(used, block_size);
        __ add(offset, offset, block_size);

        __ subw(len, len, block_size);
        __ cbzw(len, DONE);

        // Increment the counter, store it back
        __ orr(v0, __ T16B, v16, v16);
        __ rev64(v16, __ T16B, v16);
        be_add_128_64(v16, v16, v4, /*tmp*/v5);
        __ rev64(v16, __ T16B, v16);
        __ st1(v16, __ T16B, counter); // Save the incremented counter back

        __ b(inner_loop);
      }

      __ BIND(NEXT);

      // Encrypt a single byte, and loop.
      // We expect this to be a rare event.
      __ ldrb(rscratch1, Address(in, offset));
      __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
      __ eor(rscratch1, rscratch1, rscratch2);
      __ strb(rscratch1, Address(out, offset));
      __ add(offset, offset, 1);
      __ add(used, used, 1);
      __ subw(len, len, 1);
      __ cbnzw(len, L_CTR_loop);
    }

    __ bind(DONE);
    __ strw(used, Address(used_ptr));
    __ mov(r0, saved_len);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    // Bulk encryption

    __ BIND (CTR_large_block);
    assert(bulk_width == 4 || bulk_width == 8, "must be");

    // Save callee-saved SIMD registers we are about to clobber.
    if (bulk_width == 8) {
      __ sub(sp, sp, 4 * 16);
      __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
    }
    __ sub(sp, sp, 4 * 16);
    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
    RegSet saved_regs = (RegSet::of(in, out, offset)
                         + RegSet::of(saved_encrypted_ctr, used_ptr, len));
    __ push(saved_regs, sp);
    __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
    __ add(in, in, offset);
    __ add(out, out, offset);

    // Keys should already be loaded into the correct registers

    __ ld1(v0, __ T16B, counter); // v0 contains the first counter
    __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter

    // AES/CTR loop
    {
      Label L_CTR_loop;
      __ BIND(L_CTR_loop);

      // Setup the counters
      __ movi(v8, __ T4S, 0);
      __ movi(v9, __ T4S, 1);
      __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }

      // Materialize bulk_width consecutive counter values in v0..
      for (int i = 0; i < bulk_width; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        __ rev64(v0_ofs, __ T16B, v16);
        be_add_128_64(v16, v16, v8, /*tmp*/v9);
      }

      __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

      // Encrypt the counters
      __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);

      if (bulk_width == 8) {
        __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
      }

      // XOR the encrypted counters with the inputs
      for (int i = 0; i < bulk_width; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
        __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
      }

      // Write the encrypted data
      __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
      if (bulk_width == 8) {
        __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
      }

      __ subw(len, len, 16 * bulk_width);
      __ cbnzw(len, L_CTR_loop);
    }

    // Save the counter back where it goes
    __ rev64(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);

    __ pop(saved_regs, sp);

    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
    if (bulk_width == 8) {
      __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
    }

    // Account for the bytes processed in bulk, then resume the slow
    // path above via large_block_return.
    __ andr(rscratch1, len, -16 * bulk_width);
    __ sub(len, len, rscratch1);
    __ add(offset, offset, rscratch1);
    __ mov(used, 16);
    __ strw(used, Address(used_ptr));
    __ b(large_block_return);

    return start;
  }

  // Vector AES Galois Counter Mode implementation. Parameters:
  //
  // in = c_rarg0
  // len = c_rarg1
  // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
  // out = c_rarg3
  // key = c_rarg4
  // state = c_rarg5 - GHASH.state
  // subkeyHtbl = c_rarg6 - powers of H
  // counter = c_rarg7 - 16 bytes of CTR
  // return - number of processed bytes
  address generate_galoisCounterMode_AESCrypt() {
    address ghash_polynomial = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
    address start = __ pc();
    __ enter();

    const Register in = c_rarg0;
    const Register len = c_rarg1;
    const Register ct = c_rarg2;
    const Register out = c_rarg3;
    // and updated with the incremented counter in the end

    const Register key = c_rarg4;
    const Register state = c_rarg5;

    const Register subkeyHtbl = c_rarg6;

    const Register counter = c_rarg7;

    const Register keylen = r10;
    // Save state before entering routine
    __ sub(sp, sp, 4 * 16);
    __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
    __ sub(sp, sp, 4 * 16);
    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));

    // __ andr(len, len, -512);
    __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
    __ str(len, __ pre(sp, -2 * wordSize));

    Label DONE;
    __ cbz(len, DONE);

    // Compute #rounds for AES based on the length of the key array
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ aesenc_loadkeys(key, keylen);
    __ ld1(v0, __ T16B, counter); // v0 contains the first counter
    __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter

    // AES/CTR loop
    {
      Label L_CTR_loop;
      __ BIND(L_CTR_loop);

      // Setup the counters
      __ movi(v8, __ T4S, 0);
      __ movi(v9, __ T4S, 1);
      __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }

      // Materialize eight consecutive counter values in v0..v7.
      assert(v0->encoding() < v8->encoding(), "");
      for (int i = v0->encoding(); i < v8->encoding(); i++) {
        FloatRegister f = as_FloatRegister(i);
        __ rev32(f, __ T16B, v16);
        __ addv(v16, __ T4S, v16, v8);
      }

      __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

      // Encrypt the counters
      __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);

      __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));

      // XOR the encrypted counters with the inputs
      for (int i = 0; i < 8; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
        __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
      }
      __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
      __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));

      __ subw(len, len, 16 * 8);
      __ cbnzw(len, L_CTR_loop);
    }

    __ rev32(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);

    __ ldr(len, Address(sp));
    __ lsr(len, len, exact_log2(16)); // We want the count of blocks

    // GHASH/CTR loop
    __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
                                len, /*unrolls*/4);

#ifdef ASSERT
    { Label L;
      __ cmp(len, (unsigned char)0);
      __ br(Assembler::EQ, L);
      __ stop("stubGenerator: abort");
      __ bind(L);
    }
#endif

    __ bind(DONE);
    // Return the number of bytes processed
    __ ldr(r0, __ post(sp, 2 * wordSize));

    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
    __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);
    return start;
  }

  // Caches 64 bytes of input in eight general-purpose registers,
  // loaded pairwise; extract_u32 emits code to pull out one 4-byte
  // word from the cache.
  class Cached64Bytes {
  private:
    MacroAssembler *_masm;
    Register _regs[8];

  public:
    Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size());
      auto it = rs.begin();
      for (auto &r: _regs) {
        r = *it;
        ++it;
      }
    }

    void gen_loads(Register base) {
      for (int i = 0; i < 8; i += 2) {
        __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
      }
    }

    // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
    void extract_u32(Register dest, int i) {
      __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
    }
  };

  // Utility routines for md5.
  // Clobbers r10 and r11.
  void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ eorw(rscratch3, r3, r4);
    __ movw(rscratch2, t);
    __ andw(rscratch3, rscratch3, r2);
    __ addw(rscratch4, r1, rscratch2);
    reg_cache.extract_u32(rscratch1, k);
    __ eorw(rscratch3, rscratch3, r4);
    __ addw(rscratch4, rscratch4, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);
    __ addw(r1, rscratch2, r2);
  }

  void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ andw(rscratch3, r2, r4);
    __ bicw(rscratch4, r3, r4);
    reg_cache.extract_u32(rscratch1, k);
    __ movw(rscratch2, t);
    __ orrw(rscratch3, rscratch3, rscratch4);
    __ addw(rscratch4, r1, rscratch2);
    __ addw(rscratch4, rscratch4, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);
    __ addw(r1, rscratch2, r2);
  }

  void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ eorw(rscratch3, r3, r4);
    __ movw(rscratch2, t);
    __ addw(rscratch4, r1, rscratch2);
    reg_cache.extract_u32(rscratch1, k);
    __ eorw(rscratch3, rscratch3, r2);
    __ addw(rscratch4, rscratch4, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);
    __ addw(r1, rscratch2, r2);
  }

  void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ movw(rscratch3, t);
    __ ornw(rscratch2, r2, r4);
    __ addw(rscratch4, r1, rscratch3);
    reg_cache.extract_u32(rscratch1, k);
    __ eorw(rscratch3, rscratch2, r3);
    __ addw(rscratch4, rscratch4, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);
    __ addw(r1, rscratch2, r2);
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_md5_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf       = c_rarg0;
    Register state     = c_rarg1;
    Register ofs       = c_rarg2;
    Register limit     = c_rarg3;
    Register a         = r4;
    Register b         = r5;
    Register c         = r6;
    Register d         = r7;
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    Register state_regs[2] = { r12, r13 };
    RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
    Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers

    __ push(saved_regs, sp);

    // Unpack the four 32-bit state words from the two 64-bit loads.
    __ ldp(state_regs[0], state_regs[1], Address(state));
    __ ubfx(a, state_regs[0],  0, 32);
    __ ubfx(b, state_regs[0], 32, 32);
    __ ubfx(c, state_regs[1],  0, 32);
    __ ubfx(d, state_regs[1], 32, 32);

    Label md5_loop;
    __ BIND(md5_loop);

    reg_cache.gen_loads(buf);

    // Round 1
    md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
    md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
    md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
    md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
    md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
    md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
    md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
    md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
    md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
    md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
    md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
    md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
    md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
    md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
    md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
    md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);

    // Round 2
    md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
    md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
    md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
    md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
    md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
    md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
    md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
    md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
    md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
    md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
    md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
    md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
    md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
    md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
    md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
    md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);

    // Round 3
    md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
    md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
    md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
    md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
    md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
    md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
    md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
    md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
    md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
    md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
    md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
    md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
    md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
    md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
    md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
    md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);

    // Round 4
    md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
    md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
    md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
    md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
    md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
    md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
    md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
    md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
    md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
    md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
    md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
    md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
    md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
    md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
    md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
    md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);

    // Add this block's result into the running state words.
    __ addw(a, state_regs[0], a);
    __ ubfx(rscratch2, state_regs[0], 32, 32);
    __ addw(b, rscratch2, b);
    __ addw(c, state_regs[1], c);
    __ ubfx(rscratch4, state_regs[1], 32, 32);
    __ addw(d, rscratch4, d);

    __ orr(state_regs[0], a, b, Assembler::LSL, 32);
    __ orr(state_regs[1], c, d, Assembler::LSL, 32);

    if (multi_block) {
      __ add(buf, buf, 64);
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, md5_loop);
      __ mov(c_rarg0, ofs);  // return ofs
    }

    // write hash values back in the correct order
    __ stp(state_regs[0], state_regs[1], Address(state));

    __ pop(saved_regs, sp);

    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha1_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label keys;
    Label sha1_loop;

    // load the keys into v0..v3
    __ adr(rscratch1, keys);
    __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load 5 words state into v6, v7
    __ ldrq(v6, Address(state, 0));
    __ ldrs(v7, Address(state, 16));


    __ BIND(sha1_loop);
    // load 64 bytes of data into v16..v19
    __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
    __ rev32(v16, __ T16B, v16);
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ rev32(v19, __ T16B, v19);

    // do the sha1
    __ addv(v4, __ T4S, v16, v0);
    __ orr(v20, __ T16B, v6, v6);

    FloatRegister d0 = v16;
    FloatRegister d1 = v17;
    FloatRegister d2 = v18;
    FloatRegister d3 = v19;

    // 20 iterations of 4 SHA-1 rounds each, with register rotation
    // of the message schedule (d0..d3) at the bottom.
    for (int round = 0; round < 20; round++) {
      FloatRegister tmp1 = (round & 1) ? v4 : v5;
      FloatRegister tmp2 = (round & 1) ? v21 : v22;
      FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
      FloatRegister tmp4 = (round & 1) ? v5 : v4;
      FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));

      if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
      if (round < 19) __ addv(tmp1, __ T4S, d1, key);
      __ sha1h(tmp2, __ T4S, v20);
      if (round < 5)
        __ sha1c(v20, __ T4S, tmp3, tmp4);
      else if (round < 10 || round >= 15)
        __ sha1p(v20, __ T4S, tmp3, tmp4);
      else
        __ sha1m(v20, __ T4S, tmp3, tmp4);
      if (round < 16) __ sha1su1(d0, __ T4S, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    __ addv(v7, __ T2S, v7, v21);
    __ addv(v6, __ T4S, v6, v20);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs);  // return ofs
    }

    __ strq(v6, Address(state, 0));
    __ strs(v7, Address(state, 16));

    __ ret(lr);

    // Emit the four SHA-1 stage constants inline after the code.
    __ bind(keys);
    __ emit_int32(0x5a827999);
    __ emit_int32(0x6ed9eba1);
    __ emit_int32(0x8f1bbcdc);
    __ emit_int32(0xca62c1d6);

    return start;
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha256_implCompress(bool multi_block, const char *name) {
    static const uint32_t round_consts[64] = {
      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf = c_rarg0;
    Register state = c_rarg1;
    Register ofs = c_rarg2;
    Register limit = c_rarg3;

    // Note: the label name is inherited from the SHA-1 stub; this is the
    // per-64-byte-block compression loop of the SHA-256 stub.
    Label sha1_loop;

    // v8..v11 are callee-saved; preserve them across the stub.
    __ stpd(v8, v9, __ pre(sp, -32));
    __ stpd(v10, v11, Address(sp, 16));

    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7

    // load 16 keys to v16..v31
    __ lea(rscratch1, ExternalAddress((address)round_consts));
    __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
    __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
    __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
    __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);

    // load 8 words (256 bits) state
    __ ldpq(v0, v1, state);

    __ BIND(sha1_loop);
    // load 64 bytes of data into v8..v11
    __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
    __ rev32(v8, __ T16B, v8);
    __ rev32(v9, __ T16B, v9);
    __ rev32(v10, __ T16B, v10);
    __ rev32(v11, __ T16B, v11);

    __ addv(v6, __ T4S, v8, v16);
    __ orr(v2, __ T16B, v0, v0);
    __ orr(v3, __ T16B, v1, v1);

    FloatRegister d0 = v8;
    FloatRegister d1 = v9;
    FloatRegister d2 = v10;
    FloatRegister d3 = v11;

    // 16 iterations; SHA256H/SHA256H2 advance four of the 64 rounds each.
    for (int round = 0; round < 16; round++) {
      FloatRegister tmp1 = (round & 1) ? v6 : v7;
      FloatRegister tmp2 = (round & 1) ? v7 : v6;
      FloatRegister tmp3 = (round & 1) ? v2 : v4;
      FloatRegister tmp4 = (round & 1) ? v4 : v2;

      if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
      if (round < 15)
        // as_FloatRegister(round + 17) walks the pre-loaded key
        // registers v17..v31.
        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
      __ sha256h(v2, __ T4S, v3, tmp2);
      __ sha256h2(v3, __ T4S, v4, tmp2);
      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);

      // rotate the message-schedule registers
      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    __ addv(v0, __ T4S, v0, v2);
    __ addv(v1, __ T4S, v1, v3);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // restore callee-saved FP registers
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 32));

    __ stpq(v0, v1, state);

    __ ret(lr);

    return start;
  }

  // Double rounds for sha512.
  // Emits one SHA512H/SHA512H2 double-round; for dr < 36 it also preloads
  // the next round-constant pair (via rscratch2) and for dr < 32 it extends
  // the message schedule with SHA512SU0/SHA512SU1.
  void sha512_dround(int dr,
                     FloatRegister vi0, FloatRegister vi1,
                     FloatRegister vi2, FloatRegister vi3,
                     FloatRegister vi4, FloatRegister vrc0,
                     FloatRegister vrc1, FloatRegister vin0,
                     FloatRegister vin1, FloatRegister vin2,
                     FloatRegister vin3, FloatRegister vin4) {
    if (dr < 36) {
      __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
    }
    __ addv(v5, __ T2D, vrc0, vin0);
    __ ext(v6, __ T16B, vi2, vi3, 8);
    __ ext(v5, __ T16B, v5, v5, 8);
    __ ext(v7, __ T16B, vi1, vi2, 8);
    __ addv(vi3, __ T2D, vi3, v5);
    if (dr < 32) {
      __ ext(v5, __ T16B, vin3, vin4, 8);
      __ sha512su0(vin0, __ T2D, vin1);
    }
    __ sha512h(vi3, __ T2D, v6, v7);
    if (dr < 32) {
      __ sha512su1(vin0, __ T2D, vin2, v5);
    }
    __ addv(vi4, __ T2D, vi1, vi3);
    __ sha512h2(vi3, __ T2D, vi1, vi0);
  }

  // SHA-512 compression using the AArch64 SHA512 crypto extension.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha512_implCompress(bool multi_block, const char *name) {
    // The 80 SHA-512 round constants.
    static const uint64_t round_consts[80] = {
      0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
      0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
      0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
      0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
      0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
      0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
      0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
      0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
      0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
      0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
      0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
      0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
      0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
      0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
      0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
      0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
      0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
      0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
      0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
      0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
      0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
      0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
      0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
      0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
      0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
      0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
      0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf = c_rarg0;
Register state = c_rarg1;
    Register ofs = c_rarg2;
    Register limit = c_rarg3;

    // v8..v15 are callee-saved; preserve them across the stub.
    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    Label sha512_loop;

    // load state
    __ ld1(v8, v9, v10, v11, __ T2D, state);

    // load first 4 round constants
    __ lea(rscratch1, ExternalAddress((address)round_consts));
    __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));

    __ BIND(sha512_loop);
    // load 128B of data into v12..v19
    __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
    __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
    __ rev64(v12, __ T16B, v12);
    __ rev64(v13, __ T16B, v13);
    __ rev64(v14, __ T16B, v14);
    __ rev64(v15, __ T16B, v15);
    __ rev64(v16, __ T16B, v16);
    __ rev64(v17, __ T16B, v17);
    __ rev64(v18, __ T16B, v18);
    __ rev64(v19, __ T16B, v19);

    // rscratch2 walks the remaining round constants inside sha512_dround
    __ mov(rscratch2, rscratch1);

    // working copy of the state
    __ mov(v0, __ T16B, v8);
    __ mov(v1, __ T16B, v9);
    __ mov(v2, __ T16B, v10);
    __ mov(v3, __ T16B, v11);

    // 40 fully-unrolled double rounds == 80 SHA-512 rounds
    sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
    sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
    sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
    sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
    sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
    sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
    sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
    sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
    sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
    sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
    sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
    sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
    sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
    sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
    sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
    sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
    sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
    // From round 32 on the message schedule is no longer extended, so the
    // schedule-source arguments are don't-cares (passed as v0).
    sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
    sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
    sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
    sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
    sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
    sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
    sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
    sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);

    __ addv(v8, __ T2D, v8, v0);
    __ addv(v9, __ T2D, v9, v1);
    __ addv(v10, __ T2D, v10, v2);
    __ addv(v11, __ T2D, v11, v3);

    if (multi_block) {
      __ add(ofs, ofs, 128);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha512_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ st1(v8, v9, v10, v11, __ T2D, state);

    // restore callee-saved FP registers
    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ ret(lr);

    return start;
  }

  // SHA3 absorb/permute using the AArch64 SHA3 extension
  // (EOR3/RAX1/XAR/BCAX).
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - byte[]  SHA.state
  //   c_rarg2   - int     block_size
  //   c_rarg3   - int     offset
  //   c_rarg4   - int     limit
  //
  address generate_sha3_implCompress(bool multi_block, const char *name) {
    // The 24 Keccak round constants.
    static const uint64_t round_consts[24] = {
      0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
      0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
      0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
      0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
      0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
      0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
      0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
      0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf = c_rarg0;
    Register state = c_rarg1;
    Register block_size = c_rarg2;
Register ofs = c_rarg3;
    Register limit = c_rarg4;

    Label sha3_loop, rounds24_loop;
    Label sha3_512_or_sha3_384, shake128;

    // v8..v15 are callee-saved; preserve them across the stub.
    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    // load state: the 25 64-bit Keccak lanes into v0..v24 (one lane each)
    __ add(rscratch1, state, 32);
    __ ld1(v0, v1, v2, v3, __ T1D, state);
    __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
    __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
    __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
    __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
    __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
    __ ld1(v24, __ T1D, rscratch1);

    __ BIND(sha3_loop);

    // 24 keccak rounds
    __ movw(rscratch2, 24);

    // load round_constants base
    __ lea(rscratch1, ExternalAddress((address) round_consts));

    // load input: absorb (XOR) the first 56 bytes of the block into the
    // state; every supported block_size is at least 72, so this much is
    // unconditional.
    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
    __ eor(v0, __ T8B, v0, v25);
    __ eor(v1, __ T8B, v1, v26);
    __ eor(v2, __ T8B, v2, v27);
    __ eor(v3, __ T8B, v3, v28);
    __ eor(v4, __ T8B, v4, v29);
    __ eor(v5, __ T8B, v5, v30);
    __ eor(v6, __ T8B, v6, v31);

    // block_size == 72, SHA3-512; block_size == 104, SHA3-384
    // (both have bit 7 clear, i.e. block_size < 128)
    __ tbz(block_size, 7, sha3_512_or_sha3_384);

    // block_size >= 136: absorb bytes 56..135
    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
    __ eor(v7, __ T8B, v7, v25);
    __ eor(v8, __ T8B, v8, v26);
    __ eor(v9, __ T8B, v9, v27);
    __ eor(v10, __ T8B, v10, v28);
    __ eor(v11, __ T8B, v11, v29);
    __ eor(v12, __ T8B, v12, v30);
    __ eor(v13, __ T8B, v13, v31);

    __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
    __ eor(v14, __ T8B, v14, v25);
    __ eor(v15, __ T8B, v15, v26);
    __ eor(v16, __ T8B, v16, v27);
// block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
    __ andw(c_rarg5, block_size, 48);   // c_rarg5 used as a scratch register
    __ cbzw(c_rarg5, rounds24_loop);

    __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
    __ ldrd(v28, __ post(buf, 8));
    __ eor(v17, __ T8B, v17, v28);
    __ b(rounds24_loop);

    __ BIND(shake128);
    __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
    __ eor(v17, __ T8B, v17, v28);
    __ eor(v18, __ T8B, v18, v29);
    __ eor(v19, __ T8B, v19, v30);
    __ eor(v20, __ T8B, v20, v31);
    __ b(rounds24_loop); // block_size == 168, SHAKE128

    __ BIND(sha3_512_or_sha3_384);
    __ ld1(v25, v26, __ T8B, __ post(buf, 16));
    __ eor(v7, __ T8B, v7, v25);
    __ eor(v8, __ T8B, v8, v26);
    __ tbz(block_size, 5, rounds24_loop); // SHA3-512

    // SHA3-384
    __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
    __ eor(v9, __ T8B, v9, v27);
    __ eor(v10, __ T8B, v10, v28);
    __ eor(v11, __ T8B, v11, v29);
    __ eor(v12, __ T8B, v12, v30);

    // One full Keccak-f[1600] round per iteration (theta, rho/pi, chi,
    // iota), 24 iterations counted down in rscratch2.
    __ BIND(rounds24_loop);
    __ subw(rscratch2, rscratch2, 1);

    // theta: column parities
    __ eor3(v29, __ T16B, v4, v9, v14);
    __ eor3(v26, __ T16B, v1, v6, v11);
    __ eor3(v28, __ T16B, v3, v8, v13);
    __ eor3(v25, __ T16B, v0, v5, v10);
    __ eor3(v27, __ T16B, v2, v7, v12);
    __ eor3(v29, __ T16B, v29, v19, v24);
    __ eor3(v26, __ T16B, v26, v16, v21);
    __ eor3(v28, __ T16B, v28, v18, v23);
    __ eor3(v25, __ T16B, v25, v15, v20);
    __ eor3(v27, __ T16B, v27, v17, v22);

    __ rax1(v30, __ T2D, v29, v26);
    __ rax1(v26, __ T2D, v26, v28);
    __ rax1(v28, __ T2D, v28, v25);
    __ rax1(v25, __ T2D, v25, v27);
    __ rax1(v27, __ T2D, v27, v29);

    // rho + pi: XOR in the parity and rotate each lane
    __ eor(v0, __ T16B, v0, v30);
    __ xar(v29, __ T2D, v1, v25, (64 - 1));
    __ xar(v1, __ T2D, v6, v25, (64 - 44));
    __ xar(v6, __ T2D, v9, v28, (64 - 20));
    __ xar(v9, __ T2D, v22, v26, (64 - 61));
    __ xar(v22, __ T2D, v14, v28, (64 - 39));
    __ xar(v14, __ T2D, v20, v30, (64 - 18));
    __ xar(v31, __ T2D, v2, v26, (64 - 62));
    __ xar(v2, __ T2D, v12, v26, (64 - 43));
    __ xar(v12, __ T2D, v13, v27, (64 - 25));
    __ xar(v13, __ T2D, v19, v28, (64 - 8));
    __ xar(v19, __ T2D, v23, v27, (64 - 56));
    __ xar(v23, __ T2D, v15, v30, (64 - 41));
    __ xar(v15, __ T2D, v4, v28, (64 - 27));
    __ xar(v28, __ T2D, v24, v28, (64 - 14));
    __ xar(v24, __ T2D, v21, v25, (64 - 2));
    __ xar(v8, __ T2D, v8, v27, (64 - 55));
    __ xar(v4, __ T2D, v16, v25, (64 - 45));
    __ xar(v16, __ T2D, v5, v30, (64 - 36));
    __ xar(v5, __ T2D, v3, v27, (64 - 28));
    __ xar(v27, __ T2D, v18, v27, (64 - 21));
    __ xar(v3, __ T2D, v17, v26, (64 - 15));
    __ xar(v25, __ T2D, v11, v25, (64 - 10));
    __ xar(v26, __ T2D, v7, v26, (64 - 6));
    __ xar(v30, __ T2D, v10, v30, (64 - 3));

    // chi, one plane (row of 5 lanes) at a time
    __ bcax(v20, __ T16B, v31, v22, v8);
    __ bcax(v21, __ T16B, v8, v23, v22);
    __ bcax(v22, __ T16B, v22, v24, v23);
    __ bcax(v23, __ T16B, v23, v31, v24);
    __ bcax(v24, __ T16B, v24, v8, v31);

    // broadcast the next round constant into v31 for iota below
    __ ld1r(v31, __ T2D, __ post(rscratch1, 8));

    __ bcax(v17, __ T16B, v25, v19, v3);
    __ bcax(v18, __ T16B, v3, v15, v19);
    __ bcax(v19, __ T16B, v19, v16, v15);
    __ bcax(v15, __ T16B, v15, v25, v16);
    __ bcax(v16, __ T16B, v16, v3, v25);

    __ bcax(v10, __ T16B, v29, v12, v26);
    __ bcax(v11, __ T16B, v26, v13, v12);
    __ bcax(v12, __ T16B, v12, v14, v13);
    __ bcax(v13, __ T16B, v13, v29, v14);
    __ bcax(v14, __ T16B, v14, v26, v29);

    __ bcax(v7, __ T16B, v30, v9, v4);
    __ bcax(v8, __ T16B, v4, v5, v9);
    __ bcax(v9, __ T16B, v9, v6, v5);
    __ bcax(v5, __ T16B, v5, v30, v6);
    __ bcax(v6, __ T16B, v6, v4, v30);

    __ bcax(v3, __ T16B, v27, v0, v28);
    __ bcax(v4, __ T16B, v28, v1, v0);
    __ bcax(v0, __ T16B, v0, v2, v1);
    __ bcax(v1, __ T16B, v1, v27, v2);
    __ bcax(v2, __ T16B, v2, v28, v27);

    // iota
    __ eor(v0, __ T16B, v0, v31);

    __ cbnzw(rscratch2, rounds24_loop);

    if (multi_block) {
      __ add(ofs, ofs, block_size);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha3_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // write the 25-lane state back
    __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
    __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
    __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
    __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
    __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
    __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
    __ st1(v24, __ T1D, state);

    // restore callee-saved FP registers
    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ ret(lr);

    return start;
  }

  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *       r0   - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc   = c_rarg0;  // crc
    const Register buf   = c_rarg1;  // source java byte array address
    const Register len   = c_rarg2;  // length
    const Register table0 = c_rarg3; // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3 = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // the real work is done in the macro assembler
    __ kernel_crc32(crc, buf, len,
          table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  // ChaCha20 block function.  This version parallelizes by loading
  // individual 32-bit state elements into vectors for four blocks
  // (e.g. all four blocks' worth of state[0] in one register, etc.)
  //
  // state (int[16]) = c_rarg0
  // keystream (byte[1024]) = c_rarg1
  // return - number of bytes of keystream (always 256)
  address generate_chacha20Block_blockpar() {
    Label L_twoRounds, L_cc20_const;
    // The constant data is broken into two 128-bit segments to be loaded
    // onto FloatRegisters.  The first 128 bits are a counter add overlay
    // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
    // The second 128-bits is a table constant used for 8-bit left rotations.
    __ BIND(L_cc20_const);
    __ emit_int64(0x0000000100000000UL);
    __ emit_int64(0x0000000300000002UL);
    __ emit_int64(0x0605040702010003UL);
    __ emit_int64(0x0E0D0C0F0A09080BUL);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "chacha20Block");
    address start = __ pc();
    __ enter();

    int i, j;
    const Register state = c_rarg0;
    const Register keystream = c_rarg1;
    const Register loopCtr = r10;
    const Register tmpAddr = r11;

    const FloatRegister stateFirst = v0;
    const FloatRegister stateSecond = v1;
    const FloatRegister stateThird = v2;
    const FloatRegister stateFourth = v3;
    const FloatRegister origCtrState = v28;
    const FloatRegister scratch = v29;
    const FloatRegister lrot8Tbl = v30;

    // Organize SIMD registers in an array that facilitates
    // putting repetitive opcodes into loop structures.  It is
    // important that each grouping of 4 registers is monotonically
    // increasing to support the requirements of multi-register
    // instructions (e.g. ld4r, st4, etc.)
4296 const FloatRegister workSt[16] = { 4297 v4, v5, v6, v7, v16, v17, v18, v19, 4298 v20, v21, v22, v23, v24, v25, v26, v27 4299 }; 4300 4301 // Load from memory and interlace across 16 SIMD registers, 4302 // With each word from memory being broadcast to all lanes of 4303 // each successive SIMD register. 4304 // Addr(0) -> All lanes in workSt[i] 4305 // Addr(4) -> All lanes workSt[i + 1], etc. 4306 __ mov(tmpAddr, state); 4307 for (i = 0; i < 16; i += 4) { 4308 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4309 __ post(tmpAddr, 16)); 4310 } 4311 4312 // Pull in constant data. The first 16 bytes are the add overlay 4313 // which is applied to the vector holding the counter (state[12]). 4314 // The second 16 bytes is the index register for the 8-bit left 4315 // rotation tbl instruction. 4316 __ adr(tmpAddr, L_cc20_const); 4317 __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr)); 4318 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); 4319 4320 // Set up the 10 iteration loop and perform all 8 quarter round ops 4321 __ mov(loopCtr, 10); 4322 __ BIND(L_twoRounds); 4323 4324 __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12], 4325 scratch, lrot8Tbl); 4326 __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13], 4327 scratch, lrot8Tbl); 4328 __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14], 4329 scratch, lrot8Tbl); 4330 __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15], 4331 scratch, lrot8Tbl); 4332 4333 __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15], 4334 scratch, lrot8Tbl); 4335 __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12], 4336 scratch, lrot8Tbl); 4337 __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13], 4338 scratch, lrot8Tbl); 4339 __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14], 4340 scratch, lrot8Tbl); 4341 4342 // Decrement and iterate 4343 __ sub(loopCtr, loopCtr, 1); 4344 __ cbnz(loopCtr, 
L_twoRounds); 4345 4346 __ mov(tmpAddr, state); 4347 4348 // Add the starting state back to the post-loop keystream 4349 // state. We read/interlace the state array from memory into 4350 // 4 registers similar to what we did in the beginning. Then 4351 // add the counter overlay onto workSt[12] at the end. 4352 for (i = 0; i < 16; i += 4) { 4353 __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S, 4354 __ post(tmpAddr, 16)); 4355 __ addv(workSt[i], __ T4S, workSt[i], stateFirst); 4356 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond); 4357 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird); 4358 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth); 4359 } 4360 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask 4361 4362 // Write to key stream, storing the same element out of workSt[0..15] 4363 // to consecutive 4-byte offsets in the key stream buffer, then repeating 4364 // for the next element position. 4365 for (i = 0; i < 4; i++) { 4366 for (j = 0; j < 16; j += 4) { 4367 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4368 __ post(keystream, 16)); 4369 } 4370 } 4371 4372 __ mov(r0, 256); // Return length of output keystream 4373 __ leave(); 4374 __ ret(lr); 4375 4376 return start; 4377 } 4378 4379 /** 4380 * Arguments: 4381 * 4382 * Inputs: 4383 * c_rarg0 - int crc 4384 * c_rarg1 - byte* buf 4385 * c_rarg2 - int length 4386 * c_rarg3 - int* table 4387 * 4388 * Output: 4389 * r0 - int crc result 4390 */ 4391 address generate_updateBytesCRC32C() { 4392 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4393 4394 __ align(CodeEntryAlignment); 4395 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4396 4397 address start = __ pc(); 4398 4399 const Register crc = c_rarg0; // crc 4400 const Register buf = c_rarg1; // source java byte array address 4401 const Register len = c_rarg2; // length 4402 const Register table0 = c_rarg3; // crc_table address 4403 const Register 
table1 = c_rarg4; 4404 const Register table2 = c_rarg5; 4405 const Register table3 = c_rarg6; 4406 const Register tmp3 = c_rarg7; 4407 4408 BLOCK_COMMENT("Entry:"); 4409 __ enter(); // required for proper stackwalking of RuntimeStub frame 4410 4411 __ kernel_crc32c(crc, buf, len, 4412 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4413 4414 __ leave(); // required for proper stackwalking of RuntimeStub frame 4415 __ ret(lr); 4416 4417 return start; 4418 } 4419 4420 /*** 4421 * Arguments: 4422 * 4423 * Inputs: 4424 * c_rarg0 - int adler 4425 * c_rarg1 - byte* buff 4426 * c_rarg2 - int len 4427 * 4428 * Output: 4429 * c_rarg0 - int adler result 4430 */ 4431 address generate_updateBytesAdler32() { 4432 __ align(CodeEntryAlignment); 4433 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4434 address start = __ pc(); 4435 4436 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4437 4438 // Aliases 4439 Register adler = c_rarg0; 4440 Register s1 = c_rarg0; 4441 Register s2 = c_rarg3; 4442 Register buff = c_rarg1; 4443 Register len = c_rarg2; 4444 Register nmax = r4; 4445 Register base = r5; 4446 Register count = r6; 4447 Register temp0 = rscratch1; 4448 Register temp1 = rscratch2; 4449 FloatRegister vbytes = v0; 4450 FloatRegister vs1acc = v1; 4451 FloatRegister vs2acc = v2; 4452 FloatRegister vtable = v3; 4453 4454 // Max number of bytes we can process before having to take the mod 4455 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4456 uint64_t BASE = 0xfff1; 4457 uint64_t NMAX = 0x15B0; 4458 4459 __ mov(base, BASE); 4460 __ mov(nmax, NMAX); 4461 4462 // Load accumulation coefficients for the upper 16 bits 4463 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4464 __ ld1(vtable, __ T16B, Address(temp0)); 4465 4466 // s1 is initialized to the lower 16 bits of adler 4467 // s2 is initialized to the upper 16 bits of 
adler 4468 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4469 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4470 4471 // The pipelined loop needs at least 16 elements for 1 iteration 4472 // It does check this, but it is more effective to skip to the cleanup loop 4473 __ cmp(len, (u1)16); 4474 __ br(Assembler::HS, L_nmax); 4475 __ cbz(len, L_combine); 4476 4477 __ bind(L_simple_by1_loop); 4478 __ ldrb(temp0, Address(__ post(buff, 1))); 4479 __ add(s1, s1, temp0); 4480 __ add(s2, s2, s1); 4481 __ subs(len, len, 1); 4482 __ br(Assembler::HI, L_simple_by1_loop); 4483 4484 // s1 = s1 % BASE 4485 __ subs(temp0, s1, base); 4486 __ csel(s1, temp0, s1, Assembler::HS); 4487 4488 // s2 = s2 % BASE 4489 __ lsr(temp0, s2, 16); 4490 __ lsl(temp1, temp0, 4); 4491 __ sub(temp1, temp1, temp0); 4492 __ add(s2, temp1, s2, ext::uxth); 4493 4494 __ subs(temp0, s2, base); 4495 __ csel(s2, temp0, s2, Assembler::HS); 4496 4497 __ b(L_combine); 4498 4499 __ bind(L_nmax); 4500 __ subs(len, len, nmax); 4501 __ sub(count, nmax, 16); 4502 __ br(Assembler::LO, L_by16); 4503 4504 __ bind(L_nmax_loop); 4505 4506 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4507 vbytes, vs1acc, vs2acc, vtable); 4508 4509 __ subs(count, count, 16); 4510 __ br(Assembler::HS, L_nmax_loop); 4511 4512 // s1 = s1 % BASE 4513 __ lsr(temp0, s1, 16); 4514 __ lsl(temp1, temp0, 4); 4515 __ sub(temp1, temp1, temp0); 4516 __ add(temp1, temp1, s1, ext::uxth); 4517 4518 __ lsr(temp0, temp1, 16); 4519 __ lsl(s1, temp0, 4); 4520 __ sub(s1, s1, temp0); 4521 __ add(s1, s1, temp1, ext:: uxth); 4522 4523 __ subs(temp0, s1, base); 4524 __ csel(s1, temp0, s1, Assembler::HS); 4525 4526 // s2 = s2 % BASE 4527 __ lsr(temp0, s2, 16); 4528 __ lsl(temp1, temp0, 4); 4529 __ sub(temp1, temp1, temp0); 4530 __ add(temp1, temp1, s2, ext::uxth); 4531 4532 __ lsr(temp0, temp1, 16); 4533 __ lsl(s2, temp0, 4); 4534 __ sub(s2, s2, temp0); 4535 __ add(s2, s2, temp1, ext:: uxth); 4536 4537 __ subs(temp0, s2, base); 4538 __ 
  // One 16-byte step of the Adler-32 accumulation: folds the next 16 input
  // bytes from 'buff' (post-incremented by 16) into the running sums s1/s2.
  // temp0/temp1 are GPR scratch; vbytes/vs1acc/vs2acc are vector scratch;
  // vtable must hold the constant weights (16, 15, ..., 1) — set up by the
  // caller. No modulo reduction is done here; the caller reduces mod BASE.
  void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
          Register temp0, Register temp1, FloatRegister vbytes,
          FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
    // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
    // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
    // In non-vectorized code, we update s1 and s2 as:
    //   s1 <- s1 + b1
    //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + s1
    //   ...
    //   s1 <- s1 + b16
    //   s2 <- s2 + s1
    // Putting above assignments together, we have:
    //   s1_new = s1 + b1 + b2 + ... + b16
    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
    //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
    //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
    __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));

    // s2 = s2 + s1 * 16
    __ add(s2, s2, s1, Assembler::LSL, 4);

    // vs1acc = b1 + b2 + b3 + ... + b16
    // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
    // umull over the low 8 lanes, then umlal2 accumulates the high 8 lanes;
    // the two uaddlv's reduce each accumulator to a single lane.
    __ umullv(vs2acc, __ T8B, vtable, vbytes);
    __ umlalv(vs2acc, __ T16B, vtable, vbytes);
    __ uaddlv(vs1acc, __ T16B, vbytes);
    __ uaddlv(vs2acc, __ T8H, vs2acc);

    // s1 = s1 + vs1acc, s2 = s2 + vs2acc
    __ fmovd(temp0, vs1acc);
    __ fmovd(temp1, vs2acc);
    __ add(s1, s1, temp0);
    __ add(s2, s2, temp1);
  }
4673 BLOCK_COMMENT("Entry:"); 4674 __ enter(); // required for proper stackwalking of RuntimeStub frame 4675 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4676 __ leave(); // required for proper stackwalking of RuntimeStub frame 4677 __ ret(lr); 4678 4679 return start; 4680 } 4681 4682 address generate_squareToLen() { 4683 // squareToLen algorithm for sizes 1..127 described in java code works 4684 // faster than multiply_to_len on some CPUs and slower on others, but 4685 // multiply_to_len shows a bit better overall results 4686 __ align(CodeEntryAlignment); 4687 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 4688 address start = __ pc(); 4689 4690 const Register x = r0; 4691 const Register xlen = r1; 4692 const Register z = r2; 4693 const Register zlen = r3; 4694 const Register y = r4; // == x 4695 const Register ylen = r5; // == xlen 4696 4697 const Register tmp1 = r10; 4698 const Register tmp2 = r11; 4699 const Register tmp3 = r12; 4700 const Register tmp4 = r13; 4701 const Register tmp5 = r14; 4702 const Register tmp6 = r15; 4703 const Register tmp7 = r16; 4704 4705 RegSet spilled_regs = RegSet::of(y, ylen); 4706 BLOCK_COMMENT("Entry:"); 4707 __ enter(); 4708 __ push(spilled_regs, sp); 4709 __ mov(y, x); 4710 __ mov(ylen, xlen); 4711 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4712 __ pop(spilled_regs, sp); 4713 __ leave(); 4714 __ ret(lr); 4715 return start; 4716 } 4717 4718 address generate_mulAdd() { 4719 __ align(CodeEntryAlignment); 4720 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 4721 4722 address start = __ pc(); 4723 4724 const Register out = r0; 4725 const Register in = r1; 4726 const Register offset = r2; 4727 const Register len = r3; 4728 const Register k = r4; 4729 4730 BLOCK_COMMENT("Entry:"); 4731 __ enter(); 4732 __ mul_add(out, in, offset, len, k); 4733 __ leave(); 4734 __ ret(lr); 4735 4736 return start; 4737 } 4738 4739 // Arguments: 4740 // 
  // Stub for BigInteger right-shift worker: shifts 'numIter' 32-bit words of
  // oldArr right by 'shiftCount' bits into newArr (starting at newIdx),
  // combining each word with its neighbor. Processes from the high end
  // (descending idx) in 4-word SIMD chunks, then 2-word chunks, with scalar
  // tails for the last 1..3 words.
  //
  // Arguments:
  //
  // Input:
  //   c_rarg0 - newArr address
  //   c_rarg1 - oldArr address
  //   c_rarg2 - newIdx
  //   c_rarg3 - shiftCount
  //   c_rarg4 - numIter
  //
  address generate_bigIntegerRightShift() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
    address start = __ pc();

    Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

    Register newArr = c_rarg0;
    Register oldArr = c_rarg1;
    Register newIdx = c_rarg2;
    Register shiftCount = c_rarg3;
    Register numIter = c_rarg4;
    Register idx = numIter;  // idx aliases numIter: counts words still to do

    Register newArrCur = rscratch1;
    Register shiftRevCount = rscratch2;
    Register oldArrCur = r13;
    Register oldArrNext = r14;

    FloatRegister oldElem0 = v0;
    FloatRegister oldElem1 = v1;
    FloatRegister newElem = v2;
    FloatRegister shiftVCount = v3;
    FloatRegister shiftVRevCount = v4;

    __ cbz(idx, Exit);

    __ add(newArr, newArr, newIdx, Assembler::LSL, 2);  // scale newIdx by word size

    // left shift count (complement of the right shift, for the adjacent word)
    __ movw(shiftRevCount, 32);
    __ subw(shiftRevCount, shiftRevCount, shiftCount);

    // numIter too small to allow a 4-words SIMD loop, rolling back
    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftThree);

    __ dup(shiftVCount, __ T4S, shiftCount);
    __ dup(shiftVRevCount, __ T4S, shiftRevCount);
    // ushl with a negative count shifts right, hence the negate
    __ negr(shiftVCount, __ T4S, shiftVCount);

    __ BIND(ShiftSIMDLoop);

    // Calculate the load addresses
    __ sub(idx, idx, 4);
    __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
    __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
    __ add(oldArrCur, oldArrNext, 4);

    // Load 4 words and process
    __ ld1(oldElem0, __ T4S, Address(oldArrCur));
    __ ld1(oldElem1, __ T4S, Address(oldArrNext));
    __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
    __ orr(newElem, __ T16B, oldElem0, oldElem1);
    __ st1(newElem, __ T4S, Address(newArrCur));

    __ cmp(idx, (u1)4);
    __ br(Assembler::LT, ShiftTwoLoop);
    __ b(ShiftSIMDLoop);

    __ BIND(ShiftTwoLoop);
    __ cbz(idx, Exit);
    __ cmp(idx, (u1)1);
    __ br(Assembler::EQ, ShiftOne);

    // Calculate the load addresses
    __ sub(idx, idx, 2);
    __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
    __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
    __ add(oldArrCur, oldArrNext, 4);

    // Load 2 words and process
    __ ld1(oldElem0, __ T2S, Address(oldArrCur));
    __ ld1(oldElem1, __ T2S, Address(oldArrNext));
    __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
    __ orr(newElem, __ T8B, oldElem0, oldElem1);
    __ st1(newElem, __ T2S, Address(newArrCur));
    __ b(ShiftTwoLoop);

    // Scalar tails: ShiftThree/ShiftTwo/ShiftOne fall through into each other,
    // selected by the low bits of idx (tbz tests below).
    __ BIND(ShiftThree);
    __ tbz(idx, 1, ShiftOne);
    __ tbz(idx, 0, ShiftTwo);
    __ ldrw(r10, Address(oldArr, 12));
    __ ldrw(r11, Address(oldArr, 8));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, Address(newArr, 8));

    __ BIND(ShiftTwo);
    __ ldrw(r10, Address(oldArr, 8));
    __ ldrw(r11, Address(oldArr, 4));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, Address(newArr, 4));

    __ BIND(ShiftOne);
    __ ldrw(r10, Address(oldArr, 4));
    __ ldrw(r11, Address(oldArr));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, Address(newArr));

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }
  // Stub for BigInteger left-shift worker: shifts 'numIter' 32-bit words of
  // oldArr left by 'shiftCount' bits into newArr (starting at newIdx).
  // Mirror image of the right-shift worker: walks forward with
  // post-incremented pointers, 4-word SIMD chunks, then 2-word chunks,
  // then scalar tails.
  //
  // Arguments:
  //
  // Input:
  //   c_rarg0 - newArr address
  //   c_rarg1 - oldArr address
  //   c_rarg2 - newIdx
  //   c_rarg3 - shiftCount
  //   c_rarg4 - numIter
  //
  address generate_bigIntegerLeftShift() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
    address start = __ pc();

    Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

    Register newArr = c_rarg0;
    Register oldArr = c_rarg1;
    Register newIdx = c_rarg2;
    Register shiftCount = c_rarg3;
    Register numIter = c_rarg4;

    Register shiftRevCount = rscratch1;
    Register oldArrNext = rscratch2;

    FloatRegister oldElem0 = v0;
    FloatRegister oldElem1 = v1;
    FloatRegister newElem = v2;
    FloatRegister shiftVCount = v3;
    FloatRegister shiftVRevCount = v4;

    __ cbz(numIter, Exit);

    __ add(oldArrNext, oldArr, 4);  // pointer to the adjacent (next) word
    __ add(newArr, newArr, newIdx, Assembler::LSL, 2);  // scale newIdx by word size

    // right shift count (complement of the left shift, for the adjacent word)
    __ movw(shiftRevCount, 32);
    __ subw(shiftRevCount, shiftRevCount, shiftCount);

    // numIter too small to allow a 4-words SIMD loop, rolling back
    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftThree);

    __ dup(shiftVCount, __ T4S, shiftCount);
    __ dup(shiftVRevCount, __ T4S, shiftRevCount);
    // ushl with a negative count shifts right, hence the negate
    __ negr(shiftVRevCount, __ T4S, shiftVRevCount);

    __ BIND(ShiftSIMDLoop);

    // load 4 words and process
    __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
    __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
    __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
    __ orr(newElem, __ T16B, oldElem0, oldElem1);
    __ st1(newElem, __ T4S, __ post(newArr, 16));
    __ sub(numIter, numIter, 4);

    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftTwoLoop);
    __ b(ShiftSIMDLoop);

    __ BIND(ShiftTwoLoop);
    __ cbz(numIter, Exit);
    __ cmp(numIter, (u1)1);
    __ br(Assembler::EQ, ShiftOne);

    // load 2 words and process
    __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
    __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
    __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
    __ orr(newElem, __ T8B, oldElem0, oldElem1);
    __ st1(newElem, __ T2S, __ post(newArr, 8));
    __ sub(numIter, numIter, 2);
    __ b(ShiftTwoLoop);

    // Scalar tails: fall-through chain selected by the low bits of numIter.
    __ BIND(ShiftThree);
    __ ldrw(r10, __ post(oldArr, 4));
    __ ldrw(r11, __ post(oldArrNext, 4));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, __ post(newArr, 4));
    __ tbz(numIter, 1, Exit);
    __ tbz(numIter, 0, ShiftOne);

    __ BIND(ShiftTwo);
    __ ldrw(r10, __ post(oldArr, 4));
    __ ldrw(r11, __ post(oldArrNext, 4));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, __ post(newArr, 4));

    __ BIND(ShiftOne);
    __ ldrw(r10, Address(oldArr));
    __ ldrw(r11, Address(oldArrNext));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, Address(newArr));

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }
  // Stub counting leading non-negative (positive, high-bit-clear) bytes of a
  // byte array: ary1 = r1, len = r2; result (r0) arrives pre-loaded with len
  // and is reduced when a negative byte is found. Two entry points: the main
  // one handles short arrays (<= 15 bytes) with careful page-boundary-safe
  // loads; 'count_positives_long' (out-param) is the second entry for long
  // arrays and runs a 64-byte prefetched main loop.
  address generate_count_positives(address &count_positives_long) {
    const u1 large_loop_size = 64;
    const uint64_t UPPER_BIT_MASK=0x8080808080808080;  // sign bit of every byte
    int dcache_line = VM_Version::dcache_line_size();

    Register ary1 = r1, len = r2, result = r0;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "count_positives");

    address entry = __ pc();

    __ enter();
    // precondition: a copy of len is already in result
    // __ mov(result, len);

    Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
        LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;

    __ cmp(len, (u1)15);
    __ br(Assembler::GT, LEN_OVER_15);
    // The only case when execution falls into this code is when pointer is near
    // the end of memory page and we have to avoid reading next page
    __ add(ary1, ary1, len);
    __ subs(len, len, 8);
    __ br(Assembler::GT, LEN_OVER_8);
    // len <= 8: one 8-byte load ending exactly at ary1+len, then shift off
    // the bytes that are not part of the array before testing sign bits.
    __ ldr(rscratch2, Address(ary1, -8));
    __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
    __ lsrv(rscratch2, rscratch2, rscratch1);
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ csel(result, zr, result, Assembler::NE);
    __ leave();
    __ ret(lr);
    __ bind(LEN_OVER_8);
    __ ldp(rscratch1, rscratch2, Address(ary1, -16));
    __ sub(len, len, 8); // no data dep., then sub can be executed while loading
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_NO_POP);
    __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
    __ lsrv(rscratch1, rscratch1, rscratch2);
    __ tst(rscratch1, UPPER_BIT_MASK);
    __ bind(RET_NO_POP);
    __ csel(result, zr, result, Assembler::NE);
    __ leave();
    __ ret(lr);

    Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
    const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;

    count_positives_long = __ pc(); // 2nd entry point

    __ enter();

    __ bind(LEN_OVER_15);
    __ push(spilled_regs, sp);
    __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
    __ cbz(rscratch2, ALIGNED);
    __ ldp(tmp6, tmp1, Address(ary1));
    __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
    __ add(ary1, ary1, rscratch1);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    __ sub(len, len, rscratch1);

    __ bind(ALIGNED);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);
    // Perform 16-byte load as early return in pre-loop to handle situation
    // when initially aligned large array has negative values at starting bytes,
    // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
    // slower. Cases with negative bytes further ahead won't be affected that
    // much. In fact, it'll be faster due to early loads, less instructions and
    // less branches in LARGE_LOOP.
    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_16);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);

    if (SoftwarePrefetchHintDistance >= 0
        && SoftwarePrefetchHintDistance >= dcache_line) {
      // initial prefetch
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
    }
    __ bind(LARGE_LOOP);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
    }
    // Issue load instructions first, since it can save few CPU/MEM cycles, also
    // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp)
    // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
    // instructions per cycle and have less branches, but this approach disables
    // early return, thus, all 64 bytes are loaded and checked every time.
    __ ldp(tmp2, tmp3, Address(ary1));
    __ ldp(tmp4, tmp5, Address(ary1, 16));
    __ ldp(rscratch1, rscratch2, Address(ary1, 32));
    __ ldp(tmp6, tmp1, Address(ary1, 48));
    __ add(ary1, ary1, large_loop_size);
    __ sub(len, len, large_loop_size);
    __ orr(tmp2, tmp2, tmp3);
    __ orr(tmp4, tmp4, tmp5);
    __ orr(rscratch1, rscratch1, rscratch2);
    __ orr(tmp6, tmp6, tmp1);
    __ orr(tmp2, tmp2, tmp4);
    __ orr(rscratch1, rscratch1, tmp6);
    __ orr(tmp2, tmp2, rscratch1);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_LONG);
    __ cmp(len, large_loop_size);
    __ br(Assembler::GE, LARGE_LOOP);

    __ bind(CHECK_16); // small 16-byte load pre-loop
    __ cmp(len, (u1)16);
    __ br(Assembler::LT, POST_LOOP16);

    __ bind(LOOP16); // small 16-byte load loop
    __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp2, tmp2, tmp3);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_16);
    __ cmp(len, (u1)16);
    __ br(Assembler::GE, LOOP16); // 16-byte load loop end

    __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
    __ cmp(len, (u1)8);
    __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
    __ ldr(tmp3, Address(__ post(ary1, 8)));
    __ tst(tmp3, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    __ sub(len, len, 8);

    __ bind(POST_LOOP16_LOAD_TAIL);
    __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
    __ ldr(tmp1, Address(ary1));
    __ mov(tmp2, 64);
    __ sub(tmp4, tmp2, len, __ LSL, 3);  // shift out bytes past the tail
    __ lslv(tmp1, tmp1, tmp4);
    __ tst(tmp1, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    // Fallthrough

    __ bind(RET_LEN);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);

    // difference result - len is the count of guaranteed to be
    // positive bytes

    __ bind(RET_ADJUST_LONG);
    __ add(len, len, (u1)(large_loop_size - 16));
    __ bind(RET_ADJUST_16);
    __ add(len, len, 16);
    __ bind(RET_ADJUST);
    __ pop(spilled_regs, sp);
    __ leave();
    __ sub(result, result, len);
    __ ret(lr);

    return entry;
  }
  // Emits the non-SIMD inner comparison loop for large_array_equals: compares
  // 64 bytes (8 words) per iteration using software-pipelined ldp pairs, so
  // loads for the next group overlap the eor/orr checks of the previous one.
  // Branches to NOT_EQUAL on the first mismatching word group. On fall-through
  // (equal so far), cnt1 has been reduced past loopThreshold.
  // NOTE(review): caller is expected to have pre-loaded the first 16 bytes'
  // worth of state — the initial ldp pair here primes the pipeline.
  void generate_large_array_equals_loop_nonsimd(int loopThreshold,
        bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label LOOP;

    // Prime the pipeline: first pair of words from each array.
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    // Each group below: load the NEXT pair while checking the CURRENT one.
    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp1, tmp1, tmp3);
    __ cbnz(tmp1, NOT_EQUAL);
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp5, tmp5, tmp6);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp1, tmp1, tmp3);
    __ cbnz(tmp1, NOT_EQUAL);
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp5, tmp5, tmp6);
    __ sub(cnt1, cnt1, 8 * wordSize);  // 64 bytes consumed this iteration
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operand.
    __ subs(tmp6, cnt1, loopThreshold);
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ br(__ GE, LOOP);
    // post-loop: drain the last in-flight pair loaded above.
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ orr(tmp1, tmp1, tmp3);
    __ sub(cnt1, cnt1, 2 * wordSize);
    __ cbnz(tmp1, NOT_EQUAL);
  }
  // Emits the SIMD inner comparison loop for large_array_equals: compares
  // 64 bytes per iteration with two 4-register ld1 loads, xors lane-wise,
  // ors everything down to one vector, and tests the two 64-bit halves in
  // GPRs. Branches to NOT_EQUAL on mismatch; exits when cnt1 drops below
  // loopThreshold. Clobbers v0-v7.
  void generate_large_array_equals_loop_simd(int loopThreshold,
        bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2;
    Label LOOP;

    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
    __ sub(cnt1, cnt1, 8 * wordSize);  // 64 bytes per iteration
    __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
    __ subs(tmp1, cnt1, loopThreshold);  // sets flags for the br at loop end
    // Reduce the 4 xor results to a single vector; any set bit => difference.
    __ eor(v0, __ T16B, v0, v4);
    __ eor(v1, __ T16B, v1, v5);
    __ eor(v2, __ T16B, v2, v6);
    __ eor(v3, __ T16B, v3, v7);
    __ orr(v0, __ T16B, v0, v1);
    __ orr(v1, __ T16B, v2, v3);
    __ orr(v0, __ T16B, v0, v1);
    __ umov(tmp1, v0, __ D, 0);
    __ umov(tmp2, v0, __ D, 1);
    __ orr(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ br(__ GE, LOOP);
  }
  // a1 = r1 - array1 address
  // a2 = r2 - array2 address
  // result = r0 - return value. Already contains "false"
  // cnt1 = r10 - amount of elements left to check, reduced by wordSize
  // r3-r5 are reserved temporary registers
  // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
  //
  // Entry dispatches between SIMD and non-SIMD inner loops (each with a
  // prefetching and non-prefetching variant), then handles the word-sized
  // tail. Caller has already compared the first 8 bytes.
  address generate_large_array_equals() {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
        SMALL_LOOP, POST_LOOP;
    const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // calculate if at least 32 prefetched bytes are used
    int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
    int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
    RegSet spilled_regs = RegSet::range(tmp6, tmp8);
    assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
        tmp5, tmp6, tmp7, tmp8);

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "large_array_equals");

    address entry = __ pc();
    __ enter();
    __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
    // also advance pointers to use post-increment instead of pre-increment
    __ add(a1, a1, wordSize);
    __ add(a2, a2, wordSize);
    if (AvoidUnalignedAccesses) {
      // both implementations (SIMD/nonSIMD) are using relatively large load
      // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
      // on some CPUs in case of address is not at least 16-byte aligned.
      // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
      // load if needed at least for 1st address and make if 16-byte aligned.
      Label ALIGNED16;
      __ tbz(a1, 3, ALIGNED16);
      __ ldr(tmp1, Address(__ post(a1, wordSize)));
      __ ldr(tmp2, Address(__ post(a2, wordSize)));
      __ sub(cnt1, cnt1, wordSize);
      __ eor(tmp1, tmp1, tmp2);
      __ cbnz(tmp1, NOT_EQUAL_NO_POP);
      __ bind(ALIGNED16);
    }
    if (UseSIMDForArrayEquals) {
      if (SoftwarePrefetchHintDistance >= 0) {
        __ subs(tmp1, cnt1, prefetchLoopThreshold);
        __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
        generate_large_array_equals_loop_simd(prefetchLoopThreshold,
            /* prfm = */ true, NOT_EQUAL);
        __ subs(zr, cnt1, nonPrefetchLoopThreshold);
        __ br(__ LT, TAIL);
      }
      __ bind(NO_PREFETCH_LARGE_LOOP);
      generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
          /* prfm = */ false, NOT_EQUAL);
    } else {
      __ push(spilled_regs, sp);  // non-SIMD variant needs tmp6..tmp8
      if (SoftwarePrefetchHintDistance >= 0) {
        __ subs(tmp1, cnt1, prefetchLoopThreshold);
        __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
        generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
            /* prfm = */ true, NOT_EQUAL);
        __ subs(zr, cnt1, nonPrefetchLoopThreshold);
        __ br(__ LT, TAIL);
      }
      __ bind(NO_PREFETCH_LARGE_LOOP);
      generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
          /* prfm = */ false, NOT_EQUAL);
    }
    __ bind(TAIL);
    __ cbz(cnt1, EQUAL);
    __ subs(cnt1, cnt1, wordSize);
    __ br(__ LE, POST_LOOP);
    __ bind(SMALL_LOOP);  // word-at-a-time remainder loop
    __ ldr(tmp1, Address(__ post(a1, wordSize)));
    __ ldr(tmp2, Address(__ post(a2, wordSize)));
    __ subs(cnt1, cnt1, wordSize);
    __ eor(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ br(__ GT, SMALL_LOOP);
    __ bind(POST_LOOP);  // last (possibly overlapping) word, addressed from the end
    __ ldr(tmp1, Address(a1, cnt1));
    __ ldr(tmp2, Address(a2, cnt1));
    __ eor(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ bind(EQUAL);
    __ mov(result, true);
    __ bind(NOT_EQUAL);
    if (!UseSIMDForArrayEquals) {
      __ pop(spilled_regs, sp);
    }
    __ bind(NOT_EQUAL_NO_POP);  // mismatch before spilled_regs were pushed
    __ leave();
    __ ret(lr);
    return entry;
  }
__ bind(NOT_EQUAL_NO_POP); 5313 __ leave(); 5314 __ ret(lr); 5315 return entry; 5316 } 5317 5318 address generate_dsin_dcos(bool isCos) { 5319 __ align(CodeEntryAlignment); 5320 StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin"); 5321 address start = __ pc(); 5322 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5323 (address)StubRoutines::aarch64::_two_over_pi, 5324 (address)StubRoutines::aarch64::_pio2, 5325 (address)StubRoutines::aarch64::_dsin_coef, 5326 (address)StubRoutines::aarch64::_dcos_coef); 5327 return start; 5328 } 5329 5330 address generate_dlog() { 5331 __ align(CodeEntryAlignment); 5332 StubCodeMark mark(this, "StubRoutines", "dlog"); 5333 address entry = __ pc(); 5334 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 5335 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 5336 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 5337 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 5338 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 5339 return entry; 5340 } 5341 5342 5343 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5344 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5345 Label &DIFF2) { 5346 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5347 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5348 5349 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5350 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5351 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5352 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5353 5354 __ fmovd(tmpL, vtmp3); 5355 __ eor(rscratch2, tmp3, tmpL); 5356 __ cbnz(rscratch2, DIFF2); 5357 5358 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5359 __ umov(tmpL, vtmp3, __ D, 1); 5360 __ eor(rscratch2, tmpU, tmpL); 5361 __ cbnz(rscratch2, DIFF1); 5362 5363 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5364 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5365 __ fmovd(tmpL, vtmp); 5366 __ 
  // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
  // Loads 16 Latin1 bytes via tmp2 and 16 UTF-16 bytes (8 chars at a time) via
  // cnt1, widens the Latin1 side with zip1/zip2 against a zero vector, and
  // compares 8 bytes at a time. Branches to DIFF1/DIFF2 on mismatch depending
  // on which half differs (callers recover the differing U word accordingly).
  void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
      Label &DIFF2) {
    Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
    FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;

    __ ldrq(vtmp, Address(__ post(tmp2, 16)));
    __ ldr(tmpU, Address(__ post(cnt1, 8)));
    __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
    // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3

    __ fmovd(tmpL, vtmp3);
    __ eor(rscratch2, tmp3, tmpL);
    __ cbnz(rscratch2, DIFF2);

    __ ldr(tmp3, Address(__ post(cnt1, 8)));
    __ umov(tmpL, vtmp3, __ D, 1);
    __ eor(rscratch2, tmpU, tmpL);
    __ cbnz(rscratch2, DIFF1);

    __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
    __ ldr(tmpU, Address(__ post(cnt1, 8)));
    __ fmovd(tmpL, vtmp);
    __ eor(rscratch2, tmp3, tmpL);
    __ cbnz(rscratch2, DIFF2);

    __ ldr(tmp3, Address(__ post(cnt1, 8)));
    __ umov(tmpL, vtmp, __ D, 1);
    __ eor(rscratch2, tmpU, tmpL);
    __ cbnz(rscratch2, DIFF1);
  }

  // r0 = result
  // r1 = str1
  // r2 = cnt1
  // r3 = str2
  // r4 = cnt2
  // r10 = tmp1
  // r11 = tmp2
  //
  // Stub comparing long strings of different encodings (isLU: str1 Latin1 /
  // str2 UTF-16, else the reverse). Precondition: the first 4 characters of
  // each string are already loaded (vtmp, and tmp2 for LU / tmp1 for UL).
  // Returns the signed difference of the first differing characters, 0 if
  // the compared prefix is equal.
  address generate_compare_long_string_different_encoding(bool isLU) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", isLU
        ? "compare_long_string_different_encoding LU"
        : "compare_long_string_different_encoding UL");
    address entry = __ pc();
    Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
        DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
        LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
    FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
    RegSet spilled_regs = RegSet::of(tmp3, tmp4);

    int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);

    __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);  // zero vector for zip widening
    // cnt2 == amount of characters left to compare
    // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
    __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
    __ add(str1, str1, isLU ? wordSize/2 : wordSize);
    __ add(str2, str2, isLU ? wordSize : wordSize/2);
    __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
    __ eor(rscratch2, tmp1, tmp2);
    __ mov(rscratch1, tmp2);
    __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
    Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
        tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
    __ push(spilled_regs, sp);
    __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
    __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load

    __ ldr(tmp3, Address(__ post(cnt1, 8)));  // pre-load next U word

    if (SoftwarePrefetchHintDistance >= 0) {
      __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
      __ br(__ LT, NO_PREFETCH);
      __ bind(LARGE_LOOP_PREFETCH);
        __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
        __ mov(tmp4, 2);
        __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
        // 2 x (2 x 16 chars) = 64 chars per LARGE_LOOP_PREFETCH iteration
        __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
          compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
          __ subs(tmp4, tmp4, 1);
          __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
          __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
          __ mov(tmp4, 2);
        __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
          compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
          __ subs(tmp4, tmp4, 1);
          __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
          __ sub(cnt2, cnt2, 64);
          __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
          __ br(__ GE, LARGE_LOOP_PREFETCH);
    }
    __ cbz(cnt2, LOAD_LAST); // no characters left except last load
    __ bind(NO_PREFETCH);
    __ subs(cnt2, cnt2, 16);
    __ br(__ LT, TAIL);
    __ align(OptoLoopAlignment);
    __ bind(SMALL_LOOP); // smaller loop
      __ subs(cnt2, cnt2, 16);
      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
      __ br(__ GE, SMALL_LOOP);
      __ cmn(cnt2, (u1)16);
      __ br(__ EQ, LOAD_LAST);
    __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
    __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
    __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
    __ ldr(tmp3, Address(cnt1, -8));
    compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
    __ b(LOAD_LAST);
    __ bind(DIFF2);  // mismatch in the half whose U word sits in tmp3
    __ mov(tmpU, tmp3);
    __ bind(DIFF1);  // mismatch: tmpU/tmpL hold the differing words
    __ pop(spilled_regs, sp);
    __ b(CALCULATE_DIFFERENCE);
    __ bind(LOAD_LAST);
    // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
    // No need to load it again
    __ mov(tmpU, tmp3);
    __ pop(spilled_regs, sp);

    // tmp2 points to the address of the last 4 Latin1 characters right now
    __ ldrs(vtmp, Address(tmp2));
    __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
    __ fmovd(tmpL, vtmp);

    __ eor(rscratch2, tmpU, tmpL);
    __ cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    __ bind(CALCULATE_DIFFERENCE);
    __ rev(rscratch2, rscratch2);
    __ clz(rscratch2, rscratch2);
    __ andr(rscratch2, rscratch2, -16);  // round down to a 16-bit char boundary
    __ lsrv(tmp1, tmp1, rscratch2);
    __ uxthw(tmp1, tmp1);
    __ lsrv(rscratch1, rscratch1, rscratch2);
    __ uxthw(rscratch1, rscratch1);
    __ subw(result, tmp1, rscratch1);
    __ bind(DONE);
    __ ret(lr);
    return entry;
  }
b(CALCULATE_DIFFERENCE);
    __ bind(LOAD_LAST);
      // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
      // No need to load it again
      __ mov(tmpU, tmp3);
      __ pop(spilled_regs, sp);

      // tmp2 points to the address of the last 4 Latin1 characters right now
      __ ldrs(vtmp, Address(tmp2));
      __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
      __ fmovd(tmpL, vtmp);

      __ eor(rscratch2, tmpU, tmpL);
      __ cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    __ bind(CALCULATE_DIFFERENCE);
      __ rev(rscratch2, rscratch2);
      __ clz(rscratch2, rscratch2);
      __ andr(rscratch2, rscratch2, -16);
      __ lsrv(tmp1, tmp1, rscratch2);
      __ uxthw(tmp1, tmp1);
      __ lsrv(rscratch1, rscratch1, rscratch2);
      __ uxthw(rscratch1, rscratch1);
      __ subw(result, tmp1, rscratch1);
    __ bind(DONE);
      __ ret(lr);
    return entry;
  } // end of generate_compare_long_string_different_encoding

  // nmethod entry barrier stub. On entry it calls into the VM
  // (BarrierSetNMethod::nmethod_stub_entry_barrier) to decide whether the
  // nmethod being entered is still valid; a non-zero result makes the stub
  // unwind to the values {sp, fp, lr, pc} prepared on the stack instead of
  // returning into the nmethod (deoptimization path).
  address generate_method_entry_barrier() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");

    Label deoptimize_label;

    address start = __ pc();

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();

    if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
      BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
      // We can get here despite the nmethod being good, if we have not
      // yet applied our cross modification fence (or data fence).
      // Publish the global patching epoch into this thread's epoch slot
      // (4 bytes past the disarmed-guard-value field), then fence.
      Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
      __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
      __ ldrw(rscratch2, rscratch2);
      __ strw(rscratch2, thread_epoch_addr);
      __ isb();
      __ membar(__ LoadLoad);
    }

    __ set_last_Java_frame(sp, rfp, lr, rscratch1);

    __ enter();
    __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr

    __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}

    __ push_call_clobbered_registers();

    __ mov(c_rarg0, rscratch2);
    __ call_VM_leaf
         (CAST_FROM_FN_PTR
          (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);

    __ reset_last_Java_frame(true);

    __ mov(rscratch1, r0); // save the VM's verdict across the register pops

    __ pop_call_clobbered_registers();

    __ cbnz(rscratch1, deoptimize_label);

    __ leave();
    __ ret(lr);

    __ BIND(deoptimize_label);

    // Load the {sp, fp} and {lr, pc} the VM stored in the four reserved words.
    __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
    __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));

    __ mov(sp, rscratch1);
    __ br(rscratch2);

    return start;
  }

  // Compare two long strings with the same encoding (both Latin1 when isLL,
  // both UTF-16 otherwise).  First 8 bytes are pre-loaded into tmp1/tmp2 by
  // the caller (see comment below).
  // r0  = result
  // r1  = str1
  // r2  = cnt1
  // r3  = str2
  // r4  = cnt2
  // r10 = tmp1
  // r11 = tmp2
  address generate_compare_long_string_same_encoding(bool isLL) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", isLL
        ?
"compare_long_string_same_encoding LL"
        : "compare_long_string_same_encoding UU");
    address entry = __ pc();
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;

    Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;

    // exit from large loop when less than 64 bytes left to read or we're about
    // to prefetch memory behind array border
    int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);

    // before jumping to stub, pre-load 8 bytes already, so do comparison directly
    __ eor(rscratch2, tmp1, tmp2);
    __ cbnz(rscratch2, CAL_DIFFERENCE);

    __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
    // update pointers, because of previous read
    __ add(str1, str1, wordSize);
    __ add(str2, str2, wordSize);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ align(OptoLoopAlignment);
      // 64-bytes-per-iteration loop with software prefetch ahead of the loads.
      __ bind(LARGE_LOOP_PREFETCH);
        __ prfm(Address(str1, SoftwarePrefetchHintDistance));
        __ prfm(Address(str2, SoftwarePrefetchHintDistance));

        // Compare 4 x 16 bytes; ccmp folds the two 8-byte comparisons into
        // one conditional branch per 16 bytes.
        for (int i = 0; i < 4; i++) {
          __ ldp(tmp1, tmp1h, Address(str1, i * 16));
          __ ldp(tmp2, tmp2h, Address(str2, i * 16));
          __ cmp(tmp1, tmp2);
          __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
          __ br(Assembler::NE, DIFF);
        }
        __ sub(cnt2, cnt2, isLL ? 64 : 32);
        __ add(str1, str1, 64);
        __ add(str2, str2, 64);
        __ subs(rscratch2, cnt2, largeLoopExitCondition);
        __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
        __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
    }

    __ subs(rscratch1, cnt2, isLL ? 16 : 8);
    __ br(Assembler::LE, LESS16);
    __ align(OptoLoopAlignment);
    // 16-bytes-per-step loop, unrolled twice.
    __ bind(LOOP_COMPARE16);
      __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
      __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
      __ cmp(tmp1, tmp2);
      __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
      __ br(Assembler::NE, DIFF);
      __ sub(cnt2, cnt2, isLL ? 16 : 8);
      __ subs(rscratch2, cnt2, isLL ? 16 : 8);
      __ br(Assembler::LT, LESS16);

      __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
      __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
      __ cmp(tmp1, tmp2);
      __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
      __ br(Assembler::NE, DIFF);
      __ sub(cnt2, cnt2, isLL ? 16 : 8);
      __ subs(rscratch2, cnt2, isLL ? 16 : 8);
      __ br(Assembler::GE, LOOP_COMPARE16);
      __ cbz(cnt2, LENGTH_DIFF);

    __ bind(LESS16);
      // each 8 compare
      __ subs(cnt2, cnt2, isLL ? 8 : 4);
      __ br(Assembler::LE, LESS8);
      __ ldr(tmp1, Address(__ post(str1, 8)));
      __ ldr(tmp2, Address(__ post(str2, 8)));
      __ eor(rscratch2, tmp1, tmp2);
      __ cbnz(rscratch2, CAL_DIFFERENCE);
      __ sub(cnt2, cnt2, isLL ? 8 : 4);

    __ bind(LESS8); // directly load last 8 bytes
      if (!isLL) {
        __ add(cnt2, cnt2, cnt2); // chars -> bytes for UTF-16
      }
      // cnt2 is now the (negative) byte offset of the tail from the current
      // pointers, so these loads cover exactly the last 8 bytes.
      __ ldr(tmp1, Address(str1, cnt2));
      __ ldr(tmp2, Address(str2, cnt2));
      __ eor(rscratch2, tmp1, tmp2);
      __ cbz(rscratch2, LENGTH_DIFF);
      __ b(CAL_DIFFERENCE);

    __ bind(DIFF);
      // The mismatch came from an ldp pair: select whichever 8-byte half
      // actually differs (low half if cmp was NE, else the high half).
      __ cmp(tmp1, tmp2);
      __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
      __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
      // reuse rscratch2 register for the result of eor instruction
      __ eor(rscratch2, tmp1, tmp2);

    __ bind(CAL_DIFFERENCE);
      // Locate the first differing character: byte-reverse, count leading
      // zeros, round down to a character boundary (8 bits for Latin1,
      // 16 for UTF-16), then extract and subtract the two characters.
      __ rev(rscratch2, rscratch2);
      __ clz(rscratch2, rscratch2);
      __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
      __ lsrv(tmp1, tmp1, rscratch2);
      __ lsrv(tmp2, tmp2, rscratch2);
      if (isLL) {
        __ uxtbw(tmp1, tmp1);
        __ uxtbw(tmp2, tmp2);
      } else {
        __ uxthw(tmp1, tmp1);
        __ uxthw(tmp2, tmp2);
      }
      __ subw(result, tmp1, tmp2);

    __ bind(LENGTH_DIFF);
      __ ret(lr);
    return entry;
  }

  // Encodings of the two string operands of a compare:
  // L = Latin1 (1 byte per char), U = UTF-16 (2 bytes per char).
  enum string_compare_mode {
    LL,
    LU,
    UL,
    UU,
  };

  // SVE variant of the long-string compare stubs.
  // The following registers are declared in aarch64.ad
  // r0  = result
  // r1  = str1
  // r2  = cnt1
  // r3  = str2
  // r4  = cnt2
  // r10 = tmp1
  // r11 = tmp2
  // z0  = ztmp1
  // z1  = ztmp2
  // p0  = pgtmp1
  // p1  = pgtmp2
  address generate_compare_long_string_sve(string_compare_mode mode) {
    __ align(CodeEntryAlignment);
    address entry = __ pc();
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11;

    Label LOOP, DONE, MISMATCH;
    Register vec_len = tmp1;
    Register idx = tmp2;
    // The minimum of the string lengths has been stored in cnt2.
Register cnt = cnt2;
    FloatRegister ztmp1 = z0, ztmp2 = z1;
    PRegister pgtmp1 = p0, pgtmp2 = p1;

// Load one vector's worth of characters from each string under predicate
// pgtmp1.  When the encodings differ (LU/UL), the Latin1 side is loaded with
// ld1b into an H-arrangement so its bytes are widened to halfwords, making
// the two vectors directly comparable.
#define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
    switch (mode) {                                                            \
      case LL:                                                                 \
        __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
        __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
        break;                                                                 \
      case LU:                                                                 \
        __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
        __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
        break;                                                                 \
      case UL:                                                                 \
        __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
        __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
        break;                                                                 \
      case UU:                                                                 \
        __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
        __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
        break;                                                                 \
      default:                                                                 \
        ShouldNotReachHere();                                                  \
    }

    const char* stubname;
    switch (mode) {
      case LL: stubname = "compare_long_string_same_encoding LL";      break;
      case LU: stubname = "compare_long_string_different_encoding LU"; break;
      case UL: stubname = "compare_long_string_different_encoding UL"; break;
      case UU: stubname = "compare_long_string_same_encoding UU";      break;
      default: ShouldNotReachHere();
    }

    StubCodeMark mark(this, "StubRoutines", stubname);

    __ mov(idx, 0);
    // Build the initial loop predicate: active lanes are [idx, cnt).
    __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);

    // vec_len = number of characters processed per iteration
    // (bytes per vector for LL, halfwords per vector otherwise).
    if (mode == LL) {
      __ sve_cntb(vec_len);
    } else {
      __ sve_cnth(vec_len);
    }

    __ sub(rscratch1, cnt, vec_len); // last index with a full vector left

    __ bind(LOOP);

      // main loop
      LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
      __ add(idx, idx, vec_len);
      // Compare strings.
      __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
      __ br(__ NE, MISMATCH);
      __ cmp(idx, rscratch1);
      __ br(__ LT, LOOP);

    // post loop, last iteration: re-compute the predicate for the partial
    // final vector, compare once more.
    __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);

    LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
    __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
    __ br(__ EQ, DONE);

    __ bind(MISMATCH);

      // Crop the vector to find its location.
      __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
      // Extract the first different characters of each string.
      __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
      __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);

    // Compute the difference of the first different characters.
    __ sub(result, rscratch1, rscratch2);

    __ bind(DONE);
      __ ret(lr);
#undef LOAD_PAIR
    return entry;
  }

  // Install the four long-string-compare stubs, choosing the SVE
  // implementations when SVE is available and the NEON ones otherwise.
  void generate_compare_long_strings() {
    if (UseSVE == 0) {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_same_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_same_encoding(false);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_different_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_different_encoding(false);
    } else {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_sve(LL);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_sve(UU);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_sve(LU);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_sve(UL);
    }
  }

  // R0 = result
  // R1 = str2
  // R2 = cnt1
  // R3 = str1
  // R4 = cnt2
  // Clobbers:
// rscratch1, rscratch2, v0, v1, rflags
  //
  // This generic linear code use few additional ideas, which makes it faster:
  // 1) we can safely keep at least 1st register of pattern(since length >= 8)
  //    in order to skip initial loading(help in systems with 1 ld pipeline)
  // 2) we can use "fast" algorithm of finding single character to search for
  //    first symbol with less branches(1 branch per each loaded register instead
  //    of branch for each symbol), so, this is where constants like
  //    0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from
  // 3) after loading and analyzing 1st register of source string, it can be
  //    used to search for every 1st character entry, saving few loads in
  //    comparison with "simplier-but-slower" implementation
  // 4) in order to avoid lots of push/pop operations, code below is heavily
  //    re-using/re-initializing/compressing register values, which makes code
  //    larger and a bit less readable, however, most of extra operations are
  //    issued during loads or branches, so, penalty is minimal
  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
    const char* stubName = str1_isL
        ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
        : "indexof_linear_uu";
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stubName);
    address entry = __ pc();

    int str1_chr_size = str1_isL ? 1 : 2;
    int str2_chr_size = str2_isL ? 1 : 2;
    int str1_chr_shift = str1_isL ? 0 : 1;
    int str2_chr_shift = str2_isL ? 0 : 1;
    bool isL = str1_isL && str2_isL;
    // parameters
    Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
    // temporary registers
    Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
    RegSet spilled_regs = RegSet::range(tmp1, tmp4);
    // redefinitions
    Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;

    __ push(spilled_regs, sp);
    Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
        L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
        L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
        L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
        L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
        L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read whole register from str1. It is safe, because length >=8 here
    __ ldr(ch1, Address(str1));
    // Read whole register from str2. It is safe, because length >=8 here
    __ ldr(ch2, Address(str2));
    __ sub(cnt2, cnt2, cnt1);
    __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); // first pattern character
    if (str1_isL != str2_isL) {
      __ eor(v0, __ T16B, v0, v0); // zero vector for zip1-based widening below
    }
    // Replicate the first character into every lane of tmp1 (SWAR search).
    __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
    __ mul(first, first, tmp1);
    // check if we have less than 1 register to check
    __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
    if (str1_isL != str2_isL) {
      __ fmovd(v1, ch1);
    }
    __ br(__ LE, L_SMALL);
    // SWAR zero-byte trick: (x - 0x01..01) & ~(x | 0x7f..7f) has the high bit
    // set in every lane where x held a zero, i.e. where ch2 matched "first".
    __ eor(ch2, first, ch2);
    if (str1_isL != str2_isL) {
      __ zip1(v1, __ T16B, v1, v0); // widen pattern bytes to UTF-16
    }
    __ sub(tmp2, ch2, tmp1);
    __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
    __ bics(tmp2, tmp2, ch2);
    if (str1_isL != str2_isL) {
      __ fmovd(ch1, v1);
    }
    __ br(__ NE, L_HAS_ZERO);
    __ subs(cnt2, cnt2, wordSize/str2_chr_size);
    __ add(result, result, wordSize/str2_chr_size);
    __ add(str2, str2, wordSize);
    __ br(__ LT, L_POST_LOOP);
    // Main loop: scan str2 one register at a time for the first character.
    __ BIND(L_LOOP);
      __ ldr(ch2, Address(str2));
      __ eor(ch2, first, ch2);
      __ sub(tmp2, ch2, tmp1);
      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      __ bics(tmp2, tmp2, ch2);
      __ br(__ NE, L_HAS_ZERO);
    __ BIND(L_LOOP_PROCEED);
      __ subs(cnt2, cnt2, wordSize/str2_chr_size);
      __ add(str2, str2, wordSize);
      __ add(result, result, wordSize/str2_chr_size);
      __ br(__ GE, L_LOOP);
    __ BIND(L_POST_LOOP);
      __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
      __ br(__ LE, NOMATCH);
      __ ldr(ch2, Address(str2));
      __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
      __ eor(ch2, first, ch2);
      __ sub(tmp2, ch2, tmp1);
      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      __ mov(tmp4, -1); // all bits set
      __ b(L_SMALL_PROCEED);
    __ align(OptoLoopAlignment);
    // Small-remainder path: fewer characters left than one register holds.
    __ BIND(L_SMALL);
      __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
      __ eor(ch2, first, ch2);
      if (str1_isL != str2_isL) {
        __ zip1(v1, __ T16B, v1, v0);
      }
      __ sub(tmp2, ch2, tmp1);
      __ mov(tmp4, -1); // all bits set
      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      if (str1_isL != str2_isL) {
        __ fmovd(ch1, v1); // move converted 4 symbols
      }
    __ BIND(L_SMALL_PROCEED);
      __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
      __ bic(tmp2, tmp2, ch2);
      __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
      __ rbit(tmp2, tmp2); // match bits now ordered first-match-first for clz
      __ br(__ EQ, NOMATCH);
    __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
      __ cmp(cnt1, u1(wordSize/str2_chr_size));
      __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
      if (str2_isL) { // LL
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
        __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
      } else {
        __ mov(ch2, 0xE); // all bits in byte set except last one
        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
        // NOTE(review): str2 is advanced twice by the same shifted amount in
        // this branch (here and two lines above) — mirrored in
        // L_SMALL_CMP_LOOP_LAST_CMP2 below, so it appears deliberate; confirm
        // against the pointer bookkeeping before touching.
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      }
      __ cmp(ch1, ch2);
      __ mov(tmp4, wordSize/str2_chr_size);
      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
    // Character-by-character confirmation of the candidate match.
    __ BIND(L_SMALL_CMP_LOOP);
      str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
               : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
      str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
               : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
      __ add(tmp4, tmp4, 1);
      __ cmp(tmp4, cnt1);
      __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
      __ cmp(first, ch2);
      __ br(__ EQ, L_SMALL_CMP_LOOP);
    __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
      __ cbz(tmp2, NOMATCH); // no more matches.
// exit
      __ clz(tmp4, tmp2);
      __ add(result, result, 1); // advance index
      __ add(str2, str2, str2_chr_size); // advance pointer
      __ b(L_SMALL_HAS_ZERO_LOOP);
    __ align(OptoLoopAlignment);
    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
      __ cmp(first, ch2);
      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
      __ b(DONE);
    __ align(OptoLoopAlignment);
    // Pattern fits in one register: single whole-register compare suffices.
    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
      if (str2_isL) { // LL
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
        __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
      } else {
        __ mov(ch2, 0xE); // all bits in byte set except last one
        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      }
      __ cmp(ch1, ch2);
      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
      __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_HAS_ZERO);
      __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
      // Now, perform compression of counters(cnt2 and cnt1) into one register.
      // It's fine because both counters are 32bit and are not changed in this
      // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
      __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
      __ sub(result, result, 1);
    __ BIND(L_HAS_ZERO_LOOP);
      __ mov(cnt1, wordSize/str2_chr_size);
      __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
      __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
      if (str2_isL) {
        __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(tmp4, tmp4, 1);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ lsl(tmp2, tmp2, 1);
        __ mov(tmp4, wordSize/str2_chr_size);
      } else {
        __ mov(ch2, 0xE);
        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(tmp4, tmp4, 1);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
        __ lsl(tmp2, tmp2, 1);
        __ mov(tmp4, wordSize/str2_chr_size);
        __ sub(str2, str2, str2_chr_size);
      }
      __ cmp(ch1, ch2);
      __ mov(tmp4, wordSize/str2_chr_size);
      __ br(__ NE, L_CMP_LOOP_NOMATCH);
    // Character-by-character confirmation loop (main, non-small path);
    // cnt1 is re-used as the str1 scratch here (restored on exit, see above).
    __ BIND(L_CMP_LOOP);
      str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
               : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
      str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
               : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
      __ add(tmp4, tmp4, 1);
      __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
      __ br(__ GE, L_CMP_LOOP_LAST_CMP);
      __ cmp(cnt1, ch2);
      __ br(__ EQ, L_CMP_LOOP);
    __ BIND(L_CMP_LOOP_NOMATCH);
      // here we're not matched
      __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
      __ clz(tmp4, tmp2);
      __ add(str2, str2, str2_chr_size); // advance pointer
      __ b(L_HAS_ZERO_LOOP);
    __ align(OptoLoopAlignment);
    __ BIND(L_CMP_LOOP_LAST_CMP);
      __ cmp(cnt1, ch2);
      __ br(__ NE, L_CMP_LOOP_NOMATCH);
      __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_CMP_LOOP_LAST_CMP2);
      if (str2_isL) {
        __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(tmp4, tmp4, 1);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ lsl(tmp2, tmp2, 1);
      } else {
        __ mov(ch2, 0xE);
        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(tmp4, tmp4, 1);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
        __ lsl(tmp2, tmp2, 1);
        __ sub(str2, str2, str2_chr_size);
      }
      __ cmp(ch1, ch2);
      __ br(__ NE, L_CMP_LOOP_NOMATCH);
      __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
      // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
      // so, result was increased at max by wordSize/str2_chr_size - 1, so,
      // respective high bit wasn't changed. L_LOOP_PROCEED will increase
      // result by analyzed characters value, so, we can just reset lower bits
      // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
      // 2) restore cnt1 and cnt2 values from "compressed" cnt2
      // 3) advance str2 value to represent next str2 octet. result & 7/3 is
      // index of last analyzed substring inside current octet. So, str2 in at
      // respective start address. We need to advance it to next octet
      __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
      __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
      __ bfm(result, zr, 0, 2 - str2_chr_shift);
      __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
      __ movw(cnt2, cnt2); // 32-bit move drops the compressed cnt1 high half
      __ b(L_LOOP_PROCEED);
    __ align(OptoLoopAlignment);
    __ BIND(NOMATCH);
      __ mov(result, -1);
    __ BIND(DONE);
      __ pop(spilled_regs, sp);
      __ ret(lr);
    return entry;
  }

  // Install the three indexof stubs (LL, UU, UL; there is no LU variant).
  void generate_string_indexof_stubs() {
    StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
    StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
    StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
  }

  // Inflate 2 x 16 Latin1 bytes (src1, src2) to 32 UTF-16 chars by zipping
  // with the zero vector in v0, and store 64 bytes to dst (r1) post-indexed.
  // Optionally emits a store-stream prefetch for dst.  Clobbers v1-v4.
  void inflate_and_store_2_fp_registers(bool generatePrfm,
      FloatRegister src1, FloatRegister src2) {
    Register dst = r1;
    __ zip1(v1, __ T16B, src1, v0);
    __ zip2(v2, __ T16B, src1, v0);
    if (generatePrfm) {
      __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
    }
    __ zip1(v3, __ T16B, src2, v0);
    __ zip2(v4, __ T16B, src2, v0);
    __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
  }

  // R0 = src
  // R1 = dst
  // R2 = len
  // R3 = len >>
// 3
  // V0 = 0
  // v1 = loaded 8 bytes
  // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
  address generate_large_byte_array_inflate() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
    address entry = __ pc();
    Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
    Register src = r0, dst = r1, len = r2, octetCounter = r3;
    const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;

    // do one more 8-byte read to have address 16-byte aligned in most cases
    // also use single store instruction
    __ ldrd(v2, __ post(src, 8));
    __ sub(octetCounter, octetCounter, 2);
    __ zip1(v1, __ T16B, v1, v0);
    __ zip1(v2, __ T16B, v2, v0);
    __ st1(v1, v2, __ T16B, __ post(dst, 32));
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ subs(rscratch1, octetCounter, large_loop_threshold);
    __ br(__ LE, LOOP_START);
    __ b(LOOP_PRFM_START);
    // 64-bytes-per-iteration loop with software prefetch of src (the dst
    // prefetches are issued inside inflate_and_store_2_fp_registers).
    __ bind(LOOP_PRFM);
      __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ bind(LOOP_PRFM_START);
      __ prfm(Address(src, SoftwarePrefetchHintDistance));
      __ sub(octetCounter, octetCounter, 8);
      __ subs(rscratch1, octetCounter, large_loop_threshold);
      inflate_and_store_2_fp_registers(true, v3, v4);
      inflate_and_store_2_fp_registers(true, v5, v6);
      __ br(__ GT, LOOP_PRFM);
      __ cmp(octetCounter, (u1)8);
      __ br(__ LT, DONE);
    // Same loop without prefetching, for the final stretch.
    __ bind(LOOP);
      __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ bind(LOOP_START);
      __ sub(octetCounter, octetCounter, 8);
      __ cmp(octetCounter, (u1)8);
      inflate_and_store_2_fp_registers(false, v3, v4);
      inflate_and_store_2_fp_registers(false, v5, v6);
      __ br(__ GE, LOOP);
    __ bind(DONE);
      __ ret(lr);
    return entry;
  }

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - current state address
   *   c_rarg1   - H key address
   *   c_rarg2   - data address
   *   c_rarg3   - number of blocks
   *
   * Output:
   *   Updated state at c_rarg0
   */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order.  For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order through the
    // calculation, bit-reversing the inputs and outputs.

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    __ ldrq(v24, p); // The field polynomial

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
    __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                        /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
                        /*temps*/v6, v3, /*reuse/clobber b*/v2);
      // Reduce v7:v5 by the field polynomial
      __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);

      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0
    __ rev64(v0, __ T16B, v0);
    __ rbit(v0, __ T16B, v0);

    __ st1(v0, __ T16B, state);
    __ ret(lr);

    return start;
  }

  // Wide (unrolled) GHASH stub.  Falls back to the single-block stub
  // generated above ("small") when fewer than unroll*2 blocks remain,
  // both on entry and for any tail left after the wide processing.
  address generate_ghash_processBlocks_wide() {
    address small = generate_ghash_processBlocks();

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e.
p = z^7+z^2+z+1) 6263 // repeated in the low and high parts of a 6264 // 128-bit vector 6265 __ emit_int64(0x87); 6266 6267 __ align(CodeEntryAlignment); 6268 address start = __ pc(); 6269 6270 Register state = c_rarg0; 6271 Register subkeyH = c_rarg1; 6272 Register data = c_rarg2; 6273 Register blocks = c_rarg3; 6274 6275 const int unroll = 4; 6276 6277 __ cmp(blocks, (unsigned char)(unroll * 2)); 6278 __ br(__ LT, small); 6279 6280 if (unroll > 1) { 6281 // Save state before entering routine 6282 __ sub(sp, sp, 4 * 16); 6283 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 6284 __ sub(sp, sp, 4 * 16); 6285 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 6286 } 6287 6288 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 6289 6290 if (unroll > 1) { 6291 // And restore state 6292 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 6293 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 6294 } 6295 6296 __ cmp(blocks, (unsigned char)0); 6297 __ br(__ GT, small); 6298 6299 __ ret(lr); 6300 6301 return start; 6302 } 6303 6304 void generate_base64_encode_simdround(Register src, Register dst, 6305 FloatRegister codec, u8 size) { 6306 6307 FloatRegister in0 = v4, in1 = v5, in2 = v6; 6308 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 6309 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 6310 6311 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
        __ T16B : __ T8B;

    // De-interleaving load: in0/in1/in2 receive byte 0/1/2 of every 3-byte group.
    __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));

    // ind0 = top 6 bits of byte 0
    __ ushr(ind0, arrangement, in0, 2);

    // ind1 = low 2 bits of byte 0 (at bits 5:4) | top 4 bits of byte 1
    __ ushr(ind1, arrangement, in1, 2);
    __ shl(in0, arrangement, in0, 6);
    __ orr(ind1, arrangement, ind1, in0);
    __ ushr(ind1, arrangement, ind1, 2);

    // ind2 = low 4 bits of byte 1 (at bits 5:2) | top 2 bits of byte 2
    __ ushr(ind2, arrangement, in2, 4);
    __ shl(in1, arrangement, in1, 4);
    __ orr(ind2, arrangement, in1, ind2);
    __ ushr(ind2, arrangement, ind2, 2);

    // ind3 = low 6 bits of byte 2
    __ shl(ind3, arrangement, in2, 2);
    __ ushr(ind3, arrangement, ind3, 2);

    // Translate the four 6-bit index vectors through the codec table
    // held in four consecutive vector registers starting at codec.
    __ tbl(out0, arrangement, codec, 4, ind0);
    __ tbl(out1, arrangement, codec, 4, ind1);
    __ tbl(out2, arrangement, codec, 4, ind2);
    __ tbl(out3, arrangement, codec, 4, ind3);

    // Interleaving store: out0..out3 become consecutive output bytes.
    __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
  }

  /**
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - src_start
   *  c_rarg1   - src_offset
   *  c_rarg2   - src_length
   *  c_rarg3   - dest_start
   *  c_rarg4   - dest_offset
   *  c_rarg5   - isURL
   *
   */
  address generate_base64_encodeBlock() {

    // RFC 4648 standard Base64 alphabet.
    static const char toBase64[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
    };

    // RFC 4648 URL- and filename-safe alphabet ('-' and '_' replace '+' and '/').
    static const char toBase64URL[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "encodeBlock");
    address start = __ pc();

    Register src   = c_rarg0;  // source array
    Register soff  = c_rarg1;  // source start offset
    Register send  = c_rarg2;  // source end offset
    Register dst   = c_rarg3;  // dest array
    Register doff  = c_rarg4;  // position for writing to dest array
    Register isURL = c_rarg5;  // Base64 or URL character set

    // c_rarg6 and c_rarg7 are free to use as temps
    Register codec  = c_rarg6;
    Register length = c_rarg7;

    Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;

    __ add(src, src, soff);
    __ add(dst, dst, doff);
    __ sub(length, send, soff);

    // load the codec base address
    __ lea(codec, ExternalAddress((address) toBase64));
    __ cbz(isURL, ProcessData);
    __ lea(codec, ExternalAddress((address) toBase64URL));

    __ BIND(ProcessData);

    // too short to formup a SIMD loop, roll back
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, Process3B);

    // Load the full 64-entry codec table into v0..v3 for tbl lookups.
    __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));

    // Main loop: 48 source bytes (64 output chars) per full-vector round.
    __ BIND(Process48B);
    __ cmp(length, (u1)48);
    __ br(Assembler::LT, Process24B);
    generate_base64_encode_simdround(src, dst, v0, 16);
    __ sub(length, length, 48);
    __ b(Process48B);

    // One optional half-vector round: 24 source bytes, 32 output chars.
    __ BIND(Process24B);
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, SIMDExit);
    generate_base64_encode_simdround(src, dst, v0, 8);
    __ sub(length, length, 24);

    __ BIND(SIMDExit);
    __ cbz(length, Exit);

    // Scalar tail: one 3-byte group (4 output chars) per iteration.
    __ BIND(Process3B);
    //  3 src bytes, 24 bits
    __ ldrb(r10, __ post(src, 1));
    __ ldrb(r11, __ post(src, 1));
    __ ldrb(r12, __ post(src, 1));
    __ orrw(r11, r11, r10, Assembler::LSL, 8);
    __ orrw(r12, r12, r11, Assembler::LSL, 8);
    // codec index: split the 24 bits in r12 into four 6-bit fields
    __ ubfmw(r15, r12, 18, 23);
    __ ubfmw(r14, r12, 12, 17);
    __ ubfmw(r13, r12, 6, 11);
    __ andw(r12, r12, 63);
    // get the code based on the codec
    __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
    __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
    __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
    __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
    // emit the four encoded characters
    __ strb(r15, __ post(dst, 1));
    __ strb(r14, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ sub(length, length, 3);
    __ cbnz(length, Process3B);

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }

  // Emit one SIMD round of the Base64 decoder: load 4 * size input
  // characters, map each through a two-stage tbl/tbx lookup (codecL for
  // values 0..63, codecH for 64..127), pack the resulting 6-bit values
  // into 3 * size output bytes, and branch to Exit on illegal input.
  // v27 must hold the splatted constant 63.  size is 16 or 8.
  void generate_base64_decode_simdround(Register src, Register dst,
             FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {

    FloatRegister in0  = v16, in1  = v17, in2  = v18, in3  = v19;
    FloatRegister out0 = v20, out1 = v21, out2 = v22;

    FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
    FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;

    Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;

    Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

    // De-interleaving load: in0..in3 receive char 0..3 of each 4-char group.
    __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));

    // we need unsigned saturating subtract, to make sure all input values
    // in range [0, 63] will have 0U value in the higher half lookup
    __ uqsubv(decH0, __ T16B, in0, v27);
    __ uqsubv(decH1, __ T16B, in1, v27);
    __ uqsubv(decH2, __ T16B, in2, v27);
    __ uqsubv(decH3, __ T16B, in3, v27);

    // lower half lookup (tbl zeroes out-of-range indices)
    __ tbl(decL0, arrangement, codecL, 4, in0);
    __ tbl(decL1, arrangement, codecL, 4, in1);
    __ tbl(decL2, arrangement, codecL, 4, in2);
    __ tbl(decL3, arrangement, codecL, 4, in3);

    // higher half lookup (tbx leaves out-of-range lanes unchanged)
    __ tbx(decH0, arrangement, codecH, 4, decH0);
    __ tbx(decH1, arrangement, codecH, 4, decH1);
    __ tbx(decH2, arrangement, codecH, 4, decH2);
    __ tbx(decH3, arrangement, codecH, 4, decH3);

    // combine lower and higher
    __ orr(decL0, arrangement, decL0, decH0);
    __ orr(decL1, arrangement, decL1, decH1);
    __ orr(decL2, arrangement, decL2, decH2);
    __ orr(decL3, arrangement, decL3, decH3);

    // check illegal inputs, value larger than 63 (maximum of 6 bits)
    __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
    __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
    __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
    __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
    // fold the per-char error masks into a single scalar in rscratch2
    __ orr(in0, arrangement, decH0, decH1);
    __ orr(in1, arrangement, decH2, decH3);
    __ orr(in2, arrangement, in0, in1);
    __ umaxv(in3, arrangement, in2);
    __ umov(rscratch2, in3, __ B, 0);

    // get the data to output: pack four 6-bit values into three bytes
    __ shl(out0, arrangement, decL0, 2);
    __ ushr(out1, arrangement, decL1, 4);
    __ orr(out0, arrangement, out0, out1);
    __ shl(out1, arrangement, decL1, 4);
    __ ushr(out2, arrangement, decL2, 2);
    __ orr(out1, arrangement, out1, out2);
    __ shl(out2, arrangement, decL2, 6);
    __ orr(out2, arrangement, out2,
        decL3);

    // rscratch2 == 0 means every input character was legal
    __ cbz(rscratch2, NoIllegalData);

    // handle illegal input
    __ umov(r10, in2, __ D, 0);
    if (size == 16) {
      __ cbnz(r10, ErrorInLowerHalf);

      // illegal input is in higher half, store the lower half now.
      __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));

      // move the upper-half error mask and decoded bytes to GPRs for the
      // byte-by-byte legal-prefix store below
      __ umov(r10, in2, __ D, 1);
      __ umov(r11, out0, __ D, 1);
      __ umov(r12, out1, __ D, 1);
      __ umov(r13, out2, __ D, 1);
      __ b(StoreLegalData);

      __ BIND(ErrorInLowerHalf);
    }
    __ umov(r11, out0, __ D, 0);
    __ umov(r12, out1, __ D, 0);
    __ umov(r13, out2, __ D, 0);

    // Store decoded bytes one group at a time until the error mask in
    // r10 flags the offending character, then leave through Exit.
    __ BIND(StoreLegalData);
    __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
    __ strb(r11, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ lsr(r10, r10, 8);
    __ lsr(r11, r11, 8);
    __ lsr(r12, r12, 8);
    __ lsr(r13, r13, 8);
    __ b(StoreLegalData);

    // Fast path: no illegal characters, store everything interleaved.
    __ BIND(NoIllegalData);
    __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
  }


  /**
   * Arguments:
   *
   * Input:
   * c_rarg0   - src_start
   * c_rarg1   - src_offset
   * c_rarg2   - src_length
   * c_rarg3   - dest_start
   * c_rarg4   - dest_offset
   * c_rarg5   - isURL
   * c_rarg6   - isMIME
   *
   */
  address generate_base64_decodeBlock() {

    // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
    // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
    // titled "Base64 decoding".

    // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64,
    // except the trailing character '=' is also treated illegal value in this intrinsic. That
    // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
    static const uint8_t fromBase64ForNoSIMD[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
       52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
       15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
      255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
       41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    static const uint8_t fromBase64URLForNoSIMD[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
       52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
       15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
      255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
       41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    // A legal value of base64 code is in range [0, 127]. We need two lookups
    // with tbl/tbx and combine them to get the decode data. The 1st table vector
    // lookup use tbl, out of range indices are set to 0 in destination. The 2nd
    // table vector lookup use tbx, out of range indices are unchanged in
    // destination. Input [64..126] is mapped to index [65, 127] in second lookup.
    // The value of index 64 is set to 0, so that we know that we already get the
    // decoded data with the 1st lookup.
    static const uint8_t fromBase64ForSIMD[128] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
       52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
        0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
       14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
      255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
       40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
    };

    static const uint8_t fromBase64URLForSIMD[128] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
       52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
        0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
       14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
       63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
       40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "decodeBlock");
    address start = __ pc();

    Register src    = c_rarg0;  // source array
    Register soff   = c_rarg1;  // source start offset
    Register send   = c_rarg2;  // source end offset
    Register dst    = c_rarg3;  // dest array
    Register doff   = c_rarg4;  // position for writing to dest array
    Register isURL  = c_rarg5;  // Base64 or URL character set
    Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation

    Register length = send;    // reuse send as length of source data to process

    Register simd_codec   = c_rarg6;
    Register nosimd_codec = c_rarg7;

    Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;

    __ enter();

    __ add(src, src, soff);
    __ add(dst, dst, doff);

    // doff is free after this; remember the output start so Exit can
    // compute the number of bytes produced.
    __ mov(doff, dst);

    __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1); // clear the two low bits: source is consumed in 4-char units

    __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
    __ cbz(isURL, ProcessData);
    __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));

    __ BIND(ProcessData);
    __ mov(rscratch1, length);
    __ cmp(length, (u1)144); // 144 = 80 + 64
    __ br(Assembler::LT, Process4B);

    // In the MIME case, the line length cannot be more than 76
    // bytes (see RFC 2045). This is too short a block for SIMD
    // to be worthwhile, so we use non-SIMD here.
    // rscratch1 = 79 limits the scalar pre-pass to the first 80 bytes;
    // the remainder is handled by the SIMD loops below.
    __ movw(rscratch1, 79);

    __ BIND(Process4B);
    __ ldrw(r14, __ post(src, 4));
    // split the 4 characters into r10..r13
    __ ubfxw(r10, r14, 0,  8);
    __ ubfxw(r11, r14, 8,  8);
    __ ubfxw(r12, r14, 16, 8);
    __ ubfxw(r13, r14, 24, 8);
    // get the de-code
    __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
    __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
    __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
    __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
    // error detection, 255u indicates an illegal input
    __ orrw(r14, r10, r11);
    __ orrw(r15, r12, r13);
    __ orrw(r14, r14, r15);
    __ tbnz(r14, 7, Exit);
    // recover the data: assemble the three decoded bytes from the four 6-bit values
    __ lslw(r14, r10, 10);
    __ bfiw(r14, r11, 4, 6);
    __ bfmw(r14, r12, 2, 5);
    __ rev16w(r14, r14);
    __ bfiw(r13, r12, 6, 2);
    __ strh(r14, __ post(dst, 2));
    __ strb(r13, __ post(dst, 1));
    // non-simd loop
    __ subsw(rscratch1, rscratch1, 4);
    __ br(Assembler::GT, Process4B);

    // if exiting from PreProcess80B, rscratch1 == -1;
    // otherwise, rscratch1 == 0.
    // rscratch1 == 0 means the whole input was consumed scalar: done.
    // rscratch1 == -1 means only the 80-byte pre-pass ran: fall through to SIMD.
    __ cbzw(rscratch1, Exit);
    __ sub(length, length, 80);

    __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
    __ cbz(isURL, SIMDEnter);
    __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));

    __ BIND(SIMDEnter);
    // Load the 128-entry SIMD table: low half into v0..v3, high half into v4..v7.
    __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
    __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
    // v27 = splat(63), used by the decode rounds for range checking.
    __ mov(rscratch1, 63);
    __ dup(v27, __ T16B, rscratch1);

    // Main loop: 64 input chars (48 output bytes) per full-vector round.
    __ BIND(Process64B);
    __ cmp(length, (u1)64);
    __ br(Assembler::LT, Process32B);
    generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
    __ sub(length, length, 64);
    __ b(Process64B);

    // Half-vector rounds: 32 input chars (24 output bytes) each.
    __ BIND(Process32B);
    __ cmp(length, (u1)32);
    __ br(Assembler::LT, SIMDExit);
    generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
    __ sub(length, length, 32);
    __ b(Process32B);

    __ BIND(SIMDExit);
    __ cbz(length, Exit);
    // fewer than 32 chars left: finish with the scalar loop
    __ movw(rscratch1, length);
    __ b(Process4B);

    __ BIND(Exit);
    // return the number of bytes written (dst advanced past each store;
    // doff still holds the original output start)
    __ sub(c_rarg0, dst, doff);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Support for spin waits.
  address generate_spin_wait() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "spin_wait");
    address start = __ pc();

    __ spin_wait();
    __ ret(lr);

    return start;
  }

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

  // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
  //
  // If LSE is in use, generate LSE versions of all the stubs. The
  // non-LSE versions are in atomic_aarch64.S.

  // class AtomicStubMark records the entry point of a stub and the
  // stub pointer which will point to it. The stub pointer is set to
  // the entry point when ~AtomicStubMark() is called, which must be
  // after ICache::invalidate_range. This ensures safe publication of
  // the generated code.
  class AtomicStubMark {
    address _entry_point;
    aarch64_atomic_stub_t *_stub;
    MacroAssembler *_masm;
  public:
    AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
      _masm = masm;
      __ align(32);
      _entry_point = __ pc();
      _stub = stub;
    }
    ~AtomicStubMark() {
      // Publish the entry point only after the caller has invalidated
      // the instruction cache (destructor runs at end of enclosing scope).
      *_stub = (aarch64_atomic_stub_t)_entry_point;
    }
  };

  // NB: For memory_order_conservative we need a trailing membar after
  // LSE atomic operations but not a leading membar.
  //
  // We don't need a leading membar because a clause in the Arm ARM
  // says:
  //
  //   Barrier-ordered-before
  //
  //   Barrier instructions order prior Memory effects before subsequent
  //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW 2 from the same
  //   Observer if and only if RW1 appears in program order before RW 2
  //   and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
  //   instruction with both Acquire and Release semantics.
  //
  // All the atomic instructions {ldaddal, swapal, casal} have Acquire
  // and Release semantics, therefore we don't need a leading
  // barrier. However, there is no corresponding Barrier-ordered-after
  // relationship, therefore we need a trailing membar to prevent a
  // later store or load from being reordered with the store in an
  // atomic instruction.
  //
  // This was checked by using the herd7 consistency model simulator
  // (http://diy.inria.fr/) with this test case:
  //
  // AArch64 LseCas
  // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
  // P0          | P1;
  // LDR W4, [X2] | MOV W3, #0;
  // DMB LD      | MOV W4, #1;
  // LDR W3, [X1] | CASAL W3, W4, [X1];
  //             | DMB ISH;
  //             | STR W4, [X2];
  // exists
  // (0:X3=0 /\ 0:X4=1)
  //
  // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
  // with the store to x in P1. Without the DMB in P1 this may happen.
  //
  // At the time of writing we don't know of any AArch64 hardware that
  // reorders stores in this way, but the Reference Manual permits it.

  // Emit an LSE compare-and-swap stub body: in  c_rarg0 = ptr,
  // c_rarg1 = compare value, c_rarg2 = exchange value; returns the
  // previous memory value in r0.
  void gen_cas_entry(Assembler::operand_size size,
                     atomic_memory_order order) {
    Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
      exchange_val = c_rarg2;
    bool acquire, release;
    switch (order) {
      case memory_order_relaxed:
        acquire = false;
        release = false;
        break;
      case memory_order_release:
        acquire = false;
        release = true;
        break;
      default:
        // conservative and seq_cst both get acquire+release semantics
        acquire = true;
        release = true;
        break;
    }
    __ mov(prev, compare_val);
    __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
    if (order == memory_order_conservative) {
      // trailing barrier -- see the Barrier-ordered-before note above
      __ membar(Assembler::StoreStore|Assembler::StoreLoad);
    }
    if (size == Assembler::xword) {
      __ mov(r0, prev);
    } else {
      __ movw(r0, prev);
    }
    __ ret(lr);
  }

  // Emit an LSE fetch-and-add stub body: c_rarg0 = addr, c_rarg1 = increment;
  // returns the previous memory value in r0.
  void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
    Register prev = r2, addr = c_rarg0, incr = c_rarg1;
    // If not relaxed, then default to conservative. Relaxed is the only
    // case we use enough to be worth specializing.
    if (order == memory_order_relaxed) {
      __ ldadd(size, incr, prev, addr);
    } else {
      __ ldaddal(size, incr, prev, addr);
      __ membar(Assembler::StoreStore|Assembler::StoreLoad);
    }
    if (size == Assembler::xword) {
      __ mov(r0, prev);
    } else {
      __ movw(r0, prev);
    }
    __ ret(lr);
  }

  // Emit an LSE atomic-exchange stub body (conservative ordering only):
  // c_rarg0 = addr, c_rarg1 = new value; returns the previous value in r0.
  void gen_swpal_entry(Assembler::operand_size size) {
    Register prev = r2, addr = c_rarg0, incr = c_rarg1;
    __ swpal(size, incr, prev, addr);
    __ membar(Assembler::StoreStore|Assembler::StoreLoad);
    if (size == Assembler::xword) {
      __ mov(r0, prev);
    } else {
      __ movw(r0, prev);
    }
    __ ret(lr);
  }

  // Generate all LSE atomic stubs and publish them through the
  // aarch64_atomic_*_impl function pointers (via AtomicStubMark).
  void generate_atomic_entry_points() {
    if (! UseLSE) {
      return;
    }

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "atomic entry points");
    address first_entry = __ pc();

    // ADD, memory_order_conservative
    AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
    gen_ldadd_entry(Assembler::word, memory_order_conservative);
    AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
    gen_ldadd_entry(Assembler::xword, memory_order_conservative);

    // ADD, memory_order_relaxed
    AtomicStubMark mark_fetch_add_4_relaxed
      (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
    gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
    AtomicStubMark mark_fetch_add_8_relaxed
      (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
    gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);

    // XCHG, memory_order_conservative
    AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
    gen_swpal_entry(Assembler::word);
    AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
    gen_swpal_entry(Assembler::xword);

    // CAS, memory_order_conservative
    AtomicStubMark mark_cmpxchg_1(_masm,
      &aarch64_atomic_cmpxchg_1_impl);
    gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
    AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_conservative);
    AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_conservative);

    // CAS, memory_order_relaxed
    AtomicStubMark mark_cmpxchg_1_relaxed
      (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
    gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
    AtomicStubMark mark_cmpxchg_4_relaxed
      (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
    AtomicStubMark mark_cmpxchg_8_relaxed
      (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);

    // CAS, memory_order_release
    AtomicStubMark mark_cmpxchg_4_release
      (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_release);
    AtomicStubMark mark_cmpxchg_8_release
      (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_release);

    // CAS, memory_order_seq_cst
    AtomicStubMark mark_cmpxchg_4_seq_cst
      (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
    AtomicStubMark mark_cmpxchg_8_seq_cst
      (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);

    // Invalidate before the AtomicStubMark destructors publish the
    // entry points -- see the AtomicStubMark comment above.
    ICache::invalidate_range(first_entry, __ pc() - first_entry);
  }
#endif // LINUX

  // Generate the body of a continuation-thaw stub for the given kind
  // (thaw_top, return barrier, or return-barrier-with-exception).
  address generate_cont_thaw(Continuation::thaw_kind kind) {
    bool return_barrier = Continuation::is_thaw_return_barrier(kind);
    bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);

    address start = __ pc();

    if (return_barrier) {
      // entered via a method return: reset sp to the continuation entry
      __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
      __ mov(sp, rscratch1);
    }
    assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");

    if (return_barrier) {
      // preserve possible return value from a method returning to the return barrier
      __ fmovd(rscratch1, v0);
      __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
    }

    __ movw(c_rarg1, (return_barrier ? 1 : 0));
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
    __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
      __ fmovd(v0, rscratch1);
    }
    assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");


    Label thaw_success;
    // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
    __ cbnz(rscratch2, thaw_success);
    __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
    __ br(rscratch1);
    __ bind(thaw_success);

    // make room for the thawed frames
    __ sub(rscratch1, sp, rscratch2);
    __ andr(rscratch1, rscratch1, -16); // align
    __ mov(sp, rscratch1);

    if (return_barrier) {
      // save original return value -- again
      __ fmovd(rscratch1, v0);
      __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
    }

    // If we want, we can templatize thaw by kind, and have three different entries
    __ movw(c_rarg1, (uint32_t)kind);

    __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
    __ mov(rscratch2, r0); // r0 is the sp of the yielding frame

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
      __ fmovd(v0, rscratch1);
    } else {
      __ mov(r0, zr); // return 0 (success) from doYield
    }

    // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
    __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
    __ mov(rfp, sp);

    if (return_barrier_exception) {
      __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
      __ authenticate_return_address(c_rarg1);
      __ verify_oop(r0);
      // save return value containing the exception oop in callee-saved R19
      __ mov(r19, r0);

      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);

      // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
      // __ reinitialize_ptrue();

      // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc

      __ mov(r1, r0); // the exception handler
      __ mov(r0, r19); // restore return value containing the exception oop
      __ verify_oop(r0);

      __ leave();
      __ mov(r3, lr); // r3 -- exception pc, expected by the exception blob
      __ br(r1); // the exception handler
    } else {
      // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
      __ leave();
      __ ret(lr);
    }

    return start;
  }

  // Thaw stub for freshly mounted continuations.
  address generate_cont_thaw() {
    if (!Continuations::enabled()) return nullptr;

    StubCodeMark mark(this, "StubRoutines", "Cont thaw");
    address start = __ pc();
    generate_cont_thaw(Continuation::thaw_top);
    return start;
  }

  // Thaw stub entered via the return barrier (normal return path).
  address generate_cont_returnBarrier() {
    if (!Continuations::enabled()) return nullptr;

    // TODO: will probably need multiple return barriers depending on return type
    StubCodeMark mark(this, "StubRoutines", "cont return barrier");
    address start = __ pc();

    generate_cont_thaw(Continuation::thaw_return_barrier);

    return start;
  }

  // Thaw stub entered via the return barrier when unwinding with an exception.
  address generate_cont_returnBarrier_exception() {
    if (!Continuations::enabled()) return nullptr;

    StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
    address start = __ pc();

    generate_cont_thaw(Continuation::thaw_return_barrier_exception);

    return start;
  }

  // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
  // are represented as long[5], with BITS_PER_LIMB = 26.
  // Pack five 26-bit limbs into three 64-bit registers.
  // dest2 receives the 2 high bits; if dest2 is noreg those bits are
  // asserted (in debug builds) to be zero instead.
  void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
    __ ldp(dest0, rscratch1, Address(src, 0));     // limbs 0 and 1
    __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
    __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));  // limbs 2 and 3
    __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits

    __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
    __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
    __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));  // limb 4
    __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits

    if (dest2->is_valid()) {
      __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
    } else {
#ifdef ASSERT
      Label OK;
      __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
      __ br(__ EQ, OK);
      __ stop("high bits of Poly1305 integer should be zero");
      __ should_not_reach_here();
      __ bind(OK);
#endif
    }
  }

  // As above, but return only a 128-bit integer, packed into two
  // 64-bit registers.
  void pack_26(Register dest0, Register dest1, Register src) {
    pack_26(dest0, dest1, noreg, src);
  }

  // Multiply and multiply-accumulate unsigned 64-bit registers.
  void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
    __ mul(prod_lo, n, m);
    __ umulh(prod_hi, n, m);
  }
  // sum_hi:sum_lo += n * m (128-bit accumulate; clobbers rscratch1/rscratch2)
  void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
    wide_mul(rscratch1, rscratch2, n, m);
    __ adds(sum_lo, sum_lo, rscratch1);
    __ adc(sum_hi, sum_hi, rscratch2);
  }

  // Poly1305, RFC 7539

  // See https://loup-vaillant.fr/tutorials/poly1305-design for a
  // description of the tricks used to simplify and accelerate this
  // computation.

  // Stub for Poly1305.processMultipleBlocks.  Arguments (from c_rarg0 up):
  // pointer to the input bytes, input length in bytes, pointer to the
  // accumulator long[5], and pointer to the key long[5] (both with
  // BITS_PER_LIMB = 26 — presumably matching IntegerPolynomial1305; see
  // pack_26 above).  Processes whole 16-byte blocks and writes the
  // updated accumulator back in limb form.
  address generate_poly1305_processBlocks() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
    address start = __ pc();
    Label here;
    __ enter();
    RegSet callee_saved = RegSet::range(r19, r28);
    __ push(callee_saved, sp);

    // Allocate registers from the argument registers upward, excluding
    // the platform register and the two scratch registers.
    RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();

    // Arguments
    const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;

    // R_n is the 128-bit randomly-generated key, packed into two
    // registers.  The caller passes this key to us as long[5], with
    // BITS_PER_LIMB = 26.
    const Register R_0 = *++regs, R_1 = *++regs;
    pack_26(R_0, R_1, r_start);

    // RR_n is (R_n >> 2) * 5
    const Register RR_0 = *++regs, RR_1 = *++regs;
    __ lsr(RR_0, R_0, 2);
    __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
    __ lsr(RR_1, R_1, 2);
    __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);

    // U_n is the current checksum
    const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
    pack_26(U_0, U_1, U_2, acc_start);

    static constexpr int BLOCK_LENGTH = 16;
    Label DONE, LOOP;

    // Skip the loop entirely if fewer than BLOCK_LENGTH bytes remain.
    __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
    __ br(Assembler::LT, DONE); {
      __ bind(LOOP);

      // S_n is to be the sum of U_n and the next block of data
      const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
      __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
      __ adds(S_0, U_0, S_0);
      __ adcs(S_1, U_1, S_1);
      __ adc(S_2, U_2, zr);
      __ add(S_2, S_2, 1);  // set the high bit of the 130-bit block value

      const Register U_0HI = *++regs, U_1HI = *++regs;

      // NB: this logic depends on some of the special properties of
      // Poly1305 keys. In particular, because we know that the top
      // four bits of R_0 and R_1 are zero, we can add together
      // partial products without any risk of needing to propagate a
      // carry out.
      wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
      wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
      __ andr(U_2, R_0, 3);
      __ mul(U_2, S_2, U_2);

      // Recycle registers S_0, S_1, S_2
      regs = (regs.remaining() + S_0 + S_1 + S_2).begin();

      // Partial reduction mod 2**130 - 5
      __ adds(U_1, U_0HI, U_1);
      __ adc(U_2, U_1HI, U_2);
      // Sum now in U_2:U_1:U_0.
      // Dead: U_0HI, U_1HI.
      regs = (regs.remaining() + U_0HI + U_1HI).begin();

      // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps

      // First, U_2:U_1:U_0 += (U_2 >> 2)
      __ lsr(rscratch1, U_2, 2);
      __ andr(U_2, U_2, (u8)3);
      __ adds(U_0, U_0, rscratch1);
      __ adcs(U_1, U_1, zr);
      __ adc(U_2, U_2, zr);
      // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
      __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
      __ adcs(U_1, U_1, zr);
      __ adc(U_2, U_2, zr);

      // Loop while at least one full block remains.
      __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
      __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
      __ br(~ Assembler::LT, LOOP);
    }

    // Further reduce modulo 2^130 - 5
    __ lsr(rscratch1, U_2, 2);
    __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2);  // rscratch1 = U_2 * 5
    __ adds(U_0, U_0, rscratch1);                                // U_0 += U_2 * 5
    __ adcs(U_1, U_1, zr);
    __ andr(U_2, U_2, (u1)3);
    __ adc(U_2, U_2, zr);

    // Unpack the sum into five 26-bit limbs and write to memory.
    __ ubfiz(rscratch1, U_0, 0, 26);           // limb 0: U_0 bits 0..25
    __ ubfx(rscratch2, U_0, 26, 26);           // limb 1: U_0 bits 26..51
    __ stp(rscratch1, rscratch2, Address(acc_start));
    __ ubfx(rscratch1, U_0, 52, 12);           // limb 2: low 12 bits from U_0 ...
    __ bfi(rscratch1, U_1, 12, 14);            //         ... high 14 bits from U_1
    __ ubfx(rscratch2, U_1, 14, 26);           // limb 3: U_1 bits 14..39
    __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
    __ ubfx(rscratch1, U_1, 40, 24);           // limb 4: low 24 bits from U_1 ...
    __ bfi(rscratch1, U_2, 24, 3);             //         ... plus bits from U_2
    __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));

    __ bind(DONE);
    __ pop(callee_saved, sp);
    __ leave();
    __ ret(lr);

    return start;
  }

#if INCLUDE_JFR

  // Set up a last_Java_frame and put the thread in c_rarg0 ahead of a
  // JFR runtime leaf call.
  static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
    __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
    __ mov(c_rarg0, thread);
  }

  // The handle is dereferenced through a load barrier.
  static void jfr_epilogue(MacroAssembler* _masm) {
    __ reset_last_Java_frame(true);
  }

  // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
  // It returns a jobject handle to the event writer.
  // The handle is dereferenced and the return value is the event writer oop.
  // Build the "jfr_write_checkpoint" RuntimeStub: calls
  // JfrIntrinsicSupport::write_checkpoint with the current thread and
  // resolves the returned jobject handle into the event writer oop (r0).
  static RuntimeStub* generate_jfr_write_checkpoint() {
    // Frame layout in words; framesize is inclusive of the return address.
    enum layout {
      rbp_off,
      rbpH_off,
      return_off,
      return_off2,
      framesize // inclusive of return address
    };

    int insts_size = 1024;
    int locs_size = 64;
    CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
    OopMapSet* oop_maps = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);
    MacroAssembler* _masm = masm;

    address start = __ pc();
    __ enter();
    int frame_complete = __ pc() - start;
    address the_pc = __ pc();
    jfr_prologue(the_pc, _masm, rthread);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
    jfr_epilogue(_masm);
    // Turn the returned jobject handle into the event writer oop.
    __ resolve_global_jobject(r0, rscratch1, rscratch2);
    __ leave();
    __ ret(lr);

    OopMap* map = new OopMap(framesize, 1); // rfp
    oop_maps->add_gc_map(the_pc - start, map);

    RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
      RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                    oop_maps, false);
    return stub;
  }

  // For c2: call to return a leased buffer.
  // Build the "jfr_return_lease" RuntimeStub: calls
  // JfrIntrinsicSupport::return_lease with the current thread.  Same
  // frame shape as generate_jfr_write_checkpoint, but no handle to
  // resolve on return.
  static RuntimeStub* generate_jfr_return_lease() {
    // Frame layout in words; framesize is inclusive of the return address.
    enum layout {
      rbp_off,
      rbpH_off,
      return_off,
      return_off2,
      framesize // inclusive of return address
    };

    int insts_size = 1024;
    int locs_size = 64;
    CodeBuffer code("jfr_return_lease", insts_size, locs_size);
    OopMapSet* oop_maps = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);
    MacroAssembler* _masm = masm;

    address start = __ pc();
    __ enter();
    int frame_complete = __ pc() - start;
    address the_pc = __ pc();
    jfr_prologue(the_pc, _masm, rthread);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
    jfr_epilogue(_masm);

    __ leave();
    __ ret(lr);

    OopMap* map = new OopMap(framesize, 1); // rfp
    oop_maps->add_gc_map(the_pc - start, map);

    RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
      RuntimeStub::new_runtime_stub("jfr_return_lease", &code, frame_complete,
                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                    oop_maps, false);
    return stub;
  }

#endif // INCLUDE_JFR

  // exception handler for upcall stubs
  address generate_upcall_stub_exception_handler() {
    StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
    address start = __ pc();

    // Native caller has no idea how to handle exceptions,
    // so we just crash here. Up to callee to catch exceptions.
    __ verify_oop(r0);  // exception oop expected in r0
    __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
    __ blr(rscratch1);
    __ should_not_reach_here();

    return start;
  }

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
Since we need to preserve callee-saved values (currently 7364 // only for C2, but done for C1 as well) we need a callee-saved oop 7365 // map and therefore have to make these stubs into RuntimeStubs 7366 // rather than BufferBlobs. If the compiler needs all registers to 7367 // be preserved between the fault point and the exception handler 7368 // then it must assume responsibility for that in 7369 // AbstractCompiler::continuation_for_implicit_null_exception or 7370 // continuation_for_implicit_division_by_zero_exception. All other 7371 // implicit exceptions (e.g., NullPointerException or 7372 // AbstractMethodError on entry) are either at call sites or 7373 // otherwise assume that stack unwinding will be initiated, so 7374 // caller saved registers were assumed volatile in the compiler. 7375 7376 #undef __ 7377 #define __ masm-> 7378 7379 address generate_throw_exception(const char* name, 7380 address runtime_entry, 7381 Register arg1 = noreg, 7382 Register arg2 = noreg) { 7383 // Information about frame layout at time of blocking runtime call. 7384 // Note that we only have to preserve callee-saved registers since 7385 // the compilers are responsible for supplying a continuation point 7386 // if they expect all registers to be preserved. 7387 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0 7388 enum layout { 7389 rfp_off = 0, 7390 rfp_off2, 7391 return_off, 7392 return_off2, 7393 framesize // inclusive of return address 7394 }; 7395 7396 int insts_size = 512; 7397 int locs_size = 64; 7398 7399 CodeBuffer code(name, insts_size, locs_size); 7400 OopMapSet* oop_maps = new OopMapSet(); 7401 MacroAssembler* masm = new MacroAssembler(&code); 7402 7403 address start = __ pc(); 7404 7405 // This is an inlined and slightly modified version of call_VM 7406 // which has the ability to fetch the return PC out of 7407 // thread-local storage and also sets up last_Java_sp slightly 7408 // differently than the real call_VM 7409 7410 __ enter(); // Save FP and LR before call 7411 7412 assert(is_even(framesize/2), "sp not 16-byte aligned"); 7413 7414 // lr and fp are already in place 7415 __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog 7416 7417 int frame_complete = __ pc() - start; 7418 7419 // Set up last_Java_sp and last_Java_fp 7420 address the_pc = __ pc(); 7421 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 7422 7423 // Call runtime 7424 if (arg1 != noreg) { 7425 assert(arg2 != c_rarg1, "clobbered"); 7426 __ mov(c_rarg1, arg1); 7427 } 7428 if (arg2 != noreg) { 7429 __ mov(c_rarg2, arg2); 7430 } 7431 __ mov(c_rarg0, rthread); 7432 BLOCK_COMMENT("call runtime_entry"); 7433 __ mov(rscratch1, runtime_entry); 7434 __ blr(rscratch1); 7435 7436 // Generate oop map 7437 OopMap* map = new OopMap(framesize, 0); 7438 7439 oop_maps->add_gc_map(the_pc - start, map); 7440 7441 __ reset_last_Java_frame(true); 7442 7443 // Reinitialize the ptrue predicate register, in case the external runtime 7444 // call clobbers ptrue reg, as we may return to SVE compiled code. 
7445 __ reinitialize_ptrue(); 7446 7447 __ leave(); 7448 7449 // check for pending exceptions 7450 #ifdef ASSERT 7451 Label L; 7452 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 7453 __ cbnz(rscratch1, L); 7454 __ should_not_reach_here(); 7455 __ bind(L); 7456 #endif // ASSERT 7457 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 7458 7459 // codeBlob framesize is in words (not VMRegImpl::slot_size) 7460 RuntimeStub* stub = 7461 RuntimeStub::new_runtime_stub(name, 7462 &code, 7463 frame_complete, 7464 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7465 oop_maps, false); 7466 return stub->entry_point(); 7467 } 7468 7469 class MontgomeryMultiplyGenerator : public MacroAssembler { 7470 7471 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7472 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7473 7474 RegSet _toSave; 7475 bool _squaring; 7476 7477 public: 7478 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7479 : MacroAssembler(as->code()), _squaring(squaring) { 7480 7481 // Register allocation 7482 7483 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7484 Pa_base = *regs; // Argument registers 7485 if (squaring) 7486 Pb_base = Pa_base; 7487 else 7488 Pb_base = *++regs; 7489 Pn_base = *++regs; 7490 Rlen= *++regs; 7491 inv = *++regs; 7492 Pm_base = *++regs; 7493 7494 // Working registers: 7495 Ra = *++regs; // The current digit of a, b, n, and m. 7496 Rb = *++regs; 7497 Rm = *++regs; 7498 Rn = *++regs; 7499 7500 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 7501 Pb = *++regs; 7502 Pm = *++regs; 7503 Pn = *++regs; 7504 7505 t0 = *++regs; // Three registers which form a 7506 t1 = *++regs; // triple-precision accumuator. 7507 t2 = *++regs; 7508 7509 Ri = *++regs; // Inner and outer loop indexes. 
7510 Rj = *++regs; 7511 7512 Rhi_ab = *++regs; // Product registers: low and high parts 7513 Rlo_ab = *++regs; // of a*b and m*n. 7514 Rhi_mn = *++regs; 7515 Rlo_mn = *++regs; 7516 7517 // r19 and up are callee-saved. 7518 _toSave = RegSet::range(r19, *regs) + Pm_base; 7519 } 7520 7521 private: 7522 void save_regs() { 7523 push(_toSave, sp); 7524 } 7525 7526 void restore_regs() { 7527 pop(_toSave, sp); 7528 } 7529 7530 template <typename T> 7531 void unroll_2(Register count, T block) { 7532 Label loop, end, odd; 7533 tbnz(count, 0, odd); 7534 cbz(count, end); 7535 align(16); 7536 bind(loop); 7537 (this->*block)(); 7538 bind(odd); 7539 (this->*block)(); 7540 subs(count, count, 2); 7541 br(Assembler::GT, loop); 7542 bind(end); 7543 } 7544 7545 template <typename T> 7546 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 7547 Label loop, end, odd; 7548 tbnz(count, 0, odd); 7549 cbz(count, end); 7550 align(16); 7551 bind(loop); 7552 (this->*block)(d, s, tmp); 7553 bind(odd); 7554 (this->*block)(d, s, tmp); 7555 subs(count, count, 2); 7556 br(Assembler::GT, loop); 7557 bind(end); 7558 } 7559 7560 void pre1(RegisterOrConstant i) { 7561 block_comment("pre1"); 7562 // Pa = Pa_base; 7563 // Pb = Pb_base + i; 7564 // Pm = Pm_base; 7565 // Pn = Pn_base + i; 7566 // Ra = *Pa; 7567 // Rb = *Pb; 7568 // Rm = *Pm; 7569 // Rn = *Pn; 7570 ldr(Ra, Address(Pa_base)); 7571 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7572 ldr(Rm, Address(Pm_base)); 7573 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7574 lea(Pa, Address(Pa_base)); 7575 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7576 lea(Pm, Address(Pm_base)); 7577 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7578 7579 // Zero the m*n result. 7580 mov(Rhi_mn, zr); 7581 mov(Rlo_mn, zr); 7582 } 7583 7584 // The core multiply-accumulate step of a Montgomery 7585 // multiplication. 
The idea is to schedule operations as a 7586 // pipeline so that instructions with long latencies (loads and 7587 // multiplies) have time to complete before their results are 7588 // used. This most benefits in-order implementations of the 7589 // architecture but out-of-order ones also benefit. 7590 void step() { 7591 block_comment("step"); 7592 // MACC(Ra, Rb, t0, t1, t2); 7593 // Ra = *++Pa; 7594 // Rb = *--Pb; 7595 umulh(Rhi_ab, Ra, Rb); 7596 mul(Rlo_ab, Ra, Rb); 7597 ldr(Ra, pre(Pa, wordSize)); 7598 ldr(Rb, pre(Pb, -wordSize)); 7599 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 7600 // previous iteration. 7601 // MACC(Rm, Rn, t0, t1, t2); 7602 // Rm = *++Pm; 7603 // Rn = *--Pn; 7604 umulh(Rhi_mn, Rm, Rn); 7605 mul(Rlo_mn, Rm, Rn); 7606 ldr(Rm, pre(Pm, wordSize)); 7607 ldr(Rn, pre(Pn, -wordSize)); 7608 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7609 } 7610 7611 void post1() { 7612 block_comment("post1"); 7613 7614 // MACC(Ra, Rb, t0, t1, t2); 7615 // Ra = *++Pa; 7616 // Rb = *--Pb; 7617 umulh(Rhi_ab, Ra, Rb); 7618 mul(Rlo_ab, Ra, Rb); 7619 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7620 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7621 7622 // *Pm = Rm = t0 * inv; 7623 mul(Rm, t0, inv); 7624 str(Rm, Address(Pm)); 7625 7626 // MACC(Rm, Rn, t0, t1, t2); 7627 // t0 = t1; t1 = t2; t2 = 0; 7628 umulh(Rhi_mn, Rm, Rn); 7629 7630 #ifndef PRODUCT 7631 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7632 { 7633 mul(Rlo_mn, Rm, Rn); 7634 add(Rlo_mn, t0, Rlo_mn); 7635 Label ok; 7636 cbz(Rlo_mn, ok); { 7637 stop("broken Montgomery multiply"); 7638 } bind(ok); 7639 } 7640 #endif 7641 // We have very carefully set things up so that 7642 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7643 // the lower half of Rm * Rn because we know the result already: 7644 // it must be -t0. t0 + (-t0) must generate a carry iff 7645 // t0 != 0. So, rather than do a mul and an adds we just set 7646 // the carry flag iff t0 is nonzero. 
7647 // 7648 // mul(Rlo_mn, Rm, Rn); 7649 // adds(zr, t0, Rlo_mn); 7650 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7651 adcs(t0, t1, Rhi_mn); 7652 adc(t1, t2, zr); 7653 mov(t2, zr); 7654 } 7655 7656 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 7657 block_comment("pre2"); 7658 // Pa = Pa_base + i-len; 7659 // Pb = Pb_base + len; 7660 // Pm = Pm_base + i-len; 7661 // Pn = Pn_base + len; 7662 7663 if (i.is_register()) { 7664 sub(Rj, i.as_register(), len); 7665 } else { 7666 mov(Rj, i.as_constant()); 7667 sub(Rj, Rj, len); 7668 } 7669 // Rj == i-len 7670 7671 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 7672 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 7673 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7674 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 7675 7676 // Ra = *++Pa; 7677 // Rb = *--Pb; 7678 // Rm = *++Pm; 7679 // Rn = *--Pn; 7680 ldr(Ra, pre(Pa, wordSize)); 7681 ldr(Rb, pre(Pb, -wordSize)); 7682 ldr(Rm, pre(Pm, wordSize)); 7683 ldr(Rn, pre(Pn, -wordSize)); 7684 7685 mov(Rhi_mn, zr); 7686 mov(Rlo_mn, zr); 7687 } 7688 7689 void post2(RegisterOrConstant i, RegisterOrConstant len) { 7690 block_comment("post2"); 7691 if (i.is_constant()) { 7692 mov(Rj, i.as_constant()-len.as_constant()); 7693 } else { 7694 sub(Rj, i.as_register(), len); 7695 } 7696 7697 adds(t0, t0, Rlo_mn); // The pending m*n, low part 7698 7699 // As soon as we know the least significant digit of our result, 7700 // store it. 7701 // Pm_base[i-len] = t0; 7702 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7703 7704 // t0 = t1; t1 = t2; t2 = 0; 7705 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 7706 adc(t1, t2, zr); 7707 mov(t2, zr); 7708 } 7709 7710 // A carry in t0 after Montgomery multiplication means that we 7711 // should subtract multiples of n from our result in m. We'll 7712 // keep doing that until there is no carry. 
7713 void normalize(RegisterOrConstant len) { 7714 block_comment("normalize"); 7715 // while (t0) 7716 // t0 = sub(Pm_base, Pn_base, t0, len); 7717 Label loop, post, again; 7718 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 7719 cbz(t0, post); { 7720 bind(again); { 7721 mov(i, zr); 7722 mov(cnt, len); 7723 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7724 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7725 subs(zr, zr, zr); // set carry flag, i.e. no borrow 7726 align(16); 7727 bind(loop); { 7728 sbcs(Rm, Rm, Rn); 7729 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7730 add(i, i, 1); 7731 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7732 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7733 sub(cnt, cnt, 1); 7734 } cbnz(cnt, loop); 7735 sbc(t0, t0, zr); 7736 } cbnz(t0, again); 7737 } bind(post); 7738 } 7739 7740 // Move memory at s to d, reversing words. 7741 // Increments d to end of copied memory 7742 // Destroys tmp1, tmp2 7743 // Preserves len 7744 // Leaves s pointing to the address which was in d at start 7745 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 7746 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 7747 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 7748 7749 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 7750 mov(tmp1, len); 7751 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 7752 sub(s, d, len, ext::uxtw, LogBytesPerWord); 7753 } 7754 // where 7755 void reverse1(Register d, Register s, Register tmp) { 7756 ldr(tmp, pre(s, -wordSize)); 7757 ror(tmp, tmp, 32); 7758 str(tmp, post(d, wordSize)); 7759 } 7760 7761 void step_squaring() { 7762 // An extra ACC 7763 step(); 7764 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7765 } 7766 7767 void last_squaring(RegisterOrConstant i) { 7768 Label dont; 7769 // if ((i & 1) == 0) { 7770 tbnz(i.as_register(), 0, dont); 
{ 7771 // MACC(Ra, Rb, t0, t1, t2); 7772 // Ra = *++Pa; 7773 // Rb = *--Pb; 7774 umulh(Rhi_ab, Ra, Rb); 7775 mul(Rlo_ab, Ra, Rb); 7776 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7777 } bind(dont); 7778 } 7779 7780 void extra_step_squaring() { 7781 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7782 7783 // MACC(Rm, Rn, t0, t1, t2); 7784 // Rm = *++Pm; 7785 // Rn = *--Pn; 7786 umulh(Rhi_mn, Rm, Rn); 7787 mul(Rlo_mn, Rm, Rn); 7788 ldr(Rm, pre(Pm, wordSize)); 7789 ldr(Rn, pre(Pn, -wordSize)); 7790 } 7791 7792 void post1_squaring() { 7793 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7794 7795 // *Pm = Rm = t0 * inv; 7796 mul(Rm, t0, inv); 7797 str(Rm, Address(Pm)); 7798 7799 // MACC(Rm, Rn, t0, t1, t2); 7800 // t0 = t1; t1 = t2; t2 = 0; 7801 umulh(Rhi_mn, Rm, Rn); 7802 7803 #ifndef PRODUCT 7804 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7805 { 7806 mul(Rlo_mn, Rm, Rn); 7807 add(Rlo_mn, t0, Rlo_mn); 7808 Label ok; 7809 cbz(Rlo_mn, ok); { 7810 stop("broken Montgomery multiply"); 7811 } bind(ok); 7812 } 7813 #endif 7814 // We have very carefully set things up so that 7815 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7816 // the lower half of Rm * Rn because we know the result already: 7817 // it must be -t0. t0 + (-t0) must generate a carry iff 7818 // t0 != 0. So, rather than do a mul and an adds we just set 7819 // the carry flag iff t0 is nonzero. 7820 // 7821 // mul(Rlo_mn, Rm, Rn); 7822 // adds(zr, t0, Rlo_mn); 7823 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7824 adcs(t0, t1, Rhi_mn); 7825 adc(t1, t2, zr); 7826 mov(t2, zr); 7827 } 7828 7829 void acc(Register Rhi, Register Rlo, 7830 Register t0, Register t1, Register t2) { 7831 adds(t0, t0, Rlo); 7832 adcs(t1, t1, Rhi); 7833 adc(t2, t2, zr); 7834 } 7835 7836 public: 7837 /** 7838 * Fast Montgomery multiplication. The derivation of the 7839 * algorithm is in A Cryptographic Library for the Motorola 7840 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
7841 * 7842 * Arguments: 7843 * 7844 * Inputs for multiplication: 7845 * c_rarg0 - int array elements a 7846 * c_rarg1 - int array elements b 7847 * c_rarg2 - int array elements n (the modulus) 7848 * c_rarg3 - int length 7849 * c_rarg4 - int inv 7850 * c_rarg5 - int array elements m (the result) 7851 * 7852 * Inputs for squaring: 7853 * c_rarg0 - int array elements a 7854 * c_rarg1 - int array elements n (the modulus) 7855 * c_rarg2 - int length 7856 * c_rarg3 - int inv 7857 * c_rarg4 - int array elements m (the result) 7858 * 7859 */ 7860 address generate_multiply() { 7861 Label argh, nothing; 7862 bind(argh); 7863 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7864 7865 align(CodeEntryAlignment); 7866 address entry = pc(); 7867 7868 cbzw(Rlen, nothing); 7869 7870 enter(); 7871 7872 // Make room. 7873 cmpw(Rlen, 512); 7874 br(Assembler::HI, argh); 7875 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7876 andr(sp, Ra, -2 * wordSize); 7877 7878 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7879 7880 { 7881 // Copy input args, reversing as we go. We use Ra as a 7882 // temporary variable. 7883 reverse(Ra, Pa_base, Rlen, t0, t1); 7884 if (!_squaring) 7885 reverse(Ra, Pb_base, Rlen, t0, t1); 7886 reverse(Ra, Pn_base, Rlen, t0, t1); 7887 } 7888 7889 // Push all call-saved registers and also Pm_base which we'll need 7890 // at the end. 
7891 save_regs(); 7892 7893 #ifndef PRODUCT 7894 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 7895 { 7896 ldr(Rn, Address(Pn_base, 0)); 7897 mul(Rlo_mn, Rn, inv); 7898 subs(zr, Rlo_mn, -1); 7899 Label ok; 7900 br(EQ, ok); { 7901 stop("broken inverse in Montgomery multiply"); 7902 } bind(ok); 7903 } 7904 #endif 7905 7906 mov(Pm_base, Ra); 7907 7908 mov(t0, zr); 7909 mov(t1, zr); 7910 mov(t2, zr); 7911 7912 block_comment("for (int i = 0; i < len; i++) {"); 7913 mov(Ri, zr); { 7914 Label loop, end; 7915 cmpw(Ri, Rlen); 7916 br(Assembler::GE, end); 7917 7918 bind(loop); 7919 pre1(Ri); 7920 7921 block_comment(" for (j = i; j; j--) {"); { 7922 movw(Rj, Ri); 7923 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7924 } block_comment(" } // j"); 7925 7926 post1(); 7927 addw(Ri, Ri, 1); 7928 cmpw(Ri, Rlen); 7929 br(Assembler::LT, loop); 7930 bind(end); 7931 block_comment("} // i"); 7932 } 7933 7934 block_comment("for (int i = len; i < 2*len; i++) {"); 7935 mov(Ri, Rlen); { 7936 Label loop, end; 7937 cmpw(Ri, Rlen, Assembler::LSL, 1); 7938 br(Assembler::GE, end); 7939 7940 bind(loop); 7941 pre2(Ri, Rlen); 7942 7943 block_comment(" for (j = len*2-i-1; j; j--) {"); { 7944 lslw(Rj, Rlen, 1); 7945 subw(Rj, Rj, Ri); 7946 subw(Rj, Rj, 1); 7947 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7948 } block_comment(" } // j"); 7949 7950 post2(Ri, Rlen); 7951 addw(Ri, Ri, 1); 7952 cmpw(Ri, Rlen, Assembler::LSL, 1); 7953 br(Assembler::LT, loop); 7954 bind(end); 7955 } 7956 block_comment("} // i"); 7957 7958 normalize(Rlen); 7959 7960 mov(Ra, Pm_base); // Save Pm_base in Ra 7961 restore_regs(); // Restore caller's Pm_base 7962 7963 // Copy our result into caller's Pm_base 7964 reverse(Pm_base, Ra, Rlen, t0, t1); 7965 7966 leave(); 7967 bind(nothing); 7968 ret(lr); 7969 7970 return entry; 7971 } 7972 // In C, approximately: 7973 7974 // void 7975 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 7976 // julong Pn_base[], julong Pm_base[], 7977 // 
julong inv, int len) { 7978 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7979 // julong *Pa, *Pb, *Pn, *Pm; 7980 // julong Ra, Rb, Rn, Rm; 7981 7982 // int i; 7983 7984 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7985 7986 // for (i = 0; i < len; i++) { 7987 // int j; 7988 7989 // Pa = Pa_base; 7990 // Pb = Pb_base + i; 7991 // Pm = Pm_base; 7992 // Pn = Pn_base + i; 7993 7994 // Ra = *Pa; 7995 // Rb = *Pb; 7996 // Rm = *Pm; 7997 // Rn = *Pn; 7998 7999 // int iters = i; 8000 // for (j = 0; iters--; j++) { 8001 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 8002 // MACC(Ra, Rb, t0, t1, t2); 8003 // Ra = *++Pa; 8004 // Rb = *--Pb; 8005 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8006 // MACC(Rm, Rn, t0, t1, t2); 8007 // Rm = *++Pm; 8008 // Rn = *--Pn; 8009 // } 8010 8011 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 8012 // MACC(Ra, Rb, t0, t1, t2); 8013 // *Pm = Rm = t0 * inv; 8014 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8015 // MACC(Rm, Rn, t0, t1, t2); 8016 8017 // assert(t0 == 0, "broken Montgomery multiply"); 8018 8019 // t0 = t1; t1 = t2; t2 = 0; 8020 // } 8021 8022 // for (i = len; i < 2*len; i++) { 8023 // int j; 8024 8025 // Pa = Pa_base + i-len; 8026 // Pb = Pb_base + len; 8027 // Pm = Pm_base + i-len; 8028 // Pn = Pn_base + len; 8029 8030 // Ra = *++Pa; 8031 // Rb = *--Pb; 8032 // Rm = *++Pm; 8033 // Rn = *--Pn; 8034 8035 // int iters = len*2-i-1; 8036 // for (j = i-len+1; iters--; j++) { 8037 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 8038 // MACC(Ra, Rb, t0, t1, t2); 8039 // Ra = *++Pa; 8040 // Rb = *--Pb; 8041 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8042 // MACC(Rm, Rn, t0, t1, t2); 8043 // Rm = *++Pm; 8044 // Rn = *--Pn; 8045 // } 8046 8047 // Pm_base[i-len] = t0; 8048 // t0 = t1; t1 = t2; t2 = 0; 8049 // } 8050 8051 // while (t0) 8052 // t0 = sub(Pm_base, Pn_base, t0, len); 8053 // } 8054 
8055 /** 8056 * Fast Montgomery squaring. This uses asymptotically 25% fewer 8057 * multiplies than Montgomery multiplication so it should be up to 8058 * 25% faster. However, its loop control is more complex and it 8059 * may actually run slower on some machines. 8060 * 8061 * Arguments: 8062 * 8063 * Inputs: 8064 * c_rarg0 - int array elements a 8065 * c_rarg1 - int array elements n (the modulus) 8066 * c_rarg2 - int length 8067 * c_rarg3 - int inv 8068 * c_rarg4 - int array elements m (the result) 8069 * 8070 */ 8071 address generate_square() { 8072 Label argh; 8073 bind(argh); 8074 stop("MontgomeryMultiply total_allocation must be <= 8192"); 8075 8076 align(CodeEntryAlignment); 8077 address entry = pc(); 8078 8079 enter(); 8080 8081 // Make room. 8082 cmpw(Rlen, 512); 8083 br(Assembler::HI, argh); 8084 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 8085 andr(sp, Ra, -2 * wordSize); 8086 8087 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 8088 8089 { 8090 // Copy input args, reversing as we go. We use Ra as a 8091 // temporary variable. 8092 reverse(Ra, Pa_base, Rlen, t0, t1); 8093 reverse(Ra, Pn_base, Rlen, t0, t1); 8094 } 8095 8096 // Push all call-saved registers and also Pm_base which we'll need 8097 // at the end. 
8098 save_regs(); 8099 8100 mov(Pm_base, Ra); 8101 8102 mov(t0, zr); 8103 mov(t1, zr); 8104 mov(t2, zr); 8105 8106 block_comment("for (int i = 0; i < len; i++) {"); 8107 mov(Ri, zr); { 8108 Label loop, end; 8109 bind(loop); 8110 cmp(Ri, Rlen); 8111 br(Assembler::GE, end); 8112 8113 pre1(Ri); 8114 8115 block_comment("for (j = (i+1)/2; j; j--) {"); { 8116 add(Rj, Ri, 1); 8117 lsr(Rj, Rj, 1); 8118 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8119 } block_comment(" } // j"); 8120 8121 last_squaring(Ri); 8122 8123 block_comment(" for (j = i/2; j; j--) {"); { 8124 lsr(Rj, Ri, 1); 8125 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8126 } block_comment(" } // j"); 8127 8128 post1_squaring(); 8129 add(Ri, Ri, 1); 8130 cmp(Ri, Rlen); 8131 br(Assembler::LT, loop); 8132 8133 bind(end); 8134 block_comment("} // i"); 8135 } 8136 8137 block_comment("for (int i = len; i < 2*len; i++) {"); 8138 mov(Ri, Rlen); { 8139 Label loop, end; 8140 bind(loop); 8141 cmp(Ri, Rlen, Assembler::LSL, 1); 8142 br(Assembler::GE, end); 8143 8144 pre2(Ri, Rlen); 8145 8146 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 8147 lsl(Rj, Rlen, 1); 8148 sub(Rj, Rj, Ri); 8149 sub(Rj, Rj, 1); 8150 lsr(Rj, Rj, 1); 8151 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8152 } block_comment(" } // j"); 8153 8154 last_squaring(Ri); 8155 8156 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 8157 lsl(Rj, Rlen, 1); 8158 sub(Rj, Rj, Ri); 8159 lsr(Rj, Rj, 1); 8160 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8161 } block_comment(" } // j"); 8162 8163 post2(Ri, Rlen); 8164 add(Ri, Ri, 1); 8165 cmp(Ri, Rlen, Assembler::LSL, 1); 8166 8167 br(Assembler::LT, loop); 8168 bind(end); 8169 block_comment("} // i"); 8170 } 8171 8172 normalize(Rlen); 8173 8174 mov(Ra, Pm_base); // Save Pm_base in Ra 8175 restore_regs(); // Restore caller's Pm_base 8176 8177 // Copy our result into caller's Pm_base 8178 reverse(Pm_base, Ra, Rlen, t0, t1); 8179 8180 leave(); 
8181 ret(lr); 8182 8183 return entry; 8184 } 8185 // In C, approximately: 8186 8187 // void 8188 // montgomery_square(julong Pa_base[], julong Pn_base[], 8189 // julong Pm_base[], julong inv, int len) { 8190 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 8191 // julong *Pa, *Pb, *Pn, *Pm; 8192 // julong Ra, Rb, Rn, Rm; 8193 8194 // int i; 8195 8196 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 8197 8198 // for (i = 0; i < len; i++) { 8199 // int j; 8200 8201 // Pa = Pa_base; 8202 // Pb = Pa_base + i; 8203 // Pm = Pm_base; 8204 // Pn = Pn_base + i; 8205 8206 // Ra = *Pa; 8207 // Rb = *Pb; 8208 // Rm = *Pm; 8209 // Rn = *Pn; 8210 8211 // int iters = (i+1)/2; 8212 // for (j = 0; iters--; j++) { 8213 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8214 // MACC2(Ra, Rb, t0, t1, t2); 8215 // Ra = *++Pa; 8216 // Rb = *--Pb; 8217 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8218 // MACC(Rm, Rn, t0, t1, t2); 8219 // Rm = *++Pm; 8220 // Rn = *--Pn; 8221 // } 8222 // if ((i & 1) == 0) { 8223 // assert(Ra == Pa_base[j], "must be"); 8224 // MACC(Ra, Ra, t0, t1, t2); 8225 // } 8226 // iters = i/2; 8227 // assert(iters == i-j, "must be"); 8228 // for (; iters--; j++) { 8229 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8230 // MACC(Rm, Rn, t0, t1, t2); 8231 // Rm = *++Pm; 8232 // Rn = *--Pn; 8233 // } 8234 8235 // *Pm = Rm = t0 * inv; 8236 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8237 // MACC(Rm, Rn, t0, t1, t2); 8238 8239 // assert(t0 == 0, "broken Montgomery multiply"); 8240 8241 // t0 = t1; t1 = t2; t2 = 0; 8242 // } 8243 8244 // for (i = len; i < 2*len; i++) { 8245 // int start = i-len+1; 8246 // int end = start + (len - start)/2; 8247 // int j; 8248 8249 // Pa = Pa_base + i-len; 8250 // Pb = Pa_base + len; 8251 // Pm = Pm_base + i-len; 8252 // Pn = Pn_base + len; 8253 8254 // Ra = *++Pa; 8255 // Rb = *--Pb; 8256 // Rm = *++Pm; 8257 // Rn = *--Pn; 8258 8259 // 
//       int iters = (2*len-i-1)/2;
//       assert(iters == end-start, "must be");
//       for (j = start; iters--; j++) {
//         assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
//         MACC2(Ra, Rb, t0, t1, t2);
//         Ra = *++Pa;
//         Rb = *--Pb;
//         assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
//         MACC(Rm, Rn, t0, t1, t2);
//         Rm = *++Pm;
//         Rn = *--Pn;
//       }
//       if ((i & 1) == 0) {
//         assert(Ra == Pa_base[j], "must be");
//         MACC(Ra, Ra, t0, t1, t2);
//       }
//       iters = (2*len-i)/2;
//       assert(iters == len-j, "must be");
//       for (; iters--; j++) {
//         assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
//         MACC(Rm, Rn, t0, t1, t2);
//         Rm = *++Pm;
//         Rn = *--Pn;
//       }
//       Pm_base[i-len] = t0;
//       t0 = t1; t1 = t2; t2 = 0;
//     }

//     while (t0)
//       t0 = sub(Pm_base, Pn_base, t0, len);
//   }
  }; // end of nested class (Montgomery multiply generator)


  // Initialization

  // Initial_stubs phase (see the constructor's switch): the entry points
  // every platform provides, generated first.  NOTE: the ORDER of
  // generation below is deliberate — see the inline comments.
  void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points

    // entry points that exist in all platforms Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    // generate_call_stub also records the return address into
    // _call_stub_return_address (passed by reference).
    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));

    // Initialize table for copy memory (arraycopy) check.
    if (UnsafeCopyMemory::_table == nullptr) {
      UnsafeCopyMemory::create_table(8);
    }

    if (UseCRC32Intrinsics) {
      // set table address before stub generation which use it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // Disabled until JDK-8210858 is fixed
    // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
    //   StubRoutines::_dlog = generate_dlog();
    // }

    // dsin/dcos share one generator, parameterized on which function to emit.
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }
  }

  // Continuation_stubs phase: thaw and return-barrier entry points for
  // continuations, plus the JFR stubs when JFR is built in.
  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();

    JFR_ONLY(generate_jfr_stubs();)
  }

#if INCLUDE_JFR
  // Generates the JFR checkpoint/lease stubs and caches each stub's
  // entry point in the corresponding StubRoutines address field.
  void generate_jfr_stubs() {
    StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();
    StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();
    StubRoutines::_jfr_return_lease_stub = generate_jfr_return_lease();
    StubRoutines::_jfr_return_lease = StubRoutines::_jfr_return_lease_stub->entry_point();
  }
#endif // INCLUDE_JFR

  // Final_stubs phase: stubs that must wait until later in VM startup
  // (e.g. verify_oop needs universe_init) — remaining throw entries,
  // arraycopy stubs, nmethod entry barrier, and assorted helpers.
  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // Only emit the method entry barrier if the current GC provides
    // an nmethod barrier set.
    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != nullptr) {
      StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
    }

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    // Linux without compile-time LSE atomics: generate runtime-selected
    // atomic entry points (see the DEFAULT_ATOMIC_OP definitions below).
#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
  }

  void
generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI
    // Compiler_stubs phase: intrinsic and helper stubs used by
    // C2/JVMCI-compiled code; each group is guarded by its Use* flag.

    // The iota-indices constant is only needed when SVE is not in use.
    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // The wide variant is used in preference to the narrow one:
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    // GCM needs both the AES and GHASH stubs.
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    // Each digest generator emits a single-block (false) and a
    // multi-block (true) compress stub.
    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress");
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }
#endif // COMPILER2_OR_JVMCI
  }

 public:
  // Constructs the generator and immediately emits the stubs for the
  // requested phase into 'code'; unknown kinds are a fatal error.
  StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
    switch(kind) {
    case Initial_stubs:
      generate_initial_stubs();
      break;
    case Continuation_stubs:
      generate_continuation_stubs();
      break;
    case Compiler_stubs:
      generate_compiler_stubs();
      break;
    case Final_stubs:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected stubs kind: %d", kind);
      break;
    };
  }
}; // end class declaration

// Free-function entry point: constructing the StubGenerator emits the
// requested phase's stubs into 'code' as a side effect of the constructor.
void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
  StubGenerator g(code, kind);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

// For each atomic operation variant, declare the assembly default
// implementation (aarch64_atomic_<op>_<size><order>_default_impl, defined
// in atomic_aarch64.S) and define the function pointer the runtime calls
// through, initialized to that default.
// NOTE(review): generate_atomic_entry_points() — invoked from
// generate_final_stubs() on Linux builds without __ARM_FEATURE_ATOMICS —
// presumably re-points these at generated stubs; confirm in that generator.
#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                         \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

// Operand sizes are in bytes; the third argument selects the memory-order
// suffix (empty means the default, fully-ordered variant).
DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX