/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/intpow.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -28,

    fpcr_off           = sp_after_call_off,
    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubGenStubId stub_id = StubGenStubId::call_stub_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    const Address sp_after_call (rfp, sp_after_call_off * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    // restore fpcr
    __ ldr(rscratch1,  fpcr_save);
    __ set_fpcr(rscratch1);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubGenStubId stub_id = StubGenStubId::catch_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubGenStubId stub_id = StubGenStubId::forward_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {
    StubGenStubId stub_id = StubGenStubId::verify_oop_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  // Generate indices for iota vector.
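  // The table below emits ascending lane indices for each element size
  // (bytes 0..15, halfwords 0..7, words 0..3, doublewords 0..1), followed
  // by the single-precision values 0.0f..3.0f and the double-precision
  // values 0.0d and 1.0d.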
  address generate_iota_indices(StubGenStubId stub_id) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  // for arraycopy stubs.
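  // The constructor captures the BarrierSetAssembler, decorators, element
  // type and GC temporary registers once, so the copy loops below can issue
  // copy_load_at_*/copy_store_at_* calls that only mention the data
  // registers and addresses involved.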
  class ArrayCopyBarrierSetHelper : StackObj {
    BarrierSetAssembler* _bs_asm;
    MacroAssembler* _masm;
    DecoratorSet _decorators;
    BasicType _type;
    Register _gct1;
    Register _gct2;
    Register _gct3;
    FloatRegister _gcvt1;
    FloatRegister _gcvt2;
    FloatRegister _gcvt3;

  public:
    ArrayCopyBarrierSetHelper(MacroAssembler* masm,
                              DecoratorSet decorators,
                              BasicType type,
                              Register gct1,
                              Register gct2,
                              Register gct3,
                              FloatRegister gcvt1,
                              FloatRegister gcvt2,
                              FloatRegister gcvt3)
      : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
        _masm(masm),
        _decorators(decorators),
        _type(type),
        _gct1(gct1),
        _gct2(gct2),
        _gct3(gct3),
        _gcvt1(gcvt1),
        _gcvt2(gcvt2),
        _gcvt3(gcvt3) {
    }

    void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
                            dst1, dst2, src,
                            _gct1, _gct2, _gcvt1);
    }

    void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
    }

    void copy_load_at_16(Register dst1, Register dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
                            dst1, dst2, src,
                            _gct1);
    }

    void copy_store_at_16(Address dst, Register src1, Register src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3);
    }

    void copy_load_at_8(Register dst, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
                            dst, noreg, src,
                            _gct1);
    }

    void copy_store_at_8(Address dst, Register src) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
                             dst, src, noreg,
                             _gct1, _gct2, _gct3);
    }
  };

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
    BasicType type;
    copy_direction direction;

    switch (stub_id) {
    case copy_byte_f_id:
      direction = copy_forwards;
      type = T_BYTE;
      break;
    case copy_byte_b_id:
      direction = copy_backwards;
      type = T_BYTE;
      break;
    case copy_oop_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case copy_oop_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    case copy_oop_uninit_f_id:
      direction = copy_forwards;
      type = T_OBJECT;
      break;
    case copy_oop_uninit_b_id:
      direction = copy_backwards;
      type = T_OBJECT;
      break;
    default:
      ShouldNotReachHere();
    }

    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
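    //
    // Each tbz below tests one bit of count, from the word-sized chunk
    // down to a single byte (bit positions scaled by the copy
    // granularity); a set bit means that chunk still needs copying, so
    // it is moved and control falls through to the next smaller chunk.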

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
    bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;
  Label copy_obj_f, copy_obj_b;
  Label copy_obj_uninit_f, copy_obj_uninit_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
    const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
    const Register send = r17, dend = r16;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80) / granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue has more chances to happen when granularity of data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The most performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes.  Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // can not be used as a temp register, as it contains the count.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
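      // (is_aligned guarantees s is already word-aligned, so only the
      // wordSize bit of s decides whether a one-word adjustment is needed.)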
      __ tbz(s, exact_log2(wordSize), aligned);
      bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
      bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(r15, s, 2 * wordSize - 1);
      } else {
        __ neg(r15, s);
        __ andr(r15, r15, 2 * wordSize - 1);
      }
      // r15 is the byte adjustment needed to align s.
      __ cbz(r15, aligned);
      int shift = exact_log2(granularity);
      if (shift > 0) {
        __ lsr(r15, r15, shift);
      }
      __ sub(count, count, r15);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, r15);
        __ sub(d, d, r15);
      } else {
        __ add(s, s, r15);
        __ add(d, d, r15);
      }
#else
      copy_memory_small(decorators, type, s, d, r15, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words. If the shift is zero
    // perform a move instead to benefit from zero latency moves.
    int shift = exact_log2(wordSize/granularity);
    if (shift > 0) {
      __ lsr(r15, count, shift);
    } else {
      __ mov(r15, count);
    }
    if (direction == copy_forwards) {
      if (type != T_OBJECT) {
        __ bl(copy_f);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_f);
      } else {
        __ bl(copy_obj_f);
      }
    } else {
      if (type != T_OBJECT) {
        __ bl(copy_b);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_b);
      } else {
        __ bl(copy_obj_b);
      }
    }

    // And the tail.
    copy_memory_small(decorators, type, s, d, count, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    RegSet clobbered
      = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
      __ mov(*it, rscratch1);
    }
#endif

  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
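  // size is the element size in bytes: wordSize for uncompressed oops,
  // 4 for narrow oops, which are decoded (and thereby verified) before use.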
  void verify_oop_array (int size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, 1);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   stub_id - is used to name the stub and identify all details of
  //             how to perform the copy.
  //
  //   entry - is assigned to the stub's post push entry point unless
  //           it is null
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects: entry is set to the (post push) entry point so it
  //               can be used by the corresponding conjoint copy
  //               method
  //
  address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    int size;
    bool aligned;
    bool is_oop;
    bool dest_uninitialized;
    switch (stub_id) {
    case jbyte_disjoint_arraycopy_id:
      size = sizeof(jbyte);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jbyte_disjoint_arraycopy_id:
      size = sizeof(jbyte);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jshort_disjoint_arraycopy_id:
      size = sizeof(jshort);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jshort_disjoint_arraycopy_id:
      size = sizeof(jshort);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jint_disjoint_arraycopy_id:
      size = sizeof(jint);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jint_disjoint_arraycopy_id:
      size = sizeof(jint);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jlong_disjoint_arraycopy_id:
      // since this is always aligned we can (should!) use the same
      // stub as for case arrayof_jlong_disjoint_arraycopy
      ShouldNotReachHere();
      break;
    case arrayof_jlong_disjoint_arraycopy_id:
      size = sizeof(jlong);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case oop_disjoint_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case arrayof_oop_disjoint_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    case arrayof_oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
      break;
    }

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   stub_id - is used to name the stub and identify all details of
  //             how to perform the copy.
  //
  //   nooverlap_target - identifies the (post push) entry for the
  //                      corresponding disjoint copy routine which can be
  //                      jumped to if the ranges do not actually overlap
  //
  //   entry - is assigned to the stub's post push entry point unless
  //           it is null
  //
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   entry is set to the no-overlap entry point so it can be used by
  //   some other conjoint copy method
  //
  address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    int size;
    bool aligned;
    bool is_oop;
    bool dest_uninitialized;
    switch (stub_id) {
    case jbyte_arraycopy_id:
      size = sizeof(jbyte);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jbyte_arraycopy_id:
      size = sizeof(jbyte);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jshort_arraycopy_id:
      size = sizeof(jshort);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jshort_arraycopy_id:
      size = sizeof(jshort);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jint_arraycopy_id:
      size = sizeof(jint);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case arrayof_jint_arraycopy_id:
      size = sizeof(jint);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case jlong_arraycopy_id:
      // since this is always aligned we can (should!) use the same
      // stub as for case arrayof_jlong_disjoint_arraycopy
      ShouldNotReachHere();
      break;
    case arrayof_jlong_arraycopy_id:
      size = sizeof(jlong);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case oop_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case arrayof_oop_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case oop_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    case arrayof_oop_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
sizeof (jint) : sizeof (jlong); 1760 aligned = !UseCompressedOops; 1761 is_oop = true; 1762 dest_uninitialized = true; 1763 break; 1764 default: 1765 ShouldNotReachHere(); 1766 } 1767 1768 StubCodeMark mark(this, stub_id); 1769 address start = __ pc(); 1770 __ enter(); 1771 1772 if (entry != nullptr) { 1773 *entry = __ pc(); 1774 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1775 BLOCK_COMMENT("Entry:"); 1776 } 1777 1778 // use fwd copy when (d-s) above_equal (count*size) 1779 __ sub(rscratch1, d, s); 1780 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1781 __ br(Assembler::HS, nooverlap_target); 1782 1783 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1784 if (dest_uninitialized) { 1785 decorators |= IS_DEST_UNINITIALIZED; 1786 } 1787 if (aligned) { 1788 decorators |= ARRAYCOPY_ALIGNED; 1789 } 1790 1791 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1792 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1793 1794 if (is_oop) { 1795 // save regs before copy_memory 1796 __ push(RegSet::of(d, count), sp); 1797 } 1798 { 1799 // UnsafeMemoryAccess page error: continue after unsafe access 1800 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1801 UnsafeMemoryAccessMark umam(this, add_entry, true); 1802 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1803 } 1804 if (is_oop) { 1805 __ pop(RegSet::of(d, count), sp); 1806 if (VerifyOops) 1807 verify_oop_array(size, d, count, r16); 1808 } 1809 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1810 __ leave(); 1811 __ mov(r0, zr); // return 0 1812 __ ret(lr); 1813 return start; 1814 } 1815 1816 // Helper for generating a dynamic type check. 1817 // Smashes rscratch1, rscratch2. 1818 void generate_type_check(Register sub_klass, 1819 Register super_check_offset, 1820 Register super_klass, 1821 Register temp1, 1822 Register temp2, 1823 Register result, 1824 Label& L_success) { 1825 assert_different_registers(sub_klass, super_check_offset, super_klass); 1826 1827 BLOCK_COMMENT("type_check:"); 1828 1829 Label L_miss; 1830 1831 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1832 super_check_offset); 1833 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1834 1835 // Fall through on failure! 
1836 __ BIND(L_miss); 1837 } 1838 1839 // 1840 // Generate checkcasting array copy stub 1841 // 1842 // Input: 1843 // c_rarg0 - source array address 1844 // c_rarg1 - destination array address 1845 // c_rarg2 - element count, treated as ssize_t, can be zero 1846 // c_rarg3 - size_t ckoff (super_check_offset) 1847 // c_rarg4 - oop ckval (super_klass) 1848 // 1849 // Output: 1850 // r0 == 0 - success 1851 // r0 == -1^K - failure, where K is partial transfer count 1852 // 1853 address generate_checkcast_copy(StubGenStubId stub_id, address *entry) { 1854 bool dest_uninitialized; 1855 switch (stub_id) { 1856 case checkcast_arraycopy_id: 1857 dest_uninitialized = false; 1858 break; 1859 case checkcast_arraycopy_uninit_id: 1860 dest_uninitialized = true; 1861 break; 1862 default: 1863 ShouldNotReachHere(); 1864 } 1865 1866 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1867 1868 // Input registers (after setup_arg_regs) 1869 const Register from = c_rarg0; // source array address 1870 const Register to = c_rarg1; // destination array address 1871 const Register count = c_rarg2; // elementscount 1872 const Register ckoff = c_rarg3; // super_check_offset 1873 const Register ckval = c_rarg4; // super_klass 1874 1875 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1876 RegSet wb_post_saved_regs = RegSet::of(count); 1877 1878 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1879 const Register copied_oop = r22; // actual oop copied 1880 const Register count_save = r21; // orig elementscount 1881 const Register start_to = r20; // destination array start address 1882 const Register r19_klass = r19; // oop._klass 1883 1884 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1885 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1886 1887 //--------------------------------------------------------------- 1888 // Assembler stub will be used for this call to arraycopy 1889 // if the two arrays are subtypes of Object[] but the 1890 // destination array type is not equal to or a supertype 1891 // of the source type. Each element must be separately 1892 // checked. 1893 1894 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1895 copied_oop, r19_klass, count_save); 1896 1897 __ align(CodeEntryAlignment); 1898 StubCodeMark mark(this, stub_id); 1899 address start = __ pc(); 1900 1901 __ enter(); // required for proper stackwalking of RuntimeStub frame 1902 1903 #ifdef ASSERT 1904 // caller guarantees that the arrays really are different 1905 // otherwise, we would have to make conjoint checks 1906 { Label L; 1907 __ b(L); // conjoint check not yet implemented 1908 __ stop("checkcast_copy within a single array"); 1909 __ bind(L); 1910 } 1911 #endif //ASSERT 1912 1913 // Caller of this entry point must set up the argument registers. 1914 if (entry != nullptr) { 1915 *entry = __ pc(); 1916 BLOCK_COMMENT("Entry:"); 1917 } 1918 1919 // Empty array: Nothing to do. 1920 __ cbz(count, L_done); 1921 __ push(RegSet::of(r19, r20, r21, r22), sp); 1922 1923 #ifdef ASSERT 1924 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1925 // The ckoff and ckval must be mutually consistent, 1926 // even though caller generates both. 
1927 { Label L; 1928 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1929 __ ldrw(start_to, Address(ckval, sco_offset)); 1930 __ cmpw(ckoff, start_to); 1931 __ br(Assembler::EQ, L); 1932 __ stop("super_check_offset inconsistent"); 1933 __ bind(L); 1934 } 1935 #endif //ASSERT 1936 1937 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1938 bool is_oop = true; 1939 int element_size = UseCompressedOops ? 4 : 8; 1940 if (dest_uninitialized) { 1941 decorators |= IS_DEST_UNINITIALIZED; 1942 } 1943 1944 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1945 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1946 1947 // save the original count 1948 __ mov(count_save, count); 1949 1950 // Copy from low to high addresses 1951 __ mov(start_to, to); // Save destination array start address 1952 __ b(L_load_element); 1953 1954 // ======== begin loop ======== 1955 // (Loop is rotated; its entry is L_load_element.) 1956 // Loop control: 1957 // for (; count != 0; count--) { 1958 // copied_oop = load_heap_oop(from++); 1959 // ... generate_type_check ...; 1960 // store_heap_oop(to++, copied_oop); 1961 // } 1962 __ align(OptoLoopAlignment); 1963 1964 __ BIND(L_store_element); 1965 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1966 __ post(to, element_size), copied_oop, noreg, 1967 gct1, gct2, gct3); 1968 __ sub(count, count, 1); 1969 __ cbz(count, L_do_card_marks); 1970 1971 // ======== loop entry is here ======== 1972 __ BIND(L_load_element); 1973 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1974 copied_oop, noreg, __ post(from, element_size), 1975 gct1); 1976 __ cbz(copied_oop, L_store_element); 1977 1978 __ load_klass(r19_klass, copied_oop);// query the object klass 1979 1980 BLOCK_COMMENT("type_check:"); 1981 generate_type_check(/*sub_klass*/r19_klass, 1982 /*super_check_offset*/ckoff, 1983 /*super_klass*/ckval, 1984 /*r_array_base*/gct1, 1985 /*temp2*/gct2, 1986 /*result*/r10, L_store_element); 1987 1988 // Fall through on failure! 1989 1990 // ======== end loop ======== 1991 1992 // It was a real error; we must depend on the caller to finish the job. 1993 // Register count = remaining oops, count_orig = total oops. 1994 // Emit GC store barriers for the oops we have copied and report 1995 // their number to the caller. 1996 1997 __ subs(count, count_save, count); // K = partially copied oop count 1998 __ eon(count, count, zr); // report (-1^K) to caller 1999 __ br(Assembler::EQ, L_done_pop); 2000 2001 __ BIND(L_do_card_marks); 2002 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2003 2004 __ bind(L_done_pop); 2005 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2006 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2007 2008 __ bind(L_done); 2009 __ mov(r0, count); 2010 __ leave(); 2011 __ ret(lr); 2012 2013 return start; 2014 } 2015 2016 // Perform range checks on the proposed arraycopy. 2017 // Kills temp, but nothing else. 2018 // Also, clean the sign bits of src_pos and dst_pos. 
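  // The bounds tests rely on unsigned (HI) compares: the caller has
  // already rejected negative src_pos, dst_pos and length, so comparing
  // the 32-bit sum (pos + length) against the array length covers the
  // upper bound in a single check.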
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length()) FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
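  // Roughly, with m = (s | d | count) & (BytesPerLong - 1):
  //   m == 0        --> long copy
  //   (m & 3) == 0  --> int copy
  //   (m & 1) == 0  --> short copy
  //   otherwise     --> byte copy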
2069 // 2070 address generate_unsafe_copy(address byte_copy_entry, 2071 address short_copy_entry, 2072 address int_copy_entry, 2073 address long_copy_entry) { 2074 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2075 2076 Label L_long_aligned, L_int_aligned, L_short_aligned; 2077 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2078 2079 __ align(CodeEntryAlignment); 2080 StubCodeMark mark(this, stub_id); 2081 address start = __ pc(); 2082 __ enter(); // required for proper stackwalking of RuntimeStub frame 2083 2084 // bump this on entry, not on exit: 2085 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2086 2087 __ orr(rscratch1, s, d); 2088 __ orr(rscratch1, rscratch1, count); 2089 2090 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2091 __ cbz(rscratch1, L_long_aligned); 2092 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2093 __ cbz(rscratch1, L_int_aligned); 2094 __ tbz(rscratch1, 0, L_short_aligned); 2095 __ b(RuntimeAddress(byte_copy_entry)); 2096 2097 __ BIND(L_short_aligned); 2098 __ lsr(count, count, LogBytesPerShort); // size => short_count 2099 __ b(RuntimeAddress(short_copy_entry)); 2100 __ BIND(L_int_aligned); 2101 __ lsr(count, count, LogBytesPerInt); // size => int_count 2102 __ b(RuntimeAddress(int_copy_entry)); 2103 __ BIND(L_long_aligned); 2104 __ lsr(count, count, LogBytesPerLong); // size => long_count 2105 __ b(RuntimeAddress(long_copy_entry)); 2106 2107 return start; 2108 } 2109 2110 // 2111 // Generate generic array copy stubs 2112 // 2113 // Input: 2114 // c_rarg0 - src oop 2115 // c_rarg1 - src_pos (32-bits) 2116 // c_rarg2 - dst oop 2117 // c_rarg3 - dst_pos (32-bits) 2118 // c_rarg4 - element count (32-bits) 2119 // 2120 // Output: 2121 // r0 == 0 - success 2122 // r0 == -1^K - failure, where K is partial transfer count 2123 // 2124 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2125 address int_copy_entry, address oop_copy_entry, 2126 address long_copy_entry, address checkcast_copy_entry) { 2127 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2128 2129 Label L_failed, L_objArray; 2130 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2131 2132 // Input registers 2133 const Register src = c_rarg0; // source array oop 2134 const Register src_pos = c_rarg1; // source position 2135 const Register dst = c_rarg2; // destination array oop 2136 const Register dst_pos = c_rarg3; // destination position 2137 const Register length = c_rarg4; 2138 2139 2140 // Registers used as temps 2141 const Register dst_klass = c_rarg5; 2142 2143 __ align(CodeEntryAlignment); 2144 2145 StubCodeMark mark(this, stub_id); 2146 2147 address start = __ pc(); 2148 2149 __ enter(); // required for proper stackwalking of RuntimeStub frame 2150 2151 // bump this on entry, not on exit: 2152 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2153 2154 //----------------------------------------------------------------------- 2155 // Assembler stub will be used for this call to arraycopy 2156 // if the following conditions are met: 2157 // 2158 // (1) src and dst must not be null. 2159 // (2) src_pos must not be negative. 2160 // (3) dst_pos must not be negative. 2161 // (4) length must not be negative. 2162 // (5) src klass and dst klass should be the same and not null. 2163 // (6) src and dst should be arrays. 2164 // (7) src_pos + length must not exceed length of src. 2165 // (8) dst_pos + length must not exceed length of dst. 
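    //
    // i.e. roughly the following pseudo-Java guard (conditions (5) and (6)
    // are refined further below for object arrays):
    //
    //   if (src == null || dst == null)               return -1;
    //   if (src_pos < 0 || dst_pos < 0 || length < 0) return -1;
    //   if (src_pos + length > src.length)            return -1;
    //   if (dst_pos + length > dst.length)            return -1;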
2166 // 2167 2168 // if (src == nullptr) return -1; 2169 __ cbz(src, L_failed); 2170 2171 // if (src_pos < 0) return -1; 2172 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2173 2174 // if (dst == nullptr) return -1; 2175 __ cbz(dst, L_failed); 2176 2177 // if (dst_pos < 0) return -1; 2178 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2179 2180 // registers used as temp 2181 const Register scratch_length = r16; // elements count to copy 2182 const Register scratch_src_klass = r17; // array klass 2183 const Register lh = r15; // layout helper 2184 2185 // if (length < 0) return -1; 2186 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2187 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2188 2189 __ load_klass(scratch_src_klass, src); 2190 #ifdef ASSERT 2191 // assert(src->klass() != nullptr); 2192 { 2193 BLOCK_COMMENT("assert klasses not null {"); 2194 Label L1, L2; 2195 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2196 __ bind(L1); 2197 __ stop("broken null klass"); 2198 __ bind(L2); 2199 __ load_klass(rscratch1, dst); 2200 __ cbz(rscratch1, L1); // this would be broken also 2201 BLOCK_COMMENT("} assert klasses not null done"); 2202 } 2203 #endif 2204 2205 // Load layout helper (32-bits) 2206 // 2207 // |array_tag| | header_size | element_type | |log2_element_size| 2208 // 32 30 24 16 8 2 0 2209 // 2210 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2211 // 2212 2213 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2214 2215 // Handle objArrays completely differently... 2216 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2217 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2218 __ movw(rscratch1, objArray_lh); 2219 __ eorw(rscratch2, lh, rscratch1); 2220 __ cbzw(rscratch2, L_objArray); 2221 2222 // if (src->klass() != dst->klass()) return -1; 2223 __ load_klass(rscratch2, dst); 2224 __ eor(rscratch2, rscratch2, scratch_src_klass); 2225 __ cbnz(rscratch2, L_failed); 2226 2227 // if (!src->is_Array()) return -1; 2228 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2229 2230 // At this point, it is known to be a typeArray (array_tag 0x3). 
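    // For such a typeArray the layout helper (see the diagram above) is
    // decoded below as
    //   array header size  = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask
    //   log2(element size) =  lh & Klass::_lh_log2_element_size_mask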
2231 #ifdef ASSERT 2232 { 2233 BLOCK_COMMENT("assert primitive array {"); 2234 Label L; 2235 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2236 __ cmpw(lh, rscratch2); 2237 __ br(Assembler::GE, L); 2238 __ stop("must be a primitive array"); 2239 __ bind(L); 2240 BLOCK_COMMENT("} assert primitive array done"); 2241 } 2242 #endif 2243 2244 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2245 rscratch2, L_failed); 2246 2247 // TypeArrayKlass 2248 // 2249 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2250 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2251 // 2252 2253 const Register rscratch1_offset = rscratch1; // array offset 2254 const Register r15_elsize = lh; // element size 2255 2256 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2257 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2258 __ add(src, src, rscratch1_offset); // src array offset 2259 __ add(dst, dst, rscratch1_offset); // dst array offset 2260 BLOCK_COMMENT("choose copy loop based on element size"); 2261 2262 // next registers should be set before the jump to corresponding stub 2263 const Register from = c_rarg0; // source array address 2264 const Register to = c_rarg1; // destination array address 2265 const Register count = c_rarg2; // elements count 2266 2267 // 'from', 'to', 'count' registers should be set in such order 2268 // since they are the same as 'src', 'src_pos', 'dst'. 2269 2270 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2271 2272 // The possible values of elsize are 0-3, i.e. exact_log2(element 2273 // size in bytes). We do a simple bitwise binary search. 2274 __ BIND(L_copy_bytes); 2275 __ tbnz(r15_elsize, 1, L_copy_ints); 2276 __ tbnz(r15_elsize, 0, L_copy_shorts); 2277 __ lea(from, Address(src, src_pos));// src_addr 2278 __ lea(to, Address(dst, dst_pos));// dst_addr 2279 __ movw(count, scratch_length); // length 2280 __ b(RuntimeAddress(byte_copy_entry)); 2281 2282 __ BIND(L_copy_shorts); 2283 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2284 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2285 __ movw(count, scratch_length); // length 2286 __ b(RuntimeAddress(short_copy_entry)); 2287 2288 __ BIND(L_copy_ints); 2289 __ tbnz(r15_elsize, 0, L_copy_longs); 2290 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2291 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2292 __ movw(count, scratch_length); // length 2293 __ b(RuntimeAddress(int_copy_entry)); 2294 2295 __ BIND(L_copy_longs); 2296 #ifdef ASSERT 2297 { 2298 BLOCK_COMMENT("assert long copy {"); 2299 Label L; 2300 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2301 __ cmpw(r15_elsize, LogBytesPerLong); 2302 __ br(Assembler::EQ, L); 2303 __ stop("must be long copy, but elsize is wrong"); 2304 __ bind(L); 2305 BLOCK_COMMENT("} assert long copy done"); 2306 } 2307 #endif 2308 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2309 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2310 __ movw(count, scratch_length); // length 2311 __ b(RuntimeAddress(long_copy_entry)); 2312 2313 // ObjArrayKlass 2314 __ BIND(L_objArray); 2315 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2316 2317 Label L_plain_copy, L_checkcast_copy; 2318 // test array classes for subtyping 2319 __ load_klass(r15, dst); 2320 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2321 __ br(Assembler::NE, L_checkcast_copy); 2322 2323 // Identically typed arrays can be copied without element-wise checks. 2324 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2325 rscratch2, L_failed); 2326 2327 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2328 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2329 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2330 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2331 __ movw(count, scratch_length); // length 2332 __ BIND(L_plain_copy); 2333 __ b(RuntimeAddress(oop_copy_entry)); 2334 2335 __ BIND(L_checkcast_copy); 2336 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2337 { 2338 // Before looking at dst.length, make sure dst is also an objArray. 2339 __ ldrw(rscratch1, Address(r15, lh_offset)); 2340 __ movw(rscratch2, objArray_lh); 2341 __ eorw(rscratch1, rscratch1, rscratch2); 2342 __ cbnzw(rscratch1, L_failed); 2343 2344 // It is safe to examine both src.length and dst.length. 2345 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2346 r15, L_failed); 2347 2348 __ load_klass(dst_klass, dst); // reload 2349 2350 // Marshal the base address arguments now, freeing registers. 2351 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2352 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2353 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2354 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2355 __ movw(count, length); // length (reloaded) 2356 Register sco_temp = c_rarg3; // this register is free now 2357 assert_different_registers(from, to, count, sco_temp, 2358 dst_klass, scratch_src_klass); 2359 // assert_clean_int(count, sco_temp); 2360 2361 // Generate the type check. 2362 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2363 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2364 2365 // Smashes rscratch1, rscratch2 2366 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2367 L_plain_copy); 2368 2369 // Fetch destination element klass from the ObjArrayKlass header. 2370 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2371 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2372 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2373 2374 // the checkcast_copy loop needs two extra arguments: 2375 assert(c_rarg3 == sco_temp, "#3 already in place"); 2376 // Set up arguments for checkcast_copy_entry. 2377 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2378 __ b(RuntimeAddress(checkcast_copy_entry)); 2379 } 2380 2381 __ BIND(L_failed); 2382 __ mov(r0, -1); 2383 __ leave(); // required for proper stackwalking of RuntimeStub frame 2384 __ ret(lr); 2385 2386 return start; 2387 } 2388 2389 // 2390 // Generate stub for array fill. If "aligned" is true, the 2391 // "to" address is assumed to be heapword aligned. 
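  // The fill value is first replicated into a full 64-bit pattern so the
  // main loop can store whole doublewords; arrays shorter than 8 bytes
  // are filled element by element instead.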
2392 // 2393 // Arguments for generated stub: 2394 // to: c_rarg0 2395 // value: c_rarg1 2396 // count: c_rarg2 treated as signed 2397 // 2398 address generate_fill(StubGenStubId stub_id) { 2399 BasicType t; 2400 bool aligned; 2401 2402 switch (stub_id) { 2403 case jbyte_fill_id: 2404 t = T_BYTE; 2405 aligned = false; 2406 break; 2407 case jshort_fill_id: 2408 t = T_SHORT; 2409 aligned = false; 2410 break; 2411 case jint_fill_id: 2412 t = T_INT; 2413 aligned = false; 2414 break; 2415 case arrayof_jbyte_fill_id: 2416 t = T_BYTE; 2417 aligned = true; 2418 break; 2419 case arrayof_jshort_fill_id: 2420 t = T_SHORT; 2421 aligned = true; 2422 break; 2423 case arrayof_jint_fill_id: 2424 t = T_INT; 2425 aligned = true; 2426 break; 2427 default: 2428 ShouldNotReachHere(); 2429 }; 2430 2431 __ align(CodeEntryAlignment); 2432 StubCodeMark mark(this, stub_id); 2433 address start = __ pc(); 2434 2435 BLOCK_COMMENT("Entry:"); 2436 2437 const Register to = c_rarg0; // source array address 2438 const Register value = c_rarg1; // value 2439 const Register count = c_rarg2; // elements count 2440 2441 const Register bz_base = r10; // base for block_zero routine 2442 const Register cnt_words = r11; // temp register 2443 2444 __ enter(); 2445 2446 Label L_fill_elements, L_exit1; 2447 2448 int shift = -1; 2449 switch (t) { 2450 case T_BYTE: 2451 shift = 0; 2452 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2453 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2454 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2455 __ br(Assembler::LO, L_fill_elements); 2456 break; 2457 case T_SHORT: 2458 shift = 1; 2459 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2460 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2461 __ br(Assembler::LO, L_fill_elements); 2462 break; 2463 case T_INT: 2464 shift = 2; 2465 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2466 __ br(Assembler::LO, L_fill_elements); 2467 break; 2468 default: ShouldNotReachHere(); 2469 } 2470 2471 // Align source address at 8 bytes address boundary. 2472 Label L_skip_align1, L_skip_align2, L_skip_align4; 2473 if (!aligned) { 2474 switch (t) { 2475 case T_BYTE: 2476 // One byte misalignment happens only for byte arrays. 2477 __ tbz(to, 0, L_skip_align1); 2478 __ strb(value, Address(__ post(to, 1))); 2479 __ subw(count, count, 1); 2480 __ bind(L_skip_align1); 2481 // Fallthrough 2482 case T_SHORT: 2483 // Two bytes misalignment happens only for byte and short (char) arrays. 2484 __ tbz(to, 1, L_skip_align2); 2485 __ strh(value, Address(__ post(to, 2))); 2486 __ subw(count, count, 2 >> shift); 2487 __ bind(L_skip_align2); 2488 // Fallthrough 2489 case T_INT: 2490 // Align to 8 bytes, we know we are 4 byte aligned to start. 2491 __ tbz(to, 2, L_skip_align4); 2492 __ strw(value, Address(__ post(to, 4))); 2493 __ subw(count, count, 4 >> shift); 2494 __ bind(L_skip_align4); 2495 break; 2496 default: ShouldNotReachHere(); 2497 } 2498 } 2499 2500 // 2501 // Fill large chunks 2502 // 2503 __ lsrw(cnt_words, count, 3 - shift); // number of words 2504 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2505 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2506 if (UseBlockZeroing) { 2507 Label non_block_zeroing, rest; 2508 // If the fill value is zero we can use the fast zero_words(). 
2509 __ cbnz(value, non_block_zeroing); 2510 __ mov(bz_base, to); 2511 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2512 address tpc = __ zero_words(bz_base, cnt_words); 2513 if (tpc == nullptr) { 2514 fatal("CodeCache is full at generate_fill"); 2515 } 2516 __ b(rest); 2517 __ bind(non_block_zeroing); 2518 __ fill_words(to, cnt_words, value); 2519 __ bind(rest); 2520 } else { 2521 __ fill_words(to, cnt_words, value); 2522 } 2523 2524 // Remaining count is less than 8 bytes. Fill it by a single store. 2525 // Note that the total length is no less than 8 bytes. 2526 if (t == T_BYTE || t == T_SHORT) { 2527 Label L_exit1; 2528 __ cbzw(count, L_exit1); 2529 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2530 __ str(value, Address(to, -8)); // overwrite some elements 2531 __ bind(L_exit1); 2532 __ leave(); 2533 __ ret(lr); 2534 } 2535 2536 // Handle copies less than 8 bytes. 2537 Label L_fill_2, L_fill_4, L_exit2; 2538 __ bind(L_fill_elements); 2539 switch (t) { 2540 case T_BYTE: 2541 __ tbz(count, 0, L_fill_2); 2542 __ strb(value, Address(__ post(to, 1))); 2543 __ bind(L_fill_2); 2544 __ tbz(count, 1, L_fill_4); 2545 __ strh(value, Address(__ post(to, 2))); 2546 __ bind(L_fill_4); 2547 __ tbz(count, 2, L_exit2); 2548 __ strw(value, Address(to)); 2549 break; 2550 case T_SHORT: 2551 __ tbz(count, 0, L_fill_4); 2552 __ strh(value, Address(__ post(to, 2))); 2553 __ bind(L_fill_4); 2554 __ tbz(count, 1, L_exit2); 2555 __ strw(value, Address(to)); 2556 break; 2557 case T_INT: 2558 __ cbzw(count, L_exit2); 2559 __ strw(value, Address(to)); 2560 break; 2561 default: ShouldNotReachHere(); 2562 } 2563 __ bind(L_exit2); 2564 __ leave(); 2565 __ ret(lr); 2566 return start; 2567 } 2568 2569 address generate_data_cache_writeback() { 2570 const Register line = c_rarg0; // address of line to write back 2571 2572 __ align(CodeEntryAlignment); 2573 2574 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id; 2575 StubCodeMark mark(this, stub_id); 2576 2577 address start = __ pc(); 2578 __ enter(); 2579 __ cache_wb(Address(line, 0)); 2580 __ leave(); 2581 __ ret(lr); 2582 2583 return start; 2584 } 2585 2586 address generate_data_cache_writeback_sync() { 2587 const Register is_pre = c_rarg0; // pre or post sync 2588 2589 __ align(CodeEntryAlignment); 2590 2591 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id; 2592 StubCodeMark mark(this, stub_id); 2593 2594 // pre wbsync is a no-op 2595 // post wbsync translates to an sfence 2596 2597 Label skip; 2598 address start = __ pc(); 2599 __ enter(); 2600 __ cbnz(is_pre, skip); 2601 __ cache_wbsync(false); 2602 __ bind(skip); 2603 __ leave(); 2604 __ ret(lr); 2605 2606 return start; 2607 } 2608 2609 void generate_arraycopy_stubs() { 2610 address entry; 2611 address entry_jbyte_arraycopy; 2612 address entry_jshort_arraycopy; 2613 address entry_jint_arraycopy; 2614 address entry_oop_arraycopy; 2615 address entry_jlong_arraycopy; 2616 address entry_checkcast_arraycopy; 2617 2618 generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2619 generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2620 2621 generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2622 generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2623 2624 generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 
2625 generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2626 2627 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2628 2629 //*** jbyte 2630 // Always need aligned and unaligned versions 2631 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry); 2632 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2633 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry); 2634 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr); 2635 2636 //*** jshort 2637 // Always need aligned and unaligned versions 2638 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry); 2639 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2640 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry); 2641 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr); 2642 2643 //*** jint 2644 // Aligned versions 2645 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry); 2646 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2647 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2648 // entry_jint_arraycopy always points to the unaligned version 2649 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry); 2650 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy); 2651 2652 //*** jlong 2653 // It is always aligned 2654 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry); 2655 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2656 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2657 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2658 2659 //*** oops 2660 { 2661 // With compressed oops we need unaligned versions; notice that 2662 // we overwrite entry_oop_arraycopy. 
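      // (With compressed oops each element is a 32-bit narrow oop, so the
      // arrayof_ entries can only assume 4-byte element alignment; with
      // uncompressed oops every element is a 64-bit oop and the copy is
      // always heapword aligned.)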
2663 bool aligned = !UseCompressedOops; 2664 2665 StubRoutines::_arrayof_oop_disjoint_arraycopy 2666 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry); 2667 StubRoutines::_arrayof_oop_arraycopy 2668 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2669 // Aligned versions without pre-barriers 2670 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2671 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2672 StubRoutines::_arrayof_oop_arraycopy_uninit 2673 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2674 } 2675 2676 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2677 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2678 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2679 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2680 2681 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2682 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr); 2683 2684 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2685 entry_jshort_arraycopy, 2686 entry_jint_arraycopy, 2687 entry_jlong_arraycopy); 2688 2689 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2690 entry_jshort_arraycopy, 2691 entry_jint_arraycopy, 2692 entry_oop_arraycopy, 2693 entry_jlong_arraycopy, 2694 entry_checkcast_arraycopy); 2695 2696 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 2697 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 2698 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 2699 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 2700 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 2701 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 2702 } 2703 2704 void generate_math_stubs() { Unimplemented(); } 2705 2706 // Arguments: 2707 // 2708 // Inputs: 2709 // c_rarg0 - source byte array address 2710 // c_rarg1 - destination byte array address 2711 // c_rarg2 - K (key) in little endian int array 2712 // 2713 address generate_aescrypt_encryptBlock() { 2714 __ align(CodeEntryAlignment); 2715 StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2716 StubCodeMark mark(this, stub_id); 2717 2718 const Register from = c_rarg0; // source array address 2719 const Register to = c_rarg1; // destination array address 2720 const Register key = c_rarg2; // key array address 2721 const Register keylen = rscratch1; 2722 2723 address start = __ pc(); 2724 __ enter(); 2725 2726 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2727 2728 __ aesenc_loadkeys(key, keylen); 2729 __ aesecb_encrypt(from, to, keylen); 2730 2731 __ mov(r0, 0); 2732 2733 __ leave(); 2734 __ ret(lr); 2735 2736 return start; 2737 } 2738 2739 // Arguments: 2740 // 2741 // Inputs: 2742 // c_rarg0 - source byte array address 2743 // c_rarg1 - destination byte array address 2744 // c_rarg2 - K (key) in little endian int array 2745 // 2746 address generate_aescrypt_decryptBlock() { 2747 assert(UseAES, "need 
AES cryptographic extension support"); 2748 __ align(CodeEntryAlignment); 2749 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2750 StubCodeMark mark(this, stub_id); 2751 Label L_doLast; 2752 2753 const Register from = c_rarg0; // source array address 2754 const Register to = c_rarg1; // destination array address 2755 const Register key = c_rarg2; // key array address 2756 const Register keylen = rscratch1; 2757 2758 address start = __ pc(); 2759 __ enter(); // required for proper stackwalking of RuntimeStub frame 2760 2761 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2762 2763 __ aesecb_decrypt(from, to, key, keylen); 2764 2765 __ mov(r0, 0); 2766 2767 __ leave(); 2768 __ ret(lr); 2769 2770 return start; 2771 } 2772 2773 // Arguments: 2774 // 2775 // Inputs: 2776 // c_rarg0 - source byte array address 2777 // c_rarg1 - destination byte array address 2778 // c_rarg2 - K (key) in little endian int array 2779 // c_rarg3 - r vector byte array address 2780 // c_rarg4 - input length 2781 // 2782 // Output: 2783 // x0 - input length 2784 // 2785 address generate_cipherBlockChaining_encryptAESCrypt() { 2786 assert(UseAES, "need AES cryptographic extension support"); 2787 __ align(CodeEntryAlignment); 2788 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id; 2789 StubCodeMark mark(this, stub_id); 2790 2791 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2792 2793 const Register from = c_rarg0; // source array address 2794 const Register to = c_rarg1; // destination array address 2795 const Register key = c_rarg2; // key array address 2796 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2797 // and left with the results of the last encryption block 2798 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2799 const Register keylen = rscratch1; 2800 2801 address start = __ pc(); 2802 2803 __ enter(); 2804 2805 __ movw(rscratch2, len_reg); 2806 2807 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2808 2809 __ ld1(v0, __ T16B, rvec); 2810 2811 __ cmpw(keylen, 52); 2812 __ br(Assembler::CC, L_loadkeys_44); 2813 __ br(Assembler::EQ, L_loadkeys_52); 2814 2815 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2816 __ rev32(v17, __ T16B, v17); 2817 __ rev32(v18, __ T16B, v18); 2818 __ BIND(L_loadkeys_52); 2819 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2820 __ rev32(v19, __ T16B, v19); 2821 __ rev32(v20, __ T16B, v20); 2822 __ BIND(L_loadkeys_44); 2823 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2824 __ rev32(v21, __ T16B, v21); 2825 __ rev32(v22, __ T16B, v22); 2826 __ rev32(v23, __ T16B, v23); 2827 __ rev32(v24, __ T16B, v24); 2828 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2829 __ rev32(v25, __ T16B, v25); 2830 __ rev32(v26, __ T16B, v26); 2831 __ rev32(v27, __ T16B, v27); 2832 __ rev32(v28, __ T16B, v28); 2833 __ ld1(v29, v30, v31, __ T16B, key); 2834 __ rev32(v29, __ T16B, v29); 2835 __ rev32(v30, __ T16B, v30); 2836 __ rev32(v31, __ T16B, v31); 2837 2838 __ BIND(L_aes_loop); 2839 __ ld1(v1, __ T16B, __ post(from, 16)); 2840 __ eor(v0, __ T16B, v0, v1); 2841 2842 __ br(Assembler::CC, L_rounds_44); 2843 __ br(Assembler::EQ, L_rounds_52); 2844 2845 __ aese(v0, v17); __ aesmc(v0, v0); 2846 __ aese(v0, v18); __ aesmc(v0, v0); 2847 __ BIND(L_rounds_52); 2848 __ aese(v0, v19); __ aesmc(v0, v0); 2849 __ aese(v0, v20); 
__ aesmc(v0, v0); 2850 __ BIND(L_rounds_44); 2851 __ aese(v0, v21); __ aesmc(v0, v0); 2852 __ aese(v0, v22); __ aesmc(v0, v0); 2853 __ aese(v0, v23); __ aesmc(v0, v0); 2854 __ aese(v0, v24); __ aesmc(v0, v0); 2855 __ aese(v0, v25); __ aesmc(v0, v0); 2856 __ aese(v0, v26); __ aesmc(v0, v0); 2857 __ aese(v0, v27); __ aesmc(v0, v0); 2858 __ aese(v0, v28); __ aesmc(v0, v0); 2859 __ aese(v0, v29); __ aesmc(v0, v0); 2860 __ aese(v0, v30); 2861 __ eor(v0, __ T16B, v0, v31); 2862 2863 __ st1(v0, __ T16B, __ post(to, 16)); 2864 2865 __ subw(len_reg, len_reg, 16); 2866 __ cbnzw(len_reg, L_aes_loop); 2867 2868 __ st1(v0, __ T16B, rvec); 2869 2870 __ mov(r0, rscratch2); 2871 2872 __ leave(); 2873 __ ret(lr); 2874 2875 return start; 2876 } 2877 2878 // Arguments: 2879 // 2880 // Inputs: 2881 // c_rarg0 - source byte array address 2882 // c_rarg1 - destination byte array address 2883 // c_rarg2 - K (key) in little endian int array 2884 // c_rarg3 - r vector byte array address 2885 // c_rarg4 - input length 2886 // 2887 // Output: 2888 // r0 - input length 2889 // 2890 address generate_cipherBlockChaining_decryptAESCrypt() { 2891 assert(UseAES, "need AES cryptographic extension support"); 2892 __ align(CodeEntryAlignment); 2893 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id; 2894 StubCodeMark mark(this, stub_id); 2895 2896 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2897 2898 const Register from = c_rarg0; // source array address 2899 const Register to = c_rarg1; // destination array address 2900 const Register key = c_rarg2; // key array address 2901 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2902 // and left with the results of the last encryption block 2903 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2904 const Register keylen = rscratch1; 2905 2906 address start = __ pc(); 2907 2908 __ enter(); 2909 2910 __ movw(rscratch2, len_reg); 2911 2912 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2913 2914 __ ld1(v2, __ T16B, rvec); 2915 2916 __ ld1(v31, __ T16B, __ post(key, 16)); 2917 __ rev32(v31, __ T16B, v31); 2918 2919 __ cmpw(keylen, 52); 2920 __ br(Assembler::CC, L_loadkeys_44); 2921 __ br(Assembler::EQ, L_loadkeys_52); 2922 2923 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2924 __ rev32(v17, __ T16B, v17); 2925 __ rev32(v18, __ T16B, v18); 2926 __ BIND(L_loadkeys_52); 2927 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2928 __ rev32(v19, __ T16B, v19); 2929 __ rev32(v20, __ T16B, v20); 2930 __ BIND(L_loadkeys_44); 2931 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2932 __ rev32(v21, __ T16B, v21); 2933 __ rev32(v22, __ T16B, v22); 2934 __ rev32(v23, __ T16B, v23); 2935 __ rev32(v24, __ T16B, v24); 2936 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2937 __ rev32(v25, __ T16B, v25); 2938 __ rev32(v26, __ T16B, v26); 2939 __ rev32(v27, __ T16B, v27); 2940 __ rev32(v28, __ T16B, v28); 2941 __ ld1(v29, v30, __ T16B, key); 2942 __ rev32(v29, __ T16B, v29); 2943 __ rev32(v30, __ T16B, v30); 2944 2945 __ BIND(L_aes_loop); 2946 __ ld1(v0, __ T16B, __ post(from, 16)); 2947 __ orr(v1, __ T16B, v0, v0); 2948 2949 __ br(Assembler::CC, L_rounds_44); 2950 __ br(Assembler::EQ, L_rounds_52); 2951 2952 __ aesd(v0, v17); __ aesimc(v0, v0); 2953 __ aesd(v0, v18); __ aesimc(v0, v0); 2954 __ BIND(L_rounds_52); 2955 __ aesd(v0, v19); __ aesimc(v0, v0); 2956 __ aesd(v0, v20); __ aesimc(v0, v0); 2957 
__ BIND(L_rounds_44); 2958 __ aesd(v0, v21); __ aesimc(v0, v0); 2959 __ aesd(v0, v22); __ aesimc(v0, v0); 2960 __ aesd(v0, v23); __ aesimc(v0, v0); 2961 __ aesd(v0, v24); __ aesimc(v0, v0); 2962 __ aesd(v0, v25); __ aesimc(v0, v0); 2963 __ aesd(v0, v26); __ aesimc(v0, v0); 2964 __ aesd(v0, v27); __ aesimc(v0, v0); 2965 __ aesd(v0, v28); __ aesimc(v0, v0); 2966 __ aesd(v0, v29); __ aesimc(v0, v0); 2967 __ aesd(v0, v30); 2968 __ eor(v0, __ T16B, v0, v31); 2969 __ eor(v0, __ T16B, v0, v2); 2970 2971 __ st1(v0, __ T16B, __ post(to, 16)); 2972 __ orr(v2, __ T16B, v1, v1); 2973 2974 __ subw(len_reg, len_reg, 16); 2975 __ cbnzw(len_reg, L_aes_loop); 2976 2977 __ st1(v2, __ T16B, rvec); 2978 2979 __ mov(r0, rscratch2); 2980 2981 __ leave(); 2982 __ ret(lr); 2983 2984 return start; 2985 } 2986 2987 // Big-endian 128-bit + 64-bit -> 128-bit addition. 2988 // Inputs: 128-bits. in is preserved. 2989 // The least-significant 64-bit word is in the upper dword of each vector. 2990 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 2991 // Output: result 2992 void be_add_128_64(FloatRegister result, FloatRegister in, 2993 FloatRegister inc, FloatRegister tmp) { 2994 assert_different_registers(result, tmp, inc); 2995 2996 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 2997 // input 2998 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 2999 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3000 // MSD == 0 (must be!) to LSD 3001 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3002 } 3003 3004 // CTR AES crypt. 3005 // Arguments: 3006 // 3007 // Inputs: 3008 // c_rarg0 - source byte array address 3009 // c_rarg1 - destination byte array address 3010 // c_rarg2 - K (key) in little endian int array 3011 // c_rarg3 - counter vector byte array address 3012 // c_rarg4 - input length 3013 // c_rarg5 - saved encryptedCounter start 3014 // c_rarg6 - saved used length 3015 // 3016 // Output: 3017 // r0 - input length 3018 // 3019 address generate_counterMode_AESCrypt() { 3020 const Register in = c_rarg0; 3021 const Register out = c_rarg1; 3022 const Register key = c_rarg2; 3023 const Register counter = c_rarg3; 3024 const Register saved_len = c_rarg4, len = r10; 3025 const Register saved_encrypted_ctr = c_rarg5; 3026 const Register used_ptr = c_rarg6, used = r12; 3027 3028 const Register offset = r7; 3029 const Register keylen = r11; 3030 3031 const unsigned char block_size = 16; 3032 const int bulk_width = 4; 3033 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3034 // performance with larger data sizes, but it also means that the 3035 // fast path isn't used until you have at least 8 blocks, and up 3036 // to 127 bytes of data will be executed on the slow path. For 3037 // that reason, and also so as not to blow away too much icache, 4 3038 // blocks seems like a sensible compromise. 
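    // In CTR mode each 16-byte block is produced as
    //   C[i] = P[i] ^ E_k(counter + i)
    // where the 128-bit counter is interpreted as a big-endian integer;
    // the increment is performed by be_add_128_64() on a byte-reversed
    // copy of the counter.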
3039 3040 // Algorithm: 3041 // 3042 // if (len == 0) { 3043 // goto DONE; 3044 // } 3045 // int result = len; 3046 // do { 3047 // if (used >= blockSize) { 3048 // if (len >= bulk_width * blockSize) { 3049 // CTR_large_block(); 3050 // if (len == 0) 3051 // goto DONE; 3052 // } 3053 // for (;;) { 3054 // 16ByteVector v0 = counter; 3055 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3056 // used = 0; 3057 // if (len < blockSize) 3058 // break; /* goto NEXT */ 3059 // 16ByteVector v1 = load16Bytes(in, offset); 3060 // v1 = v1 ^ encryptedCounter; 3061 // store16Bytes(out, offset); 3062 // used = blockSize; 3063 // offset += blockSize; 3064 // len -= blockSize; 3065 // if (len == 0) 3066 // goto DONE; 3067 // } 3068 // } 3069 // NEXT: 3070 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3071 // len--; 3072 // } while (len != 0); 3073 // DONE: 3074 // return result; 3075 // 3076 // CTR_large_block() 3077 // Wide bulk encryption of whole blocks. 3078 3079 __ align(CodeEntryAlignment); 3080 StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id; 3081 StubCodeMark mark(this, stub_id); 3082 const address start = __ pc(); 3083 __ enter(); 3084 3085 Label DONE, CTR_large_block, large_block_return; 3086 __ ldrw(used, Address(used_ptr)); 3087 __ cbzw(saved_len, DONE); 3088 3089 __ mov(len, saved_len); 3090 __ mov(offset, 0); 3091 3092 // Compute #rounds for AES based on the length of the key array 3093 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3094 3095 __ aesenc_loadkeys(key, keylen); 3096 3097 { 3098 Label L_CTR_loop, NEXT; 3099 3100 __ bind(L_CTR_loop); 3101 3102 __ cmp(used, block_size); 3103 __ br(__ LO, NEXT); 3104 3105 // Maybe we have a lot of data 3106 __ subsw(rscratch1, len, bulk_width * block_size); 3107 __ br(__ HS, CTR_large_block); 3108 __ BIND(large_block_return); 3109 __ cbzw(len, DONE); 3110 3111 // Setup the counter 3112 __ movi(v4, __ T4S, 0); 3113 __ movi(v5, __ T4S, 1); 3114 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3115 3116 // 128-bit big-endian increment 3117 __ ld1(v0, __ T16B, counter); 3118 __ rev64(v16, __ T16B, v0); 3119 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3120 __ rev64(v16, __ T16B, v16); 3121 __ st1(v16, __ T16B, counter); 3122 // Previous counter value is in v0 3123 // v4 contains { 0, 1 } 3124 3125 { 3126 // We have fewer than bulk_width blocks of data left. Encrypt 3127 // them one by one until there is less than a full block 3128 // remaining, being careful to save both the encrypted counter 3129 // and the counter. 3130 3131 Label inner_loop; 3132 __ bind(inner_loop); 3133 // Counter to encrypt is in v0 3134 __ aesecb_encrypt(noreg, noreg, keylen); 3135 __ st1(v0, __ T16B, saved_encrypted_ctr); 3136 3137 // Do we have a remaining full block? 
3138 3139 __ mov(used, 0); 3140 __ cmp(len, block_size); 3141 __ br(__ LO, NEXT); 3142 3143 // Yes, we have a full block 3144 __ ldrq(v1, Address(in, offset)); 3145 __ eor(v1, __ T16B, v1, v0); 3146 __ strq(v1, Address(out, offset)); 3147 __ mov(used, block_size); 3148 __ add(offset, offset, block_size); 3149 3150 __ subw(len, len, block_size); 3151 __ cbzw(len, DONE); 3152 3153 // Increment the counter, store it back 3154 __ orr(v0, __ T16B, v16, v16); 3155 __ rev64(v16, __ T16B, v16); 3156 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3157 __ rev64(v16, __ T16B, v16); 3158 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3159 3160 __ b(inner_loop); 3161 } 3162 3163 __ BIND(NEXT); 3164 3165 // Encrypt a single byte, and loop. 3166 // We expect this to be a rare event. 3167 __ ldrb(rscratch1, Address(in, offset)); 3168 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3169 __ eor(rscratch1, rscratch1, rscratch2); 3170 __ strb(rscratch1, Address(out, offset)); 3171 __ add(offset, offset, 1); 3172 __ add(used, used, 1); 3173 __ subw(len, len,1); 3174 __ cbnzw(len, L_CTR_loop); 3175 } 3176 3177 __ bind(DONE); 3178 __ strw(used, Address(used_ptr)); 3179 __ mov(r0, saved_len); 3180 3181 __ leave(); // required for proper stackwalking of RuntimeStub frame 3182 __ ret(lr); 3183 3184 // Bulk encryption 3185 3186 __ BIND (CTR_large_block); 3187 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3188 3189 if (bulk_width == 8) { 3190 __ sub(sp, sp, 4 * 16); 3191 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3192 } 3193 __ sub(sp, sp, 4 * 16); 3194 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3195 RegSet saved_regs = (RegSet::of(in, out, offset) 3196 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3197 __ push(saved_regs, sp); 3198 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3199 __ add(in, in, offset); 3200 __ add(out, out, offset); 3201 3202 // Keys should already be loaded into the correct registers 3203 3204 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3205 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3206 3207 // AES/CTR loop 3208 { 3209 Label L_CTR_loop; 3210 __ BIND(L_CTR_loop); 3211 3212 // Setup the counters 3213 __ movi(v8, __ T4S, 0); 3214 __ movi(v9, __ T4S, 1); 3215 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3216 3217 for (int i = 0; i < bulk_width; i++) { 3218 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3219 __ rev64(v0_ofs, __ T16B, v16); 3220 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3221 } 3222 3223 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3224 3225 // Encrypt the counters 3226 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3227 3228 if (bulk_width == 8) { 3229 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3230 } 3231 3232 // XOR the encrypted counters with the inputs 3233 for (int i = 0; i < bulk_width; i++) { 3234 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3235 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3236 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3237 } 3238 3239 // Write the encrypted data 3240 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3241 if (bulk_width == 8) { 3242 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3243 } 3244 3245 __ subw(len, len, 16 * bulk_width); 3246 __ cbnzw(len, L_CTR_loop); 3247 } 3248 3249 // Save the counter back where it goes 3250 __ rev64(v16, __ T16B, v16); 3251 __ st1(v16, __ T16B, counter); 3252 3253 __ pop(saved_regs, sp); 
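    // Restore the SIMD registers (v8..v11, plus v12..v15 when
    // bulk_width == 8) that were spilled on entry to CTR_large_block.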
3254 3255 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3256 if (bulk_width == 8) { 3257 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3258 } 3259 3260 __ andr(rscratch1, len, -16 * bulk_width); 3261 __ sub(len, len, rscratch1); 3262 __ add(offset, offset, rscratch1); 3263 __ mov(used, 16); 3264 __ strw(used, Address(used_ptr)); 3265 __ b(large_block_return); 3266 3267 return start; 3268 } 3269 3270 // Vector AES Galois Counter Mode implementation. Parameters: 3271 // 3272 // in = c_rarg0 3273 // len = c_rarg1 3274 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3275 // out = c_rarg3 3276 // key = c_rarg4 3277 // state = c_rarg5 - GHASH.state 3278 // subkeyHtbl = c_rarg6 - powers of H 3279 // counter = c_rarg7 - 16 bytes of CTR 3280 // return - number of processed bytes 3281 address generate_galoisCounterMode_AESCrypt() { 3282 address ghash_polynomial = __ pc(); 3283 __ emit_int64(0x87); // The low-order bits of the field 3284 // polynomial (i.e. p = z^7+z^2+z+1) 3285 // repeated in the low and high parts of a 3286 // 128-bit vector 3287 __ emit_int64(0x87); 3288 3289 __ align(CodeEntryAlignment); 3290 StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id; 3291 StubCodeMark mark(this, stub_id); 3292 address start = __ pc(); 3293 __ enter(); 3294 3295 const Register in = c_rarg0; 3296 const Register len = c_rarg1; 3297 const Register ct = c_rarg2; 3298 const Register out = c_rarg3; 3299 // and updated with the incremented counter in the end 3300 3301 const Register key = c_rarg4; 3302 const Register state = c_rarg5; 3303 3304 const Register subkeyHtbl = c_rarg6; 3305 3306 const Register counter = c_rarg7; 3307 3308 const Register keylen = r10; 3309 // Save state before entering routine 3310 __ sub(sp, sp, 4 * 16); 3311 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3312 __ sub(sp, sp, 4 * 16); 3313 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3314 3315 // __ andr(len, len, -512); 3316 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3317 __ str(len, __ pre(sp, -2 * wordSize)); 3318 3319 Label DONE; 3320 __ cbz(len, DONE); 3321 3322 // Compute #rounds for AES based on the length of the key array 3323 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3324 3325 __ aesenc_loadkeys(key, keylen); 3326 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3327 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3328 3329 // AES/CTR loop 3330 { 3331 Label L_CTR_loop; 3332 __ BIND(L_CTR_loop); 3333 3334 // Setup the counters 3335 __ movi(v8, __ T4S, 0); 3336 __ movi(v9, __ T4S, 1); 3337 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3338 3339 assert(v0->encoding() < v8->encoding(), ""); 3340 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3341 FloatRegister f = as_FloatRegister(i); 3342 __ rev32(f, __ T16B, v16); 3343 __ addv(v16, __ T4S, v16, v8); 3344 } 3345 3346 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3347 3348 // Encrypt the counters 3349 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3350 3351 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3352 3353 // XOR the encrypted counters with the inputs 3354 for (int i = 0; i < 8; i++) { 3355 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3356 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3357 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3358 } 3359 __ st1(v0, v1, v2, v3, __ T16B, __ 
post(out, 4 * 16)); 3360 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3361 3362 __ subw(len, len, 16 * 8); 3363 __ cbnzw(len, L_CTR_loop); 3364 } 3365 3366 __ rev32(v16, __ T16B, v16); 3367 __ st1(v16, __ T16B, counter); 3368 3369 __ ldr(len, Address(sp)); 3370 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3371 3372 // GHASH/CTR loop 3373 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3374 len, /*unrolls*/4); 3375 3376 #ifdef ASSERT 3377 { Label L; 3378 __ cmp(len, (unsigned char)0); 3379 __ br(Assembler::EQ, L); 3380 __ stop("stubGenerator: abort"); 3381 __ bind(L); 3382 } 3383 #endif 3384 3385 __ bind(DONE); 3386 // Return the number of bytes processed 3387 __ ldr(r0, __ post(sp, 2 * wordSize)); 3388 3389 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3390 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3391 3392 __ leave(); // required for proper stackwalking of RuntimeStub frame 3393 __ ret(lr); 3394 return start; 3395 } 3396 3397 class Cached64Bytes { 3398 private: 3399 MacroAssembler *_masm; 3400 Register _regs[8]; 3401 3402 public: 3403 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3404 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3405 auto it = rs.begin(); 3406 for (auto &r: _regs) { 3407 r = *it; 3408 ++it; 3409 } 3410 } 3411 3412 void gen_loads(Register base) { 3413 for (int i = 0; i < 8; i += 2) { 3414 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3415 } 3416 } 3417 3418 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3419 void extract_u32(Register dest, int i) { 3420 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3421 } 3422 }; 3423 3424 // Utility routines for md5. 3425 // Clobbers r10 and r11. 
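  //
  // Each helper performs one MD5 step of the form
  //   a = b + rotl32(a + f(b, c, d) + X[k] + T[i], s)
  // where X[k] is the k-th 32-bit word of the cached 64-byte block and
  // T[i] is the round constant passed in as 't'.  The mixing function f
  // is, respectively:
  //   FF: F(b, c, d) = (b & c) | (~b & d)   (computed as ((c ^ d) & b) ^ d)
  //   GG: G(b, c, d) = (b & d) | (c & ~d)
  //   HH: H(b, c, d) = b ^ c ^ d
  //   II: I(b, c, d) = c ^ (b | ~d)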
3426 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3427 int k, int s, int t) { 3428 Register rscratch3 = r10; 3429 Register rscratch4 = r11; 3430 3431 __ eorw(rscratch3, r3, r4); 3432 __ movw(rscratch2, t); 3433 __ andw(rscratch3, rscratch3, r2); 3434 __ addw(rscratch4, r1, rscratch2); 3435 reg_cache.extract_u32(rscratch1, k); 3436 __ eorw(rscratch3, rscratch3, r4); 3437 __ addw(rscratch4, rscratch4, rscratch1); 3438 __ addw(rscratch3, rscratch3, rscratch4); 3439 __ rorw(rscratch2, rscratch3, 32 - s); 3440 __ addw(r1, rscratch2, r2); 3441 } 3442 3443 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3444 int k, int s, int t) { 3445 Register rscratch3 = r10; 3446 Register rscratch4 = r11; 3447 3448 reg_cache.extract_u32(rscratch1, k); 3449 __ movw(rscratch2, t); 3450 __ addw(rscratch4, r1, rscratch2); 3451 __ addw(rscratch4, rscratch4, rscratch1); 3452 __ bicw(rscratch2, r3, r4); 3453 __ andw(rscratch3, r2, r4); 3454 __ addw(rscratch2, rscratch2, rscratch4); 3455 __ addw(rscratch2, rscratch2, rscratch3); 3456 __ rorw(rscratch2, rscratch2, 32 - s); 3457 __ addw(r1, rscratch2, r2); 3458 } 3459 3460 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3461 int k, int s, int t) { 3462 Register rscratch3 = r10; 3463 Register rscratch4 = r11; 3464 3465 __ eorw(rscratch3, r3, r4); 3466 __ movw(rscratch2, t); 3467 __ addw(rscratch4, r1, rscratch2); 3468 reg_cache.extract_u32(rscratch1, k); 3469 __ eorw(rscratch3, rscratch3, r2); 3470 __ addw(rscratch4, rscratch4, rscratch1); 3471 __ addw(rscratch3, rscratch3, rscratch4); 3472 __ rorw(rscratch2, rscratch3, 32 - s); 3473 __ addw(r1, rscratch2, r2); 3474 } 3475 3476 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3477 int k, int s, int t) { 3478 Register rscratch3 = r10; 3479 Register rscratch4 = r11; 3480 3481 __ movw(rscratch3, t); 3482 __ ornw(rscratch2, r2, r4); 3483 __ addw(rscratch4, r1, rscratch3); 3484 reg_cache.extract_u32(rscratch1, k); 3485 __ eorw(rscratch3, rscratch2, r3); 3486 __ addw(rscratch4, rscratch4, rscratch1); 3487 __ addw(rscratch3, rscratch3, rscratch4); 3488 __ rorw(rscratch2, rscratch3, 32 - s); 3489 __ addw(r1, rscratch2, r2); 3490 } 3491 3492 // Arguments: 3493 // 3494 // Inputs: 3495 // c_rarg0 - byte[] source+offset 3496 // c_rarg1 - int[] SHA.state 3497 // c_rarg2 - int offset 3498 // c_rarg3 - int limit 3499 // 3500 address generate_md5_implCompress(StubGenStubId stub_id) { 3501 bool multi_block; 3502 switch (stub_id) { 3503 case md5_implCompress_id: 3504 multi_block = false; 3505 break; 3506 case md5_implCompressMB_id: 3507 multi_block = true; 3508 break; 3509 default: 3510 ShouldNotReachHere(); 3511 } 3512 __ align(CodeEntryAlignment); 3513 3514 StubCodeMark mark(this, stub_id); 3515 address start = __ pc(); 3516 3517 Register buf = c_rarg0; 3518 Register state = c_rarg1; 3519 Register ofs = c_rarg2; 3520 Register limit = c_rarg3; 3521 Register a = r4; 3522 Register b = r5; 3523 Register c = r6; 3524 Register d = r7; 3525 Register rscratch3 = r10; 3526 Register rscratch4 = r11; 3527 3528 Register state_regs[2] = { r12, r13 }; 3529 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3530 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3531 3532 __ push(saved_regs, sp); 3533 3534 __ ldp(state_regs[0], state_regs[1], Address(state)); 3535 __ ubfx(a, state_regs[0], 0, 32); 3536 __ ubfx(b, state_regs[0], 32, 32); 3537 __ 
ubfx(c, state_regs[1], 0, 32); 3538 __ ubfx(d, state_regs[1], 32, 32); 3539 3540 Label md5_loop; 3541 __ BIND(md5_loop); 3542 3543 reg_cache.gen_loads(buf); 3544 3545 // Round 1 3546 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3547 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3548 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3549 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3550 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3551 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3552 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3553 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3554 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3555 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3556 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3557 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3558 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3559 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3560 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3561 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3562 3563 // Round 2 3564 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3565 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3566 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3567 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3568 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3569 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3570 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3571 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3572 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3573 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3574 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3575 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3576 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3577 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3578 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3579 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3580 3581 // Round 3 3582 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3583 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3584 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3585 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3586 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3587 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3588 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3589 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3590 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3591 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3592 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3593 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3594 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3595 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3596 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3597 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3598 3599 // Round 4 3600 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3601 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3602 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3603 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3604 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3605 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3606 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3607 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3608 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3609 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3610 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3611 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3612 
md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3613 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3614 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3615 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3616 3617 __ addw(a, state_regs[0], a); 3618 __ ubfx(rscratch2, state_regs[0], 32, 32); 3619 __ addw(b, rscratch2, b); 3620 __ addw(c, state_regs[1], c); 3621 __ ubfx(rscratch4, state_regs[1], 32, 32); 3622 __ addw(d, rscratch4, d); 3623 3624 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3625 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3626 3627 if (multi_block) { 3628 __ add(buf, buf, 64); 3629 __ add(ofs, ofs, 64); 3630 __ cmp(ofs, limit); 3631 __ br(Assembler::LE, md5_loop); 3632 __ mov(c_rarg0, ofs); // return ofs 3633 } 3634 3635 // write hash values back in the correct order 3636 __ stp(state_regs[0], state_regs[1], Address(state)); 3637 3638 __ pop(saved_regs, sp); 3639 3640 __ ret(lr); 3641 3642 return start; 3643 } 3644 3645 // Arguments: 3646 // 3647 // Inputs: 3648 // c_rarg0 - byte[] source+offset 3649 // c_rarg1 - int[] SHA.state 3650 // c_rarg2 - int offset 3651 // c_rarg3 - int limit 3652 // 3653 address generate_sha1_implCompress(StubGenStubId stub_id) { 3654 bool multi_block; 3655 switch (stub_id) { 3656 case sha1_implCompress_id: 3657 multi_block = false; 3658 break; 3659 case sha1_implCompressMB_id: 3660 multi_block = true; 3661 break; 3662 default: 3663 ShouldNotReachHere(); 3664 } 3665 3666 __ align(CodeEntryAlignment); 3667 3668 StubCodeMark mark(this, stub_id); 3669 address start = __ pc(); 3670 3671 Register buf = c_rarg0; 3672 Register state = c_rarg1; 3673 Register ofs = c_rarg2; 3674 Register limit = c_rarg3; 3675 3676 Label keys; 3677 Label sha1_loop; 3678 3679 // load the keys into v0..v3 3680 __ adr(rscratch1, keys); 3681 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3682 // load 5 words state into v6, v7 3683 __ ldrq(v6, Address(state, 0)); 3684 __ ldrs(v7, Address(state, 16)); 3685 3686 3687 __ BIND(sha1_loop); 3688 // load 64 bytes of data into v16..v19 3689 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3690 __ rev32(v16, __ T16B, v16); 3691 __ rev32(v17, __ T16B, v17); 3692 __ rev32(v18, __ T16B, v18); 3693 __ rev32(v19, __ T16B, v19); 3694 3695 // do the sha1 3696 __ addv(v4, __ T4S, v16, v0); 3697 __ orr(v20, __ T16B, v6, v6); 3698 3699 FloatRegister d0 = v16; 3700 FloatRegister d1 = v17; 3701 FloatRegister d2 = v18; 3702 FloatRegister d3 = v19; 3703 3704 for (int round = 0; round < 20; round++) { 3705 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3706 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3707 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3708 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3709 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3710 3711 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3712 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3713 __ sha1h(tmp2, __ T4S, v20); 3714 if (round < 5) 3715 __ sha1c(v20, __ T4S, tmp3, tmp4); 3716 else if (round < 10 || round >= 15) 3717 __ sha1p(v20, __ T4S, tmp3, tmp4); 3718 else 3719 __ sha1m(v20, __ T4S, tmp3, tmp4); 3720 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3721 3722 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3723 } 3724 3725 __ addv(v7, __ T2S, v7, v21); 3726 __ addv(v6, __ T4S, v6, v20); 3727 3728 if (multi_block) { 3729 __ add(ofs, ofs, 64); 3730 __ cmp(ofs, limit); 3731 __ br(Assembler::LE, sha1_loop); 3732 __ mov(c_rarg0, ofs); // return ofs 3733 } 3734 3735 __ strq(v6, Address(state, 0)); 3736 __ strs(v7, Address(state, 16)); 3737 3738 __ ret(lr); 3739 3740 __ bind(keys); 3741 __ emit_int32(0x5a827999); 3742 __ emit_int32(0x6ed9eba1); 3743 __ emit_int32(0x8f1bbcdc); 3744 __ emit_int32(0xca62c1d6); 3745 3746 return start; 3747 } 3748 3749 3750 // Arguments: 3751 // 3752 // Inputs: 3753 // c_rarg0 - byte[] source+offset 3754 // c_rarg1 - int[] SHA.state 3755 // c_rarg2 - int offset 3756 // c_rarg3 - int limit 3757 // 3758 address generate_sha256_implCompress(StubGenStubId stub_id) { 3759 bool multi_block; 3760 switch (stub_id) { 3761 case sha256_implCompress_id: 3762 multi_block = false; 3763 break; 3764 case sha256_implCompressMB_id: 3765 multi_block = true; 3766 break; 3767 default: 3768 ShouldNotReachHere(); 3769 } 3770 3771 static const uint32_t round_consts[64] = { 3772 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3773 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3774 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3775 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3776 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3777 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3778 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3779 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3780 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3781 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3782 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3783 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3784 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3785 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3786 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3787 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3788 }; 3789 3790 __ align(CodeEntryAlignment); 3791 3792 StubCodeMark mark(this, stub_id); 3793 address start = __ pc(); 3794 3795 Register buf = c_rarg0; 3796 Register state = c_rarg1; 3797 Register ofs = c_rarg2; 3798 Register limit = c_rarg3; 3799 3800 Label sha1_loop; 3801 3802 __ stpd(v8, v9, __ pre(sp, -32)); 3803 __ stpd(v10, v11, Address(sp, 16)); 3804 3805 // dga == v0 3806 // dgb == v1 3807 // dg0 == v2 3808 // dg1 == v3 3809 // dg2 == v4 3810 // t0 == v6 3811 // t1 == v7 3812 3813 // load 16 keys to v16..v31 3814 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3815 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3816 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3817 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3818 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3819 3820 // load 8 words (256 bits) state 3821 __ ldpq(v0, v1, state); 3822 3823 __ BIND(sha1_loop); 3824 // load 64 bytes of data into v8..v11 3825 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3826 __ rev32(v8, __ T16B, v8); 3827 __ rev32(v9, __ T16B, v9); 3828 __ rev32(v10, __ T16B, v10); 3829 __ rev32(v11, __ T16B, v11); 3830 3831 __ addv(v6, __ T4S, v8, v16); 3832 __ orr(v2, __ T16B, v0, v0); 3833 __ orr(v3, __ T16B, v1, v1); 3834 3835 FloatRegister d0 = v8; 3836 FloatRegister d1 = v9; 3837 FloatRegister d2 = v10; 3838 FloatRegister d3 = v11; 3839 3840 3841 for (int round = 0; round < 16; round++) { 3842 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3843 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3844 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3845 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3846 3847 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3848 __ orr(v4, __ T16B, v2, v2); 3849 if (round < 15) 3850 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3851 __ sha256h(v2, __ T4S, v3, tmp2); 3852 __ sha256h2(v3, __ T4S, v4, tmp2); 3853 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3854 3855 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3856 } 3857 3858 __ addv(v0, __ T4S, v0, v2); 3859 __ addv(v1, __ T4S, v1, v3); 3860 3861 if (multi_block) { 3862 __ add(ofs, ofs, 64); 3863 __ cmp(ofs, limit); 3864 __ br(Assembler::LE, sha1_loop); 3865 __ mov(c_rarg0, ofs); // return ofs 3866 } 3867 3868 __ ldpd(v10, v11, Address(sp, 16)); 3869 __ ldpd(v8, v9, __ post(sp, 32)); 3870 3871 __ stpq(v0, v1, state); 3872 3873 __ ret(lr); 3874 3875 return start; 3876 } 3877 3878 // Double rounds for sha512. 3879 void sha512_dround(int dr, 3880 FloatRegister vi0, FloatRegister vi1, 3881 FloatRegister vi2, FloatRegister vi3, 3882 FloatRegister vi4, FloatRegister vrc0, 3883 FloatRegister vrc1, FloatRegister vin0, 3884 FloatRegister vin1, FloatRegister vin2, 3885 FloatRegister vin3, FloatRegister vin4) { 3886 if (dr < 36) { 3887 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3888 } 3889 __ addv(v5, __ T2D, vrc0, vin0); 3890 __ ext(v6, __ T16B, vi2, vi3, 8); 3891 __ ext(v5, __ T16B, v5, v5, 8); 3892 __ ext(v7, __ T16B, vi1, vi2, 8); 3893 __ addv(vi3, __ T2D, vi3, v5); 3894 if (dr < 32) { 3895 __ ext(v5, __ T16B, vin3, vin4, 8); 3896 __ sha512su0(vin0, __ T2D, vin1); 3897 } 3898 __ sha512h(vi3, __ T2D, v6, v7); 3899 if (dr < 32) { 3900 __ sha512su1(vin0, __ T2D, vin2, v5); 3901 } 3902 __ addv(vi4, __ T2D, vi1, vi3); 3903 __ sha512h2(vi3, __ T2D, vi1, vi0); 3904 } 3905 3906 // Arguments: 3907 // 3908 // Inputs: 3909 // c_rarg0 - byte[] source+offset 3910 // c_rarg1 - int[] SHA.state 3911 // c_rarg2 - int offset 3912 // c_rarg3 - int limit 3913 // 3914 address generate_sha512_implCompress(StubGenStubId stub_id) { 3915 bool multi_block; 3916 switch (stub_id) { 3917 case sha512_implCompress_id: 3918 multi_block = false; 3919 break; 3920 case sha512_implCompressMB_id: 3921 multi_block = true; 3922 break; 3923 default: 3924 ShouldNotReachHere(); 3925 } 3926 3927 static const uint64_t round_consts[80] = { 3928 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3929 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3930 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3931 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3932 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3933 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3934 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3935 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3936 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3937 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 
3938 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3939 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3940 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3941 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3942 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3943 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3944 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3945 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3946 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3947 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3948 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3949 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3950 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3951 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3952 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3953 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3954 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3955 }; 3956 3957 __ align(CodeEntryAlignment); 3958 3959 StubCodeMark mark(this, stub_id); 3960 address start = __ pc(); 3961 3962 Register buf = c_rarg0; 3963 Register state = c_rarg1; 3964 Register ofs = c_rarg2; 3965 Register limit = c_rarg3; 3966 3967 __ stpd(v8, v9, __ pre(sp, -64)); 3968 __ stpd(v10, v11, Address(sp, 16)); 3969 __ stpd(v12, v13, Address(sp, 32)); 3970 __ stpd(v14, v15, Address(sp, 48)); 3971 3972 Label sha512_loop; 3973 3974 // load state 3975 __ ld1(v8, v9, v10, v11, __ T2D, state); 3976 3977 // load first 4 round constants 3978 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3979 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3980 3981 __ BIND(sha512_loop); 3982 // load 128B of data into v12..v19 3983 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3984 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3985 __ rev64(v12, __ T16B, v12); 3986 __ rev64(v13, __ T16B, v13); 3987 __ rev64(v14, __ T16B, v14); 3988 __ rev64(v15, __ T16B, v15); 3989 __ rev64(v16, __ T16B, v16); 3990 __ rev64(v17, __ T16B, v17); 3991 __ rev64(v18, __ T16B, v18); 3992 __ rev64(v19, __ T16B, v19); 3993 3994 __ mov(rscratch2, rscratch1); 3995 3996 __ mov(v0, __ T16B, v8); 3997 __ mov(v1, __ T16B, v9); 3998 __ mov(v2, __ T16B, v10); 3999 __ mov(v3, __ T16B, v11); 4000 4001 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4002 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4003 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4004 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4005 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4006 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4007 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4008 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4009 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4010 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4011 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4012 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4013 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4014 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, 
v14); 4015 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4016 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4017 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4018 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4019 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4020 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4021 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4022 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4023 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4024 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4025 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4026 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4027 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4028 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4029 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4030 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4031 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4032 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4033 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 4034 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4035 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4036 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4037 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4038 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4039 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4040 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4041 4042 __ addv(v8, __ T2D, v8, v0); 4043 __ addv(v9, __ T2D, v9, v1); 4044 __ addv(v10, __ T2D, v10, v2); 4045 __ addv(v11, __ T2D, v11, v3); 4046 4047 if (multi_block) { 4048 __ add(ofs, ofs, 128); 4049 __ cmp(ofs, limit); 4050 __ br(Assembler::LE, sha512_loop); 4051 __ mov(c_rarg0, ofs); // return ofs 4052 } 4053 4054 __ st1(v8, v9, v10, v11, __ T2D, state); 4055 4056 __ ldpd(v14, v15, Address(sp, 48)); 4057 __ ldpd(v12, v13, Address(sp, 32)); 4058 __ ldpd(v10, v11, Address(sp, 16)); 4059 __ ldpd(v8, v9, __ post(sp, 64)); 4060 4061 __ ret(lr); 4062 4063 return start; 4064 } 4065 4066 // Arguments: 4067 // 4068 // Inputs: 4069 // c_rarg0 - byte[] source+offset 4070 // c_rarg1 - byte[] SHA.state 4071 // c_rarg2 - int block_size 4072 // c_rarg3 - int offset 4073 // c_rarg4 - int limit 4074 // 4075 address generate_sha3_implCompress(StubGenStubId stub_id) { 4076 bool multi_block; 4077 switch (stub_id) { 4078 case sha3_implCompress_id: 4079 multi_block = false; 4080 break; 4081 case sha3_implCompressMB_id: 4082 multi_block = true; 4083 break; 4084 default: 4085 ShouldNotReachHere(); 4086 } 4087 4088 static const uint64_t round_consts[24] = { 4089 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4090 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4091 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4092 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4093 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4094 0x8000000000008003L, 
0x8000000000008002L, 0x8000000000000080L, 4095 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4096 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4097 }; 4098 4099 __ align(CodeEntryAlignment); 4100 4101 StubCodeMark mark(this, stub_id); 4102 address start = __ pc(); 4103 4104 Register buf = c_rarg0; 4105 Register state = c_rarg1; 4106 Register block_size = c_rarg2; 4107 Register ofs = c_rarg3; 4108 Register limit = c_rarg4; 4109 4110 Label sha3_loop, rounds24_loop; 4111 Label sha3_512_or_sha3_384, shake128; 4112 4113 __ stpd(v8, v9, __ pre(sp, -64)); 4114 __ stpd(v10, v11, Address(sp, 16)); 4115 __ stpd(v12, v13, Address(sp, 32)); 4116 __ stpd(v14, v15, Address(sp, 48)); 4117 4118 // load state 4119 __ add(rscratch1, state, 32); 4120 __ ld1(v0, v1, v2, v3, __ T1D, state); 4121 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4122 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4123 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4124 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4125 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4126 __ ld1(v24, __ T1D, rscratch1); 4127 4128 __ BIND(sha3_loop); 4129 4130 // 24 keccak rounds 4131 __ movw(rscratch2, 24); 4132 4133 // load round_constants base 4134 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4135 4136 // load input 4137 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4138 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4139 __ eor(v0, __ T8B, v0, v25); 4140 __ eor(v1, __ T8B, v1, v26); 4141 __ eor(v2, __ T8B, v2, v27); 4142 __ eor(v3, __ T8B, v3, v28); 4143 __ eor(v4, __ T8B, v4, v29); 4144 __ eor(v5, __ T8B, v5, v30); 4145 __ eor(v6, __ T8B, v6, v31); 4146 4147 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4148 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4149 4150 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4151 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4152 __ eor(v7, __ T8B, v7, v25); 4153 __ eor(v8, __ T8B, v8, v26); 4154 __ eor(v9, __ T8B, v9, v27); 4155 __ eor(v10, __ T8B, v10, v28); 4156 __ eor(v11, __ T8B, v11, v29); 4157 __ eor(v12, __ T8B, v12, v30); 4158 __ eor(v13, __ T8B, v13, v31); 4159 4160 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4161 __ eor(v14, __ T8B, v14, v25); 4162 __ eor(v15, __ T8B, v15, v26); 4163 __ eor(v16, __ T8B, v16, v27); 4164 4165 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4166 __ andw(c_rarg5, block_size, 48); 4167 __ cbzw(c_rarg5, rounds24_loop); 4168 4169 __ tbnz(block_size, 5, shake128); 4170 // block_size == 144, bit5 == 0, SHA3-244 4171 __ ldrd(v28, __ post(buf, 8)); 4172 __ eor(v17, __ T8B, v17, v28); 4173 __ b(rounds24_loop); 4174 4175 __ BIND(shake128); 4176 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4177 __ eor(v17, __ T8B, v17, v28); 4178 __ eor(v18, __ T8B, v18, v29); 4179 __ eor(v19, __ T8B, v19, v30); 4180 __ eor(v20, __ T8B, v20, v31); 4181 __ b(rounds24_loop); // block_size == 168, SHAKE128 4182 4183 __ BIND(sha3_512_or_sha3_384); 4184 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4185 __ eor(v7, __ T8B, v7, v25); 4186 __ eor(v8, __ T8B, v8, v26); 4187 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4188 4189 // SHA3-384 4190 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4191 __ eor(v9, __ T8B, v9, v27); 4192 __ eor(v10, __ T8B, v10, v28); 4193 __ eor(v11, __ T8B, v11, v29); 4194 __ eor(v12, __ T8B, v12, v30); 4195 4196 __ BIND(rounds24_loop); 4197 __ subw(rscratch2, rscratch2, 1); 4198 4199 __ eor3(v29, 
__ T16B, v4, v9, v14); 4200 __ eor3(v26, __ T16B, v1, v6, v11); 4201 __ eor3(v28, __ T16B, v3, v8, v13); 4202 __ eor3(v25, __ T16B, v0, v5, v10); 4203 __ eor3(v27, __ T16B, v2, v7, v12); 4204 __ eor3(v29, __ T16B, v29, v19, v24); 4205 __ eor3(v26, __ T16B, v26, v16, v21); 4206 __ eor3(v28, __ T16B, v28, v18, v23); 4207 __ eor3(v25, __ T16B, v25, v15, v20); 4208 __ eor3(v27, __ T16B, v27, v17, v22); 4209 4210 __ rax1(v30, __ T2D, v29, v26); 4211 __ rax1(v26, __ T2D, v26, v28); 4212 __ rax1(v28, __ T2D, v28, v25); 4213 __ rax1(v25, __ T2D, v25, v27); 4214 __ rax1(v27, __ T2D, v27, v29); 4215 4216 __ eor(v0, __ T16B, v0, v30); 4217 __ xar(v29, __ T2D, v1, v25, (64 - 1)); 4218 __ xar(v1, __ T2D, v6, v25, (64 - 44)); 4219 __ xar(v6, __ T2D, v9, v28, (64 - 20)); 4220 __ xar(v9, __ T2D, v22, v26, (64 - 61)); 4221 __ xar(v22, __ T2D, v14, v28, (64 - 39)); 4222 __ xar(v14, __ T2D, v20, v30, (64 - 18)); 4223 __ xar(v31, __ T2D, v2, v26, (64 - 62)); 4224 __ xar(v2, __ T2D, v12, v26, (64 - 43)); 4225 __ xar(v12, __ T2D, v13, v27, (64 - 25)); 4226 __ xar(v13, __ T2D, v19, v28, (64 - 8)); 4227 __ xar(v19, __ T2D, v23, v27, (64 - 56)); 4228 __ xar(v23, __ T2D, v15, v30, (64 - 41)); 4229 __ xar(v15, __ T2D, v4, v28, (64 - 27)); 4230 __ xar(v28, __ T2D, v24, v28, (64 - 14)); 4231 __ xar(v24, __ T2D, v21, v25, (64 - 2)); 4232 __ xar(v8, __ T2D, v8, v27, (64 - 55)); 4233 __ xar(v4, __ T2D, v16, v25, (64 - 45)); 4234 __ xar(v16, __ T2D, v5, v30, (64 - 36)); 4235 __ xar(v5, __ T2D, v3, v27, (64 - 28)); 4236 __ xar(v27, __ T2D, v18, v27, (64 - 21)); 4237 __ xar(v3, __ T2D, v17, v26, (64 - 15)); 4238 __ xar(v25, __ T2D, v11, v25, (64 - 10)); 4239 __ xar(v26, __ T2D, v7, v26, (64 - 6)); 4240 __ xar(v30, __ T2D, v10, v30, (64 - 3)); 4241 4242 __ bcax(v20, __ T16B, v31, v22, v8); 4243 __ bcax(v21, __ T16B, v8, v23, v22); 4244 __ bcax(v22, __ T16B, v22, v24, v23); 4245 __ bcax(v23, __ T16B, v23, v31, v24); 4246 __ bcax(v24, __ T16B, v24, v8, v31); 4247 4248 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); 4249 4250 __ bcax(v17, __ T16B, v25, v19, v3); 4251 __ bcax(v18, __ T16B, v3, v15, v19); 4252 __ bcax(v19, __ T16B, v19, v16, v15); 4253 __ bcax(v15, __ T16B, v15, v25, v16); 4254 __ bcax(v16, __ T16B, v16, v3, v25); 4255 4256 __ bcax(v10, __ T16B, v29, v12, v26); 4257 __ bcax(v11, __ T16B, v26, v13, v12); 4258 __ bcax(v12, __ T16B, v12, v14, v13); 4259 __ bcax(v13, __ T16B, v13, v29, v14); 4260 __ bcax(v14, __ T16B, v14, v26, v29); 4261 4262 __ bcax(v7, __ T16B, v30, v9, v4); 4263 __ bcax(v8, __ T16B, v4, v5, v9); 4264 __ bcax(v9, __ T16B, v9, v6, v5); 4265 __ bcax(v5, __ T16B, v5, v30, v6); 4266 __ bcax(v6, __ T16B, v6, v4, v30); 4267 4268 __ bcax(v3, __ T16B, v27, v0, v28); 4269 __ bcax(v4, __ T16B, v28, v1, v0); 4270 __ bcax(v0, __ T16B, v0, v2, v1); 4271 __ bcax(v1, __ T16B, v1, v27, v2); 4272 __ bcax(v2, __ T16B, v2, v28, v27); 4273 4274 __ eor(v0, __ T16B, v0, v31); 4275 4276 __ cbnzw(rscratch2, rounds24_loop); 4277 4278 if (multi_block) { 4279 __ add(ofs, ofs, block_size); 4280 __ cmp(ofs, limit); 4281 __ br(Assembler::LE, sha3_loop); 4282 __ mov(c_rarg0, ofs); // return ofs 4283 } 4284 4285 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4286 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4287 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4288 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4289 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4290 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4291 __ st1(v24, __ T1D, state); 4292 4293 __ ldpd(v14, v15, Address(sp, 
48)); 4294 __ ldpd(v12, v13, Address(sp, 32)); 4295 __ ldpd(v10, v11, Address(sp, 16)); 4296 __ ldpd(v8, v9, __ post(sp, 64)); 4297 4298 __ ret(lr); 4299 4300 return start; 4301 } 4302 4303 /** 4304 * Arguments: 4305 * 4306 * Inputs: 4307 * c_rarg0 - int crc 4308 * c_rarg1 - byte* buf 4309 * c_rarg2 - int length 4310 * 4311 * Output: 4312 * rax - int crc result 4313 */ 4314 address generate_updateBytesCRC32() { 4315 assert(UseCRC32Intrinsics, "what are we doing here?"); 4316 4317 __ align(CodeEntryAlignment); 4318 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id; 4319 StubCodeMark mark(this, stub_id); 4320 4321 address start = __ pc(); 4322 4323 const Register crc = c_rarg0; // crc 4324 const Register buf = c_rarg1; // source java byte array address 4325 const Register len = c_rarg2; // length 4326 const Register table0 = c_rarg3; // crc_table address 4327 const Register table1 = c_rarg4; 4328 const Register table2 = c_rarg5; 4329 const Register table3 = c_rarg6; 4330 const Register tmp3 = c_rarg7; 4331 4332 BLOCK_COMMENT("Entry:"); 4333 __ enter(); // required for proper stackwalking of RuntimeStub frame 4334 4335 __ kernel_crc32(crc, buf, len, 4336 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4337 4338 __ leave(); // required for proper stackwalking of RuntimeStub frame 4339 __ ret(lr); 4340 4341 return start; 4342 } 4343 4344 // ChaCha20 block function. This version parallelizes 4 quarter 4345 // round operations at a time. It uses 16 SIMD registers to 4346 // produce 4 blocks of key stream. 4347 // 4348 // state (int[16]) = c_rarg0 4349 // keystream (byte[256]) = c_rarg1 4350 // return - number of bytes of keystream (always 256) 4351 // 4352 // In this approach, we load the 512-bit start state sequentially into 4353 // 4 128-bit vectors. We then make 4 4-vector copies of that starting 4354 // state, with each successive set of 4 vectors having a +1 added into 4355 // the first 32-bit lane of the 4th vector in that group (the counter). 4356 // By doing this, we can perform the block function on 4 512-bit blocks 4357 // within one run of this intrinsic. 4358 // The alignment of the data across the 4-vector group is such that at 4359 // the start it is already aligned for the first round of each two-round 4360 // loop iteration. In other words, the corresponding lanes of each vector 4361 // will contain the values needed for that quarter round operation (e.g. 4362 // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.). 4363 // In between each full round, a lane shift must occur. Within a loop 4364 // iteration, between the first and second rounds, the 2nd, 3rd, and 4th 4365 // vectors are rotated left 32, 64 and 96 bits, respectively. The result 4366 // is effectively a diagonal orientation in columnar form. After the 4367 // second full round, those registers are left-rotated again, this time 4368 // 96, 64, and 32 bits - returning the vectors to their columnar organization. 4369 // After all 10 iterations, the original state is added to each 4-vector 4370 // working state along with the add mask, and the 4 vector groups are 4371 // sequentially written to the memory dedicated for the output key stream. 4372 // 4373 // For a more detailed explanation, see Goll and Gueron, "Vectorization of 4374 // ChaCha Stream Cipher", 2014 11th Int. Conf. 
on Information Technology:
  // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33
  address generate_chacha20Block_qrpar() {
    Label L_Q_twoRounds, L_Q_cc20_const;
    // The constant data is broken into two 128-bit segments to be loaded
    // onto SIMD registers. The first 128 bits are a counter add overlay
    // that adds +1/+0/+0/+0 to the vectors holding replicated state[12].
    // The second 128 bits are a table constant used for 8-bit left rotations
    // on 32-bit lanes within a SIMD register.
    __ BIND(L_Q_cc20_const);
    __ emit_int64(0x0000000000000001UL);
    __ emit_int64(0x0000000000000000UL);
    __ emit_int64(0x0605040702010003UL);
    __ emit_int64(0x0E0D0C0F0A09080BUL);

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register state = c_rarg0;
    const Register keystream = c_rarg1;
    const Register loopCtr = r10;
    const Register tmpAddr = r11;

    const FloatRegister aState = v0;
    const FloatRegister bState = v1;
    const FloatRegister cState = v2;
    const FloatRegister dState = v3;
    const FloatRegister a1Vec = v4;
    const FloatRegister b1Vec = v5;
    const FloatRegister c1Vec = v6;
    const FloatRegister d1Vec = v7;
    // Skip the callee-saved registers v8 - v15
    const FloatRegister a2Vec = v16;
    const FloatRegister b2Vec = v17;
    const FloatRegister c2Vec = v18;
    const FloatRegister d2Vec = v19;
    const FloatRegister a3Vec = v20;
    const FloatRegister b3Vec = v21;
    const FloatRegister c3Vec = v22;
    const FloatRegister d3Vec = v23;
    const FloatRegister a4Vec = v24;
    const FloatRegister b4Vec = v25;
    const FloatRegister c4Vec = v26;
    const FloatRegister d4Vec = v27;
    const FloatRegister scratch = v28;
    const FloatRegister addMask = v29;
    const FloatRegister lrot8Tbl = v30;

    // Load the initial state in the first 4 quadword registers,
    // then copy the initial state into the next 4 quadword registers
    // that will be used for the working state.
    __ ld1(aState, bState, cState, dState, __ T16B, Address(state));

    // Load the index register for 2 constant 128-bit data fields.
    // The first represents the +1/+0/+0/+0 add mask. The second is
    // the 8-bit left rotation.
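    // For reference (a scalar sketch only, not what is emitted here), one
    // ChaCha20 quarter round on state words (a, b, c, d) is:
    //   a += b; d ^= a; d = rotl32(d, 16);
    //   c += d; b ^= c; b = rotl32(b, 12);
    //   a += b; d ^= a; d = rotl32(d,  8);
    //   c += d; b ^= c; b = rotl32(b,  7);
    // cc20_quarter_round performs these steps on four columns at once, one
    // column per 32-bit lane, with lrot8Tbl covering the 8-bit rotation case.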
4433 __ adr(tmpAddr, L_Q_cc20_const); 4434 __ ldpq(addMask, lrot8Tbl, Address(tmpAddr)); 4435 4436 __ mov(a1Vec, __ T16B, aState); 4437 __ mov(b1Vec, __ T16B, bState); 4438 __ mov(c1Vec, __ T16B, cState); 4439 __ mov(d1Vec, __ T16B, dState); 4440 4441 __ mov(a2Vec, __ T16B, aState); 4442 __ mov(b2Vec, __ T16B, bState); 4443 __ mov(c2Vec, __ T16B, cState); 4444 __ addv(d2Vec, __ T4S, d1Vec, addMask); 4445 4446 __ mov(a3Vec, __ T16B, aState); 4447 __ mov(b3Vec, __ T16B, bState); 4448 __ mov(c3Vec, __ T16B, cState); 4449 __ addv(d3Vec, __ T4S, d2Vec, addMask); 4450 4451 __ mov(a4Vec, __ T16B, aState); 4452 __ mov(b4Vec, __ T16B, bState); 4453 __ mov(c4Vec, __ T16B, cState); 4454 __ addv(d4Vec, __ T4S, d3Vec, addMask); 4455 4456 // Set up the 10 iteration loop 4457 __ mov(loopCtr, 10); 4458 __ BIND(L_Q_twoRounds); 4459 4460 // The first set of operations on the vectors covers the first 4 quarter 4461 // round operations: 4462 // Qround(state, 0, 4, 8,12) 4463 // Qround(state, 1, 5, 9,13) 4464 // Qround(state, 2, 6,10,14) 4465 // Qround(state, 3, 7,11,15) 4466 __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl); 4467 __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl); 4468 __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl); 4469 __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl); 4470 4471 // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to 4472 // diagonals. The a1Vec does not need to change orientation. 4473 __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true); 4474 __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true); 4475 __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true); 4476 __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true); 4477 4478 // The second set of operations on the vectors covers the second 4 quarter 4479 // round operations, now acting on the diagonals: 4480 // Qround(state, 0, 5,10,15) 4481 // Qround(state, 1, 6,11,12) 4482 // Qround(state, 2, 7, 8,13) 4483 // Qround(state, 3, 4, 9,14) 4484 __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl); 4485 __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl); 4486 __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl); 4487 __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl); 4488 4489 // Before we start the next iteration, we need to perform shuffles 4490 // on the b/c/d vectors to move them back to columnar organizations 4491 // from their current diagonal orientation. 4492 __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false); 4493 __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false); 4494 __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false); 4495 __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false); 4496 4497 // Decrement and iterate 4498 __ sub(loopCtr, loopCtr, 1); 4499 __ cbnz(loopCtr, L_Q_twoRounds); 4500 4501 // Once the counter reaches zero, we fall out of the loop 4502 // and need to add the initial state back into the working state 4503 // represented by the a/b/c/d1Vec registers. This is destructive 4504 // on the dState register but we no longer will need it. 
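    // Note: the pre-round d-vectors of the 2nd, 3rd and 4th block carried one,
    // two and three extra counter increments, so dState is bumped by addMask
    // once per block below before being folded into d2Vec/d3Vec/d4Vec.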
4505 __ addv(a1Vec, __ T4S, a1Vec, aState); 4506 __ addv(b1Vec, __ T4S, b1Vec, bState); 4507 __ addv(c1Vec, __ T4S, c1Vec, cState); 4508 __ addv(d1Vec, __ T4S, d1Vec, dState); 4509 4510 __ addv(a2Vec, __ T4S, a2Vec, aState); 4511 __ addv(b2Vec, __ T4S, b2Vec, bState); 4512 __ addv(c2Vec, __ T4S, c2Vec, cState); 4513 __ addv(dState, __ T4S, dState, addMask); 4514 __ addv(d2Vec, __ T4S, d2Vec, dState); 4515 4516 __ addv(a3Vec, __ T4S, a3Vec, aState); 4517 __ addv(b3Vec, __ T4S, b3Vec, bState); 4518 __ addv(c3Vec, __ T4S, c3Vec, cState); 4519 __ addv(dState, __ T4S, dState, addMask); 4520 __ addv(d3Vec, __ T4S, d3Vec, dState); 4521 4522 __ addv(a4Vec, __ T4S, a4Vec, aState); 4523 __ addv(b4Vec, __ T4S, b4Vec, bState); 4524 __ addv(c4Vec, __ T4S, c4Vec, cState); 4525 __ addv(dState, __ T4S, dState, addMask); 4526 __ addv(d4Vec, __ T4S, d4Vec, dState); 4527 4528 // Write the final state back to the result buffer 4529 __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64)); 4530 __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64)); 4531 __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64)); 4532 __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64)); 4533 4534 __ mov(r0, 256); // Return length of output keystream 4535 __ leave(); 4536 __ ret(lr); 4537 4538 return start; 4539 } 4540 4541 /** 4542 * Arguments: 4543 * 4544 * Inputs: 4545 * c_rarg0 - int crc 4546 * c_rarg1 - byte* buf 4547 * c_rarg2 - int length 4548 * c_rarg3 - int* table 4549 * 4550 * Output: 4551 * r0 - int crc result 4552 */ 4553 address generate_updateBytesCRC32C() { 4554 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4555 4556 __ align(CodeEntryAlignment); 4557 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id; 4558 StubCodeMark mark(this, stub_id); 4559 4560 address start = __ pc(); 4561 4562 const Register crc = c_rarg0; // crc 4563 const Register buf = c_rarg1; // source java byte array address 4564 const Register len = c_rarg2; // length 4565 const Register table0 = c_rarg3; // crc_table address 4566 const Register table1 = c_rarg4; 4567 const Register table2 = c_rarg5; 4568 const Register table3 = c_rarg6; 4569 const Register tmp3 = c_rarg7; 4570 4571 BLOCK_COMMENT("Entry:"); 4572 __ enter(); // required for proper stackwalking of RuntimeStub frame 4573 4574 __ kernel_crc32c(crc, buf, len, 4575 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4576 4577 __ leave(); // required for proper stackwalking of RuntimeStub frame 4578 __ ret(lr); 4579 4580 return start; 4581 } 4582 4583 /*** 4584 * Arguments: 4585 * 4586 * Inputs: 4587 * c_rarg0 - int adler 4588 * c_rarg1 - byte* buff 4589 * c_rarg2 - int len 4590 * 4591 * Output: 4592 * c_rarg0 - int adler result 4593 */ 4594 address generate_updateBytesAdler32() { 4595 __ align(CodeEntryAlignment); 4596 StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id; 4597 StubCodeMark mark(this, stub_id); 4598 address start = __ pc(); 4599 4600 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4601 4602 // Aliases 4603 Register adler = c_rarg0; 4604 Register s1 = c_rarg0; 4605 Register s2 = c_rarg3; 4606 Register buff = c_rarg1; 4607 Register len = c_rarg2; 4608 Register nmax = r4; 4609 Register base = r5; 4610 Register count = r6; 4611 Register temp0 = rscratch1; 4612 Register temp1 = rscratch2; 4613 FloatRegister vbytes = v0; 4614 FloatRegister vs1acc = v1; 4615 FloatRegister vs2acc = v2; 4616 FloatRegister vtable = v3; 4617 4618 // Max 
number of bytes we can process before having to take the mod 4619 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4620 uint64_t BASE = 0xfff1; 4621 uint64_t NMAX = 0x15B0; 4622 4623 __ mov(base, BASE); 4624 __ mov(nmax, NMAX); 4625 4626 // Load accumulation coefficients for the upper 16 bits 4627 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4628 __ ld1(vtable, __ T16B, Address(temp0)); 4629 4630 // s1 is initialized to the lower 16 bits of adler 4631 // s2 is initialized to the upper 16 bits of adler 4632 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4633 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4634 4635 // The pipelined loop needs at least 16 elements for 1 iteration 4636 // It does check this, but it is more effective to skip to the cleanup loop 4637 __ cmp(len, (u1)16); 4638 __ br(Assembler::HS, L_nmax); 4639 __ cbz(len, L_combine); 4640 4641 __ bind(L_simple_by1_loop); 4642 __ ldrb(temp0, Address(__ post(buff, 1))); 4643 __ add(s1, s1, temp0); 4644 __ add(s2, s2, s1); 4645 __ subs(len, len, 1); 4646 __ br(Assembler::HI, L_simple_by1_loop); 4647 4648 // s1 = s1 % BASE 4649 __ subs(temp0, s1, base); 4650 __ csel(s1, temp0, s1, Assembler::HS); 4651 4652 // s2 = s2 % BASE 4653 __ lsr(temp0, s2, 16); 4654 __ lsl(temp1, temp0, 4); 4655 __ sub(temp1, temp1, temp0); 4656 __ add(s2, temp1, s2, ext::uxth); 4657 4658 __ subs(temp0, s2, base); 4659 __ csel(s2, temp0, s2, Assembler::HS); 4660 4661 __ b(L_combine); 4662 4663 __ bind(L_nmax); 4664 __ subs(len, len, nmax); 4665 __ sub(count, nmax, 16); 4666 __ br(Assembler::LO, L_by16); 4667 4668 __ bind(L_nmax_loop); 4669 4670 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4671 vbytes, vs1acc, vs2acc, vtable); 4672 4673 __ subs(count, count, 16); 4674 __ br(Assembler::HS, L_nmax_loop); 4675 4676 // s1 = s1 % BASE 4677 __ lsr(temp0, s1, 16); 4678 __ lsl(temp1, temp0, 4); 4679 __ sub(temp1, temp1, temp0); 4680 __ add(temp1, temp1, s1, ext::uxth); 4681 4682 __ lsr(temp0, temp1, 16); 4683 __ lsl(s1, temp0, 4); 4684 __ sub(s1, s1, temp0); 4685 __ add(s1, s1, temp1, ext:: uxth); 4686 4687 __ subs(temp0, s1, base); 4688 __ csel(s1, temp0, s1, Assembler::HS); 4689 4690 // s2 = s2 % BASE 4691 __ lsr(temp0, s2, 16); 4692 __ lsl(temp1, temp0, 4); 4693 __ sub(temp1, temp1, temp0); 4694 __ add(temp1, temp1, s2, ext::uxth); 4695 4696 __ lsr(temp0, temp1, 16); 4697 __ lsl(s2, temp0, 4); 4698 __ sub(s2, s2, temp0); 4699 __ add(s2, s2, temp1, ext:: uxth); 4700 4701 __ subs(temp0, s2, base); 4702 __ csel(s2, temp0, s2, Assembler::HS); 4703 4704 __ subs(len, len, nmax); 4705 __ sub(count, nmax, 16); 4706 __ br(Assembler::HS, L_nmax_loop); 4707 4708 __ bind(L_by16); 4709 __ adds(len, len, count); 4710 __ br(Assembler::LO, L_by1); 4711 4712 __ bind(L_by16_loop); 4713 4714 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4715 vbytes, vs1acc, vs2acc, vtable); 4716 4717 __ subs(len, len, 16); 4718 __ br(Assembler::HS, L_by16_loop); 4719 4720 __ bind(L_by1); 4721 __ adds(len, len, 15); 4722 __ br(Assembler::LO, L_do_mod); 4723 4724 __ bind(L_by1_loop); 4725 __ ldrb(temp0, Address(__ post(buff, 1))); 4726 __ add(s1, temp0, s1); 4727 __ add(s2, s2, s1); 4728 __ subs(len, len, 1); 4729 __ br(Assembler::HS, L_by1_loop); 4730 4731 __ bind(L_do_mod); 4732 // s1 = s1 % BASE 4733 __ lsr(temp0, s1, 16); 4734 __ lsl(temp1, temp0, 4); 4735 __ sub(temp1, temp1, temp0); 4736 __ add(temp1, temp1, s1, ext::uxth); 4737 4738 __ lsr(temp0, temp1, 16); 4739 __ lsl(s1, 
temp0, 4); 4740 __ sub(s1, s1, temp0); 4741 __ add(s1, s1, temp1, ext:: uxth); 4742 4743 __ subs(temp0, s1, base); 4744 __ csel(s1, temp0, s1, Assembler::HS); 4745 4746 // s2 = s2 % BASE 4747 __ lsr(temp0, s2, 16); 4748 __ lsl(temp1, temp0, 4); 4749 __ sub(temp1, temp1, temp0); 4750 __ add(temp1, temp1, s2, ext::uxth); 4751 4752 __ lsr(temp0, temp1, 16); 4753 __ lsl(s2, temp0, 4); 4754 __ sub(s2, s2, temp0); 4755 __ add(s2, s2, temp1, ext:: uxth); 4756 4757 __ subs(temp0, s2, base); 4758 __ csel(s2, temp0, s2, Assembler::HS); 4759 4760 // Combine lower bits and higher bits 4761 __ bind(L_combine); 4762 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 4763 4764 __ ret(lr); 4765 4766 return start; 4767 } 4768 4769 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 4770 Register temp0, Register temp1, FloatRegister vbytes, 4771 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 4772 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 4773 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 4774 // In non-vectorized code, we update s1 and s2 as: 4775 // s1 <- s1 + b1 4776 // s2 <- s2 + s1 4777 // s1 <- s1 + b2 4778 // s2 <- s2 + b1 4779 // ... 4780 // s1 <- s1 + b16 4781 // s2 <- s2 + s1 4782 // Putting above assignments together, we have: 4783 // s1_new = s1 + b1 + b2 + ... + b16 4784 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 4785 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 4786 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 4787 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 4788 4789 // s2 = s2 + s1 * 16 4790 __ add(s2, s2, s1, Assembler::LSL, 4); 4791 4792 // vs1acc = b1 + b2 + b3 + ... + b16 4793 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 4794 __ umullv(vs2acc, __ T8B, vtable, vbytes); 4795 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 4796 __ uaddlv(vs1acc, __ T16B, vbytes); 4797 __ uaddlv(vs2acc, __ T8H, vs2acc); 4798 4799 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 4800 __ fmovd(temp0, vs1acc); 4801 __ fmovd(temp1, vs2acc); 4802 __ add(s1, s1, temp0); 4803 __ add(s2, s2, temp1); 4804 } 4805 4806 /** 4807 * Arguments: 4808 * 4809 * Input: 4810 * c_rarg0 - x address 4811 * c_rarg1 - x length 4812 * c_rarg2 - y address 4813 * c_rarg3 - y length 4814 * c_rarg4 - z address 4815 */ 4816 address generate_multiplyToLen() { 4817 __ align(CodeEntryAlignment); 4818 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 4819 StubCodeMark mark(this, stub_id); 4820 4821 address start = __ pc(); 4822 const Register x = r0; 4823 const Register xlen = r1; 4824 const Register y = r2; 4825 const Register ylen = r3; 4826 const Register z = r4; 4827 4828 const Register tmp0 = r5; 4829 const Register tmp1 = r10; 4830 const Register tmp2 = r11; 4831 const Register tmp3 = r12; 4832 const Register tmp4 = r13; 4833 const Register tmp5 = r14; 4834 const Register tmp6 = r15; 4835 const Register tmp7 = r16; 4836 4837 BLOCK_COMMENT("Entry:"); 4838 __ enter(); // required for proper stackwalking of RuntimeStub frame 4839 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4840 __ leave(); // required for proper stackwalking of RuntimeStub frame 4841 __ ret(lr); 4842 4843 return start; 4844 } 4845 4846 address generate_squareToLen() { 4847 // squareToLen algorithm for sizes 1..127 described in java code works 4848 // faster than multiply_to_len on some CPUs and slower on others, but 4849 // multiply_to_len shows a bit better overall results 4850 __ align(CodeEntryAlignment); 4851 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 4852 StubCodeMark mark(this, stub_id); 4853 address start = __ pc(); 4854 4855 const Register x = r0; 4856 const Register xlen = r1; 4857 const Register z = r2; 4858 const Register y = r4; // == x 4859 const Register ylen = r5; // == xlen 4860 4861 const Register tmp0 = r3; 4862 const Register tmp1 = r10; 4863 const Register tmp2 = r11; 4864 const Register tmp3 = r12; 4865 const Register tmp4 = r13; 4866 const Register tmp5 = r14; 4867 const Register tmp6 = r15; 4868 const Register tmp7 = r16; 4869 4870 RegSet spilled_regs = RegSet::of(y, ylen); 4871 BLOCK_COMMENT("Entry:"); 4872 __ enter(); 4873 __ push(spilled_regs, sp); 4874 __ mov(y, x); 4875 __ mov(ylen, xlen); 4876 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4877 __ pop(spilled_regs, sp); 4878 __ leave(); 4879 __ ret(lr); 4880 return start; 4881 } 4882 4883 address generate_mulAdd() { 4884 __ align(CodeEntryAlignment); 4885 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 4886 StubCodeMark mark(this, stub_id); 4887 4888 address start = __ pc(); 4889 4890 const Register out = r0; 4891 const Register in = r1; 4892 const Register offset = r2; 4893 const Register len = r3; 4894 const Register k = r4; 4895 4896 BLOCK_COMMENT("Entry:"); 4897 __ enter(); 4898 __ mul_add(out, in, offset, len, k); 4899 __ leave(); 4900 __ ret(lr); 4901 4902 return start; 4903 } 4904 4905 // Arguments: 4906 // 4907 // Input: 4908 // c_rarg0 - newArr address 4909 // c_rarg1 - oldArr address 4910 // c_rarg2 - newIdx 4911 // c_rarg3 - shiftCount 4912 // c_rarg4 - numIter 4913 // 4914 address generate_bigIntegerRightShift() { 4915 __ align(CodeEntryAlignment); 4916 StubGenStubId stub_id = 
StubGenStubId::bigIntegerRightShiftWorker_id; 4917 StubCodeMark mark(this, stub_id); 4918 address start = __ pc(); 4919 4920 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4921 4922 Register newArr = c_rarg0; 4923 Register oldArr = c_rarg1; 4924 Register newIdx = c_rarg2; 4925 Register shiftCount = c_rarg3; 4926 Register numIter = c_rarg4; 4927 Register idx = numIter; 4928 4929 Register newArrCur = rscratch1; 4930 Register shiftRevCount = rscratch2; 4931 Register oldArrCur = r13; 4932 Register oldArrNext = r14; 4933 4934 FloatRegister oldElem0 = v0; 4935 FloatRegister oldElem1 = v1; 4936 FloatRegister newElem = v2; 4937 FloatRegister shiftVCount = v3; 4938 FloatRegister shiftVRevCount = v4; 4939 4940 __ cbz(idx, Exit); 4941 4942 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4943 4944 // left shift count 4945 __ movw(shiftRevCount, 32); 4946 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4947 4948 // numIter too small to allow a 4-words SIMD loop, rolling back 4949 __ cmp(numIter, (u1)4); 4950 __ br(Assembler::LT, ShiftThree); 4951 4952 __ dup(shiftVCount, __ T4S, shiftCount); 4953 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4954 __ negr(shiftVCount, __ T4S, shiftVCount); 4955 4956 __ BIND(ShiftSIMDLoop); 4957 4958 // Calculate the load addresses 4959 __ sub(idx, idx, 4); 4960 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4961 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4962 __ add(oldArrCur, oldArrNext, 4); 4963 4964 // Load 4 words and process 4965 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4966 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4967 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4968 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4969 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4970 __ st1(newElem, __ T4S, Address(newArrCur)); 4971 4972 __ cmp(idx, (u1)4); 4973 __ br(Assembler::LT, ShiftTwoLoop); 4974 __ b(ShiftSIMDLoop); 4975 4976 __ BIND(ShiftTwoLoop); 4977 __ cbz(idx, Exit); 4978 __ cmp(idx, (u1)1); 4979 __ br(Assembler::EQ, ShiftOne); 4980 4981 // Calculate the load addresses 4982 __ sub(idx, idx, 2); 4983 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4984 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4985 __ add(oldArrCur, oldArrNext, 4); 4986 4987 // Load 2 words and process 4988 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 4989 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4990 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4991 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4992 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4993 __ st1(newElem, __ T2S, Address(newArrCur)); 4994 __ b(ShiftTwoLoop); 4995 4996 __ BIND(ShiftThree); 4997 __ tbz(idx, 1, ShiftOne); 4998 __ tbz(idx, 0, ShiftTwo); 4999 __ ldrw(r10, Address(oldArr, 12)); 5000 __ ldrw(r11, Address(oldArr, 8)); 5001 __ lsrvw(r10, r10, shiftCount); 5002 __ lslvw(r11, r11, shiftRevCount); 5003 __ orrw(r12, r10, r11); 5004 __ strw(r12, Address(newArr, 8)); 5005 5006 __ BIND(ShiftTwo); 5007 __ ldrw(r10, Address(oldArr, 8)); 5008 __ ldrw(r11, Address(oldArr, 4)); 5009 __ lsrvw(r10, r10, shiftCount); 5010 __ lslvw(r11, r11, shiftRevCount); 5011 __ orrw(r12, r10, r11); 5012 __ strw(r12, Address(newArr, 4)); 5013 5014 __ BIND(ShiftOne); 5015 __ ldrw(r10, Address(oldArr, 4)); 5016 __ ldrw(r11, Address(oldArr)); 5017 __ lsrvw(r10, r10, shiftCount); 5018 __ lslvw(r11, r11, shiftRevCount); 5019 __ orrw(r12, r10, r11); 5020 __ strw(r12, Address(newArr)); 5021 5022 __ BIND(Exit); 5023 __ ret(lr); 5024 5025 return start; 5026 } 5027 5028 // 
Arguments: 5029 // 5030 // Input: 5031 // c_rarg0 - newArr address 5032 // c_rarg1 - oldArr address 5033 // c_rarg2 - newIdx 5034 // c_rarg3 - shiftCount 5035 // c_rarg4 - numIter 5036 // 5037 address generate_bigIntegerLeftShift() { 5038 __ align(CodeEntryAlignment); 5039 StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id; 5040 StubCodeMark mark(this, stub_id); 5041 address start = __ pc(); 5042 5043 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 5044 5045 Register newArr = c_rarg0; 5046 Register oldArr = c_rarg1; 5047 Register newIdx = c_rarg2; 5048 Register shiftCount = c_rarg3; 5049 Register numIter = c_rarg4; 5050 5051 Register shiftRevCount = rscratch1; 5052 Register oldArrNext = rscratch2; 5053 5054 FloatRegister oldElem0 = v0; 5055 FloatRegister oldElem1 = v1; 5056 FloatRegister newElem = v2; 5057 FloatRegister shiftVCount = v3; 5058 FloatRegister shiftVRevCount = v4; 5059 5060 __ cbz(numIter, Exit); 5061 5062 __ add(oldArrNext, oldArr, 4); 5063 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 5064 5065 // right shift count 5066 __ movw(shiftRevCount, 32); 5067 __ subw(shiftRevCount, shiftRevCount, shiftCount); 5068 5069 // numIter too small to allow a 4-words SIMD loop, rolling back 5070 __ cmp(numIter, (u1)4); 5071 __ br(Assembler::LT, ShiftThree); 5072 5073 __ dup(shiftVCount, __ T4S, shiftCount); 5074 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 5075 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 5076 5077 __ BIND(ShiftSIMDLoop); 5078 5079 // load 4 words and process 5080 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 5081 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 5082 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 5083 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 5084 __ orr(newElem, __ T16B, oldElem0, oldElem1); 5085 __ st1(newElem, __ T4S, __ post(newArr, 16)); 5086 __ sub(numIter, numIter, 4); 5087 5088 __ cmp(numIter, (u1)4); 5089 __ br(Assembler::LT, ShiftTwoLoop); 5090 __ b(ShiftSIMDLoop); 5091 5092 __ BIND(ShiftTwoLoop); 5093 __ cbz(numIter, Exit); 5094 __ cmp(numIter, (u1)1); 5095 __ br(Assembler::EQ, ShiftOne); 5096 5097 // load 2 words and process 5098 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 5099 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 5100 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 5101 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 5102 __ orr(newElem, __ T8B, oldElem0, oldElem1); 5103 __ st1(newElem, __ T2S, __ post(newArr, 8)); 5104 __ sub(numIter, numIter, 2); 5105 __ b(ShiftTwoLoop); 5106 5107 __ BIND(ShiftThree); 5108 __ ldrw(r10, __ post(oldArr, 4)); 5109 __ ldrw(r11, __ post(oldArrNext, 4)); 5110 __ lslvw(r10, r10, shiftCount); 5111 __ lsrvw(r11, r11, shiftRevCount); 5112 __ orrw(r12, r10, r11); 5113 __ strw(r12, __ post(newArr, 4)); 5114 __ tbz(numIter, 1, Exit); 5115 __ tbz(numIter, 0, ShiftOne); 5116 5117 __ BIND(ShiftTwo); 5118 __ ldrw(r10, __ post(oldArr, 4)); 5119 __ ldrw(r11, __ post(oldArrNext, 4)); 5120 __ lslvw(r10, r10, shiftCount); 5121 __ lsrvw(r11, r11, shiftRevCount); 5122 __ orrw(r12, r10, r11); 5123 __ strw(r12, __ post(newArr, 4)); 5124 5125 __ BIND(ShiftOne); 5126 __ ldrw(r10, Address(oldArr)); 5127 __ ldrw(r11, Address(oldArrNext)); 5128 __ lslvw(r10, r10, shiftCount); 5129 __ lsrvw(r11, r11, shiftRevCount); 5130 __ orrw(r12, r10, r11); 5131 __ strw(r12, Address(newArr)); 5132 5133 __ BIND(Exit); 5134 __ ret(lr); 5135 5136 return start; 5137 } 5138 5139 address generate_count_positives(address &count_positives_long) { 5140 const u1 
large_loop_size = 64; 5141 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 5142 int dcache_line = VM_Version::dcache_line_size(); 5143 5144 Register ary1 = r1, len = r2, result = r0; 5145 5146 __ align(CodeEntryAlignment); 5147 5148 StubGenStubId stub_id = StubGenStubId::count_positives_id; 5149 StubCodeMark mark(this, stub_id); 5150 5151 address entry = __ pc(); 5152 5153 __ enter(); 5154 // precondition: a copy of len is already in result 5155 // __ mov(result, len); 5156 5157 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 5158 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 5159 5160 __ cmp(len, (u1)15); 5161 __ br(Assembler::GT, LEN_OVER_15); 5162 // The only case when execution falls into this code is when pointer is near 5163 // the end of memory page and we have to avoid reading next page 5164 __ add(ary1, ary1, len); 5165 __ subs(len, len, 8); 5166 __ br(Assembler::GT, LEN_OVER_8); 5167 __ ldr(rscratch2, Address(ary1, -8)); 5168 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 5169 __ lsrv(rscratch2, rscratch2, rscratch1); 5170 __ tst(rscratch2, UPPER_BIT_MASK); 5171 __ csel(result, zr, result, Assembler::NE); 5172 __ leave(); 5173 __ ret(lr); 5174 __ bind(LEN_OVER_8); 5175 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 5176 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 5177 __ tst(rscratch2, UPPER_BIT_MASK); 5178 __ br(Assembler::NE, RET_NO_POP); 5179 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 5180 __ lsrv(rscratch1, rscratch1, rscratch2); 5181 __ tst(rscratch1, UPPER_BIT_MASK); 5182 __ bind(RET_NO_POP); 5183 __ csel(result, zr, result, Assembler::NE); 5184 __ leave(); 5185 __ ret(lr); 5186 5187 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 5188 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 5189 5190 count_positives_long = __ pc(); // 2nd entry point 5191 5192 __ enter(); 5193 5194 __ bind(LEN_OVER_15); 5195 __ push(spilled_regs, sp); 5196 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 5197 __ cbz(rscratch2, ALIGNED); 5198 __ ldp(tmp6, tmp1, Address(ary1)); 5199 __ mov(tmp5, 16); 5200 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 5201 __ add(ary1, ary1, rscratch1); 5202 __ orr(tmp6, tmp6, tmp1); 5203 __ tst(tmp6, UPPER_BIT_MASK); 5204 __ br(Assembler::NE, RET_ADJUST); 5205 __ sub(len, len, rscratch1); 5206 5207 __ bind(ALIGNED); 5208 __ cmp(len, large_loop_size); 5209 __ br(Assembler::LT, CHECK_16); 5210 // Perform 16-byte load as early return in pre-loop to handle situation 5211 // when initially aligned large array has negative values at starting bytes, 5212 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 5213 // slower. Cases with negative bytes further ahead won't be affected that 5214 // much. In fact, it'll be faster due to early loads, less instructions and 5215 // less branches in LARGE_LOOP. 
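// As a rough sketch (not the emitted code), the negativity test used
// throughout this stub boils down to:
//   bool any_negative(uint64_t w) { return (w & UPPER_BIT_MASK) != 0; }
// (any_negative is just an illustrative name). A byte is negative exactly
// when its top bit is set, so several 8-byte words can be OR-ed together
// and tested against UPPER_BIT_MASK with a single tst.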
5216 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 5217 __ sub(len, len, 16); 5218 __ orr(tmp6, tmp6, tmp1); 5219 __ tst(tmp6, UPPER_BIT_MASK); 5220 __ br(Assembler::NE, RET_ADJUST_16); 5221 __ cmp(len, large_loop_size); 5222 __ br(Assembler::LT, CHECK_16); 5223 5224 if (SoftwarePrefetchHintDistance >= 0 5225 && SoftwarePrefetchHintDistance >= dcache_line) { 5226 // initial prefetch 5227 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 5228 } 5229 __ bind(LARGE_LOOP); 5230 if (SoftwarePrefetchHintDistance >= 0) { 5231 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 5232 } 5233 // Issue load instructions first, since it can save few CPU/MEM cycles, also 5234 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 5235 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 5236 // instructions per cycle and have less branches, but this approach disables 5237 // early return, thus, all 64 bytes are loaded and checked every time. 5238 __ ldp(tmp2, tmp3, Address(ary1)); 5239 __ ldp(tmp4, tmp5, Address(ary1, 16)); 5240 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 5241 __ ldp(tmp6, tmp1, Address(ary1, 48)); 5242 __ add(ary1, ary1, large_loop_size); 5243 __ sub(len, len, large_loop_size); 5244 __ orr(tmp2, tmp2, tmp3); 5245 __ orr(tmp4, tmp4, tmp5); 5246 __ orr(rscratch1, rscratch1, rscratch2); 5247 __ orr(tmp6, tmp6, tmp1); 5248 __ orr(tmp2, tmp2, tmp4); 5249 __ orr(rscratch1, rscratch1, tmp6); 5250 __ orr(tmp2, tmp2, rscratch1); 5251 __ tst(tmp2, UPPER_BIT_MASK); 5252 __ br(Assembler::NE, RET_ADJUST_LONG); 5253 __ cmp(len, large_loop_size); 5254 __ br(Assembler::GE, LARGE_LOOP); 5255 5256 __ bind(CHECK_16); // small 16-byte load pre-loop 5257 __ cmp(len, (u1)16); 5258 __ br(Assembler::LT, POST_LOOP16); 5259 5260 __ bind(LOOP16); // small 16-byte load loop 5261 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 5262 __ sub(len, len, 16); 5263 __ orr(tmp2, tmp2, tmp3); 5264 __ tst(tmp2, UPPER_BIT_MASK); 5265 __ br(Assembler::NE, RET_ADJUST_16); 5266 __ cmp(len, (u1)16); 5267 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 5268 5269 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 5270 __ cmp(len, (u1)8); 5271 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 5272 __ ldr(tmp3, Address(__ post(ary1, 8))); 5273 __ tst(tmp3, UPPER_BIT_MASK); 5274 __ br(Assembler::NE, RET_ADJUST); 5275 __ sub(len, len, 8); 5276 5277 __ bind(POST_LOOP16_LOAD_TAIL); 5278 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 5279 __ ldr(tmp1, Address(ary1)); 5280 __ mov(tmp2, 64); 5281 __ sub(tmp4, tmp2, len, __ LSL, 3); 5282 __ lslv(tmp1, tmp1, tmp4); 5283 __ tst(tmp1, UPPER_BIT_MASK); 5284 __ br(Assembler::NE, RET_ADJUST); 5285 // Fallthrough 5286 5287 __ bind(RET_LEN); 5288 __ pop(spilled_regs, sp); 5289 __ leave(); 5290 __ ret(lr); 5291 5292 // difference result - len is the count of guaranteed to be 5293 // positive bytes 5294 5295 __ bind(RET_ADJUST_LONG); 5296 __ add(len, len, (u1)(large_loop_size - 16)); 5297 __ bind(RET_ADJUST_16); 5298 __ add(len, len, 16); 5299 __ bind(RET_ADJUST); 5300 __ pop(spilled_regs, sp); 5301 __ leave(); 5302 __ sub(result, result, len); 5303 __ ret(lr); 5304 5305 return entry; 5306 } 5307 5308 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 5309 bool usePrefetch, Label &NOT_EQUAL) { 5310 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5311 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5312 tmp7 = r12, tmp8 = r13; 5313 Label LOOP; 5314 
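// Roughly, each iteration of the loop below compares 8 words from each
// array, checking a pair of words at a time with an early exit. A scalar
// sketch (not the emitted code, which software-pipelines the loads so the
// first pair is read before entering LOOP and the last pair is checked
// after it):
//   do {
//     for (int i = 0; i < 8; i += 2) {
//       if (((a1[i] ^ a2[i]) | (a1[i + 1] ^ a2[i + 1])) != 0) goto NOT_EQUAL;
//     }
//     a1 += 8; a2 += 8; cnt1 -= 8 * wordSize;
//   } while (cnt1 >= loopThreshold);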
5315 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5316 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5317 __ bind(LOOP); 5318 if (usePrefetch) { 5319 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5320 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5321 } 5322 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5323 __ eor(tmp1, tmp1, tmp2); 5324 __ eor(tmp3, tmp3, tmp4); 5325 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5326 __ orr(tmp1, tmp1, tmp3); 5327 __ cbnz(tmp1, NOT_EQUAL); 5328 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5329 __ eor(tmp5, tmp5, tmp6); 5330 __ eor(tmp7, tmp7, tmp8); 5331 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5332 __ orr(tmp5, tmp5, tmp7); 5333 __ cbnz(tmp5, NOT_EQUAL); 5334 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5335 __ eor(tmp1, tmp1, tmp2); 5336 __ eor(tmp3, tmp3, tmp4); 5337 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5338 __ orr(tmp1, tmp1, tmp3); 5339 __ cbnz(tmp1, NOT_EQUAL); 5340 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5341 __ eor(tmp5, tmp5, tmp6); 5342 __ sub(cnt1, cnt1, 8 * wordSize); 5343 __ eor(tmp7, tmp7, tmp8); 5344 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5345 // tmp6 is not used. MacroAssembler::subs is used here (rather than 5346 // cmp) because subs allows an unlimited range of immediate operand. 5347 __ subs(tmp6, cnt1, loopThreshold); 5348 __ orr(tmp5, tmp5, tmp7); 5349 __ cbnz(tmp5, NOT_EQUAL); 5350 __ br(__ GE, LOOP); 5351 // post-loop 5352 __ eor(tmp1, tmp1, tmp2); 5353 __ eor(tmp3, tmp3, tmp4); 5354 __ orr(tmp1, tmp1, tmp3); 5355 __ sub(cnt1, cnt1, 2 * wordSize); 5356 __ cbnz(tmp1, NOT_EQUAL); 5357 } 5358 5359 void generate_large_array_equals_loop_simd(int loopThreshold, 5360 bool usePrefetch, Label &NOT_EQUAL) { 5361 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5362 tmp2 = rscratch2; 5363 Label LOOP; 5364 5365 __ bind(LOOP); 5366 if (usePrefetch) { 5367 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5368 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5369 } 5370 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 5371 __ sub(cnt1, cnt1, 8 * wordSize); 5372 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 5373 __ subs(tmp1, cnt1, loopThreshold); 5374 __ eor(v0, __ T16B, v0, v4); 5375 __ eor(v1, __ T16B, v1, v5); 5376 __ eor(v2, __ T16B, v2, v6); 5377 __ eor(v3, __ T16B, v3, v7); 5378 __ orr(v0, __ T16B, v0, v1); 5379 __ orr(v1, __ T16B, v2, v3); 5380 __ orr(v0, __ T16B, v0, v1); 5381 __ umov(tmp1, v0, __ D, 0); 5382 __ umov(tmp2, v0, __ D, 1); 5383 __ orr(tmp1, tmp1, tmp2); 5384 __ cbnz(tmp1, NOT_EQUAL); 5385 __ br(__ GE, LOOP); 5386 } 5387 5388 // a1 = r1 - array1 address 5389 // a2 = r2 - array2 address 5390 // result = r0 - return value. Already contains "false" 5391 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 5392 // r3-r5 are reserved temporary registers 5393 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 5394 address generate_large_array_equals() { 5395 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5396 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5397 tmp7 = r12, tmp8 = r13; 5398 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 5399 SMALL_LOOP, POST_LOOP; 5400 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16;
5401 // calculate if at least 32 prefetched bytes are used
5402 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5403 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5404 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5405 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5406 tmp5, tmp6, tmp7, tmp8);
5407
5408 __ align(CodeEntryAlignment);
5409
5410 StubGenStubId stub_id = StubGenStubId::large_array_equals_id;
5411 StubCodeMark mark(this, stub_id);
5412
5413 address entry = __ pc();
5414 __ enter();
5415 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
5416 // also advance pointers to use post-increment instead of pre-increment
5417 __ add(a1, a1, wordSize);
5418 __ add(a2, a2, wordSize);
5419 if (AvoidUnalignedAccesses) {
5420 // Both implementations (SIMD/non-SIMD) use relatively large load
5421 // instructions (ld1/ldp), which carry a severe penalty (up to 2x execution
5422 // time) on some CPUs when the address is not at least 16-byte aligned.
5423 // Arrays are currently 8-byte aligned, so, if needed, do one extra 8-byte
5424 // load to make at least the 1st address 16-byte aligned.
5425 Label ALIGNED16;
5426 __ tbz(a1, 3, ALIGNED16);
5427 __ ldr(tmp1, Address(__ post(a1, wordSize)));
5428 __ ldr(tmp2, Address(__ post(a2, wordSize)));
5429 __ sub(cnt1, cnt1, wordSize);
5430 __ eor(tmp1, tmp1, tmp2);
5431 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5432 __ bind(ALIGNED16);
5433 }
5434 if (UseSIMDForArrayEquals) {
5435 if (SoftwarePrefetchHintDistance >= 0) {
5436 __ subs(tmp1, cnt1, prefetchLoopThreshold);
5437 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5438 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5439 /* prfm = */ true, NOT_EQUAL);
5440 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5441 __ br(__ LT, TAIL);
5442 }
5443 __ bind(NO_PREFETCH_LARGE_LOOP);
5444 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5445 /* prfm = */ false, NOT_EQUAL);
5446 } else {
5447 __ push(spilled_regs, sp);
5448 if (SoftwarePrefetchHintDistance >= 0) {
5449 __ subs(tmp1, cnt1, prefetchLoopThreshold);
5450 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5451 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5452 /* prfm = */ true, NOT_EQUAL);
5453 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5454 __ br(__ LT, TAIL);
5455 }
5456 __ bind(NO_PREFETCH_LARGE_LOOP);
5457 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5458 /* prfm = */ false, NOT_EQUAL);
5459 }
5460 __ bind(TAIL);
5461 __ cbz(cnt1, EQUAL);
5462 __ subs(cnt1, cnt1, wordSize);
5463 __ br(__ LE, POST_LOOP);
5464 __ bind(SMALL_LOOP);
5465 __ ldr(tmp1, Address(__ post(a1, wordSize)));
5466 __ ldr(tmp2, Address(__ post(a2, wordSize)));
5467 __ subs(cnt1, cnt1, wordSize);
5468 __ eor(tmp1, tmp1, tmp2);
5469 __ cbnz(tmp1, NOT_EQUAL);
5470 __ br(__ GT, SMALL_LOOP);
5471 __ bind(POST_LOOP);
5472 __ ldr(tmp1, Address(a1, cnt1));
5473 __ ldr(tmp2, Address(a2, cnt1));
5474 __ eor(tmp1, tmp1, tmp2);
5475 __ cbnz(tmp1, NOT_EQUAL);
5476 __ bind(EQUAL);
5477 __ mov(result, true);
5478 __ bind(NOT_EQUAL);
5479 if (!UseSIMDForArrayEquals) {
5480 __ pop(spilled_regs, sp);
5481 }
5482 __ bind(NOT_EQUAL_NO_POP);
5483 __ leave();
5484 __ ret(lr);
5485 return entry;
5486 }
5487
5488 // result = r0 - return value. Contains initial hashcode value on entry.
5489 // ary = r1 - array address 5490 // cnt = r2 - elements count 5491 // Clobbers: v0-v13, rscratch1, rscratch2 5492 address generate_large_arrays_hashcode(BasicType eltype) { 5493 const Register result = r0, ary = r1, cnt = r2; 5494 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 5495 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 5496 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 5497 const FloatRegister vpowm = v13; 5498 5499 ARRAYS_HASHCODE_REGISTERS; 5500 5501 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 5502 5503 unsigned int vf; // vectorization factor 5504 bool multiply_by_halves; 5505 Assembler::SIMD_Arrangement load_arrangement; 5506 switch (eltype) { 5507 case T_BOOLEAN: 5508 case T_BYTE: 5509 load_arrangement = Assembler::T8B; 5510 multiply_by_halves = true; 5511 vf = 8; 5512 break; 5513 case T_CHAR: 5514 case T_SHORT: 5515 load_arrangement = Assembler::T8H; 5516 multiply_by_halves = true; 5517 vf = 8; 5518 break; 5519 case T_INT: 5520 load_arrangement = Assembler::T4S; 5521 multiply_by_halves = false; 5522 vf = 4; 5523 break; 5524 default: 5525 ShouldNotReachHere(); 5526 } 5527 5528 // Unroll factor 5529 const unsigned uf = 4; 5530 5531 // Effective vectorization factor 5532 const unsigned evf = vf * uf; 5533 5534 __ align(CodeEntryAlignment); 5535 5536 StubGenStubId stub_id; 5537 switch (eltype) { 5538 case T_BOOLEAN: 5539 stub_id = StubGenStubId::large_arrays_hashcode_boolean_id; 5540 break; 5541 case T_BYTE: 5542 stub_id = StubGenStubId::large_arrays_hashcode_byte_id; 5543 break; 5544 case T_CHAR: 5545 stub_id = StubGenStubId::large_arrays_hashcode_char_id; 5546 break; 5547 case T_SHORT: 5548 stub_id = StubGenStubId::large_arrays_hashcode_short_id; 5549 break; 5550 case T_INT: 5551 stub_id = StubGenStubId::large_arrays_hashcode_int_id; 5552 break; 5553 default: 5554 stub_id = StubGenStubId::NO_STUBID; 5555 ShouldNotReachHere(); 5556 }; 5557 5558 StubCodeMark mark(this, stub_id); 5559 5560 address entry = __ pc(); 5561 __ enter(); 5562 5563 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 5564 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 5565 // value shouldn't change throughout both loops. 5566 __ movw(rscratch1, intpow(31U, 3)); 5567 __ mov(vpow, Assembler::S, 0, rscratch1); 5568 __ movw(rscratch1, intpow(31U, 2)); 5569 __ mov(vpow, Assembler::S, 1, rscratch1); 5570 __ movw(rscratch1, intpow(31U, 1)); 5571 __ mov(vpow, Assembler::S, 2, rscratch1); 5572 __ movw(rscratch1, intpow(31U, 0)); 5573 __ mov(vpow, Assembler::S, 3, rscratch1); 5574 5575 __ mov(vmul0, Assembler::T16B, 0); 5576 __ mov(vmul0, Assembler::S, 3, result); 5577 5578 __ andr(rscratch2, cnt, (uf - 1) * vf); 5579 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 5580 5581 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf));
5582 __ mov(vpowm, Assembler::S, 0, rscratch1);
5583
5584 // SMALL LOOP
5585 __ bind(SMALL_LOOP);
5586
5587 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
5588 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5589 __ subsw(rscratch2, rscratch2, vf);
5590
5591 if (load_arrangement == Assembler::T8B) {
5592 // Extend 8B to 8H to be able to use vector multiply
5593 // instructions
5594 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
5595 if (is_signed_subword_type(eltype)) {
5596 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5597 } else {
5598 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5599 }
5600 }
5601
5602 switch (load_arrangement) {
5603 case Assembler::T4S:
5604 __ addv(vmul0, load_arrangement, vmul0, vdata0);
5605 break;
5606 case Assembler::T8B:
5607 case Assembler::T8H:
5608 assert(is_subword_type(eltype), "subword type expected");
5609 if (is_signed_subword_type(eltype)) {
5610 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5611 } else {
5612 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5613 }
5614 break;
5615 default:
5616 __ should_not_reach_here();
5617 }
5618
5619 // Process the upper half of a vector
5620 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
5621 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5622 if (is_signed_subword_type(eltype)) {
5623 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5624 } else {
5625 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5626 }
5627 }
5628
5629 __ br(Assembler::HI, SMALL_LOOP);
5630
5631 // SMALL LOOP'S EPILOGUE
5632 __ lsr(rscratch2, cnt, exact_log2(evf));
5633 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
5634
5635 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
5636 __ addv(vmul0, Assembler::T4S, vmul0);
5637 __ umov(result, vmul0, Assembler::S, 0);
5638
5639 // TAIL
5640 __ bind(TAIL);
5641
5642 // The andr computes cnt % vf. The subtract, shifted left by 3, skips past vf - 1 - (cnt % vf)
5643 // load + madd pairs, i.e. only cnt % vf load + madd pairs are executed.
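// In other words (a sketch, not the emitted code), the tail computes
//   for (i = 0; i < cnt % vf; i++)
//     result = 31 * result + (int)ary[i];   // 0x1f == 31, kept in rscratch2
// by branching into the unrolled load + madd sequence below, skipping the
// first vf - 1 - (cnt % vf) pairs; each pair is 8 bytes of code, hence the
// shift by 3 when computing the jump target.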
5644 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 5645 __ andr(rscratch2, cnt, vf - 1); 5646 __ bind(TAIL_SHORTCUT); 5647 __ adr(rscratch1, BR_BASE); 5648 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3); 5649 __ movw(rscratch2, 0x1f); 5650 __ br(rscratch1); 5651 5652 for (size_t i = 0; i < vf - 1; ++i) { 5653 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 5654 eltype); 5655 __ maddw(result, result, rscratch2, rscratch1); 5656 } 5657 __ bind(BR_BASE); 5658 5659 __ leave(); 5660 __ ret(lr); 5661 5662 // LARGE LOOP 5663 __ bind(LARGE_LOOP_PREHEADER); 5664 5665 __ lsr(rscratch2, cnt, exact_log2(evf)); 5666 5667 if (multiply_by_halves) { 5668 // 31^4 - multiplier between lower and upper parts of a register 5669 __ movw(rscratch1, intpow(31U, vf / 2)); 5670 __ mov(vpowm, Assembler::S, 1, rscratch1); 5671 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 5672 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 5673 __ mov(vpowm, Assembler::S, 0, rscratch1); 5674 } else { 5675 // 31^16 5676 __ movw(rscratch1, intpow(31U, evf)); 5677 __ mov(vpowm, Assembler::S, 0, rscratch1); 5678 } 5679 5680 __ mov(vmul3, Assembler::T16B, 0); 5681 __ mov(vmul2, Assembler::T16B, 0); 5682 __ mov(vmul1, Assembler::T16B, 0); 5683 5684 __ bind(LARGE_LOOP); 5685 5686 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 5687 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 5688 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 5689 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 5690 5691 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 5692 Address(__ post(ary, evf * type2aelembytes(eltype)))); 5693 5694 if (load_arrangement == Assembler::T8B) { 5695 // Extend 8B to 8H to be able to use vector multiply 5696 // instructions 5697 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 5698 if (is_signed_subword_type(eltype)) { 5699 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 5700 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 5701 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 5702 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5703 } else { 5704 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 5705 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 5706 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 5707 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5708 } 5709 } 5710 5711 switch (load_arrangement) { 5712 case Assembler::T4S: 5713 __ addv(vmul3, load_arrangement, vmul3, vdata3); 5714 __ addv(vmul2, load_arrangement, vmul2, vdata2); 5715 __ addv(vmul1, load_arrangement, vmul1, vdata1); 5716 __ addv(vmul0, load_arrangement, vmul0, vdata0); 5717 break; 5718 case Assembler::T8B: 5719 case Assembler::T8H: 5720 assert(is_subword_type(eltype), "subword type expected"); 5721 if (is_signed_subword_type(eltype)) { 5722 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 5723 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 5724 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 5725 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5726 } else { 5727 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 5728 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 5729 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 5730 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5731 } 5732 break; 5733 default: 5734 __ should_not_reach_here(); 
5735 } 5736 5737 // Process the upper half of a vector 5738 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 5739 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 5740 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 5741 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 5742 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 5743 if (is_signed_subword_type(eltype)) { 5744 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 5745 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 5746 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 5747 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 5748 } else { 5749 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 5750 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 5751 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 5752 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 5753 } 5754 } 5755 5756 __ subsw(rscratch2, rscratch2, 1); 5757 __ br(Assembler::HI, LARGE_LOOP); 5758 5759 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 5760 __ addv(vmul3, Assembler::T4S, vmul3); 5761 __ umov(result, vmul3, Assembler::S, 0); 5762 5763 __ mov(rscratch2, intpow(31U, vf)); 5764 5765 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 5766 __ addv(vmul2, Assembler::T4S, vmul2); 5767 __ umov(rscratch1, vmul2, Assembler::S, 0); 5768 __ maddw(result, result, rscratch2, rscratch1); 5769 5770 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 5771 __ addv(vmul1, Assembler::T4S, vmul1); 5772 __ umov(rscratch1, vmul1, Assembler::S, 0); 5773 __ maddw(result, result, rscratch2, rscratch1); 5774 5775 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 5776 __ addv(vmul0, Assembler::T4S, vmul0); 5777 __ umov(rscratch1, vmul0, Assembler::S, 0); 5778 __ maddw(result, result, rscratch2, rscratch1); 5779 5780 __ andr(rscratch2, cnt, vf - 1); 5781 __ cbnz(rscratch2, TAIL_SHORTCUT); 5782 5783 __ leave(); 5784 __ ret(lr); 5785 5786 return entry; 5787 } 5788 5789 address generate_dsin_dcos(bool isCos) { 5790 __ align(CodeEntryAlignment); 5791 StubGenStubId stub_id = (isCos ? 
StubGenStubId::dcos_id : StubGenStubId::dsin_id); 5792 StubCodeMark mark(this, stub_id); 5793 address start = __ pc(); 5794 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5795 (address)StubRoutines::aarch64::_two_over_pi, 5796 (address)StubRoutines::aarch64::_pio2, 5797 (address)StubRoutines::aarch64::_dsin_coef, 5798 (address)StubRoutines::aarch64::_dcos_coef); 5799 return start; 5800 } 5801 5802 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5803 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5804 Label &DIFF2) { 5805 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5806 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5807 5808 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5809 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5810 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5811 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5812 5813 __ fmovd(tmpL, vtmp3); 5814 __ eor(rscratch2, tmp3, tmpL); 5815 __ cbnz(rscratch2, DIFF2); 5816 5817 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5818 __ umov(tmpL, vtmp3, __ D, 1); 5819 __ eor(rscratch2, tmpU, tmpL); 5820 __ cbnz(rscratch2, DIFF1); 5821 5822 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5823 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5824 __ fmovd(tmpL, vtmp); 5825 __ eor(rscratch2, tmp3, tmpL); 5826 __ cbnz(rscratch2, DIFF2); 5827 5828 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5829 __ umov(tmpL, vtmp, __ D, 1); 5830 __ eor(rscratch2, tmpU, tmpL); 5831 __ cbnz(rscratch2, DIFF1); 5832 } 5833 5834 // r0 = result 5835 // r1 = str1 5836 // r2 = cnt1 5837 // r3 = str2 5838 // r4 = cnt2 5839 // r10 = tmp1 5840 // r11 = tmp2 5841 address generate_compare_long_string_different_encoding(bool isLU) { 5842 __ align(CodeEntryAlignment); 5843 StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id); 5844 StubCodeMark mark(this, stub_id); 5845 address entry = __ pc(); 5846 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5847 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5848 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5849 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5850 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5851 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5852 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5853 5854 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5855 5856 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5857 // cnt2 == amount of characters left to compare 5858 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5859 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5860 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5861 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5862 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5863 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5864 __ eor(rscratch2, tmp1, tmp2); 5865 __ mov(rscratch1, tmp2); 5866 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5867 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 5868 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5869 __ push(spilled_regs, sp); 5870 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5871 __ mov(cnt1, isLU ? 
str2 : str1); // init the pointer to U next load 5872 5873 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5874 5875 if (SoftwarePrefetchHintDistance >= 0) { 5876 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5877 __ br(__ LT, NO_PREFETCH); 5878 __ bind(LARGE_LOOP_PREFETCH); 5879 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 5880 __ mov(tmp4, 2); 5881 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5882 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 5883 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5884 __ subs(tmp4, tmp4, 1); 5885 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 5886 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5887 __ mov(tmp4, 2); 5888 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 5889 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5890 __ subs(tmp4, tmp4, 1); 5891 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 5892 __ sub(cnt2, cnt2, 64); 5893 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5894 __ br(__ GE, LARGE_LOOP_PREFETCH); 5895 } 5896 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 5897 __ bind(NO_PREFETCH); 5898 __ subs(cnt2, cnt2, 16); 5899 __ br(__ LT, TAIL); 5900 __ align(OptoLoopAlignment); 5901 __ bind(SMALL_LOOP); // smaller loop 5902 __ subs(cnt2, cnt2, 16); 5903 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5904 __ br(__ GE, SMALL_LOOP); 5905 __ cmn(cnt2, (u1)16); 5906 __ br(__ EQ, LOAD_LAST); 5907 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 5908 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 5909 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 5910 __ ldr(tmp3, Address(cnt1, -8)); 5911 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 5912 __ b(LOAD_LAST); 5913 __ bind(DIFF2); 5914 __ mov(tmpU, tmp3); 5915 __ bind(DIFF1); 5916 __ pop(spilled_regs, sp); 5917 __ b(CALCULATE_DIFFERENCE); 5918 __ bind(LOAD_LAST); 5919 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 5920 // No need to load it again 5921 __ mov(tmpU, tmp3); 5922 __ pop(spilled_regs, sp); 5923 5924 // tmp2 points to the address of the last 4 Latin1 characters right now 5925 __ ldrs(vtmp, Address(tmp2)); 5926 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5927 __ fmovd(tmpL, vtmp); 5928 5929 __ eor(rscratch2, tmpU, tmpL); 5930 __ cbz(rscratch2, DONE); 5931 5932 // Find the first different characters in the longwords and 5933 // compute their difference. 
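// A scalar sketch of this step (not the emitted code; characters are 16
// bits wide at this point, and rev + clz lands on the same 16-bit boundary
// a count-trailing-zeros would):
//   uint64_t diff = s1 ^ s2;                    // known to be non-zero here
//   int sh = count_trailing_zeros(diff) & ~15;  // offset of first differing char
//   result = (int)((s1 >> sh) & 0xffff) - (int)((s2 >> sh) & 0xffff);
// where s1/s2 stand for the 8-byte groups of (widened) characters being
// compared.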
5934 __ bind(CALCULATE_DIFFERENCE); 5935 __ rev(rscratch2, rscratch2); 5936 __ clz(rscratch2, rscratch2); 5937 __ andr(rscratch2, rscratch2, -16); 5938 __ lsrv(tmp1, tmp1, rscratch2); 5939 __ uxthw(tmp1, tmp1); 5940 __ lsrv(rscratch1, rscratch1, rscratch2); 5941 __ uxthw(rscratch1, rscratch1); 5942 __ subw(result, tmp1, rscratch1); 5943 __ bind(DONE); 5944 __ ret(lr); 5945 return entry; 5946 } 5947 5948 // r0 = input (float16) 5949 // v0 = result (float) 5950 // v1 = temporary float register 5951 address generate_float16ToFloat() { 5952 __ align(CodeEntryAlignment); 5953 StubGenStubId stub_id = StubGenStubId::hf2f_id; 5954 StubCodeMark mark(this, stub_id); 5955 address entry = __ pc(); 5956 BLOCK_COMMENT("Entry:"); 5957 __ flt16_to_flt(v0, r0, v1); 5958 __ ret(lr); 5959 return entry; 5960 } 5961 5962 // v0 = input (float) 5963 // r0 = result (float16) 5964 // v1 = temporary float register 5965 address generate_floatToFloat16() { 5966 __ align(CodeEntryAlignment); 5967 StubGenStubId stub_id = StubGenStubId::f2hf_id; 5968 StubCodeMark mark(this, stub_id); 5969 address entry = __ pc(); 5970 BLOCK_COMMENT("Entry:"); 5971 __ flt_to_flt16(r0, v0, v1); 5972 __ ret(lr); 5973 return entry; 5974 } 5975 5976 address generate_method_entry_barrier() { 5977 __ align(CodeEntryAlignment); 5978 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 5979 StubCodeMark mark(this, stub_id); 5980 5981 Label deoptimize_label; 5982 5983 address start = __ pc(); 5984 5985 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 5986 5987 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 5988 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 5989 // We can get here despite the nmethod being good, if we have not 5990 // yet applied our cross modification fence (or data fence). 5991 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 5992 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 5993 __ ldrw(rscratch2, rscratch2); 5994 __ strw(rscratch2, thread_epoch_addr); 5995 __ isb(); 5996 __ membar(__ LoadLoad); 5997 } 5998 5999 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 6000 6001 __ enter(); 6002 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 6003 6004 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 6005 6006 __ push_call_clobbered_registers(); 6007 6008 __ mov(c_rarg0, rscratch2); 6009 __ call_VM_leaf 6010 (CAST_FROM_FN_PTR 6011 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 6012 6013 __ reset_last_Java_frame(true); 6014 6015 __ mov(rscratch1, r0); 6016 6017 __ pop_call_clobbered_registers(); 6018 6019 __ cbnz(rscratch1, deoptimize_label); 6020 6021 __ leave(); 6022 __ ret(lr); 6023 6024 __ BIND(deoptimize_label); 6025 6026 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 6027 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 6028 6029 __ mov(sp, rscratch1); 6030 __ br(rscratch2); 6031 6032 return start; 6033 } 6034 6035 // r0 = result 6036 // r1 = str1 6037 // r2 = cnt1 6038 // r3 = str2 6039 // r4 = cnt2 6040 // r10 = tmp1 6041 // r11 = tmp2 6042 address generate_compare_long_string_same_encoding(bool isLL) { 6043 __ align(CodeEntryAlignment); 6044 StubGenStubId stub_id = (isLL ? 
StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id); 6045 StubCodeMark mark(this, stub_id); 6046 address entry = __ pc(); 6047 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 6048 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 6049 6050 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 6051 6052 // exit from large loop when less than 64 bytes left to read or we're about 6053 // to prefetch memory behind array border 6054 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 6055 6056 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 6057 __ eor(rscratch2, tmp1, tmp2); 6058 __ cbnz(rscratch2, CAL_DIFFERENCE); 6059 6060 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 6061 // update pointers, because of previous read 6062 __ add(str1, str1, wordSize); 6063 __ add(str2, str2, wordSize); 6064 if (SoftwarePrefetchHintDistance >= 0) { 6065 __ align(OptoLoopAlignment); 6066 __ bind(LARGE_LOOP_PREFETCH); 6067 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 6068 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 6069 6070 for (int i = 0; i < 4; i++) { 6071 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 6072 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 6073 __ cmp(tmp1, tmp2); 6074 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 6075 __ br(Assembler::NE, DIFF); 6076 } 6077 __ sub(cnt2, cnt2, isLL ? 64 : 32); 6078 __ add(str1, str1, 64); 6079 __ add(str2, str2, 64); 6080 __ subs(rscratch2, cnt2, largeLoopExitCondition); 6081 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 6082 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 6083 } 6084 6085 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 6086 __ br(Assembler::LE, LESS16); 6087 __ align(OptoLoopAlignment); 6088 __ bind(LOOP_COMPARE16); 6089 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 6090 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 6091 __ cmp(tmp1, tmp2); 6092 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 6093 __ br(Assembler::NE, DIFF); 6094 __ sub(cnt2, cnt2, isLL ? 16 : 8); 6095 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 6096 __ br(Assembler::LT, LESS16); 6097 6098 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 6099 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 6100 __ cmp(tmp1, tmp2); 6101 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 6102 __ br(Assembler::NE, DIFF); 6103 __ sub(cnt2, cnt2, isLL ? 16 : 8); 6104 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 6105 __ br(Assembler::GE, LOOP_COMPARE16); 6106 __ cbz(cnt2, LENGTH_DIFF); 6107 6108 __ bind(LESS16); 6109 // each 8 compare 6110 __ subs(cnt2, cnt2, isLL ? 8 : 4); 6111 __ br(Assembler::LE, LESS8); 6112 __ ldr(tmp1, Address(__ post(str1, 8))); 6113 __ ldr(tmp2, Address(__ post(str2, 8))); 6114 __ eor(rscratch2, tmp1, tmp2); 6115 __ cbnz(rscratch2, CAL_DIFFERENCE); 6116 __ sub(cnt2, cnt2, isLL ? 
8 : 4); 6117 6118 __ bind(LESS8); // directly load last 8 bytes 6119 if (!isLL) { 6120 __ add(cnt2, cnt2, cnt2); 6121 } 6122 __ ldr(tmp1, Address(str1, cnt2)); 6123 __ ldr(tmp2, Address(str2, cnt2)); 6124 __ eor(rscratch2, tmp1, tmp2); 6125 __ cbz(rscratch2, LENGTH_DIFF); 6126 __ b(CAL_DIFFERENCE); 6127 6128 __ bind(DIFF); 6129 __ cmp(tmp1, tmp2); 6130 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 6131 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 6132 // reuse rscratch2 register for the result of eor instruction 6133 __ eor(rscratch2, tmp1, tmp2); 6134 6135 __ bind(CAL_DIFFERENCE); 6136 __ rev(rscratch2, rscratch2); 6137 __ clz(rscratch2, rscratch2); 6138 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 6139 __ lsrv(tmp1, tmp1, rscratch2); 6140 __ lsrv(tmp2, tmp2, rscratch2); 6141 if (isLL) { 6142 __ uxtbw(tmp1, tmp1); 6143 __ uxtbw(tmp2, tmp2); 6144 } else { 6145 __ uxthw(tmp1, tmp1); 6146 __ uxthw(tmp2, tmp2); 6147 } 6148 __ subw(result, tmp1, tmp2); 6149 6150 __ bind(LENGTH_DIFF); 6151 __ ret(lr); 6152 return entry; 6153 } 6154 6155 enum string_compare_mode { 6156 LL, 6157 LU, 6158 UL, 6159 UU, 6160 }; 6161 6162 // The following registers are declared in aarch64.ad 6163 // r0 = result 6164 // r1 = str1 6165 // r2 = cnt1 6166 // r3 = str2 6167 // r4 = cnt2 6168 // r10 = tmp1 6169 // r11 = tmp2 6170 // z0 = ztmp1 6171 // z1 = ztmp2 6172 // p0 = pgtmp1 6173 // p1 = pgtmp2 6174 address generate_compare_long_string_sve(string_compare_mode mode) { 6175 StubGenStubId stub_id; 6176 switch (mode) { 6177 case LL: stub_id = StubGenStubId::compare_long_string_LL_id; break; 6178 case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break; 6179 case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break; 6180 case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break; 6181 default: ShouldNotReachHere(); 6182 } 6183 6184 __ align(CodeEntryAlignment); 6185 address entry = __ pc(); 6186 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 6187 tmp1 = r10, tmp2 = r11; 6188 6189 Label LOOP, DONE, MISMATCH; 6190 Register vec_len = tmp1; 6191 Register idx = tmp2; 6192 // The minimum of the string lengths has been stored in cnt2. 6193 Register cnt = cnt2; 6194 FloatRegister ztmp1 = z0, ztmp2 = z1; 6195 PRegister pgtmp1 = p0, pgtmp2 = p1; 6196 6197 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 6198 switch (mode) { \ 6199 case LL: \ 6200 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 6201 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 6202 break; \ 6203 case LU: \ 6204 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 6205 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 6206 break; \ 6207 case UL: \ 6208 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 6209 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 6210 break; \ 6211 case UU: \ 6212 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 6213 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 6214 break; \ 6215 default: \ 6216 ShouldNotReachHere(); \ 6217 } 6218 6219 StubCodeMark mark(this, stub_id); 6220 6221 __ mov(idx, 0); 6222 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 6223 6224 if (mode == LL) { 6225 __ sve_cntb(vec_len); 6226 } else { 6227 __ sve_cnth(vec_len); 6228 } 6229 6230 __ sub(rscratch1, cnt, vec_len); 6231 6232 __ bind(LOOP); 6233 6234 // main loop 6235 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 6236 __ add(idx, idx, vec_len); 6237 // Compare strings. 
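// sve_cmp sets a lane in pgtmp2 for every active lane in which the two
// vectors differ and updates the condition flags; NE is the SVE "any lane
// matched" condition, so the branch below is taken as soon as any
// character mismatches.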
6238 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
6239 __ br(__ NE, MISMATCH);
6240 __ cmp(idx, rscratch1);
6241 __ br(__ LT, LOOP);
6242
6243 // post loop, last iteration
6244 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
6245
6246 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
6247 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
6248 __ br(__ EQ, DONE);
6249
6250 __ bind(MISMATCH);
6251
6252 // Crop the vector to find its location.
6253 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
6254 // Extract the first different characters of each string.
6255 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
6256 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
6257
6258 // Compute the difference of the first different characters.
6259 __ sub(result, rscratch1, rscratch2);
6260
6261 __ bind(DONE);
6262 __ ret(lr);
6263 #undef LOAD_PAIR
6264 return entry;
6265 }
6266
6267 void generate_compare_long_strings() {
6268 if (UseSVE == 0) {
6269 StubRoutines::aarch64::_compare_long_string_LL
6270 = generate_compare_long_string_same_encoding(true);
6271 StubRoutines::aarch64::_compare_long_string_UU
6272 = generate_compare_long_string_same_encoding(false);
6273 StubRoutines::aarch64::_compare_long_string_LU
6274 = generate_compare_long_string_different_encoding(true);
6275 StubRoutines::aarch64::_compare_long_string_UL
6276 = generate_compare_long_string_different_encoding(false);
6277 } else {
6278 StubRoutines::aarch64::_compare_long_string_LL
6279 = generate_compare_long_string_sve(LL);
6280 StubRoutines::aarch64::_compare_long_string_UU
6281 = generate_compare_long_string_sve(UU);
6282 StubRoutines::aarch64::_compare_long_string_LU
6283 = generate_compare_long_string_sve(LU);
6284 StubRoutines::aarch64::_compare_long_string_UL
6285 = generate_compare_long_string_sve(UL);
6286 }
6287 }
6288
6289 // R0 = result
6290 // R1 = str2
6291 // R2 = cnt1
6292 // R3 = str1
6293 // R4 = cnt2
6294 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
6295 //
6296 // This generic linear code uses a few additional ideas that make it faster:
6297 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
6298 // in order to skip the initial load (helps on systems with a single load pipeline)
6299 // 2) we can use the "fast" single-character search algorithm to find the
6300 // first symbol with fewer branches (1 branch per loaded register instead
6301 // of a branch per symbol); this is where constants like
6302 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
6303 // 3) after loading and analyzing the 1st register of the source string, it can be
6304 // used to search for every 1st-character entry, saving a few loads in
6305 // comparison with a "simpler-but-slower" implementation
6306 // 4) in order to avoid lots of push/pop operations, the code below heavily
6307 // reuses/re-initializes/compresses register values, which makes the code
6308 // larger and a bit less readable; however, most of the extra operations are
6309 // issued during loads or branches, so the penalty is minimal
6310 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
6311 StubGenStubId stub_id;
6312 if (str1_isL) {
6313 if (str2_isL) {
6314 stub_id = StubGenStubId::string_indexof_linear_ll_id;
6315 } else {
6316 stub_id = StubGenStubId::string_indexof_linear_ul_id;
6317 }
6318 } else {
6319 if (str2_isL) {
6320 ShouldNotReachHere();
6321 } else {
6322 stub_id =
StubGenStubId::string_indexof_linear_uu_id; 6323 } 6324 } 6325 __ align(CodeEntryAlignment); 6326 StubCodeMark mark(this, stub_id); 6327 address entry = __ pc(); 6328 6329 int str1_chr_size = str1_isL ? 1 : 2; 6330 int str2_chr_size = str2_isL ? 1 : 2; 6331 int str1_chr_shift = str1_isL ? 0 : 1; 6332 int str2_chr_shift = str2_isL ? 0 : 1; 6333 bool isL = str1_isL && str2_isL; 6334 // parameters 6335 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 6336 // temporary registers 6337 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 6338 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 6339 // redefinitions 6340 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 6341 6342 __ push(spilled_regs, sp); 6343 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 6344 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 6345 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 6346 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 6347 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 6348 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 6349 // Read whole register from str1. It is safe, because length >=8 here 6350 __ ldr(ch1, Address(str1)); 6351 // Read whole register from str2. It is safe, because length >=8 here 6352 __ ldr(ch2, Address(str2)); 6353 __ sub(cnt2, cnt2, cnt1); 6354 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 6355 if (str1_isL != str2_isL) { 6356 __ eor(v0, __ T16B, v0, v0); 6357 } 6358 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 6359 __ mul(first, first, tmp1); 6360 // check if we have less than 1 register to check 6361 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 6362 if (str1_isL != str2_isL) { 6363 __ fmovd(v1, ch1); 6364 } 6365 __ br(__ LE, L_SMALL); 6366 __ eor(ch2, first, ch2); 6367 if (str1_isL != str2_isL) { 6368 __ zip1(v1, __ T16B, v1, v0); 6369 } 6370 __ sub(tmp2, ch2, tmp1); 6371 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6372 __ bics(tmp2, tmp2, ch2); 6373 if (str1_isL != str2_isL) { 6374 __ fmovd(ch1, v1); 6375 } 6376 __ br(__ NE, L_HAS_ZERO); 6377 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 6378 __ add(result, result, wordSize/str2_chr_size); 6379 __ add(str2, str2, wordSize); 6380 __ br(__ LT, L_POST_LOOP); 6381 __ BIND(L_LOOP); 6382 __ ldr(ch2, Address(str2)); 6383 __ eor(ch2, first, ch2); 6384 __ sub(tmp2, ch2, tmp1); 6385 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6386 __ bics(tmp2, tmp2, ch2); 6387 __ br(__ NE, L_HAS_ZERO); 6388 __ BIND(L_LOOP_PROCEED); 6389 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 6390 __ add(str2, str2, wordSize); 6391 __ add(result, result, wordSize/str2_chr_size); 6392 __ br(__ GE, L_LOOP); 6393 __ BIND(L_POST_LOOP); 6394 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 6395 __ br(__ LE, NOMATCH); 6396 __ ldr(ch2, Address(str2)); 6397 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 6398 __ eor(ch2, first, ch2); 6399 __ sub(tmp2, ch2, tmp1); 6400 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6401 __ mov(tmp4, -1); // all bits set 6402 __ b(L_SMALL_PROCEED); 6403 __ align(OptoLoopAlignment); 6404 __ BIND(L_SMALL); 6405 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 6406 __ eor(ch2, first, ch2); 6407 if (str1_isL != str2_isL) { 6408 __ zip1(v1, __ T16B, v1, v0); 6409 } 6410 __ sub(tmp2, ch2, tmp1); 6411 __ mov(tmp4, -1); // all bits set 6412 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6413 if (str1_isL != str2_isL) { 6414 __ fmovd(ch1, v1); // move converted 4 symbols 6415 } 6416 __ BIND(L_SMALL_PROCEED); 6417 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 6418 __ bic(tmp2, tmp2, ch2); 6419 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 6420 __ rbit(tmp2, tmp2); 6421 __ br(__ EQ, NOMATCH); 6422 __ BIND(L_SMALL_HAS_ZERO_LOOP); 6423 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 6424 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 6425 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 6426 if (str2_isL) { // LL 6427 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 6428 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 6429 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 6430 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 6431 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6432 } else { 6433 __ mov(ch2, 0xE); // all bits in byte set except last one 6434 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6435 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6436 __ lslv(tmp2, tmp2, tmp4); 6437 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6438 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6439 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6440 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6441 } 6442 __ cmp(ch1, ch2); 6443 __ mov(tmp4, wordSize/str2_chr_size); 6444 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6445 __ BIND(L_SMALL_CMP_LOOP); 6446 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 6447 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 6448 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 6449 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 6450 __ add(tmp4, tmp4, 1); 6451 __ cmp(tmp4, cnt1); 6452 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 6453 __ cmp(first, ch2); 6454 __ br(__ EQ, L_SMALL_CMP_LOOP); 6455 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 6456 __ cbz(tmp2, NOMATCH); // no more matches. exit 6457 __ clz(tmp4, tmp2); 6458 __ add(result, result, 1); // advance index 6459 __ add(str2, str2, str2_chr_size); // advance pointer 6460 __ b(L_SMALL_HAS_ZERO_LOOP); 6461 __ align(OptoLoopAlignment); 6462 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 6463 __ cmp(first, ch2); 6464 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6465 __ b(DONE); 6466 __ align(OptoLoopAlignment); 6467 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 6468 if (str2_isL) { // LL 6469 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 6470 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 6471 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 6472 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 6473 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6474 } else { 6475 __ mov(ch2, 0xE); // all bits in byte set except last one 6476 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6477 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
6478 __ lslv(tmp2, tmp2, tmp4); 6479 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6480 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6481 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6482 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6483 } 6484 __ cmp(ch1, ch2); 6485 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6486 __ b(DONE); 6487 __ align(OptoLoopAlignment); 6488 __ BIND(L_HAS_ZERO); 6489 __ rbit(tmp2, tmp2); 6490 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 6491 // Now, perform compression of counters(cnt2 and cnt1) into one register. 6492 // It's fine because both counters are 32bit and are not changed in this 6493 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 6494 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 6495 __ sub(result, result, 1); 6496 __ BIND(L_HAS_ZERO_LOOP); 6497 __ mov(cnt1, wordSize/str2_chr_size); 6498 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6499 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 6500 if (str2_isL) { 6501 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6502 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6503 __ lslv(tmp2, tmp2, tmp4); 6504 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6505 __ add(tmp4, tmp4, 1); 6506 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6507 __ lsl(tmp2, tmp2, 1); 6508 __ mov(tmp4, wordSize/str2_chr_size); 6509 } else { 6510 __ mov(ch2, 0xE); 6511 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6512 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6513 __ lslv(tmp2, tmp2, tmp4); 6514 __ add(tmp4, tmp4, 1); 6515 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6516 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6517 __ lsl(tmp2, tmp2, 1); 6518 __ mov(tmp4, wordSize/str2_chr_size); 6519 __ sub(str2, str2, str2_chr_size); 6520 } 6521 __ cmp(ch1, ch2); 6522 __ mov(tmp4, wordSize/str2_chr_size); 6523 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6524 __ BIND(L_CMP_LOOP); 6525 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 6526 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 6527 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 6528 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 6529 __ add(tmp4, tmp4, 1); 6530 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6531 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 6532 __ cmp(cnt1, ch2); 6533 __ br(__ EQ, L_CMP_LOOP); 6534 __ BIND(L_CMP_LOOP_NOMATCH); 6535 // here we're not matched 6536 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 6537 __ clz(tmp4, tmp2); 6538 __ add(str2, str2, str2_chr_size); // advance pointer 6539 __ b(L_HAS_ZERO_LOOP); 6540 __ align(OptoLoopAlignment); 6541 __ BIND(L_CMP_LOOP_LAST_CMP); 6542 __ cmp(cnt1, ch2); 6543 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6544 __ b(DONE); 6545 __ align(OptoLoopAlignment); 6546 __ BIND(L_CMP_LOOP_LAST_CMP2); 6547 if (str2_isL) { 6548 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6549 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
6550 __ lslv(tmp2, tmp2, tmp4); 6551 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6552 __ add(tmp4, tmp4, 1); 6553 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6554 __ lsl(tmp2, tmp2, 1); 6555 } else { 6556 __ mov(ch2, 0xE); 6557 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6558 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6559 __ lslv(tmp2, tmp2, tmp4); 6560 __ add(tmp4, tmp4, 1); 6561 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6562 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6563 __ lsl(tmp2, tmp2, 1); 6564 __ sub(str2, str2, str2_chr_size); 6565 } 6566 __ cmp(ch1, ch2); 6567 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6568 __ b(DONE); 6569 __ align(OptoLoopAlignment); 6570 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 6571 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 6572 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 6573 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 6574 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 6575 // result by analyzed characters value, so, we can just reset lower bits 6576 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 6577 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 6578 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 6579 // index of last analyzed substring inside current octet. So, str2 in at 6580 // respective start address. We need to advance it to next octet 6581 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 6582 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 6583 __ bfm(result, zr, 0, 2 - str2_chr_shift); 6584 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 6585 __ movw(cnt2, cnt2); 6586 __ b(L_LOOP_PROCEED); 6587 __ align(OptoLoopAlignment); 6588 __ BIND(NOMATCH); 6589 __ mov(result, -1); 6590 __ BIND(DONE); 6591 __ pop(spilled_regs, sp); 6592 __ ret(lr); 6593 return entry; 6594 } 6595 6596 void generate_string_indexof_stubs() { 6597 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 6598 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 6599 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 6600 } 6601 6602 void inflate_and_store_2_fp_registers(bool generatePrfm, 6603 FloatRegister src1, FloatRegister src2) { 6604 Register dst = r1; 6605 __ zip1(v1, __ T16B, src1, v0); 6606 __ zip2(v2, __ T16B, src1, v0); 6607 if (generatePrfm) { 6608 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 6609 } 6610 __ zip1(v3, __ T16B, src2, v0); 6611 __ zip2(v4, __ T16B, src2, v0); 6612 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 6613 } 6614 6615 // R0 = src 6616 // R1 = dst 6617 // R2 = len 6618 // R3 = len >> 3 6619 // V0 = 0 6620 // v1 = loaded 8 bytes 6621 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 6622 address generate_large_byte_array_inflate() { 6623 __ align(CodeEntryAlignment); 6624 StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id; 6625 StubCodeMark mark(this, stub_id); 6626 address entry = __ pc(); 6627 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 6628 Register src = r0, dst = r1, len = r2, octetCounter = r3; 6629 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 6630 6631 // do one more 8-byte read to have address 16-byte aligned in 
most cases 6632 // also use single store instruction 6633 __ ldrd(v2, __ post(src, 8)); 6634 __ sub(octetCounter, octetCounter, 2); 6635 __ zip1(v1, __ T16B, v1, v0); 6636 __ zip1(v2, __ T16B, v2, v0); 6637 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 6638 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6639 __ subs(rscratch1, octetCounter, large_loop_threshold); 6640 __ br(__ LE, LOOP_START); 6641 __ b(LOOP_PRFM_START); 6642 __ bind(LOOP_PRFM); 6643 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6644 __ bind(LOOP_PRFM_START); 6645 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 6646 __ sub(octetCounter, octetCounter, 8); 6647 __ subs(rscratch1, octetCounter, large_loop_threshold); 6648 inflate_and_store_2_fp_registers(true, v3, v4); 6649 inflate_and_store_2_fp_registers(true, v5, v6); 6650 __ br(__ GT, LOOP_PRFM); 6651 __ cmp(octetCounter, (u1)8); 6652 __ br(__ LT, DONE); 6653 __ bind(LOOP); 6654 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6655 __ bind(LOOP_START); 6656 __ sub(octetCounter, octetCounter, 8); 6657 __ cmp(octetCounter, (u1)8); 6658 inflate_and_store_2_fp_registers(false, v3, v4); 6659 inflate_and_store_2_fp_registers(false, v5, v6); 6660 __ br(__ GE, LOOP); 6661 __ bind(DONE); 6662 __ ret(lr); 6663 return entry; 6664 } 6665 6666 /** 6667 * Arguments: 6668 * 6669 * Input: 6670 * c_rarg0 - current state address 6671 * c_rarg1 - H key address 6672 * c_rarg2 - data address 6673 * c_rarg3 - number of blocks 6674 * 6675 * Output: 6676 * Updated state at c_rarg0 6677 */ 6678 address generate_ghash_processBlocks() { 6679 // Bafflingly, GCM uses little-endian for the byte order, but 6680 // big-endian for the bit order. For example, the polynomial 1 is 6681 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 6682 // 6683 // So, we must either reverse the bytes in each word and do 6684 // everything big-endian or reverse the bits in each byte and do 6685 // it little-endian. On AArch64 it's more idiomatic to reverse 6686 // the bits in each byte (we have an instruction, RBIT, to do 6687 // that) and keep the data in little-endian bit order through the 6688 // calculation, bit-reversing the inputs and outputs. 6689 6690 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id; 6691 StubCodeMark mark(this, stub_id); 6692 __ align(wordSize * 2); 6693 address p = __ pc(); 6694 __ emit_int64(0x87); // The low-order bits of the field 6695 // polynomial (i.e. 
p = z^7+z^2+z+1) 6696 // repeated in the low and high parts of a 6697 // 128-bit vector 6698 __ emit_int64(0x87); 6699 6700 __ align(CodeEntryAlignment); 6701 address start = __ pc(); 6702 6703 Register state = c_rarg0; 6704 Register subkeyH = c_rarg1; 6705 Register data = c_rarg2; 6706 Register blocks = c_rarg3; 6707 6708 FloatRegister vzr = v30; 6709 __ eor(vzr, __ T16B, vzr, vzr); // zero register 6710 6711 __ ldrq(v24, p); // The field polynomial 6712 6713 __ ldrq(v0, Address(state)); 6714 __ ldrq(v1, Address(subkeyH)); 6715 6716 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 6717 __ rbit(v0, __ T16B, v0); 6718 __ rev64(v1, __ T16B, v1); 6719 __ rbit(v1, __ T16B, v1); 6720 6721 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 6722 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 6723 6724 { 6725 Label L_ghash_loop; 6726 __ bind(L_ghash_loop); 6727 6728 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 6729 // reversing each byte 6730 __ rbit(v2, __ T16B, v2); 6731 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 6732 6733 // Multiply state in v2 by subkey in v1 6734 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 6735 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 6736 /*temps*/v6, v3, /*reuse/clobber b*/v2); 6737 // Reduce v7:v5 by the field polynomial 6738 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 6739 6740 __ sub(blocks, blocks, 1); 6741 __ cbnz(blocks, L_ghash_loop); 6742 } 6743 6744 // The bit-reversed result is at this point in v0 6745 __ rev64(v0, __ T16B, v0); 6746 __ rbit(v0, __ T16B, v0); 6747 6748 __ st1(v0, __ T16B, state); 6749 __ ret(lr); 6750 6751 return start; 6752 } 6753 6754 address generate_ghash_processBlocks_wide() { 6755 address small = generate_ghash_processBlocks(); 6756 6757 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id; 6758 StubCodeMark mark(this, stub_id); 6759 __ align(wordSize * 2); 6760 address p = __ pc(); 6761 __ emit_int64(0x87); // The low-order bits of the field 6762 // polynomial (i.e. p = z^7+z^2+z+1) 6763 // repeated in the low and high parts of a 6764 // 128-bit vector 6765 __ emit_int64(0x87); 6766 6767 __ align(CodeEntryAlignment); 6768 address start = __ pc(); 6769 6770 Register state = c_rarg0; 6771 Register subkeyH = c_rarg1; 6772 Register data = c_rarg2; 6773 Register blocks = c_rarg3; 6774 6775 const int unroll = 4; 6776 6777 __ cmp(blocks, (unsigned char)(unroll * 2)); 6778 __ br(__ LT, small); 6779 6780 if (unroll > 1) { 6781 // Save state before entering routine 6782 __ sub(sp, sp, 4 * 16); 6783 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 6784 __ sub(sp, sp, 4 * 16); 6785 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 6786 } 6787 6788 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 6789 6790 if (unroll > 1) { 6791 // And restore state 6792 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 6793 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 6794 } 6795 6796 __ cmp(blocks, (unsigned char)0); 6797 __ br(__ GT, small); 6798 6799 __ ret(lr); 6800 6801 return start; 6802 } 6803 6804 void generate_base64_encode_simdround(Register src, Register dst, 6805 FloatRegister codec, u8 size) { 6806 6807 FloatRegister in0 = v4, in1 = v5, in2 = v6; 6808 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 6809 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 6810 6811 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 6812 6813 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 6814 6815 __ ushr(ind0, arrangement, in0, 2); 6816 6817 __ ushr(ind1, arrangement, in1, 2); 6818 __ shl(in0, arrangement, in0, 6); 6819 __ orr(ind1, arrangement, ind1, in0); 6820 __ ushr(ind1, arrangement, ind1, 2); 6821 6822 __ ushr(ind2, arrangement, in2, 4); 6823 __ shl(in1, arrangement, in1, 4); 6824 __ orr(ind2, arrangement, in1, ind2); 6825 __ ushr(ind2, arrangement, ind2, 2); 6826 6827 __ shl(ind3, arrangement, in2, 2); 6828 __ ushr(ind3, arrangement, ind3, 2); 6829 6830 __ tbl(out0, arrangement, codec, 4, ind0); 6831 __ tbl(out1, arrangement, codec, 4, ind1); 6832 __ tbl(out2, arrangement, codec, 4, ind2); 6833 __ tbl(out3, arrangement, codec, 4, ind3); 6834 6835 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 6836 } 6837 6838 /** 6839 * Arguments: 6840 * 6841 * Input: 6842 * c_rarg0 - src_start 6843 * c_rarg1 - src_offset 6844 * c_rarg2 - src_length 6845 * c_rarg3 - dest_start 6846 * c_rarg4 - dest_offset 6847 * c_rarg5 - isURL 6848 * 6849 */ 6850 address generate_base64_encodeBlock() { 6851 6852 static const char toBase64[64] = { 6853 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6854 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6855 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6856 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6857 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 6858 }; 6859 6860 static const char toBase64URL[64] = { 6861 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6862 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6863 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6864 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6865 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 6866 }; 6867 6868 __ align(CodeEntryAlignment); 6869 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 6870 StubCodeMark mark(this, stub_id); 6871 address start = __ pc(); 6872 6873 Register src = c_rarg0; // source array 6874 Register soff = c_rarg1; // source start offset 6875 Register send = c_rarg2; // source end offset 6876 Register dst = c_rarg3; // dest array 6877 Register doff = c_rarg4; // position for writing to dest array 6878 Register isURL = c_rarg5; // Base64 or URL character set 6879 6880 // c_rarg6 and c_rarg7 are free to use as temps 6881 Register codec = c_rarg6; 6882 Register length = c_rarg7; 6883 6884 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 6885 6886 __ add(src, src, soff); 6887 __ add(dst, dst, doff); 6888 __ sub(length, send, soff); 6889 6890 // load the codec base address 6891 __ lea(codec, ExternalAddress((address) toBase64)); 6892 __ cbz(isURL, ProcessData); 6893 __ lea(codec, ExternalAddress((address) toBase64URL)); 6894 6895 __ BIND(ProcessData); 6896 6897 // too short to formup a SIMD loop, roll back 6898 __ cmp(length, (u1)24); 6899 __ br(Assembler::LT, Process3B); 6900 6901 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 6902 6903 __ BIND(Process48B); 6904 __ cmp(length, (u1)48); 6905 __ br(Assembler::LT, Process24B); 6906 generate_base64_encode_simdround(src, dst, v0, 16); 6907 __ sub(length, length, 48); 6908 __ b(Process48B); 6909 6910 __ BIND(Process24B); 6911 __ cmp(length, (u1)24); 6912 __ br(Assembler::LT, SIMDExit); 6913 generate_base64_encode_simdround(src, dst, v0, 8); 6914 __ sub(length, length, 24); 6915 6916 __ BIND(SIMDExit); 6917 
__ cbz(length, Exit); 6918 6919 __ BIND(Process3B); 6920 // 3 src bytes, 24 bits 6921 __ ldrb(r10, __ post(src, 1)); 6922 __ ldrb(r11, __ post(src, 1)); 6923 __ ldrb(r12, __ post(src, 1)); 6924 __ orrw(r11, r11, r10, Assembler::LSL, 8); 6925 __ orrw(r12, r12, r11, Assembler::LSL, 8); 6926 // codec index 6927 __ ubfmw(r15, r12, 18, 23); 6928 __ ubfmw(r14, r12, 12, 17); 6929 __ ubfmw(r13, r12, 6, 11); 6930 __ andw(r12, r12, 63); 6931 // get the code based on the codec 6932 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 6933 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 6934 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 6935 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 6936 __ strb(r15, __ post(dst, 1)); 6937 __ strb(r14, __ post(dst, 1)); 6938 __ strb(r13, __ post(dst, 1)); 6939 __ strb(r12, __ post(dst, 1)); 6940 __ sub(length, length, 3); 6941 __ cbnz(length, Process3B); 6942 6943 __ BIND(Exit); 6944 __ ret(lr); 6945 6946 return start; 6947 } 6948 6949 void generate_base64_decode_simdround(Register src, Register dst, 6950 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 6951 6952 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 6953 FloatRegister out0 = v20, out1 = v21, out2 = v22; 6954 6955 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 6956 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 6957 6958 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 6959 6960 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 6961 6962 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 6963 6964 // we need unsigned saturating subtract, to make sure all input values 6965 // in range [0, 63] will have 0U value in the higher half lookup 6966 __ uqsubv(decH0, __ T16B, in0, v27); 6967 __ uqsubv(decH1, __ T16B, in1, v27); 6968 __ uqsubv(decH2, __ T16B, in2, v27); 6969 __ uqsubv(decH3, __ T16B, in3, v27); 6970 6971 // lower half lookup 6972 __ tbl(decL0, arrangement, codecL, 4, in0); 6973 __ tbl(decL1, arrangement, codecL, 4, in1); 6974 __ tbl(decL2, arrangement, codecL, 4, in2); 6975 __ tbl(decL3, arrangement, codecL, 4, in3); 6976 6977 // higher half lookup 6978 __ tbx(decH0, arrangement, codecH, 4, decH0); 6979 __ tbx(decH1, arrangement, codecH, 4, decH1); 6980 __ tbx(decH2, arrangement, codecH, 4, decH2); 6981 __ tbx(decH3, arrangement, codecH, 4, decH3); 6982 6983 // combine lower and higher 6984 __ orr(decL0, arrangement, decL0, decH0); 6985 __ orr(decL1, arrangement, decL1, decH1); 6986 __ orr(decL2, arrangement, decL2, decH2); 6987 __ orr(decL3, arrangement, decL3, decH3); 6988 6989 // check illegal inputs, value larger than 63 (maximum of 6 bits) 6990 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 6991 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 6992 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 6993 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 6994 __ orr(in0, arrangement, decH0, decH1); 6995 __ orr(in1, arrangement, decH2, decH3); 6996 __ orr(in2, arrangement, in0, in1); 6997 __ umaxv(in3, arrangement, in2); 6998 __ umov(rscratch2, in3, __ B, 0); 6999 7000 // get the data to output 7001 __ shl(out0, arrangement, decL0, 2); 7002 __ ushr(out1, arrangement, decL1, 4); 7003 __ orr(out0, arrangement, out0, out1); 7004 __ shl(out1, arrangement, decL1, 4); 7005 __ ushr(out2, arrangement, decL2, 2); 7006 __ orr(out1, arrangement, out1, out2); 7007 __ shl(out2, arrangement, decL2, 6); 7008 __ orr(out2, arrangement, out2, decL3); 7009 7010 __ 
cbz(rscratch2, NoIllegalData); 7011 7012 // handle illegal input 7013 __ umov(r10, in2, __ D, 0); 7014 if (size == 16) { 7015 __ cbnz(r10, ErrorInLowerHalf); 7016 7017 // illegal input is in higher half, store the lower half now. 7018 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 7019 7020 __ umov(r10, in2, __ D, 1); 7021 __ umov(r11, out0, __ D, 1); 7022 __ umov(r12, out1, __ D, 1); 7023 __ umov(r13, out2, __ D, 1); 7024 __ b(StoreLegalData); 7025 7026 __ BIND(ErrorInLowerHalf); 7027 } 7028 __ umov(r11, out0, __ D, 0); 7029 __ umov(r12, out1, __ D, 0); 7030 __ umov(r13, out2, __ D, 0); 7031 7032 __ BIND(StoreLegalData); 7033 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 7034 __ strb(r11, __ post(dst, 1)); 7035 __ strb(r12, __ post(dst, 1)); 7036 __ strb(r13, __ post(dst, 1)); 7037 __ lsr(r10, r10, 8); 7038 __ lsr(r11, r11, 8); 7039 __ lsr(r12, r12, 8); 7040 __ lsr(r13, r13, 8); 7041 __ b(StoreLegalData); 7042 7043 __ BIND(NoIllegalData); 7044 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 7045 } 7046 7047 7048 /** 7049 * Arguments: 7050 * 7051 * Input: 7052 * c_rarg0 - src_start 7053 * c_rarg1 - src_offset 7054 * c_rarg2 - src_length 7055 * c_rarg3 - dest_start 7056 * c_rarg4 - dest_offset 7057 * c_rarg5 - isURL 7058 * c_rarg6 - isMIME 7059 * 7060 */ 7061 address generate_base64_decodeBlock() { 7062 7063 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 7064 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 7065 // titled "Base64 decoding". 7066 7067 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 7068 // except the trailing character '=' is also treated illegal value in this intrinsic. That 7069 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
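    // As a hedged illustration only (not code used by this stub): the 256-entry
    // decode tables below can be derived from the encode alphabets defined
    // earlier in this file, approximately:
    //
    //   uint8_t t[256];
    //   memset(t, 255u, sizeof t);               // 255u marks an illegal input
    //   for (int i = 0; i < 64; i++) {
    //     t[(uint8_t)toBase64[i]] = (uint8_t)i;  // toBase64URL for the URL table
    //   }
    //
    // Note that '=' stays at 255u, matching the comment above.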
7070 static const uint8_t fromBase64ForNoSIMD[256] = { 7071 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7072 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7073 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 7074 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 7075 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 7076 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 7077 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 7078 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 7079 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7080 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7081 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7082 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7083 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7084 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7085 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7086 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7087 }; 7088 7089 static const uint8_t fromBase64URLForNoSIMD[256] = { 7090 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7091 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7092 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 7093 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 7094 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 7095 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 7096 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 7097 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 7098 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7099 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7100 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7101 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7102 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7103 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7104 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7105 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7106 }; 7107 7108 // A legal value of base64 code is in range [0, 127]. We need two lookups 7109 // with tbl/tbx and combine them to get the decode data. The 1st table vector 7110 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 7111 // table vector lookup use tbx, out of range indices are unchanged in 7112 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 7113 // The value of index 64 is set to 0, so that we know that we already get the 7114 // decoded data with the 1st lookup. 7115 static const uint8_t fromBase64ForSIMD[128] = { 7116 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7117 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7118 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 7119 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 7120 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 7121 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 7122 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 7123 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 7124 }; 7125 7126 static const uint8_t fromBase64URLForSIMD[128] = { 7127 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7128 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7129 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 7130 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 7131 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 7132 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 7133 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 7134 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 7135 }; 7136 7137 __ align(CodeEntryAlignment); 7138 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 7139 StubCodeMark mark(this, stub_id); 7140 address start = __ pc(); 7141 7142 Register src = c_rarg0; // source array 7143 Register soff = c_rarg1; // source start offset 7144 Register send = c_rarg2; // source end offset 7145 Register dst = c_rarg3; // dest array 7146 Register doff = c_rarg4; // position for writing to dest array 7147 Register isURL = c_rarg5; // Base64 or URL character set 7148 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 7149 7150 Register length = send; // reuse send as length of source data to process 7151 7152 Register simd_codec = c_rarg6; 7153 Register nosimd_codec = c_rarg7; 7154 7155 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 7156 7157 __ enter(); 7158 7159 __ add(src, src, soff); 7160 __ add(dst, dst, doff); 7161 7162 __ mov(doff, dst); 7163 7164 __ sub(length, send, soff); 7165 __ bfm(length, zr, 0, 1); 7166 7167 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 7168 __ cbz(isURL, ProcessData); 7169 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 7170 7171 __ BIND(ProcessData); 7172 __ mov(rscratch1, length); 7173 __ cmp(length, (u1)144); // 144 = 80 + 64 7174 __ br(Assembler::LT, Process4B); 7175 7176 // In the MIME case, the line length cannot be more than 76 7177 // bytes (see RFC 2045). This is too short a block for SIMD 7178 // to be worthwhile, so we use non-SIMD here. 
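    // A hedged C sketch of the Process4B loop below; codec stands for the
    // selected fromBase64*ForNoSIMD table and the names are illustrative:
    //
    //   uint32_t c0 = codec[src[0]], c1 = codec[src[1]],
    //            c2 = codec[src[2]], c3 = codec[src[3]];
    //   if ((c0 | c1 | c2 | c3) & 0x80u) goto Exit;   // 255u flags an illegal character
    //   dst[0] = (uint8_t)((c0 << 2) | (c1 >> 4));
    //   dst[1] = (uint8_t)((c1 << 4) | (c2 >> 2));
    //   dst[2] = (uint8_t)((c2 << 6) | c3);
    //   src += 4; dst += 3;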
7179 __ movw(rscratch1, 79); 7180 7181 __ BIND(Process4B); 7182 __ ldrw(r14, __ post(src, 4)); 7183 __ ubfxw(r10, r14, 0, 8); 7184 __ ubfxw(r11, r14, 8, 8); 7185 __ ubfxw(r12, r14, 16, 8); 7186 __ ubfxw(r13, r14, 24, 8); 7187 // get the de-code 7188 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 7189 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 7190 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 7191 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 7192 // error detection, 255u indicates an illegal input 7193 __ orrw(r14, r10, r11); 7194 __ orrw(r15, r12, r13); 7195 __ orrw(r14, r14, r15); 7196 __ tbnz(r14, 7, Exit); 7197 // recover the data 7198 __ lslw(r14, r10, 10); 7199 __ bfiw(r14, r11, 4, 6); 7200 __ bfmw(r14, r12, 2, 5); 7201 __ rev16w(r14, r14); 7202 __ bfiw(r13, r12, 6, 2); 7203 __ strh(r14, __ post(dst, 2)); 7204 __ strb(r13, __ post(dst, 1)); 7205 // non-simd loop 7206 __ subsw(rscratch1, rscratch1, 4); 7207 __ br(Assembler::GT, Process4B); 7208 7209 // if exiting from PreProcess80B, rscratch1 == -1; 7210 // otherwise, rscratch1 == 0. 7211 __ cbzw(rscratch1, Exit); 7212 __ sub(length, length, 80); 7213 7214 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 7215 __ cbz(isURL, SIMDEnter); 7216 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 7217 7218 __ BIND(SIMDEnter); 7219 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 7220 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 7221 __ mov(rscratch1, 63); 7222 __ dup(v27, __ T16B, rscratch1); 7223 7224 __ BIND(Process64B); 7225 __ cmp(length, (u1)64); 7226 __ br(Assembler::LT, Process32B); 7227 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 7228 __ sub(length, length, 64); 7229 __ b(Process64B); 7230 7231 __ BIND(Process32B); 7232 __ cmp(length, (u1)32); 7233 __ br(Assembler::LT, SIMDExit); 7234 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 7235 __ sub(length, length, 32); 7236 __ b(Process32B); 7237 7238 __ BIND(SIMDExit); 7239 __ cbz(length, Exit); 7240 __ movw(rscratch1, length); 7241 __ b(Process4B); 7242 7243 __ BIND(Exit); 7244 __ sub(c_rarg0, dst, doff); 7245 7246 __ leave(); 7247 __ ret(lr); 7248 7249 return start; 7250 } 7251 7252 // Support for spin waits. 7253 address generate_spin_wait() { 7254 __ align(CodeEntryAlignment); 7255 StubGenStubId stub_id = StubGenStubId::spin_wait_id; 7256 StubCodeMark mark(this, stub_id); 7257 address start = __ pc(); 7258 7259 __ spin_wait(); 7260 __ ret(lr); 7261 7262 return start; 7263 } 7264 7265 void generate_lookup_secondary_supers_table_stub() { 7266 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 7267 StubCodeMark mark(this, stub_id); 7268 7269 const Register 7270 r_super_klass = r0, 7271 r_array_base = r1, 7272 r_array_length = r2, 7273 r_array_index = r3, 7274 r_sub_klass = r4, 7275 r_bitmap = rscratch2, 7276 result = r5; 7277 const FloatRegister 7278 vtemp = v0; 7279 7280 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 7281 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 7282 Label L_success; 7283 __ enter(); 7284 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 7285 r_array_base, r_array_length, r_array_index, 7286 vtemp, result, slot, 7287 /*stub_is_near*/true); 7288 __ leave(); 7289 __ ret(lr); 7290 } 7291 } 7292 7293 // Slow path implementation for UseSecondarySupersTable. 
7294 address generate_lookup_secondary_supers_table_slow_path_stub() { 7295 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 7296 StubCodeMark mark(this, stub_id); 7297 7298 address start = __ pc(); 7299 const Register 7300 r_super_klass = r0, // argument 7301 r_array_base = r1, // argument 7302 temp1 = r2, // temp 7303 r_array_index = r3, // argument 7304 r_bitmap = rscratch2, // argument 7305 result = r5; // argument 7306 7307 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 7308 __ ret(lr); 7309 7310 return start; 7311 } 7312 7313 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 7314 7315 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 7316 // 7317 // If LSE is in use, generate LSE versions of all the stubs. The 7318 // non-LSE versions are in atomic_aarch64.S. 7319 7320 // class AtomicStubMark records the entry point of a stub and the 7321 // stub pointer which will point to it. The stub pointer is set to 7322 // the entry point when ~AtomicStubMark() is called, which must be 7323 // after ICache::invalidate_range. This ensures safe publication of 7324 // the generated code. 7325 class AtomicStubMark { 7326 address _entry_point; 7327 aarch64_atomic_stub_t *_stub; 7328 MacroAssembler *_masm; 7329 public: 7330 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 7331 _masm = masm; 7332 __ align(32); 7333 _entry_point = __ pc(); 7334 _stub = stub; 7335 } 7336 ~AtomicStubMark() { 7337 *_stub = (aarch64_atomic_stub_t)_entry_point; 7338 } 7339 }; 7340 7341 // NB: For memory_order_conservative we need a trailing membar after 7342 // LSE atomic operations but not a leading membar. 7343 // 7344 // We don't need a leading membar because a clause in the Arm ARM 7345 // says: 7346 // 7347 // Barrier-ordered-before 7348 // 7349 // Barrier instructions order prior Memory effects before subsequent 7350 // Memory effects generated by the same Observer. A read or a write 7351 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 7352 // Observer if and only if RW1 appears in program order before RW 2 7353 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 7354 // instruction with both Acquire and Release semantics. 7355 // 7356 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 7357 // and Release semantics, therefore we don't need a leading 7358 // barrier. However, there is no corresponding Barrier-ordered-after 7359 // relationship, therefore we need a trailing membar to prevent a 7360 // later store or load from being reordered with the store in an 7361 // atomic instruction. 7362 // 7363 // This was checked by using the herd7 consistency model simulator 7364 // (http://diy.inria.fr/) with this test case: 7365 // 7366 // AArch64 LseCas 7367 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 7368 // P0 | P1; 7369 // LDR W4, [X2] | MOV W3, #0; 7370 // DMB LD | MOV W4, #1; 7371 // LDR W3, [X1] | CASAL W3, W4, [X1]; 7372 // | DMB ISH; 7373 // | STR W4, [X2]; 7374 // exists 7375 // (0:X3=0 /\ 0:X4=1) 7376 // 7377 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 7378 // with the store to x in P1. Without the DMB in P1 this may happen. 7379 // 7380 // At the time of writing we don't know of any AArch64 hardware that 7381 // reorders stores in this way, but the Reference Manual permits it. 
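  // For illustration only, a hedged sketch of what the conservative CAS
  // stubs generated below compute, written with GCC/Clang atomic builtins
  // (the real stubs are emitted as LSE instructions, not compiled from C;
  // the function name here is made up):
  //
  //   uint64_t cas_8_conservative(volatile uint64_t *ptr,
  //                               uint64_t compare_val, uint64_t exchange_val) {
  //     uint64_t prev = compare_val;
  //     // casal: compare-and-swap with both acquire and release semantics
  //     __atomic_compare_exchange_n(ptr, &prev, exchange_val, /*weak*/false,
  //                                 __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
  //     __atomic_thread_fence(__ATOMIC_SEQ_CST);  // the trailing membar discussed above
  //     return prev;
  //   }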
7382 7383 void gen_cas_entry(Assembler::operand_size size, 7384 atomic_memory_order order) { 7385 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 7386 exchange_val = c_rarg2; 7387 bool acquire, release; 7388 switch (order) { 7389 case memory_order_relaxed: 7390 acquire = false; 7391 release = false; 7392 break; 7393 case memory_order_release: 7394 acquire = false; 7395 release = true; 7396 break; 7397 default: 7398 acquire = true; 7399 release = true; 7400 break; 7401 } 7402 __ mov(prev, compare_val); 7403 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 7404 if (order == memory_order_conservative) { 7405 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 7406 } 7407 if (size == Assembler::xword) { 7408 __ mov(r0, prev); 7409 } else { 7410 __ movw(r0, prev); 7411 } 7412 __ ret(lr); 7413 } 7414 7415 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 7416 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 7417 // If not relaxed, then default to conservative. Relaxed is the only 7418 // case we use enough to be worth specializing. 7419 if (order == memory_order_relaxed) { 7420 __ ldadd(size, incr, prev, addr); 7421 } else { 7422 __ ldaddal(size, incr, prev, addr); 7423 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 7424 } 7425 if (size == Assembler::xword) { 7426 __ mov(r0, prev); 7427 } else { 7428 __ movw(r0, prev); 7429 } 7430 __ ret(lr); 7431 } 7432 7433 void gen_swpal_entry(Assembler::operand_size size) { 7434 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 7435 __ swpal(size, incr, prev, addr); 7436 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 7437 if (size == Assembler::xword) { 7438 __ mov(r0, prev); 7439 } else { 7440 __ movw(r0, prev); 7441 } 7442 __ ret(lr); 7443 } 7444 7445 void generate_atomic_entry_points() { 7446 if (! 
UseLSE) { 7447 return; 7448 } 7449 __ align(CodeEntryAlignment); 7450 StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id; 7451 StubCodeMark mark(this, stub_id); 7452 address first_entry = __ pc(); 7453 7454 // ADD, memory_order_conservative 7455 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 7456 gen_ldadd_entry(Assembler::word, memory_order_conservative); 7457 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 7458 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 7459 7460 // ADD, memory_order_relaxed 7461 AtomicStubMark mark_fetch_add_4_relaxed 7462 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 7463 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 7464 AtomicStubMark mark_fetch_add_8_relaxed 7465 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 7466 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 7467 7468 // XCHG, memory_order_conservative 7469 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 7470 gen_swpal_entry(Assembler::word); 7471 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 7472 gen_swpal_entry(Assembler::xword); 7473 7474 // CAS, memory_order_conservative 7475 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 7476 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 7477 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 7478 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 7479 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 7480 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 7481 7482 // CAS, memory_order_relaxed 7483 AtomicStubMark mark_cmpxchg_1_relaxed 7484 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 7485 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 7486 AtomicStubMark mark_cmpxchg_4_relaxed 7487 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 7488 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 7489 AtomicStubMark mark_cmpxchg_8_relaxed 7490 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 7491 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 7492 7493 AtomicStubMark mark_cmpxchg_4_release 7494 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 7495 gen_cas_entry(MacroAssembler::word, memory_order_release); 7496 AtomicStubMark mark_cmpxchg_8_release 7497 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 7498 gen_cas_entry(MacroAssembler::xword, memory_order_release); 7499 7500 AtomicStubMark mark_cmpxchg_4_seq_cst 7501 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 7502 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 7503 AtomicStubMark mark_cmpxchg_8_seq_cst 7504 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 7505 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 7506 7507 ICache::invalidate_range(first_entry, __ pc() - first_entry); 7508 } 7509 #endif // LINUX 7510 7511 address generate_cont_thaw(Continuation::thaw_kind kind) { 7512 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 7513 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 7514 7515 address start = __ pc(); 7516 7517 if (return_barrier) { 7518 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 7519 __ mov(sp, rscratch1); 7520 } 7521 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7522 7523 if (return_barrier) { 7524 // preserve 
possible return value from a method returning to the return barrier 7525 __ fmovd(rscratch1, v0); 7526 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7527 } 7528 7529 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 7530 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 7531 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 7532 7533 if (return_barrier) { 7534 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7535 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7536 __ fmovd(v0, rscratch1); 7537 } 7538 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7539 7540 7541 Label thaw_success; 7542 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 7543 __ cbnz(rscratch2, thaw_success); 7544 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 7545 __ br(rscratch1); 7546 __ bind(thaw_success); 7547 7548 // make room for the thawed frames 7549 __ sub(rscratch1, sp, rscratch2); 7550 __ andr(rscratch1, rscratch1, -16); // align 7551 __ mov(sp, rscratch1); 7552 7553 if (return_barrier) { 7554 // save original return value -- again 7555 __ fmovd(rscratch1, v0); 7556 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7557 } 7558 7559 // If we want, we can templatize thaw by kind, and have three different entries 7560 __ movw(c_rarg1, (uint32_t)kind); 7561 7562 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 7563 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 7564 7565 if (return_barrier) { 7566 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7567 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7568 __ fmovd(v0, rscratch1); 7569 } else { 7570 __ mov(r0, zr); // return 0 (success) from doYield 7571 } 7572 7573 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 7574 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 7575 __ mov(rfp, sp); 7576 7577 if (return_barrier_exception) { 7578 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 7579 __ authenticate_return_address(c_rarg1); 7580 __ verify_oop(r0); 7581 // save return value containing the exception oop in callee-saved R19 7582 __ mov(r19, r0); 7583 7584 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 7585 7586 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
7587 // __ reinitialize_ptrue(); 7588 7589 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 7590 7591 __ mov(r1, r0); // the exception handler 7592 __ mov(r0, r19); // restore return value containing the exception oop 7593 __ verify_oop(r0); 7594 7595 __ leave(); 7596 __ mov(r3, lr); 7597 __ br(r1); // the exception handler 7598 } else { 7599 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 7600 __ leave(); 7601 __ ret(lr); 7602 } 7603 7604 return start; 7605 } 7606 7607 address generate_cont_thaw() { 7608 if (!Continuations::enabled()) return nullptr; 7609 7610 StubGenStubId stub_id = StubGenStubId::cont_thaw_id; 7611 StubCodeMark mark(this, stub_id); 7612 address start = __ pc(); 7613 generate_cont_thaw(Continuation::thaw_top); 7614 return start; 7615 } 7616 7617 address generate_cont_returnBarrier() { 7618 if (!Continuations::enabled()) return nullptr; 7619 7620 // TODO: will probably need multiple return barriers depending on return type 7621 StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id; 7622 StubCodeMark mark(this, stub_id); 7623 address start = __ pc(); 7624 7625 generate_cont_thaw(Continuation::thaw_return_barrier); 7626 7627 return start; 7628 } 7629 7630 address generate_cont_returnBarrier_exception() { 7631 if (!Continuations::enabled()) return nullptr; 7632 7633 StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id; 7634 StubCodeMark mark(this, stub_id); 7635 address start = __ pc(); 7636 7637 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 7638 7639 return start; 7640 } 7641 7642 address generate_cont_preempt_stub() { 7643 if (!Continuations::enabled()) return nullptr; 7644 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 7645 StubCodeMark mark(this, stub_id); 7646 address start = __ pc(); 7647 7648 __ reset_last_Java_frame(true); 7649 7650 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 7651 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 7652 __ mov(sp, rscratch2); 7653 7654 Label preemption_cancelled; 7655 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 7656 __ cbnz(rscratch1, preemption_cancelled); 7657 7658 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 7659 SharedRuntime::continuation_enter_cleanup(_masm); 7660 __ leave(); 7661 __ ret(lr); 7662 7663 // We acquired the monitor after freezing the frames so call thaw to continue execution. 7664 __ bind(preemption_cancelled); 7665 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 7666 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 7667 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 7668 __ ldr(rscratch1, Address(rscratch1)); 7669 __ br(rscratch1); 7670 7671 return start; 7672 } 7673 7674 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 7675 // are represented as long[5], with BITS_PER_LIMB = 26. 7676 // Pack five 26-bit limbs into three 64-bit registers. 
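  // In C, approximately (a hedged sketch of pack_26 below; limbs[] is the
  // caller's long[5] representation):
  //
  //   void pack_26(uint64_t *d0, uint64_t *d1, uint64_t *d2,
  //                const uint64_t limbs[5]) {
  //     *d0 = limbs[0] | (limbs[1] << 26) | (limbs[2] << 52);          // bits   0..63
  //     *d1 = (limbs[2] >> 12) | (limbs[3] << 14) | (limbs[4] << 40);  // bits  64..127
  //     *d2 = limbs[4] >> 24;                                          // bits 128..129
  //   }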
7677 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 7678 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 7679 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 7680 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 7681 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 7682 7683 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 7684 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 7685 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 7686 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 7687 7688 if (dest2->is_valid()) { 7689 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 7690 } else { 7691 #ifdef ASSERT 7692 Label OK; 7693 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 7694 __ br(__ EQ, OK); 7695 __ stop("high bits of Poly1305 integer should be zero"); 7696 __ should_not_reach_here(); 7697 __ bind(OK); 7698 #endif 7699 } 7700 } 7701 7702 // As above, but return only a 128-bit integer, packed into two 7703 // 64-bit registers. 7704 void pack_26(Register dest0, Register dest1, Register src) { 7705 pack_26(dest0, dest1, noreg, src); 7706 } 7707 7708 // Multiply and multiply-accumulate unsigned 64-bit registers. 7709 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 7710 __ mul(prod_lo, n, m); 7711 __ umulh(prod_hi, n, m); 7712 } 7713 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 7714 wide_mul(rscratch1, rscratch2, n, m); 7715 __ adds(sum_lo, sum_lo, rscratch1); 7716 __ adc(sum_hi, sum_hi, rscratch2); 7717 } 7718 7719 // Poly1305, RFC 7539 7720 7721 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 7722 // description of the tricks used to simplify and accelerate this 7723 // computation. 7724 7725 address generate_poly1305_processBlocks() { 7726 __ align(CodeEntryAlignment); 7727 StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id; 7728 StubCodeMark mark(this, stub_id); 7729 address start = __ pc(); 7730 Label here; 7731 __ enter(); 7732 RegSet callee_saved = RegSet::range(r19, r28); 7733 __ push(callee_saved, sp); 7734 7735 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 7736 7737 // Arguments 7738 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 7739 7740 // R_n is the 128-bit randomly-generated key, packed into two 7741 // registers. The caller passes this key to us as long[5], with 7742 // BITS_PER_LIMB = 26. 
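    // A hedged sketch, in C-like pseudocode, of the per-block update computed
    // in the loop below (U is the 130-bit accumulator kept as U_2:U_1:U_0,
    // S is U plus the next block, R is the clamped key):
    //
    //   S = U + block + 2^128;              // the 2^128 bit is the RFC 7539 pad bit
    //   U = (S * R) mod (2^130 - 5);
    //
    // with the multiply expanded into three 64-bit columns, where
    // RR_n = (R_n >> 2) * 5 folds products that would overflow 2^130 back in:
    //
    //   U_0:U_0HI = S_0*R_0 + S_1*RR_1 + S_2*RR_0;
    //   U_1:U_1HI = S_0*R_1 + S_1*R_0  + S_2*RR_1;
    //   U_2       = S_2 * (R_0 & 3);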
7743 const Register R_0 = *++regs, R_1 = *++regs; 7744 pack_26(R_0, R_1, r_start); 7745 7746 // RR_n is (R_n >> 2) * 5 7747 const Register RR_0 = *++regs, RR_1 = *++regs; 7748 __ lsr(RR_0, R_0, 2); 7749 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 7750 __ lsr(RR_1, R_1, 2); 7751 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 7752 7753 // U_n is the current checksum 7754 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 7755 pack_26(U_0, U_1, U_2, acc_start); 7756 7757 static constexpr int BLOCK_LENGTH = 16; 7758 Label DONE, LOOP; 7759 7760 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7761 __ br(Assembler::LT, DONE); { 7762 __ bind(LOOP); 7763 7764 // S_n is to be the sum of U_n and the next block of data 7765 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 7766 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 7767 __ adds(S_0, U_0, S_0); 7768 __ adcs(S_1, U_1, S_1); 7769 __ adc(S_2, U_2, zr); 7770 __ add(S_2, S_2, 1); 7771 7772 const Register U_0HI = *++regs, U_1HI = *++regs; 7773 7774 // NB: this logic depends on some of the special properties of 7775 // Poly1305 keys. In particular, because we know that the top 7776 // four bits of R_0 and R_1 are zero, we can add together 7777 // partial products without any risk of needing to propagate a 7778 // carry out. 7779 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 7780 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 7781 __ andr(U_2, R_0, 3); 7782 __ mul(U_2, S_2, U_2); 7783 7784 // Recycle registers S_0, S_1, S_2 7785 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 7786 7787 // Partial reduction mod 2**130 - 5 7788 __ adds(U_1, U_0HI, U_1); 7789 __ adc(U_2, U_1HI, U_2); 7790 // Sum now in U_2:U_1:U_0. 7791 // Dead: U_0HI, U_1HI. 7792 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 7793 7794 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 7795 7796 // First, U_2:U_1:U_0 += (U_2 >> 2) 7797 __ lsr(rscratch1, U_2, 2); 7798 __ andr(U_2, U_2, (u8)3); 7799 __ adds(U_0, U_0, rscratch1); 7800 __ adcs(U_1, U_1, zr); 7801 __ adc(U_2, U_2, zr); 7802 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 7803 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 7804 __ adcs(U_1, U_1, zr); 7805 __ adc(U_2, U_2, zr); 7806 7807 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 7808 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7809 __ br(~ Assembler::LT, LOOP); 7810 } 7811 7812 // Further reduce modulo 2^130 - 5 7813 __ lsr(rscratch1, U_2, 2); 7814 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 7815 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 7816 __ adcs(U_1, U_1, zr); 7817 __ andr(U_2, U_2, (u1)3); 7818 __ adc(U_2, U_2, zr); 7819 7820 // Unpack the sum into five 26-bit limbs and write to memory. 
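    // In C, approximately (a hedged sketch of the store-back below, with acc
    // being the caller's long[5] accumulator):
    //
    //   acc[0] = U_0 & 0x3ffffff;                        // bits   0..25
    //   acc[1] = (U_0 >> 26) & 0x3ffffff;                // bits  26..51
    //   acc[2] = (U_0 >> 52) | ((U_1 & 0x3fff) << 12);   // bits  52..77
    //   acc[3] = (U_1 >> 14) & 0x3ffffff;                // bits  78..103
    //   acc[4] = (U_1 >> 40) | ((U_2 & 0x7) << 24);      // bits 104..129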
7821 __ ubfiz(rscratch1, U_0, 0, 26); 7822 __ ubfx(rscratch2, U_0, 26, 26); 7823 __ stp(rscratch1, rscratch2, Address(acc_start)); 7824 __ ubfx(rscratch1, U_0, 52, 12); 7825 __ bfi(rscratch1, U_1, 12, 14); 7826 __ ubfx(rscratch2, U_1, 14, 26); 7827 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 7828 __ ubfx(rscratch1, U_1, 40, 24); 7829 __ bfi(rscratch1, U_2, 24, 3); 7830 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 7831 7832 __ bind(DONE); 7833 __ pop(callee_saved, sp); 7834 __ leave(); 7835 __ ret(lr); 7836 7837 return start; 7838 } 7839 7840 // exception handler for upcall stubs 7841 address generate_upcall_stub_exception_handler() { 7842 StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id; 7843 StubCodeMark mark(this, stub_id); 7844 address start = __ pc(); 7845 7846 // Native caller has no idea how to handle exceptions, 7847 // so we just crash here. Up to callee to catch exceptions. 7848 __ verify_oop(r0); 7849 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 7850 __ blr(rscratch1); 7851 __ should_not_reach_here(); 7852 7853 return start; 7854 } 7855 7856 // load Method* target of MethodHandle 7857 // j_rarg0 = jobject receiver 7858 // rmethod = result 7859 address generate_upcall_stub_load_target() { 7860 StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id; 7861 StubCodeMark mark(this, stub_id); 7862 address start = __ pc(); 7863 7864 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 7865 // Load target method from receiver 7866 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 7867 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 7868 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 7869 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 7870 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 7871 noreg, noreg); 7872 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 7873 7874 __ ret(lr); 7875 7876 return start; 7877 } 7878 7879 #undef __ 7880 #define __ masm-> 7881 7882 class MontgomeryMultiplyGenerator : public MacroAssembler { 7883 7884 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7885 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7886 7887 RegSet _toSave; 7888 bool _squaring; 7889 7890 public: 7891 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7892 : MacroAssembler(as->code()), _squaring(squaring) { 7893 7894 // Register allocation 7895 7896 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7897 Pa_base = *regs; // Argument registers 7898 if (squaring) 7899 Pb_base = Pa_base; 7900 else 7901 Pb_base = *++regs; 7902 Pn_base = *++regs; 7903 Rlen= *++regs; 7904 inv = *++regs; 7905 Pm_base = *++regs; 7906 7907 // Working registers: 7908 Ra = *++regs; // The current digit of a, b, n, and m. 7909 Rb = *++regs; 7910 Rm = *++regs; 7911 Rn = *++regs; 7912 7913 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 7914 Pb = *++regs; 7915 Pm = *++regs; 7916 Pn = *++regs; 7917 7918 t0 = *++regs; // Three registers which form a 7919 t1 = *++regs; // triple-precision accumuator. 7920 t2 = *++regs; 7921 7922 Ri = *++regs; // Inner and outer loop indexes. 
7923 Rj = *++regs; 7924 7925 Rhi_ab = *++regs; // Product registers: low and high parts 7926 Rlo_ab = *++regs; // of a*b and m*n. 7927 Rhi_mn = *++regs; 7928 Rlo_mn = *++regs; 7929 7930 // r19 and up are callee-saved. 7931 _toSave = RegSet::range(r19, *regs) + Pm_base; 7932 } 7933 7934 private: 7935 void save_regs() { 7936 push(_toSave, sp); 7937 } 7938 7939 void restore_regs() { 7940 pop(_toSave, sp); 7941 } 7942 7943 template <typename T> 7944 void unroll_2(Register count, T block) { 7945 Label loop, end, odd; 7946 tbnz(count, 0, odd); 7947 cbz(count, end); 7948 align(16); 7949 bind(loop); 7950 (this->*block)(); 7951 bind(odd); 7952 (this->*block)(); 7953 subs(count, count, 2); 7954 br(Assembler::GT, loop); 7955 bind(end); 7956 } 7957 7958 template <typename T> 7959 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 7960 Label loop, end, odd; 7961 tbnz(count, 0, odd); 7962 cbz(count, end); 7963 align(16); 7964 bind(loop); 7965 (this->*block)(d, s, tmp); 7966 bind(odd); 7967 (this->*block)(d, s, tmp); 7968 subs(count, count, 2); 7969 br(Assembler::GT, loop); 7970 bind(end); 7971 } 7972 7973 void pre1(RegisterOrConstant i) { 7974 block_comment("pre1"); 7975 // Pa = Pa_base; 7976 // Pb = Pb_base + i; 7977 // Pm = Pm_base; 7978 // Pn = Pn_base + i; 7979 // Ra = *Pa; 7980 // Rb = *Pb; 7981 // Rm = *Pm; 7982 // Rn = *Pn; 7983 ldr(Ra, Address(Pa_base)); 7984 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7985 ldr(Rm, Address(Pm_base)); 7986 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7987 lea(Pa, Address(Pa_base)); 7988 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7989 lea(Pm, Address(Pm_base)); 7990 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7991 7992 // Zero the m*n result. 7993 mov(Rhi_mn, zr); 7994 mov(Rlo_mn, zr); 7995 } 7996 7997 // The core multiply-accumulate step of a Montgomery 7998 // multiplication. The idea is to schedule operations as a 7999 // pipeline so that instructions with long latencies (loads and 8000 // multiplies) have time to complete before their results are 8001 // used. This most benefits in-order implementations of the 8002 // architecture but out-of-order ones also benefit. 8003 void step() { 8004 block_comment("step"); 8005 // MACC(Ra, Rb, t0, t1, t2); 8006 // Ra = *++Pa; 8007 // Rb = *--Pb; 8008 umulh(Rhi_ab, Ra, Rb); 8009 mul(Rlo_ab, Ra, Rb); 8010 ldr(Ra, pre(Pa, wordSize)); 8011 ldr(Rb, pre(Pb, -wordSize)); 8012 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 8013 // previous iteration. 
8014 // MACC(Rm, Rn, t0, t1, t2); 8015 // Rm = *++Pm; 8016 // Rn = *--Pn; 8017 umulh(Rhi_mn, Rm, Rn); 8018 mul(Rlo_mn, Rm, Rn); 8019 ldr(Rm, pre(Pm, wordSize)); 8020 ldr(Rn, pre(Pn, -wordSize)); 8021 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 8022 } 8023 8024 void post1() { 8025 block_comment("post1"); 8026 8027 // MACC(Ra, Rb, t0, t1, t2); 8028 // Ra = *++Pa; 8029 // Rb = *--Pb; 8030 umulh(Rhi_ab, Ra, Rb); 8031 mul(Rlo_ab, Ra, Rb); 8032 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 8033 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 8034 8035 // *Pm = Rm = t0 * inv; 8036 mul(Rm, t0, inv); 8037 str(Rm, Address(Pm)); 8038 8039 // MACC(Rm, Rn, t0, t1, t2); 8040 // t0 = t1; t1 = t2; t2 = 0; 8041 umulh(Rhi_mn, Rm, Rn); 8042 8043 #ifndef PRODUCT 8044 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 8045 { 8046 mul(Rlo_mn, Rm, Rn); 8047 add(Rlo_mn, t0, Rlo_mn); 8048 Label ok; 8049 cbz(Rlo_mn, ok); { 8050 stop("broken Montgomery multiply"); 8051 } bind(ok); 8052 } 8053 #endif 8054 // We have very carefully set things up so that 8055 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 8056 // the lower half of Rm * Rn because we know the result already: 8057 // it must be -t0. t0 + (-t0) must generate a carry iff 8058 // t0 != 0. So, rather than do a mul and an adds we just set 8059 // the carry flag iff t0 is nonzero. 8060 // 8061 // mul(Rlo_mn, Rm, Rn); 8062 // adds(zr, t0, Rlo_mn); 8063 subs(zr, t0, 1); // Set carry iff t0 is nonzero 8064 adcs(t0, t1, Rhi_mn); 8065 adc(t1, t2, zr); 8066 mov(t2, zr); 8067 } 8068 8069 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 8070 block_comment("pre2"); 8071 // Pa = Pa_base + i-len; 8072 // Pb = Pb_base + len; 8073 // Pm = Pm_base + i-len; 8074 // Pn = Pn_base + len; 8075 8076 if (i.is_register()) { 8077 sub(Rj, i.as_register(), len); 8078 } else { 8079 mov(Rj, i.as_constant()); 8080 sub(Rj, Rj, len); 8081 } 8082 // Rj == i-len 8083 8084 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 8085 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 8086 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 8087 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 8088 8089 // Ra = *++Pa; 8090 // Rb = *--Pb; 8091 // Rm = *++Pm; 8092 // Rn = *--Pn; 8093 ldr(Ra, pre(Pa, wordSize)); 8094 ldr(Rb, pre(Pb, -wordSize)); 8095 ldr(Rm, pre(Pm, wordSize)); 8096 ldr(Rn, pre(Pn, -wordSize)); 8097 8098 mov(Rhi_mn, zr); 8099 mov(Rlo_mn, zr); 8100 } 8101 8102 void post2(RegisterOrConstant i, RegisterOrConstant len) { 8103 block_comment("post2"); 8104 if (i.is_constant()) { 8105 mov(Rj, i.as_constant()-len.as_constant()); 8106 } else { 8107 sub(Rj, i.as_register(), len); 8108 } 8109 8110 adds(t0, t0, Rlo_mn); // The pending m*n, low part 8111 8112 // As soon as we know the least significant digit of our result, 8113 // store it. 8114 // Pm_base[i-len] = t0; 8115 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 8116 8117 // t0 = t1; t1 = t2; t2 = 0; 8118 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 8119 adc(t1, t2, zr); 8120 mov(t2, zr); 8121 } 8122 8123 // A carry in t0 after Montgomery multiplication means that we 8124 // should subtract multiples of n from our result in m. We'll 8125 // keep doing that until there is no carry. 
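  // In C, approximately (a hedged sketch of normalize() below, expanding the
  // sub() referenced in its inline comments):
  //
  //   while (t0) {
  //     julong borrow = 0;
  //     for (int i = 0; i < len; i++) {                  // m -= n, with borrow
  //       julong mi = Pm_base[i], ni = Pn_base[i];
  //       Pm_base[i] = mi - ni - borrow;
  //       borrow = (mi < ni) || (borrow && mi == ni);
  //     }
  //     t0 -= borrow;
  //   }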
8126 void normalize(RegisterOrConstant len) { 8127 block_comment("normalize"); 8128 // while (t0) 8129 // t0 = sub(Pm_base, Pn_base, t0, len); 8130 Label loop, post, again; 8131 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 8132 cbz(t0, post); { 8133 bind(again); { 8134 mov(i, zr); 8135 mov(cnt, len); 8136 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 8137 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 8138 subs(zr, zr, zr); // set carry flag, i.e. no borrow 8139 align(16); 8140 bind(loop); { 8141 sbcs(Rm, Rm, Rn); 8142 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 8143 add(i, i, 1); 8144 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 8145 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 8146 sub(cnt, cnt, 1); 8147 } cbnz(cnt, loop); 8148 sbc(t0, t0, zr); 8149 } cbnz(t0, again); 8150 } bind(post); 8151 } 8152 8153 // Move memory at s to d, reversing words. 8154 // Increments d to end of copied memory 8155 // Destroys tmp1, tmp2 8156 // Preserves len 8157 // Leaves s pointing to the address which was in d at start 8158 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 8159 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 8160 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 8161 8162 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 8163 mov(tmp1, len); 8164 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 8165 sub(s, d, len, ext::uxtw, LogBytesPerWord); 8166 } 8167 // where 8168 void reverse1(Register d, Register s, Register tmp) { 8169 ldr(tmp, pre(s, -wordSize)); 8170 ror(tmp, tmp, 32); 8171 str(tmp, post(d, wordSize)); 8172 } 8173 8174 void step_squaring() { 8175 // An extra ACC 8176 step(); 8177 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 8178 } 8179 8180 void last_squaring(RegisterOrConstant i) { 8181 Label dont; 8182 // if ((i & 1) == 0) { 8183 tbnz(i.as_register(), 0, dont); { 8184 // MACC(Ra, Rb, t0, t1, t2); 8185 // Ra = *++Pa; 8186 // Rb = *--Pb; 8187 umulh(Rhi_ab, Ra, Rb); 8188 mul(Rlo_ab, Ra, Rb); 8189 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 8190 } bind(dont); 8191 } 8192 8193 void extra_step_squaring() { 8194 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 8195 8196 // MACC(Rm, Rn, t0, t1, t2); 8197 // Rm = *++Pm; 8198 // Rn = *--Pn; 8199 umulh(Rhi_mn, Rm, Rn); 8200 mul(Rlo_mn, Rm, Rn); 8201 ldr(Rm, pre(Pm, wordSize)); 8202 ldr(Rn, pre(Pn, -wordSize)); 8203 } 8204 8205 void post1_squaring() { 8206 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 8207 8208 // *Pm = Rm = t0 * inv; 8209 mul(Rm, t0, inv); 8210 str(Rm, Address(Pm)); 8211 8212 // MACC(Rm, Rn, t0, t1, t2); 8213 // t0 = t1; t1 = t2; t2 = 0; 8214 umulh(Rhi_mn, Rm, Rn); 8215 8216 #ifndef PRODUCT 8217 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 8218 { 8219 mul(Rlo_mn, Rm, Rn); 8220 add(Rlo_mn, t0, Rlo_mn); 8221 Label ok; 8222 cbz(Rlo_mn, ok); { 8223 stop("broken Montgomery multiply"); 8224 } bind(ok); 8225 } 8226 #endif 8227 // We have very carefully set things up so that 8228 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 8229 // the lower half of Rm * Rn because we know the result already: 8230 // it must be -t0. t0 + (-t0) must generate a carry iff 8231 // t0 != 0. So, rather than do a mul and an adds we just set 8232 // the carry flag iff t0 is nonzero. 
8233 // 8234 // mul(Rlo_mn, Rm, Rn); 8235 // adds(zr, t0, Rlo_mn); 8236 subs(zr, t0, 1); // Set carry iff t0 is nonzero 8237 adcs(t0, t1, Rhi_mn); 8238 adc(t1, t2, zr); 8239 mov(t2, zr); 8240 } 8241 8242 void acc(Register Rhi, Register Rlo, 8243 Register t0, Register t1, Register t2) { 8244 adds(t0, t0, Rlo); 8245 adcs(t1, t1, Rhi); 8246 adc(t2, t2, zr); 8247 } 8248 8249 public: 8250 /** 8251 * Fast Montgomery multiplication. The derivation of the 8252 * algorithm is in A Cryptographic Library for the Motorola 8253 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 8254 * 8255 * Arguments: 8256 * 8257 * Inputs for multiplication: 8258 * c_rarg0 - int array elements a 8259 * c_rarg1 - int array elements b 8260 * c_rarg2 - int array elements n (the modulus) 8261 * c_rarg3 - int length 8262 * c_rarg4 - int inv 8263 * c_rarg5 - int array elements m (the result) 8264 * 8265 * Inputs for squaring: 8266 * c_rarg0 - int array elements a 8267 * c_rarg1 - int array elements n (the modulus) 8268 * c_rarg2 - int length 8269 * c_rarg3 - int inv 8270 * c_rarg4 - int array elements m (the result) 8271 * 8272 */ 8273 address generate_multiply() { 8274 Label argh, nothing; 8275 bind(argh); 8276 stop("MontgomeryMultiply total_allocation must be <= 8192"); 8277 8278 align(CodeEntryAlignment); 8279 address entry = pc(); 8280 8281 cbzw(Rlen, nothing); 8282 8283 enter(); 8284 8285 // Make room. 8286 cmpw(Rlen, 512); 8287 br(Assembler::HI, argh); 8288 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 8289 andr(sp, Ra, -2 * wordSize); 8290 8291 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 8292 8293 { 8294 // Copy input args, reversing as we go. We use Ra as a 8295 // temporary variable. 8296 reverse(Ra, Pa_base, Rlen, t0, t1); 8297 if (!_squaring) 8298 reverse(Ra, Pb_base, Rlen, t0, t1); 8299 reverse(Ra, Pn_base, Rlen, t0, t1); 8300 } 8301 8302 // Push all call-saved registers and also Pm_base which we'll need 8303 // at the end. 
8304 save_regs(); 8305 8306 #ifndef PRODUCT 8307 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 8308 { 8309 ldr(Rn, Address(Pn_base, 0)); 8310 mul(Rlo_mn, Rn, inv); 8311 subs(zr, Rlo_mn, -1); 8312 Label ok; 8313 br(EQ, ok); { 8314 stop("broken inverse in Montgomery multiply"); 8315 } bind(ok); 8316 } 8317 #endif 8318 8319 mov(Pm_base, Ra); 8320 8321 mov(t0, zr); 8322 mov(t1, zr); 8323 mov(t2, zr); 8324 8325 block_comment("for (int i = 0; i < len; i++) {"); 8326 mov(Ri, zr); { 8327 Label loop, end; 8328 cmpw(Ri, Rlen); 8329 br(Assembler::GE, end); 8330 8331 bind(loop); 8332 pre1(Ri); 8333 8334 block_comment(" for (j = i; j; j--) {"); { 8335 movw(Rj, Ri); 8336 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 8337 } block_comment(" } // j"); 8338 8339 post1(); 8340 addw(Ri, Ri, 1); 8341 cmpw(Ri, Rlen); 8342 br(Assembler::LT, loop); 8343 bind(end); 8344 block_comment("} // i"); 8345 } 8346 8347 block_comment("for (int i = len; i < 2*len; i++) {"); 8348 mov(Ri, Rlen); { 8349 Label loop, end; 8350 cmpw(Ri, Rlen, Assembler::LSL, 1); 8351 br(Assembler::GE, end); 8352 8353 bind(loop); 8354 pre2(Ri, Rlen); 8355 8356 block_comment(" for (j = len*2-i-1; j; j--) {"); { 8357 lslw(Rj, Rlen, 1); 8358 subw(Rj, Rj, Ri); 8359 subw(Rj, Rj, 1); 8360 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 8361 } block_comment(" } // j"); 8362 8363 post2(Ri, Rlen); 8364 addw(Ri, Ri, 1); 8365 cmpw(Ri, Rlen, Assembler::LSL, 1); 8366 br(Assembler::LT, loop); 8367 bind(end); 8368 } 8369 block_comment("} // i"); 8370 8371 normalize(Rlen); 8372 8373 mov(Ra, Pm_base); // Save Pm_base in Ra 8374 restore_regs(); // Restore caller's Pm_base 8375 8376 // Copy our result into caller's Pm_base 8377 reverse(Pm_base, Ra, Rlen, t0, t1); 8378 8379 leave(); 8380 bind(nothing); 8381 ret(lr); 8382 8383 return entry; 8384 } 8385 // In C, approximately: 8386 8387 // void 8388 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 8389 // julong Pn_base[], julong Pm_base[], 8390 // julong inv, int len) { 8391 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 8392 // julong *Pa, *Pb, *Pn, *Pm; 8393 // julong Ra, Rb, Rn, Rm; 8394 8395 // int i; 8396 8397 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 8398 8399 // for (i = 0; i < len; i++) { 8400 // int j; 8401 8402 // Pa = Pa_base; 8403 // Pb = Pb_base + i; 8404 // Pm = Pm_base; 8405 // Pn = Pn_base + i; 8406 8407 // Ra = *Pa; 8408 // Rb = *Pb; 8409 // Rm = *Pm; 8410 // Rn = *Pn; 8411 8412 // int iters = i; 8413 // for (j = 0; iters--; j++) { 8414 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 8415 // MACC(Ra, Rb, t0, t1, t2); 8416 // Ra = *++Pa; 8417 // Rb = *--Pb; 8418 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8419 // MACC(Rm, Rn, t0, t1, t2); 8420 // Rm = *++Pm; 8421 // Rn = *--Pn; 8422 // } 8423 8424 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 8425 // MACC(Ra, Rb, t0, t1, t2); 8426 // *Pm = Rm = t0 * inv; 8427 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8428 // MACC(Rm, Rn, t0, t1, t2); 8429 8430 // assert(t0 == 0, "broken Montgomery multiply"); 8431 8432 // t0 = t1; t1 = t2; t2 = 0; 8433 // } 8434 8435 // for (i = len; i < 2*len; i++) { 8436 // int j; 8437 8438 // Pa = Pa_base + i-len; 8439 // Pb = Pb_base + len; 8440 // Pm = Pm_base + i-len; 8441 // Pn = Pn_base + len; 8442 8443 // Ra = *++Pa; 8444 // Rb = *--Pb; 8445 // Rm = *++Pm; 8446 // Rn = *--Pn; 8447 8448 // int iters = len*2-i-1; 8449 // for (j = i-len+1; iters--; j++) { 8450 // 
assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 8451 // MACC(Ra, Rb, t0, t1, t2); 8452 // Ra = *++Pa; 8453 // Rb = *--Pb; 8454 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8455 // MACC(Rm, Rn, t0, t1, t2); 8456 // Rm = *++Pm; 8457 // Rn = *--Pn; 8458 // } 8459 8460 // Pm_base[i-len] = t0; 8461 // t0 = t1; t1 = t2; t2 = 0; 8462 // } 8463 8464 // while (t0) 8465 // t0 = sub(Pm_base, Pn_base, t0, len); 8466 // } 8467 8468 /** 8469 * Fast Montgomery squaring. This uses asymptotically 25% fewer 8470 * multiplies than Montgomery multiplication so it should be up to 8471 * 25% faster. However, its loop control is more complex and it 8472 * may actually run slower on some machines. 8473 * 8474 * Arguments: 8475 * 8476 * Inputs: 8477 * c_rarg0 - int array elements a 8478 * c_rarg1 - int array elements n (the modulus) 8479 * c_rarg2 - int length 8480 * c_rarg3 - int inv 8481 * c_rarg4 - int array elements m (the result) 8482 * 8483 */ 8484 address generate_square() { 8485 Label argh; 8486 bind(argh); 8487 stop("MontgomeryMultiply total_allocation must be <= 8192"); 8488 8489 align(CodeEntryAlignment); 8490 address entry = pc(); 8491 8492 enter(); 8493 8494 // Make room. 8495 cmpw(Rlen, 512); 8496 br(Assembler::HI, argh); 8497 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 8498 andr(sp, Ra, -2 * wordSize); 8499 8500 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 8501 8502 { 8503 // Copy input args, reversing as we go. We use Ra as a 8504 // temporary variable. 8505 reverse(Ra, Pa_base, Rlen, t0, t1); 8506 reverse(Ra, Pn_base, Rlen, t0, t1); 8507 } 8508 8509 // Push all call-saved registers and also Pm_base which we'll need 8510 // at the end. 8511 save_regs(); 8512 8513 mov(Pm_base, Ra); 8514 8515 mov(t0, zr); 8516 mov(t1, zr); 8517 mov(t2, zr); 8518 8519 block_comment("for (int i = 0; i < len; i++) {"); 8520 mov(Ri, zr); { 8521 Label loop, end; 8522 bind(loop); 8523 cmp(Ri, Rlen); 8524 br(Assembler::GE, end); 8525 8526 pre1(Ri); 8527 8528 block_comment("for (j = (i+1)/2; j; j--) {"); { 8529 add(Rj, Ri, 1); 8530 lsr(Rj, Rj, 1); 8531 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8532 } block_comment(" } // j"); 8533 8534 last_squaring(Ri); 8535 8536 block_comment(" for (j = i/2; j; j--) {"); { 8537 lsr(Rj, Ri, 1); 8538 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8539 } block_comment(" } // j"); 8540 8541 post1_squaring(); 8542 add(Ri, Ri, 1); 8543 cmp(Ri, Rlen); 8544 br(Assembler::LT, loop); 8545 8546 bind(end); 8547 block_comment("} // i"); 8548 } 8549 8550 block_comment("for (int i = len; i < 2*len; i++) {"); 8551 mov(Ri, Rlen); { 8552 Label loop, end; 8553 bind(loop); 8554 cmp(Ri, Rlen, Assembler::LSL, 1); 8555 br(Assembler::GE, end); 8556 8557 pre2(Ri, Rlen); 8558 8559 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 8560 lsl(Rj, Rlen, 1); 8561 sub(Rj, Rj, Ri); 8562 sub(Rj, Rj, 1); 8563 lsr(Rj, Rj, 1); 8564 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8565 } block_comment(" } // j"); 8566 8567 last_squaring(Ri); 8568 8569 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 8570 lsl(Rj, Rlen, 1); 8571 sub(Rj, Rj, Ri); 8572 lsr(Rj, Rj, 1); 8573 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8574 } block_comment(" } // j"); 8575 8576 post2(Ri, Rlen); 8577 add(Ri, Ri, 1); 8578 cmp(Ri, Rlen, Assembler::LSL, 1); 8579 8580 br(Assembler::LT, loop); 8581 bind(end); 8582 block_comment("} // i"); 8583 } 8584 8585 normalize(Rlen); 8586 8587 mov(Ra, Pm_base); // Save Pm_base in Ra 8588 
restore_regs(); // Restore caller's Pm_base 8589 8590 // Copy our result into caller's Pm_base 8591 reverse(Pm_base, Ra, Rlen, t0, t1); 8592 8593 leave(); 8594 ret(lr); 8595 8596 return entry; 8597 } 8598 // In C, approximately: 8599 8600 // void 8601 // montgomery_square(julong Pa_base[], julong Pn_base[], 8602 // julong Pm_base[], julong inv, int len) { 8603 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 8604 // julong *Pa, *Pb, *Pn, *Pm; 8605 // julong Ra, Rb, Rn, Rm; 8606 8607 // int i; 8608 8609 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 8610 8611 // for (i = 0; i < len; i++) { 8612 // int j; 8613 8614 // Pa = Pa_base; 8615 // Pb = Pa_base + i; 8616 // Pm = Pm_base; 8617 // Pn = Pn_base + i; 8618 8619 // Ra = *Pa; 8620 // Rb = *Pb; 8621 // Rm = *Pm; 8622 // Rn = *Pn; 8623 8624 // int iters = (i+1)/2; 8625 // for (j = 0; iters--; j++) { 8626 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8627 // MACC2(Ra, Rb, t0, t1, t2); 8628 // Ra = *++Pa; 8629 // Rb = *--Pb; 8630 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8631 // MACC(Rm, Rn, t0, t1, t2); 8632 // Rm = *++Pm; 8633 // Rn = *--Pn; 8634 // } 8635 // if ((i & 1) == 0) { 8636 // assert(Ra == Pa_base[j], "must be"); 8637 // MACC(Ra, Ra, t0, t1, t2); 8638 // } 8639 // iters = i/2; 8640 // assert(iters == i-j, "must be"); 8641 // for (; iters--; j++) { 8642 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8643 // MACC(Rm, Rn, t0, t1, t2); 8644 // Rm = *++Pm; 8645 // Rn = *--Pn; 8646 // } 8647 8648 // *Pm = Rm = t0 * inv; 8649 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8650 // MACC(Rm, Rn, t0, t1, t2); 8651 8652 // assert(t0 == 0, "broken Montgomery multiply"); 8653 8654 // t0 = t1; t1 = t2; t2 = 0; 8655 // } 8656 8657 // for (i = len; i < 2*len; i++) { 8658 // int start = i-len+1; 8659 // int end = start + (len - start)/2; 8660 // int j; 8661 8662 // Pa = Pa_base + i-len; 8663 // Pb = Pa_base + len; 8664 // Pm = Pm_base + i-len; 8665 // Pn = Pn_base + len; 8666 8667 // Ra = *++Pa; 8668 // Rb = *--Pb; 8669 // Rm = *++Pm; 8670 // Rn = *--Pn; 8671 8672 // int iters = (2*len-i-1)/2; 8673 // assert(iters == end-start, "must be"); 8674 // for (j = start; iters--; j++) { 8675 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8676 // MACC2(Ra, Rb, t0, t1, t2); 8677 // Ra = *++Pa; 8678 // Rb = *--Pb; 8679 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8680 // MACC(Rm, Rn, t0, t1, t2); 8681 // Rm = *++Pm; 8682 // Rn = *--Pn; 8683 // } 8684 // if ((i & 1) == 0) { 8685 // assert(Ra == Pa_base[j], "must be"); 8686 // MACC(Ra, Ra, t0, t1, t2); 8687 // } 8688 // iters = (2*len-i)/2; 8689 // assert(iters == len-j, "must be"); 8690 // for (; iters--; j++) { 8691 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8692 // MACC(Rm, Rn, t0, t1, t2); 8693 // Rm = *++Pm; 8694 // Rn = *--Pn; 8695 // } 8696 // Pm_base[i-len] = t0; 8697 // t0 = t1; t1 = t2; t2 = 0; 8698 // } 8699 8700 // while (t0) 8701 // t0 = sub(Pm_base, Pn_base, t0, len); 8702 // } 8703 }; 8704 8705 void generate_vector_math_stubs() { 8706 // Get native vector math stub routine addresses 8707 void* libsleef = nullptr; 8708 char ebuf[1024]; 8709 char dll_name[JVM_MAXPATHLEN]; 8710 if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) { 8711 libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf); 8712 } 8713 if (libsleef == nullptr) { 8714 log_info(library)("Failed to load native vector math library, %s!", ebuf); 
8715 return; 8716 } 8717 // Method naming convention 8718 // All the methods are named as <OP><T><N>_<U><suffix> 8719 // Where: 8720 // <OP> is the operation name, e.g. sin 8721 // <T> is optional and indicates float/double 8722 // "f/d" for vector float/double operation 8723 // <N> is the number of elements in the vector 8724 // "2/4" for neon, and "x" for sve 8725 // <U> is the precision level 8726 // "u10/u05" represents 1.0/0.5 ULP error bounds 8727 // We use "u10" for all operations by default 8728 // But for those functions that do not have u10 support, we use "u05" instead 8729 // <suffix> indicates neon/sve 8730 // "sve/advsimd" for sve/neon implementations 8731 // e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions 8732 // cosd2_u10advsimd is the method for computing 2-element vector double cos using NEON instructions 8733 // 8734 log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef)); 8735 8736 // Math vector stubs implemented with SVE for scalable vector size. 8737 if (UseSVE > 0) { 8738 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 8739 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 8740 // Skip "tanh" because there is a performance regression 8741 if (vop == VectorSupport::VECTOR_OP_TANH) { 8742 continue; 8743 } 8744 8745 // The native library does not support the u10 level of "hypot". 8746 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 8747 8748 snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf); 8749 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 8750 8751 snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf); 8752 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 8753 } 8754 } 8755 8756 // Math vector stubs implemented with NEON for 64/128-bit vector sizes. 8757 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 8758 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 8759 // Skip "tanh" because there is a performance regression 8760 if (vop == VectorSupport::VECTOR_OP_TANH) { 8761 continue; 8762 } 8763 8764 // The native library does not support the u10 level of "hypot". 8765 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 8766 8767 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 8768 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf); 8769 8770 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 8771 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 8772 8773 snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf); 8774 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 8775 } 8776 } 8777 8778 // Initialization 8779 void generate_initial_stubs() { 8780 // Generate initial stubs and initialize the entry points 8781 8782 // Entry points that exist on all platforms. Note: This is code 8783 // that could be shared among different platforms - however the 8784 // benefit seems to be smaller than the disadvantage of having a 8785 // much more complicated generator structure. See also the comment in 8786 // stubRoutines.hpp.
8787 8788 StubRoutines::_forward_exception_entry = generate_forward_exception(); 8789 8790 StubRoutines::_call_stub_entry = 8791 generate_call_stub(StubRoutines::_call_stub_return_address); 8792 8793 // is referenced by megamorphic call 8794 StubRoutines::_catch_exception_entry = generate_catch_exception(); 8795 8796 // Initialize table for copy memory (arraycopy) check. 8797 if (UnsafeMemoryAccess::_table == nullptr) { 8798 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory 8799 } 8800 8801 if (UseCRC32Intrinsics) { 8802 // set table address before stub generation which uses it 8803 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 8804 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 8805 } 8806 8807 if (UseCRC32CIntrinsics) { 8808 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 8809 } 8810 8811 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 8812 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 8813 } 8814 8815 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 8816 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 8817 } 8818 8819 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) && 8820 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) { 8821 StubRoutines::_hf2f = generate_float16ToFloat(); 8822 StubRoutines::_f2hf = generate_floatToFloat16(); 8823 } 8824 } 8825 8826 void generate_continuation_stubs() { 8827 // Continuation stubs: 8828 StubRoutines::_cont_thaw = generate_cont_thaw(); 8829 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 8830 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 8831 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub(); 8832 } 8833 8834 void generate_final_stubs() { 8835 // support for verify_oop (must happen after universe_init) 8836 if (VerifyOops) { 8837 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 8838 } 8839 8840 // arraycopy stubs used by compilers 8841 generate_arraycopy_stubs(); 8842 8843 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8844 if (bs_nm != nullptr) { 8845 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 8846 } 8847 8848 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 8849 8850 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 8851 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 8852 8853 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 8854 8855 generate_atomic_entry_points(); 8856 8857 #endif // LINUX 8858 8859 #ifdef COMPILER2 8860 if (UseSecondarySupersTable) { 8861 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub(); 8862 if (! InlineSecondarySupersTest) { 8863 generate_lookup_secondary_supers_table_stub(); 8864 } 8865 } 8866 #endif 8867 8868 StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated 8869 } 8870 8871 void generate_compiler_stubs() { 8872 #if COMPILER2_OR_JVMCI 8873 8874 if (UseSVE == 0) { 8875 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id); 8876 } 8877 8878 // array equals stub for large arrays.
8879 if (!UseSimpleArrayEquals) { 8880 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 8881 } 8882 8883 // arrays_hashcode stub for large arrays. 8884 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN); 8885 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE); 8886 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR); 8887 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT); 8888 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT); 8889 8890 // byte_array_inflate stub for large arrays. 8891 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 8892 8893 // countPositives stub for large arrays. 8894 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long); 8895 8896 generate_compare_long_strings(); 8897 8898 generate_string_indexof_stubs(); 8899 8900 #ifdef COMPILER2 8901 if (UseMultiplyToLenIntrinsic) { 8902 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 8903 } 8904 8905 if (UseSquareToLenIntrinsic) { 8906 StubRoutines::_squareToLen = generate_squareToLen(); 8907 } 8908 8909 if (UseMulAddIntrinsic) { 8910 StubRoutines::_mulAdd = generate_mulAdd(); 8911 } 8912 8913 if (UseSIMDForBigIntegerShiftIntrinsics) { 8914 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 8915 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 8916 } 8917 8918 if (UseMontgomeryMultiplyIntrinsic) { 8919 StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id; 8920 StubCodeMark mark(this, stub_id); 8921 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 8922 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 8923 } 8924 8925 if (UseMontgomerySquareIntrinsic) { 8926 StubGenStubId stub_id = StubGenStubId::montgomerySquare_id; 8927 StubCodeMark mark(this, stub_id); 8928 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 8929 // We use generate_multiply() rather than generate_square() 8930 // because it's faster for the sizes of modulus we care about.
8931 StubRoutines::_montgomerySquare = g.generate_multiply(); 8932 } 8933 8934 generate_vector_math_stubs(); 8935 8936 #endif // COMPILER2 8937 8938 if (UseChaCha20Intrinsics) { 8939 StubRoutines::_chacha20Block = generate_chacha20Block_qrpar(); 8940 } 8941 8942 if (UseBASE64Intrinsics) { 8943 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 8944 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 8945 } 8946 8947 // data cache line writeback 8948 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 8949 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 8950 8951 if (UseAESIntrinsics) { 8952 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 8953 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 8954 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 8955 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 8956 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 8957 } 8958 if (UseGHASHIntrinsics) { 8959 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 8960 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 8961 } 8962 if (UseAESIntrinsics && UseGHASHIntrinsics) { 8963 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 8964 } 8965 8966 if (UseMD5Intrinsics) { 8967 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubGenStubId::md5_implCompress_id); 8968 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id); 8969 } 8970 if (UseSHA1Intrinsics) { 8971 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id); 8972 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id); 8973 } 8974 if (UseSHA256Intrinsics) { 8975 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id); 8976 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id); 8977 } 8978 if (UseSHA512Intrinsics) { 8979 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id); 8980 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id); 8981 } 8982 if (UseSHA3Intrinsics) { 8983 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id); 8984 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id); 8985 } 8986 8987 if (UsePoly1305Intrinsics) { 8988 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 8989 } 8990 8991 // generate Adler32 intrinsics code 8992 if (UseAdler32Intrinsics) { 8993 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 8994 } 8995 8996 #endif // COMPILER2_OR_JVMCI 8997 } 8998 8999 public: 9000 StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) { 9001 switch(blob_id) { 9002 case initial_id: 9003 generate_initial_stubs(); 9004 break; 9005 case continuation_id: 9006 generate_continuation_stubs(); 9007 break; 9008 case compiler_id: 9009 generate_compiler_stubs(); 9010 break; 9011 case final_id: 9012 generate_final_stubs(); 9013 break; 9014 default: 9015 fatal("unexpected blob id: %d", blob_id); 9016 
break; 9017 }; 9018 } 9019 }; // end class declaration 9020 9021 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) { 9022 StubGenerator g(code, blob_id); 9023 } 9024 9025 9026 #if defined (LINUX) 9027 9028 // Define pointers to atomic stubs and initialize them to point to the 9029 // code in atomic_aarch64.S. 9030 9031 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 9032 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 9033 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 9034 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 9035 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 9036 9037 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 9038 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 9039 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 9040 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 9041 DEFAULT_ATOMIC_OP(xchg, 4, ) 9042 DEFAULT_ATOMIC_OP(xchg, 8, ) 9043 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 9044 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 9045 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 9046 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 9047 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 9048 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 9049 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 9050 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 9051 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 9052 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 9053 9054 #undef DEFAULT_ATOMIC_OP 9055 9056 #endif // LINUX
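// For illustration only: each DEFAULT_ATOMIC_OP instantiation above is a
// purely mechanical macro expansion. For example, DEFAULT_ATOMIC_OP(fetch_add, 4, )
// expands to approximately:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// i.e. a declaration of the default implementation provided by
// atomic_aarch64.S plus a function pointer that initially targets it.
// On Linux builds without __ARM_FEATURE_ATOMICS, generate_atomic_entry_points()
// (called from generate_final_stubs() above) is expected to repoint these
// function pointers at generated stub code; that generator is outside this
// excerpt, so the exact mechanism is not shown here.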