1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "asm/register.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "code/SCCache.hpp" 31 #include "compiler/oopMap.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/barrierSetAssembler.hpp" 34 #include "gc/shared/gc_globals.hpp" 35 #include "gc/shared/tlab_globals.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "memory/universe.hpp" 38 #include "nativeInst_aarch64.hpp" 39 #include "oops/instanceOop.hpp" 40 #include "oops/method.hpp" 41 #include "oops/objArrayKlass.hpp" 42 #include "oops/oop.inline.hpp" 43 #include "prims/methodHandles.hpp" 44 #include "prims/upcallLinker.hpp" 45 #include "runtime/arguments.hpp" 46 #include "runtime/atomic.hpp" 47 #include "runtime/continuation.hpp" 48 #include "runtime/continuationEntry.inline.hpp" 49 #include "runtime/frame.inline.hpp" 50 #include "runtime/handles.inline.hpp" 51 #include "runtime/javaThread.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/stubCodeGenerator.hpp" 54 #include "runtime/stubRoutines.hpp" 55 #include "utilities/align.hpp" 56 #include "utilities/checkedCast.hpp" 57 #include "utilities/debug.hpp" 58 #include "utilities/globalDefinitions.hpp" 59 #include "utilities/intpow.hpp" 60 #include "utilities/powerOfTwo.hpp" 61 #ifdef COMPILER2 62 #include "opto/runtime.hpp" 63 #endif 64 #if INCLUDE_ZGC 65 #include "gc/z/zThreadLocalData.hpp" 66 #endif 67 68 // Declaration and definition of StubGenerator (no .hpp file). 
69 // For a more detailed description of the stub routine structure 70 // see the comment in stubRoutines.hpp 71 72 #undef __ 73 #define __ _masm-> 74 75 #ifdef PRODUCT 76 #define BLOCK_COMMENT(str) /* nothing */ 77 #else 78 #define BLOCK_COMMENT(str) __ block_comment(str) 79 #endif 80 81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 82 83 // Stub Code definitions 84 85 class StubGenerator: public StubCodeGenerator { 86 private: 87 88 #ifdef PRODUCT 89 #define inc_counter_np(counter) ((void)0) 90 #else 91 void inc_counter_np_(uint& counter) { 92 __ incrementw(ExternalAddress((address)&counter)); 93 } 94 #define inc_counter_np(counter) \ 95 BLOCK_COMMENT("inc_counter " #counter); \ 96 inc_counter_np_(counter); 97 #endif 98 99 // Call stubs are used to call Java from C 100 // 101 // Arguments: 102 // c_rarg0: call wrapper address address 103 // c_rarg1: result address 104 // c_rarg2: result type BasicType 105 // c_rarg3: method Method* 106 // c_rarg4: (interpreter) entry point address 107 // c_rarg5: parameters intptr_t* 108 // c_rarg6: parameter size (in words) int 109 // c_rarg7: thread Thread* 110 // 111 // There is no return from the stub itself as any Java result 112 // is written to result 113 // 114 // we save r30 (lr) as the return PC at the base of the frame and 115 // link r29 (fp) below it as the frame pointer installing sp (r31) 116 // into fp. 117 // 118 // we save r0-r7, which accounts for all the c arguments. 119 // 120 // TODO: strictly do we need to save them all? they are treated as 121 // volatile by C so could we omit saving the ones we are going to 122 // place in global registers (thread? method?) or those we only use 123 // during setup of the Java call? 124 // 125 // we don't need to save r8 which C uses as an indirect result location 126 // return register. 127 // 128 // we don't need to save r9-r15 which both C and Java treat as 129 // volatile 130 // 131 // we don't need to save r16-18 because Java does not use them 132 // 133 // we save r19-r28 which Java uses as scratch registers and C 134 // expects to be callee-save 135 // 136 // we save the bottom 64 bits of each value stored in v8-v15; it is 137 // the responsibility of the caller to preserve larger values. 138 // 139 // so the stub frame looks like this when we enter Java code 140 // 141 // [ return_from_Java ] <--- sp 142 // [ argument word n ] 143 // ... 
144 // -29 [ argument word 1 ] 145 // -28 [ saved Floating-point Control Register ] 146 // -26 [ saved v15 ] <--- sp_after_call 147 // -25 [ saved v14 ] 148 // -24 [ saved v13 ] 149 // -23 [ saved v12 ] 150 // -22 [ saved v11 ] 151 // -21 [ saved v10 ] 152 // -20 [ saved v9 ] 153 // -19 [ saved v8 ] 154 // -18 [ saved r28 ] 155 // -17 [ saved r27 ] 156 // -16 [ saved r26 ] 157 // -15 [ saved r25 ] 158 // -14 [ saved r24 ] 159 // -13 [ saved r23 ] 160 // -12 [ saved r22 ] 161 // -11 [ saved r21 ] 162 // -10 [ saved r20 ] 163 // -9 [ saved r19 ] 164 // -8 [ call wrapper (r0) ] 165 // -7 [ result (r1) ] 166 // -6 [ result type (r2) ] 167 // -5 [ method (r3) ] 168 // -4 [ entry point (r4) ] 169 // -3 [ parameters (r5) ] 170 // -2 [ parameter size (r6) ] 171 // -1 [ thread (r7) ] 172 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 173 // 1 [ saved lr (r30) ] 174 175 // Call stub stack layout word offsets from fp 176 enum call_stub_layout { 177 sp_after_call_off = -28, 178 179 fpcr_off = sp_after_call_off, 180 d15_off = -26, 181 d13_off = -24, 182 d11_off = -22, 183 d9_off = -20, 184 185 r28_off = -18, 186 r26_off = -16, 187 r24_off = -14, 188 r22_off = -12, 189 r20_off = -10, 190 call_wrapper_off = -8, 191 result_off = -7, 192 result_type_off = -6, 193 method_off = -5, 194 entry_point_off = -4, 195 parameter_size_off = -2, 196 thread_off = -1, 197 fp_f = 0, 198 retaddr_off = 1, 199 }; 200 201 address generate_call_stub(address& return_address) { 202 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 203 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 204 "adjust this code"); 205 206 StubGenStubId stub_id = StubGenStubId::call_stub_id; 207 StubCodeMark mark(this, stub_id); 208 address start = __ pc(); 209 210 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 211 212 const Address fpcr_save (rfp, fpcr_off * wordSize); 213 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 214 const Address result (rfp, result_off * wordSize); 215 const Address result_type (rfp, result_type_off * wordSize); 216 const Address method (rfp, method_off * wordSize); 217 const Address entry_point (rfp, entry_point_off * wordSize); 218 const Address parameter_size(rfp, parameter_size_off * wordSize); 219 220 const Address thread (rfp, thread_off * wordSize); 221 222 const Address d15_save (rfp, d15_off * wordSize); 223 const Address d13_save (rfp, d13_off * wordSize); 224 const Address d11_save (rfp, d11_off * wordSize); 225 const Address d9_save (rfp, d9_off * wordSize); 226 227 const Address r28_save (rfp, r28_off * wordSize); 228 const Address r26_save (rfp, r26_off * wordSize); 229 const Address r24_save (rfp, r24_off * wordSize); 230 const Address r22_save (rfp, r22_off * wordSize); 231 const Address r20_save (rfp, r20_off * wordSize); 232 233 // stub code 234 235 address aarch64_entry = __ pc(); 236 237 // set up frame and move sp to end of save area 238 __ enter(); 239 __ sub(sp, rfp, -sp_after_call_off * wordSize); 240 241 // save register parameters and Java scratch/global registers 242 // n.b. 
    // we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7, thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9, v8, d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    // rmethod: Method*
    // r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b.
this assumes Java returns an integral result in r0 331 // and a floating result in j_farg0 332 __ ldr(j_rarg2, result); 333 Label is_long, is_float, is_double, exit; 334 __ ldr(j_rarg1, result_type); 335 __ cmp(j_rarg1, (u1)T_OBJECT); 336 __ br(Assembler::EQ, is_long); 337 __ cmp(j_rarg1, (u1)T_LONG); 338 __ br(Assembler::EQ, is_long); 339 __ cmp(j_rarg1, (u1)T_FLOAT); 340 __ br(Assembler::EQ, is_float); 341 __ cmp(j_rarg1, (u1)T_DOUBLE); 342 __ br(Assembler::EQ, is_double); 343 344 // handle T_INT case 345 __ strw(r0, Address(j_rarg2)); 346 347 __ BIND(exit); 348 349 // pop parameters 350 __ sub(esp, rfp, -sp_after_call_off * wordSize); 351 352 #ifdef ASSERT 353 // verify that threads correspond 354 { 355 Label L, S; 356 __ ldr(rscratch1, thread); 357 __ cmp(rthread, rscratch1); 358 __ br(Assembler::NE, S); 359 __ get_thread(rscratch1); 360 __ cmp(rthread, rscratch1); 361 __ br(Assembler::EQ, L); 362 __ BIND(S); 363 __ stop("StubRoutines::call_stub: threads must correspond"); 364 __ BIND(L); 365 } 366 #endif 367 368 __ pop_cont_fastpath(rthread); 369 370 // restore callee-save registers 371 __ ldpd(v15, v14, d15_save); 372 __ ldpd(v13, v12, d13_save); 373 __ ldpd(v11, v10, d11_save); 374 __ ldpd(v9, v8, d9_save); 375 376 __ ldp(r28, r27, r28_save); 377 __ ldp(r26, r25, r26_save); 378 __ ldp(r24, r23, r24_save); 379 __ ldp(r22, r21, r22_save); 380 __ ldp(r20, r19, r20_save); 381 382 // restore fpcr 383 __ ldr(rscratch1, fpcr_save); 384 __ set_fpcr(rscratch1); 385 386 __ ldp(c_rarg0, c_rarg1, call_wrapper); 387 __ ldrw(c_rarg2, result_type); 388 __ ldr(c_rarg3, method); 389 __ ldp(c_rarg4, c_rarg5, entry_point); 390 __ ldp(c_rarg6, c_rarg7, parameter_size); 391 392 // leave frame and return to caller 393 __ leave(); 394 __ ret(lr); 395 396 // handle return types different from T_INT 397 398 __ BIND(is_long); 399 __ str(r0, Address(j_rarg2, 0)); 400 __ br(Assembler::AL, exit); 401 402 __ BIND(is_float); 403 __ strs(j_farg0, Address(j_rarg2, 0)); 404 __ br(Assembler::AL, exit); 405 406 __ BIND(is_double); 407 __ strd(j_farg0, Address(j_rarg2, 0)); 408 __ br(Assembler::AL, exit); 409 410 return start; 411 } 412 413 // Return point for a Java call if there's an exception thrown in 414 // Java code. The exception is caught and transformed into a 415 // pending exception stored in JavaThread that can be tested from 416 // within the VM. 417 // 418 // Note: Usually the parameters are removed by the callee. In case 419 // of an exception crossing an activation frame boundary, that is 420 // not the case if the callee is compiled code => need to setup the 421 // rsp. 
422 // 423 // r0: exception oop 424 425 address generate_catch_exception() { 426 StubGenStubId stub_id = StubGenStubId::catch_exception_id; 427 StubCodeMark mark(this, stub_id); 428 address start = __ pc(); 429 430 // same as in generate_call_stub(): 431 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 432 const Address thread (rfp, thread_off * wordSize); 433 434 #ifdef ASSERT 435 // verify that threads correspond 436 { 437 Label L, S; 438 __ ldr(rscratch1, thread); 439 __ cmp(rthread, rscratch1); 440 __ br(Assembler::NE, S); 441 __ get_thread(rscratch1); 442 __ cmp(rthread, rscratch1); 443 __ br(Assembler::EQ, L); 444 __ bind(S); 445 __ stop("StubRoutines::catch_exception: threads must correspond"); 446 __ bind(L); 447 } 448 #endif 449 450 // set pending exception 451 __ verify_oop(r0); 452 453 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 454 __ mov(rscratch1, (address)__FILE__); 455 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 456 __ movw(rscratch1, (int)__LINE__); 457 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 458 459 // complete return to VM 460 assert(StubRoutines::_call_stub_return_address != nullptr, 461 "_call_stub_return_address must have been generated before"); 462 __ b(StubRoutines::_call_stub_return_address); 463 464 return start; 465 } 466 467 // Continuation point for runtime calls returning with a pending 468 // exception. The pending exception check happened in the runtime 469 // or native call stub. The pending exception in Thread is 470 // converted into a Java-level exception. 471 // 472 // Contract with Java-level exception handlers: 473 // r0: exception 474 // r3: throwing pc 475 // 476 // NOTE: At entry of this stub, exception-pc must be in LR !! 477 478 // NOTE: this is always used as a jump target within generated code 479 // so it just needs to be generated code with no x86 prolog 480 481 address generate_forward_exception() { 482 StubGenStubId stub_id = StubGenStubId::forward_exception_id; 483 StubCodeMark mark(this, stub_id); 484 address start = __ pc(); 485 486 // Upon entry, LR points to the return address returning into 487 // Java (interpreted or compiled) code; i.e., the return address 488 // becomes the throwing pc. 489 // 490 // Arguments pushed before the runtime call are still on the stack 491 // but the exception handler will reset the stack pointer -> 492 // ignore them. A potential result in registers can be ignored as 493 // well. 494 495 #ifdef ASSERT 496 // make sure this code is only executed if there is a pending exception 497 { 498 Label L; 499 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 500 __ cbnz(rscratch1, L); 501 __ stop("StubRoutines::forward exception: no pending exception (1)"); 502 __ bind(L); 503 } 504 #endif 505 506 // compute exception handler into r19 507 508 // call the VM to find the handler address associated with the 509 // caller address. pass thread in r0 and caller pc (ret address) 510 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 511 // the stack. 512 __ mov(c_rarg1, lr); 513 // lr will be trashed by the VM call so we move it to R19 514 // (callee-saved) because we also need to pass it to the handler 515 // returned by this call. 
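    // The net effect of the sequence below is, in rough pseudo-code (a
    // sketch, not executed as written):
    //
    //   address handler = exception_handler_for_return_address(thread, lr);
    //   r3 = lr;                         // throwing pc for the Java handler
    //   r0 = thread->pending_exception;  // the exception oop
    //   thread->pending_exception = nullptr;
    //   goto handler;                    // via br(r19)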
516 __ mov(r19, lr); 517 BLOCK_COMMENT("call exception_handler_for_return_address"); 518 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 519 SharedRuntime::exception_handler_for_return_address), 520 rthread, c_rarg1); 521 // Reinitialize the ptrue predicate register, in case the external runtime 522 // call clobbers ptrue reg, as we may return to SVE compiled code. 523 __ reinitialize_ptrue(); 524 525 // we should not really care that lr is no longer the callee 526 // address. we saved the value the handler needs in r19 so we can 527 // just copy it to r3. however, the C2 handler will push its own 528 // frame and then calls into the VM and the VM code asserts that 529 // the PC for the frame above the handler belongs to a compiled 530 // Java method. So, we restore lr here to satisfy that assert. 531 __ mov(lr, r19); 532 // setup r0 & r3 & clear pending exception 533 __ mov(r3, r19); 534 __ mov(r19, r0); 535 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 536 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 537 538 #ifdef ASSERT 539 // make sure exception is set 540 { 541 Label L; 542 __ cbnz(r0, L); 543 __ stop("StubRoutines::forward exception: no pending exception (2)"); 544 __ bind(L); 545 } 546 #endif 547 548 // continue at exception handler 549 // r0: exception 550 // r3: throwing pc 551 // r19: exception handler 552 __ verify_oop(r0); 553 __ br(r19); 554 555 return start; 556 } 557 558 // Non-destructive plausibility checks for oops 559 // 560 // Arguments: 561 // r0: oop to verify 562 // rscratch1: error message 563 // 564 // Stack after saving c_rarg3: 565 // [tos + 0]: saved c_rarg3 566 // [tos + 1]: saved c_rarg2 567 // [tos + 2]: saved lr 568 // [tos + 3]: saved rscratch2 569 // [tos + 4]: saved r0 570 // [tos + 5]: saved rscratch1 571 address generate_verify_oop() { 572 StubGenStubId stub_id = StubGenStubId::verify_oop_id; 573 StubCodeMark mark(this, stub_id); 574 address start = __ pc(); 575 576 Label exit, error; 577 578 // save c_rarg2 and c_rarg3 579 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 580 581 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 582 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 583 __ ldr(c_rarg3, Address(c_rarg2)); 584 __ add(c_rarg3, c_rarg3, 1); 585 __ str(c_rarg3, Address(c_rarg2)); 586 587 // object is in r0 588 // make sure object is 'reasonable' 589 __ cbz(r0, exit); // if obj is null it is OK 590 591 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 592 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 593 594 // return if everything seems ok 595 __ bind(exit); 596 597 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 598 __ ret(lr); 599 600 // handle errors 601 __ bind(error); 602 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 603 604 __ push(RegSet::range(r0, r29), sp); 605 // debug(char* msg, int64_t pc, int64_t regs[]) 606 __ mov(c_rarg0, rscratch1); // pass address of error message 607 __ mov(c_rarg1, lr); // pass return address 608 __ mov(c_rarg2, sp); // pass address of regs on stack 609 #ifndef PRODUCT 610 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 611 #endif 612 BLOCK_COMMENT("call MacroAssembler::debug"); 613 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 614 __ blr(rscratch1); 615 __ hlt(0); 616 617 return start; 618 } 619 620 // Generate indices for iota vector. 
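  // Each 16-byte row below covers one element size: lane indices 0, 1, 2, ...
  // as bytes (B), halfwords (H), words (S) and doublewords (D), followed by
  // floating-point variants (0.0f..3.0f and 0.0d, 1.0d). For example, the
  // first two emit_data64 values, read as 16 little-endian bytes, are
  // {0, 1, 2, ..., 15}.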
621 address generate_iota_indices(StubGenStubId stub_id) { 622 __ align(CodeEntryAlignment); 623 StubCodeMark mark(this, stub_id); 624 address start = __ pc(); 625 // B 626 __ emit_data64(0x0706050403020100, relocInfo::none); 627 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 628 // H 629 __ emit_data64(0x0003000200010000, relocInfo::none); 630 __ emit_data64(0x0007000600050004, relocInfo::none); 631 // S 632 __ emit_data64(0x0000000100000000, relocInfo::none); 633 __ emit_data64(0x0000000300000002, relocInfo::none); 634 // D 635 __ emit_data64(0x0000000000000000, relocInfo::none); 636 __ emit_data64(0x0000000000000001, relocInfo::none); 637 // S - FP 638 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 639 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 640 // D - FP 641 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 642 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 643 return start; 644 } 645 646 // The inner part of zero_words(). This is the bulk operation, 647 // zeroing words in blocks, possibly using DC ZVA to do it. The 648 // caller is responsible for zeroing the last few words. 649 // 650 // Inputs: 651 // r10: the HeapWord-aligned base address of an array to zero. 652 // r11: the count in HeapWords, r11 > 0. 653 // 654 // Returns r10 and r11, adjusted for the caller to clear. 655 // r10: the base address of the tail of words left to clear. 656 // r11: the number of words in the tail. 657 // r11 < MacroAssembler::zero_words_block_size. 658 659 address generate_zero_blocks() { 660 Label done; 661 Label base_aligned; 662 663 Register base = r10, cnt = r11; 664 665 __ align(CodeEntryAlignment); 666 StubGenStubId stub_id = StubGenStubId::zero_blocks_id; 667 StubCodeMark mark(this, stub_id); 668 address start = __ pc(); 669 670 if (UseBlockZeroing) { 671 int zva_length = VM_Version::zva_length(); 672 673 // Ensure ZVA length can be divided by 16. This is required by 674 // the subsequent operations. 675 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 676 677 __ tbz(base, 3, base_aligned); 678 __ str(zr, Address(__ post(base, 8))); 679 __ sub(cnt, cnt, 1); 680 __ bind(base_aligned); 681 682 // Ensure count >= zva_length * 2 so that it still deserves a zva after 683 // alignment. 684 Label small; 685 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 686 __ subs(rscratch1, cnt, low_limit >> 3); 687 __ br(Assembler::LT, small); 688 __ zero_dcache_blocks(base, cnt); 689 __ bind(small); 690 } 691 692 { 693 // Number of stp instructions we'll unroll 694 const int unroll = 695 MacroAssembler::zero_words_block_size / 2; 696 // Clear the remaining blocks. 697 Label loop; 698 __ subs(cnt, cnt, unroll * 2); 699 __ br(Assembler::LT, done); 700 __ bind(loop); 701 for (int i = 0; i < unroll; i++) 702 __ stp(zr, zr, __ post(base, 16)); 703 __ subs(cnt, cnt, unroll * 2); 704 __ br(Assembler::GE, loop); 705 __ bind(done); 706 __ add(cnt, cnt, unroll * 2); 707 } 708 709 __ ret(lr); 710 711 return start; 712 } 713 714 715 typedef enum { 716 copy_forwards = 1, 717 copy_backwards = -1 718 } copy_direction; 719 720 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 721 // for arraycopy stubs. 
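  // Typical use in the copy routines below (a sketch):
  //
  //   ArrayCopyBarrierSetHelper bs(_masm, decorators, type,
  //                                gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  //   bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));   // ldp through the barrier set
  //   bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);  // stp through the barrier set
  //
  // so GC-specific access barriers are applied without cluttering the copy
  // loops themselves.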
722 class ArrayCopyBarrierSetHelper : StackObj { 723 BarrierSetAssembler* _bs_asm; 724 MacroAssembler* _masm; 725 DecoratorSet _decorators; 726 BasicType _type; 727 Register _gct1; 728 Register _gct2; 729 Register _gct3; 730 FloatRegister _gcvt1; 731 FloatRegister _gcvt2; 732 FloatRegister _gcvt3; 733 734 public: 735 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 736 DecoratorSet decorators, 737 BasicType type, 738 Register gct1, 739 Register gct2, 740 Register gct3, 741 FloatRegister gcvt1, 742 FloatRegister gcvt2, 743 FloatRegister gcvt3) 744 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 745 _masm(masm), 746 _decorators(decorators), 747 _type(type), 748 _gct1(gct1), 749 _gct2(gct2), 750 _gct3(gct3), 751 _gcvt1(gcvt1), 752 _gcvt2(gcvt2), 753 _gcvt3(gcvt3) { 754 } 755 756 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 757 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 758 dst1, dst2, src, 759 _gct1, _gct2, _gcvt1); 760 } 761 762 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 763 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 764 dst, src1, src2, 765 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 766 } 767 768 void copy_load_at_16(Register dst1, Register dst2, Address src) { 769 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 770 dst1, dst2, src, 771 _gct1); 772 } 773 774 void copy_store_at_16(Address dst, Register src1, Register src2) { 775 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 776 dst, src1, src2, 777 _gct1, _gct2, _gct3); 778 } 779 780 void copy_load_at_8(Register dst, Address src) { 781 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 782 dst, noreg, src, 783 _gct1); 784 } 785 786 void copy_store_at_8(Address dst, Register src) { 787 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 788 dst, src, noreg, 789 _gct1, _gct2, _gct3); 790 } 791 }; 792 793 // Bulk copy of blocks of 8 words. 794 // 795 // count is a count of words. 796 // 797 // Precondition: count >= 8 798 // 799 // Postconditions: 800 // 801 // The least significant bit of count contains the remaining count 802 // of words to copy. The rest of count is trash. 803 // 804 // s and d are adjusted to point to the remaining words to copy 805 // 806 void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) { 807 BasicType type; 808 copy_direction direction; 809 810 switch (stub_id) { 811 case copy_byte_f_id: 812 direction = copy_forwards; 813 type = T_BYTE; 814 break; 815 case copy_byte_b_id: 816 direction = copy_backwards; 817 type = T_BYTE; 818 break; 819 case copy_oop_f_id: 820 direction = copy_forwards; 821 type = T_OBJECT; 822 break; 823 case copy_oop_b_id: 824 direction = copy_backwards; 825 type = T_OBJECT; 826 break; 827 case copy_oop_uninit_f_id: 828 direction = copy_forwards; 829 type = T_OBJECT; 830 break; 831 case copy_oop_uninit_b_id: 832 direction = copy_backwards; 833 type = T_OBJECT; 834 break; 835 default: 836 ShouldNotReachHere(); 837 } 838 839 int unit = wordSize * direction; 840 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize; 841 842 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 843 t4 = r7, t5 = r11, t6 = r12, t7 = r13; 844 const Register stride = r14; 845 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 846 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 847 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 848 849 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7); 850 assert_different_registers(s, d, count, rscratch1, rscratch2); 851 852 Label again, drain; 853 854 __ align(CodeEntryAlignment); 855 856 StubCodeMark mark(this, stub_id); 857 858 __ bind(start); 859 860 Label unaligned_copy_long; 861 if (AvoidUnalignedAccesses) { 862 __ tbnz(d, 3, unaligned_copy_long); 863 } 864 865 if (direction == copy_forwards) { 866 __ sub(s, s, bias); 867 __ sub(d, d, bias); 868 } 869 870 #ifdef ASSERT 871 // Make sure we are never given < 8 words 872 { 873 Label L; 874 __ cmp(count, (u1)8); 875 __ br(Assembler::GE, L); 876 __ stop("genrate_copy_longs called with < 8 words"); 877 __ bind(L); 878 } 879 #endif 880 881 // Fill 8 registers 882 if (UseSIMDForMemoryOps) { 883 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 884 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 885 } else { 886 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 887 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 888 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 889 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 890 } 891 892 __ subs(count, count, 16); 893 __ br(Assembler::LO, drain); 894 895 int prefetch = PrefetchCopyIntervalInBytes; 896 bool use_stride = false; 897 if (direction == copy_backwards) { 898 use_stride = prefetch > 256; 899 prefetch = -prefetch; 900 if (use_stride) __ mov(stride, prefetch); 901 } 902 903 __ bind(again); 904 905 if (PrefetchCopyIntervalInBytes > 0) 906 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 907 908 if (UseSIMDForMemoryOps) { 909 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 910 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 911 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 912 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 913 } else { 914 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 915 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 916 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 917 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 918 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 919 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 920 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 921 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 922 } 923 924 __ subs(count, count, 8); 925 __ br(Assembler::HS, again); 926 927 // Drain 928 __ bind(drain); 929 if (UseSIMDForMemoryOps) { 930 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 931 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 932 } else { 933 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 934 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 935 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 936 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 937 } 938 939 { 940 Label L1, L2; 941 __ tbz(count, exact_log2(4), L1); 942 if (UseSIMDForMemoryOps) { 943 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit))); 944 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1); 945 } else { 946 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 947 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 948 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 949 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3); 950 } 951 __ bind(L1); 952 953 if (direction == copy_forwards) { 954 __ add(s, s, bias); 955 __ add(d, d, bias); 956 } 957 958 __ tbz(count, 1, L2); 959 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 960 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1); 961 __ bind(L2); 962 } 963 964 __ ret(lr); 965 966 if (AvoidUnalignedAccesses) { 967 Label drain, again; 968 // Register order for storing. Order is different for backward copy. 969 970 __ bind(unaligned_copy_long); 971 972 // source address is even aligned, target odd aligned 973 // 974 // when forward copying word pairs we read long pairs at offsets 975 // {0, 2, 4, 6} (in long words). when backwards copying we read 976 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 977 // address by -2 in the forwards case so we can compute the 978 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 979 // or -1. 980 // 981 // when forward copying we need to store 1 word, 3 pairs and 982 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 983 // zero offset We adjust the destination by -1 which means we 984 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 985 // 986 // When backwards copyng we need to store 1 word, 3 pairs and 987 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 988 // offsets {1, 3, 5, 7, 8} * unit. 
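      // To make that concrete for a forwards copy (unit == wordSize): each
      // 64-byte block of the destination, which is only 8-byte aligned,
      // receives one leading word at byte offset 0, three 16-byte pairs at
      // offsets 8, 24 and 40, and a trailing word at offset 56, i.e. the
      // {0, 1, 3, 5, 7} word pattern described above.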
989 990 if (direction == copy_forwards) { 991 __ sub(s, s, 16); 992 __ sub(d, d, 8); 993 } 994 995 // Fill 8 registers 996 // 997 // for forwards copy s was offset by -16 from the original input 998 // value of s so the register contents are at these offsets 999 // relative to the 64 bit block addressed by that original input 1000 // and so on for each successive 64 byte block when s is updated 1001 // 1002 // t0 at offset 0, t1 at offset 8 1003 // t2 at offset 16, t3 at offset 24 1004 // t4 at offset 32, t5 at offset 40 1005 // t6 at offset 48, t7 at offset 56 1006 1007 // for backwards copy s was not offset so the register contents 1008 // are at these offsets into the preceding 64 byte block 1009 // relative to that original input and so on for each successive 1010 // preceding 64 byte block when s is updated. this explains the 1011 // slightly counter-intuitive looking pattern of register usage 1012 // in the stp instructions for backwards copy. 1013 // 1014 // t0 at offset -16, t1 at offset -8 1015 // t2 at offset -32, t3 at offset -24 1016 // t4 at offset -48, t5 at offset -40 1017 // t6 at offset -64, t7 at offset -56 1018 1019 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1020 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1021 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1022 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1023 1024 __ subs(count, count, 16); 1025 __ br(Assembler::LO, drain); 1026 1027 int prefetch = PrefetchCopyIntervalInBytes; 1028 bool use_stride = false; 1029 if (direction == copy_backwards) { 1030 use_stride = prefetch > 256; 1031 prefetch = -prefetch; 1032 if (use_stride) __ mov(stride, prefetch); 1033 } 1034 1035 __ bind(again); 1036 1037 if (PrefetchCopyIntervalInBytes > 0) 1038 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1039 1040 if (direction == copy_forwards) { 1041 // allowing for the offset of -8 the store instructions place 1042 // registers into the target 64 bit block at the following 1043 // offsets 1044 // 1045 // t0 at offset 0 1046 // t1 at offset 8, t2 at offset 16 1047 // t3 at offset 24, t4 at offset 32 1048 // t5 at offset 40, t6 at offset 48 1049 // t7 at offset 56 1050 1051 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1052 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1053 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1054 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1055 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1056 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1057 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1058 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1059 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1060 } else { 1061 // d was not offset when we started so the registers are 1062 // written into the 64 bit block preceding d with the following 1063 // offsets 1064 // 1065 // t1 at offset -8 1066 // t3 at offset -24, t0 at offset -16 1067 // t5 at offset -48, t2 at offset -32 1068 // t7 at offset -56, t4 at offset -48 1069 // t6 at offset -64 1070 // 1071 // note that this matches the offsets previously noted for the 1072 // loads 1073 1074 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1075 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1076 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1077 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1078 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1079 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1080 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1081 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1082 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1083 } 1084 1085 __ subs(count, count, 8); 1086 __ br(Assembler::HS, again); 1087 1088 // Drain 1089 // 1090 // this uses the same pattern of offsets and register arguments 1091 // as above 1092 __ bind(drain); 1093 if (direction == copy_forwards) { 1094 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1095 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1096 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1097 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1098 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1099 } else { 1100 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1101 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1102 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1103 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1104 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1105 } 1106 // now we need to copy any remaining part block which may 1107 // include a 4 word block subblock and/or a 2 word subblock. 
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for a forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do. It does a lot of jumping, resulting in several
    // mispredicted branches. It might make more sense to do this
    // with something like Duff's device with a single computed branch.
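    // The chain of bit tests below is, in effect (a sketch in C, not
    // executed as written):
    //
    //   if (count & (8 / granularity)) copy 8 bytes;  // always tested
    //   if (count & (4 / granularity)) copy 4 bytes;  // only if granularity <= 4
    //   if (count & (2 / granularity)) copy 2 bytes;  // only if granularity <= 2
    //   if (count & 1)                 copy 1 byte;   // only if granularity == 1
    //
    // where count is in units of granularity and each copy advances s and d
    // in the copy direction.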
1182 1183 __ tbz(count, 3 - exact_log2(granularity), Lword); 1184 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1185 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1186 __ bind(Lword); 1187 1188 if (granularity <= sizeof (jint)) { 1189 __ tbz(count, 2 - exact_log2(granularity), Lint); 1190 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1191 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1192 __ bind(Lint); 1193 } 1194 1195 if (granularity <= sizeof (jshort)) { 1196 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1197 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1198 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1199 __ bind(Lshort); 1200 } 1201 1202 if (granularity <= sizeof (jbyte)) { 1203 __ tbz(count, 0, Lbyte); 1204 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1205 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1206 __ bind(Lbyte); 1207 } 1208 } 1209 1210 Label copy_f, copy_b; 1211 Label copy_obj_f, copy_obj_b; 1212 Label copy_obj_uninit_f, copy_obj_uninit_b; 1213 1214 // All-singing all-dancing memory copy. 1215 // 1216 // Copy count units of memory from s to d. The size of a unit is 1217 // step, which can be positive or negative depending on the direction 1218 // of copy. If is_aligned is false, we align the source address. 1219 // 1220 1221 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1222 Register s, Register d, Register count, int step) { 1223 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1224 bool is_backwards = step < 0; 1225 unsigned int granularity = uabs(step); 1226 const Register t0 = r3, t1 = r4; 1227 1228 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1229 // load all the data before writing anything 1230 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1231 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1232 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1233 const Register send = r17, dend = r16; 1234 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1235 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1236 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1237 1238 if (PrefetchCopyIntervalInBytes > 0) 1239 __ prfm(Address(s, 0), PLDL1KEEP); 1240 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the 65-80 byte range.
      // For such cases using a pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
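      // So for element sizes below 4 bytes this arm is split: copies of at
      // most 80 bytes use the two 32-byte SIMD pairs already loaded plus one
      // 16-byte ldp/stp for the tail, while 81..96 bytes fall through to a
      // third 32-byte pair (copy96 below).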
1296 if (granularity < sizeof (jint)) { 1297 Label copy96; 1298 __ cmp(count, u1(80/granularity)); 1299 __ br(Assembler::HI, copy96); 1300 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1301 1302 bs.copy_store_at_32(Address(d, 0), v0, v1); 1303 bs.copy_store_at_32(Address(d, 32), v2, v3); 1304 1305 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1306 __ b(finish); 1307 1308 __ bind(copy96); 1309 } 1310 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1311 1312 bs.copy_store_at_32(Address(d, 0), v0, v1); 1313 bs.copy_store_at_32(Address(d, 32), v2, v3); 1314 1315 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1316 } else { 1317 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1318 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1319 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1320 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1321 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1322 1323 bs.copy_store_at_16(Address(d, 0), t0, t1); 1324 bs.copy_store_at_16(Address(d, 16), t2, t3); 1325 bs.copy_store_at_16(Address(d, 32), t4, t5); 1326 bs.copy_store_at_16(Address(d, 48), t6, t7); 1327 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1328 } 1329 __ b(finish); 1330 1331 // 0..16 bytes 1332 __ bind(copy16); 1333 __ cmp(count, u1(8/granularity)); 1334 __ br(Assembler::LO, copy8); 1335 1336 // 8..16 bytes 1337 bs.copy_load_at_8(t0, Address(s, 0)); 1338 bs.copy_load_at_8(t1, Address(send, -8)); 1339 bs.copy_store_at_8(Address(d, 0), t0); 1340 bs.copy_store_at_8(Address(dend, -8), t1); 1341 __ b(finish); 1342 1343 if (granularity < 8) { 1344 // 4..7 bytes 1345 __ bind(copy8); 1346 __ tbz(count, 2 - exact_log2(granularity), copy4); 1347 __ ldrw(t0, Address(s, 0)); 1348 __ ldrw(t1, Address(send, -4)); 1349 __ strw(t0, Address(d, 0)); 1350 __ strw(t1, Address(dend, -4)); 1351 __ b(finish); 1352 if (granularity < 4) { 1353 // 0..3 bytes 1354 __ bind(copy4); 1355 __ cbz(count, finish); // get rid of 0 case 1356 if (granularity == 2) { 1357 __ ldrh(t0, Address(s, 0)); 1358 __ strh(t0, Address(d, 0)); 1359 } else { // granularity == 1 1360 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1361 // the first and last byte. 1362 // Handle the 3 byte case by loading and storing base + count/2 1363 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1364 // This does means in the 1 byte case we load/store the same 1365 // byte 3 times. 1366 __ lsr(count, count, 1); 1367 __ ldrb(t0, Address(s, 0)); 1368 __ ldrb(t1, Address(send, -1)); 1369 __ ldrb(t2, Address(s, count)); 1370 __ strb(t0, Address(d, 0)); 1371 __ strb(t1, Address(dend, -1)); 1372 __ strb(t2, Address(d, count)); 1373 } 1374 __ b(finish); 1375 } 1376 } 1377 1378 __ bind(copy_big); 1379 if (is_backwards) { 1380 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1381 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1382 } 1383 1384 // Now we've got the small case out of the way we can align the 1385 // source address on a 2-word boundary. 1386 1387 // Here we will materialize a count in r15, which is used by copy_memory_small 1388 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1389 // Up until here, we have used t9, which aliases r15, but from here on, that register 1390 // can not be used as a temp register, as it contains the count. 1391 1392 Label aligned; 1393 1394 if (is_aligned) { 1395 // We may have to adjust by 1 word to get s 2-word-aligned. 
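      // In effect (a sketch):
      //
      //   if (s & wordSize) {            // 8-byte but not 16-byte aligned
      //     copy one word from s to d in the copy direction;
      //     count -= wordSize / granularity;
      //   }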
1396 __ tbz(s, exact_log2(wordSize), aligned); 1397 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1398 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1399 __ sub(count, count, wordSize/granularity); 1400 } else { 1401 if (is_backwards) { 1402 __ andr(r15, s, 2 * wordSize - 1); 1403 } else { 1404 __ neg(r15, s); 1405 __ andr(r15, r15, 2 * wordSize - 1); 1406 } 1407 // r15 is the byte adjustment needed to align s. 1408 __ cbz(r15, aligned); 1409 int shift = exact_log2(granularity); 1410 if (shift > 0) { 1411 __ lsr(r15, r15, shift); 1412 } 1413 __ sub(count, count, r15); 1414 1415 #if 0 1416 // ?? This code is only correct for a disjoint copy. It may or 1417 // may not make sense to use it in that case. 1418 1419 // Copy the first pair; s and d may not be aligned. 1420 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1421 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1422 1423 // Align s and d, adjust count 1424 if (is_backwards) { 1425 __ sub(s, s, r15); 1426 __ sub(d, d, r15); 1427 } else { 1428 __ add(s, s, r15); 1429 __ add(d, d, r15); 1430 } 1431 #else 1432 copy_memory_small(decorators, type, s, d, r15, step); 1433 #endif 1434 } 1435 1436 __ bind(aligned); 1437 1438 // s is now 2-word-aligned. 1439 1440 // We have a count of units and some trailing bytes. Adjust the 1441 // count and do a bulk copy of words. If the shift is zero 1442 // perform a move instead to benefit from zero latency moves. 1443 int shift = exact_log2(wordSize/granularity); 1444 if (shift > 0) { 1445 __ lsr(r15, count, shift); 1446 } else { 1447 __ mov(r15, count); 1448 } 1449 if (direction == copy_forwards) { 1450 if (type != T_OBJECT) { 1451 __ bl(copy_f); 1452 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1453 __ bl(copy_obj_uninit_f); 1454 } else { 1455 __ bl(copy_obj_f); 1456 } 1457 } else { 1458 if (type != T_OBJECT) { 1459 __ bl(copy_b); 1460 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1461 __ bl(copy_obj_uninit_b); 1462 } else { 1463 __ bl(copy_obj_b); 1464 } 1465 } 1466 1467 // And the tail. 1468 copy_memory_small(decorators, type, s, d, count, step); 1469 1470 if (granularity >= 8) __ bind(copy8); 1471 if (granularity >= 4) __ bind(copy4); 1472 __ bind(finish); 1473 } 1474 1475 1476 void clobber_registers() { 1477 #ifdef ASSERT 1478 RegSet clobbered 1479 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1480 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1481 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1482 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1483 __ mov(*it, rscratch1); 1484 } 1485 #endif 1486 1487 } 1488 1489 // Scan over array at a for count oops, verifying each one. 1490 // Preserves a and count, clobbers rscratch1 and rscratch2. 
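  // Roughly equivalent to (a sketch):
  //
  //   for (size_t i = 0; i < count; i++) {
  //     oop o = (size == wordSize) ? ((oop*)a)[i]
  //                                : decode_heap_oop(((narrowOop*)a)[i]);
  //     verify_oop(o);
  //   }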
1491 void verify_oop_array (int size, Register a, Register count, Register temp) { 1492 Label loop, end; 1493 __ mov(rscratch1, a); 1494 __ mov(rscratch2, zr); 1495 __ bind(loop); 1496 __ cmp(rscratch2, count); 1497 __ br(Assembler::HS, end); 1498 if (size == wordSize) { 1499 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1500 __ verify_oop(temp); 1501 } else { 1502 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1503 __ decode_heap_oop(temp); // calls verify_oop 1504 } 1505 __ add(rscratch2, rscratch2, 1); 1506 __ b(loop); 1507 __ bind(end); 1508 } 1509 1510 // Arguments: 1511 // stub_id - is used to name the stub and identify all details of 1512 // how to perform the copy. 1513 // 1514 // entry - is assigned to the stub's post push entry point unless 1515 // it is null 1516 // 1517 // Inputs: 1518 // c_rarg0 - source array address 1519 // c_rarg1 - destination array address 1520 // c_rarg2 - element count, treated as ssize_t, can be zero 1521 // 1522 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1523 // the hardware handle it. The two dwords within qwords that span 1524 // cache line boundaries will still be loaded and stored atomically. 1525 // 1526 // Side Effects: entry is set to the (post push) entry point so it 1527 // can be used by the corresponding conjoint copy 1528 // method 1529 // 1530 address generate_disjoint_copy(StubGenStubId stub_id, address *entry) { 1531 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1532 RegSet saved_reg = RegSet::of(s, d, count); 1533 int size; 1534 bool aligned; 1535 bool is_oop; 1536 bool dest_uninitialized; 1537 switch (stub_id) { 1538 case jbyte_disjoint_arraycopy_id: 1539 size = sizeof(jbyte); 1540 aligned = false; 1541 is_oop = false; 1542 dest_uninitialized = false; 1543 break; 1544 case arrayof_jbyte_disjoint_arraycopy_id: 1545 size = sizeof(jbyte); 1546 aligned = true; 1547 is_oop = false; 1548 dest_uninitialized = false; 1549 break; 1550 case jshort_disjoint_arraycopy_id: 1551 size = sizeof(jshort); 1552 aligned = false; 1553 is_oop = false; 1554 dest_uninitialized = false; 1555 break; 1556 case arrayof_jshort_disjoint_arraycopy_id: 1557 size = sizeof(jshort); 1558 aligned = true; 1559 is_oop = false; 1560 dest_uninitialized = false; 1561 break; 1562 case jint_disjoint_arraycopy_id: 1563 size = sizeof(jint); 1564 aligned = false; 1565 is_oop = false; 1566 dest_uninitialized = false; 1567 break; 1568 case arrayof_jint_disjoint_arraycopy_id: 1569 size = sizeof(jint); 1570 aligned = true; 1571 is_oop = false; 1572 dest_uninitialized = false; 1573 break; 1574 case jlong_disjoint_arraycopy_id: 1575 // since this is always aligned we can (should!) use the same 1576 // stub as for case arrayof_jlong_disjoint_arraycopy 1577 ShouldNotReachHere(); 1578 break; 1579 case arrayof_jlong_disjoint_arraycopy_id: 1580 size = sizeof(jlong); 1581 aligned = true; 1582 is_oop = false; 1583 dest_uninitialized = false; 1584 break; 1585 case oop_disjoint_arraycopy_id: 1586 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1587 aligned = !UseCompressedOops; 1588 is_oop = true; 1589 dest_uninitialized = false; 1590 break; 1591 case arrayof_oop_disjoint_arraycopy_id: 1592 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1593 aligned = !UseCompressedOops; 1594 is_oop = true; 1595 dest_uninitialized = false; 1596 break; 1597 case oop_disjoint_arraycopy_uninit_id: 1598 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1599 aligned = !UseCompressedOops; 1600 is_oop = true; 1601 dest_uninitialized = true; 1602 break; 1603 case arrayof_oop_disjoint_arraycopy_uninit_id: 1604 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1605 aligned = !UseCompressedOops; 1606 is_oop = true; 1607 dest_uninitialized = true; 1608 break; 1609 default: 1610 ShouldNotReachHere(); 1611 break; 1612 } 1613 1614 __ align(CodeEntryAlignment); 1615 StubCodeMark mark(this, stub_id); 1616 address start = __ pc(); 1617 __ enter(); 1618 1619 if (entry != nullptr) { 1620 *entry = __ pc(); 1621 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1622 BLOCK_COMMENT("Entry:"); 1623 } 1624 1625 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1626 if (dest_uninitialized) { 1627 decorators |= IS_DEST_UNINITIALIZED; 1628 } 1629 if (aligned) { 1630 decorators |= ARRAYCOPY_ALIGNED; 1631 } 1632 1633 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1634 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1635 1636 if (is_oop) { 1637 // save regs before copy_memory 1638 __ push(RegSet::of(d, count), sp); 1639 } 1640 { 1641 // UnsafeMemoryAccess page error: continue after unsafe access 1642 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1643 UnsafeMemoryAccessMark umam(this, add_entry, true); 1644 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1645 } 1646 1647 if (is_oop) { 1648 __ pop(RegSet::of(d, count), sp); 1649 if (VerifyOops) 1650 verify_oop_array(size, d, count, r16); 1651 } 1652 1653 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1654 1655 __ leave(); 1656 __ mov(r0, zr); // return 0 1657 __ ret(lr); 1658 return start; 1659 } 1660 1661 // Arguments: 1662 // stub_id - is used to name the stub and identify all details of 1663 // how to perform the copy. 1664 // 1665 // nooverlap_target - identifes the (post push) entry for the 1666 // corresponding disjoint copy routine which can be 1667 // jumped to if the ranges do not actually overlap 1668 // 1669 // entry - is assigned to the stub's post push entry point unless 1670 // it is null 1671 // 1672 // 1673 // Inputs: 1674 // c_rarg0 - source array address 1675 // c_rarg1 - destination array address 1676 // c_rarg2 - element count, treated as ssize_t, can be zero 1677 // 1678 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1679 // the hardware handle it. The two dwords within qwords that span 1680 // cache line boundaries will still be loaded and stored atomically. 
1681 // 1682 // Side Effects: 1683 // entry is set to the no-overlap entry point so it can be used by 1684 // some other conjoint copy method 1685 // 1686 address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 1687 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1688 RegSet saved_regs = RegSet::of(s, d, count); 1689 int size; 1690 bool aligned; 1691 bool is_oop; 1692 bool dest_uninitialized; 1693 switch (stub_id) { 1694 case jbyte_arraycopy_id: 1695 size = sizeof(jbyte); 1696 aligned = false; 1697 is_oop = false; 1698 dest_uninitialized = false; 1699 break; 1700 case arrayof_jbyte_arraycopy_id: 1701 size = sizeof(jbyte); 1702 aligned = true; 1703 is_oop = false; 1704 dest_uninitialized = false; 1705 break; 1706 case jshort_arraycopy_id: 1707 size = sizeof(jshort); 1708 aligned = false; 1709 is_oop = false; 1710 dest_uninitialized = false; 1711 break; 1712 case arrayof_jshort_arraycopy_id: 1713 size = sizeof(jshort); 1714 aligned = true; 1715 is_oop = false; 1716 dest_uninitialized = false; 1717 break; 1718 case jint_arraycopy_id: 1719 size = sizeof(jint); 1720 aligned = false; 1721 is_oop = false; 1722 dest_uninitialized = false; 1723 break; 1724 case arrayof_jint_arraycopy_id: 1725 size = sizeof(jint); 1726 aligned = true; 1727 is_oop = false; 1728 dest_uninitialized = false; 1729 break; 1730 case jlong_arraycopy_id: 1731 // since this is always aligned we can (should!) use the same 1732 // stub as for case arrayof_jlong_disjoint_arraycopy 1733 ShouldNotReachHere(); 1734 break; 1735 case arrayof_jlong_arraycopy_id: 1736 size = sizeof(jlong); 1737 aligned = true; 1738 is_oop = false; 1739 dest_uninitialized = false; 1740 break; 1741 case oop_arraycopy_id: 1742 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1743 aligned = !UseCompressedOops; 1744 is_oop = true; 1745 dest_uninitialized = false; 1746 break; 1747 case arrayof_oop_arraycopy_id: 1748 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1749 aligned = !UseCompressedOops; 1750 is_oop = true; 1751 dest_uninitialized = false; 1752 break; 1753 case oop_arraycopy_uninit_id: 1754 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1755 aligned = !UseCompressedOops; 1756 is_oop = true; 1757 dest_uninitialized = true; 1758 break; 1759 case arrayof_oop_arraycopy_uninit_id: 1760 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1761 aligned = !UseCompressedOops; 1762 is_oop = true; 1763 dest_uninitialized = true; 1764 break; 1765 default: 1766 ShouldNotReachHere(); 1767 } 1768 1769 StubCodeMark mark(this, stub_id); 1770 address start = __ pc(); 1771 __ enter(); 1772 1773 if (entry != nullptr) { 1774 *entry = __ pc(); 1775 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1776 BLOCK_COMMENT("Entry:"); 1777 } 1778 1779 // use fwd copy when (d-s) above_equal (count*size) 1780 __ sub(rscratch1, d, s); 1781 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1782 __ br(Assembler::HS, nooverlap_target); 1783 1784 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1785 if (dest_uninitialized) { 1786 decorators |= IS_DEST_UNINITIALIZED; 1787 } 1788 if (aligned) { 1789 decorators |= ARRAYCOPY_ALIGNED; 1790 } 1791 1792 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1793 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1794 1795 if (is_oop) { 1796 // save regs before copy_memory 1797 __ push(RegSet::of(d, count), sp); 1798 } 1799 { 1800 // UnsafeMemoryAccess page error: continue after unsafe access 1801 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1802 UnsafeMemoryAccessMark umam(this, add_entry, true); 1803 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1804 } 1805 if (is_oop) { 1806 __ pop(RegSet::of(d, count), sp); 1807 if (VerifyOops) 1808 verify_oop_array(size, d, count, r16); 1809 } 1810 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1811 __ leave(); 1812 __ mov(r0, zr); // return 0 1813 __ ret(lr); 1814 return start; 1815 } 1816 1817 // Helper for generating a dynamic type check. 1818 // Smashes rscratch1, rscratch2. 1819 void generate_type_check(Register sub_klass, 1820 Register super_check_offset, 1821 Register super_klass, 1822 Register temp1, 1823 Register temp2, 1824 Register result, 1825 Label& L_success) { 1826 assert_different_registers(sub_klass, super_check_offset, super_klass); 1827 1828 BLOCK_COMMENT("type_check:"); 1829 1830 Label L_miss; 1831 1832 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1833 super_check_offset); 1834 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1835 1836 // Fall through on failure! 
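  // Roughly, the checks emitted above amount to (illustrative only; see
  // MacroAssembler::check_klass_subtype_fast_path/slow_path for the exact
  // protocol):
  //   1. fast path: compare the word at sub_klass + super_check_offset
  //      against super_klass (the "display"/cache check)        -> L_success on a hit
  //   2. slow path: linear scan of sub_klass's secondary supers -> L_success on a hit
  //   3. otherwise fall through to L_miss below.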
1837 __ BIND(L_miss); 1838 } 1839 1840 // 1841 // Generate checkcasting array copy stub 1842 // 1843 // Input: 1844 // c_rarg0 - source array address 1845 // c_rarg1 - destination array address 1846 // c_rarg2 - element count, treated as ssize_t, can be zero 1847 // c_rarg3 - size_t ckoff (super_check_offset) 1848 // c_rarg4 - oop ckval (super_klass) 1849 // 1850 // Output: 1851 // r0 == 0 - success 1852 // r0 == -1^K - failure, where K is partial transfer count 1853 // 1854 address generate_checkcast_copy(StubGenStubId stub_id, address *entry) { 1855 bool dest_uninitialized; 1856 switch (stub_id) { 1857 case checkcast_arraycopy_id: 1858 dest_uninitialized = false; 1859 break; 1860 case checkcast_arraycopy_uninit_id: 1861 dest_uninitialized = true; 1862 break; 1863 default: 1864 ShouldNotReachHere(); 1865 } 1866 1867 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1868 1869 // Input registers (after setup_arg_regs) 1870 const Register from = c_rarg0; // source array address 1871 const Register to = c_rarg1; // destination array address 1872 const Register count = c_rarg2; // elementscount 1873 const Register ckoff = c_rarg3; // super_check_offset 1874 const Register ckval = c_rarg4; // super_klass 1875 1876 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1877 RegSet wb_post_saved_regs = RegSet::of(count); 1878 1879 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1880 const Register copied_oop = r22; // actual oop copied 1881 const Register count_save = r21; // orig elementscount 1882 const Register start_to = r20; // destination array start address 1883 const Register r19_klass = r19; // oop._klass 1884 1885 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1886 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1887 1888 //--------------------------------------------------------------- 1889 // Assembler stub will be used for this call to arraycopy 1890 // if the two arrays are subtypes of Object[] but the 1891 // destination array type is not equal to or a supertype 1892 // of the source type. Each element must be separately 1893 // checked. 1894 1895 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1896 copied_oop, r19_klass, count_save); 1897 1898 __ align(CodeEntryAlignment); 1899 StubCodeMark mark(this, stub_id); 1900 address start = __ pc(); 1901 1902 __ enter(); // required for proper stackwalking of RuntimeStub frame 1903 1904 #ifdef ASSERT 1905 // caller guarantees that the arrays really are different 1906 // otherwise, we would have to make conjoint checks 1907 { Label L; 1908 __ b(L); // conjoint check not yet implemented 1909 __ stop("checkcast_copy within a single array"); 1910 __ bind(L); 1911 } 1912 #endif //ASSERT 1913 1914 // Caller of this entry point must set up the argument registers. 1915 if (entry != nullptr) { 1916 *entry = __ pc(); 1917 BLOCK_COMMENT("Entry:"); 1918 } 1919 1920 // Empty array: Nothing to do. 1921 __ cbz(count, L_done); 1922 __ push(RegSet::of(r19, r20, r21, r22), sp); 1923 1924 #ifdef ASSERT 1925 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1926 // The ckoff and ckval must be mutually consistent, 1927 // even though caller generates both. 
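  // (Concretely: ckoff is expected to equal ckval->super_check_offset(); the
  // code below reloads that field from ckval and compares it to ckoff.)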
1928 { Label L; 1929 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1930 __ ldrw(start_to, Address(ckval, sco_offset)); 1931 __ cmpw(ckoff, start_to); 1932 __ br(Assembler::EQ, L); 1933 __ stop("super_check_offset inconsistent"); 1934 __ bind(L); 1935 } 1936 #endif //ASSERT 1937 1938 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1939 bool is_oop = true; 1940 int element_size = UseCompressedOops ? 4 : 8; 1941 if (dest_uninitialized) { 1942 decorators |= IS_DEST_UNINITIALIZED; 1943 } 1944 1945 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1946 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1947 1948 // save the original count 1949 __ mov(count_save, count); 1950 1951 // Copy from low to high addresses 1952 __ mov(start_to, to); // Save destination array start address 1953 __ b(L_load_element); 1954 1955 // ======== begin loop ======== 1956 // (Loop is rotated; its entry is L_load_element.) 1957 // Loop control: 1958 // for (; count != 0; count--) { 1959 // copied_oop = load_heap_oop(from++); 1960 // ... generate_type_check ...; 1961 // store_heap_oop(to++, copied_oop); 1962 // } 1963 __ align(OptoLoopAlignment); 1964 1965 __ BIND(L_store_element); 1966 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1967 __ post(to, element_size), copied_oop, noreg, 1968 gct1, gct2, gct3); 1969 __ sub(count, count, 1); 1970 __ cbz(count, L_do_card_marks); 1971 1972 // ======== loop entry is here ======== 1973 __ BIND(L_load_element); 1974 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1975 copied_oop, noreg, __ post(from, element_size), 1976 gct1); 1977 __ cbz(copied_oop, L_store_element); 1978 1979 __ load_klass(r19_klass, copied_oop);// query the object klass 1980 1981 BLOCK_COMMENT("type_check:"); 1982 generate_type_check(/*sub_klass*/r19_klass, 1983 /*super_check_offset*/ckoff, 1984 /*super_klass*/ckval, 1985 /*r_array_base*/gct1, 1986 /*temp2*/gct2, 1987 /*result*/r10, L_store_element); 1988 1989 // Fall through on failure! 1990 1991 // ======== end loop ======== 1992 1993 // It was a real error; we must depend on the caller to finish the job. 1994 // Register count = remaining oops, count_orig = total oops. 1995 // Emit GC store barriers for the oops we have copied and report 1996 // their number to the caller. 1997 1998 __ subs(count, count_save, count); // K = partially copied oop count 1999 __ eon(count, count, zr); // report (-1^K) to caller 2000 __ br(Assembler::EQ, L_done_pop); 2001 2002 __ BIND(L_do_card_marks); 2003 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2004 2005 __ bind(L_done_pop); 2006 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2007 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2008 2009 __ bind(L_done); 2010 __ mov(r0, count); 2011 __ leave(); 2012 __ ret(lr); 2013 2014 return start; 2015 } 2016 2017 // Perform range checks on the proposed arraycopy. 2018 // Kills temp, but nothing else. 2019 // Also, clean the sign bits of src_pos and dst_pos. 
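  // A C-level sketch of the checks generated below (illustrative only; the
  // comparisons are unsigned and 32-bit, so a negative position or length
  // also takes the failure branch):
  //
  //   if ((uint32_t)(src_pos + length) > (uint32_t)arrayOop(src)->length()) goto L_failed;
  //   if ((uint32_t)(dst_pos + length) > (uint32_t)arrayOop(dst)->length()) goto L_failed;
  //   src_pos = (uint32_t)src_pos;   // clear the high 32 bits
  //   dst_pos = (uint32_t)dst_pos;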
2020 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2021 Register src_pos, // source position (c_rarg1) 2022 Register dst, // destination array oop (c_rarg2) 2023 Register dst_pos, // destination position (c_rarg3) 2024 Register length, 2025 Register temp, 2026 Label& L_failed) { 2027 BLOCK_COMMENT("arraycopy_range_checks:"); 2028 2029 assert_different_registers(rscratch1, temp); 2030 2031 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2032 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 2033 __ addw(temp, length, src_pos); 2034 __ cmpw(temp, rscratch1); 2035 __ br(Assembler::HI, L_failed); 2036 2037 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2038 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2039 __ addw(temp, length, dst_pos); 2040 __ cmpw(temp, rscratch1); 2041 __ br(Assembler::HI, L_failed); 2042 2043 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 2044 __ movw(src_pos, src_pos); 2045 __ movw(dst_pos, dst_pos); 2046 2047 BLOCK_COMMENT("arraycopy_range_checks done"); 2048 } 2049 2050 // These stubs get called from some dumb test routine. 2051 // I'll write them properly when they're called from 2052 // something that's actually doing something. 2053 static void fake_arraycopy_stub(address src, address dst, int count) { 2054 assert(count == 0, "huh?"); 2055 } 2056 2057 2058 // 2059 // Generate 'unsafe' array copy stub 2060 // Though just as safe as the other stubs, it takes an unscaled 2061 // size_t argument instead of an element count. 2062 // 2063 // Input: 2064 // c_rarg0 - source array address 2065 // c_rarg1 - destination array address 2066 // c_rarg2 - byte count, treated as ssize_t, can be zero 2067 // 2068 // Examines the alignment of the operands and dispatches 2069 // to a long, int, short, or byte copy loop.
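  // The dispatch simply ORs the three operands together and tests the low
  // bits of the result; roughly (illustrative only):
  //
  //   size_t bits = (size_t)s | (size_t)d | (size_t)count;
  //   if ((bits & (BytesPerLong - 1)) == 0) { count >>= LogBytesPerLong;  goto long_copy;  }
  //   if ((bits & (BytesPerInt - 1))  == 0) { count >>= LogBytesPerInt;   goto int_copy;   }
  //   if ((bits & 1) == 0)                  { count >>= LogBytesPerShort; goto short_copy; }
  //   goto byte_copy;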
2070 // 2071 address generate_unsafe_copy(address byte_copy_entry, 2072 address short_copy_entry, 2073 address int_copy_entry, 2074 address long_copy_entry) { 2075 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2076 2077 Label L_long_aligned, L_int_aligned, L_short_aligned; 2078 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2079 2080 __ align(CodeEntryAlignment); 2081 StubCodeMark mark(this, stub_id); 2082 address start = __ pc(); 2083 __ enter(); // required for proper stackwalking of RuntimeStub frame 2084 2085 // bump this on entry, not on exit: 2086 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2087 2088 __ orr(rscratch1, s, d); 2089 __ orr(rscratch1, rscratch1, count); 2090 2091 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2092 __ cbz(rscratch1, L_long_aligned); 2093 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2094 __ cbz(rscratch1, L_int_aligned); 2095 __ tbz(rscratch1, 0, L_short_aligned); 2096 __ b(RuntimeAddress(byte_copy_entry)); 2097 2098 __ BIND(L_short_aligned); 2099 __ lsr(count, count, LogBytesPerShort); // size => short_count 2100 __ b(RuntimeAddress(short_copy_entry)); 2101 __ BIND(L_int_aligned); 2102 __ lsr(count, count, LogBytesPerInt); // size => int_count 2103 __ b(RuntimeAddress(int_copy_entry)); 2104 __ BIND(L_long_aligned); 2105 __ lsr(count, count, LogBytesPerLong); // size => long_count 2106 __ b(RuntimeAddress(long_copy_entry)); 2107 2108 return start; 2109 } 2110 2111 // 2112 // Generate generic array copy stubs 2113 // 2114 // Input: 2115 // c_rarg0 - src oop 2116 // c_rarg1 - src_pos (32-bits) 2117 // c_rarg2 - dst oop 2118 // c_rarg3 - dst_pos (32-bits) 2119 // c_rarg4 - element count (32-bits) 2120 // 2121 // Output: 2122 // r0 == 0 - success 2123 // r0 == -1^K - failure, where K is partial transfer count 2124 // 2125 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2126 address int_copy_entry, address oop_copy_entry, 2127 address long_copy_entry, address checkcast_copy_entry) { 2128 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2129 2130 Label L_failed, L_objArray; 2131 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2132 2133 // Input registers 2134 const Register src = c_rarg0; // source array oop 2135 const Register src_pos = c_rarg1; // source position 2136 const Register dst = c_rarg2; // destination array oop 2137 const Register dst_pos = c_rarg3; // destination position 2138 const Register length = c_rarg4; 2139 2140 2141 // Registers used as temps 2142 const Register dst_klass = c_rarg5; 2143 2144 __ align(CodeEntryAlignment); 2145 2146 StubCodeMark mark(this, stub_id); 2147 2148 address start = __ pc(); 2149 2150 __ enter(); // required for proper stackwalking of RuntimeStub frame 2151 2152 // bump this on entry, not on exit: 2153 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2154 2155 //----------------------------------------------------------------------- 2156 // Assembler stub will be used for this call to arraycopy 2157 // if the following conditions are met: 2158 // 2159 // (1) src and dst must not be null. 2160 // (2) src_pos must not be negative. 2161 // (3) dst_pos must not be negative. 2162 // (4) length must not be negative. 2163 // (5) src klass and dst klass should be the same and not null. 2164 // (6) src and dst should be arrays. 2165 // (7) src_pos + length must not exceed length of src. 2166 // (8) dst_pos + length must not exceed length of dst. 
2167 // 2168 2169 // if (src == nullptr) return -1; 2170 __ cbz(src, L_failed); 2171 2172 // if (src_pos < 0) return -1; 2173 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2174 2175 // if (dst == nullptr) return -1; 2176 __ cbz(dst, L_failed); 2177 2178 // if (dst_pos < 0) return -1; 2179 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2180 2181 // registers used as temp 2182 const Register scratch_length = r16; // elements count to copy 2183 const Register scratch_src_klass = r17; // array klass 2184 const Register lh = r15; // layout helper 2185 2186 // if (length < 0) return -1; 2187 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2188 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2189 2190 __ load_klass(scratch_src_klass, src); 2191 #ifdef ASSERT 2192 // assert(src->klass() != nullptr); 2193 { 2194 BLOCK_COMMENT("assert klasses not null {"); 2195 Label L1, L2; 2196 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2197 __ bind(L1); 2198 __ stop("broken null klass"); 2199 __ bind(L2); 2200 __ load_klass(rscratch1, dst); 2201 __ cbz(rscratch1, L1); // this would be broken also 2202 BLOCK_COMMENT("} assert klasses not null done"); 2203 } 2204 #endif 2205 2206 // Load layout helper (32-bits) 2207 // 2208 // |array_tag| | header_size | element_type | |log2_element_size| 2209 // 32 30 24 16 8 2 0 2210 // 2211 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2212 // 2213 2214 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2215 2216 // Handle objArrays completely differently... 2217 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2218 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2219 __ movw(rscratch1, objArray_lh); 2220 __ eorw(rscratch2, lh, rscratch1); 2221 __ cbzw(rscratch2, L_objArray); 2222 2223 // if (src->klass() != dst->klass()) return -1; 2224 __ load_klass(rscratch2, dst); 2225 __ eor(rscratch2, rscratch2, scratch_src_klass); 2226 __ cbnz(rscratch2, L_failed); 2227 2228 // if (!src->is_Array()) return -1; 2229 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2230 2231 // At this point, it is known to be a typeArray (array_tag 0x3). 
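  // For reference, the layout helper fields used below decode roughly as
  // (illustrative only; see Klass::layout_helper_*):
  //
  //   array_offset = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask; // header size in bytes
  //   elsize_log2  =  lh & Klass::_lh_log2_element_size_mask;                            // 0..3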
2232 #ifdef ASSERT 2233 { 2234 BLOCK_COMMENT("assert primitive array {"); 2235 Label L; 2236 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2237 __ cmpw(lh, rscratch2); 2238 __ br(Assembler::GE, L); 2239 __ stop("must be a primitive array"); 2240 __ bind(L); 2241 BLOCK_COMMENT("} assert primitive array done"); 2242 } 2243 #endif 2244 2245 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2246 rscratch2, L_failed); 2247 2248 // TypeArrayKlass 2249 // 2250 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2251 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2252 // 2253 2254 const Register rscratch1_offset = rscratch1; // array offset 2255 const Register r15_elsize = lh; // element size 2256 2257 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2258 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2259 __ add(src, src, rscratch1_offset); // src array offset 2260 __ add(dst, dst, rscratch1_offset); // dst array offset 2261 BLOCK_COMMENT("choose copy loop based on element size"); 2262 2263 // next registers should be set before the jump to corresponding stub 2264 const Register from = c_rarg0; // source array address 2265 const Register to = c_rarg1; // destination array address 2266 const Register count = c_rarg2; // elements count 2267 2268 // 'from', 'to', 'count' registers should be set in such order 2269 // since they are the same as 'src', 'src_pos', 'dst'. 2270 2271 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2272 2273 // The possible values of elsize are 0-3, i.e. exact_log2(element 2274 // size in bytes). We do a simple bitwise binary search. 2275 __ BIND(L_copy_bytes); 2276 __ tbnz(r15_elsize, 1, L_copy_ints); 2277 __ tbnz(r15_elsize, 0, L_copy_shorts); 2278 __ lea(from, Address(src, src_pos));// src_addr 2279 __ lea(to, Address(dst, dst_pos));// dst_addr 2280 __ movw(count, scratch_length); // length 2281 __ b(RuntimeAddress(byte_copy_entry)); 2282 2283 __ BIND(L_copy_shorts); 2284 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2285 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2286 __ movw(count, scratch_length); // length 2287 __ b(RuntimeAddress(short_copy_entry)); 2288 2289 __ BIND(L_copy_ints); 2290 __ tbnz(r15_elsize, 0, L_copy_longs); 2291 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2292 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2293 __ movw(count, scratch_length); // length 2294 __ b(RuntimeAddress(int_copy_entry)); 2295 2296 __ BIND(L_copy_longs); 2297 #ifdef ASSERT 2298 { 2299 BLOCK_COMMENT("assert long copy {"); 2300 Label L; 2301 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2302 __ cmpw(r15_elsize, LogBytesPerLong); 2303 __ br(Assembler::EQ, L); 2304 __ stop("must be long copy, but elsize is wrong"); 2305 __ bind(L); 2306 BLOCK_COMMENT("} assert long copy done"); 2307 } 2308 #endif 2309 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2310 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2311 __ movw(count, scratch_length); // length 2312 __ b(RuntimeAddress(long_copy_entry)); 2313 2314 // ObjArrayKlass 2315 __ BIND(L_objArray); 2316 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2317 2318 Label L_plain_copy, L_checkcast_copy; 2319 // test array classes for subtyping 2320 __ load_klass(r15, dst); 2321 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2322 __ br(Assembler::NE, L_checkcast_copy); 2323 2324 // Identically typed arrays can be copied without element-wise checks. 2325 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2326 rscratch2, L_failed); 2327 2328 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2329 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2330 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2331 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2332 __ movw(count, scratch_length); // length 2333 __ BIND(L_plain_copy); 2334 __ b(RuntimeAddress(oop_copy_entry)); 2335 2336 __ BIND(L_checkcast_copy); 2337 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2338 { 2339 // Before looking at dst.length, make sure dst is also an objArray. 2340 __ ldrw(rscratch1, Address(r15, lh_offset)); 2341 __ movw(rscratch2, objArray_lh); 2342 __ eorw(rscratch1, rscratch1, rscratch2); 2343 __ cbnzw(rscratch1, L_failed); 2344 2345 // It is safe to examine both src.length and dst.length. 2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2347 r15, L_failed); 2348 2349 __ load_klass(dst_klass, dst); // reload 2350 2351 // Marshal the base address arguments now, freeing registers. 2352 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2353 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2354 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2355 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2356 __ movw(count, length); // length (reloaded) 2357 Register sco_temp = c_rarg3; // this register is free now 2358 assert_different_registers(from, to, count, sco_temp, 2359 dst_klass, scratch_src_klass); 2360 // assert_clean_int(count, sco_temp); 2361 2362 // Generate the type check. 2363 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2364 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2365 2366 // Smashes rscratch1, rscratch2 2367 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2368 L_plain_copy); 2369 2370 // Fetch destination element klass from the ObjArrayKlass header. 2371 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2372 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2373 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2374 2375 // the checkcast_copy loop needs two extra arguments: 2376 assert(c_rarg3 == sco_temp, "#3 already in place"); 2377 // Set up arguments for checkcast_copy_entry. 2378 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2379 __ b(RuntimeAddress(checkcast_copy_entry)); 2380 } 2381 2382 __ BIND(L_failed); 2383 __ mov(r0, -1); 2384 __ leave(); // required for proper stackwalking of RuntimeStub frame 2385 __ ret(lr); 2386 2387 return start; 2388 } 2389 2390 // 2391 // Generate stub for array fill. If "aligned" is true, the 2392 // "to" address is assumed to be heapword aligned. 
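  // The fill value is first replicated to 64 bits (e.g. for T_BYTE:
  // v |= v << 8, v |= v << 16, v |= v << 32) so the bulk of the work can be
  // done with 8-byte stores; short arrays and the remaining tail are filled
  // per element or with one final (possibly overlapping) store.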
2393 // 2394 // Arguments for generated stub: 2395 // to: c_rarg0 2396 // value: c_rarg1 2397 // count: c_rarg2 treated as signed 2398 // 2399 address generate_fill(StubGenStubId stub_id) { 2400 BasicType t; 2401 bool aligned; 2402 2403 switch (stub_id) { 2404 case jbyte_fill_id: 2405 t = T_BYTE; 2406 aligned = false; 2407 break; 2408 case jshort_fill_id: 2409 t = T_SHORT; 2410 aligned = false; 2411 break; 2412 case jint_fill_id: 2413 t = T_INT; 2414 aligned = false; 2415 break; 2416 case arrayof_jbyte_fill_id: 2417 t = T_BYTE; 2418 aligned = true; 2419 break; 2420 case arrayof_jshort_fill_id: 2421 t = T_SHORT; 2422 aligned = true; 2423 break; 2424 case arrayof_jint_fill_id: 2425 t = T_INT; 2426 aligned = true; 2427 break; 2428 default: 2429 ShouldNotReachHere(); 2430 }; 2431 2432 __ align(CodeEntryAlignment); 2433 StubCodeMark mark(this, stub_id); 2434 address start = __ pc(); 2435 2436 BLOCK_COMMENT("Entry:"); 2437 2438 const Register to = c_rarg0; // source array address 2439 const Register value = c_rarg1; // value 2440 const Register count = c_rarg2; // elements count 2441 2442 const Register bz_base = r10; // base for block_zero routine 2443 const Register cnt_words = r11; // temp register 2444 2445 __ enter(); 2446 2447 Label L_fill_elements, L_exit1; 2448 2449 int shift = -1; 2450 switch (t) { 2451 case T_BYTE: 2452 shift = 0; 2453 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2454 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2455 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2456 __ br(Assembler::LO, L_fill_elements); 2457 break; 2458 case T_SHORT: 2459 shift = 1; 2460 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2461 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2462 __ br(Assembler::LO, L_fill_elements); 2463 break; 2464 case T_INT: 2465 shift = 2; 2466 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2467 __ br(Assembler::LO, L_fill_elements); 2468 break; 2469 default: ShouldNotReachHere(); 2470 } 2471 2472 // Align source address at 8 bytes address boundary. 2473 Label L_skip_align1, L_skip_align2, L_skip_align4; 2474 if (!aligned) { 2475 switch (t) { 2476 case T_BYTE: 2477 // One byte misalignment happens only for byte arrays. 2478 __ tbz(to, 0, L_skip_align1); 2479 __ strb(value, Address(__ post(to, 1))); 2480 __ subw(count, count, 1); 2481 __ bind(L_skip_align1); 2482 // Fallthrough 2483 case T_SHORT: 2484 // Two bytes misalignment happens only for byte and short (char) arrays. 2485 __ tbz(to, 1, L_skip_align2); 2486 __ strh(value, Address(__ post(to, 2))); 2487 __ subw(count, count, 2 >> shift); 2488 __ bind(L_skip_align2); 2489 // Fallthrough 2490 case T_INT: 2491 // Align to 8 bytes, we know we are 4 byte aligned to start. 2492 __ tbz(to, 2, L_skip_align4); 2493 __ strw(value, Address(__ post(to, 4))); 2494 __ subw(count, count, 4 >> shift); 2495 __ bind(L_skip_align4); 2496 break; 2497 default: ShouldNotReachHere(); 2498 } 2499 } 2500 2501 // 2502 // Fill large chunks 2503 // 2504 __ lsrw(cnt_words, count, 3 - shift); // number of words 2505 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2506 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2507 if (UseBlockZeroing) { 2508 Label non_block_zeroing, rest; 2509 // If the fill value is zero we can use the fast zero_words(). 
2510 __ cbnz(value, non_block_zeroing); 2511 __ mov(bz_base, to); 2512 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2513 address tpc = __ zero_words(bz_base, cnt_words); 2514 if (tpc == nullptr) { 2515 fatal("CodeCache is full at generate_fill"); 2516 } 2517 __ b(rest); 2518 __ bind(non_block_zeroing); 2519 __ fill_words(to, cnt_words, value); 2520 __ bind(rest); 2521 } else { 2522 __ fill_words(to, cnt_words, value); 2523 } 2524 2525 // Remaining count is less than 8 bytes. Fill it by a single store. 2526 // Note that the total length is no less than 8 bytes. 2527 if (t == T_BYTE || t == T_SHORT) { 2528 Label L_exit1; 2529 __ cbzw(count, L_exit1); 2530 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2531 __ str(value, Address(to, -8)); // overwrite some elements 2532 __ bind(L_exit1); 2533 __ leave(); 2534 __ ret(lr); 2535 } 2536 2537 // Handle copies less than 8 bytes. 2538 Label L_fill_2, L_fill_4, L_exit2; 2539 __ bind(L_fill_elements); 2540 switch (t) { 2541 case T_BYTE: 2542 __ tbz(count, 0, L_fill_2); 2543 __ strb(value, Address(__ post(to, 1))); 2544 __ bind(L_fill_2); 2545 __ tbz(count, 1, L_fill_4); 2546 __ strh(value, Address(__ post(to, 2))); 2547 __ bind(L_fill_4); 2548 __ tbz(count, 2, L_exit2); 2549 __ strw(value, Address(to)); 2550 break; 2551 case T_SHORT: 2552 __ tbz(count, 0, L_fill_4); 2553 __ strh(value, Address(__ post(to, 2))); 2554 __ bind(L_fill_4); 2555 __ tbz(count, 1, L_exit2); 2556 __ strw(value, Address(to)); 2557 break; 2558 case T_INT: 2559 __ cbzw(count, L_exit2); 2560 __ strw(value, Address(to)); 2561 break; 2562 default: ShouldNotReachHere(); 2563 } 2564 __ bind(L_exit2); 2565 __ leave(); 2566 __ ret(lr); 2567 return start; 2568 } 2569 2570 address generate_data_cache_writeback() { 2571 const Register line = c_rarg0; // address of line to write back 2572 2573 __ align(CodeEntryAlignment); 2574 2575 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id; 2576 StubCodeMark mark(this, stub_id); 2577 2578 address start = __ pc(); 2579 __ enter(); 2580 __ cache_wb(Address(line, 0)); 2581 __ leave(); 2582 __ ret(lr); 2583 2584 return start; 2585 } 2586 2587 address generate_data_cache_writeback_sync() { 2588 const Register is_pre = c_rarg0; // pre or post sync 2589 2590 __ align(CodeEntryAlignment); 2591 2592 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id; 2593 StubCodeMark mark(this, stub_id); 2594 2595 // pre wbsync is a no-op 2596 // post wbsync translates to an sfence 2597 2598 Label skip; 2599 address start = __ pc(); 2600 __ enter(); 2601 __ cbnz(is_pre, skip); 2602 __ cache_wbsync(false); 2603 __ bind(skip); 2604 __ leave(); 2605 __ ret(lr); 2606 2607 return start; 2608 } 2609 2610 void generate_arraycopy_stubs() { 2611 address entry; 2612 address entry_jbyte_arraycopy; 2613 address entry_jshort_arraycopy; 2614 address entry_jint_arraycopy; 2615 address entry_oop_arraycopy; 2616 address entry_jlong_arraycopy; 2617 address entry_checkcast_arraycopy; 2618 2619 generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2620 generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2621 2622 generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2623 generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2624 2625 generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 
2626 generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2627 2628 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2629 2630 //*** jbyte 2631 // Always need aligned and unaligned versions 2632 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry); 2633 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2634 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry); 2635 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr); 2636 2637 //*** jshort 2638 // Always need aligned and unaligned versions 2639 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry); 2640 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2641 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry); 2642 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr); 2643 2644 //*** jint 2645 // Aligned versions 2646 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry); 2647 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2648 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2649 // entry_jint_arraycopy always points to the unaligned version 2650 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry); 2651 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy); 2652 2653 //*** jlong 2654 // It is always aligned 2655 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry); 2656 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2657 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2658 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2659 2660 //*** oops 2661 { 2662 // With compressed oops we need unaligned versions; notice that 2663 // we overwrite entry_oop_arraycopy. 
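      // (With compressed oops the element size is 4 bytes, so the "arrayof"
      // stubs generated here also serve as the unaligned versions; the plain
      // _oop_* entries published below simply alias them.)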
2664 bool aligned = !UseCompressedOops; 2665 2666 StubRoutines::_arrayof_oop_disjoint_arraycopy 2667 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry); 2668 StubRoutines::_arrayof_oop_arraycopy 2669 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2670 // Aligned versions without pre-barriers 2671 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2672 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2673 StubRoutines::_arrayof_oop_arraycopy_uninit 2674 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2675 } 2676 2677 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2678 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2679 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2680 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2681 2682 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2683 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr); 2684 2685 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2686 entry_jshort_arraycopy, 2687 entry_jint_arraycopy, 2688 entry_jlong_arraycopy); 2689 2690 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2691 entry_jshort_arraycopy, 2692 entry_jint_arraycopy, 2693 entry_oop_arraycopy, 2694 entry_jlong_arraycopy, 2695 entry_checkcast_arraycopy); 2696 2697 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 2698 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 2699 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 2700 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 2701 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 2702 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 2703 } 2704 2705 void generate_math_stubs() { Unimplemented(); } 2706 2707 // Arguments: 2708 // 2709 // Inputs: 2710 // c_rarg0 - source byte array address 2711 // c_rarg1 - destination byte array address 2712 // c_rarg2 - K (key) in little endian int array 2713 // 2714 address generate_aescrypt_encryptBlock() { 2715 __ align(CodeEntryAlignment); 2716 StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2717 StubCodeMark mark(this, stub_id); 2718 2719 const Register from = c_rarg0; // source array address 2720 const Register to = c_rarg1; // destination array address 2721 const Register key = c_rarg2; // key array address 2722 const Register keylen = rscratch1; 2723 2724 address start = __ pc(); 2725 __ enter(); 2726 2727 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2728 2729 __ aesenc_loadkeys(key, keylen); 2730 __ aesecb_encrypt(from, to, keylen); 2731 2732 __ mov(r0, 0); 2733 2734 __ leave(); 2735 __ ret(lr); 2736 2737 return start; 2738 } 2739 2740 // Arguments: 2741 // 2742 // Inputs: 2743 // c_rarg0 - source byte array address 2744 // c_rarg1 - destination byte array address 2745 // c_rarg2 - K (key) in little endian int array 2746 // 2747 address generate_aescrypt_decryptBlock() { 2748 assert(UseAES, "need 
AES cryptographic extension support"); 2749 __ align(CodeEntryAlignment); 2750 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2751 StubCodeMark mark(this, stub_id); 2752 Label L_doLast; 2753 2754 const Register from = c_rarg0; // source array address 2755 const Register to = c_rarg1; // destination array address 2756 const Register key = c_rarg2; // key array address 2757 const Register keylen = rscratch1; 2758 2759 address start = __ pc(); 2760 __ enter(); // required for proper stackwalking of RuntimeStub frame 2761 2762 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2763 2764 __ aesecb_decrypt(from, to, key, keylen); 2765 2766 __ mov(r0, 0); 2767 2768 __ leave(); 2769 __ ret(lr); 2770 2771 return start; 2772 } 2773 2774 // Arguments: 2775 // 2776 // Inputs: 2777 // c_rarg0 - source byte array address 2778 // c_rarg1 - destination byte array address 2779 // c_rarg2 - K (key) in little endian int array 2780 // c_rarg3 - r vector byte array address 2781 // c_rarg4 - input length 2782 // 2783 // Output: 2784 // x0 - input length 2785 // 2786 address generate_cipherBlockChaining_encryptAESCrypt() { 2787 assert(UseAES, "need AES cryptographic extension support"); 2788 __ align(CodeEntryAlignment); 2789 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id; 2790 StubCodeMark mark(this, stub_id); 2791 2792 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2793 2794 const Register from = c_rarg0; // source array address 2795 const Register to = c_rarg1; // destination array address 2796 const Register key = c_rarg2; // key array address 2797 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2798 // and left with the results of the last encryption block 2799 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2800 const Register keylen = rscratch1; 2801 2802 address start = __ pc(); 2803 2804 __ enter(); 2805 2806 __ movw(rscratch2, len_reg); 2807 2808 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2809 2810 __ ld1(v0, __ T16B, rvec); 2811 2812 __ cmpw(keylen, 52); 2813 __ br(Assembler::CC, L_loadkeys_44); 2814 __ br(Assembler::EQ, L_loadkeys_52); 2815 2816 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2817 __ rev32(v17, __ T16B, v17); 2818 __ rev32(v18, __ T16B, v18); 2819 __ BIND(L_loadkeys_52); 2820 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2821 __ rev32(v19, __ T16B, v19); 2822 __ rev32(v20, __ T16B, v20); 2823 __ BIND(L_loadkeys_44); 2824 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2825 __ rev32(v21, __ T16B, v21); 2826 __ rev32(v22, __ T16B, v22); 2827 __ rev32(v23, __ T16B, v23); 2828 __ rev32(v24, __ T16B, v24); 2829 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2830 __ rev32(v25, __ T16B, v25); 2831 __ rev32(v26, __ T16B, v26); 2832 __ rev32(v27, __ T16B, v27); 2833 __ rev32(v28, __ T16B, v28); 2834 __ ld1(v29, v30, v31, __ T16B, key); 2835 __ rev32(v29, __ T16B, v29); 2836 __ rev32(v30, __ T16B, v30); 2837 __ rev32(v31, __ T16B, v31); 2838 2839 __ BIND(L_aes_loop); 2840 __ ld1(v1, __ T16B, __ post(from, 16)); 2841 __ eor(v0, __ T16B, v0, v1); 2842 2843 __ br(Assembler::CC, L_rounds_44); 2844 __ br(Assembler::EQ, L_rounds_52); 2845 2846 __ aese(v0, v17); __ aesmc(v0, v0); 2847 __ aese(v0, v18); __ aesmc(v0, v0); 2848 __ BIND(L_rounds_52); 2849 __ aese(v0, v19); __ aesmc(v0, v0); 2850 __ aese(v0, v20); 
__ aesmc(v0, v0); 2851 __ BIND(L_rounds_44); 2852 __ aese(v0, v21); __ aesmc(v0, v0); 2853 __ aese(v0, v22); __ aesmc(v0, v0); 2854 __ aese(v0, v23); __ aesmc(v0, v0); 2855 __ aese(v0, v24); __ aesmc(v0, v0); 2856 __ aese(v0, v25); __ aesmc(v0, v0); 2857 __ aese(v0, v26); __ aesmc(v0, v0); 2858 __ aese(v0, v27); __ aesmc(v0, v0); 2859 __ aese(v0, v28); __ aesmc(v0, v0); 2860 __ aese(v0, v29); __ aesmc(v0, v0); 2861 __ aese(v0, v30); 2862 __ eor(v0, __ T16B, v0, v31); 2863 2864 __ st1(v0, __ T16B, __ post(to, 16)); 2865 2866 __ subw(len_reg, len_reg, 16); 2867 __ cbnzw(len_reg, L_aes_loop); 2868 2869 __ st1(v0, __ T16B, rvec); 2870 2871 __ mov(r0, rscratch2); 2872 2873 __ leave(); 2874 __ ret(lr); 2875 2876 return start; 2877 } 2878 2879 // Arguments: 2880 // 2881 // Inputs: 2882 // c_rarg0 - source byte array address 2883 // c_rarg1 - destination byte array address 2884 // c_rarg2 - K (key) in little endian int array 2885 // c_rarg3 - r vector byte array address 2886 // c_rarg4 - input length 2887 // 2888 // Output: 2889 // r0 - input length 2890 // 2891 address generate_cipherBlockChaining_decryptAESCrypt() { 2892 assert(UseAES, "need AES cryptographic extension support"); 2893 __ align(CodeEntryAlignment); 2894 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id; 2895 StubCodeMark mark(this, stub_id); 2896 2897 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2898 2899 const Register from = c_rarg0; // source array address 2900 const Register to = c_rarg1; // destination array address 2901 const Register key = c_rarg2; // key array address 2902 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2903 // and left with the results of the last encryption block 2904 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2905 const Register keylen = rscratch1; 2906 2907 address start = __ pc(); 2908 2909 __ enter(); 2910 2911 __ movw(rscratch2, len_reg); 2912 2913 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2914 2915 __ ld1(v2, __ T16B, rvec); 2916 2917 __ ld1(v31, __ T16B, __ post(key, 16)); 2918 __ rev32(v31, __ T16B, v31); 2919 2920 __ cmpw(keylen, 52); 2921 __ br(Assembler::CC, L_loadkeys_44); 2922 __ br(Assembler::EQ, L_loadkeys_52); 2923 2924 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2925 __ rev32(v17, __ T16B, v17); 2926 __ rev32(v18, __ T16B, v18); 2927 __ BIND(L_loadkeys_52); 2928 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2929 __ rev32(v19, __ T16B, v19); 2930 __ rev32(v20, __ T16B, v20); 2931 __ BIND(L_loadkeys_44); 2932 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2933 __ rev32(v21, __ T16B, v21); 2934 __ rev32(v22, __ T16B, v22); 2935 __ rev32(v23, __ T16B, v23); 2936 __ rev32(v24, __ T16B, v24); 2937 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2938 __ rev32(v25, __ T16B, v25); 2939 __ rev32(v26, __ T16B, v26); 2940 __ rev32(v27, __ T16B, v27); 2941 __ rev32(v28, __ T16B, v28); 2942 __ ld1(v29, v30, __ T16B, key); 2943 __ rev32(v29, __ T16B, v29); 2944 __ rev32(v30, __ T16B, v30); 2945 2946 __ BIND(L_aes_loop); 2947 __ ld1(v0, __ T16B, __ post(from, 16)); 2948 __ orr(v1, __ T16B, v0, v0); 2949 2950 __ br(Assembler::CC, L_rounds_44); 2951 __ br(Assembler::EQ, L_rounds_52); 2952 2953 __ aesd(v0, v17); __ aesimc(v0, v0); 2954 __ aesd(v0, v18); __ aesimc(v0, v0); 2955 __ BIND(L_rounds_52); 2956 __ aesd(v0, v19); __ aesimc(v0, v0); 2957 __ aesd(v0, v20); __ aesimc(v0, v0); 2958 
__ BIND(L_rounds_44); 2959 __ aesd(v0, v21); __ aesimc(v0, v0); 2960 __ aesd(v0, v22); __ aesimc(v0, v0); 2961 __ aesd(v0, v23); __ aesimc(v0, v0); 2962 __ aesd(v0, v24); __ aesimc(v0, v0); 2963 __ aesd(v0, v25); __ aesimc(v0, v0); 2964 __ aesd(v0, v26); __ aesimc(v0, v0); 2965 __ aesd(v0, v27); __ aesimc(v0, v0); 2966 __ aesd(v0, v28); __ aesimc(v0, v0); 2967 __ aesd(v0, v29); __ aesimc(v0, v0); 2968 __ aesd(v0, v30); 2969 __ eor(v0, __ T16B, v0, v31); 2970 __ eor(v0, __ T16B, v0, v2); 2971 2972 __ st1(v0, __ T16B, __ post(to, 16)); 2973 __ orr(v2, __ T16B, v1, v1); 2974 2975 __ subw(len_reg, len_reg, 16); 2976 __ cbnzw(len_reg, L_aes_loop); 2977 2978 __ st1(v2, __ T16B, rvec); 2979 2980 __ mov(r0, rscratch2); 2981 2982 __ leave(); 2983 __ ret(lr); 2984 2985 return start; 2986 } 2987 2988 // Big-endian 128-bit + 64-bit -> 128-bit addition. 2989 // Inputs: 128-bits. in is preserved. 2990 // The least-significant 64-bit word is in the upper dword of each vector. 2991 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 2992 // Output: result 2993 void be_add_128_64(FloatRegister result, FloatRegister in, 2994 FloatRegister inc, FloatRegister tmp) { 2995 assert_different_registers(result, tmp, inc); 2996 2997 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 2998 // input 2999 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3000 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3001 // MSD == 0 (must be!) to LSD 3002 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3003 } 3004 3005 // CTR AES crypt. 3006 // Arguments: 3007 // 3008 // Inputs: 3009 // c_rarg0 - source byte array address 3010 // c_rarg1 - destination byte array address 3011 // c_rarg2 - K (key) in little endian int array 3012 // c_rarg3 - counter vector byte array address 3013 // c_rarg4 - input length 3014 // c_rarg5 - saved encryptedCounter start 3015 // c_rarg6 - saved used length 3016 // 3017 // Output: 3018 // r0 - input length 3019 // 3020 address generate_counterMode_AESCrypt() { 3021 const Register in = c_rarg0; 3022 const Register out = c_rarg1; 3023 const Register key = c_rarg2; 3024 const Register counter = c_rarg3; 3025 const Register saved_len = c_rarg4, len = r10; 3026 const Register saved_encrypted_ctr = c_rarg5; 3027 const Register used_ptr = c_rarg6, used = r12; 3028 3029 const Register offset = r7; 3030 const Register keylen = r11; 3031 3032 const unsigned char block_size = 16; 3033 const int bulk_width = 4; 3034 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3035 // performance with larger data sizes, but it also means that the 3036 // fast path isn't used until you have at least 8 blocks, and up 3037 // to 127 bytes of data will be executed on the slow path. For 3038 // that reason, and also so as not to blow away too much icache, 4 3039 // blocks seems like a sensible compromise. 
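    // Per 16-byte block the stub effectively computes (illustrative only):
    //
    //   keystream = AES_encrypt(key, counter);      // ECB-encrypt the counter block
    //   counter   = big_endian_increment(counter);  // see be_add_128_64 above
    //   out       = in ^ keystream;                 // partial blocks go byte by byte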
3040 3041 // Algorithm: 3042 // 3043 // if (len == 0) { 3044 // goto DONE; 3045 // } 3046 // int result = len; 3047 // do { 3048 // if (used >= blockSize) { 3049 // if (len >= bulk_width * blockSize) { 3050 // CTR_large_block(); 3051 // if (len == 0) 3052 // goto DONE; 3053 // } 3054 // for (;;) { 3055 // 16ByteVector v0 = counter; 3056 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3057 // used = 0; 3058 // if (len < blockSize) 3059 // break; /* goto NEXT */ 3060 // 16ByteVector v1 = load16Bytes(in, offset); 3061 // v1 = v1 ^ encryptedCounter; 3062 // store16Bytes(out, offset); 3063 // used = blockSize; 3064 // offset += blockSize; 3065 // len -= blockSize; 3066 // if (len == 0) 3067 // goto DONE; 3068 // } 3069 // } 3070 // NEXT: 3071 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3072 // len--; 3073 // } while (len != 0); 3074 // DONE: 3075 // return result; 3076 // 3077 // CTR_large_block() 3078 // Wide bulk encryption of whole blocks. 3079 3080 __ align(CodeEntryAlignment); 3081 StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id; 3082 StubCodeMark mark(this, stub_id); 3083 const address start = __ pc(); 3084 __ enter(); 3085 3086 Label DONE, CTR_large_block, large_block_return; 3087 __ ldrw(used, Address(used_ptr)); 3088 __ cbzw(saved_len, DONE); 3089 3090 __ mov(len, saved_len); 3091 __ mov(offset, 0); 3092 3093 // Compute #rounds for AES based on the length of the key array 3094 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3095 3096 __ aesenc_loadkeys(key, keylen); 3097 3098 { 3099 Label L_CTR_loop, NEXT; 3100 3101 __ bind(L_CTR_loop); 3102 3103 __ cmp(used, block_size); 3104 __ br(__ LO, NEXT); 3105 3106 // Maybe we have a lot of data 3107 __ subsw(rscratch1, len, bulk_width * block_size); 3108 __ br(__ HS, CTR_large_block); 3109 __ BIND(large_block_return); 3110 __ cbzw(len, DONE); 3111 3112 // Setup the counter 3113 __ movi(v4, __ T4S, 0); 3114 __ movi(v5, __ T4S, 1); 3115 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3116 3117 // 128-bit big-endian increment 3118 __ ld1(v0, __ T16B, counter); 3119 __ rev64(v16, __ T16B, v0); 3120 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3121 __ rev64(v16, __ T16B, v16); 3122 __ st1(v16, __ T16B, counter); 3123 // Previous counter value is in v0 3124 // v4 contains { 0, 1 } 3125 3126 { 3127 // We have fewer than bulk_width blocks of data left. Encrypt 3128 // them one by one until there is less than a full block 3129 // remaining, being careful to save both the encrypted counter 3130 // and the counter. 3131 3132 Label inner_loop; 3133 __ bind(inner_loop); 3134 // Counter to encrypt is in v0 3135 __ aesecb_encrypt(noreg, noreg, keylen); 3136 __ st1(v0, __ T16B, saved_encrypted_ctr); 3137 3138 // Do we have a remaining full block? 
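      // (If len >= block_size we XOR a whole 16-byte block of input with the
      // encrypted counter; otherwise we branch to the byte-at-a-time path at
      // NEXT with used = 0.)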
3139 3140 __ mov(used, 0); 3141 __ cmp(len, block_size); 3142 __ br(__ LO, NEXT); 3143 3144 // Yes, we have a full block 3145 __ ldrq(v1, Address(in, offset)); 3146 __ eor(v1, __ T16B, v1, v0); 3147 __ strq(v1, Address(out, offset)); 3148 __ mov(used, block_size); 3149 __ add(offset, offset, block_size); 3150 3151 __ subw(len, len, block_size); 3152 __ cbzw(len, DONE); 3153 3154 // Increment the counter, store it back 3155 __ orr(v0, __ T16B, v16, v16); 3156 __ rev64(v16, __ T16B, v16); 3157 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3158 __ rev64(v16, __ T16B, v16); 3159 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3160 3161 __ b(inner_loop); 3162 } 3163 3164 __ BIND(NEXT); 3165 3166 // Encrypt a single byte, and loop. 3167 // We expect this to be a rare event. 3168 __ ldrb(rscratch1, Address(in, offset)); 3169 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3170 __ eor(rscratch1, rscratch1, rscratch2); 3171 __ strb(rscratch1, Address(out, offset)); 3172 __ add(offset, offset, 1); 3173 __ add(used, used, 1); 3174 __ subw(len, len,1); 3175 __ cbnzw(len, L_CTR_loop); 3176 } 3177 3178 __ bind(DONE); 3179 __ strw(used, Address(used_ptr)); 3180 __ mov(r0, saved_len); 3181 3182 __ leave(); // required for proper stackwalking of RuntimeStub frame 3183 __ ret(lr); 3184 3185 // Bulk encryption 3186 3187 __ BIND (CTR_large_block); 3188 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3189 3190 if (bulk_width == 8) { 3191 __ sub(sp, sp, 4 * 16); 3192 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3193 } 3194 __ sub(sp, sp, 4 * 16); 3195 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3196 RegSet saved_regs = (RegSet::of(in, out, offset) 3197 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3198 __ push(saved_regs, sp); 3199 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3200 __ add(in, in, offset); 3201 __ add(out, out, offset); 3202 3203 // Keys should already be loaded into the correct registers 3204 3205 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3206 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3207 3208 // AES/CTR loop 3209 { 3210 Label L_CTR_loop; 3211 __ BIND(L_CTR_loop); 3212 3213 // Setup the counters 3214 __ movi(v8, __ T4S, 0); 3215 __ movi(v9, __ T4S, 1); 3216 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3217 3218 for (int i = 0; i < bulk_width; i++) { 3219 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3220 __ rev64(v0_ofs, __ T16B, v16); 3221 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3222 } 3223 3224 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3225 3226 // Encrypt the counters 3227 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3228 3229 if (bulk_width == 8) { 3230 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3231 } 3232 3233 // XOR the encrypted counters with the inputs 3234 for (int i = 0; i < bulk_width; i++) { 3235 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3236 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3237 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3238 } 3239 3240 // Write the encrypted data 3241 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3242 if (bulk_width == 8) { 3243 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3244 } 3245 3246 __ subw(len, len, 16 * bulk_width); 3247 __ cbnzw(len, L_CTR_loop); 3248 } 3249 3250 // Save the counter back where it goes 3251 __ rev64(v16, __ T16B, v16); 3252 __ st1(v16, __ T16B, counter); 3253 3254 __ pop(saved_regs, sp); 
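    // Restore the SIMD registers (v8..v11, plus v12..v15 when bulk_width == 8)
    // that were spilled to the stack on entry to CTR_large_block.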
3255 3256 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3257 if (bulk_width == 8) { 3258 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3259 } 3260 3261 __ andr(rscratch1, len, -16 * bulk_width); 3262 __ sub(len, len, rscratch1); 3263 __ add(offset, offset, rscratch1); 3264 __ mov(used, 16); 3265 __ strw(used, Address(used_ptr)); 3266 __ b(large_block_return); 3267 3268 return start; 3269 } 3270 3271 // Vector AES Galois Counter Mode implementation. Parameters: 3272 // 3273 // in = c_rarg0 3274 // len = c_rarg1 3275 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3276 // out = c_rarg3 3277 // key = c_rarg4 3278 // state = c_rarg5 - GHASH.state 3279 // subkeyHtbl = c_rarg6 - powers of H 3280 // counter = c_rarg7 - 16 bytes of CTR 3281 // return - number of processed bytes 3282 address generate_galoisCounterMode_AESCrypt() { 3283 address ghash_polynomial = __ pc(); 3284 __ emit_int64(0x87); // The low-order bits of the field 3285 // polynomial (i.e. p = z^7+z^2+z+1) 3286 // repeated in the low and high parts of a 3287 // 128-bit vector 3288 __ emit_int64(0x87); 3289 3290 __ align(CodeEntryAlignment); 3291 StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id; 3292 StubCodeMark mark(this, stub_id); 3293 address start = __ pc(); 3294 __ enter(); 3295 3296 const Register in = c_rarg0; 3297 const Register len = c_rarg1; 3298 const Register ct = c_rarg2; 3299 const Register out = c_rarg3; 3300 // and updated with the incremented counter in the end 3301 3302 const Register key = c_rarg4; 3303 const Register state = c_rarg5; 3304 3305 const Register subkeyHtbl = c_rarg6; 3306 3307 const Register counter = c_rarg7; 3308 3309 const Register keylen = r10; 3310 // Save state before entering routine 3311 __ sub(sp, sp, 4 * 16); 3312 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3313 __ sub(sp, sp, 4 * 16); 3314 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3315 3316 // __ andr(len, len, -512); 3317 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3318 __ str(len, __ pre(sp, -2 * wordSize)); 3319 3320 Label DONE; 3321 __ cbz(len, DONE); 3322 3323 // Compute #rounds for AES based on the length of the key array 3324 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3325 3326 __ aesenc_loadkeys(key, keylen); 3327 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3328 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3329 3330 // AES/CTR loop 3331 { 3332 Label L_CTR_loop; 3333 __ BIND(L_CTR_loop); 3334 3335 // Setup the counters 3336 __ movi(v8, __ T4S, 0); 3337 __ movi(v9, __ T4S, 1); 3338 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3339 3340 assert(v0->encoding() < v8->encoding(), ""); 3341 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3342 FloatRegister f = as_FloatRegister(i); 3343 __ rev32(f, __ T16B, v16); 3344 __ addv(v16, __ T4S, v16, v8); 3345 } 3346 3347 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3348 3349 // Encrypt the counters 3350 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3351 3352 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3353 3354 // XOR the encrypted counters with the inputs 3355 for (int i = 0; i < 8; i++) { 3356 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3357 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3358 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3359 } 3360 __ st1(v0, v1, v2, v3, __ T16B, __ 
post(out, 4 * 16)); 3361 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3362 3363 __ subw(len, len, 16 * 8); 3364 __ cbnzw(len, L_CTR_loop); 3365 } 3366 3367 __ rev32(v16, __ T16B, v16); 3368 __ st1(v16, __ T16B, counter); 3369 3370 __ ldr(len, Address(sp)); 3371 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3372 3373 // GHASH/CTR loop 3374 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3375 len, /*unrolls*/4); 3376 3377 #ifdef ASSERT 3378 { Label L; 3379 __ cmp(len, (unsigned char)0); 3380 __ br(Assembler::EQ, L); 3381 __ stop("stubGenerator: abort"); 3382 __ bind(L); 3383 } 3384 #endif 3385 3386 __ bind(DONE); 3387 // Return the number of bytes processed 3388 __ ldr(r0, __ post(sp, 2 * wordSize)); 3389 3390 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3391 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3392 3393 __ leave(); // required for proper stackwalking of RuntimeStub frame 3394 __ ret(lr); 3395 return start; 3396 } 3397 3398 class Cached64Bytes { 3399 private: 3400 MacroAssembler *_masm; 3401 Register _regs[8]; 3402 3403 public: 3404 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3405 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3406 auto it = rs.begin(); 3407 for (auto &r: _regs) { 3408 r = *it; 3409 ++it; 3410 } 3411 } 3412 3413 void gen_loads(Register base) { 3414 for (int i = 0; i < 8; i += 2) { 3415 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3416 } 3417 } 3418 3419 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3420 void extract_u32(Register dest, int i) { 3421 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3422 } 3423 }; 3424 3425 // Utility routines for md5. 3426 // Clobbers r10 and r11. 
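  //
  // For reference only (not emitted code, and not part of the original routine): a plain C
  // sketch of the standard MD5 (RFC 1321) step that md5_FF/md5_GG/md5_HH/md5_II below
  // generate AArch64 code for. Here xk stands for the message word fetched via
  // reg_cache.extract_u32(..., k); rotl32 and md5_ff_ref are illustrative names, not
  // HotSpot helpers.
  //
  //   static inline uint32_t rotl32(uint32_t v, int s) {
  //     return (v << s) | (v >> (32 - s));
  //   }
  //   // One FF step: r1 = r2 + rotl32(r1 + F(r2, r3, r4) + xk + t, s)
  //   static inline uint32_t md5_ff_ref(uint32_t r1, uint32_t r2, uint32_t r3, uint32_t r4,
  //                                     uint32_t xk, int s, uint32_t t) {
  //     return r2 + rotl32(r1 + ((r2 & r3) | (~r2 & r4)) + xk + t, s);
  //   }
  //   // GG uses (r2 & r4) | (r3 & ~r4), HH uses r2 ^ r3 ^ r4, and II uses r3 ^ (r2 | ~r4)
  //   // in place of the (r2 & r3) | (~r2 & r4) term above; everything else is identical.
  //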
3427 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3428 int k, int s, int t) { 3429 Register rscratch3 = r10; 3430 Register rscratch4 = r11; 3431 3432 __ eorw(rscratch3, r3, r4); 3433 __ movw(rscratch2, t); 3434 __ andw(rscratch3, rscratch3, r2); 3435 __ addw(rscratch4, r1, rscratch2); 3436 reg_cache.extract_u32(rscratch1, k); 3437 __ eorw(rscratch3, rscratch3, r4); 3438 __ addw(rscratch4, rscratch4, rscratch1); 3439 __ addw(rscratch3, rscratch3, rscratch4); 3440 __ rorw(rscratch2, rscratch3, 32 - s); 3441 __ addw(r1, rscratch2, r2); 3442 } 3443 3444 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3445 int k, int s, int t) { 3446 Register rscratch3 = r10; 3447 Register rscratch4 = r11; 3448 3449 reg_cache.extract_u32(rscratch1, k); 3450 __ movw(rscratch2, t); 3451 __ addw(rscratch4, r1, rscratch2); 3452 __ addw(rscratch4, rscratch4, rscratch1); 3453 __ bicw(rscratch2, r3, r4); 3454 __ andw(rscratch3, r2, r4); 3455 __ addw(rscratch2, rscratch2, rscratch4); 3456 __ addw(rscratch2, rscratch2, rscratch3); 3457 __ rorw(rscratch2, rscratch2, 32 - s); 3458 __ addw(r1, rscratch2, r2); 3459 } 3460 3461 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3462 int k, int s, int t) { 3463 Register rscratch3 = r10; 3464 Register rscratch4 = r11; 3465 3466 __ eorw(rscratch3, r3, r4); 3467 __ movw(rscratch2, t); 3468 __ addw(rscratch4, r1, rscratch2); 3469 reg_cache.extract_u32(rscratch1, k); 3470 __ eorw(rscratch3, rscratch3, r2); 3471 __ addw(rscratch4, rscratch4, rscratch1); 3472 __ addw(rscratch3, rscratch3, rscratch4); 3473 __ rorw(rscratch2, rscratch3, 32 - s); 3474 __ addw(r1, rscratch2, r2); 3475 } 3476 3477 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3478 int k, int s, int t) { 3479 Register rscratch3 = r10; 3480 Register rscratch4 = r11; 3481 3482 __ movw(rscratch3, t); 3483 __ ornw(rscratch2, r2, r4); 3484 __ addw(rscratch4, r1, rscratch3); 3485 reg_cache.extract_u32(rscratch1, k); 3486 __ eorw(rscratch3, rscratch2, r3); 3487 __ addw(rscratch4, rscratch4, rscratch1); 3488 __ addw(rscratch3, rscratch3, rscratch4); 3489 __ rorw(rscratch2, rscratch3, 32 - s); 3490 __ addw(r1, rscratch2, r2); 3491 } 3492 3493 // Arguments: 3494 // 3495 // Inputs: 3496 // c_rarg0 - byte[] source+offset 3497 // c_rarg1 - int[] SHA.state 3498 // c_rarg2 - int offset 3499 // c_rarg3 - int limit 3500 // 3501 address generate_md5_implCompress(StubGenStubId stub_id) { 3502 bool multi_block; 3503 switch (stub_id) { 3504 case md5_implCompress_id: 3505 multi_block = false; 3506 break; 3507 case md5_implCompressMB_id: 3508 multi_block = true; 3509 break; 3510 default: 3511 ShouldNotReachHere(); 3512 } 3513 __ align(CodeEntryAlignment); 3514 3515 StubCodeMark mark(this, stub_id); 3516 address start = __ pc(); 3517 3518 Register buf = c_rarg0; 3519 Register state = c_rarg1; 3520 Register ofs = c_rarg2; 3521 Register limit = c_rarg3; 3522 Register a = r4; 3523 Register b = r5; 3524 Register c = r6; 3525 Register d = r7; 3526 Register rscratch3 = r10; 3527 Register rscratch4 = r11; 3528 3529 Register state_regs[2] = { r12, r13 }; 3530 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3531 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3532 3533 __ push(saved_regs, sp); 3534 3535 __ ldp(state_regs[0], state_regs[1], Address(state)); 3536 __ ubfx(a, state_regs[0], 0, 32); 3537 __ ubfx(b, state_regs[0], 32, 32); 3538 __ 
ubfx(c, state_regs[1], 0, 32); 3539 __ ubfx(d, state_regs[1], 32, 32); 3540 3541 Label md5_loop; 3542 __ BIND(md5_loop); 3543 3544 reg_cache.gen_loads(buf); 3545 3546 // Round 1 3547 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3548 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3549 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3550 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3551 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3552 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3553 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3554 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3555 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3556 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3557 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3558 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3559 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3560 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3561 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3562 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3563 3564 // Round 2 3565 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3566 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3567 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3568 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3569 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3570 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3571 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3572 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3573 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3574 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3575 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3576 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3577 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3578 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3579 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3580 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3581 3582 // Round 3 3583 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3584 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3585 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3586 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3587 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3588 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3589 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3590 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3591 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3592 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3593 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3594 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3595 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3596 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3597 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3598 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3599 3600 // Round 4 3601 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3602 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3603 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3604 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3605 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3606 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3607 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3608 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3609 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3610 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3611 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3612 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3613 
md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3614 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3615 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3616 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3617 3618 __ addw(a, state_regs[0], a); 3619 __ ubfx(rscratch2, state_regs[0], 32, 32); 3620 __ addw(b, rscratch2, b); 3621 __ addw(c, state_regs[1], c); 3622 __ ubfx(rscratch4, state_regs[1], 32, 32); 3623 __ addw(d, rscratch4, d); 3624 3625 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3626 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3627 3628 if (multi_block) { 3629 __ add(buf, buf, 64); 3630 __ add(ofs, ofs, 64); 3631 __ cmp(ofs, limit); 3632 __ br(Assembler::LE, md5_loop); 3633 __ mov(c_rarg0, ofs); // return ofs 3634 } 3635 3636 // write hash values back in the correct order 3637 __ stp(state_regs[0], state_regs[1], Address(state)); 3638 3639 __ pop(saved_regs, sp); 3640 3641 __ ret(lr); 3642 3643 return start; 3644 } 3645 3646 // Arguments: 3647 // 3648 // Inputs: 3649 // c_rarg0 - byte[] source+offset 3650 // c_rarg1 - int[] SHA.state 3651 // c_rarg2 - int offset 3652 // c_rarg3 - int limit 3653 // 3654 address generate_sha1_implCompress(StubGenStubId stub_id) { 3655 bool multi_block; 3656 switch (stub_id) { 3657 case sha1_implCompress_id: 3658 multi_block = false; 3659 break; 3660 case sha1_implCompressMB_id: 3661 multi_block = true; 3662 break; 3663 default: 3664 ShouldNotReachHere(); 3665 } 3666 3667 __ align(CodeEntryAlignment); 3668 3669 StubCodeMark mark(this, stub_id); 3670 address start = __ pc(); 3671 3672 Register buf = c_rarg0; 3673 Register state = c_rarg1; 3674 Register ofs = c_rarg2; 3675 Register limit = c_rarg3; 3676 3677 Label keys; 3678 Label sha1_loop; 3679 3680 // load the keys into v0..v3 3681 __ adr(rscratch1, keys); 3682 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3683 // load 5 words state into v6, v7 3684 __ ldrq(v6, Address(state, 0)); 3685 __ ldrs(v7, Address(state, 16)); 3686 3687 3688 __ BIND(sha1_loop); 3689 // load 64 bytes of data into v16..v19 3690 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3691 __ rev32(v16, __ T16B, v16); 3692 __ rev32(v17, __ T16B, v17); 3693 __ rev32(v18, __ T16B, v18); 3694 __ rev32(v19, __ T16B, v19); 3695 3696 // do the sha1 3697 __ addv(v4, __ T4S, v16, v0); 3698 __ orr(v20, __ T16B, v6, v6); 3699 3700 FloatRegister d0 = v16; 3701 FloatRegister d1 = v17; 3702 FloatRegister d2 = v18; 3703 FloatRegister d3 = v19; 3704 3705 for (int round = 0; round < 20; round++) { 3706 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3707 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3708 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3709 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3710 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3711 3712 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3713 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3714 __ sha1h(tmp2, __ T4S, v20); 3715 if (round < 5) 3716 __ sha1c(v20, __ T4S, tmp3, tmp4); 3717 else if (round < 10 || round >= 15) 3718 __ sha1p(v20, __ T4S, tmp3, tmp4); 3719 else 3720 __ sha1m(v20, __ T4S, tmp3, tmp4); 3721 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3722 3723 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3724 } 3725 3726 __ addv(v7, __ T2S, v7, v21); 3727 __ addv(v6, __ T4S, v6, v20); 3728 3729 if (multi_block) { 3730 __ add(ofs, ofs, 64); 3731 __ cmp(ofs, limit); 3732 __ br(Assembler::LE, sha1_loop); 3733 __ mov(c_rarg0, ofs); // return ofs 3734 } 3735 3736 __ strq(v6, Address(state, 0)); 3737 __ strs(v7, Address(state, 16)); 3738 3739 __ ret(lr); 3740 3741 __ bind(keys); 3742 __ emit_int32(0x5a827999); 3743 __ emit_int32(0x6ed9eba1); 3744 __ emit_int32(0x8f1bbcdc); 3745 __ emit_int32(0xca62c1d6); 3746 3747 return start; 3748 } 3749 3750 3751 // Arguments: 3752 // 3753 // Inputs: 3754 // c_rarg0 - byte[] source+offset 3755 // c_rarg1 - int[] SHA.state 3756 // c_rarg2 - int offset 3757 // c_rarg3 - int limit 3758 // 3759 address generate_sha256_implCompress(StubGenStubId stub_id) { 3760 bool multi_block; 3761 switch (stub_id) { 3762 case sha256_implCompress_id: 3763 multi_block = false; 3764 break; 3765 case sha256_implCompressMB_id: 3766 multi_block = true; 3767 break; 3768 default: 3769 ShouldNotReachHere(); 3770 } 3771 3772 static const uint32_t round_consts[64] = { 3773 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3774 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3775 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3776 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3777 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3778 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3779 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3780 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3781 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3782 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3783 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3784 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3785 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3786 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3787 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3788 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3789 }; 3790 3791 __ align(CodeEntryAlignment); 3792 3793 StubCodeMark mark(this, stub_id); 3794 address start = __ pc(); 3795 3796 Register buf = c_rarg0; 3797 Register state = c_rarg1; 3798 Register ofs = c_rarg2; 3799 Register limit = c_rarg3; 3800 3801 Label sha1_loop; 3802 3803 __ stpd(v8, v9, __ pre(sp, -32)); 3804 __ stpd(v10, v11, Address(sp, 16)); 3805 3806 // dga == v0 3807 // dgb == v1 3808 // dg0 == v2 3809 // dg1 == v3 3810 // dg2 == v4 3811 // t0 == v6 3812 // t1 == v7 3813 3814 // load 16 keys to v16..v31 3815 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3816 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3817 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3818 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3819 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3820 3821 // load 8 words (256 bits) state 3822 __ ldpq(v0, v1, state); 3823 3824 __ BIND(sha1_loop); 3825 // load 64 bytes of data into v8..v11 3826 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3827 __ rev32(v8, __ T16B, v8); 3828 __ rev32(v9, __ T16B, v9); 3829 __ rev32(v10, __ T16B, v10); 3830 __ rev32(v11, __ T16B, v11); 3831 3832 __ addv(v6, __ T4S, v8, v16); 3833 __ orr(v2, __ T16B, v0, v0); 3834 __ orr(v3, __ T16B, v1, v1); 3835 3836 FloatRegister d0 = v8; 3837 FloatRegister d1 = v9; 3838 FloatRegister d2 = v10; 3839 FloatRegister d3 = v11; 3840 3841 3842 for (int round = 0; round < 16; round++) { 3843 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3844 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3845 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3846 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3847 3848 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3849 __ orr(v4, __ T16B, v2, v2); 3850 if (round < 15) 3851 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3852 __ sha256h(v2, __ T4S, v3, tmp2); 3853 __ sha256h2(v3, __ T4S, v4, tmp2); 3854 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3855 3856 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3857 } 3858 3859 __ addv(v0, __ T4S, v0, v2); 3860 __ addv(v1, __ T4S, v1, v3); 3861 3862 if (multi_block) { 3863 __ add(ofs, ofs, 64); 3864 __ cmp(ofs, limit); 3865 __ br(Assembler::LE, sha1_loop); 3866 __ mov(c_rarg0, ofs); // return ofs 3867 } 3868 3869 __ ldpd(v10, v11, Address(sp, 16)); 3870 __ ldpd(v8, v9, __ post(sp, 32)); 3871 3872 __ stpq(v0, v1, state); 3873 3874 __ ret(lr); 3875 3876 return start; 3877 } 3878 3879 // Double rounds for sha512. 3880 void sha512_dround(int dr, 3881 FloatRegister vi0, FloatRegister vi1, 3882 FloatRegister vi2, FloatRegister vi3, 3883 FloatRegister vi4, FloatRegister vrc0, 3884 FloatRegister vrc1, FloatRegister vin0, 3885 FloatRegister vin1, FloatRegister vin2, 3886 FloatRegister vin3, FloatRegister vin4) { 3887 if (dr < 36) { 3888 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3889 } 3890 __ addv(v5, __ T2D, vrc0, vin0); 3891 __ ext(v6, __ T16B, vi2, vi3, 8); 3892 __ ext(v5, __ T16B, v5, v5, 8); 3893 __ ext(v7, __ T16B, vi1, vi2, 8); 3894 __ addv(vi3, __ T2D, vi3, v5); 3895 if (dr < 32) { 3896 __ ext(v5, __ T16B, vin3, vin4, 8); 3897 __ sha512su0(vin0, __ T2D, vin1); 3898 } 3899 __ sha512h(vi3, __ T2D, v6, v7); 3900 if (dr < 32) { 3901 __ sha512su1(vin0, __ T2D, vin2, v5); 3902 } 3903 __ addv(vi4, __ T2D, vi1, vi3); 3904 __ sha512h2(vi3, __ T2D, vi1, vi0); 3905 } 3906 3907 // Arguments: 3908 // 3909 // Inputs: 3910 // c_rarg0 - byte[] source+offset 3911 // c_rarg1 - int[] SHA.state 3912 // c_rarg2 - int offset 3913 // c_rarg3 - int limit 3914 // 3915 address generate_sha512_implCompress(StubGenStubId stub_id) { 3916 bool multi_block; 3917 switch (stub_id) { 3918 case sha512_implCompress_id: 3919 multi_block = false; 3920 break; 3921 case sha512_implCompressMB_id: 3922 multi_block = true; 3923 break; 3924 default: 3925 ShouldNotReachHere(); 3926 } 3927 3928 static const uint64_t round_consts[80] = { 3929 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3930 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3931 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3932 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3933 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3934 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3935 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3936 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3937 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3938 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 
3939 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3940 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3941 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3942 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3943 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3944 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3945 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3946 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3947 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3948 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3949 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3950 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3951 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3952 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3953 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3954 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3955 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3956 }; 3957 3958 __ align(CodeEntryAlignment); 3959 3960 StubCodeMark mark(this, stub_id); 3961 address start = __ pc(); 3962 3963 Register buf = c_rarg0; 3964 Register state = c_rarg1; 3965 Register ofs = c_rarg2; 3966 Register limit = c_rarg3; 3967 3968 __ stpd(v8, v9, __ pre(sp, -64)); 3969 __ stpd(v10, v11, Address(sp, 16)); 3970 __ stpd(v12, v13, Address(sp, 32)); 3971 __ stpd(v14, v15, Address(sp, 48)); 3972 3973 Label sha512_loop; 3974 3975 // load state 3976 __ ld1(v8, v9, v10, v11, __ T2D, state); 3977 3978 // load first 4 round constants 3979 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3980 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3981 3982 __ BIND(sha512_loop); 3983 // load 128B of data into v12..v19 3984 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3985 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3986 __ rev64(v12, __ T16B, v12); 3987 __ rev64(v13, __ T16B, v13); 3988 __ rev64(v14, __ T16B, v14); 3989 __ rev64(v15, __ T16B, v15); 3990 __ rev64(v16, __ T16B, v16); 3991 __ rev64(v17, __ T16B, v17); 3992 __ rev64(v18, __ T16B, v18); 3993 __ rev64(v19, __ T16B, v19); 3994 3995 __ mov(rscratch2, rscratch1); 3996 3997 __ mov(v0, __ T16B, v8); 3998 __ mov(v1, __ T16B, v9); 3999 __ mov(v2, __ T16B, v10); 4000 __ mov(v3, __ T16B, v11); 4001 4002 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4003 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4004 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4005 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4006 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4007 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4008 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4009 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4010 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4011 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4012 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4013 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4014 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4015 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, 
v14); 4016 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4017 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4018 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4019 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4020 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4021 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4022 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4023 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4024 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4025 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4026 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4027 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4028 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4029 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4030 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4031 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4032 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4033 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4034 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 4035 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4036 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4037 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4038 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4039 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4040 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4041 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4042 4043 __ addv(v8, __ T2D, v8, v0); 4044 __ addv(v9, __ T2D, v9, v1); 4045 __ addv(v10, __ T2D, v10, v2); 4046 __ addv(v11, __ T2D, v11, v3); 4047 4048 if (multi_block) { 4049 __ add(ofs, ofs, 128); 4050 __ cmp(ofs, limit); 4051 __ br(Assembler::LE, sha512_loop); 4052 __ mov(c_rarg0, ofs); // return ofs 4053 } 4054 4055 __ st1(v8, v9, v10, v11, __ T2D, state); 4056 4057 __ ldpd(v14, v15, Address(sp, 48)); 4058 __ ldpd(v12, v13, Address(sp, 32)); 4059 __ ldpd(v10, v11, Address(sp, 16)); 4060 __ ldpd(v8, v9, __ post(sp, 64)); 4061 4062 __ ret(lr); 4063 4064 return start; 4065 } 4066 4067 // Arguments: 4068 // 4069 // Inputs: 4070 // c_rarg0 - byte[] source+offset 4071 // c_rarg1 - byte[] SHA.state 4072 // c_rarg2 - int block_size 4073 // c_rarg3 - int offset 4074 // c_rarg4 - int limit 4075 // 4076 address generate_sha3_implCompress(StubGenStubId stub_id) { 4077 bool multi_block; 4078 switch (stub_id) { 4079 case sha3_implCompress_id: 4080 multi_block = false; 4081 break; 4082 case sha3_implCompressMB_id: 4083 multi_block = true; 4084 break; 4085 default: 4086 ShouldNotReachHere(); 4087 } 4088 4089 static const uint64_t round_consts[24] = { 4090 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4091 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4092 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4093 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4094 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4095 0x8000000000008003L, 
0x8000000000008002L, 0x8000000000000080L, 4096 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4097 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4098 }; 4099 4100 __ align(CodeEntryAlignment); 4101 4102 StubCodeMark mark(this, stub_id); 4103 address start = __ pc(); 4104 4105 Register buf = c_rarg0; 4106 Register state = c_rarg1; 4107 Register block_size = c_rarg2; 4108 Register ofs = c_rarg3; 4109 Register limit = c_rarg4; 4110 4111 Label sha3_loop, rounds24_loop; 4112 Label sha3_512_or_sha3_384, shake128; 4113 4114 __ stpd(v8, v9, __ pre(sp, -64)); 4115 __ stpd(v10, v11, Address(sp, 16)); 4116 __ stpd(v12, v13, Address(sp, 32)); 4117 __ stpd(v14, v15, Address(sp, 48)); 4118 4119 // load state 4120 __ add(rscratch1, state, 32); 4121 __ ld1(v0, v1, v2, v3, __ T1D, state); 4122 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4123 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4124 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4125 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4126 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4127 __ ld1(v24, __ T1D, rscratch1); 4128 4129 __ BIND(sha3_loop); 4130 4131 // 24 keccak rounds 4132 __ movw(rscratch2, 24); 4133 4134 // load round_constants base 4135 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4136 4137 // load input 4138 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4139 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4140 __ eor(v0, __ T8B, v0, v25); 4141 __ eor(v1, __ T8B, v1, v26); 4142 __ eor(v2, __ T8B, v2, v27); 4143 __ eor(v3, __ T8B, v3, v28); 4144 __ eor(v4, __ T8B, v4, v29); 4145 __ eor(v5, __ T8B, v5, v30); 4146 __ eor(v6, __ T8B, v6, v31); 4147 4148 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4149 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4150 4151 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4152 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4153 __ eor(v7, __ T8B, v7, v25); 4154 __ eor(v8, __ T8B, v8, v26); 4155 __ eor(v9, __ T8B, v9, v27); 4156 __ eor(v10, __ T8B, v10, v28); 4157 __ eor(v11, __ T8B, v11, v29); 4158 __ eor(v12, __ T8B, v12, v30); 4159 __ eor(v13, __ T8B, v13, v31); 4160 4161 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4162 __ eor(v14, __ T8B, v14, v25); 4163 __ eor(v15, __ T8B, v15, v26); 4164 __ eor(v16, __ T8B, v16, v27); 4165 4166 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4167 __ andw(c_rarg5, block_size, 48); 4168 __ cbzw(c_rarg5, rounds24_loop); 4169 4170 __ tbnz(block_size, 5, shake128); 4171 // block_size == 144, bit5 == 0, SHA3-224 4172 __ ldrd(v28, __ post(buf, 8)); 4173 __ eor(v17, __ T8B, v17, v28); 4174 __ b(rounds24_loop); 4175 4176 __ BIND(shake128); 4177 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4178 __ eor(v17, __ T8B, v17, v28); 4179 __ eor(v18, __ T8B, v18, v29); 4180 __ eor(v19, __ T8B, v19, v30); 4181 __ eor(v20, __ T8B, v20, v31); 4182 __ b(rounds24_loop); // block_size == 168, SHAKE128 4183 4184 __ BIND(sha3_512_or_sha3_384); 4185 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4186 __ eor(v7, __ T8B, v7, v25); 4187 __ eor(v8, __ T8B, v8, v26); 4188 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4189 4190 // SHA3-384 4191 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4192 __ eor(v9, __ T8B, v9, v27); 4193 __ eor(v10, __ T8B, v10, v28); 4194 __ eor(v11, __ T8B, v11, v29); 4195 __ eor(v12, __ T8B, v12, v30); 4196 4197 __ BIND(rounds24_loop); 4198 __ subw(rscratch2, rscratch2, 1); 4199 4200 __ eor3(v29,
__ T16B, v4, v9, v14); 4201 __ eor3(v26, __ T16B, v1, v6, v11); 4202 __ eor3(v28, __ T16B, v3, v8, v13); 4203 __ eor3(v25, __ T16B, v0, v5, v10); 4204 __ eor3(v27, __ T16B, v2, v7, v12); 4205 __ eor3(v29, __ T16B, v29, v19, v24); 4206 __ eor3(v26, __ T16B, v26, v16, v21); 4207 __ eor3(v28, __ T16B, v28, v18, v23); 4208 __ eor3(v25, __ T16B, v25, v15, v20); 4209 __ eor3(v27, __ T16B, v27, v17, v22); 4210 4211 __ rax1(v30, __ T2D, v29, v26); 4212 __ rax1(v26, __ T2D, v26, v28); 4213 __ rax1(v28, __ T2D, v28, v25); 4214 __ rax1(v25, __ T2D, v25, v27); 4215 __ rax1(v27, __ T2D, v27, v29); 4216 4217 __ eor(v0, __ T16B, v0, v30); 4218 __ xar(v29, __ T2D, v1, v25, (64 - 1)); 4219 __ xar(v1, __ T2D, v6, v25, (64 - 44)); 4220 __ xar(v6, __ T2D, v9, v28, (64 - 20)); 4221 __ xar(v9, __ T2D, v22, v26, (64 - 61)); 4222 __ xar(v22, __ T2D, v14, v28, (64 - 39)); 4223 __ xar(v14, __ T2D, v20, v30, (64 - 18)); 4224 __ xar(v31, __ T2D, v2, v26, (64 - 62)); 4225 __ xar(v2, __ T2D, v12, v26, (64 - 43)); 4226 __ xar(v12, __ T2D, v13, v27, (64 - 25)); 4227 __ xar(v13, __ T2D, v19, v28, (64 - 8)); 4228 __ xar(v19, __ T2D, v23, v27, (64 - 56)); 4229 __ xar(v23, __ T2D, v15, v30, (64 - 41)); 4230 __ xar(v15, __ T2D, v4, v28, (64 - 27)); 4231 __ xar(v28, __ T2D, v24, v28, (64 - 14)); 4232 __ xar(v24, __ T2D, v21, v25, (64 - 2)); 4233 __ xar(v8, __ T2D, v8, v27, (64 - 55)); 4234 __ xar(v4, __ T2D, v16, v25, (64 - 45)); 4235 __ xar(v16, __ T2D, v5, v30, (64 - 36)); 4236 __ xar(v5, __ T2D, v3, v27, (64 - 28)); 4237 __ xar(v27, __ T2D, v18, v27, (64 - 21)); 4238 __ xar(v3, __ T2D, v17, v26, (64 - 15)); 4239 __ xar(v25, __ T2D, v11, v25, (64 - 10)); 4240 __ xar(v26, __ T2D, v7, v26, (64 - 6)); 4241 __ xar(v30, __ T2D, v10, v30, (64 - 3)); 4242 4243 __ bcax(v20, __ T16B, v31, v22, v8); 4244 __ bcax(v21, __ T16B, v8, v23, v22); 4245 __ bcax(v22, __ T16B, v22, v24, v23); 4246 __ bcax(v23, __ T16B, v23, v31, v24); 4247 __ bcax(v24, __ T16B, v24, v8, v31); 4248 4249 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); 4250 4251 __ bcax(v17, __ T16B, v25, v19, v3); 4252 __ bcax(v18, __ T16B, v3, v15, v19); 4253 __ bcax(v19, __ T16B, v19, v16, v15); 4254 __ bcax(v15, __ T16B, v15, v25, v16); 4255 __ bcax(v16, __ T16B, v16, v3, v25); 4256 4257 __ bcax(v10, __ T16B, v29, v12, v26); 4258 __ bcax(v11, __ T16B, v26, v13, v12); 4259 __ bcax(v12, __ T16B, v12, v14, v13); 4260 __ bcax(v13, __ T16B, v13, v29, v14); 4261 __ bcax(v14, __ T16B, v14, v26, v29); 4262 4263 __ bcax(v7, __ T16B, v30, v9, v4); 4264 __ bcax(v8, __ T16B, v4, v5, v9); 4265 __ bcax(v9, __ T16B, v9, v6, v5); 4266 __ bcax(v5, __ T16B, v5, v30, v6); 4267 __ bcax(v6, __ T16B, v6, v4, v30); 4268 4269 __ bcax(v3, __ T16B, v27, v0, v28); 4270 __ bcax(v4, __ T16B, v28, v1, v0); 4271 __ bcax(v0, __ T16B, v0, v2, v1); 4272 __ bcax(v1, __ T16B, v1, v27, v2); 4273 __ bcax(v2, __ T16B, v2, v28, v27); 4274 4275 __ eor(v0, __ T16B, v0, v31); 4276 4277 __ cbnzw(rscratch2, rounds24_loop); 4278 4279 if (multi_block) { 4280 __ add(ofs, ofs, block_size); 4281 __ cmp(ofs, limit); 4282 __ br(Assembler::LE, sha3_loop); 4283 __ mov(c_rarg0, ofs); // return ofs 4284 } 4285 4286 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4287 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4288 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4289 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4290 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4291 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4292 __ st1(v24, __ T1D, state); 4293 4294 __ ldpd(v14, v15, Address(sp, 
48)); 4295 __ ldpd(v12, v13, Address(sp, 32)); 4296 __ ldpd(v10, v11, Address(sp, 16)); 4297 __ ldpd(v8, v9, __ post(sp, 64)); 4298 4299 __ ret(lr); 4300 4301 return start; 4302 } 4303 4304 /** 4305 * Arguments: 4306 * 4307 * Inputs: 4308 * c_rarg0 - int crc 4309 * c_rarg1 - byte* buf 4310 * c_rarg2 - int length 4311 * 4312 * Output: 4313 * rax - int crc result 4314 */ 4315 address generate_updateBytesCRC32() { 4316 assert(UseCRC32Intrinsics, "what are we doing here?"); 4317 4318 __ align(CodeEntryAlignment); 4319 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id; 4320 StubCodeMark mark(this, stub_id); 4321 4322 address start = __ pc(); 4323 4324 const Register crc = c_rarg0; // crc 4325 const Register buf = c_rarg1; // source java byte array address 4326 const Register len = c_rarg2; // length 4327 const Register table0 = c_rarg3; // crc_table address 4328 const Register table1 = c_rarg4; 4329 const Register table2 = c_rarg5; 4330 const Register table3 = c_rarg6; 4331 const Register tmp3 = c_rarg7; 4332 4333 BLOCK_COMMENT("Entry:"); 4334 __ enter(); // required for proper stackwalking of RuntimeStub frame 4335 4336 __ kernel_crc32(crc, buf, len, 4337 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4338 4339 __ leave(); // required for proper stackwalking of RuntimeStub frame 4340 __ ret(lr); 4341 4342 return start; 4343 } 4344 4345 // ChaCha20 block function. This version parallelizes 4 quarter 4346 // round operations at a time. It uses 16 SIMD registers to 4347 // produce 4 blocks of key stream. 4348 // 4349 // state (int[16]) = c_rarg0 4350 // keystream (byte[256]) = c_rarg1 4351 // return - number of bytes of keystream (always 256) 4352 // 4353 // In this approach, we load the 512-bit start state sequentially into 4354 // 4 128-bit vectors. We then make 4 4-vector copies of that starting 4355 // state, with each successive set of 4 vectors having a +1 added into 4356 // the first 32-bit lane of the 4th vector in that group (the counter). 4357 // By doing this, we can perform the block function on 4 512-bit blocks 4358 // within one run of this intrinsic. 4359 // The alignment of the data across the 4-vector group is such that at 4360 // the start it is already aligned for the first round of each two-round 4361 // loop iteration. In other words, the corresponding lanes of each vector 4362 // will contain the values needed for that quarter round operation (e.g. 4363 // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.). 4364 // In between each full round, a lane shift must occur. Within a loop 4365 // iteration, between the first and second rounds, the 2nd, 3rd, and 4th 4366 // vectors are rotated left 32, 64 and 96 bits, respectively. The result 4367 // is effectively a diagonal orientation in columnar form. After the 4368 // second full round, those registers are left-rotated again, this time 4369 // 96, 64, and 32 bits - returning the vectors to their columnar organization. 4370 // After all 10 iterations, the original state is added to each 4-vector 4371 // working state along with the add mask, and the 4 vector groups are 4372 // sequentially written to the memory dedicated for the output key stream. 4373 // 4374 // For a more detailed explanation, see Goll and Gueron, "Vectorization of 4375 // ChaCha Stream Cipher", 2014 11th Int. Conf. 
on Information Technology: 4376 // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33 4377 address generate_chacha20Block_qrpar() { 4378 Label L_Q_twoRounds, L_Q_cc20_const; 4379 // The constant data is broken into two 128-bit segments to be loaded 4380 // onto SIMD registers. The first 128 bits are a counter add overlay 4381 // that adds +1/+0/+0/+0 to the vectors holding replicated state[12]. 4382 // The second 128-bits is a table constant used for 8-bit left rotations. 4383 // on 32-bit lanes within a SIMD register. 4384 __ BIND(L_Q_cc20_const); 4385 __ emit_int64(0x0000000000000001UL); 4386 __ emit_int64(0x0000000000000000UL); 4387 __ emit_int64(0x0605040702010003UL); 4388 __ emit_int64(0x0E0D0C0F0A09080BUL); 4389 4390 __ align(CodeEntryAlignment); 4391 StubGenStubId stub_id = StubGenStubId::chacha20Block_id; 4392 StubCodeMark mark(this, stub_id); 4393 address start = __ pc(); 4394 __ enter(); 4395 4396 const Register state = c_rarg0; 4397 const Register keystream = c_rarg1; 4398 const Register loopCtr = r10; 4399 const Register tmpAddr = r11; 4400 4401 const FloatRegister aState = v0; 4402 const FloatRegister bState = v1; 4403 const FloatRegister cState = v2; 4404 const FloatRegister dState = v3; 4405 const FloatRegister a1Vec = v4; 4406 const FloatRegister b1Vec = v5; 4407 const FloatRegister c1Vec = v6; 4408 const FloatRegister d1Vec = v7; 4409 // Skip the callee-saved registers v8 - v15 4410 const FloatRegister a2Vec = v16; 4411 const FloatRegister b2Vec = v17; 4412 const FloatRegister c2Vec = v18; 4413 const FloatRegister d2Vec = v19; 4414 const FloatRegister a3Vec = v20; 4415 const FloatRegister b3Vec = v21; 4416 const FloatRegister c3Vec = v22; 4417 const FloatRegister d3Vec = v23; 4418 const FloatRegister a4Vec = v24; 4419 const FloatRegister b4Vec = v25; 4420 const FloatRegister c4Vec = v26; 4421 const FloatRegister d4Vec = v27; 4422 const FloatRegister scratch = v28; 4423 const FloatRegister addMask = v29; 4424 const FloatRegister lrot8Tbl = v30; 4425 4426 // Load the initial state in the first 4 quadword registers, 4427 // then copy the initial state into the next 4 quadword registers 4428 // that will be used for the working state. 4429 __ ld1(aState, bState, cState, dState, __ T16B, Address(state)); 4430 4431 // Load the index register for 2 constant 128-bit data fields. 4432 // The first represents the +1/+0/+0/+0 add mask. The second is 4433 // the 8-bit left rotation. 
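    //
    // For reference only (not emitted code): the scalar quarter round from the ChaCha20
    // specification (RFC 7539), which is what cc20_quarter_round applies to the
    // corresponding 32-bit lanes of each a/b/c/d vector group; the rotate-by-8 case is
    // the one served by the byte-shuffle table described above. chacha_qr_ref and rotl32
    // are illustrative names, not HotSpot helpers.
    //
    //   static inline uint32_t rotl32(uint32_t v, int n) { return (v << n) | (v >> (32 - n)); }
    //   static void chacha_qr_ref(uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d) {
    //     a += b;  d ^= a;  d = rotl32(d, 16);
    //     c += d;  b ^= c;  b = rotl32(b, 12);
    //     a += b;  d ^= a;  d = rotl32(d, 8);
    //     c += d;  b ^= c;  b = rotl32(b, 7);
    //   }
    //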
4434 __ adr(tmpAddr, L_Q_cc20_const); 4435 __ ldpq(addMask, lrot8Tbl, Address(tmpAddr)); 4436 4437 __ mov(a1Vec, __ T16B, aState); 4438 __ mov(b1Vec, __ T16B, bState); 4439 __ mov(c1Vec, __ T16B, cState); 4440 __ mov(d1Vec, __ T16B, dState); 4441 4442 __ mov(a2Vec, __ T16B, aState); 4443 __ mov(b2Vec, __ T16B, bState); 4444 __ mov(c2Vec, __ T16B, cState); 4445 __ addv(d2Vec, __ T4S, d1Vec, addMask); 4446 4447 __ mov(a3Vec, __ T16B, aState); 4448 __ mov(b3Vec, __ T16B, bState); 4449 __ mov(c3Vec, __ T16B, cState); 4450 __ addv(d3Vec, __ T4S, d2Vec, addMask); 4451 4452 __ mov(a4Vec, __ T16B, aState); 4453 __ mov(b4Vec, __ T16B, bState); 4454 __ mov(c4Vec, __ T16B, cState); 4455 __ addv(d4Vec, __ T4S, d3Vec, addMask); 4456 4457 // Set up the 10 iteration loop 4458 __ mov(loopCtr, 10); 4459 __ BIND(L_Q_twoRounds); 4460 4461 // The first set of operations on the vectors covers the first 4 quarter 4462 // round operations: 4463 // Qround(state, 0, 4, 8,12) 4464 // Qround(state, 1, 5, 9,13) 4465 // Qround(state, 2, 6,10,14) 4466 // Qround(state, 3, 7,11,15) 4467 __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl); 4468 __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl); 4469 __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl); 4470 __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl); 4471 4472 // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to 4473 // diagonals. The a1Vec does not need to change orientation. 4474 __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true); 4475 __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true); 4476 __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true); 4477 __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true); 4478 4479 // The second set of operations on the vectors covers the second 4 quarter 4480 // round operations, now acting on the diagonals: 4481 // Qround(state, 0, 5,10,15) 4482 // Qround(state, 1, 6,11,12) 4483 // Qround(state, 2, 7, 8,13) 4484 // Qround(state, 3, 4, 9,14) 4485 __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl); 4486 __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl); 4487 __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl); 4488 __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl); 4489 4490 // Before we start the next iteration, we need to perform shuffles 4491 // on the b/c/d vectors to move them back to columnar organizations 4492 // from their current diagonal orientation. 4493 __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false); 4494 __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false); 4495 __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false); 4496 __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false); 4497 4498 // Decrement and iterate 4499 __ sub(loopCtr, loopCtr, 1); 4500 __ cbnz(loopCtr, L_Q_twoRounds); 4501 4502 // Once the counter reaches zero, we fall out of the loop 4503 // and need to add the initial state back into the working state 4504 // represented by the a/b/c/d1Vec registers. This is destructive 4505 // on the dState register but we no longer will need it. 
4506 __ addv(a1Vec, __ T4S, a1Vec, aState); 4507 __ addv(b1Vec, __ T4S, b1Vec, bState); 4508 __ addv(c1Vec, __ T4S, c1Vec, cState); 4509 __ addv(d1Vec, __ T4S, d1Vec, dState); 4510 4511 __ addv(a2Vec, __ T4S, a2Vec, aState); 4512 __ addv(b2Vec, __ T4S, b2Vec, bState); 4513 __ addv(c2Vec, __ T4S, c2Vec, cState); 4514 __ addv(dState, __ T4S, dState, addMask); 4515 __ addv(d2Vec, __ T4S, d2Vec, dState); 4516 4517 __ addv(a3Vec, __ T4S, a3Vec, aState); 4518 __ addv(b3Vec, __ T4S, b3Vec, bState); 4519 __ addv(c3Vec, __ T4S, c3Vec, cState); 4520 __ addv(dState, __ T4S, dState, addMask); 4521 __ addv(d3Vec, __ T4S, d3Vec, dState); 4522 4523 __ addv(a4Vec, __ T4S, a4Vec, aState); 4524 __ addv(b4Vec, __ T4S, b4Vec, bState); 4525 __ addv(c4Vec, __ T4S, c4Vec, cState); 4526 __ addv(dState, __ T4S, dState, addMask); 4527 __ addv(d4Vec, __ T4S, d4Vec, dState); 4528 4529 // Write the final state back to the result buffer 4530 __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64)); 4531 __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64)); 4532 __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64)); 4533 __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64)); 4534 4535 __ mov(r0, 256); // Return length of output keystream 4536 __ leave(); 4537 __ ret(lr); 4538 4539 return start; 4540 } 4541 4542 /** 4543 * Arguments: 4544 * 4545 * Inputs: 4546 * c_rarg0 - int crc 4547 * c_rarg1 - byte* buf 4548 * c_rarg2 - int length 4549 * c_rarg3 - int* table 4550 * 4551 * Output: 4552 * r0 - int crc result 4553 */ 4554 address generate_updateBytesCRC32C() { 4555 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4556 4557 __ align(CodeEntryAlignment); 4558 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id; 4559 StubCodeMark mark(this, stub_id); 4560 4561 address start = __ pc(); 4562 4563 const Register crc = c_rarg0; // crc 4564 const Register buf = c_rarg1; // source java byte array address 4565 const Register len = c_rarg2; // length 4566 const Register table0 = c_rarg3; // crc_table address 4567 const Register table1 = c_rarg4; 4568 const Register table2 = c_rarg5; 4569 const Register table3 = c_rarg6; 4570 const Register tmp3 = c_rarg7; 4571 4572 BLOCK_COMMENT("Entry:"); 4573 __ enter(); // required for proper stackwalking of RuntimeStub frame 4574 4575 __ kernel_crc32c(crc, buf, len, 4576 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4577 4578 __ leave(); // required for proper stackwalking of RuntimeStub frame 4579 __ ret(lr); 4580 4581 return start; 4582 } 4583 4584 /*** 4585 * Arguments: 4586 * 4587 * Inputs: 4588 * c_rarg0 - int adler 4589 * c_rarg1 - byte* buff 4590 * c_rarg2 - int len 4591 * 4592 * Output: 4593 * c_rarg0 - int adler result 4594 */ 4595 address generate_updateBytesAdler32() { 4596 __ align(CodeEntryAlignment); 4597 StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id; 4598 StubCodeMark mark(this, stub_id); 4599 address start = __ pc(); 4600 4601 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4602 4603 // Aliases 4604 Register adler = c_rarg0; 4605 Register s1 = c_rarg0; 4606 Register s2 = c_rarg3; 4607 Register buff = c_rarg1; 4608 Register len = c_rarg2; 4609 Register nmax = r4; 4610 Register base = r5; 4611 Register count = r6; 4612 Register temp0 = rscratch1; 4613 Register temp1 = rscratch2; 4614 FloatRegister vbytes = v0; 4615 FloatRegister vs1acc = v1; 4616 FloatRegister vs2acc = v2; 4617 FloatRegister vtable = v3; 4618 4619 // Max 
number of bytes we can process before having to take the mod 4620 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4621 uint64_t BASE = 0xfff1; 4622 uint64_t NMAX = 0x15B0; 4623 4624 __ mov(base, BASE); 4625 __ mov(nmax, NMAX); 4626 4627 // Load accumulation coefficients for the upper 16 bits 4628 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4629 __ ld1(vtable, __ T16B, Address(temp0)); 4630 4631 // s1 is initialized to the lower 16 bits of adler 4632 // s2 is initialized to the upper 16 bits of adler 4633 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4634 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4635 4636 // The pipelined loop needs at least 16 elements for 1 iteration 4637 // It does check this, but it is more effective to skip to the cleanup loop 4638 __ cmp(len, (u1)16); 4639 __ br(Assembler::HS, L_nmax); 4640 __ cbz(len, L_combine); 4641 4642 __ bind(L_simple_by1_loop); 4643 __ ldrb(temp0, Address(__ post(buff, 1))); 4644 __ add(s1, s1, temp0); 4645 __ add(s2, s2, s1); 4646 __ subs(len, len, 1); 4647 __ br(Assembler::HI, L_simple_by1_loop); 4648 4649 // s1 = s1 % BASE 4650 __ subs(temp0, s1, base); 4651 __ csel(s1, temp0, s1, Assembler::HS); 4652 4653 // s2 = s2 % BASE 4654 __ lsr(temp0, s2, 16); 4655 __ lsl(temp1, temp0, 4); 4656 __ sub(temp1, temp1, temp0); 4657 __ add(s2, temp1, s2, ext::uxth); 4658 4659 __ subs(temp0, s2, base); 4660 __ csel(s2, temp0, s2, Assembler::HS); 4661 4662 __ b(L_combine); 4663 4664 __ bind(L_nmax); 4665 __ subs(len, len, nmax); 4666 __ sub(count, nmax, 16); 4667 __ br(Assembler::LO, L_by16); 4668 4669 __ bind(L_nmax_loop); 4670 4671 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4672 vbytes, vs1acc, vs2acc, vtable); 4673 4674 __ subs(count, count, 16); 4675 __ br(Assembler::HS, L_nmax_loop); 4676 4677 // s1 = s1 % BASE 4678 __ lsr(temp0, s1, 16); 4679 __ lsl(temp1, temp0, 4); 4680 __ sub(temp1, temp1, temp0); 4681 __ add(temp1, temp1, s1, ext::uxth); 4682 4683 __ lsr(temp0, temp1, 16); 4684 __ lsl(s1, temp0, 4); 4685 __ sub(s1, s1, temp0); 4686 __ add(s1, s1, temp1, ext:: uxth); 4687 4688 __ subs(temp0, s1, base); 4689 __ csel(s1, temp0, s1, Assembler::HS); 4690 4691 // s2 = s2 % BASE 4692 __ lsr(temp0, s2, 16); 4693 __ lsl(temp1, temp0, 4); 4694 __ sub(temp1, temp1, temp0); 4695 __ add(temp1, temp1, s2, ext::uxth); 4696 4697 __ lsr(temp0, temp1, 16); 4698 __ lsl(s2, temp0, 4); 4699 __ sub(s2, s2, temp0); 4700 __ add(s2, s2, temp1, ext:: uxth); 4701 4702 __ subs(temp0, s2, base); 4703 __ csel(s2, temp0, s2, Assembler::HS); 4704 4705 __ subs(len, len, nmax); 4706 __ sub(count, nmax, 16); 4707 __ br(Assembler::HS, L_nmax_loop); 4708 4709 __ bind(L_by16); 4710 __ adds(len, len, count); 4711 __ br(Assembler::LO, L_by1); 4712 4713 __ bind(L_by16_loop); 4714 4715 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4716 vbytes, vs1acc, vs2acc, vtable); 4717 4718 __ subs(len, len, 16); 4719 __ br(Assembler::HS, L_by16_loop); 4720 4721 __ bind(L_by1); 4722 __ adds(len, len, 15); 4723 __ br(Assembler::LO, L_do_mod); 4724 4725 __ bind(L_by1_loop); 4726 __ ldrb(temp0, Address(__ post(buff, 1))); 4727 __ add(s1, temp0, s1); 4728 __ add(s2, s2, s1); 4729 __ subs(len, len, 1); 4730 __ br(Assembler::HS, L_by1_loop); 4731 4732 __ bind(L_do_mod); 4733 // s1 = s1 % BASE 4734 __ lsr(temp0, s1, 16); 4735 __ lsl(temp1, temp0, 4); 4736 __ sub(temp1, temp1, temp0); 4737 __ add(temp1, temp1, s1, ext::uxth); 4738 4739 __ lsr(temp0, temp1, 16); 4740 __ lsl(s1, 
temp0, 4); 4741 __ sub(s1, s1, temp0); 4742 __ add(s1, s1, temp1, ext:: uxth); 4743 4744 __ subs(temp0, s1, base); 4745 __ csel(s1, temp0, s1, Assembler::HS); 4746 4747 // s2 = s2 % BASE 4748 __ lsr(temp0, s2, 16); 4749 __ lsl(temp1, temp0, 4); 4750 __ sub(temp1, temp1, temp0); 4751 __ add(temp1, temp1, s2, ext::uxth); 4752 4753 __ lsr(temp0, temp1, 16); 4754 __ lsl(s2, temp0, 4); 4755 __ sub(s2, s2, temp0); 4756 __ add(s2, s2, temp1, ext:: uxth); 4757 4758 __ subs(temp0, s2, base); 4759 __ csel(s2, temp0, s2, Assembler::HS); 4760 4761 // Combine lower bits and higher bits 4762 __ bind(L_combine); 4763 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 4764 4765 __ ret(lr); 4766 4767 return start; 4768 } 4769 4770 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 4771 Register temp0, Register temp1, FloatRegister vbytes, 4772 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 4773 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 4774 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 4775 // In non-vectorized code, we update s1 and s2 as: 4776 // s1 <- s1 + b1 4777 // s2 <- s2 + s1 4778 // s1 <- s1 + b2 4779 // s2 <- s2 + b1 4780 // ... 4781 // s1 <- s1 + b16 4782 // s2 <- s2 + s1 4783 // Putting above assignments together, we have: 4784 // s1_new = s1 + b1 + b2 + ... + b16 4785 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 4786 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 4787 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 4788 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 4789 4790 // s2 = s2 + s1 * 16 4791 __ add(s2, s2, s1, Assembler::LSL, 4); 4792 4793 // vs1acc = b1 + b2 + b3 + ... + b16 4794 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 4795 __ umullv(vs2acc, __ T8B, vtable, vbytes); 4796 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 4797 __ uaddlv(vs1acc, __ T16B, vbytes); 4798 __ uaddlv(vs2acc, __ T8H, vs2acc); 4799 4800 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 4801 __ fmovd(temp0, vs1acc); 4802 __ fmovd(temp1, vs2acc); 4803 __ add(s1, s1, temp0); 4804 __ add(s2, s2, temp1); 4805 } 4806 4807 /** 4808 * Arguments: 4809 * 4810 * Input: 4811 * c_rarg0 - x address 4812 * c_rarg1 - x length 4813 * c_rarg2 - y address 4814 * c_rarg3 - y length 4815 * c_rarg4 - z address 4816 */ 4817 address generate_multiplyToLen() { 4818 __ align(CodeEntryAlignment); 4819 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 4820 StubCodeMark mark(this, stub_id); 4821 4822 address start = __ pc(); 4823 4824 if (SCCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) { 4825 return start; 4826 } 4827 const Register x = r0; 4828 const Register xlen = r1; 4829 const Register y = r2; 4830 const Register ylen = r3; 4831 const Register z = r4; 4832 4833 const Register tmp0 = r5; 4834 const Register tmp1 = r10; 4835 const Register tmp2 = r11; 4836 const Register tmp3 = r12; 4837 const Register tmp4 = r13; 4838 const Register tmp5 = r14; 4839 const Register tmp6 = r15; 4840 const Register tmp7 = r16; 4841 4842 BLOCK_COMMENT("Entry:"); 4843 __ enter(); // required for proper stackwalking of RuntimeStub frame 4844 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4845 __ leave(); // required for proper stackwalking of RuntimeStub frame 4846 __ ret(lr); 4847 4848 SCCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start); 4849 return start; 4850 } 4851 4852 address generate_squareToLen() { 4853 // squareToLen algorithm for sizes 1..127 described in java code works 4854 // faster than multiply_to_len on some CPUs and slower on others, but 4855 // multiply_to_len shows a bit better overall results 4856 __ align(CodeEntryAlignment); 4857 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 4858 StubCodeMark mark(this, stub_id); 4859 address start = __ pc(); 4860 4861 if (SCCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) { 4862 return start; 4863 } 4864 const Register x = r0; 4865 const Register xlen = r1; 4866 const Register z = r2; 4867 const Register y = r4; // == x 4868 const Register ylen = r5; // == xlen 4869 4870 const Register tmp0 = r3; 4871 const Register tmp1 = r10; 4872 const Register tmp2 = r11; 4873 const Register tmp3 = r12; 4874 const Register tmp4 = r13; 4875 const Register tmp5 = r14; 4876 const Register tmp6 = r15; 4877 const Register tmp7 = r16; 4878 4879 RegSet spilled_regs = RegSet::of(y, ylen); 4880 BLOCK_COMMENT("Entry:"); 4881 __ enter(); 4882 __ push(spilled_regs, sp); 4883 __ mov(y, x); 4884 __ mov(ylen, xlen); 4885 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4886 __ pop(spilled_regs, sp); 4887 __ leave(); 4888 __ ret(lr); 4889 4890 SCCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start); 4891 return start; 4892 } 4893 4894 address generate_mulAdd() { 4895 __ align(CodeEntryAlignment); 4896 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 4897 StubCodeMark mark(this, stub_id); 4898 4899 address start = __ pc(); 4900 4901 if (SCCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) { 4902 return start; 4903 } 4904 const Register out = r0; 4905 const Register in = r1; 4906 const Register offset = r2; 4907 const Register len = r3; 4908 
const Register k = r4; 4909 4910 BLOCK_COMMENT("Entry:"); 4911 __ enter(); 4912 __ mul_add(out, in, offset, len, k); 4913 __ leave(); 4914 __ ret(lr); 4915 4916 SCCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start); 4917 return start; 4918 } 4919 4920 // Arguments: 4921 // 4922 // Input: 4923 // c_rarg0 - newArr address 4924 // c_rarg1 - oldArr address 4925 // c_rarg2 - newIdx 4926 // c_rarg3 - shiftCount 4927 // c_rarg4 - numIter 4928 // 4929 address generate_bigIntegerRightShift() { 4930 __ align(CodeEntryAlignment); 4931 StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id; 4932 StubCodeMark mark(this, stub_id); 4933 address start = __ pc(); 4934 4935 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4936 4937 Register newArr = c_rarg0; 4938 Register oldArr = c_rarg1; 4939 Register newIdx = c_rarg2; 4940 Register shiftCount = c_rarg3; 4941 Register numIter = c_rarg4; 4942 Register idx = numIter; 4943 4944 Register newArrCur = rscratch1; 4945 Register shiftRevCount = rscratch2; 4946 Register oldArrCur = r13; 4947 Register oldArrNext = r14; 4948 4949 FloatRegister oldElem0 = v0; 4950 FloatRegister oldElem1 = v1; 4951 FloatRegister newElem = v2; 4952 FloatRegister shiftVCount = v3; 4953 FloatRegister shiftVRevCount = v4; 4954 4955 __ cbz(idx, Exit); 4956 4957 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4958 4959 // left shift count 4960 __ movw(shiftRevCount, 32); 4961 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4962 4963 // numIter too small to allow a 4-words SIMD loop, rolling back 4964 __ cmp(numIter, (u1)4); 4965 __ br(Assembler::LT, ShiftThree); 4966 4967 __ dup(shiftVCount, __ T4S, shiftCount); 4968 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4969 __ negr(shiftVCount, __ T4S, shiftVCount); 4970 4971 __ BIND(ShiftSIMDLoop); 4972 4973 // Calculate the load addresses 4974 __ sub(idx, idx, 4); 4975 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4976 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4977 __ add(oldArrCur, oldArrNext, 4); 4978 4979 // Load 4 words and process 4980 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4981 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4982 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4983 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4984 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4985 __ st1(newElem, __ T4S, Address(newArrCur)); 4986 4987 __ cmp(idx, (u1)4); 4988 __ br(Assembler::LT, ShiftTwoLoop); 4989 __ b(ShiftSIMDLoop); 4990 4991 __ BIND(ShiftTwoLoop); 4992 __ cbz(idx, Exit); 4993 __ cmp(idx, (u1)1); 4994 __ br(Assembler::EQ, ShiftOne); 4995 4996 // Calculate the load addresses 4997 __ sub(idx, idx, 2); 4998 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4999 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 5000 __ add(oldArrCur, oldArrNext, 4); 5001 5002 // Load 2 words and process 5003 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 5004 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 5005 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 5006 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 5007 __ orr(newElem, __ T8B, oldElem0, oldElem1); 5008 __ st1(newElem, __ T2S, Address(newArrCur)); 5009 __ b(ShiftTwoLoop); 5010 5011 __ BIND(ShiftThree); 5012 __ tbz(idx, 1, ShiftOne); 5013 __ tbz(idx, 0, ShiftTwo); 5014 __ ldrw(r10, Address(oldArr, 12)); 5015 __ ldrw(r11, Address(oldArr, 8)); 5016 __ lsrvw(r10, r10, shiftCount); 5017 __ lslvw(r11, r11, shiftRevCount); 5018 __ orrw(r12, r10, r11); 5019 __ strw(r12, Address(newArr, 8)); 5020 5021 __ 
BIND(ShiftTwo); 5022 __ ldrw(r10, Address(oldArr, 8)); 5023 __ ldrw(r11, Address(oldArr, 4)); 5024 __ lsrvw(r10, r10, shiftCount); 5025 __ lslvw(r11, r11, shiftRevCount); 5026 __ orrw(r12, r10, r11); 5027 __ strw(r12, Address(newArr, 4)); 5028 5029 __ BIND(ShiftOne); 5030 __ ldrw(r10, Address(oldArr, 4)); 5031 __ ldrw(r11, Address(oldArr)); 5032 __ lsrvw(r10, r10, shiftCount); 5033 __ lslvw(r11, r11, shiftRevCount); 5034 __ orrw(r12, r10, r11); 5035 __ strw(r12, Address(newArr)); 5036 5037 __ BIND(Exit); 5038 __ ret(lr); 5039 5040 return start; 5041 } 5042 5043 // Arguments: 5044 // 5045 // Input: 5046 // c_rarg0 - newArr address 5047 // c_rarg1 - oldArr address 5048 // c_rarg2 - newIdx 5049 // c_rarg3 - shiftCount 5050 // c_rarg4 - numIter 5051 // 5052 address generate_bigIntegerLeftShift() { 5053 __ align(CodeEntryAlignment); 5054 StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id; 5055 StubCodeMark mark(this, stub_id); 5056 address start = __ pc(); 5057 5058 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 5059 5060 Register newArr = c_rarg0; 5061 Register oldArr = c_rarg1; 5062 Register newIdx = c_rarg2; 5063 Register shiftCount = c_rarg3; 5064 Register numIter = c_rarg4; 5065 5066 Register shiftRevCount = rscratch1; 5067 Register oldArrNext = rscratch2; 5068 5069 FloatRegister oldElem0 = v0; 5070 FloatRegister oldElem1 = v1; 5071 FloatRegister newElem = v2; 5072 FloatRegister shiftVCount = v3; 5073 FloatRegister shiftVRevCount = v4; 5074 5075 __ cbz(numIter, Exit); 5076 5077 __ add(oldArrNext, oldArr, 4); 5078 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 5079 5080 // right shift count 5081 __ movw(shiftRevCount, 32); 5082 __ subw(shiftRevCount, shiftRevCount, shiftCount); 5083 5084 // numIter too small to allow a 4-words SIMD loop, rolling back 5085 __ cmp(numIter, (u1)4); 5086 __ br(Assembler::LT, ShiftThree); 5087 5088 __ dup(shiftVCount, __ T4S, shiftCount); 5089 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 5090 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 5091 5092 __ BIND(ShiftSIMDLoop); 5093 5094 // load 4 words and process 5095 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 5096 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 5097 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 5098 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 5099 __ orr(newElem, __ T16B, oldElem0, oldElem1); 5100 __ st1(newElem, __ T4S, __ post(newArr, 16)); 5101 __ sub(numIter, numIter, 4); 5102 5103 __ cmp(numIter, (u1)4); 5104 __ br(Assembler::LT, ShiftTwoLoop); 5105 __ b(ShiftSIMDLoop); 5106 5107 __ BIND(ShiftTwoLoop); 5108 __ cbz(numIter, Exit); 5109 __ cmp(numIter, (u1)1); 5110 __ br(Assembler::EQ, ShiftOne); 5111 5112 // load 2 words and process 5113 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 5114 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 5115 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 5116 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 5117 __ orr(newElem, __ T8B, oldElem0, oldElem1); 5118 __ st1(newElem, __ T2S, __ post(newArr, 8)); 5119 __ sub(numIter, numIter, 2); 5120 __ b(ShiftTwoLoop); 5121 5122 __ BIND(ShiftThree); 5123 __ ldrw(r10, __ post(oldArr, 4)); 5124 __ ldrw(r11, __ post(oldArrNext, 4)); 5125 __ lslvw(r10, r10, shiftCount); 5126 __ lsrvw(r11, r11, shiftRevCount); 5127 __ orrw(r12, r10, r11); 5128 __ strw(r12, __ post(newArr, 4)); 5129 __ tbz(numIter, 1, Exit); 5130 __ tbz(numIter, 0, ShiftOne); 5131 5132 __ BIND(ShiftTwo); 5133 __ ldrw(r10, __ post(oldArr, 4)); 5134 __ ldrw(r11, __ post(oldArrNext, 
4)); 5135 __ lslvw(r10, r10, shiftCount); 5136 __ lsrvw(r11, r11, shiftRevCount); 5137 __ orrw(r12, r10, r11); 5138 __ strw(r12, __ post(newArr, 4)); 5139 5140 __ BIND(ShiftOne); 5141 __ ldrw(r10, Address(oldArr)); 5142 __ ldrw(r11, Address(oldArrNext)); 5143 __ lslvw(r10, r10, shiftCount); 5144 __ lsrvw(r11, r11, shiftRevCount); 5145 __ orrw(r12, r10, r11); 5146 __ strw(r12, Address(newArr)); 5147 5148 __ BIND(Exit); 5149 __ ret(lr); 5150 5151 return start; 5152 } 5153 5154 address generate_count_positives(address &count_positives_long) { 5155 const u1 large_loop_size = 64; 5156 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 5157 int dcache_line = VM_Version::dcache_line_size(); 5158 5159 Register ary1 = r1, len = r2, result = r0; 5160 5161 __ align(CodeEntryAlignment); 5162 5163 StubGenStubId stub_id = StubGenStubId::count_positives_id; 5164 StubCodeMark mark(this, stub_id); 5165 5166 address entry = __ pc(); 5167 5168 __ enter(); 5169 // precondition: a copy of len is already in result 5170 // __ mov(result, len); 5171 5172 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 5173 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 5174 5175 __ cmp(len, (u1)15); 5176 __ br(Assembler::GT, LEN_OVER_15); 5177 // The only case when execution falls into this code is when pointer is near 5178 // the end of memory page and we have to avoid reading next page 5179 __ add(ary1, ary1, len); 5180 __ subs(len, len, 8); 5181 __ br(Assembler::GT, LEN_OVER_8); 5182 __ ldr(rscratch2, Address(ary1, -8)); 5183 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 5184 __ lsrv(rscratch2, rscratch2, rscratch1); 5185 __ tst(rscratch2, UPPER_BIT_MASK); 5186 __ csel(result, zr, result, Assembler::NE); 5187 __ leave(); 5188 __ ret(lr); 5189 __ bind(LEN_OVER_8); 5190 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 5191 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 5192 __ tst(rscratch2, UPPER_BIT_MASK); 5193 __ br(Assembler::NE, RET_NO_POP); 5194 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 5195 __ lsrv(rscratch1, rscratch1, rscratch2); 5196 __ tst(rscratch1, UPPER_BIT_MASK); 5197 __ bind(RET_NO_POP); 5198 __ csel(result, zr, result, Assembler::NE); 5199 __ leave(); 5200 __ ret(lr); 5201 5202 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 5203 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 5204 5205 count_positives_long = __ pc(); // 2nd entry point 5206 5207 __ enter(); 5208 5209 __ bind(LEN_OVER_15); 5210 __ push(spilled_regs, sp); 5211 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 5212 __ cbz(rscratch2, ALIGNED); 5213 __ ldp(tmp6, tmp1, Address(ary1)); 5214 __ mov(tmp5, 16); 5215 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 5216 __ add(ary1, ary1, rscratch1); 5217 __ orr(tmp6, tmp6, tmp1); 5218 __ tst(tmp6, UPPER_BIT_MASK); 5219 __ br(Assembler::NE, RET_ADJUST); 5220 __ sub(len, len, rscratch1); 5221 5222 __ bind(ALIGNED); 5223 __ cmp(len, large_loop_size); 5224 __ br(Assembler::LT, CHECK_16); 5225 // Perform 16-byte load as early return in pre-loop to handle situation 5226 // when initially aligned large array has negative values at starting bytes, 5227 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 5228 // slower. Cases with negative bytes further ahead won't be affected that 5229 // much. 
In fact, it'll be faster due to early loads, less instructions and 5230 // less branches in LARGE_LOOP. 5231 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 5232 __ sub(len, len, 16); 5233 __ orr(tmp6, tmp6, tmp1); 5234 __ tst(tmp6, UPPER_BIT_MASK); 5235 __ br(Assembler::NE, RET_ADJUST_16); 5236 __ cmp(len, large_loop_size); 5237 __ br(Assembler::LT, CHECK_16); 5238 5239 if (SoftwarePrefetchHintDistance >= 0 5240 && SoftwarePrefetchHintDistance >= dcache_line) { 5241 // initial prefetch 5242 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 5243 } 5244 __ bind(LARGE_LOOP); 5245 if (SoftwarePrefetchHintDistance >= 0) { 5246 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 5247 } 5248 // Issue load instructions first, since it can save few CPU/MEM cycles, also 5249 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 5250 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 5251 // instructions per cycle and have less branches, but this approach disables 5252 // early return, thus, all 64 bytes are loaded and checked every time. 5253 __ ldp(tmp2, tmp3, Address(ary1)); 5254 __ ldp(tmp4, tmp5, Address(ary1, 16)); 5255 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 5256 __ ldp(tmp6, tmp1, Address(ary1, 48)); 5257 __ add(ary1, ary1, large_loop_size); 5258 __ sub(len, len, large_loop_size); 5259 __ orr(tmp2, tmp2, tmp3); 5260 __ orr(tmp4, tmp4, tmp5); 5261 __ orr(rscratch1, rscratch1, rscratch2); 5262 __ orr(tmp6, tmp6, tmp1); 5263 __ orr(tmp2, tmp2, tmp4); 5264 __ orr(rscratch1, rscratch1, tmp6); 5265 __ orr(tmp2, tmp2, rscratch1); 5266 __ tst(tmp2, UPPER_BIT_MASK); 5267 __ br(Assembler::NE, RET_ADJUST_LONG); 5268 __ cmp(len, large_loop_size); 5269 __ br(Assembler::GE, LARGE_LOOP); 5270 5271 __ bind(CHECK_16); // small 16-byte load pre-loop 5272 __ cmp(len, (u1)16); 5273 __ br(Assembler::LT, POST_LOOP16); 5274 5275 __ bind(LOOP16); // small 16-byte load loop 5276 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 5277 __ sub(len, len, 16); 5278 __ orr(tmp2, tmp2, tmp3); 5279 __ tst(tmp2, UPPER_BIT_MASK); 5280 __ br(Assembler::NE, RET_ADJUST_16); 5281 __ cmp(len, (u1)16); 5282 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 5283 5284 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 5285 __ cmp(len, (u1)8); 5286 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 5287 __ ldr(tmp3, Address(__ post(ary1, 8))); 5288 __ tst(tmp3, UPPER_BIT_MASK); 5289 __ br(Assembler::NE, RET_ADJUST); 5290 __ sub(len, len, 8); 5291 5292 __ bind(POST_LOOP16_LOAD_TAIL); 5293 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 5294 __ ldr(tmp1, Address(ary1)); 5295 __ mov(tmp2, 64); 5296 __ sub(tmp4, tmp2, len, __ LSL, 3); 5297 __ lslv(tmp1, tmp1, tmp4); 5298 __ tst(tmp1, UPPER_BIT_MASK); 5299 __ br(Assembler::NE, RET_ADJUST); 5300 // Fallthrough 5301 5302 __ bind(RET_LEN); 5303 __ pop(spilled_regs, sp); 5304 __ leave(); 5305 __ ret(lr); 5306 5307 // difference result - len is the count of guaranteed to be 5308 // positive bytes 5309 5310 __ bind(RET_ADJUST_LONG); 5311 __ add(len, len, (u1)(large_loop_size - 16)); 5312 __ bind(RET_ADJUST_16); 5313 __ add(len, len, 16); 5314 __ bind(RET_ADJUST); 5315 __ pop(spilled_regs, sp); 5316 __ leave(); 5317 __ sub(result, result, len); 5318 __ ret(lr); 5319 5320 return entry; 5321 } 5322 5323 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 5324 bool usePrefetch, Label &NOT_EQUAL) { 5325 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5326 tmp2 = 
rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5327 tmp7 = r12, tmp8 = r13; 5328 Label LOOP; 5329 5330 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5331 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5332 __ bind(LOOP); 5333 if (usePrefetch) { 5334 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5335 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5336 } 5337 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5338 __ eor(tmp1, tmp1, tmp2); 5339 __ eor(tmp3, tmp3, tmp4); 5340 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5341 __ orr(tmp1, tmp1, tmp3); 5342 __ cbnz(tmp1, NOT_EQUAL); 5343 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5344 __ eor(tmp5, tmp5, tmp6); 5345 __ eor(tmp7, tmp7, tmp8); 5346 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5347 __ orr(tmp5, tmp5, tmp7); 5348 __ cbnz(tmp5, NOT_EQUAL); 5349 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5350 __ eor(tmp1, tmp1, tmp2); 5351 __ eor(tmp3, tmp3, tmp4); 5352 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5353 __ orr(tmp1, tmp1, tmp3); 5354 __ cbnz(tmp1, NOT_EQUAL); 5355 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5356 __ eor(tmp5, tmp5, tmp6); 5357 __ sub(cnt1, cnt1, 8 * wordSize); 5358 __ eor(tmp7, tmp7, tmp8); 5359 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5360 // tmp6 is not used. MacroAssembler::subs is used here (rather than 5361 // cmp) because subs allows an unlimited range of immediate operand. 5362 __ subs(tmp6, cnt1, loopThreshold); 5363 __ orr(tmp5, tmp5, tmp7); 5364 __ cbnz(tmp5, NOT_EQUAL); 5365 __ br(__ GE, LOOP); 5366 // post-loop 5367 __ eor(tmp1, tmp1, tmp2); 5368 __ eor(tmp3, tmp3, tmp4); 5369 __ orr(tmp1, tmp1, tmp3); 5370 __ sub(cnt1, cnt1, 2 * wordSize); 5371 __ cbnz(tmp1, NOT_EQUAL); 5372 } 5373 5374 void generate_large_array_equals_loop_simd(int loopThreshold, 5375 bool usePrefetch, Label &NOT_EQUAL) { 5376 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5377 tmp2 = rscratch2; 5378 Label LOOP; 5379 5380 __ bind(LOOP); 5381 if (usePrefetch) { 5382 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5383 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5384 } 5385 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 5386 __ sub(cnt1, cnt1, 8 * wordSize); 5387 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 5388 __ subs(tmp1, cnt1, loopThreshold); 5389 __ eor(v0, __ T16B, v0, v4); 5390 __ eor(v1, __ T16B, v1, v5); 5391 __ eor(v2, __ T16B, v2, v6); 5392 __ eor(v3, __ T16B, v3, v7); 5393 __ orr(v0, __ T16B, v0, v1); 5394 __ orr(v1, __ T16B, v2, v3); 5395 __ orr(v0, __ T16B, v0, v1); 5396 __ umov(tmp1, v0, __ D, 0); 5397 __ umov(tmp2, v0, __ D, 1); 5398 __ orr(tmp1, tmp1, tmp2); 5399 __ cbnz(tmp1, NOT_EQUAL); 5400 __ br(__ GE, LOOP); 5401 } 5402 5403 // a1 = r1 - array1 address 5404 // a2 = r2 - array2 address 5405 // result = r0 - return value. Already contains "false" 5406 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 5407 // r3-r5 are reserved temporary registers 5408 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 5409 address generate_large_array_equals() { 5410 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5411 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5412 tmp7 = r12, tmp8 = r13; 5413 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 5414 SMALL_LOOP, POST_LOOP; 5415 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 5416 // calculate if at least 32 prefetched bytes are used 5417 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 5418 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 5419 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 5420 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 5421 tmp5, tmp6, tmp7, tmp8); 5422 5423 __ align(CodeEntryAlignment); 5424 5425 StubGenStubId stub_id = StubGenStubId::large_array_equals_id; 5426 StubCodeMark mark(this, stub_id); 5427 5428 address entry = __ pc(); 5429 __ enter(); 5430 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 5431 // also advance pointers to use post-increment instead of pre-increment 5432 __ add(a1, a1, wordSize); 5433 __ add(a2, a2, wordSize); 5434 if (AvoidUnalignedAccesses) { 5435 // both implementations (SIMD/nonSIMD) are using relatively large load 5436 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 5437 // on some CPUs in case of address is not at least 16-byte aligned. 5438 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 5439 // load if needed at least for 1st address and make if 16-byte aligned. 5440 Label ALIGNED16; 5441 __ tbz(a1, 3, ALIGNED16); 5442 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5443 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5444 __ sub(cnt1, cnt1, wordSize); 5445 __ eor(tmp1, tmp1, tmp2); 5446 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 5447 __ bind(ALIGNED16); 5448 } 5449 if (UseSIMDForArrayEquals) { 5450 if (SoftwarePrefetchHintDistance >= 0) { 5451 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5452 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5453 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 5454 /* prfm = */ true, NOT_EQUAL); 5455 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5456 __ br(__ LT, TAIL); 5457 } 5458 __ bind(NO_PREFETCH_LARGE_LOOP); 5459 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 5460 /* prfm = */ false, NOT_EQUAL); 5461 } else { 5462 __ push(spilled_regs, sp); 5463 if (SoftwarePrefetchHintDistance >= 0) { 5464 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5465 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5466 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 5467 /* prfm = */ true, NOT_EQUAL); 5468 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5469 __ br(__ LT, TAIL); 5470 } 5471 __ bind(NO_PREFETCH_LARGE_LOOP); 5472 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 5473 /* prfm = */ false, NOT_EQUAL); 5474 } 5475 __ bind(TAIL); 5476 __ cbz(cnt1, EQUAL); 5477 __ subs(cnt1, cnt1, wordSize); 5478 __ br(__ LE, POST_LOOP); 5479 __ bind(SMALL_LOOP); 5480 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5481 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5482 __ subs(cnt1, cnt1, wordSize); 5483 __ eor(tmp1, tmp1, tmp2); 5484 __ cbnz(tmp1, NOT_EQUAL); 5485 __ br(__ GT, SMALL_LOOP); 5486 __ bind(POST_LOOP); 5487 __ ldr(tmp1, Address(a1, cnt1)); 5488 __ ldr(tmp2, Address(a2, cnt1)); 5489 __ eor(tmp1, tmp1, tmp2); 5490 __ cbnz(tmp1, NOT_EQUAL); 5491 __ bind(EQUAL); 5492 __ mov(result, true); 5493 __ bind(NOT_EQUAL); 5494 if (!UseSIMDForArrayEquals) { 5495 __ pop(spilled_regs, sp); 5496 } 5497 __ bind(NOT_EQUAL_NO_POP); 5498 __ leave(); 5499 __ ret(lr); 5500 return entry; 5501 } 5502 5503 // result = r0 - return value. Contains initial hashcode value on entry. 
5504 // ary = r1 - array address 5505 // cnt = r2 - elements count 5506 // Clobbers: v0-v13, rscratch1, rscratch2 5507 address generate_large_arrays_hashcode(BasicType eltype) { 5508 const Register result = r0, ary = r1, cnt = r2; 5509 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 5510 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 5511 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 5512 const FloatRegister vpowm = v13; 5513 5514 ARRAYS_HASHCODE_REGISTERS; 5515 5516 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 5517 5518 unsigned int vf; // vectorization factor 5519 bool multiply_by_halves; 5520 Assembler::SIMD_Arrangement load_arrangement; 5521 switch (eltype) { 5522 case T_BOOLEAN: 5523 case T_BYTE: 5524 load_arrangement = Assembler::T8B; 5525 multiply_by_halves = true; 5526 vf = 8; 5527 break; 5528 case T_CHAR: 5529 case T_SHORT: 5530 load_arrangement = Assembler::T8H; 5531 multiply_by_halves = true; 5532 vf = 8; 5533 break; 5534 case T_INT: 5535 load_arrangement = Assembler::T4S; 5536 multiply_by_halves = false; 5537 vf = 4; 5538 break; 5539 default: 5540 ShouldNotReachHere(); 5541 } 5542 5543 // Unroll factor 5544 const unsigned uf = 4; 5545 5546 // Effective vectorization factor 5547 const unsigned evf = vf * uf; 5548 5549 __ align(CodeEntryAlignment); 5550 5551 StubGenStubId stub_id; 5552 switch (eltype) { 5553 case T_BOOLEAN: 5554 stub_id = StubGenStubId::large_arrays_hashcode_boolean_id; 5555 break; 5556 case T_BYTE: 5557 stub_id = StubGenStubId::large_arrays_hashcode_byte_id; 5558 break; 5559 case T_CHAR: 5560 stub_id = StubGenStubId::large_arrays_hashcode_char_id; 5561 break; 5562 case T_SHORT: 5563 stub_id = StubGenStubId::large_arrays_hashcode_short_id; 5564 break; 5565 case T_INT: 5566 stub_id = StubGenStubId::large_arrays_hashcode_int_id; 5567 break; 5568 default: 5569 stub_id = StubGenStubId::NO_STUBID; 5570 ShouldNotReachHere(); 5571 }; 5572 5573 StubCodeMark mark(this, stub_id); 5574 5575 address entry = __ pc(); 5576 __ enter(); 5577 5578 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 5579 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 5580 // value shouldn't change throughout both loops. 5581 __ movw(rscratch1, intpow(31U, 3)); 5582 __ mov(vpow, Assembler::S, 0, rscratch1); 5583 __ movw(rscratch1, intpow(31U, 2)); 5584 __ mov(vpow, Assembler::S, 1, rscratch1); 5585 __ movw(rscratch1, intpow(31U, 1)); 5586 __ mov(vpow, Assembler::S, 2, rscratch1); 5587 __ movw(rscratch1, intpow(31U, 0)); 5588 __ mov(vpow, Assembler::S, 3, rscratch1); 5589 5590 __ mov(vmul0, Assembler::T16B, 0); 5591 __ mov(vmul0, Assembler::S, 3, result); 5592 5593 __ andr(rscratch2, cnt, (uf - 1) * vf); 5594 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 5595 5596 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf));
    __ mov(vpowm, Assembler::S, 0, rscratch1);

    // SMALL LOOP
    __ bind(SMALL_LOOP);

    __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
    __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
    __ subsw(rscratch2, rscratch2, vf);

    if (load_arrangement == Assembler::T8B) {
      // Extend 8B to 8H to be able to use vector multiply
      // instructions
      assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
      if (is_signed_subword_type(eltype)) {
        __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      } else {
        __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      }
    }

    switch (load_arrangement) {
    case Assembler::T4S:
      __ addv(vmul0, load_arrangement, vmul0, vdata0);
      break;
    case Assembler::T8B:
    case Assembler::T8H:
      assert(is_subword_type(eltype), "subword type expected");
      if (is_signed_subword_type(eltype)) {
        __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      } else {
        __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      }
      break;
    default:
      __ should_not_reach_here();
    }

    // Process the upper half of a vector
    if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
      __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
      if (is_signed_subword_type(eltype)) {
        __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      } else {
        __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      }
    }

    __ br(Assembler::HI, SMALL_LOOP);

    // SMALL LOOP'S EPILOGUE
    __ lsr(rscratch2, cnt, exact_log2(evf));
    __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);

    __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
    __ addv(vmul0, Assembler::T4S, vmul0);
    __ umov(result, vmul0, Assembler::S, 0);

    // TAIL
    __ bind(TAIL);

    // The andr performs cnt % vf. The subtract shifted by 3 offsets past
    // vf - 1 - (cnt % vf) pairs of load + madd insns, i.e. it only executes
    // cnt % vf load + madd pairs.
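    // For reference, a scalar model of the tail handling below (illustrative
    // sketch only, not the emitted code; 'tail' stands for cnt % vf):
    //
    //   for (int i = 0; i < tail; i++) {
    //     result = 31 * result + ary[i];   // ary already points at the tail
    //   }
    //
    // The adr/sub/br sequence steps back 8 bytes (one load plus one madd
    // instruction) per remaining element from BR_BASE and branches into the
    // unrolled copy, so exactly 'tail' load + madd pairs are executed.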
5659 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 5660 __ andr(rscratch2, cnt, vf - 1); 5661 __ bind(TAIL_SHORTCUT); 5662 __ adr(rscratch1, BR_BASE); 5663 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3); 5664 __ movw(rscratch2, 0x1f); 5665 __ br(rscratch1); 5666 5667 for (size_t i = 0; i < vf - 1; ++i) { 5668 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 5669 eltype); 5670 __ maddw(result, result, rscratch2, rscratch1); 5671 } 5672 __ bind(BR_BASE); 5673 5674 __ leave(); 5675 __ ret(lr); 5676 5677 // LARGE LOOP 5678 __ bind(LARGE_LOOP_PREHEADER); 5679 5680 __ lsr(rscratch2, cnt, exact_log2(evf)); 5681 5682 if (multiply_by_halves) { 5683 // 31^4 - multiplier between lower and upper parts of a register 5684 __ movw(rscratch1, intpow(31U, vf / 2)); 5685 __ mov(vpowm, Assembler::S, 1, rscratch1); 5686 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 5687 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 5688 __ mov(vpowm, Assembler::S, 0, rscratch1); 5689 } else { 5690 // 31^16 5691 __ movw(rscratch1, intpow(31U, evf)); 5692 __ mov(vpowm, Assembler::S, 0, rscratch1); 5693 } 5694 5695 __ mov(vmul3, Assembler::T16B, 0); 5696 __ mov(vmul2, Assembler::T16B, 0); 5697 __ mov(vmul1, Assembler::T16B, 0); 5698 5699 __ bind(LARGE_LOOP); 5700 5701 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 5702 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 5703 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 5704 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 5705 5706 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 5707 Address(__ post(ary, evf * type2aelembytes(eltype)))); 5708 5709 if (load_arrangement == Assembler::T8B) { 5710 // Extend 8B to 8H to be able to use vector multiply 5711 // instructions 5712 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 5713 if (is_signed_subword_type(eltype)) { 5714 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 5715 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 5716 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 5717 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5718 } else { 5719 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 5720 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 5721 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 5722 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5723 } 5724 } 5725 5726 switch (load_arrangement) { 5727 case Assembler::T4S: 5728 __ addv(vmul3, load_arrangement, vmul3, vdata3); 5729 __ addv(vmul2, load_arrangement, vmul2, vdata2); 5730 __ addv(vmul1, load_arrangement, vmul1, vdata1); 5731 __ addv(vmul0, load_arrangement, vmul0, vdata0); 5732 break; 5733 case Assembler::T8B: 5734 case Assembler::T8H: 5735 assert(is_subword_type(eltype), "subword type expected"); 5736 if (is_signed_subword_type(eltype)) { 5737 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 5738 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 5739 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 5740 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5741 } else { 5742 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 5743 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 5744 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 5745 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5746 } 5747 break; 5748 default: 5749 __ should_not_reach_here(); 
5750 } 5751 5752 // Process the upper half of a vector 5753 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 5754 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 5755 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 5756 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 5757 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 5758 if (is_signed_subword_type(eltype)) { 5759 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 5760 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 5761 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 5762 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 5763 } else { 5764 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 5765 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 5766 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 5767 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 5768 } 5769 } 5770 5771 __ subsw(rscratch2, rscratch2, 1); 5772 __ br(Assembler::HI, LARGE_LOOP); 5773 5774 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 5775 __ addv(vmul3, Assembler::T4S, vmul3); 5776 __ umov(result, vmul3, Assembler::S, 0); 5777 5778 __ mov(rscratch2, intpow(31U, vf)); 5779 5780 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 5781 __ addv(vmul2, Assembler::T4S, vmul2); 5782 __ umov(rscratch1, vmul2, Assembler::S, 0); 5783 __ maddw(result, result, rscratch2, rscratch1); 5784 5785 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 5786 __ addv(vmul1, Assembler::T4S, vmul1); 5787 __ umov(rscratch1, vmul1, Assembler::S, 0); 5788 __ maddw(result, result, rscratch2, rscratch1); 5789 5790 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 5791 __ addv(vmul0, Assembler::T4S, vmul0); 5792 __ umov(rscratch1, vmul0, Assembler::S, 0); 5793 __ maddw(result, result, rscratch2, rscratch1); 5794 5795 __ andr(rscratch2, cnt, vf - 1); 5796 __ cbnz(rscratch2, TAIL_SHORTCUT); 5797 5798 __ leave(); 5799 __ ret(lr); 5800 5801 return entry; 5802 } 5803 5804 address generate_dsin_dcos(bool isCos) { 5805 __ align(CodeEntryAlignment); 5806 StubGenStubId stub_id = (isCos ? 
StubGenStubId::dcos_id : StubGenStubId::dsin_id); 5807 StubCodeMark mark(this, stub_id); 5808 address start = __ pc(); 5809 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5810 (address)StubRoutines::aarch64::_two_over_pi, 5811 (address)StubRoutines::aarch64::_pio2, 5812 (address)StubRoutines::aarch64::_dsin_coef, 5813 (address)StubRoutines::aarch64::_dcos_coef); 5814 return start; 5815 } 5816 5817 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5818 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5819 Label &DIFF2) { 5820 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5821 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5822 5823 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5824 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5825 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5826 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5827 5828 __ fmovd(tmpL, vtmp3); 5829 __ eor(rscratch2, tmp3, tmpL); 5830 __ cbnz(rscratch2, DIFF2); 5831 5832 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5833 __ umov(tmpL, vtmp3, __ D, 1); 5834 __ eor(rscratch2, tmpU, tmpL); 5835 __ cbnz(rscratch2, DIFF1); 5836 5837 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5838 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5839 __ fmovd(tmpL, vtmp); 5840 __ eor(rscratch2, tmp3, tmpL); 5841 __ cbnz(rscratch2, DIFF2); 5842 5843 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5844 __ umov(tmpL, vtmp, __ D, 1); 5845 __ eor(rscratch2, tmpU, tmpL); 5846 __ cbnz(rscratch2, DIFF1); 5847 } 5848 5849 // r0 = result 5850 // r1 = str1 5851 // r2 = cnt1 5852 // r3 = str2 5853 // r4 = cnt2 5854 // r10 = tmp1 5855 // r11 = tmp2 5856 address generate_compare_long_string_different_encoding(bool isLU) { 5857 __ align(CodeEntryAlignment); 5858 StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id); 5859 StubCodeMark mark(this, stub_id); 5860 address entry = __ pc(); 5861 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5862 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5863 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5864 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5865 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5866 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5867 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5868 5869 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5870 5871 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5872 // cnt2 == amount of characters left to compare 5873 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5874 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5875 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5876 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5877 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5878 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5879 __ eor(rscratch2, tmp1, tmp2); 5880 __ mov(rscratch1, tmp2); 5881 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5882 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 5883 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5884 __ push(spilled_regs, sp); 5885 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5886 __ mov(cnt1, isLU ? 
str2 : str1); // init the pointer to U next load 5887 5888 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5889 5890 if (SoftwarePrefetchHintDistance >= 0) { 5891 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5892 __ br(__ LT, NO_PREFETCH); 5893 __ bind(LARGE_LOOP_PREFETCH); 5894 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 5895 __ mov(tmp4, 2); 5896 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5897 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 5898 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5899 __ subs(tmp4, tmp4, 1); 5900 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 5901 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5902 __ mov(tmp4, 2); 5903 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 5904 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5905 __ subs(tmp4, tmp4, 1); 5906 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 5907 __ sub(cnt2, cnt2, 64); 5908 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5909 __ br(__ GE, LARGE_LOOP_PREFETCH); 5910 } 5911 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 5912 __ bind(NO_PREFETCH); 5913 __ subs(cnt2, cnt2, 16); 5914 __ br(__ LT, TAIL); 5915 __ align(OptoLoopAlignment); 5916 __ bind(SMALL_LOOP); // smaller loop 5917 __ subs(cnt2, cnt2, 16); 5918 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5919 __ br(__ GE, SMALL_LOOP); 5920 __ cmn(cnt2, (u1)16); 5921 __ br(__ EQ, LOAD_LAST); 5922 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 5923 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 5924 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 5925 __ ldr(tmp3, Address(cnt1, -8)); 5926 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 5927 __ b(LOAD_LAST); 5928 __ bind(DIFF2); 5929 __ mov(tmpU, tmp3); 5930 __ bind(DIFF1); 5931 __ pop(spilled_regs, sp); 5932 __ b(CALCULATE_DIFFERENCE); 5933 __ bind(LOAD_LAST); 5934 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 5935 // No need to load it again 5936 __ mov(tmpU, tmp3); 5937 __ pop(spilled_regs, sp); 5938 5939 // tmp2 points to the address of the last 4 Latin1 characters right now 5940 __ ldrs(vtmp, Address(tmp2)); 5941 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5942 __ fmovd(tmpL, vtmp); 5943 5944 __ eor(rscratch2, tmpU, tmpL); 5945 __ cbz(rscratch2, DONE); 5946 5947 // Find the first different characters in the longwords and 5948 // compute their difference. 
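    // A scalar model of the difference computation below (illustrative only;
    // 'a' and 'b' stand for the two 64-bit chunks being compared, both already
    // widened to UTF-16 here, and 'diff' for their XOR kept in rscratch2):
    //
    //   int idx = clz(byte_reverse(diff)) & ~15;  // bit offset of first differing char
    //   result  = (int)((a >> idx) & 0xffff) - (int)((b >> idx) & 0xffff);
    //
    // rev + clz locate the lowest differing byte; rounding the bit index down
    // to a multiple of 16 selects the 16-bit character that contains it.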
5949 __ bind(CALCULATE_DIFFERENCE); 5950 __ rev(rscratch2, rscratch2); 5951 __ clz(rscratch2, rscratch2); 5952 __ andr(rscratch2, rscratch2, -16); 5953 __ lsrv(tmp1, tmp1, rscratch2); 5954 __ uxthw(tmp1, tmp1); 5955 __ lsrv(rscratch1, rscratch1, rscratch2); 5956 __ uxthw(rscratch1, rscratch1); 5957 __ subw(result, tmp1, rscratch1); 5958 __ bind(DONE); 5959 __ ret(lr); 5960 return entry; 5961 } 5962 5963 // r0 = input (float16) 5964 // v0 = result (float) 5965 // v1 = temporary float register 5966 address generate_float16ToFloat() { 5967 __ align(CodeEntryAlignment); 5968 StubGenStubId stub_id = StubGenStubId::hf2f_id; 5969 StubCodeMark mark(this, stub_id); 5970 address entry = __ pc(); 5971 BLOCK_COMMENT("Entry:"); 5972 __ flt16_to_flt(v0, r0, v1); 5973 __ ret(lr); 5974 return entry; 5975 } 5976 5977 // v0 = input (float) 5978 // r0 = result (float16) 5979 // v1 = temporary float register 5980 address generate_floatToFloat16() { 5981 __ align(CodeEntryAlignment); 5982 StubGenStubId stub_id = StubGenStubId::f2hf_id; 5983 StubCodeMark mark(this, stub_id); 5984 address entry = __ pc(); 5985 BLOCK_COMMENT("Entry:"); 5986 __ flt_to_flt16(r0, v0, v1); 5987 __ ret(lr); 5988 return entry; 5989 } 5990 5991 address generate_method_entry_barrier() { 5992 __ align(CodeEntryAlignment); 5993 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 5994 StubCodeMark mark(this, stub_id); 5995 5996 Label deoptimize_label; 5997 5998 address start = __ pc(); 5999 6000 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 6001 6002 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 6003 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 6004 // We can get here despite the nmethod being good, if we have not 6005 // yet applied our cross modification fence (or data fence). 6006 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 6007 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 6008 __ ldrw(rscratch2, rscratch2); 6009 __ strw(rscratch2, thread_epoch_addr); 6010 __ isb(); 6011 __ membar(__ LoadLoad); 6012 } 6013 6014 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 6015 6016 __ enter(); 6017 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 6018 6019 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 6020 6021 __ push_call_clobbered_registers(); 6022 6023 __ mov(c_rarg0, rscratch2); 6024 __ call_VM_leaf 6025 (CAST_FROM_FN_PTR 6026 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 6027 6028 __ reset_last_Java_frame(true); 6029 6030 __ mov(rscratch1, r0); 6031 6032 __ pop_call_clobbered_registers(); 6033 6034 __ cbnz(rscratch1, deoptimize_label); 6035 6036 __ leave(); 6037 __ ret(lr); 6038 6039 __ BIND(deoptimize_label); 6040 6041 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 6042 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 6043 6044 __ mov(sp, rscratch1); 6045 __ br(rscratch2); 6046 6047 return start; 6048 } 6049 6050 // r0 = result 6051 // r1 = str1 6052 // r2 = cnt1 6053 // r3 = str2 6054 // r4 = cnt2 6055 // r10 = tmp1 6056 // r11 = tmp2 6057 address generate_compare_long_string_same_encoding(bool isLL) { 6058 __ align(CodeEntryAlignment); 6059 StubGenStubId stub_id = (isLL ? 
StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id); 6060 StubCodeMark mark(this, stub_id); 6061 address entry = __ pc(); 6062 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 6063 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 6064 6065 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 6066 6067 // exit from large loop when less than 64 bytes left to read or we're about 6068 // to prefetch memory behind array border 6069 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 6070 6071 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 6072 __ eor(rscratch2, tmp1, tmp2); 6073 __ cbnz(rscratch2, CAL_DIFFERENCE); 6074 6075 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 6076 // update pointers, because of previous read 6077 __ add(str1, str1, wordSize); 6078 __ add(str2, str2, wordSize); 6079 if (SoftwarePrefetchHintDistance >= 0) { 6080 __ align(OptoLoopAlignment); 6081 __ bind(LARGE_LOOP_PREFETCH); 6082 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 6083 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 6084 6085 for (int i = 0; i < 4; i++) { 6086 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 6087 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 6088 __ cmp(tmp1, tmp2); 6089 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 6090 __ br(Assembler::NE, DIFF); 6091 } 6092 __ sub(cnt2, cnt2, isLL ? 64 : 32); 6093 __ add(str1, str1, 64); 6094 __ add(str2, str2, 64); 6095 __ subs(rscratch2, cnt2, largeLoopExitCondition); 6096 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 6097 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 6098 } 6099 6100 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 6101 __ br(Assembler::LE, LESS16); 6102 __ align(OptoLoopAlignment); 6103 __ bind(LOOP_COMPARE16); 6104 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 6105 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 6106 __ cmp(tmp1, tmp2); 6107 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 6108 __ br(Assembler::NE, DIFF); 6109 __ sub(cnt2, cnt2, isLL ? 16 : 8); 6110 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 6111 __ br(Assembler::LT, LESS16); 6112 6113 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 6114 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 6115 __ cmp(tmp1, tmp2); 6116 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 6117 __ br(Assembler::NE, DIFF); 6118 __ sub(cnt2, cnt2, isLL ? 16 : 8); 6119 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 6120 __ br(Assembler::GE, LOOP_COMPARE16); 6121 __ cbz(cnt2, LENGTH_DIFF); 6122 6123 __ bind(LESS16); 6124 // each 8 compare 6125 __ subs(cnt2, cnt2, isLL ? 8 : 4); 6126 __ br(Assembler::LE, LESS8); 6127 __ ldr(tmp1, Address(__ post(str1, 8))); 6128 __ ldr(tmp2, Address(__ post(str2, 8))); 6129 __ eor(rscratch2, tmp1, tmp2); 6130 __ cbnz(rscratch2, CAL_DIFFERENCE); 6131 __ sub(cnt2, cnt2, isLL ? 
8 : 4); 6132 6133 __ bind(LESS8); // directly load last 8 bytes 6134 if (!isLL) { 6135 __ add(cnt2, cnt2, cnt2); 6136 } 6137 __ ldr(tmp1, Address(str1, cnt2)); 6138 __ ldr(tmp2, Address(str2, cnt2)); 6139 __ eor(rscratch2, tmp1, tmp2); 6140 __ cbz(rscratch2, LENGTH_DIFF); 6141 __ b(CAL_DIFFERENCE); 6142 6143 __ bind(DIFF); 6144 __ cmp(tmp1, tmp2); 6145 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 6146 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 6147 // reuse rscratch2 register for the result of eor instruction 6148 __ eor(rscratch2, tmp1, tmp2); 6149 6150 __ bind(CAL_DIFFERENCE); 6151 __ rev(rscratch2, rscratch2); 6152 __ clz(rscratch2, rscratch2); 6153 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 6154 __ lsrv(tmp1, tmp1, rscratch2); 6155 __ lsrv(tmp2, tmp2, rscratch2); 6156 if (isLL) { 6157 __ uxtbw(tmp1, tmp1); 6158 __ uxtbw(tmp2, tmp2); 6159 } else { 6160 __ uxthw(tmp1, tmp1); 6161 __ uxthw(tmp2, tmp2); 6162 } 6163 __ subw(result, tmp1, tmp2); 6164 6165 __ bind(LENGTH_DIFF); 6166 __ ret(lr); 6167 return entry; 6168 } 6169 6170 enum string_compare_mode { 6171 LL, 6172 LU, 6173 UL, 6174 UU, 6175 }; 6176 6177 // The following registers are declared in aarch64.ad 6178 // r0 = result 6179 // r1 = str1 6180 // r2 = cnt1 6181 // r3 = str2 6182 // r4 = cnt2 6183 // r10 = tmp1 6184 // r11 = tmp2 6185 // z0 = ztmp1 6186 // z1 = ztmp2 6187 // p0 = pgtmp1 6188 // p1 = pgtmp2 6189 address generate_compare_long_string_sve(string_compare_mode mode) { 6190 StubGenStubId stub_id; 6191 switch (mode) { 6192 case LL: stub_id = StubGenStubId::compare_long_string_LL_id; break; 6193 case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break; 6194 case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break; 6195 case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break; 6196 default: ShouldNotReachHere(); 6197 } 6198 6199 __ align(CodeEntryAlignment); 6200 address entry = __ pc(); 6201 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 6202 tmp1 = r10, tmp2 = r11; 6203 6204 Label LOOP, DONE, MISMATCH; 6205 Register vec_len = tmp1; 6206 Register idx = tmp2; 6207 // The minimum of the string lengths has been stored in cnt2. 6208 Register cnt = cnt2; 6209 FloatRegister ztmp1 = z0, ztmp2 = z1; 6210 PRegister pgtmp1 = p0, pgtmp2 = p1; 6211 6212 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 6213 switch (mode) { \ 6214 case LL: \ 6215 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 6216 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 6217 break; \ 6218 case LU: \ 6219 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 6220 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 6221 break; \ 6222 case UL: \ 6223 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 6224 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 6225 break; \ 6226 case UU: \ 6227 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 6228 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 6229 break; \ 6230 default: \ 6231 ShouldNotReachHere(); \ 6232 } 6233 6234 StubCodeMark mark(this, stub_id); 6235 6236 __ mov(idx, 0); 6237 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 6238 6239 if (mode == LL) { 6240 __ sve_cntb(vec_len); 6241 } else { 6242 __ sve_cnth(vec_len); 6243 } 6244 6245 __ sub(rscratch1, cnt, vec_len); 6246 6247 __ bind(LOOP); 6248 6249 // main loop 6250 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 6251 __ add(idx, idx, vec_len); 6252 // Compare strings. 
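    // A rough scalar model of this predicated loop (illustrative only; one
    // SVE lane per character, vec_len lanes consumed per iteration):
    //
    //   while (idx < cnt - vec_len) {
    //     // load up to vec_len chars of each string under predicate pgtmp1,
    //     // widening Latin1 bytes to halfwords where the encodings differ
    //     // pgtmp2 = active lanes whose characters differ
    //     // if any lane in pgtmp2 is set -> MISMATCH
    //     idx += vec_len;
    //   }
    //   // one more predicated iteration covers the tail (whilelt limits the
    //   // active lanes to the characters that remain)
    //
    // On a mismatch, brkb builds a predicate covering the lanes before the
    // first difference, and lasta then extracts the element just after the
    // last active lane, i.e. the first differing character of each string.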
    __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
    __ br(__ NE, MISMATCH);
    __ cmp(idx, rscratch1);
    __ br(__ LT, LOOP);

    // post loop, last iteration
    __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);

    LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
    __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
    __ br(__ EQ, DONE);

    __ bind(MISMATCH);

    // Crop the vector to find its location.
    __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
    // Extract the first different characters of each string.
    __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
    __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);

    // Compute the difference of the first different characters.
    __ sub(result, rscratch1, rscratch2);

    __ bind(DONE);
    __ ret(lr);
#undef LOAD_PAIR
    return entry;
  }

  void generate_compare_long_strings() {
    if (UseSVE == 0) {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_same_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_same_encoding(false);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_different_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_different_encoding(false);
    } else {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_sve(LL);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_sve(UU);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_sve(LU);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_sve(UL);
    }
  }

  // R0 = result
  // R1 = str2
  // R2 = cnt1
  // R3 = str1
  // R4 = cnt2
  // Clobbers: rscratch1, rscratch2, v0, v1, rflags
  //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) since the pattern length is >= 8, we can safely keep at least the first
  //    register of the pattern resident and skip reloading it (this helps on
  //    systems with a single load pipeline)
  // 2) we use the "fast" single-character search algorithm to look for the
  //    first symbol, which needs fewer branches (one branch per loaded
  //    register instead of one per symbol); this is where constants like
  //    0x0101...01, 0x00010001...0001, 0x7f7f...7f and 0x7fff7fff...7fff
  //    come from
  // 3) after the first register of the source string has been loaded and
  //    analyzed, it can be used to search for every occurrence of the first
  //    character, saving a few loads compared with a simpler-but-slower
  //    implementation
  // 4) to avoid a lot of push/pop operations, the code below heavily re-uses,
  //    re-initializes and compresses register values, which makes the code
  //    larger and a bit less readable; however, most of the extra operations
  //    are issued during loads or branches, so the penalty is minimal
  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
    StubGenStubId stub_id;
    if (str1_isL) {
      if (str2_isL) {
        stub_id = StubGenStubId::string_indexof_linear_ll_id;
      } else {
        stub_id = StubGenStubId::string_indexof_linear_ul_id;
      }
    } else {
      if (str2_isL) {
        ShouldNotReachHere();
      } else {
        stub_id =
StubGenStubId::string_indexof_linear_uu_id; 6338 } 6339 } 6340 __ align(CodeEntryAlignment); 6341 StubCodeMark mark(this, stub_id); 6342 address entry = __ pc(); 6343 6344 int str1_chr_size = str1_isL ? 1 : 2; 6345 int str2_chr_size = str2_isL ? 1 : 2; 6346 int str1_chr_shift = str1_isL ? 0 : 1; 6347 int str2_chr_shift = str2_isL ? 0 : 1; 6348 bool isL = str1_isL && str2_isL; 6349 // parameters 6350 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 6351 // temporary registers 6352 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 6353 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 6354 // redefinitions 6355 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 6356 6357 __ push(spilled_regs, sp); 6358 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 6359 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 6360 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 6361 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 6362 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 6363 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 6364 // Read whole register from str1. It is safe, because length >=8 here 6365 __ ldr(ch1, Address(str1)); 6366 // Read whole register from str2. It is safe, because length >=8 here 6367 __ ldr(ch2, Address(str2)); 6368 __ sub(cnt2, cnt2, cnt1); 6369 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 6370 if (str1_isL != str2_isL) { 6371 __ eor(v0, __ T16B, v0, v0); 6372 } 6373 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 6374 __ mul(first, first, tmp1); 6375 // check if we have less than 1 register to check 6376 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 6377 if (str1_isL != str2_isL) { 6378 __ fmovd(v1, ch1); 6379 } 6380 __ br(__ LE, L_SMALL); 6381 __ eor(ch2, first, ch2); 6382 if (str1_isL != str2_isL) { 6383 __ zip1(v1, __ T16B, v1, v0); 6384 } 6385 __ sub(tmp2, ch2, tmp1); 6386 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6387 __ bics(tmp2, tmp2, ch2); 6388 if (str1_isL != str2_isL) { 6389 __ fmovd(ch1, v1); 6390 } 6391 __ br(__ NE, L_HAS_ZERO); 6392 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 6393 __ add(result, result, wordSize/str2_chr_size); 6394 __ add(str2, str2, wordSize); 6395 __ br(__ LT, L_POST_LOOP); 6396 __ BIND(L_LOOP); 6397 __ ldr(ch2, Address(str2)); 6398 __ eor(ch2, first, ch2); 6399 __ sub(tmp2, ch2, tmp1); 6400 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6401 __ bics(tmp2, tmp2, ch2); 6402 __ br(__ NE, L_HAS_ZERO); 6403 __ BIND(L_LOOP_PROCEED); 6404 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 6405 __ add(str2, str2, wordSize); 6406 __ add(result, result, wordSize/str2_chr_size); 6407 __ br(__ GE, L_LOOP); 6408 __ BIND(L_POST_LOOP); 6409 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 6410 __ br(__ LE, NOMATCH); 6411 __ ldr(ch2, Address(str2)); 6412 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 6413 __ eor(ch2, first, ch2); 6414 __ sub(tmp2, ch2, tmp1); 6415 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6416 __ mov(tmp4, -1); // all bits set 6417 __ b(L_SMALL_PROCEED); 6418 __ align(OptoLoopAlignment); 6419 __ BIND(L_SMALL); 6420 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 6421 __ eor(ch2, first, ch2); 6422 if (str1_isL != str2_isL) { 6423 __ zip1(v1, __ T16B, v1, v0); 6424 } 6425 __ sub(tmp2, ch2, tmp1); 6426 __ mov(tmp4, -1); // all bits set 6427 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6428 if (str1_isL != str2_isL) { 6429 __ fmovd(ch1, v1); // move converted 4 symbols 6430 } 6431 __ BIND(L_SMALL_PROCEED); 6432 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 6433 __ bic(tmp2, tmp2, ch2); 6434 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 6435 __ rbit(tmp2, tmp2); 6436 __ br(__ EQ, NOMATCH); 6437 __ BIND(L_SMALL_HAS_ZERO_LOOP); 6438 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 6439 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 6440 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 6441 if (str2_isL) { // LL 6442 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 6443 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 6444 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 6445 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 6446 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6447 } else { 6448 __ mov(ch2, 0xE); // all bits in byte set except last one 6449 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6450 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6451 __ lslv(tmp2, tmp2, tmp4); 6452 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6453 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6454 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6455 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6456 } 6457 __ cmp(ch1, ch2); 6458 __ mov(tmp4, wordSize/str2_chr_size); 6459 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6460 __ BIND(L_SMALL_CMP_LOOP); 6461 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 6462 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 6463 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 6464 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 6465 __ add(tmp4, tmp4, 1); 6466 __ cmp(tmp4, cnt1); 6467 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 6468 __ cmp(first, ch2); 6469 __ br(__ EQ, L_SMALL_CMP_LOOP); 6470 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 6471 __ cbz(tmp2, NOMATCH); // no more matches. exit 6472 __ clz(tmp4, tmp2); 6473 __ add(result, result, 1); // advance index 6474 __ add(str2, str2, str2_chr_size); // advance pointer 6475 __ b(L_SMALL_HAS_ZERO_LOOP); 6476 __ align(OptoLoopAlignment); 6477 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 6478 __ cmp(first, ch2); 6479 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6480 __ b(DONE); 6481 __ align(OptoLoopAlignment); 6482 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 6483 if (str2_isL) { // LL 6484 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 6485 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 6486 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 6487 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 6488 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6489 } else { 6490 __ mov(ch2, 0xE); // all bits in byte set except last one 6491 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6492 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
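    // Note on the 0xE mask above: tmp4 >> LogBitsPerByte is the byte index of
    // the candidate match inside the loaded register, and that index is at
    // most 7 here, so the AND with 0xE (0b1110) simply clears bit 0, i.e.
    //
    //   byte_offset = (bit_pos >> 3) & ~1;
    //
    // which rounds the offset down to the start of the containing UTF-16
    // character before str2 is reloaded at that (char-aligned) offset.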
6493 __ lslv(tmp2, tmp2, tmp4); 6494 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6495 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6496 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6497 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6498 } 6499 __ cmp(ch1, ch2); 6500 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6501 __ b(DONE); 6502 __ align(OptoLoopAlignment); 6503 __ BIND(L_HAS_ZERO); 6504 __ rbit(tmp2, tmp2); 6505 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 6506 // Now, perform compression of counters(cnt2 and cnt1) into one register. 6507 // It's fine because both counters are 32bit and are not changed in this 6508 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 6509 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 6510 __ sub(result, result, 1); 6511 __ BIND(L_HAS_ZERO_LOOP); 6512 __ mov(cnt1, wordSize/str2_chr_size); 6513 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6514 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 6515 if (str2_isL) { 6516 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6517 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6518 __ lslv(tmp2, tmp2, tmp4); 6519 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6520 __ add(tmp4, tmp4, 1); 6521 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6522 __ lsl(tmp2, tmp2, 1); 6523 __ mov(tmp4, wordSize/str2_chr_size); 6524 } else { 6525 __ mov(ch2, 0xE); 6526 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6527 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6528 __ lslv(tmp2, tmp2, tmp4); 6529 __ add(tmp4, tmp4, 1); 6530 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6531 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6532 __ lsl(tmp2, tmp2, 1); 6533 __ mov(tmp4, wordSize/str2_chr_size); 6534 __ sub(str2, str2, str2_chr_size); 6535 } 6536 __ cmp(ch1, ch2); 6537 __ mov(tmp4, wordSize/str2_chr_size); 6538 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6539 __ BIND(L_CMP_LOOP); 6540 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 6541 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 6542 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 6543 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 6544 __ add(tmp4, tmp4, 1); 6545 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6546 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 6547 __ cmp(cnt1, ch2); 6548 __ br(__ EQ, L_CMP_LOOP); 6549 __ BIND(L_CMP_LOOP_NOMATCH); 6550 // here we're not matched 6551 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 6552 __ clz(tmp4, tmp2); 6553 __ add(str2, str2, str2_chr_size); // advance pointer 6554 __ b(L_HAS_ZERO_LOOP); 6555 __ align(OptoLoopAlignment); 6556 __ BIND(L_CMP_LOOP_LAST_CMP); 6557 __ cmp(cnt1, ch2); 6558 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6559 __ b(DONE); 6560 __ align(OptoLoopAlignment); 6561 __ BIND(L_CMP_LOOP_LAST_CMP2); 6562 if (str2_isL) { 6563 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6564 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
6565 __ lslv(tmp2, tmp2, tmp4); 6566 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6567 __ add(tmp4, tmp4, 1); 6568 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6569 __ lsl(tmp2, tmp2, 1); 6570 } else { 6571 __ mov(ch2, 0xE); 6572 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6573 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6574 __ lslv(tmp2, tmp2, tmp4); 6575 __ add(tmp4, tmp4, 1); 6576 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6577 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6578 __ lsl(tmp2, tmp2, 1); 6579 __ sub(str2, str2, str2_chr_size); 6580 } 6581 __ cmp(ch1, ch2); 6582 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6583 __ b(DONE); 6584 __ align(OptoLoopAlignment); 6585 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 6586 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 6587 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 6588 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 6589 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 6590 // result by analyzed characters value, so, we can just reset lower bits 6591 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 6592 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 6593 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 6594 // index of last analyzed substring inside current octet. So, str2 in at 6595 // respective start address. We need to advance it to next octet 6596 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 6597 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 6598 __ bfm(result, zr, 0, 2 - str2_chr_shift); 6599 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 6600 __ movw(cnt2, cnt2); 6601 __ b(L_LOOP_PROCEED); 6602 __ align(OptoLoopAlignment); 6603 __ BIND(NOMATCH); 6604 __ mov(result, -1); 6605 __ BIND(DONE); 6606 __ pop(spilled_regs, sp); 6607 __ ret(lr); 6608 return entry; 6609 } 6610 6611 void generate_string_indexof_stubs() { 6612 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 6613 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 6614 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 6615 } 6616 6617 void inflate_and_store_2_fp_registers(bool generatePrfm, 6618 FloatRegister src1, FloatRegister src2) { 6619 Register dst = r1; 6620 __ zip1(v1, __ T16B, src1, v0); 6621 __ zip2(v2, __ T16B, src1, v0); 6622 if (generatePrfm) { 6623 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 6624 } 6625 __ zip1(v3, __ T16B, src2, v0); 6626 __ zip2(v4, __ T16B, src2, v0); 6627 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 6628 } 6629 6630 // R0 = src 6631 // R1 = dst 6632 // R2 = len 6633 // R3 = len >> 3 6634 // V0 = 0 6635 // v1 = loaded 8 bytes 6636 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 6637 address generate_large_byte_array_inflate() { 6638 __ align(CodeEntryAlignment); 6639 StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id; 6640 StubCodeMark mark(this, stub_id); 6641 address entry = __ pc(); 6642 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 6643 Register src = r0, dst = r1, len = r2, octetCounter = r3; 6644 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 6645 6646 // do one more 8-byte read to have address 16-byte aligned in 
most cases 6647 // also use single store instruction 6648 __ ldrd(v2, __ post(src, 8)); 6649 __ sub(octetCounter, octetCounter, 2); 6650 __ zip1(v1, __ T16B, v1, v0); 6651 __ zip1(v2, __ T16B, v2, v0); 6652 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 6653 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6654 __ subs(rscratch1, octetCounter, large_loop_threshold); 6655 __ br(__ LE, LOOP_START); 6656 __ b(LOOP_PRFM_START); 6657 __ bind(LOOP_PRFM); 6658 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6659 __ bind(LOOP_PRFM_START); 6660 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 6661 __ sub(octetCounter, octetCounter, 8); 6662 __ subs(rscratch1, octetCounter, large_loop_threshold); 6663 inflate_and_store_2_fp_registers(true, v3, v4); 6664 inflate_and_store_2_fp_registers(true, v5, v6); 6665 __ br(__ GT, LOOP_PRFM); 6666 __ cmp(octetCounter, (u1)8); 6667 __ br(__ LT, DONE); 6668 __ bind(LOOP); 6669 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6670 __ bind(LOOP_START); 6671 __ sub(octetCounter, octetCounter, 8); 6672 __ cmp(octetCounter, (u1)8); 6673 inflate_and_store_2_fp_registers(false, v3, v4); 6674 inflate_and_store_2_fp_registers(false, v5, v6); 6675 __ br(__ GE, LOOP); 6676 __ bind(DONE); 6677 __ ret(lr); 6678 return entry; 6679 } 6680 6681 /** 6682 * Arguments: 6683 * 6684 * Input: 6685 * c_rarg0 - current state address 6686 * c_rarg1 - H key address 6687 * c_rarg2 - data address 6688 * c_rarg3 - number of blocks 6689 * 6690 * Output: 6691 * Updated state at c_rarg0 6692 */ 6693 address generate_ghash_processBlocks() { 6694 // Bafflingly, GCM uses little-endian for the byte order, but 6695 // big-endian for the bit order. For example, the polynomial 1 is 6696 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 6697 // 6698 // So, we must either reverse the bytes in each word and do 6699 // everything big-endian or reverse the bits in each byte and do 6700 // it little-endian. On AArch64 it's more idiomatic to reverse 6701 // the bits in each byte (we have an instruction, RBIT, to do 6702 // that) and keep the data in little-endian bit order through the 6703 // calculation, bit-reversing the inputs and outputs. 6704 6705 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id; 6706 StubCodeMark mark(this, stub_id); 6707 __ align(wordSize * 2); 6708 address p = __ pc(); 6709 __ emit_int64(0x87); // The low-order bits of the field 6710 // polynomial (i.e. 
p = z^7+z^2+z+1) 6711 // repeated in the low and high parts of a 6712 // 128-bit vector 6713 __ emit_int64(0x87); 6714 6715 __ align(CodeEntryAlignment); 6716 address start = __ pc(); 6717 6718 Register state = c_rarg0; 6719 Register subkeyH = c_rarg1; 6720 Register data = c_rarg2; 6721 Register blocks = c_rarg3; 6722 6723 FloatRegister vzr = v30; 6724 __ eor(vzr, __ T16B, vzr, vzr); // zero register 6725 6726 __ ldrq(v24, p); // The field polynomial 6727 6728 __ ldrq(v0, Address(state)); 6729 __ ldrq(v1, Address(subkeyH)); 6730 6731 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 6732 __ rbit(v0, __ T16B, v0); 6733 __ rev64(v1, __ T16B, v1); 6734 __ rbit(v1, __ T16B, v1); 6735 6736 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 6737 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 6738 6739 { 6740 Label L_ghash_loop; 6741 __ bind(L_ghash_loop); 6742 6743 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 6744 // reversing each byte 6745 __ rbit(v2, __ T16B, v2); 6746 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 6747 6748 // Multiply state in v2 by subkey in v1 6749 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 6750 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 6751 /*temps*/v6, v3, /*reuse/clobber b*/v2); 6752 // Reduce v7:v5 by the field polynomial 6753 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 6754 6755 __ sub(blocks, blocks, 1); 6756 __ cbnz(blocks, L_ghash_loop); 6757 } 6758 6759 // The bit-reversed result is at this point in v0 6760 __ rev64(v0, __ T16B, v0); 6761 __ rbit(v0, __ T16B, v0); 6762 6763 __ st1(v0, __ T16B, state); 6764 __ ret(lr); 6765 6766 return start; 6767 } 6768 6769 address generate_ghash_processBlocks_wide() { 6770 address small = generate_ghash_processBlocks(); 6771 6772 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id; 6773 StubCodeMark mark(this, stub_id); 6774 __ align(wordSize * 2); 6775 address p = __ pc(); 6776 __ emit_int64(0x87); // The low-order bits of the field 6777 // polynomial (i.e. p = z^7+z^2+z+1) 6778 // repeated in the low and high parts of a 6779 // 128-bit vector 6780 __ emit_int64(0x87); 6781 6782 __ align(CodeEntryAlignment); 6783 address start = __ pc(); 6784 6785 Register state = c_rarg0; 6786 Register subkeyH = c_rarg1; 6787 Register data = c_rarg2; 6788 Register blocks = c_rarg3; 6789 6790 const int unroll = 4; 6791 6792 __ cmp(blocks, (unsigned char)(unroll * 2)); 6793 __ br(__ LT, small); 6794 6795 if (unroll > 1) { 6796 // Save state before entering routine 6797 __ sub(sp, sp, 4 * 16); 6798 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 6799 __ sub(sp, sp, 4 * 16); 6800 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 6801 } 6802 6803 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 6804 6805 if (unroll > 1) { 6806 // And restore state 6807 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 6808 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 6809 } 6810 6811 __ cmp(blocks, (unsigned char)0); 6812 __ br(__ GT, small); 6813 6814 __ ret(lr); 6815 6816 return start; 6817 } 6818 6819 void generate_base64_encode_simdround(Register src, Register dst, 6820 FloatRegister codec, u8 size) { 6821 6822 FloatRegister in0 = v4, in1 = v5, in2 = v6; 6823 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 6824 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 6825 6826 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 6827 6828 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 6829 6830 __ ushr(ind0, arrangement, in0, 2); 6831 6832 __ ushr(ind1, arrangement, in1, 2); 6833 __ shl(in0, arrangement, in0, 6); 6834 __ orr(ind1, arrangement, ind1, in0); 6835 __ ushr(ind1, arrangement, ind1, 2); 6836 6837 __ ushr(ind2, arrangement, in2, 4); 6838 __ shl(in1, arrangement, in1, 4); 6839 __ orr(ind2, arrangement, in1, ind2); 6840 __ ushr(ind2, arrangement, ind2, 2); 6841 6842 __ shl(ind3, arrangement, in2, 2); 6843 __ ushr(ind3, arrangement, ind3, 2); 6844 6845 __ tbl(out0, arrangement, codec, 4, ind0); 6846 __ tbl(out1, arrangement, codec, 4, ind1); 6847 __ tbl(out2, arrangement, codec, 4, ind2); 6848 __ tbl(out3, arrangement, codec, 4, ind3); 6849 6850 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 6851 } 6852 6853 /** 6854 * Arguments: 6855 * 6856 * Input: 6857 * c_rarg0 - src_start 6858 * c_rarg1 - src_offset 6859 * c_rarg2 - src_length 6860 * c_rarg3 - dest_start 6861 * c_rarg4 - dest_offset 6862 * c_rarg5 - isURL 6863 * 6864 */ 6865 address generate_base64_encodeBlock() { 6866 6867 static const char toBase64[64] = { 6868 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6869 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6870 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6871 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6872 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 6873 }; 6874 6875 static const char toBase64URL[64] = { 6876 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6877 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6878 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6879 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6880 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 6881 }; 6882 6883 __ align(CodeEntryAlignment); 6884 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 6885 StubCodeMark mark(this, stub_id); 6886 address start = __ pc(); 6887 6888 Register src = c_rarg0; // source array 6889 Register soff = c_rarg1; // source start offset 6890 Register send = c_rarg2; // source end offset 6891 Register dst = c_rarg3; // dest array 6892 Register doff = c_rarg4; // position for writing to dest array 6893 Register isURL = c_rarg5; // Base64 or URL character set 6894 6895 // c_rarg6 and c_rarg7 are free to use as temps 6896 Register codec = c_rarg6; 6897 Register length = c_rarg7; 6898 6899 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 6900 6901 __ add(src, src, soff); 6902 __ add(dst, dst, doff); 6903 __ sub(length, send, soff); 6904 6905 // load the codec base address 6906 __ lea(codec, ExternalAddress((address) toBase64)); 6907 __ cbz(isURL, ProcessData); 6908 __ lea(codec, ExternalAddress((address) toBase64URL)); 6909 6910 __ BIND(ProcessData); 6911 6912 // too short to formup a SIMD loop, roll back 6913 __ cmp(length, (u1)24); 6914 __ br(Assembler::LT, Process3B); 6915 6916 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 6917 6918 __ BIND(Process48B); 6919 __ cmp(length, (u1)48); 6920 __ br(Assembler::LT, Process24B); 6921 generate_base64_encode_simdround(src, dst, v0, 16); 6922 __ sub(length, length, 48); 6923 __ b(Process48B); 6924 6925 __ BIND(Process24B); 6926 __ cmp(length, (u1)24); 6927 __ br(Assembler::LT, SIMDExit); 6928 generate_base64_encode_simdround(src, dst, v0, 8); 6929 __ sub(length, length, 24); 6930 6931 __ BIND(SIMDExit); 6932 
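    // At this point fewer than 24 bytes remain: exit if nothing is left,
    // otherwise fall through to the scalar loop below, which consumes three
    // source bytes (24 bits) per iteration.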
__ cbz(length, Exit); 6933 6934 __ BIND(Process3B); 6935 // 3 src bytes, 24 bits 6936 __ ldrb(r10, __ post(src, 1)); 6937 __ ldrb(r11, __ post(src, 1)); 6938 __ ldrb(r12, __ post(src, 1)); 6939 __ orrw(r11, r11, r10, Assembler::LSL, 8); 6940 __ orrw(r12, r12, r11, Assembler::LSL, 8); 6941 // codec index 6942 __ ubfmw(r15, r12, 18, 23); 6943 __ ubfmw(r14, r12, 12, 17); 6944 __ ubfmw(r13, r12, 6, 11); 6945 __ andw(r12, r12, 63); 6946 // get the code based on the codec 6947 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 6948 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 6949 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 6950 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 6951 __ strb(r15, __ post(dst, 1)); 6952 __ strb(r14, __ post(dst, 1)); 6953 __ strb(r13, __ post(dst, 1)); 6954 __ strb(r12, __ post(dst, 1)); 6955 __ sub(length, length, 3); 6956 __ cbnz(length, Process3B); 6957 6958 __ BIND(Exit); 6959 __ ret(lr); 6960 6961 return start; 6962 } 6963 6964 void generate_base64_decode_simdround(Register src, Register dst, 6965 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 6966 6967 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 6968 FloatRegister out0 = v20, out1 = v21, out2 = v22; 6969 6970 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 6971 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 6972 6973 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 6974 6975 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 6976 6977 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 6978 6979 // we need unsigned saturating subtract, to make sure all input values 6980 // in range [0, 63] will have 0U value in the higher half lookup 6981 __ uqsubv(decH0, __ T16B, in0, v27); 6982 __ uqsubv(decH1, __ T16B, in1, v27); 6983 __ uqsubv(decH2, __ T16B, in2, v27); 6984 __ uqsubv(decH3, __ T16B, in3, v27); 6985 6986 // lower half lookup 6987 __ tbl(decL0, arrangement, codecL, 4, in0); 6988 __ tbl(decL1, arrangement, codecL, 4, in1); 6989 __ tbl(decL2, arrangement, codecL, 4, in2); 6990 __ tbl(decL3, arrangement, codecL, 4, in3); 6991 6992 // higher half lookup 6993 __ tbx(decH0, arrangement, codecH, 4, decH0); 6994 __ tbx(decH1, arrangement, codecH, 4, decH1); 6995 __ tbx(decH2, arrangement, codecH, 4, decH2); 6996 __ tbx(decH3, arrangement, codecH, 4, decH3); 6997 6998 // combine lower and higher 6999 __ orr(decL0, arrangement, decL0, decH0); 7000 __ orr(decL1, arrangement, decL1, decH1); 7001 __ orr(decL2, arrangement, decL2, decH2); 7002 __ orr(decL3, arrangement, decL3, decH3); 7003 7004 // check illegal inputs, value larger than 63 (maximum of 6 bits) 7005 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 7006 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 7007 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 7008 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 7009 __ orr(in0, arrangement, decH0, decH1); 7010 __ orr(in1, arrangement, decH2, decH3); 7011 __ orr(in2, arrangement, in0, in1); 7012 __ umaxv(in3, arrangement, in2); 7013 __ umov(rscratch2, in3, __ B, 0); 7014 7015 // get the data to output 7016 __ shl(out0, arrangement, decL0, 2); 7017 __ ushr(out1, arrangement, decL1, 4); 7018 __ orr(out0, arrangement, out0, out1); 7019 __ shl(out1, arrangement, decL1, 4); 7020 __ ushr(out2, arrangement, decL2, 2); 7021 __ orr(out1, arrangement, out1, out2); 7022 __ shl(out2, arrangement, decL2, 6); 7023 __ orr(out2, arrangement, out2, decL3); 7024 7025 __ 
cbz(rscratch2, NoIllegalData); 7026 7027 // handle illegal input 7028 __ umov(r10, in2, __ D, 0); 7029 if (size == 16) { 7030 __ cbnz(r10, ErrorInLowerHalf); 7031 7032 // illegal input is in higher half, store the lower half now. 7033 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 7034 7035 __ umov(r10, in2, __ D, 1); 7036 __ umov(r11, out0, __ D, 1); 7037 __ umov(r12, out1, __ D, 1); 7038 __ umov(r13, out2, __ D, 1); 7039 __ b(StoreLegalData); 7040 7041 __ BIND(ErrorInLowerHalf); 7042 } 7043 __ umov(r11, out0, __ D, 0); 7044 __ umov(r12, out1, __ D, 0); 7045 __ umov(r13, out2, __ D, 0); 7046 7047 __ BIND(StoreLegalData); 7048 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 7049 __ strb(r11, __ post(dst, 1)); 7050 __ strb(r12, __ post(dst, 1)); 7051 __ strb(r13, __ post(dst, 1)); 7052 __ lsr(r10, r10, 8); 7053 __ lsr(r11, r11, 8); 7054 __ lsr(r12, r12, 8); 7055 __ lsr(r13, r13, 8); 7056 __ b(StoreLegalData); 7057 7058 __ BIND(NoIllegalData); 7059 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 7060 } 7061 7062 7063 /** 7064 * Arguments: 7065 * 7066 * Input: 7067 * c_rarg0 - src_start 7068 * c_rarg1 - src_offset 7069 * c_rarg2 - src_length 7070 * c_rarg3 - dest_start 7071 * c_rarg4 - dest_offset 7072 * c_rarg5 - isURL 7073 * c_rarg6 - isMIME 7074 * 7075 */ 7076 address generate_base64_decodeBlock() { 7077 7078 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 7079 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 7080 // titled "Base64 decoding". 7081 7082 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 7083 // except the trailing character '=' is also treated illegal value in this intrinsic. That 7084 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
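    // For example, fromBase64ForNoSIMD['A' = 65] == 0, ['a' = 97] == 26,
    // ['0' = 48] == 52, ['+' = 43] == 62, ['/' = 47] == 63, and
    // ['=' = 61] == 255 (illegal), matching the table entries below.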
7085 static const uint8_t fromBase64ForNoSIMD[256] = { 7086 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7087 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7088 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 7089 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 7090 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 7091 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 7092 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 7093 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 7094 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7095 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7096 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7097 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7098 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7099 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7100 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7101 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7102 }; 7103 7104 static const uint8_t fromBase64URLForNoSIMD[256] = { 7105 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7106 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7107 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 7108 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 7109 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 7110 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 7111 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 7112 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 7113 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7114 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7115 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7116 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7117 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7118 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7119 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7120 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7121 }; 7122 7123 // A legal value of base64 code is in range [0, 127]. We need two lookups 7124 // with tbl/tbx and combine them to get the decode data. The 1st table vector 7125 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 7126 // table vector lookup use tbx, out of range indices are unchanged in 7127 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 7128 // The value of index 64 is set to 0, so that we know that we already get the 7129 // decoded data with the 1st lookup. 7130 static const uint8_t fromBase64ForSIMD[128] = { 7131 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7132 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7133 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 7134 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 7135 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 7136 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 7137 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 7138 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 7139 }; 7140 7141 static const uint8_t fromBase64URLForSIMD[128] = { 7142 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7143 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 7144 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 7145 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 7146 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 7147 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 7148 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 7149 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 7150 }; 7151 7152 __ align(CodeEntryAlignment); 7153 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 7154 StubCodeMark mark(this, stub_id); 7155 address start = __ pc(); 7156 7157 Register src = c_rarg0; // source array 7158 Register soff = c_rarg1; // source start offset 7159 Register send = c_rarg2; // source end offset 7160 Register dst = c_rarg3; // dest array 7161 Register doff = c_rarg4; // position for writing to dest array 7162 Register isURL = c_rarg5; // Base64 or URL character set 7163 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 7164 7165 Register length = send; // reuse send as length of source data to process 7166 7167 Register simd_codec = c_rarg6; 7168 Register nosimd_codec = c_rarg7; 7169 7170 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 7171 7172 __ enter(); 7173 7174 __ add(src, src, soff); 7175 __ add(dst, dst, doff); 7176 7177 __ mov(doff, dst); 7178 7179 __ sub(length, send, soff); 7180 __ bfm(length, zr, 0, 1); 7181 7182 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 7183 __ cbz(isURL, ProcessData); 7184 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 7185 7186 __ BIND(ProcessData); 7187 __ mov(rscratch1, length); 7188 __ cmp(length, (u1)144); // 144 = 80 + 64 7189 __ br(Assembler::LT, Process4B); 7190 7191 // In the MIME case, the line length cannot be more than 76 7192 // bytes (see RFC 2045). This is too short a block for SIMD 7193 // to be worthwhile, so we use non-SIMD here. 
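    // When length >= 144, rscratch1 is preset to 79 so that the 4-byte loop
    // below pre-processes only the first 80 bytes (20 iterations of
    // "subsw rscratch1, rscratch1, 4", ending at -1) before handing the rest
    // to the SIMD path; otherwise rscratch1 == length (a multiple of 4) and
    // the loop runs to exactly 0, taking the cbzw exit.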
7194 __ movw(rscratch1, 79); 7195 7196 __ BIND(Process4B); 7197 __ ldrw(r14, __ post(src, 4)); 7198 __ ubfxw(r10, r14, 0, 8); 7199 __ ubfxw(r11, r14, 8, 8); 7200 __ ubfxw(r12, r14, 16, 8); 7201 __ ubfxw(r13, r14, 24, 8); 7202 // get the de-code 7203 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 7204 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 7205 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 7206 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 7207 // error detection, 255u indicates an illegal input 7208 __ orrw(r14, r10, r11); 7209 __ orrw(r15, r12, r13); 7210 __ orrw(r14, r14, r15); 7211 __ tbnz(r14, 7, Exit); 7212 // recover the data 7213 __ lslw(r14, r10, 10); 7214 __ bfiw(r14, r11, 4, 6); 7215 __ bfmw(r14, r12, 2, 5); 7216 __ rev16w(r14, r14); 7217 __ bfiw(r13, r12, 6, 2); 7218 __ strh(r14, __ post(dst, 2)); 7219 __ strb(r13, __ post(dst, 1)); 7220 // non-simd loop 7221 __ subsw(rscratch1, rscratch1, 4); 7222 __ br(Assembler::GT, Process4B); 7223 7224 // if exiting from PreProcess80B, rscratch1 == -1; 7225 // otherwise, rscratch1 == 0. 7226 __ cbzw(rscratch1, Exit); 7227 __ sub(length, length, 80); 7228 7229 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 7230 __ cbz(isURL, SIMDEnter); 7231 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 7232 7233 __ BIND(SIMDEnter); 7234 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 7235 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 7236 __ mov(rscratch1, 63); 7237 __ dup(v27, __ T16B, rscratch1); 7238 7239 __ BIND(Process64B); 7240 __ cmp(length, (u1)64); 7241 __ br(Assembler::LT, Process32B); 7242 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 7243 __ sub(length, length, 64); 7244 __ b(Process64B); 7245 7246 __ BIND(Process32B); 7247 __ cmp(length, (u1)32); 7248 __ br(Assembler::LT, SIMDExit); 7249 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 7250 __ sub(length, length, 32); 7251 __ b(Process32B); 7252 7253 __ BIND(SIMDExit); 7254 __ cbz(length, Exit); 7255 __ movw(rscratch1, length); 7256 __ b(Process4B); 7257 7258 __ BIND(Exit); 7259 __ sub(c_rarg0, dst, doff); 7260 7261 __ leave(); 7262 __ ret(lr); 7263 7264 return start; 7265 } 7266 7267 // Support for spin waits. 7268 address generate_spin_wait() { 7269 __ align(CodeEntryAlignment); 7270 StubGenStubId stub_id = StubGenStubId::spin_wait_id; 7271 StubCodeMark mark(this, stub_id); 7272 address start = __ pc(); 7273 7274 __ spin_wait(); 7275 __ ret(lr); 7276 7277 return start; 7278 } 7279 7280 void generate_lookup_secondary_supers_table_stub() { 7281 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 7282 StubCodeMark mark(this, stub_id); 7283 7284 const Register 7285 r_super_klass = r0, 7286 r_array_base = r1, 7287 r_array_length = r2, 7288 r_array_index = r3, 7289 r_sub_klass = r4, 7290 r_bitmap = rscratch2, 7291 result = r5; 7292 const FloatRegister 7293 vtemp = v0; 7294 7295 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 7296 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 7297 Label L_success; 7298 __ enter(); 7299 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 7300 r_array_base, r_array_length, r_array_index, 7301 vtemp, result, slot, 7302 /*stub_is_near*/true); 7303 __ leave(); 7304 __ ret(lr); 7305 } 7306 } 7307 7308 // Slow path implementation for UseSecondarySupersTable. 
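  // Register assignment mirrors the per-slot table stubs above: r0 = super
  // klass, r1 = array base, r3 = array index and rscratch2 = bitmap come in
  // as arguments, r2 is a temp, and r5 holds the result.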
7309 address generate_lookup_secondary_supers_table_slow_path_stub() { 7310 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 7311 StubCodeMark mark(this, stub_id); 7312 7313 address start = __ pc(); 7314 const Register 7315 r_super_klass = r0, // argument 7316 r_array_base = r1, // argument 7317 temp1 = r2, // temp 7318 r_array_index = r3, // argument 7319 r_bitmap = rscratch2, // argument 7320 result = r5; // argument 7321 7322 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 7323 __ ret(lr); 7324 7325 return start; 7326 } 7327 7328 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 7329 7330 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 7331 // 7332 // If LSE is in use, generate LSE versions of all the stubs. The 7333 // non-LSE versions are in atomic_aarch64.S. 7334 7335 // class AtomicStubMark records the entry point of a stub and the 7336 // stub pointer which will point to it. The stub pointer is set to 7337 // the entry point when ~AtomicStubMark() is called, which must be 7338 // after ICache::invalidate_range. This ensures safe publication of 7339 // the generated code. 7340 class AtomicStubMark { 7341 address _entry_point; 7342 aarch64_atomic_stub_t *_stub; 7343 MacroAssembler *_masm; 7344 public: 7345 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 7346 _masm = masm; 7347 __ align(32); 7348 _entry_point = __ pc(); 7349 _stub = stub; 7350 } 7351 ~AtomicStubMark() { 7352 *_stub = (aarch64_atomic_stub_t)_entry_point; 7353 } 7354 }; 7355 7356 // NB: For memory_order_conservative we need a trailing membar after 7357 // LSE atomic operations but not a leading membar. 7358 // 7359 // We don't need a leading membar because a clause in the Arm ARM 7360 // says: 7361 // 7362 // Barrier-ordered-before 7363 // 7364 // Barrier instructions order prior Memory effects before subsequent 7365 // Memory effects generated by the same Observer. A read or a write 7366 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 7367 // Observer if and only if RW1 appears in program order before RW 2 7368 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 7369 // instruction with both Acquire and Release semantics. 7370 // 7371 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 7372 // and Release semantics, therefore we don't need a leading 7373 // barrier. However, there is no corresponding Barrier-ordered-after 7374 // relationship, therefore we need a trailing membar to prevent a 7375 // later store or load from being reordered with the store in an 7376 // atomic instruction. 7377 // 7378 // This was checked by using the herd7 consistency model simulator 7379 // (http://diy.inria.fr/) with this test case: 7380 // 7381 // AArch64 LseCas 7382 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 7383 // P0 | P1; 7384 // LDR W4, [X2] | MOV W3, #0; 7385 // DMB LD | MOV W4, #1; 7386 // LDR W3, [X1] | CASAL W3, W4, [X1]; 7387 // | DMB ISH; 7388 // | STR W4, [X2]; 7389 // exists 7390 // (0:X3=0 /\ 0:X4=1) 7391 // 7392 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 7393 // with the store to x in P1. Without the DMB in P1 this may happen. 7394 // 7395 // At the time of writing we don't know of any AArch64 hardware that 7396 // reorders stores in this way, but the Reference Manual permits it. 
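  // For illustration, gen_cas_entry(Assembler::word, memory_order_conservative)
  // below emits roughly the following sequence (a sketch, not a verbatim
  // listing of the generated code):
  //
  //   mov   x3, x1            // prev = compare_val
  //   casal w3, w2, [x0]      // LSE compare-and-swap, acquire + release
  //   dmb   ish               // trailing barrier for memory_order_conservative
  //   mov   w0, w3            // return the previous value
  //   ret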
7397 7398 void gen_cas_entry(Assembler::operand_size size, 7399 atomic_memory_order order) { 7400 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 7401 exchange_val = c_rarg2; 7402 bool acquire, release; 7403 switch (order) { 7404 case memory_order_relaxed: 7405 acquire = false; 7406 release = false; 7407 break; 7408 case memory_order_release: 7409 acquire = false; 7410 release = true; 7411 break; 7412 default: 7413 acquire = true; 7414 release = true; 7415 break; 7416 } 7417 __ mov(prev, compare_val); 7418 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 7419 if (order == memory_order_conservative) { 7420 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 7421 } 7422 if (size == Assembler::xword) { 7423 __ mov(r0, prev); 7424 } else { 7425 __ movw(r0, prev); 7426 } 7427 __ ret(lr); 7428 } 7429 7430 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 7431 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 7432 // If not relaxed, then default to conservative. Relaxed is the only 7433 // case we use enough to be worth specializing. 7434 if (order == memory_order_relaxed) { 7435 __ ldadd(size, incr, prev, addr); 7436 } else { 7437 __ ldaddal(size, incr, prev, addr); 7438 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 7439 } 7440 if (size == Assembler::xword) { 7441 __ mov(r0, prev); 7442 } else { 7443 __ movw(r0, prev); 7444 } 7445 __ ret(lr); 7446 } 7447 7448 void gen_swpal_entry(Assembler::operand_size size) { 7449 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 7450 __ swpal(size, incr, prev, addr); 7451 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 7452 if (size == Assembler::xword) { 7453 __ mov(r0, prev); 7454 } else { 7455 __ movw(r0, prev); 7456 } 7457 __ ret(lr); 7458 } 7459 7460 void generate_atomic_entry_points() { 7461 if (! 
UseLSE) { 7462 return; 7463 } 7464 __ align(CodeEntryAlignment); 7465 StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id; 7466 StubCodeMark mark(this, stub_id); 7467 address first_entry = __ pc(); 7468 7469 // ADD, memory_order_conservative 7470 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 7471 gen_ldadd_entry(Assembler::word, memory_order_conservative); 7472 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 7473 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 7474 7475 // ADD, memory_order_relaxed 7476 AtomicStubMark mark_fetch_add_4_relaxed 7477 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 7478 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 7479 AtomicStubMark mark_fetch_add_8_relaxed 7480 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 7481 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 7482 7483 // XCHG, memory_order_conservative 7484 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 7485 gen_swpal_entry(Assembler::word); 7486 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 7487 gen_swpal_entry(Assembler::xword); 7488 7489 // CAS, memory_order_conservative 7490 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 7491 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 7492 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 7493 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 7494 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 7495 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 7496 7497 // CAS, memory_order_relaxed 7498 AtomicStubMark mark_cmpxchg_1_relaxed 7499 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 7500 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 7501 AtomicStubMark mark_cmpxchg_4_relaxed 7502 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 7503 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 7504 AtomicStubMark mark_cmpxchg_8_relaxed 7505 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 7506 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 7507 7508 AtomicStubMark mark_cmpxchg_4_release 7509 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 7510 gen_cas_entry(MacroAssembler::word, memory_order_release); 7511 AtomicStubMark mark_cmpxchg_8_release 7512 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 7513 gen_cas_entry(MacroAssembler::xword, memory_order_release); 7514 7515 AtomicStubMark mark_cmpxchg_4_seq_cst 7516 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 7517 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 7518 AtomicStubMark mark_cmpxchg_8_seq_cst 7519 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 7520 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 7521 7522 ICache::invalidate_range(first_entry, __ pc() - first_entry); 7523 } 7524 #endif // LINUX 7525 7526 address generate_cont_thaw(Continuation::thaw_kind kind) { 7527 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 7528 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 7529 7530 address start = __ pc(); 7531 7532 if (return_barrier) { 7533 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 7534 __ mov(sp, rscratch1); 7535 } 7536 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7537 7538 if (return_barrier) { 7539 // preserve 
possible return value from a method returning to the return barrier 7540 __ fmovd(rscratch1, v0); 7541 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7542 } 7543 7544 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 7545 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 7546 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 7547 7548 if (return_barrier) { 7549 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7550 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7551 __ fmovd(v0, rscratch1); 7552 } 7553 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7554 7555 7556 Label thaw_success; 7557 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 7558 __ cbnz(rscratch2, thaw_success); 7559 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 7560 __ br(rscratch1); 7561 __ bind(thaw_success); 7562 7563 // make room for the thawed frames 7564 __ sub(rscratch1, sp, rscratch2); 7565 __ andr(rscratch1, rscratch1, -16); // align 7566 __ mov(sp, rscratch1); 7567 7568 if (return_barrier) { 7569 // save original return value -- again 7570 __ fmovd(rscratch1, v0); 7571 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7572 } 7573 7574 // If we want, we can templatize thaw by kind, and have three different entries 7575 __ movw(c_rarg1, (uint32_t)kind); 7576 7577 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 7578 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 7579 7580 if (return_barrier) { 7581 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7582 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7583 __ fmovd(v0, rscratch1); 7584 } else { 7585 __ mov(r0, zr); // return 0 (success) from doYield 7586 } 7587 7588 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 7589 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 7590 __ mov(rfp, sp); 7591 7592 if (return_barrier_exception) { 7593 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 7594 __ authenticate_return_address(c_rarg1); 7595 __ verify_oop(r0); 7596 // save return value containing the exception oop in callee-saved R19 7597 __ mov(r19, r0); 7598 7599 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 7600 7601 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
7602 // __ reinitialize_ptrue(); 7603 7604 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 7605 7606 __ mov(r1, r0); // the exception handler 7607 __ mov(r0, r19); // restore return value containing the exception oop 7608 __ verify_oop(r0); 7609 7610 __ leave(); 7611 __ mov(r3, lr); 7612 __ br(r1); // the exception handler 7613 } else { 7614 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 7615 __ leave(); 7616 __ ret(lr); 7617 } 7618 7619 return start; 7620 } 7621 7622 address generate_cont_thaw() { 7623 if (!Continuations::enabled()) return nullptr; 7624 7625 StubGenStubId stub_id = StubGenStubId::cont_thaw_id; 7626 StubCodeMark mark(this, stub_id); 7627 address start = __ pc(); 7628 generate_cont_thaw(Continuation::thaw_top); 7629 return start; 7630 } 7631 7632 address generate_cont_returnBarrier() { 7633 if (!Continuations::enabled()) return nullptr; 7634 7635 // TODO: will probably need multiple return barriers depending on return type 7636 StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id; 7637 StubCodeMark mark(this, stub_id); 7638 address start = __ pc(); 7639 7640 generate_cont_thaw(Continuation::thaw_return_barrier); 7641 7642 return start; 7643 } 7644 7645 address generate_cont_returnBarrier_exception() { 7646 if (!Continuations::enabled()) return nullptr; 7647 7648 StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id; 7649 StubCodeMark mark(this, stub_id); 7650 address start = __ pc(); 7651 7652 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 7653 7654 return start; 7655 } 7656 7657 address generate_cont_preempt_stub() { 7658 if (!Continuations::enabled()) return nullptr; 7659 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 7660 StubCodeMark mark(this, stub_id); 7661 address start = __ pc(); 7662 7663 __ reset_last_Java_frame(true); 7664 7665 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 7666 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 7667 __ mov(sp, rscratch2); 7668 7669 Label preemption_cancelled; 7670 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 7671 __ cbnz(rscratch1, preemption_cancelled); 7672 7673 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 7674 SharedRuntime::continuation_enter_cleanup(_masm); 7675 __ leave(); 7676 __ ret(lr); 7677 7678 // We acquired the monitor after freezing the frames so call thaw to continue execution. 7679 __ bind(preemption_cancelled); 7680 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 7681 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 7682 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 7683 __ ldr(rscratch1, Address(rscratch1)); 7684 __ br(rscratch1); 7685 7686 return start; 7687 } 7688 7689 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 7690 // are represented as long[5], with BITS_PER_LIMB = 26. 7691 // Pack five 26-bit limbs into three 64-bit registers. 
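  // Resulting layout (limb k denotes src[k], 26 bits each):
  //   dest0 = limb0 | limb1 << 26 | limb2 << 52      (low 12 bits of limb2)
  //   dest1 = limb2 >> 12 | limb3 << 14 | limb4 << 40 (low 24 bits of limb4)
  //   dest2 = limb4 >> 24                             (the remaining 2 bits)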
7692 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 7693 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 7694 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 7695 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 7696 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 7697 7698 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 7699 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 7700 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 7701 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 7702 7703 if (dest2->is_valid()) { 7704 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 7705 } else { 7706 #ifdef ASSERT 7707 Label OK; 7708 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 7709 __ br(__ EQ, OK); 7710 __ stop("high bits of Poly1305 integer should be zero"); 7711 __ should_not_reach_here(); 7712 __ bind(OK); 7713 #endif 7714 } 7715 } 7716 7717 // As above, but return only a 128-bit integer, packed into two 7718 // 64-bit registers. 7719 void pack_26(Register dest0, Register dest1, Register src) { 7720 pack_26(dest0, dest1, noreg, src); 7721 } 7722 7723 // Multiply and multiply-accumulate unsigned 64-bit registers. 7724 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 7725 __ mul(prod_lo, n, m); 7726 __ umulh(prod_hi, n, m); 7727 } 7728 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 7729 wide_mul(rscratch1, rscratch2, n, m); 7730 __ adds(sum_lo, sum_lo, rscratch1); 7731 __ adc(sum_hi, sum_hi, rscratch2); 7732 } 7733 7734 // Poly1305, RFC 7539 7735 7736 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 7737 // description of the tricks used to simplify and accelerate this 7738 // computation. 7739 7740 address generate_poly1305_processBlocks() { 7741 __ align(CodeEntryAlignment); 7742 StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id; 7743 StubCodeMark mark(this, stub_id); 7744 address start = __ pc(); 7745 Label here; 7746 __ enter(); 7747 RegSet callee_saved = RegSet::range(r19, r28); 7748 __ push(callee_saved, sp); 7749 7750 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 7751 7752 // Arguments 7753 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 7754 7755 // R_n is the 128-bit randomly-generated key, packed into two 7756 // registers. The caller passes this key to us as long[5], with 7757 // BITS_PER_LIMB = 26. 
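    // After pack_26, R_0 holds bits 0..63 and R_1 bits 64..127 of the key;
    // the two-register pack_26 variant checks (in debug builds) that the top
    // limb bits 128..129 are zero, which should hold for a Poly1305 key r
    // clamped as required by RFC 7539 (see the note on key properties in the
    // loop below).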
7758 const Register R_0 = *++regs, R_1 = *++regs; 7759 pack_26(R_0, R_1, r_start); 7760 7761 // RR_n is (R_n >> 2) * 5 7762 const Register RR_0 = *++regs, RR_1 = *++regs; 7763 __ lsr(RR_0, R_0, 2); 7764 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 7765 __ lsr(RR_1, R_1, 2); 7766 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 7767 7768 // U_n is the current checksum 7769 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 7770 pack_26(U_0, U_1, U_2, acc_start); 7771 7772 static constexpr int BLOCK_LENGTH = 16; 7773 Label DONE, LOOP; 7774 7775 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7776 __ br(Assembler::LT, DONE); { 7777 __ bind(LOOP); 7778 7779 // S_n is to be the sum of U_n and the next block of data 7780 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 7781 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 7782 __ adds(S_0, U_0, S_0); 7783 __ adcs(S_1, U_1, S_1); 7784 __ adc(S_2, U_2, zr); 7785 __ add(S_2, S_2, 1); 7786 7787 const Register U_0HI = *++regs, U_1HI = *++regs; 7788 7789 // NB: this logic depends on some of the special properties of 7790 // Poly1305 keys. In particular, because we know that the top 7791 // four bits of R_0 and R_1 are zero, we can add together 7792 // partial products without any risk of needing to propagate a 7793 // carry out. 7794 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 7795 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 7796 __ andr(U_2, R_0, 3); 7797 __ mul(U_2, S_2, U_2); 7798 7799 // Recycle registers S_0, S_1, S_2 7800 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 7801 7802 // Partial reduction mod 2**130 - 5 7803 __ adds(U_1, U_0HI, U_1); 7804 __ adc(U_2, U_1HI, U_2); 7805 // Sum now in U_2:U_1:U_0. 7806 // Dead: U_0HI, U_1HI. 7807 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 7808 7809 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 7810 7811 // First, U_2:U_1:U_0 += (U_2 >> 2) 7812 __ lsr(rscratch1, U_2, 2); 7813 __ andr(U_2, U_2, (u8)3); 7814 __ adds(U_0, U_0, rscratch1); 7815 __ adcs(U_1, U_1, zr); 7816 __ adc(U_2, U_2, zr); 7817 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 7818 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 7819 __ adcs(U_1, U_1, zr); 7820 __ adc(U_2, U_2, zr); 7821 7822 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 7823 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7824 __ br(~ Assembler::LT, LOOP); 7825 } 7826 7827 // Further reduce modulo 2^130 - 5 7828 __ lsr(rscratch1, U_2, 2); 7829 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 7830 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 7831 __ adcs(U_1, U_1, zr); 7832 __ andr(U_2, U_2, (u1)3); 7833 __ adc(U_2, U_2, zr); 7834 7835 // Unpack the sum into five 26-bit limbs and write to memory. 
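    // Target layout (the inverse of pack_26):
    //   acc[0] = U_0[25:0]                  acc[1] = U_0[51:26]
    //   acc[2] = U_1[13:0] : U_0[63:52]     acc[3] = U_1[39:14]
    //   acc[4] = U_2[2:0]  : U_1[63:40]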
7836 __ ubfiz(rscratch1, U_0, 0, 26); 7837 __ ubfx(rscratch2, U_0, 26, 26); 7838 __ stp(rscratch1, rscratch2, Address(acc_start)); 7839 __ ubfx(rscratch1, U_0, 52, 12); 7840 __ bfi(rscratch1, U_1, 12, 14); 7841 __ ubfx(rscratch2, U_1, 14, 26); 7842 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 7843 __ ubfx(rscratch1, U_1, 40, 24); 7844 __ bfi(rscratch1, U_2, 24, 3); 7845 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 7846 7847 __ bind(DONE); 7848 __ pop(callee_saved, sp); 7849 __ leave(); 7850 __ ret(lr); 7851 7852 return start; 7853 } 7854 7855 // exception handler for upcall stubs 7856 address generate_upcall_stub_exception_handler() { 7857 StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id; 7858 StubCodeMark mark(this, stub_id); 7859 address start = __ pc(); 7860 7861 // Native caller has no idea how to handle exceptions, 7862 // so we just crash here. Up to callee to catch exceptions. 7863 __ verify_oop(r0); 7864 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 7865 __ blr(rscratch1); 7866 __ should_not_reach_here(); 7867 7868 return start; 7869 } 7870 7871 // load Method* target of MethodHandle 7872 // j_rarg0 = jobject receiver 7873 // rmethod = result 7874 address generate_upcall_stub_load_target() { 7875 StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id; 7876 StubCodeMark mark(this, stub_id); 7877 address start = __ pc(); 7878 7879 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 7880 // Load target method from receiver 7881 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 7882 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 7883 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 7884 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 7885 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 7886 noreg, noreg); 7887 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 7888 7889 __ ret(lr); 7890 7891 return start; 7892 } 7893 7894 #undef __ 7895 #define __ masm-> 7896 7897 class MontgomeryMultiplyGenerator : public MacroAssembler { 7898 7899 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7900 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7901 7902 RegSet _toSave; 7903 bool _squaring; 7904 7905 public: 7906 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7907 : MacroAssembler(as->code()), _squaring(squaring) { 7908 7909 // Register allocation 7910 7911 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7912 Pa_base = *regs; // Argument registers 7913 if (squaring) 7914 Pb_base = Pa_base; 7915 else 7916 Pb_base = *++regs; 7917 Pn_base = *++regs; 7918 Rlen= *++regs; 7919 inv = *++regs; 7920 Pm_base = *++regs; 7921 7922 // Working registers: 7923 Ra = *++regs; // The current digit of a, b, n, and m. 7924 Rb = *++regs; 7925 Rm = *++regs; 7926 Rn = *++regs; 7927 7928 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 7929 Pb = *++regs; 7930 Pm = *++regs; 7931 Pn = *++regs; 7932 7933 t0 = *++regs; // Three registers which form a 7934 t1 = *++regs; // triple-precision accumuator. 7935 t2 = *++regs; 7936 7937 Ri = *++regs; // Inner and outer loop indexes. 
7938 Rj = *++regs; 7939 7940 Rhi_ab = *++regs; // Product registers: low and high parts 7941 Rlo_ab = *++regs; // of a*b and m*n. 7942 Rhi_mn = *++regs; 7943 Rlo_mn = *++regs; 7944 7945 // r19 and up are callee-saved. 7946 _toSave = RegSet::range(r19, *regs) + Pm_base; 7947 } 7948 7949 private: 7950 void save_regs() { 7951 push(_toSave, sp); 7952 } 7953 7954 void restore_regs() { 7955 pop(_toSave, sp); 7956 } 7957 7958 template <typename T> 7959 void unroll_2(Register count, T block) { 7960 Label loop, end, odd; 7961 tbnz(count, 0, odd); 7962 cbz(count, end); 7963 align(16); 7964 bind(loop); 7965 (this->*block)(); 7966 bind(odd); 7967 (this->*block)(); 7968 subs(count, count, 2); 7969 br(Assembler::GT, loop); 7970 bind(end); 7971 } 7972 7973 template <typename T> 7974 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 7975 Label loop, end, odd; 7976 tbnz(count, 0, odd); 7977 cbz(count, end); 7978 align(16); 7979 bind(loop); 7980 (this->*block)(d, s, tmp); 7981 bind(odd); 7982 (this->*block)(d, s, tmp); 7983 subs(count, count, 2); 7984 br(Assembler::GT, loop); 7985 bind(end); 7986 } 7987 7988 void pre1(RegisterOrConstant i) { 7989 block_comment("pre1"); 7990 // Pa = Pa_base; 7991 // Pb = Pb_base + i; 7992 // Pm = Pm_base; 7993 // Pn = Pn_base + i; 7994 // Ra = *Pa; 7995 // Rb = *Pb; 7996 // Rm = *Pm; 7997 // Rn = *Pn; 7998 ldr(Ra, Address(Pa_base)); 7999 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 8000 ldr(Rm, Address(Pm_base)); 8001 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 8002 lea(Pa, Address(Pa_base)); 8003 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 8004 lea(Pm, Address(Pm_base)); 8005 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 8006 8007 // Zero the m*n result. 8008 mov(Rhi_mn, zr); 8009 mov(Rlo_mn, zr); 8010 } 8011 8012 // The core multiply-accumulate step of a Montgomery 8013 // multiplication. The idea is to schedule operations as a 8014 // pipeline so that instructions with long latencies (loads and 8015 // multiplies) have time to complete before their results are 8016 // used. This most benefits in-order implementations of the 8017 // architecture but out-of-order ones also benefit. 8018 void step() { 8019 block_comment("step"); 8020 // MACC(Ra, Rb, t0, t1, t2); 8021 // Ra = *++Pa; 8022 // Rb = *--Pb; 8023 umulh(Rhi_ab, Ra, Rb); 8024 mul(Rlo_ab, Ra, Rb); 8025 ldr(Ra, pre(Pa, wordSize)); 8026 ldr(Rb, pre(Pb, -wordSize)); 8027 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 8028 // previous iteration. 
8029 // MACC(Rm, Rn, t0, t1, t2); 8030 // Rm = *++Pm; 8031 // Rn = *--Pn; 8032 umulh(Rhi_mn, Rm, Rn); 8033 mul(Rlo_mn, Rm, Rn); 8034 ldr(Rm, pre(Pm, wordSize)); 8035 ldr(Rn, pre(Pn, -wordSize)); 8036 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 8037 } 8038 8039 void post1() { 8040 block_comment("post1"); 8041 8042 // MACC(Ra, Rb, t0, t1, t2); 8043 // Ra = *++Pa; 8044 // Rb = *--Pb; 8045 umulh(Rhi_ab, Ra, Rb); 8046 mul(Rlo_ab, Ra, Rb); 8047 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 8048 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 8049 8050 // *Pm = Rm = t0 * inv; 8051 mul(Rm, t0, inv); 8052 str(Rm, Address(Pm)); 8053 8054 // MACC(Rm, Rn, t0, t1, t2); 8055 // t0 = t1; t1 = t2; t2 = 0; 8056 umulh(Rhi_mn, Rm, Rn); 8057 8058 #ifndef PRODUCT 8059 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 8060 { 8061 mul(Rlo_mn, Rm, Rn); 8062 add(Rlo_mn, t0, Rlo_mn); 8063 Label ok; 8064 cbz(Rlo_mn, ok); { 8065 stop("broken Montgomery multiply"); 8066 } bind(ok); 8067 } 8068 #endif 8069 // We have very carefully set things up so that 8070 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 8071 // the lower half of Rm * Rn because we know the result already: 8072 // it must be -t0. t0 + (-t0) must generate a carry iff 8073 // t0 != 0. So, rather than do a mul and an adds we just set 8074 // the carry flag iff t0 is nonzero. 8075 // 8076 // mul(Rlo_mn, Rm, Rn); 8077 // adds(zr, t0, Rlo_mn); 8078 subs(zr, t0, 1); // Set carry iff t0 is nonzero 8079 adcs(t0, t1, Rhi_mn); 8080 adc(t1, t2, zr); 8081 mov(t2, zr); 8082 } 8083 8084 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 8085 block_comment("pre2"); 8086 // Pa = Pa_base + i-len; 8087 // Pb = Pb_base + len; 8088 // Pm = Pm_base + i-len; 8089 // Pn = Pn_base + len; 8090 8091 if (i.is_register()) { 8092 sub(Rj, i.as_register(), len); 8093 } else { 8094 mov(Rj, i.as_constant()); 8095 sub(Rj, Rj, len); 8096 } 8097 // Rj == i-len 8098 8099 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 8100 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 8101 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 8102 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 8103 8104 // Ra = *++Pa; 8105 // Rb = *--Pb; 8106 // Rm = *++Pm; 8107 // Rn = *--Pn; 8108 ldr(Ra, pre(Pa, wordSize)); 8109 ldr(Rb, pre(Pb, -wordSize)); 8110 ldr(Rm, pre(Pm, wordSize)); 8111 ldr(Rn, pre(Pn, -wordSize)); 8112 8113 mov(Rhi_mn, zr); 8114 mov(Rlo_mn, zr); 8115 } 8116 8117 void post2(RegisterOrConstant i, RegisterOrConstant len) { 8118 block_comment("post2"); 8119 if (i.is_constant()) { 8120 mov(Rj, i.as_constant()-len.as_constant()); 8121 } else { 8122 sub(Rj, i.as_register(), len); 8123 } 8124 8125 adds(t0, t0, Rlo_mn); // The pending m*n, low part 8126 8127 // As soon as we know the least significant digit of our result, 8128 // store it. 8129 // Pm_base[i-len] = t0; 8130 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 8131 8132 // t0 = t1; t1 = t2; t2 = 0; 8133 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 8134 adc(t1, t2, zr); 8135 mov(t2, zr); 8136 } 8137 8138 // A carry in t0 after Montgomery multiplication means that we 8139 // should subtract multiples of n from our result in m. We'll 8140 // keep doing that until there is no carry. 
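  // Note that AArch64 expresses a borrow as an inverted carry: SBCS computes
  // Rm - Rn - (1 - C), so the initial SUBS zr, zr, zr sets C = 1 ("no
  // borrow") and the final SBC t0, t0, zr subtracts the borrow that
  // propagated out of the most significant word from t0.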
8141 void normalize(RegisterOrConstant len) { 8142 block_comment("normalize"); 8143 // while (t0) 8144 // t0 = sub(Pm_base, Pn_base, t0, len); 8145 Label loop, post, again; 8146 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 8147 cbz(t0, post); { 8148 bind(again); { 8149 mov(i, zr); 8150 mov(cnt, len); 8151 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 8152 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 8153 subs(zr, zr, zr); // set carry flag, i.e. no borrow 8154 align(16); 8155 bind(loop); { 8156 sbcs(Rm, Rm, Rn); 8157 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 8158 add(i, i, 1); 8159 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 8160 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 8161 sub(cnt, cnt, 1); 8162 } cbnz(cnt, loop); 8163 sbc(t0, t0, zr); 8164 } cbnz(t0, again); 8165 } bind(post); 8166 } 8167 8168 // Move memory at s to d, reversing words. 8169 // Increments d to end of copied memory 8170 // Destroys tmp1, tmp2 8171 // Preserves len 8172 // Leaves s pointing to the address which was in d at start 8173 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 8174 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 8175 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 8176 8177 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 8178 mov(tmp1, len); 8179 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 8180 sub(s, d, len, ext::uxtw, LogBytesPerWord); 8181 } 8182 // where 8183 void reverse1(Register d, Register s, Register tmp) { 8184 ldr(tmp, pre(s, -wordSize)); 8185 ror(tmp, tmp, 32); 8186 str(tmp, post(d, wordSize)); 8187 } 8188 8189 void step_squaring() { 8190 // An extra ACC 8191 step(); 8192 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 8193 } 8194 8195 void last_squaring(RegisterOrConstant i) { 8196 Label dont; 8197 // if ((i & 1) == 0) { 8198 tbnz(i.as_register(), 0, dont); { 8199 // MACC(Ra, Rb, t0, t1, t2); 8200 // Ra = *++Pa; 8201 // Rb = *--Pb; 8202 umulh(Rhi_ab, Ra, Rb); 8203 mul(Rlo_ab, Ra, Rb); 8204 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 8205 } bind(dont); 8206 } 8207 8208 void extra_step_squaring() { 8209 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 8210 8211 // MACC(Rm, Rn, t0, t1, t2); 8212 // Rm = *++Pm; 8213 // Rn = *--Pn; 8214 umulh(Rhi_mn, Rm, Rn); 8215 mul(Rlo_mn, Rm, Rn); 8216 ldr(Rm, pre(Pm, wordSize)); 8217 ldr(Rn, pre(Pn, -wordSize)); 8218 } 8219 8220 void post1_squaring() { 8221 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 8222 8223 // *Pm = Rm = t0 * inv; 8224 mul(Rm, t0, inv); 8225 str(Rm, Address(Pm)); 8226 8227 // MACC(Rm, Rn, t0, t1, t2); 8228 // t0 = t1; t1 = t2; t2 = 0; 8229 umulh(Rhi_mn, Rm, Rn); 8230 8231 #ifndef PRODUCT 8232 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 8233 { 8234 mul(Rlo_mn, Rm, Rn); 8235 add(Rlo_mn, t0, Rlo_mn); 8236 Label ok; 8237 cbz(Rlo_mn, ok); { 8238 stop("broken Montgomery multiply"); 8239 } bind(ok); 8240 } 8241 #endif 8242 // We have very carefully set things up so that 8243 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 8244 // the lower half of Rm * Rn because we know the result already: 8245 // it must be -t0. t0 + (-t0) must generate a carry iff 8246 // t0 != 0. So, rather than do a mul and an adds we just set 8247 // the carry flag iff t0 is nonzero. 
8248 // 8249 // mul(Rlo_mn, Rm, Rn); 8250 // adds(zr, t0, Rlo_mn); 8251 subs(zr, t0, 1); // Set carry iff t0 is nonzero 8252 adcs(t0, t1, Rhi_mn); 8253 adc(t1, t2, zr); 8254 mov(t2, zr); 8255 } 8256 8257 void acc(Register Rhi, Register Rlo, 8258 Register t0, Register t1, Register t2) { 8259 adds(t0, t0, Rlo); 8260 adcs(t1, t1, Rhi); 8261 adc(t2, t2, zr); 8262 } 8263 8264 public: 8265 /** 8266 * Fast Montgomery multiplication. The derivation of the 8267 * algorithm is in A Cryptographic Library for the Motorola 8268 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 8269 * 8270 * Arguments: 8271 * 8272 * Inputs for multiplication: 8273 * c_rarg0 - int array elements a 8274 * c_rarg1 - int array elements b 8275 * c_rarg2 - int array elements n (the modulus) 8276 * c_rarg3 - int length 8277 * c_rarg4 - int inv 8278 * c_rarg5 - int array elements m (the result) 8279 * 8280 * Inputs for squaring: 8281 * c_rarg0 - int array elements a 8282 * c_rarg1 - int array elements n (the modulus) 8283 * c_rarg2 - int length 8284 * c_rarg3 - int inv 8285 * c_rarg4 - int array elements m (the result) 8286 * 8287 */ 8288 address generate_multiply() { 8289 Label argh, nothing; 8290 bind(argh); 8291 stop("MontgomeryMultiply total_allocation must be <= 8192"); 8292 8293 align(CodeEntryAlignment); 8294 address entry = pc(); 8295 8296 cbzw(Rlen, nothing); 8297 8298 enter(); 8299 8300 // Make room. 8301 cmpw(Rlen, 512); 8302 br(Assembler::HI, argh); 8303 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 8304 andr(sp, Ra, -2 * wordSize); 8305 8306 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 8307 8308 { 8309 // Copy input args, reversing as we go. We use Ra as a 8310 // temporary variable. 8311 reverse(Ra, Pa_base, Rlen, t0, t1); 8312 if (!_squaring) 8313 reverse(Ra, Pb_base, Rlen, t0, t1); 8314 reverse(Ra, Pn_base, Rlen, t0, t1); 8315 } 8316 8317 // Push all call-saved registers and also Pm_base which we'll need 8318 // at the end. 
8319 save_regs(); 8320 8321 #ifndef PRODUCT 8322 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 8323 { 8324 ldr(Rn, Address(Pn_base, 0)); 8325 mul(Rlo_mn, Rn, inv); 8326 subs(zr, Rlo_mn, -1); 8327 Label ok; 8328 br(EQ, ok); { 8329 stop("broken inverse in Montgomery multiply"); 8330 } bind(ok); 8331 } 8332 #endif 8333 8334 mov(Pm_base, Ra); 8335 8336 mov(t0, zr); 8337 mov(t1, zr); 8338 mov(t2, zr); 8339 8340 block_comment("for (int i = 0; i < len; i++) {"); 8341 mov(Ri, zr); { 8342 Label loop, end; 8343 cmpw(Ri, Rlen); 8344 br(Assembler::GE, end); 8345 8346 bind(loop); 8347 pre1(Ri); 8348 8349 block_comment(" for (j = i; j; j--) {"); { 8350 movw(Rj, Ri); 8351 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 8352 } block_comment(" } // j"); 8353 8354 post1(); 8355 addw(Ri, Ri, 1); 8356 cmpw(Ri, Rlen); 8357 br(Assembler::LT, loop); 8358 bind(end); 8359 block_comment("} // i"); 8360 } 8361 8362 block_comment("for (int i = len; i < 2*len; i++) {"); 8363 mov(Ri, Rlen); { 8364 Label loop, end; 8365 cmpw(Ri, Rlen, Assembler::LSL, 1); 8366 br(Assembler::GE, end); 8367 8368 bind(loop); 8369 pre2(Ri, Rlen); 8370 8371 block_comment(" for (j = len*2-i-1; j; j--) {"); { 8372 lslw(Rj, Rlen, 1); 8373 subw(Rj, Rj, Ri); 8374 subw(Rj, Rj, 1); 8375 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 8376 } block_comment(" } // j"); 8377 8378 post2(Ri, Rlen); 8379 addw(Ri, Ri, 1); 8380 cmpw(Ri, Rlen, Assembler::LSL, 1); 8381 br(Assembler::LT, loop); 8382 bind(end); 8383 } 8384 block_comment("} // i"); 8385 8386 normalize(Rlen); 8387 8388 mov(Ra, Pm_base); // Save Pm_base in Ra 8389 restore_regs(); // Restore caller's Pm_base 8390 8391 // Copy our result into caller's Pm_base 8392 reverse(Pm_base, Ra, Rlen, t0, t1); 8393 8394 leave(); 8395 bind(nothing); 8396 ret(lr); 8397 8398 return entry; 8399 } 8400 // In C, approximately: 8401 8402 // void 8403 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 8404 // julong Pn_base[], julong Pm_base[], 8405 // julong inv, int len) { 8406 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 8407 // julong *Pa, *Pb, *Pn, *Pm; 8408 // julong Ra, Rb, Rn, Rm; 8409 8410 // int i; 8411 8412 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 8413 8414 // for (i = 0; i < len; i++) { 8415 // int j; 8416 8417 // Pa = Pa_base; 8418 // Pb = Pb_base + i; 8419 // Pm = Pm_base; 8420 // Pn = Pn_base + i; 8421 8422 // Ra = *Pa; 8423 // Rb = *Pb; 8424 // Rm = *Pm; 8425 // Rn = *Pn; 8426 8427 // int iters = i; 8428 // for (j = 0; iters--; j++) { 8429 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 8430 // MACC(Ra, Rb, t0, t1, t2); 8431 // Ra = *++Pa; 8432 // Rb = *--Pb; 8433 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8434 // MACC(Rm, Rn, t0, t1, t2); 8435 // Rm = *++Pm; 8436 // Rn = *--Pn; 8437 // } 8438 8439 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 8440 // MACC(Ra, Rb, t0, t1, t2); 8441 // *Pm = Rm = t0 * inv; 8442 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8443 // MACC(Rm, Rn, t0, t1, t2); 8444 8445 // assert(t0 == 0, "broken Montgomery multiply"); 8446 8447 // t0 = t1; t1 = t2; t2 = 0; 8448 // } 8449 8450 // for (i = len; i < 2*len; i++) { 8451 // int j; 8452 8453 // Pa = Pa_base + i-len; 8454 // Pb = Pb_base + len; 8455 // Pm = Pm_base + i-len; 8456 // Pn = Pn_base + len; 8457 8458 // Ra = *++Pa; 8459 // Rb = *--Pb; 8460 // Rm = *++Pm; 8461 // Rn = *--Pn; 8462 8463 // int iters = len*2-i-1; 8464 // for (j = i-len+1; iters--; j++) { 8465 // 
assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 8466 // MACC(Ra, Rb, t0, t1, t2); 8467 // Ra = *++Pa; 8468 // Rb = *--Pb; 8469 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8470 // MACC(Rm, Rn, t0, t1, t2); 8471 // Rm = *++Pm; 8472 // Rn = *--Pn; 8473 // } 8474 8475 // Pm_base[i-len] = t0; 8476 // t0 = t1; t1 = t2; t2 = 0; 8477 // } 8478 8479 // while (t0) 8480 // t0 = sub(Pm_base, Pn_base, t0, len); 8481 // } 8482 8483 /** 8484 * Fast Montgomery squaring. This uses asymptotically 25% fewer 8485 * multiplies than Montgomery multiplication so it should be up to 8486 * 25% faster. However, its loop control is more complex and it 8487 * may actually run slower on some machines. 8488 * 8489 * Arguments: 8490 * 8491 * Inputs: 8492 * c_rarg0 - int array elements a 8493 * c_rarg1 - int array elements n (the modulus) 8494 * c_rarg2 - int length 8495 * c_rarg3 - int inv 8496 * c_rarg4 - int array elements m (the result) 8497 * 8498 */ 8499 address generate_square() { 8500 Label argh; 8501 bind(argh); 8502 stop("MontgomeryMultiply total_allocation must be <= 8192"); 8503 8504 align(CodeEntryAlignment); 8505 address entry = pc(); 8506 8507 enter(); 8508 8509 // Make room. 8510 cmpw(Rlen, 512); 8511 br(Assembler::HI, argh); 8512 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 8513 andr(sp, Ra, -2 * wordSize); 8514 8515 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 8516 8517 { 8518 // Copy input args, reversing as we go. We use Ra as a 8519 // temporary variable. 8520 reverse(Ra, Pa_base, Rlen, t0, t1); 8521 reverse(Ra, Pn_base, Rlen, t0, t1); 8522 } 8523 8524 // Push all call-saved registers and also Pm_base which we'll need 8525 // at the end. 8526 save_regs(); 8527 8528 mov(Pm_base, Ra); 8529 8530 mov(t0, zr); 8531 mov(t1, zr); 8532 mov(t2, zr); 8533 8534 block_comment("for (int i = 0; i < len; i++) {"); 8535 mov(Ri, zr); { 8536 Label loop, end; 8537 bind(loop); 8538 cmp(Ri, Rlen); 8539 br(Assembler::GE, end); 8540 8541 pre1(Ri); 8542 8543 block_comment("for (j = (i+1)/2; j; j--) {"); { 8544 add(Rj, Ri, 1); 8545 lsr(Rj, Rj, 1); 8546 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8547 } block_comment(" } // j"); 8548 8549 last_squaring(Ri); 8550 8551 block_comment(" for (j = i/2; j; j--) {"); { 8552 lsr(Rj, Ri, 1); 8553 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8554 } block_comment(" } // j"); 8555 8556 post1_squaring(); 8557 add(Ri, Ri, 1); 8558 cmp(Ri, Rlen); 8559 br(Assembler::LT, loop); 8560 8561 bind(end); 8562 block_comment("} // i"); 8563 } 8564 8565 block_comment("for (int i = len; i < 2*len; i++) {"); 8566 mov(Ri, Rlen); { 8567 Label loop, end; 8568 bind(loop); 8569 cmp(Ri, Rlen, Assembler::LSL, 1); 8570 br(Assembler::GE, end); 8571 8572 pre2(Ri, Rlen); 8573 8574 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 8575 lsl(Rj, Rlen, 1); 8576 sub(Rj, Rj, Ri); 8577 sub(Rj, Rj, 1); 8578 lsr(Rj, Rj, 1); 8579 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8580 } block_comment(" } // j"); 8581 8582 last_squaring(Ri); 8583 8584 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 8585 lsl(Rj, Rlen, 1); 8586 sub(Rj, Rj, Ri); 8587 lsr(Rj, Rj, 1); 8588 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8589 } block_comment(" } // j"); 8590 8591 post2(Ri, Rlen); 8592 add(Ri, Ri, 1); 8593 cmp(Ri, Rlen, Assembler::LSL, 1); 8594 8595 br(Assembler::LT, loop); 8596 bind(end); 8597 block_comment("} // i"); 8598 } 8599 8600 normalize(Rlen); 8601 8602 mov(Ra, Pm_base); // Save Pm_base in Ra 8603 
restore_regs(); // Restore caller's Pm_base 8604 8605 // Copy our result into caller's Pm_base 8606 reverse(Pm_base, Ra, Rlen, t0, t1); 8607 8608 leave(); 8609 ret(lr); 8610 8611 return entry; 8612 } 8613 // In C, approximately: 8614 8615 // void 8616 // montgomery_square(julong Pa_base[], julong Pn_base[], 8617 // julong Pm_base[], julong inv, int len) { 8618 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 8619 // julong *Pa, *Pb, *Pn, *Pm; 8620 // julong Ra, Rb, Rn, Rm; 8621 8622 // int i; 8623 8624 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 8625 8626 // for (i = 0; i < len; i++) { 8627 // int j; 8628 8629 // Pa = Pa_base; 8630 // Pb = Pa_base + i; 8631 // Pm = Pm_base; 8632 // Pn = Pn_base + i; 8633 8634 // Ra = *Pa; 8635 // Rb = *Pb; 8636 // Rm = *Pm; 8637 // Rn = *Pn; 8638 8639 // int iters = (i+1)/2; 8640 // for (j = 0; iters--; j++) { 8641 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8642 // MACC2(Ra, Rb, t0, t1, t2); 8643 // Ra = *++Pa; 8644 // Rb = *--Pb; 8645 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8646 // MACC(Rm, Rn, t0, t1, t2); 8647 // Rm = *++Pm; 8648 // Rn = *--Pn; 8649 // } 8650 // if ((i & 1) == 0) { 8651 // assert(Ra == Pa_base[j], "must be"); 8652 // MACC(Ra, Ra, t0, t1, t2); 8653 // } 8654 // iters = i/2; 8655 // assert(iters == i-j, "must be"); 8656 // for (; iters--; j++) { 8657 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8658 // MACC(Rm, Rn, t0, t1, t2); 8659 // Rm = *++Pm; 8660 // Rn = *--Pn; 8661 // } 8662 8663 // *Pm = Rm = t0 * inv; 8664 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8665 // MACC(Rm, Rn, t0, t1, t2); 8666 8667 // assert(t0 == 0, "broken Montgomery multiply"); 8668 8669 // t0 = t1; t1 = t2; t2 = 0; 8670 // } 8671 8672 // for (i = len; i < 2*len; i++) { 8673 // int start = i-len+1; 8674 // int end = start + (len - start)/2; 8675 // int j; 8676 8677 // Pa = Pa_base + i-len; 8678 // Pb = Pa_base + len; 8679 // Pm = Pm_base + i-len; 8680 // Pn = Pn_base + len; 8681 8682 // Ra = *++Pa; 8683 // Rb = *--Pb; 8684 // Rm = *++Pm; 8685 // Rn = *--Pn; 8686 8687 // int iters = (2*len-i-1)/2; 8688 // assert(iters == end-start, "must be"); 8689 // for (j = start; iters--; j++) { 8690 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8691 // MACC2(Ra, Rb, t0, t1, t2); 8692 // Ra = *++Pa; 8693 // Rb = *--Pb; 8694 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8695 // MACC(Rm, Rn, t0, t1, t2); 8696 // Rm = *++Pm; 8697 // Rn = *--Pn; 8698 // } 8699 // if ((i & 1) == 0) { 8700 // assert(Ra == Pa_base[j], "must be"); 8701 // MACC(Ra, Ra, t0, t1, t2); 8702 // } 8703 // iters = (2*len-i)/2; 8704 // assert(iters == len-j, "must be"); 8705 // for (; iters--; j++) { 8706 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8707 // MACC(Rm, Rn, t0, t1, t2); 8708 // Rm = *++Pm; 8709 // Rn = *--Pn; 8710 // } 8711 // Pm_base[i-len] = t0; 8712 // t0 = t1; t1 = t2; t2 = 0; 8713 // } 8714 8715 // while (t0) 8716 // t0 = sub(Pm_base, Pn_base, t0, len); 8717 // } 8718 }; 8719 8720 void generate_vector_math_stubs() { 8721 // Get native vector math stub routine addresses 8722 void* libsleef = nullptr; 8723 char ebuf[1024]; 8724 char dll_name[JVM_MAXPATHLEN]; 8725 if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) { 8726 libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf); 8727 } 8728 if (libsleef == nullptr) { 8729 log_info(library)("Failed to load native vector math library, %s!", ebuf); 
8730 return;
8731 }
8732 // Method naming convention
8733 // All the methods are named as <OP><T><N>_<U><suffix>
8734 // Where:
8735 // <OP> is the operation name, e.g. sin
8736 // <T> is optional to indicate float/double
8737 // "f/d" for vector float/double operation
8738 // <N> is the number of elements in the vector
8739 // "2/4" for neon, and "x" for sve
8740 // <U> is the precision level
8741 // "u10/u05" represents 1.0/0.5 ULP error bounds
8742 // We use "u10" for all operations by default
8743 // But for those functions that do not have u10 support, we use "u05" instead
8744 // <suffix> indicates neon/sve
8745 // "sve/advsimd" for sve/neon implementations
8746 // e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions
8747 // cosd2_u10advsimd is the method for computing a 2-element vector double cos using NEON instructions
8748 //
8749 log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
8750
8751 // Math vector stubs implemented with SVE for scalable vector size.
8752 if (UseSVE > 0) {
8753 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
8754 int vop = VectorSupport::VECTOR_OP_MATH_START + op;
8755 // Skip "tanh" because there is a performance regression
8756 if (vop == VectorSupport::VECTOR_OP_TANH) {
8757 continue;
8758 }
8759
8760 // The native library does not support the u10 level of "hypot".
8761 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
8762
8763 snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
8764 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
8765
8766 snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
8767 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
8768 }
8769 }
8770
8771 // Math vector stubs implemented with NEON for 64/128-bit vector sizes.
8772 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
8773 int vop = VectorSupport::VECTOR_OP_MATH_START + op;
8774 // Skip "tanh" because there is a performance regression
8775 if (vop == VectorSupport::VECTOR_OP_TANH) {
8776 continue;
8777 }
8778
8779 // The native library does not support the u10 level of "hypot".
8780 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
8781
8782 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
8783 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);
8784
8785 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
8786 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
8787
8788 snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
8789 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
8790 }
8791 }
8792
8793 // Initialization
8794 void generate_initial_stubs() {
8795 // Generate initial stubs and initialize the entry points
8796
8797 // entry points that exist in all platforms. Note: This is code
8798 // that could be shared among different platforms - however the
8799 // benefit seems to be smaller than the disadvantage of having a
8800 // much more complicated generator structure. See also comment in
8801 // stubRoutines.hpp.
8802
8803 StubRoutines::_forward_exception_entry = generate_forward_exception();
8804
8805 StubRoutines::_call_stub_entry =
8806 generate_call_stub(StubRoutines::_call_stub_return_address);
8807
8808 // is referenced by megamorphic call
8809 StubRoutines::_catch_exception_entry = generate_catch_exception();
8810
8811 // Initialize table for copy memory (arraycopy) check.
8812 if (UnsafeMemoryAccess::_table == nullptr) {
8813 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
8814 }
8815
8816 if (UseCRC32Intrinsics) {
8817 // set the table address before generating stubs that use it
8818 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
8819 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8820 }
8821
8822 if (UseCRC32CIntrinsics) {
8823 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
8824 }
8825
8826 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8827 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
8828 }
8829
8830 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8831 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
8832 }
8833
8834 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
8835 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
8836 StubRoutines::_hf2f = generate_float16ToFloat();
8837 StubRoutines::_f2hf = generate_floatToFloat16();
8838 }
8839 }
8840
8841 void generate_continuation_stubs() {
8842 // Continuation stubs:
8843 StubRoutines::_cont_thaw = generate_cont_thaw();
8844 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8845 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8846 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
8847 }
8848
8849 void generate_final_stubs() {
8850 // support for verify_oop (must happen after universe_init)
8851 if (VerifyOops) {
8852 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
8853 }
8854
8855 // arraycopy stubs used by compilers
8856 generate_arraycopy_stubs();
8857
8858 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8859 if (bs_nm != nullptr) {
8860 StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
8861 }
8862
8863 StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8864
8865 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
8866 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
8867
8868 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8869
8870 generate_atomic_entry_points();
8871
8872 #endif // LINUX
8873
8874 #ifdef COMPILER2
8875 if (UseSecondarySupersTable) {
8876 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
8877 if (! InlineSecondarySupersTest) {
8878 generate_lookup_secondary_supers_table_stub();
8879 }
8880 }
8881 #endif
8882
8883 StubRoutines::aarch64::set_completed(); // Indicate that the arraycopy and zero_blocks stubs are generated
8884 }
8885
8886 void generate_compiler_stubs() {
8887 #if COMPILER2_OR_JVMCI
8888
8889 if (UseSVE == 0) {
8890 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
8891 }
8892
8893 // array equals stub for large arrays.
8894 if (!UseSimpleArrayEquals) {
8895 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8896 }
8897
8898 // arrays_hashcode stub for large arrays.
8899 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
8900 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
8901 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
8902 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
8903 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
8904
8905 // byte_array_inflate stub for large arrays.
8906 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8907
8908 // countPositives stub for large arrays.
8909 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
8910
8911 generate_compare_long_strings();
8912
8913 generate_string_indexof_stubs();
8914
8915 #ifdef COMPILER2
8916 if (UseMultiplyToLenIntrinsic) {
8917 StubRoutines::_multiplyToLen = generate_multiplyToLen();
8918 }
8919
8920 if (UseSquareToLenIntrinsic) {
8921 StubRoutines::_squareToLen = generate_squareToLen();
8922 }
8923
8924 if (UseMulAddIntrinsic) {
8925 StubRoutines::_mulAdd = generate_mulAdd();
8926 }
8927
8928 if (UseSIMDForBigIntegerShiftIntrinsics) {
8929 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8930 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
8931 }
8932
8933 if (UseMontgomeryMultiplyIntrinsic) {
8934 StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
8935 StubCodeMark mark(this, stub_id);
8936 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8937 StubRoutines::_montgomeryMultiply = g.generate_multiply();
8938 }
8939
8940 if (UseMontgomerySquareIntrinsic) {
8941 StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
8942 StubCodeMark mark(this, stub_id);
8943 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8944 // We use generate_multiply() rather than generate_square()
8945 // because it's faster for the sizes of modulus we care about.
8946 StubRoutines::_montgomerySquare = g.generate_multiply(); 8947 } 8948 8949 generate_vector_math_stubs(); 8950 8951 #endif // COMPILER2 8952 8953 if (UseChaCha20Intrinsics) { 8954 StubRoutines::_chacha20Block = generate_chacha20Block_qrpar(); 8955 } 8956 8957 if (UseBASE64Intrinsics) { 8958 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 8959 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 8960 } 8961 8962 // data cache line writeback 8963 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 8964 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 8965 8966 if (UseAESIntrinsics) { 8967 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 8968 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 8969 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 8970 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 8971 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 8972 } 8973 if (UseGHASHIntrinsics) { 8974 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 8975 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 8976 } 8977 if (UseAESIntrinsics && UseGHASHIntrinsics) { 8978 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 8979 } 8980 8981 if (UseMD5Intrinsics) { 8982 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubGenStubId::md5_implCompress_id); 8983 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id); 8984 } 8985 if (UseSHA1Intrinsics) { 8986 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id); 8987 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id); 8988 } 8989 if (UseSHA256Intrinsics) { 8990 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id); 8991 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id); 8992 } 8993 if (UseSHA512Intrinsics) { 8994 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id); 8995 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id); 8996 } 8997 if (UseSHA3Intrinsics) { 8998 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id); 8999 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id); 9000 } 9001 9002 if (UsePoly1305Intrinsics) { 9003 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 9004 } 9005 9006 // generate Adler32 intrinsics code 9007 if (UseAdler32Intrinsics) { 9008 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 9009 } 9010 9011 #endif // COMPILER2_OR_JVMCI 9012 } 9013 9014 public: 9015 StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) { 9016 switch(blob_id) { 9017 case initial_id: 9018 generate_initial_stubs(); 9019 break; 9020 case continuation_id: 9021 generate_continuation_stubs(); 9022 break; 9023 case compiler_id: 9024 generate_compiler_stubs(); 9025 break; 9026 case final_id: 9027 generate_final_stubs(); 9028 break; 9029 default: 9030 fatal("unexpected blob id: %d", blob_id); 9031 
break; 9032 }; 9033 } 9034 }; // end class declaration 9035 9036 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) { 9037 StubGenerator g(code, blob_id); 9038 } 9039 9040 9041 #if defined (LINUX) 9042 9043 // Define pointers to atomic stubs and initialize them to point to the 9044 // code in atomic_aarch64.S. 9045 9046 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 9047 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 9048 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 9049 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 9050 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 9051 9052 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 9053 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 9054 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 9055 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 9056 DEFAULT_ATOMIC_OP(xchg, 4, ) 9057 DEFAULT_ATOMIC_OP(xchg, 8, ) 9058 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 9059 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 9060 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 9061 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 9062 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 9063 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 9064 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 9065 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 9066 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 9067 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 9068 9069 #undef DEFAULT_ATOMIC_OP 9070 9071 #endif // LINUX
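// For reference, one expansion of DEFAULT_ATOMIC_OP above -- e.g.
// DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) -- produces approximately:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_relaxed_impl
//     = aarch64_atomic_fetch_add_4_relaxed_default_impl;
//
// i.e. each stub pointer initially refers to the hand-written default in
// atomic_aarch64.S; generate_atomic_entry_points() may later repoint it
// to a generated stub.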