1 /* 2 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "asm/register.hpp" 30 #include "atomic_aarch64.hpp" 31 #include "code/SCCache.hpp" 32 #include "compiler/oopMap.hpp" 33 #include "gc/shared/barrierSet.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/gc_globals.hpp" 36 #include "gc/shared/tlab_globals.hpp" 37 #include "interpreter/interpreter.hpp" 38 #include "memory/universe.hpp" 39 #include "nativeInst_aarch64.hpp" 40 #include "oops/instanceOop.hpp" 41 #include "oops/method.hpp" 42 #include "oops/objArrayKlass.hpp" 43 #include "oops/oop.inline.hpp" 44 #include "prims/methodHandles.hpp" 45 #include "prims/upcallLinker.hpp" 46 #include "runtime/arguments.hpp" 47 #include "runtime/atomic.hpp" 48 #include "runtime/continuation.hpp" 49 #include "runtime/continuationEntry.inline.hpp" 50 #include "runtime/frame.inline.hpp" 51 #include "runtime/handles.inline.hpp" 52 #include "runtime/javaThread.hpp" 53 #include "runtime/sharedRuntime.hpp" 54 #include "runtime/stubCodeGenerator.hpp" 55 #include "runtime/stubRoutines.hpp" 56 #include "utilities/align.hpp" 57 #include "utilities/checkedCast.hpp" 58 #include "utilities/debug.hpp" 59 #include "utilities/globalDefinitions.hpp" 60 #include "utilities/intpow.hpp" 61 #include "utilities/powerOfTwo.hpp" 62 #ifdef COMPILER2 63 #include "opto/runtime.hpp" 64 #endif 65 #if INCLUDE_ZGC 66 #include "gc/z/zThreadLocalData.hpp" 67 #endif 68 69 // Declaration and definition of StubGenerator (no .hpp file). 
70 // For a more detailed description of the stub routine structure 71 // see the comment in stubRoutines.hpp 72 73 #undef __ 74 #define __ _masm-> 75 76 #ifdef PRODUCT 77 #define BLOCK_COMMENT(str) /* nothing */ 78 #else 79 #define BLOCK_COMMENT(str) __ block_comment(str) 80 #endif 81 82 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 83 84 // Stub Code definitions 85 86 class StubGenerator: public StubCodeGenerator { 87 private: 88 89 #ifdef PRODUCT 90 #define inc_counter_np(counter) ((void)0) 91 #else 92 void inc_counter_np_(uint& counter) { 93 __ incrementw(ExternalAddress((address)&counter)); 94 } 95 #define inc_counter_np(counter) \ 96 BLOCK_COMMENT("inc_counter " #counter); \ 97 inc_counter_np_(counter); 98 #endif 99 100 // Call stubs are used to call Java from C 101 // 102 // Arguments: 103 // c_rarg0: call wrapper address address 104 // c_rarg1: result address 105 // c_rarg2: result type BasicType 106 // c_rarg3: method Method* 107 // c_rarg4: (interpreter) entry point address 108 // c_rarg5: parameters intptr_t* 109 // c_rarg6: parameter size (in words) int 110 // c_rarg7: thread Thread* 111 // 112 // There is no return from the stub itself as any Java result 113 // is written to result 114 // 115 // we save r30 (lr) as the return PC at the base of the frame and 116 // link r29 (fp) below it as the frame pointer installing sp (r31) 117 // into fp. 118 // 119 // we save r0-r7, which accounts for all the c arguments. 120 // 121 // TODO: strictly do we need to save them all? they are treated as 122 // volatile by C so could we omit saving the ones we are going to 123 // place in global registers (thread? method?) or those we only use 124 // during setup of the Java call? 125 // 126 // we don't need to save r8 which C uses as an indirect result location 127 // return register. 128 // 129 // we don't need to save r9-r15 which both C and Java treat as 130 // volatile 131 // 132 // we don't need to save r16-18 because Java does not use them 133 // 134 // we save r19-r28 which Java uses as scratch registers and C 135 // expects to be callee-save 136 // 137 // we save the bottom 64 bits of each value stored in v8-v15; it is 138 // the responsibility of the caller to preserve larger values. 139 // 140 // so the stub frame looks like this when we enter Java code 141 // 142 // [ return_from_Java ] <--- sp 143 // [ argument word n ] 144 // ... 
145 // -29 [ argument word 1 ] 146 // -28 [ saved Floating-point Control Register ] 147 // -26 [ saved v15 ] <--- sp_after_call 148 // -25 [ saved v14 ] 149 // -24 [ saved v13 ] 150 // -23 [ saved v12 ] 151 // -22 [ saved v11 ] 152 // -21 [ saved v10 ] 153 // -20 [ saved v9 ] 154 // -19 [ saved v8 ] 155 // -18 [ saved r28 ] 156 // -17 [ saved r27 ] 157 // -16 [ saved r26 ] 158 // -15 [ saved r25 ] 159 // -14 [ saved r24 ] 160 // -13 [ saved r23 ] 161 // -12 [ saved r22 ] 162 // -11 [ saved r21 ] 163 // -10 [ saved r20 ] 164 // -9 [ saved r19 ] 165 // -8 [ call wrapper (r0) ] 166 // -7 [ result (r1) ] 167 // -6 [ result type (r2) ] 168 // -5 [ method (r3) ] 169 // -4 [ entry point (r4) ] 170 // -3 [ parameters (r5) ] 171 // -2 [ parameter size (r6) ] 172 // -1 [ thread (r7) ] 173 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 174 // 1 [ saved lr (r30) ] 175 176 // Call stub stack layout word offsets from fp 177 enum call_stub_layout { 178 sp_after_call_off = -28, 179 180 fpcr_off = sp_after_call_off, 181 d15_off = -26, 182 d13_off = -24, 183 d11_off = -22, 184 d9_off = -20, 185 186 r28_off = -18, 187 r26_off = -16, 188 r24_off = -14, 189 r22_off = -12, 190 r20_off = -10, 191 call_wrapper_off = -8, 192 result_off = -7, 193 result_type_off = -6, 194 method_off = -5, 195 entry_point_off = -4, 196 parameter_size_off = -2, 197 thread_off = -1, 198 fp_f = 0, 199 retaddr_off = 1, 200 }; 201 202 address generate_call_stub(address& return_address) { 203 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 204 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 205 "adjust this code"); 206 207 StubCodeMark mark(this, "StubRoutines", "call_stub"); 208 address start = __ pc(); 209 210 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 211 212 const Address fpcr_save (rfp, fpcr_off * wordSize); 213 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 214 const Address result (rfp, result_off * wordSize); 215 const Address result_type (rfp, result_type_off * wordSize); 216 const Address method (rfp, method_off * wordSize); 217 const Address entry_point (rfp, entry_point_off * wordSize); 218 const Address parameter_size(rfp, parameter_size_off * wordSize); 219 220 const Address thread (rfp, thread_off * wordSize); 221 222 const Address d15_save (rfp, d15_off * wordSize); 223 const Address d13_save (rfp, d13_off * wordSize); 224 const Address d11_save (rfp, d11_off * wordSize); 225 const Address d9_save (rfp, d9_off * wordSize); 226 227 const Address r28_save (rfp, r28_off * wordSize); 228 const Address r26_save (rfp, r26_off * wordSize); 229 const Address r24_save (rfp, r24_off * wordSize); 230 const Address r22_save (rfp, r22_off * wordSize); 231 const Address r20_save (rfp, r20_off * wordSize); 232 233 // stub code 234 235 address aarch64_entry = __ pc(); 236 237 // set up frame and move sp to end of save area 238 __ enter(); 239 __ sub(sp, rfp, -sp_after_call_off * wordSize); 240 241 // save register parameters and Java scratch/global registers 242 // n.b. 
we save thread even though it gets installed in 243 // rthread because we want to sanity check rthread later 244 __ str(c_rarg7, thread); 245 __ strw(c_rarg6, parameter_size); 246 __ stp(c_rarg4, c_rarg5, entry_point); 247 __ stp(c_rarg2, c_rarg3, result_type); 248 __ stp(c_rarg0, c_rarg1, call_wrapper); 249 250 __ stp(r20, r19, r20_save); 251 __ stp(r22, r21, r22_save); 252 __ stp(r24, r23, r24_save); 253 __ stp(r26, r25, r26_save); 254 __ stp(r28, r27, r28_save); 255 256 __ stpd(v9, v8, d9_save); 257 __ stpd(v11, v10, d11_save); 258 __ stpd(v13, v12, d13_save); 259 __ stpd(v15, v14, d15_save); 260 261 __ get_fpcr(rscratch1); 262 __ str(rscratch1, fpcr_save); 263 // Set FPCR to the state we need. We do want Round to Nearest. We 264 // don't want non-IEEE rounding modes or floating-point traps. 265 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode 266 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12) 267 __ set_fpcr(rscratch1); 268 269 // install Java thread in global register now we have saved 270 // whatever value it held 271 __ mov(rthread, c_rarg7); 272 // And method 273 __ mov(rmethod, c_rarg3); 274 275 // set up the heapbase register 276 __ reinit_heapbase(); 277 278 #ifdef ASSERT 279 // make sure we have no pending exceptions 280 { 281 Label L; 282 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 283 __ cmp(rscratch1, (u1)NULL_WORD); 284 __ br(Assembler::EQ, L); 285 __ stop("StubRoutines::call_stub: entered with pending exception"); 286 __ BIND(L); 287 } 288 #endif 289 // pass parameters if any 290 __ mov(esp, sp); 291 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 292 __ andr(sp, rscratch1, -2 * wordSize); 293 294 BLOCK_COMMENT("pass parameters if any"); 295 Label parameters_done; 296 // parameter count is still in c_rarg6 297 // and parameter pointer identifying param 1 is in c_rarg5 298 __ cbzw(c_rarg6, parameters_done); 299 300 address loop = __ pc(); 301 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 302 __ subsw(c_rarg6, c_rarg6, 1); 303 __ push(rscratch1); 304 __ br(Assembler::GT, loop); 305 306 __ BIND(parameters_done); 307 308 // call Java entry -- passing methdoOop, and current sp 309 // rmethod: Method* 310 // r19_sender_sp: sender sp 311 BLOCK_COMMENT("call Java function"); 312 __ mov(r19_sender_sp, sp); 313 __ blr(c_rarg4); 314 315 // we do this here because the notify will already have been done 316 // if we get to the next instruction via an exception 317 // 318 // n.b. adding this instruction here affects the calculation of 319 // whether or not a routine returns to the call stub (used when 320 // doing stack walks) since the normal test is to check the return 321 // pc against the address saved below. so we may need to allow for 322 // this extra instruction in the check. 323 324 // save current address for use by exception handling code 325 326 return_address = __ pc(); 327 328 // store result depending on type (everything that is not 329 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 330 // n.b. 
this assumes Java returns an integral result in r0 331 // and a floating result in j_farg0 332 __ ldr(j_rarg2, result); 333 Label is_long, is_float, is_double, exit; 334 __ ldr(j_rarg1, result_type); 335 __ cmp(j_rarg1, (u1)T_OBJECT); 336 __ br(Assembler::EQ, is_long); 337 __ cmp(j_rarg1, (u1)T_LONG); 338 __ br(Assembler::EQ, is_long); 339 __ cmp(j_rarg1, (u1)T_FLOAT); 340 __ br(Assembler::EQ, is_float); 341 __ cmp(j_rarg1, (u1)T_DOUBLE); 342 __ br(Assembler::EQ, is_double); 343 344 // handle T_INT case 345 __ strw(r0, Address(j_rarg2)); 346 347 __ BIND(exit); 348 349 // pop parameters 350 __ sub(esp, rfp, -sp_after_call_off * wordSize); 351 352 #ifdef ASSERT 353 // verify that threads correspond 354 { 355 Label L, S; 356 __ ldr(rscratch1, thread); 357 __ cmp(rthread, rscratch1); 358 __ br(Assembler::NE, S); 359 __ get_thread(rscratch1); 360 __ cmp(rthread, rscratch1); 361 __ br(Assembler::EQ, L); 362 __ BIND(S); 363 __ stop("StubRoutines::call_stub: threads must correspond"); 364 __ BIND(L); 365 } 366 #endif 367 368 __ pop_cont_fastpath(rthread); 369 370 // restore callee-save registers 371 __ ldpd(v15, v14, d15_save); 372 __ ldpd(v13, v12, d13_save); 373 __ ldpd(v11, v10, d11_save); 374 __ ldpd(v9, v8, d9_save); 375 376 __ ldp(r28, r27, r28_save); 377 __ ldp(r26, r25, r26_save); 378 __ ldp(r24, r23, r24_save); 379 __ ldp(r22, r21, r22_save); 380 __ ldp(r20, r19, r20_save); 381 382 // restore fpcr 383 __ ldr(rscratch1, fpcr_save); 384 __ set_fpcr(rscratch1); 385 386 __ ldp(c_rarg0, c_rarg1, call_wrapper); 387 __ ldrw(c_rarg2, result_type); 388 __ ldr(c_rarg3, method); 389 __ ldp(c_rarg4, c_rarg5, entry_point); 390 __ ldp(c_rarg6, c_rarg7, parameter_size); 391 392 // leave frame and return to caller 393 __ leave(); 394 __ ret(lr); 395 396 // handle return types different from T_INT 397 398 __ BIND(is_long); 399 __ str(r0, Address(j_rarg2, 0)); 400 __ br(Assembler::AL, exit); 401 402 __ BIND(is_float); 403 __ strs(j_farg0, Address(j_rarg2, 0)); 404 __ br(Assembler::AL, exit); 405 406 __ BIND(is_double); 407 __ strd(j_farg0, Address(j_rarg2, 0)); 408 __ br(Assembler::AL, exit); 409 410 return start; 411 } 412 413 // Return point for a Java call if there's an exception thrown in 414 // Java code. The exception is caught and transformed into a 415 // pending exception stored in JavaThread that can be tested from 416 // within the VM. 417 // 418 // Note: Usually the parameters are removed by the callee. In case 419 // of an exception crossing an activation frame boundary, that is 420 // not the case if the callee is compiled code => need to setup the 421 // rsp. 
422 // 423 // r0: exception oop 424 425 address generate_catch_exception() { 426 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 427 address start = __ pc(); 428 429 // same as in generate_call_stub(): 430 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 431 const Address thread (rfp, thread_off * wordSize); 432 433 #ifdef ASSERT 434 // verify that threads correspond 435 { 436 Label L, S; 437 __ ldr(rscratch1, thread); 438 __ cmp(rthread, rscratch1); 439 __ br(Assembler::NE, S); 440 __ get_thread(rscratch1); 441 __ cmp(rthread, rscratch1); 442 __ br(Assembler::EQ, L); 443 __ bind(S); 444 __ stop("StubRoutines::catch_exception: threads must correspond"); 445 __ bind(L); 446 } 447 #endif 448 449 // set pending exception 450 __ verify_oop(r0); 451 452 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 453 __ mov(rscratch1, (address)__FILE__); 454 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 455 __ movw(rscratch1, (int)__LINE__); 456 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 457 458 // complete return to VM 459 assert(StubRoutines::_call_stub_return_address != nullptr, 460 "_call_stub_return_address must have been generated before"); 461 __ b(StubRoutines::_call_stub_return_address); 462 463 return start; 464 } 465 466 // Continuation point for runtime calls returning with a pending 467 // exception. The pending exception check happened in the runtime 468 // or native call stub. The pending exception in Thread is 469 // converted into a Java-level exception. 470 // 471 // Contract with Java-level exception handlers: 472 // r0: exception 473 // r3: throwing pc 474 // 475 // NOTE: At entry of this stub, exception-pc must be in LR !! 476 477 // NOTE: this is always used as a jump target within generated code 478 // so it just needs to be generated code with no x86 prolog 479 480 address generate_forward_exception() { 481 StubCodeMark mark(this, "StubRoutines", "forward exception"); 482 address start = __ pc(); 483 484 // Upon entry, LR points to the return address returning into 485 // Java (interpreted or compiled) code; i.e., the return address 486 // becomes the throwing pc. 487 // 488 // Arguments pushed before the runtime call are still on the stack 489 // but the exception handler will reset the stack pointer -> 490 // ignore them. A potential result in registers can be ignored as 491 // well. 492 493 #ifdef ASSERT 494 // make sure this code is only executed if there is a pending exception 495 { 496 Label L; 497 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 498 __ cbnz(rscratch1, L); 499 __ stop("StubRoutines::forward exception: no pending exception (1)"); 500 __ bind(L); 501 } 502 #endif 503 504 // compute exception handler into r19 505 506 // call the VM to find the handler address associated with the 507 // caller address. pass thread in r0 and caller pc (ret address) 508 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 509 // the stack. 510 __ mov(c_rarg1, lr); 511 // lr will be trashed by the VM call so we move it to R19 512 // (callee-saved) because we also need to pass it to the handler 513 // returned by this call. 
514 __ mov(r19, lr); 515 BLOCK_COMMENT("call exception_handler_for_return_address"); 516 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 517 SharedRuntime::exception_handler_for_return_address), 518 rthread, c_rarg1); 519 // Reinitialize the ptrue predicate register, in case the external runtime 520 // call clobbers ptrue reg, as we may return to SVE compiled code. 521 __ reinitialize_ptrue(); 522 523 // we should not really care that lr is no longer the callee 524 // address. we saved the value the handler needs in r19 so we can 525 // just copy it to r3. however, the C2 handler will push its own 526 // frame and then calls into the VM and the VM code asserts that 527 // the PC for the frame above the handler belongs to a compiled 528 // Java method. So, we restore lr here to satisfy that assert. 529 __ mov(lr, r19); 530 // setup r0 & r3 & clear pending exception 531 __ mov(r3, r19); 532 __ mov(r19, r0); 533 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 534 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 535 536 #ifdef ASSERT 537 // make sure exception is set 538 { 539 Label L; 540 __ cbnz(r0, L); 541 __ stop("StubRoutines::forward exception: no pending exception (2)"); 542 __ bind(L); 543 } 544 #endif 545 546 // continue at exception handler 547 // r0: exception 548 // r3: throwing pc 549 // r19: exception handler 550 __ verify_oop(r0); 551 __ br(r19); 552 553 return start; 554 } 555 556 // Non-destructive plausibility checks for oops 557 // 558 // Arguments: 559 // r0: oop to verify 560 // rscratch1: error message 561 // 562 // Stack after saving c_rarg3: 563 // [tos + 0]: saved c_rarg3 564 // [tos + 1]: saved c_rarg2 565 // [tos + 2]: saved lr 566 // [tos + 3]: saved rscratch2 567 // [tos + 4]: saved r0 568 // [tos + 5]: saved rscratch1 569 address generate_verify_oop() { 570 571 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 572 address start = __ pc(); 573 574 Label exit, error; 575 576 // save c_rarg2 and c_rarg3 577 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 578 579 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 580 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 581 __ ldr(c_rarg3, Address(c_rarg2)); 582 __ add(c_rarg3, c_rarg3, 1); 583 __ str(c_rarg3, Address(c_rarg2)); 584 585 // object is in r0 586 // make sure object is 'reasonable' 587 __ cbz(r0, exit); // if obj is null it is OK 588 589 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 590 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 591 592 // return if everything seems ok 593 __ bind(exit); 594 595 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 596 __ ret(lr); 597 598 // handle errors 599 __ bind(error); 600 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 601 602 __ push(RegSet::range(r0, r29), sp); 603 // debug(char* msg, int64_t pc, int64_t regs[]) 604 __ mov(c_rarg0, rscratch1); // pass address of error message 605 __ mov(c_rarg1, lr); // pass return address 606 __ mov(c_rarg2, sp); // pass address of regs on stack 607 #ifndef PRODUCT 608 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 609 #endif 610 BLOCK_COMMENT("call MacroAssembler::debug"); 611 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 612 __ blr(rscratch1); 613 __ hlt(0); 614 615 return start; 616 } 617 618 // Generate indices for iota vector. 
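  // The table emitted below packs, for each element size, the identity
  // sequence {0, 1, 2, ...} into 16-byte rows (each pair of emit_data64
  // values is one row, little-endian within each 64-bit word):
  // B = bytes 0..15, H = halfwords 0..7, S = words 0..3, D = doublewords 0..1,
  // followed by the sequences 0.0f..3.0f and 0.0, 1.0 encoded as IEEE-754
  // float and double bit patterns.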
619 address generate_iota_indices(const char *stub_name) { 620 __ align(CodeEntryAlignment); 621 StubCodeMark mark(this, "StubRoutines", stub_name); 622 address start = __ pc(); 623 // B 624 __ emit_data64(0x0706050403020100, relocInfo::none); 625 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 626 // H 627 __ emit_data64(0x0003000200010000, relocInfo::none); 628 __ emit_data64(0x0007000600050004, relocInfo::none); 629 // S 630 __ emit_data64(0x0000000100000000, relocInfo::none); 631 __ emit_data64(0x0000000300000002, relocInfo::none); 632 // D 633 __ emit_data64(0x0000000000000000, relocInfo::none); 634 __ emit_data64(0x0000000000000001, relocInfo::none); 635 // S - FP 636 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 637 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 638 // D - FP 639 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 640 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 641 return start; 642 } 643 644 // The inner part of zero_words(). This is the bulk operation, 645 // zeroing words in blocks, possibly using DC ZVA to do it. The 646 // caller is responsible for zeroing the last few words. 647 // 648 // Inputs: 649 // r10: the HeapWord-aligned base address of an array to zero. 650 // r11: the count in HeapWords, r11 > 0. 651 // 652 // Returns r10 and r11, adjusted for the caller to clear. 653 // r10: the base address of the tail of words left to clear. 654 // r11: the number of words in the tail. 655 // r11 < MacroAssembler::zero_words_block_size. 656 657 address generate_zero_blocks() { 658 Label done; 659 Label base_aligned; 660 661 Register base = r10, cnt = r11; 662 663 __ align(CodeEntryAlignment); 664 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 665 address start = __ pc(); 666 667 if (UseBlockZeroing) { 668 int zva_length = VM_Version::zva_length(); 669 670 // Ensure ZVA length can be divided by 16. This is required by 671 // the subsequent operations. 672 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 673 674 __ tbz(base, 3, base_aligned); 675 __ str(zr, Address(__ post(base, 8))); 676 __ sub(cnt, cnt, 1); 677 __ bind(base_aligned); 678 679 // Ensure count >= zva_length * 2 so that it still deserves a zva after 680 // alignment. 681 Label small; 682 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 683 __ subs(rscratch1, cnt, low_limit >> 3); 684 __ br(Assembler::LT, small); 685 __ zero_dcache_blocks(base, cnt); 686 __ bind(small); 687 } 688 689 { 690 // Number of stp instructions we'll unroll 691 const int unroll = 692 MacroAssembler::zero_words_block_size / 2; 693 // Clear the remaining blocks. 694 Label loop; 695 __ subs(cnt, cnt, unroll * 2); 696 __ br(Assembler::LT, done); 697 __ bind(loop); 698 for (int i = 0; i < unroll; i++) 699 __ stp(zr, zr, __ post(base, 16)); 700 __ subs(cnt, cnt, unroll * 2); 701 __ br(Assembler::GE, loop); 702 __ bind(done); 703 __ add(cnt, cnt, unroll * 2); 704 } 705 706 __ ret(lr); 707 708 return start; 709 } 710 711 712 typedef enum { 713 copy_forwards = 1, 714 copy_backwards = -1 715 } copy_direction; 716 717 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 718 // for arraycopy stubs. 
719 class ArrayCopyBarrierSetHelper : StackObj { 720 BarrierSetAssembler* _bs_asm; 721 MacroAssembler* _masm; 722 DecoratorSet _decorators; 723 BasicType _type; 724 Register _gct1; 725 Register _gct2; 726 Register _gct3; 727 FloatRegister _gcvt1; 728 FloatRegister _gcvt2; 729 FloatRegister _gcvt3; 730 731 public: 732 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 733 DecoratorSet decorators, 734 BasicType type, 735 Register gct1, 736 Register gct2, 737 Register gct3, 738 FloatRegister gcvt1, 739 FloatRegister gcvt2, 740 FloatRegister gcvt3) 741 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 742 _masm(masm), 743 _decorators(decorators), 744 _type(type), 745 _gct1(gct1), 746 _gct2(gct2), 747 _gct3(gct3), 748 _gcvt1(gcvt1), 749 _gcvt2(gcvt2), 750 _gcvt3(gcvt3) { 751 } 752 753 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 754 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 755 dst1, dst2, src, 756 _gct1, _gct2, _gcvt1); 757 } 758 759 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 760 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 761 dst, src1, src2, 762 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 763 } 764 765 void copy_load_at_16(Register dst1, Register dst2, Address src) { 766 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 767 dst1, dst2, src, 768 _gct1); 769 } 770 771 void copy_store_at_16(Address dst, Register src1, Register src2) { 772 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 773 dst, src1, src2, 774 _gct1, _gct2, _gct3); 775 } 776 777 void copy_load_at_8(Register dst, Address src) { 778 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 779 dst, noreg, src, 780 _gct1); 781 } 782 783 void copy_store_at_8(Address dst, Register src) { 784 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 785 dst, src, noreg, 786 _gct1, _gct2, _gct3); 787 } 788 }; 789 790 // Bulk copy of blocks of 8 words. 791 // 792 // count is a count of words. 793 // 794 // Precondition: count >= 8 795 // 796 // Postconditions: 797 // 798 // The least significant bit of count contains the remaining count 799 // of words to copy. The rest of count is trash. 800 // 801 // s and d are adjusted to point to the remaining words to copy 802 // 803 void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count, 804 copy_direction direction) { 805 int unit = wordSize * direction; 806 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize; 807 808 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 809 t4 = r7, t5 = r11, t6 = r12, t7 = r13; 810 const Register stride = r14; 811 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 812 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 813 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 814 815 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7); 816 assert_different_registers(s, d, count, rscratch1, rscratch2); 817 818 Label again, drain; 819 const char *stub_name; 820 if (direction == copy_forwards) 821 stub_name = "forward_copy_longs"; 822 else 823 stub_name = "backward_copy_longs"; 824 825 __ align(CodeEntryAlignment); 826 827 StubCodeMark mark(this, "StubRoutines", stub_name); 828 829 __ bind(start); 830 831 Label unaligned_copy_long; 832 if (AvoidUnalignedAccesses) { 833 __ tbnz(d, 3, unaligned_copy_long); 834 } 835 836 if (direction == copy_forwards) { 837 __ sub(s, s, bias); 838 __ sub(d, d, bias); 839 } 840 841 #ifdef ASSERT 842 // Make sure we are never given < 8 words 843 { 844 Label L; 845 __ cmp(count, (u1)8); 846 __ br(Assembler::GE, L); 847 __ stop("genrate_copy_longs called with < 8 words"); 848 __ bind(L); 849 } 850 #endif 851 852 // Fill 8 registers 853 if (UseSIMDForMemoryOps) { 854 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 855 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 856 } else { 857 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 858 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 859 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 860 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 861 } 862 863 __ subs(count, count, 16); 864 __ br(Assembler::LO, drain); 865 866 int prefetch = PrefetchCopyIntervalInBytes; 867 bool use_stride = false; 868 if (direction == copy_backwards) { 869 use_stride = prefetch > 256; 870 prefetch = -prefetch; 871 if (use_stride) __ mov(stride, prefetch); 872 } 873 874 __ bind(again); 875 876 if (PrefetchCopyIntervalInBytes > 0) 877 __ prfm(use_stride ? 
                Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
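      //
      // As a concrete example (forwards copy, so unit == 8): after the
      // adjustment below, s has been moved back by 16 bytes and d by 8, so
      // the loads at {2, 4, 6, 8} * unit read the current 64-byte source
      // block at byte offsets {0, 16, 32, 48} of the original s, while the
      // stores at {1, 2, 4, 6, 8} * unit write the original d at byte
      // offsets {0, 8, 24, 40, 56}.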

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ?
-1 : 1; 1139 1140 Label Lword, Lint, Lshort, Lbyte; 1141 1142 assert(granularity 1143 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1144 1145 const Register t0 = r3; 1146 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1147 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg); 1148 1149 // ??? I don't know if this bit-test-and-branch is the right thing 1150 // to do. It does a lot of jumping, resulting in several 1151 // mispredicted branches. It might make more sense to do this 1152 // with something like Duff's device with a single computed branch. 1153 1154 __ tbz(count, 3 - exact_log2(granularity), Lword); 1155 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1156 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1157 __ bind(Lword); 1158 1159 if (granularity <= sizeof (jint)) { 1160 __ tbz(count, 2 - exact_log2(granularity), Lint); 1161 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1162 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1163 __ bind(Lint); 1164 } 1165 1166 if (granularity <= sizeof (jshort)) { 1167 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1168 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1169 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1170 __ bind(Lshort); 1171 } 1172 1173 if (granularity <= sizeof (jbyte)) { 1174 __ tbz(count, 0, Lbyte); 1175 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1176 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1177 __ bind(Lbyte); 1178 } 1179 } 1180 1181 Label copy_f, copy_b; 1182 Label copy_obj_f, copy_obj_b; 1183 Label copy_obj_uninit_f, copy_obj_uninit_b; 1184 1185 // All-singing all-dancing memory copy. 1186 // 1187 // Copy count units of memory from s to d. The size of a unit is 1188 // step, which can be positive or negative depending on the direction 1189 // of copy. If is_aligned is false, we align the source address. 1190 // 1191 1192 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1193 Register s, Register d, Register count, int step) { 1194 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1195 bool is_backwards = step < 0; 1196 unsigned int granularity = uabs(step); 1197 const Register t0 = r3, t1 = r4; 1198 1199 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1200 // load all the data before writing anything 1201 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1202 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1203 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1204 const Register send = r17, dend = r16; 1205 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1206 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1207 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1208 1209 if (PrefetchCopyIntervalInBytes > 0) 1210 __ prfm(Address(s, 0), PLDL1KEEP); 1211 __ cmp(count, u1((UseSIMDForMemoryOps ? 
                            96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely to occur when the granularity of the data
      // is less than 4 (sizeof(jint)). Pointers for arrays of jint are at
      // least 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The biggest performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
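      // In the 65..80 byte sub-case handled below, the two ldpq/stpq pairs
      // still move bytes 0..63; only the last 16 bytes, ending at dend, are
      // moved with a GPR pair via copy_load_at_16 / copy_store_at_16 (all
      // loads are still issued before any store, so overlapping copies
      // remain correct).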
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way, we can align the
    // source address on a 2-word boundary.

    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // cannot be used as a temp register, as it contains the count.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
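      // (s is already 8-byte aligned here, so testing bit 3 is enough: if
      // it is set we copy one word, the leading word for a forwards copy or
      // the trailing word for a backwards copy, and drop
      // wordSize/granularity elements from the count.)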
1367 __ tbz(s, exact_log2(wordSize), aligned); 1368 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1369 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1370 __ sub(count, count, wordSize/granularity); 1371 } else { 1372 if (is_backwards) { 1373 __ andr(r15, s, 2 * wordSize - 1); 1374 } else { 1375 __ neg(r15, s); 1376 __ andr(r15, r15, 2 * wordSize - 1); 1377 } 1378 // r15 is the byte adjustment needed to align s. 1379 __ cbz(r15, aligned); 1380 int shift = exact_log2(granularity); 1381 if (shift > 0) { 1382 __ lsr(r15, r15, shift); 1383 } 1384 __ sub(count, count, r15); 1385 1386 #if 0 1387 // ?? This code is only correct for a disjoint copy. It may or 1388 // may not make sense to use it in that case. 1389 1390 // Copy the first pair; s and d may not be aligned. 1391 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1392 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1393 1394 // Align s and d, adjust count 1395 if (is_backwards) { 1396 __ sub(s, s, r15); 1397 __ sub(d, d, r15); 1398 } else { 1399 __ add(s, s, r15); 1400 __ add(d, d, r15); 1401 } 1402 #else 1403 copy_memory_small(decorators, type, s, d, r15, step); 1404 #endif 1405 } 1406 1407 __ bind(aligned); 1408 1409 // s is now 2-word-aligned. 1410 1411 // We have a count of units and some trailing bytes. Adjust the 1412 // count and do a bulk copy of words. If the shift is zero 1413 // perform a move instead to benefit from zero latency moves. 1414 int shift = exact_log2(wordSize/granularity); 1415 if (shift > 0) { 1416 __ lsr(r15, count, shift); 1417 } else { 1418 __ mov(r15, count); 1419 } 1420 if (direction == copy_forwards) { 1421 if (type != T_OBJECT) { 1422 __ bl(copy_f); 1423 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1424 __ bl(copy_obj_uninit_f); 1425 } else { 1426 __ bl(copy_obj_f); 1427 } 1428 } else { 1429 if (type != T_OBJECT) { 1430 __ bl(copy_b); 1431 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1432 __ bl(copy_obj_uninit_b); 1433 } else { 1434 __ bl(copy_obj_b); 1435 } 1436 } 1437 1438 // And the tail. 1439 copy_memory_small(decorators, type, s, d, count, step); 1440 1441 if (granularity >= 8) __ bind(copy8); 1442 if (granularity >= 4) __ bind(copy4); 1443 __ bind(finish); 1444 } 1445 1446 1447 void clobber_registers() { 1448 #ifdef ASSERT 1449 RegSet clobbered 1450 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1451 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1452 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1453 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1454 __ mov(*it, rscratch1); 1455 } 1456 #endif 1457 1458 } 1459 1460 // Scan over array at a for count oops, verifying each one. 1461 // Preserves a and count, clobbers rscratch1 and rscratch2. 
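  // 'size' is the element size in bytes: wordSize for uncompressed oops,
  // 4 for compressed oops, in which case each narrow oop is decoded into
  // temp (decode_heap_oop verifies the decoded oop).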
1462 void verify_oop_array (int size, Register a, Register count, Register temp) { 1463 Label loop, end; 1464 __ mov(rscratch1, a); 1465 __ mov(rscratch2, zr); 1466 __ bind(loop); 1467 __ cmp(rscratch2, count); 1468 __ br(Assembler::HS, end); 1469 if (size == wordSize) { 1470 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1471 __ verify_oop(temp); 1472 } else { 1473 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1474 __ decode_heap_oop(temp); // calls verify_oop 1475 } 1476 __ add(rscratch2, rscratch2, 1); 1477 __ b(loop); 1478 __ bind(end); 1479 } 1480 1481 // Arguments: 1482 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1483 // ignored 1484 // is_oop - true => oop array, so generate store check code 1485 // name - stub name string 1486 // 1487 // Inputs: 1488 // c_rarg0 - source array address 1489 // c_rarg1 - destination array address 1490 // c_rarg2 - element count, treated as ssize_t, can be zero 1491 // 1492 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1493 // the hardware handle it. The two dwords within qwords that span 1494 // cache line boundaries will still be loaded and stored atomically. 1495 // 1496 // Side Effects: 1497 // disjoint_int_copy_entry is set to the no-overlap entry point 1498 // used by generate_conjoint_int_oop_copy(). 1499 // 1500 address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry, 1501 const char *name, bool dest_uninitialized = false) { 1502 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1503 RegSet saved_reg = RegSet::of(s, d, count); 1504 __ align(CodeEntryAlignment); 1505 StubCodeMark mark(this, "StubRoutines", name); 1506 address start = __ pc(); 1507 __ enter(); 1508 1509 if (entry != nullptr) { 1510 *entry = __ pc(); 1511 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1512 BLOCK_COMMENT("Entry:"); 1513 } 1514 1515 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1516 if (dest_uninitialized) { 1517 decorators |= IS_DEST_UNINITIALIZED; 1518 } 1519 if (aligned) { 1520 decorators |= ARRAYCOPY_ALIGNED; 1521 } 1522 1523 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1524 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1525 1526 if (is_oop) { 1527 // save regs before copy_memory 1528 __ push(RegSet::of(d, count), sp); 1529 } 1530 { 1531 // UnsafeMemoryAccess page error: continue after unsafe access 1532 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1533 UnsafeMemoryAccessMark umam(this, add_entry, true); 1534 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1535 } 1536 1537 if (is_oop) { 1538 __ pop(RegSet::of(d, count), sp); 1539 if (VerifyOops) 1540 verify_oop_array(size, d, count, r16); 1541 } 1542 1543 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1544 1545 __ leave(); 1546 __ mov(r0, zr); // return 0 1547 __ ret(lr); 1548 return start; 1549 } 1550 1551 // Arguments: 1552 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1553 // ignored 1554 // is_oop - true => oop array, so generate store check code 1555 // name - stub name string 1556 // 1557 // Inputs: 1558 // c_rarg0 - source array address 1559 // c_rarg1 - destination array address 1560 // c_rarg2 - element count, treated as ssize_t, can be zero 1561 // 1562 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1563 // the hardware handle it. 
The two dwords within qwords that span 1564 // cache line boundaries will still be loaded and stored atomically. 1565 // 1566 address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target, 1567 address *entry, const char *name, 1568 bool dest_uninitialized = false) { 1569 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1570 RegSet saved_regs = RegSet::of(s, d, count); 1571 StubCodeMark mark(this, "StubRoutines", name); 1572 address start = __ pc(); 1573 __ enter(); 1574 1575 if (entry != nullptr) { 1576 *entry = __ pc(); 1577 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1578 BLOCK_COMMENT("Entry:"); 1579 } 1580 1581 // use fwd copy when (d-s) above_equal (count*size) 1582 __ sub(rscratch1, d, s); 1583 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1584 __ br(Assembler::HS, nooverlap_target); 1585 1586 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1587 if (dest_uninitialized) { 1588 decorators |= IS_DEST_UNINITIALIZED; 1589 } 1590 if (aligned) { 1591 decorators |= ARRAYCOPY_ALIGNED; 1592 } 1593 1594 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1595 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1596 1597 if (is_oop) { 1598 // save regs before copy_memory 1599 __ push(RegSet::of(d, count), sp); 1600 } 1601 { 1602 // UnsafeMemoryAccess page error: continue after unsafe access 1603 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1604 UnsafeMemoryAccessMark umam(this, add_entry, true); 1605 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1606 } 1607 if (is_oop) { 1608 __ pop(RegSet::of(d, count), sp); 1609 if (VerifyOops) 1610 verify_oop_array(size, d, count, r16); 1611 } 1612 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1613 __ leave(); 1614 __ mov(r0, zr); // return 0 1615 __ ret(lr); 1616 return start; 1617 } 1618 1619 // Arguments: 1620 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1621 // ignored 1622 // name - stub name string 1623 // 1624 // Inputs: 1625 // c_rarg0 - source array address 1626 // c_rarg1 - destination array address 1627 // c_rarg2 - element count, treated as ssize_t, can be zero 1628 // 1629 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1630 // we let the hardware handle it. The one to eight bytes within words, 1631 // dwords or qwords that span cache line boundaries will still be loaded 1632 // and stored atomically. 1633 // 1634 // Side Effects: 1635 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1636 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1637 // we let the hardware handle it. The one to eight bytes within words, 1638 // dwords or qwords that span cache line boundaries will still be loaded 1639 // and stored atomically. 1640 // 1641 // Side Effects: 1642 // disjoint_byte_copy_entry is set to the no-overlap entry point 1643 // used by generate_conjoint_byte_copy(). 
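  //
  // The per-element-type entry points that follow are thin wrappers: each
  // one just picks the element size (1, 2, 4 or 8 bytes; for oop arrays 4
  // or 8 depending on UseCompressedOops) and delegates to the shared
  // generate_disjoint_copy / generate_conjoint_copy above.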
1644 // 1645 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1646 const bool not_oop = false; 1647 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1648 } 1649 1650 // Arguments: 1651 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1652 // ignored 1653 // name - stub name string 1654 // 1655 // Inputs: 1656 // c_rarg0 - source array address 1657 // c_rarg1 - destination array address 1658 // c_rarg2 - element count, treated as ssize_t, can be zero 1659 // 1660 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1661 // we let the hardware handle it. The one to eight bytes within words, 1662 // dwords or qwords that span cache line boundaries will still be loaded 1663 // and stored atomically. 1664 // 1665 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1666 address* entry, const char *name) { 1667 const bool not_oop = false; 1668 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1669 } 1670 1671 // Arguments: 1672 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1673 // ignored 1674 // name - stub name string 1675 // 1676 // Inputs: 1677 // c_rarg0 - source array address 1678 // c_rarg1 - destination array address 1679 // c_rarg2 - element count, treated as ssize_t, can be zero 1680 // 1681 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1682 // let the hardware handle it. The two or four words within dwords 1683 // or qwords that span cache line boundaries will still be loaded 1684 // and stored atomically. 1685 // 1686 // Side Effects: 1687 // disjoint_short_copy_entry is set to the no-overlap entry point 1688 // used by generate_conjoint_short_copy(). 1689 // 1690 address generate_disjoint_short_copy(bool aligned, 1691 address* entry, const char *name) { 1692 const bool not_oop = false; 1693 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1694 } 1695 1696 // Arguments: 1697 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1698 // ignored 1699 // name - stub name string 1700 // 1701 // Inputs: 1702 // c_rarg0 - source array address 1703 // c_rarg1 - destination array address 1704 // c_rarg2 - element count, treated as ssize_t, can be zero 1705 // 1706 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1707 // let the hardware handle it. The two or four words within dwords 1708 // or qwords that span cache line boundaries will still be loaded 1709 // and stored atomically. 1710 // 1711 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1712 address *entry, const char *name) { 1713 const bool not_oop = false; 1714 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1715 1716 } 1717 // Arguments: 1718 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1719 // ignored 1720 // name - stub name string 1721 // 1722 // Inputs: 1723 // c_rarg0 - source array address 1724 // c_rarg1 - destination array address 1725 // c_rarg2 - element count, treated as ssize_t, can be zero 1726 // 1727 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1728 // the hardware handle it. The two dwords within qwords that span 1729 // cache line boundaries will still be loaded and stored atomically. 
1730 // 1731 // Side Effects: 1732 // disjoint_int_copy_entry is set to the no-overlap entry point 1733 // used by generate_conjoint_int_oop_copy(). 1734 // 1735 address generate_disjoint_int_copy(bool aligned, address *entry, 1736 const char *name, bool dest_uninitialized = false) { 1737 const bool not_oop = false; 1738 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1739 } 1740 1741 // Arguments: 1742 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1743 // ignored 1744 // name - stub name string 1745 // 1746 // Inputs: 1747 // c_rarg0 - source array address 1748 // c_rarg1 - destination array address 1749 // c_rarg2 - element count, treated as ssize_t, can be zero 1750 // 1751 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1752 // the hardware handle it. The two dwords within qwords that span 1753 // cache line boundaries will still be loaded and stored atomically. 1754 // 1755 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1756 address *entry, const char *name, 1757 bool dest_uninitialized = false) { 1758 const bool not_oop = false; 1759 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1760 } 1761 1762 1763 // Arguments: 1764 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1765 // ignored 1766 // name - stub name string 1767 // 1768 // Inputs: 1769 // c_rarg0 - source array address 1770 // c_rarg1 - destination array address 1771 // c_rarg2 - element count, treated as size_t, can be zero 1772 // 1773 // Side Effects: 1774 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1775 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1776 // 1777 address generate_disjoint_long_copy(bool aligned, address *entry, 1778 const char *name, bool dest_uninitialized = false) { 1779 const bool not_oop = false; 1780 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1781 } 1782 1783 // Arguments: 1784 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1785 // ignored 1786 // name - stub name string 1787 // 1788 // Inputs: 1789 // c_rarg0 - source array address 1790 // c_rarg1 - destination array address 1791 // c_rarg2 - element count, treated as size_t, can be zero 1792 // 1793 address generate_conjoint_long_copy(bool aligned, 1794 address nooverlap_target, address *entry, 1795 const char *name, bool dest_uninitialized = false) { 1796 const bool not_oop = false; 1797 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1798 } 1799 1800 // Arguments: 1801 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1802 // ignored 1803 // name - stub name string 1804 // 1805 // Inputs: 1806 // c_rarg0 - source array address 1807 // c_rarg1 - destination array address 1808 // c_rarg2 - element count, treated as size_t, can be zero 1809 // 1810 // Side Effects: 1811 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1812 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1813 // 1814 address generate_disjoint_oop_copy(bool aligned, address *entry, 1815 const char *name, bool dest_uninitialized) { 1816 const bool is_oop = true; 1817 const int size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1818 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1819 } 1820 1821 // Arguments: 1822 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1823 // ignored 1824 // name - stub name string 1825 // 1826 // Inputs: 1827 // c_rarg0 - source array address 1828 // c_rarg1 - destination array address 1829 // c_rarg2 - element count, treated as size_t, can be zero 1830 // 1831 address generate_conjoint_oop_copy(bool aligned, 1832 address nooverlap_target, address *entry, 1833 const char *name, bool dest_uninitialized) { 1834 const bool is_oop = true; 1835 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1836 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1837 name, dest_uninitialized); 1838 } 1839 1840 1841 // Helper for generating a dynamic type check. 1842 // Smashes rscratch1, rscratch2. 1843 void generate_type_check(Register sub_klass, 1844 Register super_check_offset, 1845 Register super_klass, 1846 Register temp1, 1847 Register temp2, 1848 Register result, 1849 Label& L_success) { 1850 assert_different_registers(sub_klass, super_check_offset, super_klass); 1851 1852 BLOCK_COMMENT("type_check:"); 1853 1854 Label L_miss; 1855 1856 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1857 super_check_offset); 1858 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1859 1860 // Fall through on failure! 1861 __ BIND(L_miss); 1862 } 1863 1864 // 1865 // Generate checkcasting array copy stub 1866 // 1867 // Input: 1868 // c_rarg0 - source array address 1869 // c_rarg1 - destination array address 1870 // c_rarg2 - element count, treated as ssize_t, can be zero 1871 // c_rarg3 - size_t ckoff (super_check_offset) 1872 // c_rarg4 - oop ckval (super_klass) 1873 // 1874 // Output: 1875 // r0 == 0 - success 1876 // r0 == -1^K - failure, where K is partial transfer count 1877 // 1878 address generate_checkcast_copy(const char *name, address *entry, 1879 bool dest_uninitialized = false) { 1880 1881 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1882 1883 // Input registers (after setup_arg_regs) 1884 const Register from = c_rarg0; // source array address 1885 const Register to = c_rarg1; // destination array address 1886 const Register count = c_rarg2; // elementscount 1887 const Register ckoff = c_rarg3; // super_check_offset 1888 const Register ckval = c_rarg4; // super_klass 1889 1890 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1891 RegSet wb_post_saved_regs = RegSet::of(count); 1892 1893 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1894 const Register copied_oop = r22; // actual oop copied 1895 const Register count_save = r21; // orig elementscount 1896 const Register start_to = r20; // destination array start address 1897 const Register r19_klass = r19; // oop._klass 1898 1899 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1900 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1901 1902 //--------------------------------------------------------------- 1903 // Assembler stub will be used for this call to arraycopy 1904 // if the two arrays are subtypes of Object[] but the 1905 // destination array type is not equal to or a supertype 1906 // of the source type. Each element must be separately 1907 // checked. 
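  //
  // Editor's sketch (illustrative only, not compiled): the element-wise
  // contract this stub implements, with GC barriers omitted. 'element',
  // load_heap_oop, store_heap_oop and is_subtype_of are stand-ins here;
  // the real stub works on narrow or full-width oops depending on
  // UseCompressedOops and uses the fast/slow subtype check paths.
#if 0
  static intptr_t checkcast_copy_contract(element* from, element* to, size_t count,
                                          size_t super_check_offset /* ckoff */,
                                          Klass* super_klass         /* ckval */) {
    for (size_t copied = 0; copied < count; copied++) {
      oop o = load_heap_oop(from + copied);
      // null elements are always storable; non-null elements must be
      // instances of the destination element klass
      if (o != nullptr && !is_subtype_of(o, super_klass, super_check_offset)) {
        return ~(intptr_t)copied;        // failure: -1 ^ (elements already copied)
      }
      store_heap_oop(to + copied, o);
    }
    return 0;                            // success
  }
#endif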
1908 1909 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1910 copied_oop, r19_klass, count_save); 1911 1912 __ align(CodeEntryAlignment); 1913 StubCodeMark mark(this, "StubRoutines", name); 1914 address start = __ pc(); 1915 1916 __ enter(); // required for proper stackwalking of RuntimeStub frame 1917 1918 #ifdef ASSERT 1919 // caller guarantees that the arrays really are different 1920 // otherwise, we would have to make conjoint checks 1921 { Label L; 1922 __ b(L); // conjoint check not yet implemented 1923 __ stop("checkcast_copy within a single array"); 1924 __ bind(L); 1925 } 1926 #endif //ASSERT 1927 1928 // Caller of this entry point must set up the argument registers. 1929 if (entry != nullptr) { 1930 *entry = __ pc(); 1931 BLOCK_COMMENT("Entry:"); 1932 } 1933 1934 // Empty array: Nothing to do. 1935 __ cbz(count, L_done); 1936 __ push(RegSet::of(r19, r20, r21, r22), sp); 1937 1938 #ifdef ASSERT 1939 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1940 // The ckoff and ckval must be mutually consistent, 1941 // even though caller generates both. 1942 { Label L; 1943 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1944 __ ldrw(start_to, Address(ckval, sco_offset)); 1945 __ cmpw(ckoff, start_to); 1946 __ br(Assembler::EQ, L); 1947 __ stop("super_check_offset inconsistent"); 1948 __ bind(L); 1949 } 1950 #endif //ASSERT 1951 1952 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1953 bool is_oop = true; 1954 int element_size = UseCompressedOops ? 4 : 8; 1955 if (dest_uninitialized) { 1956 decorators |= IS_DEST_UNINITIALIZED; 1957 } 1958 1959 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1960 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1961 1962 // save the original count 1963 __ mov(count_save, count); 1964 1965 // Copy from low to high addresses 1966 __ mov(start_to, to); // Save destination array start address 1967 __ b(L_load_element); 1968 1969 // ======== begin loop ======== 1970 // (Loop is rotated; its entry is L_load_element.) 1971 // Loop control: 1972 // for (; count != 0; count--) { 1973 // copied_oop = load_heap_oop(from++); 1974 // ... generate_type_check ...; 1975 // store_heap_oop(to++, copied_oop); 1976 // } 1977 __ align(OptoLoopAlignment); 1978 1979 __ BIND(L_store_element); 1980 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1981 __ post(to, element_size), copied_oop, noreg, 1982 gct1, gct2, gct3); 1983 __ sub(count, count, 1); 1984 __ cbz(count, L_do_card_marks); 1985 1986 // ======== loop entry is here ======== 1987 __ BIND(L_load_element); 1988 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1989 copied_oop, noreg, __ post(from, element_size), 1990 gct1); 1991 __ cbz(copied_oop, L_store_element); 1992 1993 __ load_klass(r19_klass, copied_oop);// query the object klass 1994 1995 BLOCK_COMMENT("type_check:"); 1996 generate_type_check(/*sub_klass*/r19_klass, 1997 /*super_check_offset*/ckoff, 1998 /*super_klass*/ckval, 1999 /*r_array_base*/gct1, 2000 /*temp2*/gct2, 2001 /*result*/r10, L_store_element); 2002 2003 // Fall through on failure! 2004 2005 // ======== end loop ======== 2006 2007 // It was a real error; we must depend on the caller to finish the job. 2008 // Register count = remaining oops, count_orig = total oops. 2009 // Emit GC store barriers for the oops we have copied and report 2010 // their number to the caller. 
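  //
  // Editor's note (illustrative, not compiled): the subs/eon pair below
  // produces the documented failure value r0 == -1^K. subs leaves
  // K = count_save - count (elements successfully copied) in 'count' and sets
  // the flags; eon with zr then replaces K with ~K == -1 ^ K, and the br(EQ)
  // (still using the subs flags) skips the card-mark epilogue when K == 0.
  // A caller can recover the partial transfer count like this:
#if 0
  static intptr_t encode_checkcast_failure(size_t copied /* K */) {
    return ~(intptr_t)copied;            // what the eon produces; never 0
  }
  static size_t decode_partial_count(intptr_t r0) {
    return (size_t)~r0;                  // r0 == 0 means every element was copied
  }
#endif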
2011 2012 __ subs(count, count_save, count); // K = partially copied oop count 2013 __ eon(count, count, zr); // report (-1^K) to caller 2014 __ br(Assembler::EQ, L_done_pop); 2015 2016 __ BIND(L_do_card_marks); 2017 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2018 2019 __ bind(L_done_pop); 2020 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2021 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2022 2023 __ bind(L_done); 2024 __ mov(r0, count); 2025 __ leave(); 2026 __ ret(lr); 2027 2028 return start; 2029 } 2030 2031 // Perform range checks on the proposed arraycopy. 2032 // Kills temp, but nothing else. 2033 // Also, clean the sign bits of src_pos and dst_pos. 2034 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2035 Register src_pos, // source position (c_rarg1) 2036 Register dst, // destination array oo (c_rarg2) 2037 Register dst_pos, // destination position (c_rarg3) 2038 Register length, 2039 Register temp, 2040 Label& L_failed) { 2041 BLOCK_COMMENT("arraycopy_range_checks:"); 2042 2043 assert_different_registers(rscratch1, temp); 2044 2045 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2046 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 2047 __ addw(temp, length, src_pos); 2048 __ cmpw(temp, rscratch1); 2049 __ br(Assembler::HI, L_failed); 2050 2051 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2052 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2053 __ addw(temp, length, dst_pos); 2054 __ cmpw(temp, rscratch1); 2055 __ br(Assembler::HI, L_failed); 2056 2057 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 2058 __ movw(src_pos, src_pos); 2059 __ movw(dst_pos, dst_pos); 2060 2061 BLOCK_COMMENT("arraycopy_range_checks done"); 2062 } 2063 2064 // These stubs get called from some dumb test routine. 2065 // I'll write them properly when they're called from 2066 // something that's actually doing something. 2067 static void fake_arraycopy_stub(address src, address dst, int count) { 2068 assert(count == 0, "huh?"); 2069 } 2070 2071 2072 // 2073 // Generate 'unsafe' array copy stub 2074 // Though just as safe as the other stubs, it takes an unscaled 2075 // size_t argument instead of an element count. 2076 // 2077 // Input: 2078 // c_rarg0 - source array address 2079 // c_rarg1 - destination array address 2080 // c_rarg2 - byte count, treated as ssize_t, can be zero 2081 // 2082 // Examines the alignment of the operands and dispatches 2083 // to a long, int, short, or byte copy loop. 
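  //
  // Editor's sketch (illustrative, not compiled): the dispatch below ORs the
  // two addresses and the byte count together and inspects the low bits of
  // the result, which is equivalent to the following C. The helper name and
  // the plain 8/4/2 constants stand in for BytesPerLong/BytesPerInt.
#if 0
  static int unsafe_copy_granularity(const void* s, const void* d, size_t byte_count) {
    uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)byte_count;
    if ((bits & (8 - 1)) == 0) return 8;   // all 8-byte aligned -> long copy
    if ((bits & (4 - 1)) == 0) return 4;   // all 4-byte aligned -> int copy
    if ((bits & 1) == 0)       return 2;   // all 2-byte aligned -> short copy
    return 1;                              // otherwise           -> byte copy
  }
  // The byte count is then shifted right by log2(granularity) before the
  // tail call, so each copy loop receives an element count.
#endif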
2084 // 2085 address generate_unsafe_copy(const char *name, 2086 address byte_copy_entry, 2087 address short_copy_entry, 2088 address int_copy_entry, 2089 address long_copy_entry) { 2090 Label L_long_aligned, L_int_aligned, L_short_aligned; 2091 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2092 2093 __ align(CodeEntryAlignment); 2094 StubCodeMark mark(this, "StubRoutines", name); 2095 address start = __ pc(); 2096 __ enter(); // required for proper stackwalking of RuntimeStub frame 2097 2098 // bump this on entry, not on exit: 2099 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2100 2101 __ orr(rscratch1, s, d); 2102 __ orr(rscratch1, rscratch1, count); 2103 2104 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2105 __ cbz(rscratch1, L_long_aligned); 2106 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2107 __ cbz(rscratch1, L_int_aligned); 2108 __ tbz(rscratch1, 0, L_short_aligned); 2109 __ b(RuntimeAddress(byte_copy_entry)); 2110 2111 __ BIND(L_short_aligned); 2112 __ lsr(count, count, LogBytesPerShort); // size => short_count 2113 __ b(RuntimeAddress(short_copy_entry)); 2114 __ BIND(L_int_aligned); 2115 __ lsr(count, count, LogBytesPerInt); // size => int_count 2116 __ b(RuntimeAddress(int_copy_entry)); 2117 __ BIND(L_long_aligned); 2118 __ lsr(count, count, LogBytesPerLong); // size => long_count 2119 __ b(RuntimeAddress(long_copy_entry)); 2120 2121 return start; 2122 } 2123 2124 // 2125 // Generate generic array copy stubs 2126 // 2127 // Input: 2128 // c_rarg0 - src oop 2129 // c_rarg1 - src_pos (32-bits) 2130 // c_rarg2 - dst oop 2131 // c_rarg3 - dst_pos (32-bits) 2132 // c_rarg4 - element count (32-bits) 2133 // 2134 // Output: 2135 // r0 == 0 - success 2136 // r0 == -1^K - failure, where K is partial transfer count 2137 // 2138 address generate_generic_copy(const char *name, 2139 address byte_copy_entry, address short_copy_entry, 2140 address int_copy_entry, address oop_copy_entry, 2141 address long_copy_entry, address checkcast_copy_entry) { 2142 2143 Label L_failed, L_objArray; 2144 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2145 2146 // Input registers 2147 const Register src = c_rarg0; // source array oop 2148 const Register src_pos = c_rarg1; // source position 2149 const Register dst = c_rarg2; // destination array oop 2150 const Register dst_pos = c_rarg3; // destination position 2151 const Register length = c_rarg4; 2152 2153 2154 // Registers used as temps 2155 const Register dst_klass = c_rarg5; 2156 2157 __ align(CodeEntryAlignment); 2158 2159 StubCodeMark mark(this, "StubRoutines", name); 2160 2161 address start = __ pc(); 2162 2163 __ enter(); // required for proper stackwalking of RuntimeStub frame 2164 2165 // bump this on entry, not on exit: 2166 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2167 2168 //----------------------------------------------------------------------- 2169 // Assembler stub will be used for this call to arraycopy 2170 // if the following conditions are met: 2171 // 2172 // (1) src and dst must not be null. 2173 // (2) src_pos must not be negative. 2174 // (3) dst_pos must not be negative. 2175 // (4) length must not be negative. 2176 // (5) src klass and dst klass should be the same and not null. 2177 // (6) src and dst should be arrays. 2178 // (7) src_pos + length must not exceed length of src. 2179 // (8) dst_pos + length must not exceed length of dst. 
2180 // 2181 2182 // if (src == nullptr) return -1; 2183 __ cbz(src, L_failed); 2184 2185 // if (src_pos < 0) return -1; 2186 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2187 2188 // if (dst == nullptr) return -1; 2189 __ cbz(dst, L_failed); 2190 2191 // if (dst_pos < 0) return -1; 2192 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2193 2194 // registers used as temp 2195 const Register scratch_length = r16; // elements count to copy 2196 const Register scratch_src_klass = r17; // array klass 2197 const Register lh = r15; // layout helper 2198 2199 // if (length < 0) return -1; 2200 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2201 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2202 2203 __ load_klass(scratch_src_klass, src); 2204 #ifdef ASSERT 2205 // assert(src->klass() != nullptr); 2206 { 2207 BLOCK_COMMENT("assert klasses not null {"); 2208 Label L1, L2; 2209 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2210 __ bind(L1); 2211 __ stop("broken null klass"); 2212 __ bind(L2); 2213 __ load_klass(rscratch1, dst); 2214 __ cbz(rscratch1, L1); // this would be broken also 2215 BLOCK_COMMENT("} assert klasses not null done"); 2216 } 2217 #endif 2218 2219 // Load layout helper (32-bits) 2220 // 2221 // |array_tag| | header_size | element_type | |log2_element_size| 2222 // 32 30 24 16 8 2 0 2223 // 2224 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2225 // 2226 2227 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2228 2229 // Handle objArrays completely differently... 2230 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2231 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2232 __ movw(rscratch1, objArray_lh); 2233 __ eorw(rscratch2, lh, rscratch1); 2234 __ cbzw(rscratch2, L_objArray); 2235 2236 // if (src->klass() != dst->klass()) return -1; 2237 __ load_klass(rscratch2, dst); 2238 __ eor(rscratch2, rscratch2, scratch_src_klass); 2239 __ cbnz(rscratch2, L_failed); 2240 2241 // if (!src->is_Array()) return -1; 2242 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2243 2244 // At this point, it is known to be a typeArray (array_tag 0x3). 
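  //
  // Editor's sketch (illustrative, not compiled): decoding the array layout
  // helper along the lines of the field diagram above. Field positions and
  // widths follow that diagram; klass.hpp holds the authoritative
  // Klass::_lh_* constants, and the struct/function names here are ours.
#if 0
  struct ArrayLayout {
    unsigned tag;                // 0x3 = typeArray, 0x2 = objArray
    unsigned header_size;        // offset of element 0, in bytes
    unsigned element_type;       // BasicType of the elements
    unsigned log2_element_size;  // element size in bytes == 1 << this
  };
  static ArrayLayout decode_array_layout_helper(jint lh) {
    ArrayLayout a;
    a.tag               = ((unsigned)lh >> 30) & 0x3;
    a.header_size       = ((unsigned)lh >> 16) & 0xff;
    a.element_type      = ((unsigned)lh >>  8) & 0xff;
    a.log2_element_size =  (unsigned)lh        & 0xff;
    return a;
  }
  // Both array tags have the high bit set, so lh < 0 for any array; the
  // "tbz(lh, 31, L_failed)" test above rejects non-arrays on that sign bit.
#endif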
2245 #ifdef ASSERT 2246 { 2247 BLOCK_COMMENT("assert primitive array {"); 2248 Label L; 2249 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2250 __ cmpw(lh, rscratch2); 2251 __ br(Assembler::GE, L); 2252 __ stop("must be a primitive array"); 2253 __ bind(L); 2254 BLOCK_COMMENT("} assert primitive array done"); 2255 } 2256 #endif 2257 2258 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2259 rscratch2, L_failed); 2260 2261 // TypeArrayKlass 2262 // 2263 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2264 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2265 // 2266 2267 const Register rscratch1_offset = rscratch1; // array offset 2268 const Register r15_elsize = lh; // element size 2269 2270 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2271 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2272 __ add(src, src, rscratch1_offset); // src array offset 2273 __ add(dst, dst, rscratch1_offset); // dst array offset 2274 BLOCK_COMMENT("choose copy loop based on element size"); 2275 2276 // next registers should be set before the jump to corresponding stub 2277 const Register from = c_rarg0; // source array address 2278 const Register to = c_rarg1; // destination array address 2279 const Register count = c_rarg2; // elements count 2280 2281 // 'from', 'to', 'count' registers should be set in such order 2282 // since they are the same as 'src', 'src_pos', 'dst'. 2283 2284 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2285 2286 // The possible values of elsize are 0-3, i.e. exact_log2(element 2287 // size in bytes). We do a simple bitwise binary search. 2288 __ BIND(L_copy_bytes); 2289 __ tbnz(r15_elsize, 1, L_copy_ints); 2290 __ tbnz(r15_elsize, 0, L_copy_shorts); 2291 __ lea(from, Address(src, src_pos));// src_addr 2292 __ lea(to, Address(dst, dst_pos));// dst_addr 2293 __ movw(count, scratch_length); // length 2294 __ b(RuntimeAddress(byte_copy_entry)); 2295 2296 __ BIND(L_copy_shorts); 2297 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2298 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2299 __ movw(count, scratch_length); // length 2300 __ b(RuntimeAddress(short_copy_entry)); 2301 2302 __ BIND(L_copy_ints); 2303 __ tbnz(r15_elsize, 0, L_copy_longs); 2304 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2305 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2306 __ movw(count, scratch_length); // length 2307 __ b(RuntimeAddress(int_copy_entry)); 2308 2309 __ BIND(L_copy_longs); 2310 #ifdef ASSERT 2311 { 2312 BLOCK_COMMENT("assert long copy {"); 2313 Label L; 2314 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2315 __ cmpw(r15_elsize, LogBytesPerLong); 2316 __ br(Assembler::EQ, L); 2317 __ stop("must be long copy, but elsize is wrong"); 2318 __ bind(L); 2319 BLOCK_COMMENT("} assert long copy done"); 2320 } 2321 #endif 2322 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2323 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2324 __ movw(count, scratch_length); // length 2325 __ b(RuntimeAddress(long_copy_entry)); 2326 2327 // ObjArrayKlass 2328 __ BIND(L_objArray); 2329 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2330 2331 Label L_plain_copy, L_checkcast_copy; 2332 // test array classes for subtyping 2333 __ load_klass(r15, dst); 2334 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2335 __ br(Assembler::NE, L_checkcast_copy); 2336 2337 // Identically typed arrays can be copied without element-wise checks. 2338 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2339 rscratch2, L_failed); 2340 2341 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2342 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2343 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2344 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2345 __ movw(count, scratch_length); // length 2346 __ BIND(L_plain_copy); 2347 __ b(RuntimeAddress(oop_copy_entry)); 2348 2349 __ BIND(L_checkcast_copy); 2350 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2351 { 2352 // Before looking at dst.length, make sure dst is also an objArray. 2353 __ ldrw(rscratch1, Address(r15, lh_offset)); 2354 __ movw(rscratch2, objArray_lh); 2355 __ eorw(rscratch1, rscratch1, rscratch2); 2356 __ cbnzw(rscratch1, L_failed); 2357 2358 // It is safe to examine both src.length and dst.length. 2359 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2360 r15, L_failed); 2361 2362 __ load_klass(dst_klass, dst); // reload 2363 2364 // Marshal the base address arguments now, freeing registers. 2365 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2366 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2367 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2368 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2369 __ movw(count, length); // length (reloaded) 2370 Register sco_temp = c_rarg3; // this register is free now 2371 assert_different_registers(from, to, count, sco_temp, 2372 dst_klass, scratch_src_klass); 2373 // assert_clean_int(count, sco_temp); 2374 2375 // Generate the type check. 2376 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2377 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2378 2379 // Smashes rscratch1, rscratch2 2380 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2381 L_plain_copy); 2382 2383 // Fetch destination element klass from the ObjArrayKlass header. 2384 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2385 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2386 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2387 2388 // the checkcast_copy loop needs two extra arguments: 2389 assert(c_rarg3 == sco_temp, "#3 already in place"); 2390 // Set up arguments for checkcast_copy_entry. 2391 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2392 __ b(RuntimeAddress(checkcast_copy_entry)); 2393 } 2394 2395 __ BIND(L_failed); 2396 __ mov(r0, -1); 2397 __ leave(); // required for proper stackwalking of RuntimeStub frame 2398 __ ret(lr); 2399 2400 return start; 2401 } 2402 2403 // 2404 // Generate stub for array fill. If "aligned" is true, the 2405 // "to" address is assumed to be heapword aligned. 
2406 // 2407 // Arguments for generated stub: 2408 // to: c_rarg0 2409 // value: c_rarg1 2410 // count: c_rarg2 treated as signed 2411 // 2412 address generate_fill(BasicType t, bool aligned, const char *name) { 2413 __ align(CodeEntryAlignment); 2414 StubCodeMark mark(this, "StubRoutines", name); 2415 address start = __ pc(); 2416 2417 BLOCK_COMMENT("Entry:"); 2418 2419 const Register to = c_rarg0; // source array address 2420 const Register value = c_rarg1; // value 2421 const Register count = c_rarg2; // elements count 2422 2423 const Register bz_base = r10; // base for block_zero routine 2424 const Register cnt_words = r11; // temp register 2425 2426 __ enter(); 2427 2428 Label L_fill_elements, L_exit1; 2429 2430 int shift = -1; 2431 switch (t) { 2432 case T_BYTE: 2433 shift = 0; 2434 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2435 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2436 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2437 __ br(Assembler::LO, L_fill_elements); 2438 break; 2439 case T_SHORT: 2440 shift = 1; 2441 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2442 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2443 __ br(Assembler::LO, L_fill_elements); 2444 break; 2445 case T_INT: 2446 shift = 2; 2447 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2448 __ br(Assembler::LO, L_fill_elements); 2449 break; 2450 default: ShouldNotReachHere(); 2451 } 2452 2453 // Align source address at 8 bytes address boundary. 2454 Label L_skip_align1, L_skip_align2, L_skip_align4; 2455 if (!aligned) { 2456 switch (t) { 2457 case T_BYTE: 2458 // One byte misalignment happens only for byte arrays. 2459 __ tbz(to, 0, L_skip_align1); 2460 __ strb(value, Address(__ post(to, 1))); 2461 __ subw(count, count, 1); 2462 __ bind(L_skip_align1); 2463 // Fallthrough 2464 case T_SHORT: 2465 // Two bytes misalignment happens only for byte and short (char) arrays. 2466 __ tbz(to, 1, L_skip_align2); 2467 __ strh(value, Address(__ post(to, 2))); 2468 __ subw(count, count, 2 >> shift); 2469 __ bind(L_skip_align2); 2470 // Fallthrough 2471 case T_INT: 2472 // Align to 8 bytes, we know we are 4 byte aligned to start. 2473 __ tbz(to, 2, L_skip_align4); 2474 __ strw(value, Address(__ post(to, 4))); 2475 __ subw(count, count, 4 >> shift); 2476 __ bind(L_skip_align4); 2477 break; 2478 default: ShouldNotReachHere(); 2479 } 2480 } 2481 2482 // 2483 // Fill large chunks 2484 // 2485 __ lsrw(cnt_words, count, 3 - shift); // number of words 2486 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2487 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2488 if (UseBlockZeroing) { 2489 Label non_block_zeroing, rest; 2490 // If the fill value is zero we can use the fast zero_words(). 2491 __ cbnz(value, non_block_zeroing); 2492 __ mov(bz_base, to); 2493 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2494 address tpc = __ zero_words(bz_base, cnt_words); 2495 if (tpc == nullptr) { 2496 fatal("CodeCache is full at generate_fill"); 2497 } 2498 __ b(rest); 2499 __ bind(non_block_zeroing); 2500 __ fill_words(to, cnt_words, value); 2501 __ bind(rest); 2502 } else { 2503 __ fill_words(to, cnt_words, value); 2504 } 2505 2506 // Remaining count is less than 8 bytes. Fill it by a single store. 2507 // Note that the total length is no less than 8 bytes. 
2508 if (t == T_BYTE || t == T_SHORT) { 2509 Label L_exit1; 2510 __ cbzw(count, L_exit1); 2511 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2512 __ str(value, Address(to, -8)); // overwrite some elements 2513 __ bind(L_exit1); 2514 __ leave(); 2515 __ ret(lr); 2516 } 2517 2518 // Handle copies less than 8 bytes. 2519 Label L_fill_2, L_fill_4, L_exit2; 2520 __ bind(L_fill_elements); 2521 switch (t) { 2522 case T_BYTE: 2523 __ tbz(count, 0, L_fill_2); 2524 __ strb(value, Address(__ post(to, 1))); 2525 __ bind(L_fill_2); 2526 __ tbz(count, 1, L_fill_4); 2527 __ strh(value, Address(__ post(to, 2))); 2528 __ bind(L_fill_4); 2529 __ tbz(count, 2, L_exit2); 2530 __ strw(value, Address(to)); 2531 break; 2532 case T_SHORT: 2533 __ tbz(count, 0, L_fill_4); 2534 __ strh(value, Address(__ post(to, 2))); 2535 __ bind(L_fill_4); 2536 __ tbz(count, 1, L_exit2); 2537 __ strw(value, Address(to)); 2538 break; 2539 case T_INT: 2540 __ cbzw(count, L_exit2); 2541 __ strw(value, Address(to)); 2542 break; 2543 default: ShouldNotReachHere(); 2544 } 2545 __ bind(L_exit2); 2546 __ leave(); 2547 __ ret(lr); 2548 return start; 2549 } 2550 2551 address generate_data_cache_writeback() { 2552 const Register line = c_rarg0; // address of line to write back 2553 2554 __ align(CodeEntryAlignment); 2555 2556 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2557 2558 address start = __ pc(); 2559 __ enter(); 2560 __ cache_wb(Address(line, 0)); 2561 __ leave(); 2562 __ ret(lr); 2563 2564 return start; 2565 } 2566 2567 address generate_data_cache_writeback_sync() { 2568 const Register is_pre = c_rarg0; // pre or post sync 2569 2570 __ align(CodeEntryAlignment); 2571 2572 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2573 2574 // pre wbsync is a no-op 2575 // post wbsync translates to an sfence 2576 2577 Label skip; 2578 address start = __ pc(); 2579 __ enter(); 2580 __ cbnz(is_pre, skip); 2581 __ cache_wbsync(false); 2582 __ bind(skip); 2583 __ leave(); 2584 __ ret(lr); 2585 2586 return start; 2587 } 2588 2589 void generate_arraycopy_stubs() { 2590 address entry; 2591 address entry_jbyte_arraycopy; 2592 address entry_jshort_arraycopy; 2593 address entry_jint_arraycopy; 2594 address entry_oop_arraycopy; 2595 address entry_jlong_arraycopy; 2596 address entry_checkcast_arraycopy; 2597 2598 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards); 2599 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards); 2600 2601 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards); 2602 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards); 2603 2604 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards); 2605 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards); 2606 2607 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2608 2609 //*** jbyte 2610 // Always need aligned and unaligned versions 2611 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2612 "jbyte_disjoint_arraycopy"); 2613 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2614 &entry_jbyte_arraycopy, 2615 "jbyte_arraycopy"); 2616 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2617 "arrayof_jbyte_disjoint_arraycopy"); 2618 
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2619 "arrayof_jbyte_arraycopy"); 2620 2621 //*** jshort 2622 // Always need aligned and unaligned versions 2623 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2624 "jshort_disjoint_arraycopy"); 2625 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2626 &entry_jshort_arraycopy, 2627 "jshort_arraycopy"); 2628 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2629 "arrayof_jshort_disjoint_arraycopy"); 2630 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2631 "arrayof_jshort_arraycopy"); 2632 2633 //*** jint 2634 // Aligned versions 2635 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2636 "arrayof_jint_disjoint_arraycopy"); 2637 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2638 "arrayof_jint_arraycopy"); 2639 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2640 // entry_jint_arraycopy always points to the unaligned version 2641 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2642 "jint_disjoint_arraycopy"); 2643 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2644 &entry_jint_arraycopy, 2645 "jint_arraycopy"); 2646 2647 //*** jlong 2648 // It is always aligned 2649 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2650 "arrayof_jlong_disjoint_arraycopy"); 2651 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2652 "arrayof_jlong_arraycopy"); 2653 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2654 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2655 2656 //*** oops 2657 { 2658 // With compressed oops we need unaligned versions; notice that 2659 // we overwrite entry_oop_arraycopy. 
2660 bool aligned = !UseCompressedOops; 2661 2662 StubRoutines::_arrayof_oop_disjoint_arraycopy 2663 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2664 /*dest_uninitialized*/false); 2665 StubRoutines::_arrayof_oop_arraycopy 2666 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2667 /*dest_uninitialized*/false); 2668 // Aligned versions without pre-barriers 2669 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2670 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2671 /*dest_uninitialized*/true); 2672 StubRoutines::_arrayof_oop_arraycopy_uninit 2673 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2674 /*dest_uninitialized*/true); 2675 } 2676 2677 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2678 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2679 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2680 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2681 2682 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2683 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2684 /*dest_uninitialized*/true); 2685 2686 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2687 entry_jbyte_arraycopy, 2688 entry_jshort_arraycopy, 2689 entry_jint_arraycopy, 2690 entry_jlong_arraycopy); 2691 2692 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2693 entry_jbyte_arraycopy, 2694 entry_jshort_arraycopy, 2695 entry_jint_arraycopy, 2696 entry_oop_arraycopy, 2697 entry_jlong_arraycopy, 2698 entry_checkcast_arraycopy); 2699 2700 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2701 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2702 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2703 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2704 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2705 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2706 } 2707 2708 void generate_math_stubs() { Unimplemented(); } 2709 2710 // Arguments: 2711 // 2712 // Inputs: 2713 // c_rarg0 - source byte array address 2714 // c_rarg1 - destination byte array address 2715 // c_rarg2 - K (key) in little endian int array 2716 // 2717 address generate_aescrypt_encryptBlock() { 2718 __ align(CodeEntryAlignment); 2719 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2720 2721 const Register from = c_rarg0; // source array address 2722 const Register to = c_rarg1; // destination array address 2723 const Register key = c_rarg2; // key array address 2724 const Register keylen = rscratch1; 2725 2726 address start = __ pc(); 2727 __ enter(); 2728 2729 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2730 2731 __ aesenc_loadkeys(key, keylen); 2732 __ aesecb_encrypt(from, to, keylen); 2733 2734 __ mov(r0, 0); 2735 2736 __ leave(); 2737 __ ret(lr); 2738 2739 return start; 2740 } 2741 2742 // Arguments: 2743 // 2744 // Inputs: 2745 // c_rarg0 - source byte array address 2746 // c_rarg1 - destination byte array address 2747 // 
c_rarg2 - K (key) in little endian int array 2748 // 2749 address generate_aescrypt_decryptBlock() { 2750 assert(UseAES, "need AES cryptographic extension support"); 2751 __ align(CodeEntryAlignment); 2752 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2753 Label L_doLast; 2754 2755 const Register from = c_rarg0; // source array address 2756 const Register to = c_rarg1; // destination array address 2757 const Register key = c_rarg2; // key array address 2758 const Register keylen = rscratch1; 2759 2760 address start = __ pc(); 2761 __ enter(); // required for proper stackwalking of RuntimeStub frame 2762 2763 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2764 2765 __ aesecb_decrypt(from, to, key, keylen); 2766 2767 __ mov(r0, 0); 2768 2769 __ leave(); 2770 __ ret(lr); 2771 2772 return start; 2773 } 2774 2775 // Arguments: 2776 // 2777 // Inputs: 2778 // c_rarg0 - source byte array address 2779 // c_rarg1 - destination byte array address 2780 // c_rarg2 - K (key) in little endian int array 2781 // c_rarg3 - r vector byte array address 2782 // c_rarg4 - input length 2783 // 2784 // Output: 2785 // x0 - input length 2786 // 2787 address generate_cipherBlockChaining_encryptAESCrypt() { 2788 assert(UseAES, "need AES cryptographic extension support"); 2789 __ align(CodeEntryAlignment); 2790 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2791 2792 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2793 2794 const Register from = c_rarg0; // source array address 2795 const Register to = c_rarg1; // destination array address 2796 const Register key = c_rarg2; // key array address 2797 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2798 // and left with the results of the last encryption block 2799 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2800 const Register keylen = rscratch1; 2801 2802 address start = __ pc(); 2803 2804 __ enter(); 2805 2806 __ movw(rscratch2, len_reg); 2807 2808 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2809 2810 __ ld1(v0, __ T16B, rvec); 2811 2812 __ cmpw(keylen, 52); 2813 __ br(Assembler::CC, L_loadkeys_44); 2814 __ br(Assembler::EQ, L_loadkeys_52); 2815 2816 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2817 __ rev32(v17, __ T16B, v17); 2818 __ rev32(v18, __ T16B, v18); 2819 __ BIND(L_loadkeys_52); 2820 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2821 __ rev32(v19, __ T16B, v19); 2822 __ rev32(v20, __ T16B, v20); 2823 __ BIND(L_loadkeys_44); 2824 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2825 __ rev32(v21, __ T16B, v21); 2826 __ rev32(v22, __ T16B, v22); 2827 __ rev32(v23, __ T16B, v23); 2828 __ rev32(v24, __ T16B, v24); 2829 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2830 __ rev32(v25, __ T16B, v25); 2831 __ rev32(v26, __ T16B, v26); 2832 __ rev32(v27, __ T16B, v27); 2833 __ rev32(v28, __ T16B, v28); 2834 __ ld1(v29, v30, v31, __ T16B, key); 2835 __ rev32(v29, __ T16B, v29); 2836 __ rev32(v30, __ T16B, v30); 2837 __ rev32(v31, __ T16B, v31); 2838 2839 __ BIND(L_aes_loop); 2840 __ ld1(v1, __ T16B, __ post(from, 16)); 2841 __ eor(v0, __ T16B, v0, v1); 2842 2843 __ br(Assembler::CC, L_rounds_44); 2844 __ br(Assembler::EQ, L_rounds_52); 2845 2846 __ aese(v0, v17); __ aesmc(v0, v0); 2847 __ aese(v0, v18); __ aesmc(v0, v0); 2848 __ BIND(L_rounds_52); 2849 __ 
aese(v0, v19); __ aesmc(v0, v0); 2850 __ aese(v0, v20); __ aesmc(v0, v0); 2851 __ BIND(L_rounds_44); 2852 __ aese(v0, v21); __ aesmc(v0, v0); 2853 __ aese(v0, v22); __ aesmc(v0, v0); 2854 __ aese(v0, v23); __ aesmc(v0, v0); 2855 __ aese(v0, v24); __ aesmc(v0, v0); 2856 __ aese(v0, v25); __ aesmc(v0, v0); 2857 __ aese(v0, v26); __ aesmc(v0, v0); 2858 __ aese(v0, v27); __ aesmc(v0, v0); 2859 __ aese(v0, v28); __ aesmc(v0, v0); 2860 __ aese(v0, v29); __ aesmc(v0, v0); 2861 __ aese(v0, v30); 2862 __ eor(v0, __ T16B, v0, v31); 2863 2864 __ st1(v0, __ T16B, __ post(to, 16)); 2865 2866 __ subw(len_reg, len_reg, 16); 2867 __ cbnzw(len_reg, L_aes_loop); 2868 2869 __ st1(v0, __ T16B, rvec); 2870 2871 __ mov(r0, rscratch2); 2872 2873 __ leave(); 2874 __ ret(lr); 2875 2876 return start; 2877 } 2878 2879 // Arguments: 2880 // 2881 // Inputs: 2882 // c_rarg0 - source byte array address 2883 // c_rarg1 - destination byte array address 2884 // c_rarg2 - K (key) in little endian int array 2885 // c_rarg3 - r vector byte array address 2886 // c_rarg4 - input length 2887 // 2888 // Output: 2889 // r0 - input length 2890 // 2891 address generate_cipherBlockChaining_decryptAESCrypt() { 2892 assert(UseAES, "need AES cryptographic extension support"); 2893 __ align(CodeEntryAlignment); 2894 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2895 2896 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2897 2898 const Register from = c_rarg0; // source array address 2899 const Register to = c_rarg1; // destination array address 2900 const Register key = c_rarg2; // key array address 2901 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2902 // and left with the results of the last encryption block 2903 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2904 const Register keylen = rscratch1; 2905 2906 address start = __ pc(); 2907 2908 __ enter(); 2909 2910 __ movw(rscratch2, len_reg); 2911 2912 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2913 2914 __ ld1(v2, __ T16B, rvec); 2915 2916 __ ld1(v31, __ T16B, __ post(key, 16)); 2917 __ rev32(v31, __ T16B, v31); 2918 2919 __ cmpw(keylen, 52); 2920 __ br(Assembler::CC, L_loadkeys_44); 2921 __ br(Assembler::EQ, L_loadkeys_52); 2922 2923 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2924 __ rev32(v17, __ T16B, v17); 2925 __ rev32(v18, __ T16B, v18); 2926 __ BIND(L_loadkeys_52); 2927 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2928 __ rev32(v19, __ T16B, v19); 2929 __ rev32(v20, __ T16B, v20); 2930 __ BIND(L_loadkeys_44); 2931 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2932 __ rev32(v21, __ T16B, v21); 2933 __ rev32(v22, __ T16B, v22); 2934 __ rev32(v23, __ T16B, v23); 2935 __ rev32(v24, __ T16B, v24); 2936 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2937 __ rev32(v25, __ T16B, v25); 2938 __ rev32(v26, __ T16B, v26); 2939 __ rev32(v27, __ T16B, v27); 2940 __ rev32(v28, __ T16B, v28); 2941 __ ld1(v29, v30, __ T16B, key); 2942 __ rev32(v29, __ T16B, v29); 2943 __ rev32(v30, __ T16B, v30); 2944 2945 __ BIND(L_aes_loop); 2946 __ ld1(v0, __ T16B, __ post(from, 16)); 2947 __ orr(v1, __ T16B, v0, v0); 2948 2949 __ br(Assembler::CC, L_rounds_44); 2950 __ br(Assembler::EQ, L_rounds_52); 2951 2952 __ aesd(v0, v17); __ aesimc(v0, v0); 2953 __ aesd(v0, v18); __ aesimc(v0, v0); 2954 __ BIND(L_rounds_52); 2955 __ aesd(v0, v19); __ aesimc(v0, v0); 2956 __ aesd(v0, v20); __ 
aesimc(v0, v0); 2957 __ BIND(L_rounds_44); 2958 __ aesd(v0, v21); __ aesimc(v0, v0); 2959 __ aesd(v0, v22); __ aesimc(v0, v0); 2960 __ aesd(v0, v23); __ aesimc(v0, v0); 2961 __ aesd(v0, v24); __ aesimc(v0, v0); 2962 __ aesd(v0, v25); __ aesimc(v0, v0); 2963 __ aesd(v0, v26); __ aesimc(v0, v0); 2964 __ aesd(v0, v27); __ aesimc(v0, v0); 2965 __ aesd(v0, v28); __ aesimc(v0, v0); 2966 __ aesd(v0, v29); __ aesimc(v0, v0); 2967 __ aesd(v0, v30); 2968 __ eor(v0, __ T16B, v0, v31); 2969 __ eor(v0, __ T16B, v0, v2); 2970 2971 __ st1(v0, __ T16B, __ post(to, 16)); 2972 __ orr(v2, __ T16B, v1, v1); 2973 2974 __ subw(len_reg, len_reg, 16); 2975 __ cbnzw(len_reg, L_aes_loop); 2976 2977 __ st1(v2, __ T16B, rvec); 2978 2979 __ mov(r0, rscratch2); 2980 2981 __ leave(); 2982 __ ret(lr); 2983 2984 return start; 2985 } 2986 2987 // Big-endian 128-bit + 64-bit -> 128-bit addition. 2988 // Inputs: 128-bits. in is preserved. 2989 // The least-significant 64-bit word is in the upper dword of each vector. 2990 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 2991 // Output: result 2992 void be_add_128_64(FloatRegister result, FloatRegister in, 2993 FloatRegister inc, FloatRegister tmp) { 2994 assert_different_registers(result, tmp, inc); 2995 2996 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 2997 // input 2998 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 2999 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3000 // MSD == 0 (must be!) to LSD 3001 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3002 } 3003 3004 // CTR AES crypt. 3005 // Arguments: 3006 // 3007 // Inputs: 3008 // c_rarg0 - source byte array address 3009 // c_rarg1 - destination byte array address 3010 // c_rarg2 - K (key) in little endian int array 3011 // c_rarg3 - counter vector byte array address 3012 // c_rarg4 - input length 3013 // c_rarg5 - saved encryptedCounter start 3014 // c_rarg6 - saved used length 3015 // 3016 // Output: 3017 // r0 - input length 3018 // 3019 address generate_counterMode_AESCrypt() { 3020 const Register in = c_rarg0; 3021 const Register out = c_rarg1; 3022 const Register key = c_rarg2; 3023 const Register counter = c_rarg3; 3024 const Register saved_len = c_rarg4, len = r10; 3025 const Register saved_encrypted_ctr = c_rarg5; 3026 const Register used_ptr = c_rarg6, used = r12; 3027 3028 const Register offset = r7; 3029 const Register keylen = r11; 3030 3031 const unsigned char block_size = 16; 3032 const int bulk_width = 4; 3033 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3034 // performance with larger data sizes, but it also means that the 3035 // fast path isn't used until you have at least 8 blocks, and up 3036 // to 127 bytes of data will be executed on the slow path. For 3037 // that reason, and also so as not to blow away too much icache, 4 3038 // blocks seems like a sensible compromise. 
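  //
  // Editor's sketch (illustrative, not compiled): the 128-bit big-endian
  // counter increment that be_add_128_64 above performs, written as plain C
  // on two 64-bit halves (the vector code keeps the least-significant dword
  // in the upper lane and propagates the carry with cm(HI)/ext/subv).
  // Assumes <cstdint>; the function name is ours.
#if 0
  static void be_add_128_64_c(uint64_t& hi, uint64_t& lo, uint64_t inc) {
    uint64_t old_lo = lo;
    lo += inc;
    if (lo < old_lo) {   // unsigned overflow of the low half
      hi += 1;           // carry into the most-significant dword
    }
  }
#endif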
3039 3040 // Algorithm: 3041 // 3042 // if (len == 0) { 3043 // goto DONE; 3044 // } 3045 // int result = len; 3046 // do { 3047 // if (used >= blockSize) { 3048 // if (len >= bulk_width * blockSize) { 3049 // CTR_large_block(); 3050 // if (len == 0) 3051 // goto DONE; 3052 // } 3053 // for (;;) { 3054 // 16ByteVector v0 = counter; 3055 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3056 // used = 0; 3057 // if (len < blockSize) 3058 // break; /* goto NEXT */ 3059 // 16ByteVector v1 = load16Bytes(in, offset); 3060 // v1 = v1 ^ encryptedCounter; 3061 // store16Bytes(out, offset); 3062 // used = blockSize; 3063 // offset += blockSize; 3064 // len -= blockSize; 3065 // if (len == 0) 3066 // goto DONE; 3067 // } 3068 // } 3069 // NEXT: 3070 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3071 // len--; 3072 // } while (len != 0); 3073 // DONE: 3074 // return result; 3075 // 3076 // CTR_large_block() 3077 // Wide bulk encryption of whole blocks. 3078 3079 __ align(CodeEntryAlignment); 3080 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 3081 const address start = __ pc(); 3082 __ enter(); 3083 3084 Label DONE, CTR_large_block, large_block_return; 3085 __ ldrw(used, Address(used_ptr)); 3086 __ cbzw(saved_len, DONE); 3087 3088 __ mov(len, saved_len); 3089 __ mov(offset, 0); 3090 3091 // Compute #rounds for AES based on the length of the key array 3092 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3093 3094 __ aesenc_loadkeys(key, keylen); 3095 3096 { 3097 Label L_CTR_loop, NEXT; 3098 3099 __ bind(L_CTR_loop); 3100 3101 __ cmp(used, block_size); 3102 __ br(__ LO, NEXT); 3103 3104 // Maybe we have a lot of data 3105 __ subsw(rscratch1, len, bulk_width * block_size); 3106 __ br(__ HS, CTR_large_block); 3107 __ BIND(large_block_return); 3108 __ cbzw(len, DONE); 3109 3110 // Setup the counter 3111 __ movi(v4, __ T4S, 0); 3112 __ movi(v5, __ T4S, 1); 3113 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3114 3115 // 128-bit big-endian increment 3116 __ ld1(v0, __ T16B, counter); 3117 __ rev64(v16, __ T16B, v0); 3118 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3119 __ rev64(v16, __ T16B, v16); 3120 __ st1(v16, __ T16B, counter); 3121 // Previous counter value is in v0 3122 // v4 contains { 0, 1 } 3123 3124 { 3125 // We have fewer than bulk_width blocks of data left. Encrypt 3126 // them one by one until there is less than a full block 3127 // remaining, being careful to save both the encrypted counter 3128 // and the counter. 3129 3130 Label inner_loop; 3131 __ bind(inner_loop); 3132 // Counter to encrypt is in v0 3133 __ aesecb_encrypt(noreg, noreg, keylen); 3134 __ st1(v0, __ T16B, saved_encrypted_ctr); 3135 3136 // Do we have a remaining full block? 3137 3138 __ mov(used, 0); 3139 __ cmp(len, block_size); 3140 __ br(__ LO, NEXT); 3141 3142 // Yes, we have a full block 3143 __ ldrq(v1, Address(in, offset)); 3144 __ eor(v1, __ T16B, v1, v0); 3145 __ strq(v1, Address(out, offset)); 3146 __ mov(used, block_size); 3147 __ add(offset, offset, block_size); 3148 3149 __ subw(len, len, block_size); 3150 __ cbzw(len, DONE); 3151 3152 // Increment the counter, store it back 3153 __ orr(v0, __ T16B, v16, v16); 3154 __ rev64(v16, __ T16B, v16); 3155 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3156 __ rev64(v16, __ T16B, v16); 3157 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3158 3159 __ b(inner_loop); 3160 } 3161 3162 __ BIND(NEXT); 3163 3164 // Encrypt a single byte, and loop. 
3165 // We expect this to be a rare event. 3166 __ ldrb(rscratch1, Address(in, offset)); 3167 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3168 __ eor(rscratch1, rscratch1, rscratch2); 3169 __ strb(rscratch1, Address(out, offset)); 3170 __ add(offset, offset, 1); 3171 __ add(used, used, 1); 3172 __ subw(len, len,1); 3173 __ cbnzw(len, L_CTR_loop); 3174 } 3175 3176 __ bind(DONE); 3177 __ strw(used, Address(used_ptr)); 3178 __ mov(r0, saved_len); 3179 3180 __ leave(); // required for proper stackwalking of RuntimeStub frame 3181 __ ret(lr); 3182 3183 // Bulk encryption 3184 3185 __ BIND (CTR_large_block); 3186 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3187 3188 if (bulk_width == 8) { 3189 __ sub(sp, sp, 4 * 16); 3190 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3191 } 3192 __ sub(sp, sp, 4 * 16); 3193 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3194 RegSet saved_regs = (RegSet::of(in, out, offset) 3195 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3196 __ push(saved_regs, sp); 3197 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3198 __ add(in, in, offset); 3199 __ add(out, out, offset); 3200 3201 // Keys should already be loaded into the correct registers 3202 3203 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3204 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3205 3206 // AES/CTR loop 3207 { 3208 Label L_CTR_loop; 3209 __ BIND(L_CTR_loop); 3210 3211 // Setup the counters 3212 __ movi(v8, __ T4S, 0); 3213 __ movi(v9, __ T4S, 1); 3214 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3215 3216 for (int i = 0; i < bulk_width; i++) { 3217 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3218 __ rev64(v0_ofs, __ T16B, v16); 3219 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3220 } 3221 3222 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3223 3224 // Encrypt the counters 3225 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3226 3227 if (bulk_width == 8) { 3228 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3229 } 3230 3231 // XOR the encrypted counters with the inputs 3232 for (int i = 0; i < bulk_width; i++) { 3233 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3234 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3235 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3236 } 3237 3238 // Write the encrypted data 3239 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3240 if (bulk_width == 8) { 3241 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3242 } 3243 3244 __ subw(len, len, 16 * bulk_width); 3245 __ cbnzw(len, L_CTR_loop); 3246 } 3247 3248 // Save the counter back where it goes 3249 __ rev64(v16, __ T16B, v16); 3250 __ st1(v16, __ T16B, counter); 3251 3252 __ pop(saved_regs, sp); 3253 3254 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3255 if (bulk_width == 8) { 3256 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3257 } 3258 3259 __ andr(rscratch1, len, -16 * bulk_width); 3260 __ sub(len, len, rscratch1); 3261 __ add(offset, offset, rscratch1); 3262 __ mov(used, 16); 3263 __ strw(used, Address(used_ptr)); 3264 __ b(large_block_return); 3265 3266 return start; 3267 } 3268 3269 // Vector AES Galois Counter Mode implementation. 
Parameters: 3270 // 3271 // in = c_rarg0 3272 // len = c_rarg1 3273 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3274 // out = c_rarg3 3275 // key = c_rarg4 3276 // state = c_rarg5 - GHASH.state 3277 // subkeyHtbl = c_rarg6 - powers of H 3278 // counter = c_rarg7 - 16 bytes of CTR 3279 // return - number of processed bytes 3280 address generate_galoisCounterMode_AESCrypt() { 3281 address ghash_polynomial = __ pc(); 3282 __ emit_int64(0x87); // The low-order bits of the field 3283 // polynomial (i.e. p = z^7+z^2+z+1) 3284 // repeated in the low and high parts of a 3285 // 128-bit vector 3286 __ emit_int64(0x87); 3287 3288 __ align(CodeEntryAlignment); 3289 StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt"); 3290 address start = __ pc(); 3291 __ enter(); 3292 3293 const Register in = c_rarg0; 3294 const Register len = c_rarg1; 3295 const Register ct = c_rarg2; 3296 const Register out = c_rarg3; 3297 // and updated with the incremented counter in the end 3298 3299 const Register key = c_rarg4; 3300 const Register state = c_rarg5; 3301 3302 const Register subkeyHtbl = c_rarg6; 3303 3304 const Register counter = c_rarg7; 3305 3306 const Register keylen = r10; 3307 // Save state before entering routine 3308 __ sub(sp, sp, 4 * 16); 3309 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3310 __ sub(sp, sp, 4 * 16); 3311 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3312 3313 // __ andr(len, len, -512); 3314 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3315 __ str(len, __ pre(sp, -2 * wordSize)); 3316 3317 Label DONE; 3318 __ cbz(len, DONE); 3319 3320 // Compute #rounds for AES based on the length of the key array 3321 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3322 3323 __ aesenc_loadkeys(key, keylen); 3324 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3325 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3326 3327 // AES/CTR loop 3328 { 3329 Label L_CTR_loop; 3330 __ BIND(L_CTR_loop); 3331 3332 // Setup the counters 3333 __ movi(v8, __ T4S, 0); 3334 __ movi(v9, __ T4S, 1); 3335 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3336 3337 assert(v0->encoding() < v8->encoding(), ""); 3338 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3339 FloatRegister f = as_FloatRegister(i); 3340 __ rev32(f, __ T16B, v16); 3341 __ addv(v16, __ T4S, v16, v8); 3342 } 3343 3344 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3345 3346 // Encrypt the counters 3347 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3348 3349 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3350 3351 // XOR the encrypted counters with the inputs 3352 for (int i = 0; i < 8; i++) { 3353 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3354 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3355 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3356 } 3357 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3358 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3359 3360 __ subw(len, len, 16 * 8); 3361 __ cbnzw(len, L_CTR_loop); 3362 } 3363 3364 __ rev32(v16, __ T16B, v16); 3365 __ st1(v16, __ T16B, counter); 3366 3367 __ ldr(len, Address(sp)); 3368 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3369 3370 // GHASH/CTR loop 3371 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3372 len, /*unrolls*/4); 3373 3374 #ifdef ASSERT 3375 { Label L; 3376 __ 
cmp(len, (unsigned char)0); 3377 __ br(Assembler::EQ, L); 3378 __ stop("stubGenerator: abort"); 3379 __ bind(L); 3380 } 3381 #endif 3382 3383 __ bind(DONE); 3384 // Return the number of bytes processed 3385 __ ldr(r0, __ post(sp, 2 * wordSize)); 3386 3387 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3388 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3389 3390 __ leave(); // required for proper stackwalking of RuntimeStub frame 3391 __ ret(lr); 3392 return start; 3393 } 3394 3395 class Cached64Bytes { 3396 private: 3397 MacroAssembler *_masm; 3398 Register _regs[8]; 3399 3400 public: 3401 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3402 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3403 auto it = rs.begin(); 3404 for (auto &r: _regs) { 3405 r = *it; 3406 ++it; 3407 } 3408 } 3409 3410 void gen_loads(Register base) { 3411 for (int i = 0; i < 8; i += 2) { 3412 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3413 } 3414 } 3415 3416 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3417 void extract_u32(Register dest, int i) { 3418 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3419 } 3420 }; 3421 3422 // Utility routines for md5. 3423 // Clobbers r10 and r11. 3424 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3425 int k, int s, int t) { 3426 Register rscratch3 = r10; 3427 Register rscratch4 = r11; 3428 3429 __ eorw(rscratch3, r3, r4); 3430 __ movw(rscratch2, t); 3431 __ andw(rscratch3, rscratch3, r2); 3432 __ addw(rscratch4, r1, rscratch2); 3433 reg_cache.extract_u32(rscratch1, k); 3434 __ eorw(rscratch3, rscratch3, r4); 3435 __ addw(rscratch4, rscratch4, rscratch1); 3436 __ addw(rscratch3, rscratch3, rscratch4); 3437 __ rorw(rscratch2, rscratch3, 32 - s); 3438 __ addw(r1, rscratch2, r2); 3439 } 3440 3441 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3442 int k, int s, int t) { 3443 Register rscratch3 = r10; 3444 Register rscratch4 = r11; 3445 3446 reg_cache.extract_u32(rscratch1, k); 3447 __ movw(rscratch2, t); 3448 __ addw(rscratch4, r1, rscratch2); 3449 __ addw(rscratch4, rscratch4, rscratch1); 3450 __ bicw(rscratch2, r3, r4); 3451 __ andw(rscratch3, r2, r4); 3452 __ addw(rscratch2, rscratch2, rscratch4); 3453 __ addw(rscratch2, rscratch2, rscratch3); 3454 __ rorw(rscratch2, rscratch2, 32 - s); 3455 __ addw(r1, rscratch2, r2); 3456 } 3457 3458 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3459 int k, int s, int t) { 3460 Register rscratch3 = r10; 3461 Register rscratch4 = r11; 3462 3463 __ eorw(rscratch3, r3, r4); 3464 __ movw(rscratch2, t); 3465 __ addw(rscratch4, r1, rscratch2); 3466 reg_cache.extract_u32(rscratch1, k); 3467 __ eorw(rscratch3, rscratch3, r2); 3468 __ addw(rscratch4, rscratch4, rscratch1); 3469 __ addw(rscratch3, rscratch3, rscratch4); 3470 __ rorw(rscratch2, rscratch3, 32 - s); 3471 __ addw(r1, rscratch2, r2); 3472 } 3473 3474 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3475 int k, int s, int t) { 3476 Register rscratch3 = r10; 3477 Register rscratch4 = r11; 3478 3479 __ movw(rscratch3, t); 3480 __ ornw(rscratch2, r2, r4); 3481 __ addw(rscratch4, r1, rscratch3); 3482 reg_cache.extract_u32(rscratch1, k); 3483 __ eorw(rscratch3, rscratch2, r3); 3484 __ addw(rscratch4, rscratch4, rscratch1); 3485 __ addw(rscratch3, rscratch3, rscratch4); 3486 __ rorw(rscratch2, rscratch3, 32 - s); 3487 __ 
addw(r1, rscratch2, r2); 3488 } 3489 3490 // Arguments: 3491 // 3492 // Inputs: 3493 // c_rarg0 - byte[] source+offset 3494 // c_rarg1 - int[] SHA.state 3495 // c_rarg2 - int offset 3496 // c_rarg3 - int limit 3497 // 3498 address generate_md5_implCompress(bool multi_block, const char *name) { 3499 __ align(CodeEntryAlignment); 3500 StubCodeMark mark(this, "StubRoutines", name); 3501 address start = __ pc(); 3502 3503 Register buf = c_rarg0; 3504 Register state = c_rarg1; 3505 Register ofs = c_rarg2; 3506 Register limit = c_rarg3; 3507 Register a = r4; 3508 Register b = r5; 3509 Register c = r6; 3510 Register d = r7; 3511 Register rscratch3 = r10; 3512 Register rscratch4 = r11; 3513 3514 Register state_regs[2] = { r12, r13 }; 3515 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3516 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3517 3518 __ push(saved_regs, sp); 3519 3520 __ ldp(state_regs[0], state_regs[1], Address(state)); 3521 __ ubfx(a, state_regs[0], 0, 32); 3522 __ ubfx(b, state_regs[0], 32, 32); 3523 __ ubfx(c, state_regs[1], 0, 32); 3524 __ ubfx(d, state_regs[1], 32, 32); 3525 3526 Label md5_loop; 3527 __ BIND(md5_loop); 3528 3529 reg_cache.gen_loads(buf); 3530 3531 // Round 1 3532 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3533 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3534 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3535 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3536 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3537 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3538 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3539 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3540 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3541 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3542 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3543 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3544 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3545 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3546 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3547 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3548 3549 // Round 2 3550 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3551 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3552 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3553 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3554 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3555 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3556 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3557 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3558 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3559 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3560 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3561 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3562 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3563 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3564 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3565 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3566 3567 // Round 3 3568 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3569 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3570 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3571 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3572 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3573 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3574 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3575 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3576 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 
3577 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3578 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3579 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3580 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3581 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3582 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3583 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3584 3585 // Round 4 3586 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3587 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3588 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3589 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3590 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3591 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3592 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3593 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3594 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3595 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3596 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3597 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3598 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3599 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3600 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3601 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3602 3603 __ addw(a, state_regs[0], a); 3604 __ ubfx(rscratch2, state_regs[0], 32, 32); 3605 __ addw(b, rscratch2, b); 3606 __ addw(c, state_regs[1], c); 3607 __ ubfx(rscratch4, state_regs[1], 32, 32); 3608 __ addw(d, rscratch4, d); 3609 3610 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3611 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3612 3613 if (multi_block) { 3614 __ add(buf, buf, 64); 3615 __ add(ofs, ofs, 64); 3616 __ cmp(ofs, limit); 3617 __ br(Assembler::LE, md5_loop); 3618 __ mov(c_rarg0, ofs); // return ofs 3619 } 3620 3621 // write hash values back in the correct order 3622 __ stp(state_regs[0], state_regs[1], Address(state)); 3623 3624 __ pop(saved_regs, sp); 3625 3626 __ ret(lr); 3627 3628 return start; 3629 } 3630 3631 // Arguments: 3632 // 3633 // Inputs: 3634 // c_rarg0 - byte[] source+offset 3635 // c_rarg1 - int[] SHA.state 3636 // c_rarg2 - int offset 3637 // c_rarg3 - int limit 3638 // 3639 address generate_sha1_implCompress(bool multi_block, const char *name) { 3640 __ align(CodeEntryAlignment); 3641 StubCodeMark mark(this, "StubRoutines", name); 3642 address start = __ pc(); 3643 3644 Register buf = c_rarg0; 3645 Register state = c_rarg1; 3646 Register ofs = c_rarg2; 3647 Register limit = c_rarg3; 3648 3649 Label keys; 3650 Label sha1_loop; 3651 3652 // load the keys into v0..v3 3653 __ adr(rscratch1, keys); 3654 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3655 // load 5 words state into v6, v7 3656 __ ldrq(v6, Address(state, 0)); 3657 __ ldrs(v7, Address(state, 16)); 3658 3659 3660 __ BIND(sha1_loop); 3661 // load 64 bytes of data into v16..v19 3662 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3663 __ rev32(v16, __ T16B, v16); 3664 __ rev32(v17, __ T16B, v17); 3665 __ rev32(v18, __ T16B, v18); 3666 __ rev32(v19, __ T16B, v19); 3667 3668 // do the sha1 3669 __ addv(v4, __ T4S, v16, v0); 3670 __ orr(v20, __ T16B, v6, v6); 3671 3672 FloatRegister d0 = v16; 3673 FloatRegister d1 = v17; 3674 FloatRegister d2 = v18; 3675 FloatRegister d3 = v19; 3676 3677 for (int round = 0; round < 20; round++) { 3678 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3679 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3680 FloatRegister tmp3 = round ? ((round & 1) ? 
v22 : v21) : v7; 3681 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3682 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3683 3684 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3685 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3686 __ sha1h(tmp2, __ T4S, v20); 3687 if (round < 5) 3688 __ sha1c(v20, __ T4S, tmp3, tmp4); 3689 else if (round < 10 || round >= 15) 3690 __ sha1p(v20, __ T4S, tmp3, tmp4); 3691 else 3692 __ sha1m(v20, __ T4S, tmp3, tmp4); 3693 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3694 3695 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3696 } 3697 3698 __ addv(v7, __ T2S, v7, v21); 3699 __ addv(v6, __ T4S, v6, v20); 3700 3701 if (multi_block) { 3702 __ add(ofs, ofs, 64); 3703 __ cmp(ofs, limit); 3704 __ br(Assembler::LE, sha1_loop); 3705 __ mov(c_rarg0, ofs); // return ofs 3706 } 3707 3708 __ strq(v6, Address(state, 0)); 3709 __ strs(v7, Address(state, 16)); 3710 3711 __ ret(lr); 3712 3713 __ bind(keys); 3714 __ emit_int32(0x5a827999); 3715 __ emit_int32(0x6ed9eba1); 3716 __ emit_int32(0x8f1bbcdc); 3717 __ emit_int32(0xca62c1d6); 3718 3719 return start; 3720 } 3721 3722 3723 // Arguments: 3724 // 3725 // Inputs: 3726 // c_rarg0 - byte[] source+offset 3727 // c_rarg1 - int[] SHA.state 3728 // c_rarg2 - int offset 3729 // c_rarg3 - int limit 3730 // 3731 address generate_sha256_implCompress(bool multi_block, const char *name) { 3732 static const uint32_t round_consts[64] = { 3733 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3734 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3735 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3736 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3737 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3738 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3739 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3740 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3741 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3742 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3743 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3744 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3745 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3746 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3747 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3748 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3749 }; 3750 __ align(CodeEntryAlignment); 3751 StubCodeMark mark(this, "StubRoutines", name); 3752 address start = __ pc(); 3753 3754 Register buf = c_rarg0; 3755 Register state = c_rarg1; 3756 Register ofs = c_rarg2; 3757 Register limit = c_rarg3; 3758 3759 Label sha1_loop; 3760 3761 __ stpd(v8, v9, __ pre(sp, -32)); 3762 __ stpd(v10, v11, Address(sp, 16)); 3763 3764 // dga == v0 3765 // dgb == v1 3766 // dg0 == v2 3767 // dg1 == v3 3768 // dg2 == v4 3769 // t0 == v6 3770 // t1 == v7 3771 3772 // load 16 keys to v16..v31 3773 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3774 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3775 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3776 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3777 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3778 3779 // load 8 words (256 bits) state 3780 __ ldpq(v0, v1, state); 3781 3782 __ BIND(sha1_loop); 3783 // load 64 bytes of data into v8..v11 3784 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3785 __ rev32(v8, __ T16B, v8); 3786 __ rev32(v9, __ T16B, v9); 3787 __ rev32(v10, __ T16B, v10); 3788 __ rev32(v11, __ T16B, v11); 3789 3790 __ addv(v6, __ T4S, v8, v16); 3791 __ orr(v2, __ T16B, v0, v0); 3792 __ orr(v3, __ T16B, v1, v1); 3793 3794 FloatRegister d0 = v8; 3795 FloatRegister d1 = v9; 3796 FloatRegister d2 = v10; 3797 FloatRegister d3 = v11; 3798 3799 3800 for (int round = 0; round < 16; round++) { 3801 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3802 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3803 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3804 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3805 3806 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3807 __ orr(v4, __ T16B, v2, v2); 3808 if (round < 15) 3809 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3810 __ sha256h(v2, __ T4S, v3, tmp2); 3811 __ sha256h2(v3, __ T4S, v4, tmp2); 3812 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3813 3814 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3815 } 3816 3817 __ addv(v0, __ T4S, v0, v2); 3818 __ addv(v1, __ T4S, v1, v3); 3819 3820 if (multi_block) { 3821 __ add(ofs, ofs, 64); 3822 __ cmp(ofs, limit); 3823 __ br(Assembler::LE, sha1_loop); 3824 __ mov(c_rarg0, ofs); // return ofs 3825 } 3826 3827 __ ldpd(v10, v11, Address(sp, 16)); 3828 __ ldpd(v8, v9, __ post(sp, 32)); 3829 3830 __ stpq(v0, v1, state); 3831 3832 __ ret(lr); 3833 3834 return start; 3835 } 3836 3837 // Double rounds for sha512. 3838 void sha512_dround(int dr, 3839 FloatRegister vi0, FloatRegister vi1, 3840 FloatRegister vi2, FloatRegister vi3, 3841 FloatRegister vi4, FloatRegister vrc0, 3842 FloatRegister vrc1, FloatRegister vin0, 3843 FloatRegister vin1, FloatRegister vin2, 3844 FloatRegister vin3, FloatRegister vin4) { 3845 if (dr < 36) { 3846 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3847 } 3848 __ addv(v5, __ T2D, vrc0, vin0); 3849 __ ext(v6, __ T16B, vi2, vi3, 8); 3850 __ ext(v5, __ T16B, v5, v5, 8); 3851 __ ext(v7, __ T16B, vi1, vi2, 8); 3852 __ addv(vi3, __ T2D, vi3, v5); 3853 if (dr < 32) { 3854 __ ext(v5, __ T16B, vin3, vin4, 8); 3855 __ sha512su0(vin0, __ T2D, vin1); 3856 } 3857 __ sha512h(vi3, __ T2D, v6, v7); 3858 if (dr < 32) { 3859 __ sha512su1(vin0, __ T2D, vin2, v5); 3860 } 3861 __ addv(vi4, __ T2D, vi1, vi3); 3862 __ sha512h2(vi3, __ T2D, vi1, vi0); 3863 } 3864 3865 // Arguments: 3866 // 3867 // Inputs: 3868 // c_rarg0 - byte[] source+offset 3869 // c_rarg1 - int[] SHA.state 3870 // c_rarg2 - int offset 3871 // c_rarg3 - int limit 3872 // 3873 address generate_sha512_implCompress(bool multi_block, const char *name) { 3874 static const uint64_t round_consts[80] = { 3875 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3876 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3877 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3878 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3879 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3880 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3881 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3882 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3883 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3884 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3885 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3886 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3887 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3888 0x92722C851482353BL, 
0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3889 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3890 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3891 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3892 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3893 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3894 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3895 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3896 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3897 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3898 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3899 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3900 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3901 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3902 }; 3903 3904 __ align(CodeEntryAlignment); 3905 StubCodeMark mark(this, "StubRoutines", name); 3906 address start = __ pc(); 3907 3908 Register buf = c_rarg0; 3909 Register state = c_rarg1; 3910 Register ofs = c_rarg2; 3911 Register limit = c_rarg3; 3912 3913 __ stpd(v8, v9, __ pre(sp, -64)); 3914 __ stpd(v10, v11, Address(sp, 16)); 3915 __ stpd(v12, v13, Address(sp, 32)); 3916 __ stpd(v14, v15, Address(sp, 48)); 3917 3918 Label sha512_loop; 3919 3920 // load state 3921 __ ld1(v8, v9, v10, v11, __ T2D, state); 3922 3923 // load first 4 round constants 3924 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3925 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3926 3927 __ BIND(sha512_loop); 3928 // load 128B of data into v12..v19 3929 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3930 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3931 __ rev64(v12, __ T16B, v12); 3932 __ rev64(v13, __ T16B, v13); 3933 __ rev64(v14, __ T16B, v14); 3934 __ rev64(v15, __ T16B, v15); 3935 __ rev64(v16, __ T16B, v16); 3936 __ rev64(v17, __ T16B, v17); 3937 __ rev64(v18, __ T16B, v18); 3938 __ rev64(v19, __ T16B, v19); 3939 3940 __ mov(rscratch2, rscratch1); 3941 3942 __ mov(v0, __ T16B, v8); 3943 __ mov(v1, __ T16B, v9); 3944 __ mov(v2, __ T16B, v10); 3945 __ mov(v3, __ T16B, v11); 3946 3947 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 3948 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 3949 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 3950 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 3951 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 3952 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 3953 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 3954 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 3955 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 3956 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 3957 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 3958 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 3959 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 3960 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 3961 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 3962 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 3963 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, 
v13, v19, v16, v17); 3964 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 3965 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 3966 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 3967 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 3968 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 3969 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 3970 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 3971 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 3972 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 3973 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 3974 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 3975 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 3976 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 3977 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 3978 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 3979 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 3980 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 3981 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 3982 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 3983 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 3984 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 3985 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 3986 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 3987 3988 __ addv(v8, __ T2D, v8, v0); 3989 __ addv(v9, __ T2D, v9, v1); 3990 __ addv(v10, __ T2D, v10, v2); 3991 __ addv(v11, __ T2D, v11, v3); 3992 3993 if (multi_block) { 3994 __ add(ofs, ofs, 128); 3995 __ cmp(ofs, limit); 3996 __ br(Assembler::LE, sha512_loop); 3997 __ mov(c_rarg0, ofs); // return ofs 3998 } 3999 4000 __ st1(v8, v9, v10, v11, __ T2D, state); 4001 4002 __ ldpd(v14, v15, Address(sp, 48)); 4003 __ ldpd(v12, v13, Address(sp, 32)); 4004 __ ldpd(v10, v11, Address(sp, 16)); 4005 __ ldpd(v8, v9, __ post(sp, 64)); 4006 4007 __ ret(lr); 4008 4009 return start; 4010 } 4011 4012 // Arguments: 4013 // 4014 // Inputs: 4015 // c_rarg0 - byte[] source+offset 4016 // c_rarg1 - byte[] SHA.state 4017 // c_rarg2 - int block_size 4018 // c_rarg3 - int offset 4019 // c_rarg4 - int limit 4020 // 4021 address generate_sha3_implCompress(bool multi_block, const char *name) { 4022 static const uint64_t round_consts[24] = { 4023 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4024 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4025 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4026 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4027 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4028 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4029 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4030 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4031 }; 4032 4033 __ align(CodeEntryAlignment); 4034 StubCodeMark mark(this, "StubRoutines", name); 4035 address start = __ pc(); 4036 4037 Register buf = c_rarg0; 4038 Register state = c_rarg1; 4039 Register block_size = c_rarg2; 4040 Register ofs = c_rarg3; 4041 Register 
limit = c_rarg4;
4042
4043     Label sha3_loop, rounds24_loop;
4044     Label sha3_512_or_sha3_384, shake128;
4045
4046     __ stpd(v8, v9, __ pre(sp, -64));
4047     __ stpd(v10, v11, Address(sp, 16));
4048     __ stpd(v12, v13, Address(sp, 32));
4049     __ stpd(v14, v15, Address(sp, 48));
4050
4051     // load state
4052     __ add(rscratch1, state, 32);
4053     __ ld1(v0, v1, v2, v3, __ T1D, state);
4054     __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
4055     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4056     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4057     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4058     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4059     __ ld1(v24, __ T1D, rscratch1);
4060
4061     __ BIND(sha3_loop);
4062
4063     // 24 keccak rounds
4064     __ movw(rscratch2, 24);
4065
4066     // load round_constants base
4067     __ lea(rscratch1, ExternalAddress((address) round_consts));
4068
4069     // load input
4070     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4071     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4072     __ eor(v0, __ T8B, v0, v25);
4073     __ eor(v1, __ T8B, v1, v26);
4074     __ eor(v2, __ T8B, v2, v27);
4075     __ eor(v3, __ T8B, v3, v28);
4076     __ eor(v4, __ T8B, v4, v29);
4077     __ eor(v5, __ T8B, v5, v30);
4078     __ eor(v6, __ T8B, v6, v31);
4079
4080     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4081     __ tbz(block_size, 7, sha3_512_or_sha3_384);
4082
4083     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4084     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4085     __ eor(v7, __ T8B, v7, v25);
4086     __ eor(v8, __ T8B, v8, v26);
4087     __ eor(v9, __ T8B, v9, v27);
4088     __ eor(v10, __ T8B, v10, v28);
4089     __ eor(v11, __ T8B, v11, v29);
4090     __ eor(v12, __ T8B, v12, v30);
4091     __ eor(v13, __ T8B, v13, v31);
4092
4093     __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
4094     __ eor(v14, __ T8B, v14, v25);
4095     __ eor(v15, __ T8B, v15, v26);
4096     __ eor(v16, __ T8B, v16, v27);
4097
4098     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4099     __ andw(c_rarg5, block_size, 48);
4100     __ cbzw(c_rarg5, rounds24_loop);
4101
4102     __ tbnz(block_size, 5, shake128);
4103     // block_size == 144, bit5 == 0, SHA3-224
4104     __ ldrd(v28, __ post(buf, 8));
4105     __ eor(v17, __ T8B, v17, v28);
4106     __ b(rounds24_loop);
4107
4108     __ BIND(shake128);
4109     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4110     __ eor(v17, __ T8B, v17, v28);
4111     __ eor(v18, __ T8B, v18, v29);
4112     __ eor(v19, __ T8B, v19, v30);
4113     __ eor(v20, __ T8B, v20, v31);
4114     __ b(rounds24_loop); // block_size == 168, SHAKE128
4115
4116     __ BIND(sha3_512_or_sha3_384);
4117     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4118     __ eor(v7, __ T8B, v7, v25);
4119     __ eor(v8, __ T8B, v8, v26);
4120     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4121
4122     // SHA3-384
4123     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4124     __ eor(v9, __ T8B, v9, v27);
4125     __ eor(v10, __ T8B, v10, v28);
4126     __ eor(v11, __ T8B, v11, v29);
4127     __ eor(v12, __ T8B, v12, v30);
4128
4129     __ BIND(rounds24_loop);
4130     __ subw(rscratch2, rscratch2, 1);
4131
4132     __ eor3(v29, __ T16B, v4, v9, v14);
4133     __ eor3(v26, __ T16B, v1, v6, v11);
4134     __ eor3(v28, __ T16B, v3, v8, v13);
4135     __ eor3(v25, __ T16B, v0, v5, v10);
4136     __ eor3(v27, __ T16B, v2, v7, v12);
4137     __ eor3(v29, __ T16B, v29, v19, v24);
4138     __ eor3(v26, __ T16B, v26, v16, v21);
4139     __ eor3(v28, __ T16B, v28, v18, v23);
4140     __ eor3(v25, __ T16B, v25, v15, v20);
4141     __ eor3(v27, __ T16B, v27, v17, v22);
4142
4143     __ rax1(v30, __ T2D, v29, v26);
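    // Note on the ARMv8.2-SHA3 instructions used in this round (ignoring the
    // arrangement argument; listed here for readability only):
    //   eor3(d, n, m, a)      d = n ^ m ^ a          (theta column parities, above)
    //   rax1(d, n, m)         d = n ^ rol64(m, 1)    (theta D values; one above, four below)
    //   xar(d, n, m, 64 - r)  d = rol64(n ^ m, r)    (theta xor folded with the rho rotation)
    //   bcax(d, n, m, a)      d = n ^ (m & ~a)       (chi)
    // A rough scalar sketch of one Keccak-f[1600] round over the 5x5 lane array A,
    // assuming rotl64() is a 64-bit left rotation and rho[][]/RC[] are the standard
    // rotation offsets and round constants (all names are local to this sketch and
    // are not used by the generated code):
    //   uint64_t C[5], D[5], B[5][5];
    //   for (int x = 0; x < 5; x++)
    //     C[x] = A[x][0] ^ A[x][1] ^ A[x][2] ^ A[x][3] ^ A[x][4];            // eor3 pairs
    //   for (int x = 0; x < 5; x++)
    //     D[x] = C[(x + 4) % 5] ^ rotl64(C[(x + 1) % 5], 1);                 // rax1
    //   for (int x = 0; x < 5; x++)
    //     for (int y = 0; y < 5; y++)
    //       B[y][(2 * x + 3 * y) % 5] = rotl64(A[x][y] ^ D[x], rho[x][y]);   // xar (rho + pi)
    //   for (int x = 0; x < 5; x++)
    //     for (int y = 0; y < 5; y++)
    //       A[x][y] = B[x][y] ^ (~B[(x + 1) % 5][y] & B[(x + 2) % 5][y]);    // bcax (chi)
    //   A[0][0] ^= RC[round];                                                // iota (ld1r + eor below)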
4144 __ rax1(v26, __ T2D, v26, v28); 4145 __ rax1(v28, __ T2D, v28, v25); 4146 __ rax1(v25, __ T2D, v25, v27); 4147 __ rax1(v27, __ T2D, v27, v29); 4148 4149 __ eor(v0, __ T16B, v0, v30); 4150 __ xar(v29, __ T2D, v1, v25, (64 - 1)); 4151 __ xar(v1, __ T2D, v6, v25, (64 - 44)); 4152 __ xar(v6, __ T2D, v9, v28, (64 - 20)); 4153 __ xar(v9, __ T2D, v22, v26, (64 - 61)); 4154 __ xar(v22, __ T2D, v14, v28, (64 - 39)); 4155 __ xar(v14, __ T2D, v20, v30, (64 - 18)); 4156 __ xar(v31, __ T2D, v2, v26, (64 - 62)); 4157 __ xar(v2, __ T2D, v12, v26, (64 - 43)); 4158 __ xar(v12, __ T2D, v13, v27, (64 - 25)); 4159 __ xar(v13, __ T2D, v19, v28, (64 - 8)); 4160 __ xar(v19, __ T2D, v23, v27, (64 - 56)); 4161 __ xar(v23, __ T2D, v15, v30, (64 - 41)); 4162 __ xar(v15, __ T2D, v4, v28, (64 - 27)); 4163 __ xar(v28, __ T2D, v24, v28, (64 - 14)); 4164 __ xar(v24, __ T2D, v21, v25, (64 - 2)); 4165 __ xar(v8, __ T2D, v8, v27, (64 - 55)); 4166 __ xar(v4, __ T2D, v16, v25, (64 - 45)); 4167 __ xar(v16, __ T2D, v5, v30, (64 - 36)); 4168 __ xar(v5, __ T2D, v3, v27, (64 - 28)); 4169 __ xar(v27, __ T2D, v18, v27, (64 - 21)); 4170 __ xar(v3, __ T2D, v17, v26, (64 - 15)); 4171 __ xar(v25, __ T2D, v11, v25, (64 - 10)); 4172 __ xar(v26, __ T2D, v7, v26, (64 - 6)); 4173 __ xar(v30, __ T2D, v10, v30, (64 - 3)); 4174 4175 __ bcax(v20, __ T16B, v31, v22, v8); 4176 __ bcax(v21, __ T16B, v8, v23, v22); 4177 __ bcax(v22, __ T16B, v22, v24, v23); 4178 __ bcax(v23, __ T16B, v23, v31, v24); 4179 __ bcax(v24, __ T16B, v24, v8, v31); 4180 4181 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); 4182 4183 __ bcax(v17, __ T16B, v25, v19, v3); 4184 __ bcax(v18, __ T16B, v3, v15, v19); 4185 __ bcax(v19, __ T16B, v19, v16, v15); 4186 __ bcax(v15, __ T16B, v15, v25, v16); 4187 __ bcax(v16, __ T16B, v16, v3, v25); 4188 4189 __ bcax(v10, __ T16B, v29, v12, v26); 4190 __ bcax(v11, __ T16B, v26, v13, v12); 4191 __ bcax(v12, __ T16B, v12, v14, v13); 4192 __ bcax(v13, __ T16B, v13, v29, v14); 4193 __ bcax(v14, __ T16B, v14, v26, v29); 4194 4195 __ bcax(v7, __ T16B, v30, v9, v4); 4196 __ bcax(v8, __ T16B, v4, v5, v9); 4197 __ bcax(v9, __ T16B, v9, v6, v5); 4198 __ bcax(v5, __ T16B, v5, v30, v6); 4199 __ bcax(v6, __ T16B, v6, v4, v30); 4200 4201 __ bcax(v3, __ T16B, v27, v0, v28); 4202 __ bcax(v4, __ T16B, v28, v1, v0); 4203 __ bcax(v0, __ T16B, v0, v2, v1); 4204 __ bcax(v1, __ T16B, v1, v27, v2); 4205 __ bcax(v2, __ T16B, v2, v28, v27); 4206 4207 __ eor(v0, __ T16B, v0, v31); 4208 4209 __ cbnzw(rscratch2, rounds24_loop); 4210 4211 if (multi_block) { 4212 __ add(ofs, ofs, block_size); 4213 __ cmp(ofs, limit); 4214 __ br(Assembler::LE, sha3_loop); 4215 __ mov(c_rarg0, ofs); // return ofs 4216 } 4217 4218 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4219 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4220 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4221 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4222 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4223 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4224 __ st1(v24, __ T1D, state); 4225 4226 __ ldpd(v14, v15, Address(sp, 48)); 4227 __ ldpd(v12, v13, Address(sp, 32)); 4228 __ ldpd(v10, v11, Address(sp, 16)); 4229 __ ldpd(v8, v9, __ post(sp, 64)); 4230 4231 __ ret(lr); 4232 4233 return start; 4234 } 4235 4236 /** 4237 * Arguments: 4238 * 4239 * Inputs: 4240 * c_rarg0 - int crc 4241 * c_rarg1 - byte* buf 4242 * c_rarg2 - int length 4243 * 4244 * Output: 4245 * rax - int crc result 4246 */ 4247 address generate_updateBytesCRC32() { 4248 assert(UseCRC32Intrinsics, 
"what are we doing here?"); 4249 4250 __ align(CodeEntryAlignment); 4251 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4252 4253 address start = __ pc(); 4254 4255 const Register crc = c_rarg0; // crc 4256 const Register buf = c_rarg1; // source java byte array address 4257 const Register len = c_rarg2; // length 4258 const Register table0 = c_rarg3; // crc_table address 4259 const Register table1 = c_rarg4; 4260 const Register table2 = c_rarg5; 4261 const Register table3 = c_rarg6; 4262 const Register tmp3 = c_rarg7; 4263 4264 BLOCK_COMMENT("Entry:"); 4265 __ enter(); // required for proper stackwalking of RuntimeStub frame 4266 4267 __ kernel_crc32(crc, buf, len, 4268 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4269 4270 __ leave(); // required for proper stackwalking of RuntimeStub frame 4271 __ ret(lr); 4272 4273 return start; 4274 } 4275 4276 // ChaCha20 block function. This version parallelizes by loading 4277 // individual 32-bit state elements into vectors for four blocks 4278 // (e.g. all four blocks' worth of state[0] in one register, etc.) 4279 // 4280 // state (int[16]) = c_rarg0 4281 // keystream (byte[1024]) = c_rarg1 4282 // return - number of bytes of keystream (always 256) 4283 address generate_chacha20Block_blockpar() { 4284 Label L_twoRounds, L_cc20_const; 4285 // The constant data is broken into two 128-bit segments to be loaded 4286 // onto FloatRegisters. The first 128 bits are a counter add overlay 4287 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4288 // The second 128-bits is a table constant used for 8-bit left rotations. 4289 __ BIND(L_cc20_const); 4290 __ emit_int64(0x0000000100000000UL); 4291 __ emit_int64(0x0000000300000002UL); 4292 __ emit_int64(0x0605040702010003UL); 4293 __ emit_int64(0x0E0D0C0F0A09080BUL); 4294 4295 __ align(CodeEntryAlignment); 4296 StubCodeMark mark(this, "StubRoutines", "chacha20Block"); 4297 address start = __ pc(); 4298 __ enter(); 4299 4300 int i, j; 4301 const Register state = c_rarg0; 4302 const Register keystream = c_rarg1; 4303 const Register loopCtr = r10; 4304 const Register tmpAddr = r11; 4305 4306 const FloatRegister stateFirst = v0; 4307 const FloatRegister stateSecond = v1; 4308 const FloatRegister stateThird = v2; 4309 const FloatRegister stateFourth = v3; 4310 const FloatRegister origCtrState = v28; 4311 const FloatRegister scratch = v29; 4312 const FloatRegister lrot8Tbl = v30; 4313 4314 // Organize SIMD registers in an array that facilitates 4315 // putting repetitive opcodes into loop structures. It is 4316 // important that each grouping of 4 registers is monotonically 4317 // increasing to support the requirements of multi-register 4318 // instructions (e.g. ld4r, st4, etc.) 4319 const FloatRegister workSt[16] = { 4320 v4, v5, v6, v7, v16, v17, v18, v19, 4321 v20, v21, v22, v23, v24, v25, v26, v27 4322 }; 4323 4324 // Load from memory and interlace across 16 SIMD registers, 4325 // With each word from memory being broadcast to all lanes of 4326 // each successive SIMD register. 4327 // Addr(0) -> All lanes in workSt[i] 4328 // Addr(4) -> All lanes workSt[i + 1], etc. 4329 __ mov(tmpAddr, state); 4330 for (i = 0; i < 16; i += 4) { 4331 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4332 __ post(tmpAddr, 16)); 4333 } 4334 4335 // Pull in constant data. The first 16 bytes are the add overlay 4336 // which is applied to the vector holding the counter (state[12]). 
4337 // The second 16 bytes is the index register for the 8-bit left 4338 // rotation tbl instruction. 4339 __ adr(tmpAddr, L_cc20_const); 4340 __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr)); 4341 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); 4342 4343 // Set up the 10 iteration loop and perform all 8 quarter round ops 4344 __ mov(loopCtr, 10); 4345 __ BIND(L_twoRounds); 4346 4347 __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12], 4348 scratch, lrot8Tbl); 4349 __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13], 4350 scratch, lrot8Tbl); 4351 __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14], 4352 scratch, lrot8Tbl); 4353 __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15], 4354 scratch, lrot8Tbl); 4355 4356 __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15], 4357 scratch, lrot8Tbl); 4358 __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12], 4359 scratch, lrot8Tbl); 4360 __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13], 4361 scratch, lrot8Tbl); 4362 __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14], 4363 scratch, lrot8Tbl); 4364 4365 // Decrement and iterate 4366 __ sub(loopCtr, loopCtr, 1); 4367 __ cbnz(loopCtr, L_twoRounds); 4368 4369 __ mov(tmpAddr, state); 4370 4371 // Add the starting state back to the post-loop keystream 4372 // state. We read/interlace the state array from memory into 4373 // 4 registers similar to what we did in the beginning. Then 4374 // add the counter overlay onto workSt[12] at the end. 4375 for (i = 0; i < 16; i += 4) { 4376 __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S, 4377 __ post(tmpAddr, 16)); 4378 __ addv(workSt[i], __ T4S, workSt[i], stateFirst); 4379 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond); 4380 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird); 4381 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth); 4382 } 4383 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask 4384 4385 // Write to key stream, storing the same element out of workSt[0..15] 4386 // to consecutive 4-byte offsets in the key stream buffer, then repeating 4387 // for the next element position. 
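    // A roughly equivalent scalar view of the store loop below (illustrative
    // only; out32, block and word are names local to this sketch, and
    // lane(v, i) denotes the i-th 32-bit lane of vector v).  Lane 'block' of
    // workSt[word] holds word 'word' of keystream block 'block', so the buffer
    // ends up as four consecutive 64-byte blocks, 256 bytes in total:
    //
    //   uint32_t* out32 = (uint32_t*) keystream;
    //   for (int block = 0; block < 4; block++)     // outer loop: st4 lane index == block
    //     for (int word = 0; word < 16; word++)     // inner loop covers the words, 4 per st4
    //       out32[block * 16 + word] = lane(workSt[word], block);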
4388 for (i = 0; i < 4; i++) { 4389 for (j = 0; j < 16; j += 4) { 4390 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4391 __ post(keystream, 16)); 4392 } 4393 } 4394 4395 __ mov(r0, 256); // Return length of output keystream 4396 __ leave(); 4397 __ ret(lr); 4398 4399 return start; 4400 } 4401 4402 /** 4403 * Arguments: 4404 * 4405 * Inputs: 4406 * c_rarg0 - int crc 4407 * c_rarg1 - byte* buf 4408 * c_rarg2 - int length 4409 * c_rarg3 - int* table 4410 * 4411 * Output: 4412 * r0 - int crc result 4413 */ 4414 address generate_updateBytesCRC32C() { 4415 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4416 4417 __ align(CodeEntryAlignment); 4418 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4419 4420 address start = __ pc(); 4421 4422 const Register crc = c_rarg0; // crc 4423 const Register buf = c_rarg1; // source java byte array address 4424 const Register len = c_rarg2; // length 4425 const Register table0 = c_rarg3; // crc_table address 4426 const Register table1 = c_rarg4; 4427 const Register table2 = c_rarg5; 4428 const Register table3 = c_rarg6; 4429 const Register tmp3 = c_rarg7; 4430 4431 BLOCK_COMMENT("Entry:"); 4432 __ enter(); // required for proper stackwalking of RuntimeStub frame 4433 4434 __ kernel_crc32c(crc, buf, len, 4435 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4436 4437 __ leave(); // required for proper stackwalking of RuntimeStub frame 4438 __ ret(lr); 4439 4440 return start; 4441 } 4442 4443 /*** 4444 * Arguments: 4445 * 4446 * Inputs: 4447 * c_rarg0 - int adler 4448 * c_rarg1 - byte* buff 4449 * c_rarg2 - int len 4450 * 4451 * Output: 4452 * c_rarg0 - int adler result 4453 */ 4454 address generate_updateBytesAdler32() { 4455 __ align(CodeEntryAlignment); 4456 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4457 address start = __ pc(); 4458 4459 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4460 4461 // Aliases 4462 Register adler = c_rarg0; 4463 Register s1 = c_rarg0; 4464 Register s2 = c_rarg3; 4465 Register buff = c_rarg1; 4466 Register len = c_rarg2; 4467 Register nmax = r4; 4468 Register base = r5; 4469 Register count = r6; 4470 Register temp0 = rscratch1; 4471 Register temp1 = rscratch2; 4472 FloatRegister vbytes = v0; 4473 FloatRegister vs1acc = v1; 4474 FloatRegister vs2acc = v2; 4475 FloatRegister vtable = v3; 4476 4477 // Max number of bytes we can process before having to take the mod 4478 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4479 uint64_t BASE = 0xfff1; 4480 uint64_t NMAX = 0x15B0; 4481 4482 __ mov(base, BASE); 4483 __ mov(nmax, NMAX); 4484 4485 // Load accumulation coefficients for the upper 16 bits 4486 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4487 __ ld1(vtable, __ T16B, Address(temp0)); 4488 4489 // s1 is initialized to the lower 16 bits of adler 4490 // s2 is initialized to the upper 16 bits of adler 4491 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4492 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4493 4494 // The pipelined loop needs at least 16 elements for 1 iteration 4495 // It does check this, but it is more effective to skip to the cleanup loop 4496 __ cmp(len, (u1)16); 4497 __ br(Assembler::HS, L_nmax); 4498 __ cbz(len, L_combine); 4499 4500 __ bind(L_simple_by1_loop); 4501 __ ldrb(temp0, Address(__ post(buff, 1))); 4502 __ add(s1, s1, temp0); 4503 __ add(s2, s2, s1); 4504 __ subs(len, len, 
1); 4505 __ br(Assembler::HI, L_simple_by1_loop); 4506 4507 // s1 = s1 % BASE 4508 __ subs(temp0, s1, base); 4509 __ csel(s1, temp0, s1, Assembler::HS); 4510 4511 // s2 = s2 % BASE 4512 __ lsr(temp0, s2, 16); 4513 __ lsl(temp1, temp0, 4); 4514 __ sub(temp1, temp1, temp0); 4515 __ add(s2, temp1, s2, ext::uxth); 4516 4517 __ subs(temp0, s2, base); 4518 __ csel(s2, temp0, s2, Assembler::HS); 4519 4520 __ b(L_combine); 4521 4522 __ bind(L_nmax); 4523 __ subs(len, len, nmax); 4524 __ sub(count, nmax, 16); 4525 __ br(Assembler::LO, L_by16); 4526 4527 __ bind(L_nmax_loop); 4528 4529 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4530 vbytes, vs1acc, vs2acc, vtable); 4531 4532 __ subs(count, count, 16); 4533 __ br(Assembler::HS, L_nmax_loop); 4534 4535 // s1 = s1 % BASE 4536 __ lsr(temp0, s1, 16); 4537 __ lsl(temp1, temp0, 4); 4538 __ sub(temp1, temp1, temp0); 4539 __ add(temp1, temp1, s1, ext::uxth); 4540 4541 __ lsr(temp0, temp1, 16); 4542 __ lsl(s1, temp0, 4); 4543 __ sub(s1, s1, temp0); 4544 __ add(s1, s1, temp1, ext:: uxth); 4545 4546 __ subs(temp0, s1, base); 4547 __ csel(s1, temp0, s1, Assembler::HS); 4548 4549 // s2 = s2 % BASE 4550 __ lsr(temp0, s2, 16); 4551 __ lsl(temp1, temp0, 4); 4552 __ sub(temp1, temp1, temp0); 4553 __ add(temp1, temp1, s2, ext::uxth); 4554 4555 __ lsr(temp0, temp1, 16); 4556 __ lsl(s2, temp0, 4); 4557 __ sub(s2, s2, temp0); 4558 __ add(s2, s2, temp1, ext:: uxth); 4559 4560 __ subs(temp0, s2, base); 4561 __ csel(s2, temp0, s2, Assembler::HS); 4562 4563 __ subs(len, len, nmax); 4564 __ sub(count, nmax, 16); 4565 __ br(Assembler::HS, L_nmax_loop); 4566 4567 __ bind(L_by16); 4568 __ adds(len, len, count); 4569 __ br(Assembler::LO, L_by1); 4570 4571 __ bind(L_by16_loop); 4572 4573 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4574 vbytes, vs1acc, vs2acc, vtable); 4575 4576 __ subs(len, len, 16); 4577 __ br(Assembler::HS, L_by16_loop); 4578 4579 __ bind(L_by1); 4580 __ adds(len, len, 15); 4581 __ br(Assembler::LO, L_do_mod); 4582 4583 __ bind(L_by1_loop); 4584 __ ldrb(temp0, Address(__ post(buff, 1))); 4585 __ add(s1, temp0, s1); 4586 __ add(s2, s2, s1); 4587 __ subs(len, len, 1); 4588 __ br(Assembler::HS, L_by1_loop); 4589 4590 __ bind(L_do_mod); 4591 // s1 = s1 % BASE 4592 __ lsr(temp0, s1, 16); 4593 __ lsl(temp1, temp0, 4); 4594 __ sub(temp1, temp1, temp0); 4595 __ add(temp1, temp1, s1, ext::uxth); 4596 4597 __ lsr(temp0, temp1, 16); 4598 __ lsl(s1, temp0, 4); 4599 __ sub(s1, s1, temp0); 4600 __ add(s1, s1, temp1, ext:: uxth); 4601 4602 __ subs(temp0, s1, base); 4603 __ csel(s1, temp0, s1, Assembler::HS); 4604 4605 // s2 = s2 % BASE 4606 __ lsr(temp0, s2, 16); 4607 __ lsl(temp1, temp0, 4); 4608 __ sub(temp1, temp1, temp0); 4609 __ add(temp1, temp1, s2, ext::uxth); 4610 4611 __ lsr(temp0, temp1, 16); 4612 __ lsl(s2, temp0, 4); 4613 __ sub(s2, s2, temp0); 4614 __ add(s2, s2, temp1, ext:: uxth); 4615 4616 __ subs(temp0, s2, base); 4617 __ csel(s2, temp0, s2, Assembler::HS); 4618 4619 // Combine lower bits and higher bits 4620 __ bind(L_combine); 4621 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 4622 4623 __ ret(lr); 4624 4625 return start; 4626 } 4627 4628 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 4629 Register temp0, Register temp1, FloatRegister vbytes, 4630 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 4631 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 
4632     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4633     // In non-vectorized code, we update s1 and s2 as:
4634     //   s1 <- s1 + b1
4635     //   s2 <- s2 + s1
4636     //   s1 <- s1 + b2
4637     //   s2 <- s2 + s1
4638     //   ...
4639     //   s1 <- s1 + b16
4640     //   s2 <- s2 + s1
4641     // Putting the above assignments together, we have:
4642     //   s1_new = s1 + b1 + b2 + ... + b16
4643     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4644     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4645     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
4646     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4647
4648     // s2 = s2 + s1 * 16
4649     __ add(s2, s2, s1, Assembler::LSL, 4);
4650
4651     // vs1acc = b1 + b2 + b3 + ... + b16
4652     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4653     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4654     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4655     __ uaddlv(vs1acc, __ T16B, vbytes);
4656     __ uaddlv(vs2acc, __ T8H, vs2acc);
4657
4658     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4659     __ fmovd(temp0, vs1acc);
4660     __ fmovd(temp1, vs2acc);
4661     __ add(s1, s1, temp0);
4662     __ add(s2, s2, temp1);
4663   }
4664
4665   /**
4666    * Arguments:
4667    *
4668    * Input:
4669    *   c_rarg0 - x address
4670    *   c_rarg1 - x length
4671    *   c_rarg2 - y address
4672    *   c_rarg3 - y length
4673    *   c_rarg4 - z address
4674    */
4675   address generate_multiplyToLen() {
4676     __ align(CodeEntryAlignment);
4677     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4678
4679     address start = __ pc();
4680
4681     if (SCCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) {
4682       return start;
4683     }
4684     const Register x     = r0;
4685     const Register xlen  = r1;
4686     const Register y     = r2;
4687     const Register ylen  = r3;
4688     const Register z     = r4;
4689
4690     const Register tmp0  = r5;
4691     const Register tmp1  = r10;
4692     const Register tmp2  = r11;
4693     const Register tmp3  = r12;
4694     const Register tmp4  = r13;
4695     const Register tmp5  = r14;
4696     const Register tmp6  = r15;
4697     const Register tmp7  = r16;
4698
4699     BLOCK_COMMENT("Entry:");
4700     __ enter(); // required for proper stackwalking of RuntimeStub frame
4701     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4702     __ leave(); // required for proper stackwalking of RuntimeStub frame
4703     __ ret(lr);
4704
4705     SCCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start);
4706     return start;
4707   }
4708
4709   address generate_squareToLen() {
4710     // The squareToLen algorithm for sizes 1..127, as described in the Java code, runs
4711     // faster than multiply_to_len on some CPUs and slower on others, but
4712     // multiply_to_len shows slightly better results overall.
4713     __ align(CodeEntryAlignment);
4714     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4715     address start = __ pc();
4716
4717     if (SCCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) {
4718       return start;
4719     }
4720     const Register x     = r0;
4721     const Register xlen  = r1;
4722     const Register z     = r2;
4723     const Register y     = r4; // == x
4724     const Register ylen  = r5; // == xlen
4725
4726     const Register tmp0  = r3;
4727     const Register tmp1  = r10;
4728     const Register tmp2  = r11;
4729     const Register tmp3  = r12;
4730     const Register tmp4  = r13;
4731     const Register tmp5  = r14;
4732     const Register tmp6  = r15;
4733     const Register tmp7  = r16;
4734
4735     RegSet spilled_regs = RegSet::of(y, ylen);
4736     BLOCK_COMMENT("Entry:");
4737     __ enter();
4738     __ push(spilled_regs, sp);
4739     __
mov(y, x); 4740 __ mov(ylen, xlen); 4741 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4742 __ pop(spilled_regs, sp); 4743 __ leave(); 4744 __ ret(lr); 4745 4746 SCCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start); 4747 return start; 4748 } 4749 4750 address generate_mulAdd() { 4751 __ align(CodeEntryAlignment); 4752 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 4753 4754 address start = __ pc(); 4755 4756 if (SCCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) { 4757 return start; 4758 } 4759 const Register out = r0; 4760 const Register in = r1; 4761 const Register offset = r2; 4762 const Register len = r3; 4763 const Register k = r4; 4764 4765 BLOCK_COMMENT("Entry:"); 4766 __ enter(); 4767 __ mul_add(out, in, offset, len, k); 4768 __ leave(); 4769 __ ret(lr); 4770 4771 SCCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start); 4772 return start; 4773 } 4774 4775 // Arguments: 4776 // 4777 // Input: 4778 // c_rarg0 - newArr address 4779 // c_rarg1 - oldArr address 4780 // c_rarg2 - newIdx 4781 // c_rarg3 - shiftCount 4782 // c_rarg4 - numIter 4783 // 4784 address generate_bigIntegerRightShift() { 4785 __ align(CodeEntryAlignment); 4786 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 4787 address start = __ pc(); 4788 4789 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4790 4791 Register newArr = c_rarg0; 4792 Register oldArr = c_rarg1; 4793 Register newIdx = c_rarg2; 4794 Register shiftCount = c_rarg3; 4795 Register numIter = c_rarg4; 4796 Register idx = numIter; 4797 4798 Register newArrCur = rscratch1; 4799 Register shiftRevCount = rscratch2; 4800 Register oldArrCur = r13; 4801 Register oldArrNext = r14; 4802 4803 FloatRegister oldElem0 = v0; 4804 FloatRegister oldElem1 = v1; 4805 FloatRegister newElem = v2; 4806 FloatRegister shiftVCount = v3; 4807 FloatRegister shiftVRevCount = v4; 4808 4809 __ cbz(idx, Exit); 4810 4811 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4812 4813 // left shift count 4814 __ movw(shiftRevCount, 32); 4815 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4816 4817 // numIter too small to allow a 4-words SIMD loop, rolling back 4818 __ cmp(numIter, (u1)4); 4819 __ br(Assembler::LT, ShiftThree); 4820 4821 __ dup(shiftVCount, __ T4S, shiftCount); 4822 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4823 __ negr(shiftVCount, __ T4S, shiftVCount); 4824 4825 __ BIND(ShiftSIMDLoop); 4826 4827 // Calculate the load addresses 4828 __ sub(idx, idx, 4); 4829 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4830 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4831 __ add(oldArrCur, oldArrNext, 4); 4832 4833 // Load 4 words and process 4834 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4835 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4836 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4837 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4838 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4839 __ st1(newElem, __ T4S, Address(newArrCur)); 4840 4841 __ cmp(idx, (u1)4); 4842 __ br(Assembler::LT, ShiftTwoLoop); 4843 __ b(ShiftSIMDLoop); 4844 4845 __ BIND(ShiftTwoLoop); 4846 __ cbz(idx, Exit); 4847 __ cmp(idx, (u1)1); 4848 __ br(Assembler::EQ, ShiftOne); 4849 4850 // Calculate the load addresses 4851 __ sub(idx, idx, 2); 4852 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4853 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4854 __ add(oldArrCur, oldArrNext, 4); 4855 4856 // Load 2 words and process 4857 __ 
ld1(oldElem0, __ T2S, Address(oldArrCur)); 4858 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4859 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4860 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4861 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4862 __ st1(newElem, __ T2S, Address(newArrCur)); 4863 __ b(ShiftTwoLoop); 4864 4865 __ BIND(ShiftThree); 4866 __ tbz(idx, 1, ShiftOne); 4867 __ tbz(idx, 0, ShiftTwo); 4868 __ ldrw(r10, Address(oldArr, 12)); 4869 __ ldrw(r11, Address(oldArr, 8)); 4870 __ lsrvw(r10, r10, shiftCount); 4871 __ lslvw(r11, r11, shiftRevCount); 4872 __ orrw(r12, r10, r11); 4873 __ strw(r12, Address(newArr, 8)); 4874 4875 __ BIND(ShiftTwo); 4876 __ ldrw(r10, Address(oldArr, 8)); 4877 __ ldrw(r11, Address(oldArr, 4)); 4878 __ lsrvw(r10, r10, shiftCount); 4879 __ lslvw(r11, r11, shiftRevCount); 4880 __ orrw(r12, r10, r11); 4881 __ strw(r12, Address(newArr, 4)); 4882 4883 __ BIND(ShiftOne); 4884 __ ldrw(r10, Address(oldArr, 4)); 4885 __ ldrw(r11, Address(oldArr)); 4886 __ lsrvw(r10, r10, shiftCount); 4887 __ lslvw(r11, r11, shiftRevCount); 4888 __ orrw(r12, r10, r11); 4889 __ strw(r12, Address(newArr)); 4890 4891 __ BIND(Exit); 4892 __ ret(lr); 4893 4894 return start; 4895 } 4896 4897 // Arguments: 4898 // 4899 // Input: 4900 // c_rarg0 - newArr address 4901 // c_rarg1 - oldArr address 4902 // c_rarg2 - newIdx 4903 // c_rarg3 - shiftCount 4904 // c_rarg4 - numIter 4905 // 4906 address generate_bigIntegerLeftShift() { 4907 __ align(CodeEntryAlignment); 4908 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 4909 address start = __ pc(); 4910 4911 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4912 4913 Register newArr = c_rarg0; 4914 Register oldArr = c_rarg1; 4915 Register newIdx = c_rarg2; 4916 Register shiftCount = c_rarg3; 4917 Register numIter = c_rarg4; 4918 4919 Register shiftRevCount = rscratch1; 4920 Register oldArrNext = rscratch2; 4921 4922 FloatRegister oldElem0 = v0; 4923 FloatRegister oldElem1 = v1; 4924 FloatRegister newElem = v2; 4925 FloatRegister shiftVCount = v3; 4926 FloatRegister shiftVRevCount = v4; 4927 4928 __ cbz(numIter, Exit); 4929 4930 __ add(oldArrNext, oldArr, 4); 4931 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4932 4933 // right shift count 4934 __ movw(shiftRevCount, 32); 4935 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4936 4937 // numIter too small to allow a 4-words SIMD loop, rolling back 4938 __ cmp(numIter, (u1)4); 4939 __ br(Assembler::LT, ShiftThree); 4940 4941 __ dup(shiftVCount, __ T4S, shiftCount); 4942 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4943 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 4944 4945 __ BIND(ShiftSIMDLoop); 4946 4947 // load 4 words and process 4948 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 4949 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 4950 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4951 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4952 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4953 __ st1(newElem, __ T4S, __ post(newArr, 16)); 4954 __ sub(numIter, numIter, 4); 4955 4956 __ cmp(numIter, (u1)4); 4957 __ br(Assembler::LT, ShiftTwoLoop); 4958 __ b(ShiftSIMDLoop); 4959 4960 __ BIND(ShiftTwoLoop); 4961 __ cbz(numIter, Exit); 4962 __ cmp(numIter, (u1)1); 4963 __ br(Assembler::EQ, ShiftOne); 4964 4965 // load 2 words and process 4966 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 4967 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 4968 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4969 __ 
ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4970 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4971 __ st1(newElem, __ T2S, __ post(newArr, 8)); 4972 __ sub(numIter, numIter, 2); 4973 __ b(ShiftTwoLoop); 4974 4975 __ BIND(ShiftThree); 4976 __ ldrw(r10, __ post(oldArr, 4)); 4977 __ ldrw(r11, __ post(oldArrNext, 4)); 4978 __ lslvw(r10, r10, shiftCount); 4979 __ lsrvw(r11, r11, shiftRevCount); 4980 __ orrw(r12, r10, r11); 4981 __ strw(r12, __ post(newArr, 4)); 4982 __ tbz(numIter, 1, Exit); 4983 __ tbz(numIter, 0, ShiftOne); 4984 4985 __ BIND(ShiftTwo); 4986 __ ldrw(r10, __ post(oldArr, 4)); 4987 __ ldrw(r11, __ post(oldArrNext, 4)); 4988 __ lslvw(r10, r10, shiftCount); 4989 __ lsrvw(r11, r11, shiftRevCount); 4990 __ orrw(r12, r10, r11); 4991 __ strw(r12, __ post(newArr, 4)); 4992 4993 __ BIND(ShiftOne); 4994 __ ldrw(r10, Address(oldArr)); 4995 __ ldrw(r11, Address(oldArrNext)); 4996 __ lslvw(r10, r10, shiftCount); 4997 __ lsrvw(r11, r11, shiftRevCount); 4998 __ orrw(r12, r10, r11); 4999 __ strw(r12, Address(newArr)); 5000 5001 __ BIND(Exit); 5002 __ ret(lr); 5003 5004 return start; 5005 } 5006 5007 address generate_count_positives(address &count_positives_long) { 5008 const u1 large_loop_size = 64; 5009 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 5010 int dcache_line = VM_Version::dcache_line_size(); 5011 5012 Register ary1 = r1, len = r2, result = r0; 5013 5014 __ align(CodeEntryAlignment); 5015 5016 StubCodeMark mark(this, "StubRoutines", "count_positives"); 5017 5018 address entry = __ pc(); 5019 5020 __ enter(); 5021 // precondition: a copy of len is already in result 5022 // __ mov(result, len); 5023 5024 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 5025 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 5026 5027 __ cmp(len, (u1)15); 5028 __ br(Assembler::GT, LEN_OVER_15); 5029 // The only case when execution falls into this code is when pointer is near 5030 // the end of memory page and we have to avoid reading next page 5031 __ add(ary1, ary1, len); 5032 __ subs(len, len, 8); 5033 __ br(Assembler::GT, LEN_OVER_8); 5034 __ ldr(rscratch2, Address(ary1, -8)); 5035 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
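    // Here 'len' already holds (length - 8), which is <= 0 on this path, so
    // zr - (len << 3) == (8 - length) * 8: the number of low-order bits in
    // rscratch2 that came from memory just before the array (the ldr above ends
    // exactly at the last array byte, so the next page is never touched).  The
    // lsrv below discards those bytes before the sign bits are tested.  A rough
    // scalar sketch of this short path (illustrative only; assumes a little-endian
    // load and 0 < length <= 8; 'base' and 'length' denote the original array
    // address and length and are local to this sketch):
    //
    //   uint64_t w = *(const uint64_t*)(base + length - 8); // never reads past the array end
    //   w >>= (8 - length) * 8;                             // keep only the 'length' array bytes
    //   result = (w & 0x8080808080808080) ? 0 : length;     // any sign bit set => report 0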
5036 __ lsrv(rscratch2, rscratch2, rscratch1); 5037 __ tst(rscratch2, UPPER_BIT_MASK); 5038 __ csel(result, zr, result, Assembler::NE); 5039 __ leave(); 5040 __ ret(lr); 5041 __ bind(LEN_OVER_8); 5042 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 5043 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 5044 __ tst(rscratch2, UPPER_BIT_MASK); 5045 __ br(Assembler::NE, RET_NO_POP); 5046 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 5047 __ lsrv(rscratch1, rscratch1, rscratch2); 5048 __ tst(rscratch1, UPPER_BIT_MASK); 5049 __ bind(RET_NO_POP); 5050 __ csel(result, zr, result, Assembler::NE); 5051 __ leave(); 5052 __ ret(lr); 5053 5054 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 5055 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 5056 5057 count_positives_long = __ pc(); // 2nd entry point 5058 5059 __ enter(); 5060 5061 __ bind(LEN_OVER_15); 5062 __ push(spilled_regs, sp); 5063 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 5064 __ cbz(rscratch2, ALIGNED); 5065 __ ldp(tmp6, tmp1, Address(ary1)); 5066 __ mov(tmp5, 16); 5067 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 5068 __ add(ary1, ary1, rscratch1); 5069 __ orr(tmp6, tmp6, tmp1); 5070 __ tst(tmp6, UPPER_BIT_MASK); 5071 __ br(Assembler::NE, RET_ADJUST); 5072 __ sub(len, len, rscratch1); 5073 5074 __ bind(ALIGNED); 5075 __ cmp(len, large_loop_size); 5076 __ br(Assembler::LT, CHECK_16); 5077 // Perform 16-byte load as early return in pre-loop to handle situation 5078 // when initially aligned large array has negative values at starting bytes, 5079 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 5080 // slower. Cases with negative bytes further ahead won't be affected that 5081 // much. In fact, it'll be faster due to early loads, less instructions and 5082 // less branches in LARGE_LOOP. 5083 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 5084 __ sub(len, len, 16); 5085 __ orr(tmp6, tmp6, tmp1); 5086 __ tst(tmp6, UPPER_BIT_MASK); 5087 __ br(Assembler::NE, RET_ADJUST_16); 5088 __ cmp(len, large_loop_size); 5089 __ br(Assembler::LT, CHECK_16); 5090 5091 if (SoftwarePrefetchHintDistance >= 0 5092 && SoftwarePrefetchHintDistance >= dcache_line) { 5093 // initial prefetch 5094 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 5095 } 5096 __ bind(LARGE_LOOP); 5097 if (SoftwarePrefetchHintDistance >= 0) { 5098 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 5099 } 5100 // Issue load instructions first, since it can save few CPU/MEM cycles, also 5101 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 5102 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 5103 // instructions per cycle and have less branches, but this approach disables 5104 // early return, thus, all 64 bytes are loaded and checked every time. 
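    // One LARGE_LOOP iteration is therefore equivalent to this C sketch
    // (illustrative only): OR all eight 8-byte words together so that a single
    // sign-bit test covers the whole 64-byte block.
    //
    //   uint64_t m = 0;
    //   for (int i = 0; i < 8; i++)
    //     m |= ((uint64_t*)ary1)[i];
    //   ary1 += 64; len -= 64;
    //   if (m & UPPER_BIT_MASK) goto RET_ADJUST_LONG;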
5105 __ ldp(tmp2, tmp3, Address(ary1)); 5106 __ ldp(tmp4, tmp5, Address(ary1, 16)); 5107 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 5108 __ ldp(tmp6, tmp1, Address(ary1, 48)); 5109 __ add(ary1, ary1, large_loop_size); 5110 __ sub(len, len, large_loop_size); 5111 __ orr(tmp2, tmp2, tmp3); 5112 __ orr(tmp4, tmp4, tmp5); 5113 __ orr(rscratch1, rscratch1, rscratch2); 5114 __ orr(tmp6, tmp6, tmp1); 5115 __ orr(tmp2, tmp2, tmp4); 5116 __ orr(rscratch1, rscratch1, tmp6); 5117 __ orr(tmp2, tmp2, rscratch1); 5118 __ tst(tmp2, UPPER_BIT_MASK); 5119 __ br(Assembler::NE, RET_ADJUST_LONG); 5120 __ cmp(len, large_loop_size); 5121 __ br(Assembler::GE, LARGE_LOOP); 5122 5123 __ bind(CHECK_16); // small 16-byte load pre-loop 5124 __ cmp(len, (u1)16); 5125 __ br(Assembler::LT, POST_LOOP16); 5126 5127 __ bind(LOOP16); // small 16-byte load loop 5128 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 5129 __ sub(len, len, 16); 5130 __ orr(tmp2, tmp2, tmp3); 5131 __ tst(tmp2, UPPER_BIT_MASK); 5132 __ br(Assembler::NE, RET_ADJUST_16); 5133 __ cmp(len, (u1)16); 5134 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 5135 5136 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 5137 __ cmp(len, (u1)8); 5138 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 5139 __ ldr(tmp3, Address(__ post(ary1, 8))); 5140 __ tst(tmp3, UPPER_BIT_MASK); 5141 __ br(Assembler::NE, RET_ADJUST); 5142 __ sub(len, len, 8); 5143 5144 __ bind(POST_LOOP16_LOAD_TAIL); 5145 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 5146 __ ldr(tmp1, Address(ary1)); 5147 __ mov(tmp2, 64); 5148 __ sub(tmp4, tmp2, len, __ LSL, 3); 5149 __ lslv(tmp1, tmp1, tmp4); 5150 __ tst(tmp1, UPPER_BIT_MASK); 5151 __ br(Assembler::NE, RET_ADJUST); 5152 // Fallthrough 5153 5154 __ bind(RET_LEN); 5155 __ pop(spilled_regs, sp); 5156 __ leave(); 5157 __ ret(lr); 5158 5159 // difference result - len is the count of guaranteed to be 5160 // positive bytes 5161 5162 __ bind(RET_ADJUST_LONG); 5163 __ add(len, len, (u1)(large_loop_size - 16)); 5164 __ bind(RET_ADJUST_16); 5165 __ add(len, len, 16); 5166 __ bind(RET_ADJUST); 5167 __ pop(spilled_regs, sp); 5168 __ leave(); 5169 __ sub(result, result, len); 5170 __ ret(lr); 5171 5172 return entry; 5173 } 5174 5175 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 5176 bool usePrefetch, Label &NOT_EQUAL) { 5177 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5178 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5179 tmp7 = r12, tmp8 = r13; 5180 Label LOOP; 5181 5182 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5183 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5184 __ bind(LOOP); 5185 if (usePrefetch) { 5186 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5187 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5188 } 5189 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5190 __ eor(tmp1, tmp1, tmp2); 5191 __ eor(tmp3, tmp3, tmp4); 5192 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5193 __ orr(tmp1, tmp1, tmp3); 5194 __ cbnz(tmp1, NOT_EQUAL); 5195 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5196 __ eor(tmp5, tmp5, tmp6); 5197 __ eor(tmp7, tmp7, tmp8); 5198 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5199 __ orr(tmp5, tmp5, tmp7); 5200 __ cbnz(tmp5, NOT_EQUAL); 5201 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5202 __ eor(tmp1, tmp1, tmp2); 5203 __ eor(tmp3, tmp3, tmp4); 5204 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5205 __ orr(tmp1, tmp1, tmp3); 5206 __ 
cbnz(tmp1, NOT_EQUAL); 5207 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5208 __ eor(tmp5, tmp5, tmp6); 5209 __ sub(cnt1, cnt1, 8 * wordSize); 5210 __ eor(tmp7, tmp7, tmp8); 5211 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5212 // tmp6 is not used. MacroAssembler::subs is used here (rather than 5213 // cmp) because subs allows an unlimited range of immediate operand. 5214 __ subs(tmp6, cnt1, loopThreshold); 5215 __ orr(tmp5, tmp5, tmp7); 5216 __ cbnz(tmp5, NOT_EQUAL); 5217 __ br(__ GE, LOOP); 5218 // post-loop 5219 __ eor(tmp1, tmp1, tmp2); 5220 __ eor(tmp3, tmp3, tmp4); 5221 __ orr(tmp1, tmp1, tmp3); 5222 __ sub(cnt1, cnt1, 2 * wordSize); 5223 __ cbnz(tmp1, NOT_EQUAL); 5224 } 5225 5226 void generate_large_array_equals_loop_simd(int loopThreshold, 5227 bool usePrefetch, Label &NOT_EQUAL) { 5228 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5229 tmp2 = rscratch2; 5230 Label LOOP; 5231 5232 __ bind(LOOP); 5233 if (usePrefetch) { 5234 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5235 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5236 } 5237 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 5238 __ sub(cnt1, cnt1, 8 * wordSize); 5239 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 5240 __ subs(tmp1, cnt1, loopThreshold); 5241 __ eor(v0, __ T16B, v0, v4); 5242 __ eor(v1, __ T16B, v1, v5); 5243 __ eor(v2, __ T16B, v2, v6); 5244 __ eor(v3, __ T16B, v3, v7); 5245 __ orr(v0, __ T16B, v0, v1); 5246 __ orr(v1, __ T16B, v2, v3); 5247 __ orr(v0, __ T16B, v0, v1); 5248 __ umov(tmp1, v0, __ D, 0); 5249 __ umov(tmp2, v0, __ D, 1); 5250 __ orr(tmp1, tmp1, tmp2); 5251 __ cbnz(tmp1, NOT_EQUAL); 5252 __ br(__ GE, LOOP); 5253 } 5254 5255 // a1 = r1 - array1 address 5256 // a2 = r2 - array2 address 5257 // result = r0 - return value. Already contains "false" 5258 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 5259 // r3-r5 are reserved temporary registers 5260 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 5261 address generate_large_array_equals() { 5262 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5263 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5264 tmp7 = r12, tmp8 = r13; 5265 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 5266 SMALL_LOOP, POST_LOOP; 5267 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 5268 // calculate if at least 32 prefetched bytes are used 5269 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 5270 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 5271 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 5272 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 5273 tmp5, tmp6, tmp7, tmp8); 5274 5275 __ align(CodeEntryAlignment); 5276 5277 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 5278 5279 address entry = __ pc(); 5280 __ enter(); 5281 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 5282 // also advance pointers to use post-increment instead of pre-increment 5283 __ add(a1, a1, wordSize); 5284 __ add(a2, a2, wordSize); 5285 if (AvoidUnalignedAccesses) { 5286 // both implementations (SIMD/nonSIMD) are using relatively large load 5287 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 5288 // on some CPUs in case of address is not at least 16-byte aligned. 
// Arrays are currently only 8-byte aligned, so, when needed, we do one extra 5290 // 8-byte load so that at least the 1st array's address becomes 16-byte aligned. 5291 Label ALIGNED16; 5292 __ tbz(a1, 3, ALIGNED16); 5293 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5294 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5295 __ sub(cnt1, cnt1, wordSize); 5296 __ eor(tmp1, tmp1, tmp2); 5297 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 5298 __ bind(ALIGNED16); 5299 } 5300 if (UseSIMDForArrayEquals) { 5301 if (SoftwarePrefetchHintDistance >= 0) { 5302 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5303 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5304 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 5305 /* prfm = */ true, NOT_EQUAL); 5306 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5307 __ br(__ LT, TAIL); 5308 } 5309 __ bind(NO_PREFETCH_LARGE_LOOP); 5310 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 5311 /* prfm = */ false, NOT_EQUAL); 5312 } else { 5313 __ push(spilled_regs, sp); 5314 if (SoftwarePrefetchHintDistance >= 0) { 5315 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5316 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5317 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 5318 /* prfm = */ true, NOT_EQUAL); 5319 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5320 __ br(__ LT, TAIL); 5321 } 5322 __ bind(NO_PREFETCH_LARGE_LOOP); 5323 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 5324 /* prfm = */ false, NOT_EQUAL); 5325 } 5326 __ bind(TAIL); 5327 __ cbz(cnt1, EQUAL); 5328 __ subs(cnt1, cnt1, wordSize); 5329 __ br(__ LE, POST_LOOP); 5330 __ bind(SMALL_LOOP); 5331 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5332 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5333 __ subs(cnt1, cnt1, wordSize); 5334 __ eor(tmp1, tmp1, tmp2); 5335 __ cbnz(tmp1, NOT_EQUAL); 5336 __ br(__ GT, SMALL_LOOP); 5337 __ bind(POST_LOOP); 5338 __ ldr(tmp1, Address(a1, cnt1)); 5339 __ ldr(tmp2, Address(a2, cnt1)); 5340 __ eor(tmp1, tmp1, tmp2); 5341 __ cbnz(tmp1, NOT_EQUAL); 5342 __ bind(EQUAL); 5343 __ mov(result, true); 5344 __ bind(NOT_EQUAL); 5345 if (!UseSIMDForArrayEquals) { 5346 __ pop(spilled_regs, sp); 5347 } 5348 __ bind(NOT_EQUAL_NO_POP); 5349 __ leave(); 5350 __ ret(lr); 5351 return entry; 5352 } 5353 5354 // result = r0 - return value. Contains initial hashcode value on entry. 
5355 // ary = r1 - array address 5356 // cnt = r2 - elements count 5357 // Clobbers: v0-v13, rscratch1, rscratch2 5358 address generate_large_arrays_hashcode(BasicType eltype) { 5359 const Register result = r0, ary = r1, cnt = r2; 5360 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 5361 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 5362 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 5363 const FloatRegister vpowm = v13; 5364 5365 ARRAYS_HASHCODE_REGISTERS; 5366 5367 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 5368 5369 unsigned int vf; // vectorization factor 5370 bool multiply_by_halves; 5371 Assembler::SIMD_Arrangement load_arrangement; 5372 switch (eltype) { 5373 case T_BOOLEAN: 5374 case T_BYTE: 5375 load_arrangement = Assembler::T8B; 5376 multiply_by_halves = true; 5377 vf = 8; 5378 break; 5379 case T_CHAR: 5380 case T_SHORT: 5381 load_arrangement = Assembler::T8H; 5382 multiply_by_halves = true; 5383 vf = 8; 5384 break; 5385 case T_INT: 5386 load_arrangement = Assembler::T4S; 5387 multiply_by_halves = false; 5388 vf = 4; 5389 break; 5390 default: 5391 ShouldNotReachHere(); 5392 } 5393 5394 // Unroll factor 5395 const unsigned uf = 4; 5396 5397 // Effective vectorization factor 5398 const unsigned evf = vf * uf; 5399 5400 __ align(CodeEntryAlignment); 5401 5402 const char *mark_name = ""; 5403 switch (eltype) { 5404 case T_BOOLEAN: 5405 mark_name = "_large_arrays_hashcode_boolean"; 5406 break; 5407 case T_BYTE: 5408 mark_name = "_large_arrays_hashcode_byte"; 5409 break; 5410 case T_CHAR: 5411 mark_name = "_large_arrays_hashcode_char"; 5412 break; 5413 case T_SHORT: 5414 mark_name = "_large_arrays_hashcode_short"; 5415 break; 5416 case T_INT: 5417 mark_name = "_large_arrays_hashcode_int"; 5418 break; 5419 default: 5420 mark_name = "_large_arrays_hashcode_incorrect_type"; 5421 __ should_not_reach_here(); 5422 }; 5423 5424 StubCodeMark mark(this, "StubRoutines", mark_name); 5425 5426 address entry = __ pc(); 5427 __ enter(); 5428 5429 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 5430 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 5431 // value shouldn't change throughout both loops. 5432 __ movw(rscratch1, intpow(31U, 3)); 5433 __ mov(vpow, Assembler::S, 0, rscratch1); 5434 __ movw(rscratch1, intpow(31U, 2)); 5435 __ mov(vpow, Assembler::S, 1, rscratch1); 5436 __ movw(rscratch1, intpow(31U, 1)); 5437 __ mov(vpow, Assembler::S, 2, rscratch1); 5438 __ movw(rscratch1, intpow(31U, 0)); 5439 __ mov(vpow, Assembler::S, 3, rscratch1); 5440 5441 __ mov(vmul0, Assembler::T16B, 0); 5442 __ mov(vmul0, Assembler::S, 3, result); 5443 5444 __ andr(rscratch2, cnt, (uf - 1) * vf); 5445 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 5446 5447 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf)); 5448 __ mov(vpowm, Assembler::S, 0, rscratch1); 5449 5450 // SMALL LOOP 5451 __ bind(SMALL_LOOP); 5452 5453 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype)))); 5454 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 5455 __ subsw(rscratch2, rscratch2, vf); 5456 5457 if (load_arrangement == Assembler::T8B) { 5458 // Extend 8B to 8H to be able to use vector multiply 5459 // instructions 5460 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 5461 if (is_signed_subword_type(eltype)) { 5462 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5463 } else { 5464 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5465 } 5466 } 5467 5468 switch (load_arrangement) { 5469 case Assembler::T4S: 5470 __ addv(vmul0, load_arrangement, vmul0, vdata0); 5471 break; 5472 case Assembler::T8B: 5473 case Assembler::T8H: 5474 assert(is_subword_type(eltype), "subword type expected"); 5475 if (is_signed_subword_type(eltype)) { 5476 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5477 } else { 5478 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5479 } 5480 break; 5481 default: 5482 __ should_not_reach_here(); 5483 } 5484 5485 // Process the upper half of a vector 5486 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 5487 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 5488 if (is_signed_subword_type(eltype)) { 5489 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 5490 } else { 5491 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 5492 } 5493 } 5494 5495 __ br(Assembler::HI, SMALL_LOOP); 5496 5497 // SMALL LOOP'S EPILOGUE 5498 __ lsr(rscratch2, cnt, exact_log2(evf)); 5499 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER); 5500 5501 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 5502 __ addv(vmul0, Assembler::T4S, vmul0); 5503 __ umov(result, vmul0, Assembler::S, 0); 5504 5505 // TAIL 5506 __ bind(TAIL); 5507 5508 // The andr computes cnt % vf. The subtract of that value shifted left by 3 moves the branch target 5509 // past vf - 1 - (cnt % vf) load + madd pairs, i.e. only cnt % vf load + madd pairs are executed. 
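    // Scalar equivalent of the computed-branch tail below (illustrative only).
    // Each load + madd pair occupies 8 bytes of code, which is why the branch
    // target is BR_BASE minus (cnt % vf) * 8:
    //
    //   for (unsigned i = 0; i < cnt % vf; i++)
    //     result = 31 * result + *ary++;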
5510 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 5511 __ andr(rscratch2, cnt, vf - 1); 5512 __ bind(TAIL_SHORTCUT); 5513 __ adr(rscratch1, BR_BASE); 5514 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3); 5515 __ movw(rscratch2, 0x1f); 5516 __ br(rscratch1); 5517 5518 for (size_t i = 0; i < vf - 1; ++i) { 5519 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 5520 eltype); 5521 __ maddw(result, result, rscratch2, rscratch1); 5522 } 5523 __ bind(BR_BASE); 5524 5525 __ leave(); 5526 __ ret(lr); 5527 5528 // LARGE LOOP 5529 __ bind(LARGE_LOOP_PREHEADER); 5530 5531 __ lsr(rscratch2, cnt, exact_log2(evf)); 5532 5533 if (multiply_by_halves) { 5534 // 31^4 - multiplier between lower and upper parts of a register 5535 __ movw(rscratch1, intpow(31U, vf / 2)); 5536 __ mov(vpowm, Assembler::S, 1, rscratch1); 5537 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 5538 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 5539 __ mov(vpowm, Assembler::S, 0, rscratch1); 5540 } else { 5541 // 31^16 5542 __ movw(rscratch1, intpow(31U, evf)); 5543 __ mov(vpowm, Assembler::S, 0, rscratch1); 5544 } 5545 5546 __ mov(vmul3, Assembler::T16B, 0); 5547 __ mov(vmul2, Assembler::T16B, 0); 5548 __ mov(vmul1, Assembler::T16B, 0); 5549 5550 __ bind(LARGE_LOOP); 5551 5552 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 5553 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 5554 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 5555 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 5556 5557 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 5558 Address(__ post(ary, evf * type2aelembytes(eltype)))); 5559 5560 if (load_arrangement == Assembler::T8B) { 5561 // Extend 8B to 8H to be able to use vector multiply 5562 // instructions 5563 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 5564 if (is_signed_subword_type(eltype)) { 5565 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 5566 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 5567 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 5568 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5569 } else { 5570 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 5571 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 5572 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 5573 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5574 } 5575 } 5576 5577 switch (load_arrangement) { 5578 case Assembler::T4S: 5579 __ addv(vmul3, load_arrangement, vmul3, vdata3); 5580 __ addv(vmul2, load_arrangement, vmul2, vdata2); 5581 __ addv(vmul1, load_arrangement, vmul1, vdata1); 5582 __ addv(vmul0, load_arrangement, vmul0, vdata0); 5583 break; 5584 case Assembler::T8B: 5585 case Assembler::T8H: 5586 assert(is_subword_type(eltype), "subword type expected"); 5587 if (is_signed_subword_type(eltype)) { 5588 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 5589 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 5590 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 5591 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5592 } else { 5593 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 5594 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 5595 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 5596 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5597 } 5598 break; 5599 default: 5600 __ should_not_reach_here(); 
5601 } 5602 5603 // Process the upper half of a vector 5604 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 5605 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 5606 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 5607 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 5608 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 5609 if (is_signed_subword_type(eltype)) { 5610 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 5611 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 5612 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 5613 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 5614 } else { 5615 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 5616 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 5617 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 5618 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 5619 } 5620 } 5621 5622 __ subsw(rscratch2, rscratch2, 1); 5623 __ br(Assembler::HI, LARGE_LOOP); 5624 5625 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 5626 __ addv(vmul3, Assembler::T4S, vmul3); 5627 __ umov(result, vmul3, Assembler::S, 0); 5628 5629 __ mov(rscratch2, intpow(31U, vf)); 5630 5631 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 5632 __ addv(vmul2, Assembler::T4S, vmul2); 5633 __ umov(rscratch1, vmul2, Assembler::S, 0); 5634 __ maddw(result, result, rscratch2, rscratch1); 5635 5636 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 5637 __ addv(vmul1, Assembler::T4S, vmul1); 5638 __ umov(rscratch1, vmul1, Assembler::S, 0); 5639 __ maddw(result, result, rscratch2, rscratch1); 5640 5641 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 5642 __ addv(vmul0, Assembler::T4S, vmul0); 5643 __ umov(rscratch1, vmul0, Assembler::S, 0); 5644 __ maddw(result, result, rscratch2, rscratch1); 5645 5646 __ andr(rscratch2, cnt, vf - 1); 5647 __ cbnz(rscratch2, TAIL_SHORTCUT); 5648 5649 __ leave(); 5650 __ ret(lr); 5651 5652 return entry; 5653 } 5654 5655 address generate_dsin_dcos(bool isCos) { 5656 __ align(CodeEntryAlignment); 5657 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 5658 address start = __ pc(); 5659 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5660 (address)StubRoutines::aarch64::_two_over_pi, 5661 (address)StubRoutines::aarch64::_pio2, 5662 (address)StubRoutines::aarch64::_dsin_coef, 5663 (address)StubRoutines::aarch64::_dcos_coef); 5664 return start; 5665 } 5666 5667 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5668 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5669 Label &DIFF2) { 5670 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5671 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5672 5673 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5674 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5675 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5676 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5677 5678 __ fmovd(tmpL, vtmp3); 5679 __ eor(rscratch2, tmp3, tmpL); 5680 __ cbnz(rscratch2, DIFF2); 5681 5682 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5683 __ umov(tmpL, vtmp3, __ D, 1); 5684 __ eor(rscratch2, tmpU, tmpL); 5685 __ cbnz(rscratch2, DIFF1); 5686 5687 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5688 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5689 __ fmovd(tmpL, vtmp); 5690 __ eor(rscratch2, tmp3, tmpL); 5691 __ cbnz(rscratch2, DIFF2); 5692 5693 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5694 __ umov(tmpL, vtmp, __ D, 1); 5695 __ eor(rscratch2, tmpU, tmpL); 5696 __ cbnz(rscratch2, DIFF1); 5697 } 5698 5699 // r0 = result 5700 // r1 = str1 5701 // r2 = cnt1 5702 // r3 = str2 5703 // r4 = cnt2 5704 // r10 = tmp1 5705 // r11 = tmp2 5706 address generate_compare_long_string_different_encoding(bool isLU) { 5707 __ align(CodeEntryAlignment); 5708 StubCodeMark mark(this, "StubRoutines", isLU 5709 ? "compare_long_string_different_encoding LU" 5710 : "compare_long_string_different_encoding UL"); 5711 address entry = __ pc(); 5712 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5713 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5714 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5715 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5716 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5717 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5718 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5719 5720 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5721 5722 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5723 // cnt2 == amount of characters left to compare 5724 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5725 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5726 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5727 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5728 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5729 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5730 __ eor(rscratch2, tmp1, tmp2); 5731 __ mov(rscratch1, tmp2); 5732 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5733 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 5734 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5735 __ push(spilled_regs, sp); 5736 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5737 __ mov(cnt1, isLU ? 
str2 : str1); // init the pointer to U next load 5738 5739 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5740 5741 if (SoftwarePrefetchHintDistance >= 0) { 5742 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5743 __ br(__ LT, NO_PREFETCH); 5744 __ bind(LARGE_LOOP_PREFETCH); 5745 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 5746 __ mov(tmp4, 2); 5747 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5748 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 5749 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5750 __ subs(tmp4, tmp4, 1); 5751 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 5752 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5753 __ mov(tmp4, 2); 5754 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 5755 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5756 __ subs(tmp4, tmp4, 1); 5757 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 5758 __ sub(cnt2, cnt2, 64); 5759 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5760 __ br(__ GE, LARGE_LOOP_PREFETCH); 5761 } 5762 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 5763 __ bind(NO_PREFETCH); 5764 __ subs(cnt2, cnt2, 16); 5765 __ br(__ LT, TAIL); 5766 __ align(OptoLoopAlignment); 5767 __ bind(SMALL_LOOP); // smaller loop 5768 __ subs(cnt2, cnt2, 16); 5769 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5770 __ br(__ GE, SMALL_LOOP); 5771 __ cmn(cnt2, (u1)16); 5772 __ br(__ EQ, LOAD_LAST); 5773 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 5774 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 5775 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 5776 __ ldr(tmp3, Address(cnt1, -8)); 5777 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 5778 __ b(LOAD_LAST); 5779 __ bind(DIFF2); 5780 __ mov(tmpU, tmp3); 5781 __ bind(DIFF1); 5782 __ pop(spilled_regs, sp); 5783 __ b(CALCULATE_DIFFERENCE); 5784 __ bind(LOAD_LAST); 5785 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 5786 // No need to load it again 5787 __ mov(tmpU, tmp3); 5788 __ pop(spilled_regs, sp); 5789 5790 // tmp2 points to the address of the last 4 Latin1 characters right now 5791 __ ldrs(vtmp, Address(tmp2)); 5792 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5793 __ fmovd(tmpL, vtmp); 5794 5795 __ eor(rscratch2, tmpU, tmpL); 5796 __ cbz(rscratch2, DONE); 5797 5798 // Find the first different characters in the longwords and 5799 // compute their difference. 
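    // Roughly, in C (illustrative only; x holds str1's characters, y holds
    // str2's, both already widened to UTF-16, and x != y at this point):
    //
    //   int bit = __builtin_clzll(__builtin_bswap64(x ^ y)) & ~15;  // first differing char
    //   return (uint16_t)(x >> bit) - (uint16_t)(y >> bit);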
5800 __ bind(CALCULATE_DIFFERENCE); 5801 __ rev(rscratch2, rscratch2); 5802 __ clz(rscratch2, rscratch2); 5803 __ andr(rscratch2, rscratch2, -16); 5804 __ lsrv(tmp1, tmp1, rscratch2); 5805 __ uxthw(tmp1, tmp1); 5806 __ lsrv(rscratch1, rscratch1, rscratch2); 5807 __ uxthw(rscratch1, rscratch1); 5808 __ subw(result, tmp1, rscratch1); 5809 __ bind(DONE); 5810 __ ret(lr); 5811 return entry; 5812 } 5813 5814 // r0 = input (float16) 5815 // v0 = result (float) 5816 // v1 = temporary float register 5817 address generate_float16ToFloat() { 5818 __ align(CodeEntryAlignment); 5819 StubCodeMark mark(this, "StubRoutines", "float16ToFloat"); 5820 address entry = __ pc(); 5821 BLOCK_COMMENT("Entry:"); 5822 __ flt16_to_flt(v0, r0, v1); 5823 __ ret(lr); 5824 return entry; 5825 } 5826 5827 // v0 = input (float) 5828 // r0 = result (float16) 5829 // v1 = temporary float register 5830 address generate_floatToFloat16() { 5831 __ align(CodeEntryAlignment); 5832 StubCodeMark mark(this, "StubRoutines", "floatToFloat16"); 5833 address entry = __ pc(); 5834 BLOCK_COMMENT("Entry:"); 5835 __ flt_to_flt16(r0, v0, v1); 5836 __ ret(lr); 5837 return entry; 5838 } 5839 5840 address generate_method_entry_barrier() { 5841 __ align(CodeEntryAlignment); 5842 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 5843 5844 Label deoptimize_label; 5845 5846 address start = __ pc(); 5847 5848 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 5849 5850 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 5851 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 5852 // We can get here despite the nmethod being good, if we have not 5853 // yet applied our cross modification fence (or data fence). 5854 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 5855 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 5856 __ ldrw(rscratch2, rscratch2); 5857 __ strw(rscratch2, thread_epoch_addr); 5858 __ isb(); 5859 __ membar(__ LoadLoad); 5860 } 5861 5862 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 5863 5864 __ enter(); 5865 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 5866 5867 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 5868 5869 __ push_call_clobbered_registers(); 5870 5871 __ mov(c_rarg0, rscratch2); 5872 __ call_VM_leaf 5873 (CAST_FROM_FN_PTR 5874 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 5875 5876 __ reset_last_Java_frame(true); 5877 5878 __ mov(rscratch1, r0); 5879 5880 __ pop_call_clobbered_registers(); 5881 5882 __ cbnz(rscratch1, deoptimize_label); 5883 5884 __ leave(); 5885 __ ret(lr); 5886 5887 __ BIND(deoptimize_label); 5888 5889 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 5890 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 5891 5892 __ mov(sp, rscratch1); 5893 __ br(rscratch2); 5894 5895 return start; 5896 } 5897 5898 // r0 = result 5899 // r1 = str1 5900 // r2 = cnt1 5901 // r3 = str2 5902 // r4 = cnt2 5903 // r10 = tmp1 5904 // r11 = tmp2 5905 address generate_compare_long_string_same_encoding(bool isLL) { 5906 __ align(CodeEntryAlignment); 5907 StubCodeMark mark(this, "StubRoutines", isLL 5908 ? 
"compare_long_string_same_encoding LL" 5909 : "compare_long_string_same_encoding UU"); 5910 address entry = __ pc(); 5911 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5912 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 5913 5914 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 5915 5916 // exit from large loop when less than 64 bytes left to read or we're about 5917 // to prefetch memory behind array border 5918 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 5919 5920 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 5921 __ eor(rscratch2, tmp1, tmp2); 5922 __ cbnz(rscratch2, CAL_DIFFERENCE); 5923 5924 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 5925 // update pointers, because of previous read 5926 __ add(str1, str1, wordSize); 5927 __ add(str2, str2, wordSize); 5928 if (SoftwarePrefetchHintDistance >= 0) { 5929 __ align(OptoLoopAlignment); 5930 __ bind(LARGE_LOOP_PREFETCH); 5931 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 5932 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 5933 5934 for (int i = 0; i < 4; i++) { 5935 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 5936 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 5937 __ cmp(tmp1, tmp2); 5938 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5939 __ br(Assembler::NE, DIFF); 5940 } 5941 __ sub(cnt2, cnt2, isLL ? 64 : 32); 5942 __ add(str1, str1, 64); 5943 __ add(str2, str2, 64); 5944 __ subs(rscratch2, cnt2, largeLoopExitCondition); 5945 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 5946 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 5947 } 5948 5949 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 5950 __ br(Assembler::LE, LESS16); 5951 __ align(OptoLoopAlignment); 5952 __ bind(LOOP_COMPARE16); 5953 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5954 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5955 __ cmp(tmp1, tmp2); 5956 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5957 __ br(Assembler::NE, DIFF); 5958 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5959 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5960 __ br(Assembler::LT, LESS16); 5961 5962 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5963 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5964 __ cmp(tmp1, tmp2); 5965 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5966 __ br(Assembler::NE, DIFF); 5967 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5968 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5969 __ br(Assembler::GE, LOOP_COMPARE16); 5970 __ cbz(cnt2, LENGTH_DIFF); 5971 5972 __ bind(LESS16); 5973 // each 8 compare 5974 __ subs(cnt2, cnt2, isLL ? 8 : 4); 5975 __ br(Assembler::LE, LESS8); 5976 __ ldr(tmp1, Address(__ post(str1, 8))); 5977 __ ldr(tmp2, Address(__ post(str2, 8))); 5978 __ eor(rscratch2, tmp1, tmp2); 5979 __ cbnz(rscratch2, CAL_DIFFERENCE); 5980 __ sub(cnt2, cnt2, isLL ? 8 : 4); 5981 5982 __ bind(LESS8); // directly load last 8 bytes 5983 if (!isLL) { 5984 __ add(cnt2, cnt2, cnt2); 5985 } 5986 __ ldr(tmp1, Address(str1, cnt2)); 5987 __ ldr(tmp2, Address(str2, cnt2)); 5988 __ eor(rscratch2, tmp1, tmp2); 5989 __ cbz(rscratch2, LENGTH_DIFF); 5990 __ b(CAL_DIFFERENCE); 5991 5992 __ bind(DIFF); 5993 __ cmp(tmp1, tmp2); 5994 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 5995 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 5996 // reuse rscratch2 register for the result of eor instruction 5997 __ eor(rscratch2, tmp1, tmp2); 5998 5999 __ bind(CAL_DIFFERENCE); 6000 __ rev(rscratch2, rscratch2); 6001 __ clz(rscratch2, rscratch2); 6002 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 6003 __ lsrv(tmp1, tmp1, rscratch2); 6004 __ lsrv(tmp2, tmp2, rscratch2); 6005 if (isLL) { 6006 __ uxtbw(tmp1, tmp1); 6007 __ uxtbw(tmp2, tmp2); 6008 } else { 6009 __ uxthw(tmp1, tmp1); 6010 __ uxthw(tmp2, tmp2); 6011 } 6012 __ subw(result, tmp1, tmp2); 6013 6014 __ bind(LENGTH_DIFF); 6015 __ ret(lr); 6016 return entry; 6017 } 6018 6019 enum string_compare_mode { 6020 LL, 6021 LU, 6022 UL, 6023 UU, 6024 }; 6025 6026 // The following registers are declared in aarch64.ad 6027 // r0 = result 6028 // r1 = str1 6029 // r2 = cnt1 6030 // r3 = str2 6031 // r4 = cnt2 6032 // r10 = tmp1 6033 // r11 = tmp2 6034 // z0 = ztmp1 6035 // z1 = ztmp2 6036 // p0 = pgtmp1 6037 // p1 = pgtmp2 6038 address generate_compare_long_string_sve(string_compare_mode mode) { 6039 __ align(CodeEntryAlignment); 6040 address entry = __ pc(); 6041 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 6042 tmp1 = r10, tmp2 = r11; 6043 6044 Label LOOP, DONE, MISMATCH; 6045 Register vec_len = tmp1; 6046 Register idx = tmp2; 6047 // The minimum of the string lengths has been stored in cnt2. 6048 Register cnt = cnt2; 6049 FloatRegister ztmp1 = z0, ztmp2 = z1; 6050 PRegister pgtmp1 = p0, pgtmp2 = p1; 6051 6052 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 6053 switch (mode) { \ 6054 case LL: \ 6055 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 6056 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 6057 break; \ 6058 case LU: \ 6059 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 6060 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 6061 break; \ 6062 case UL: \ 6063 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 6064 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 6065 break; \ 6066 case UU: \ 6067 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 6068 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 6069 break; \ 6070 default: \ 6071 ShouldNotReachHere(); \ 6072 } 6073 6074 const char* stubname; 6075 switch (mode) { 6076 case LL: stubname = "compare_long_string_same_encoding LL"; break; 6077 case LU: stubname = "compare_long_string_different_encoding LU"; break; 6078 case UL: stubname = "compare_long_string_different_encoding UL"; break; 6079 case UU: stubname = "compare_long_string_same_encoding UU"; break; 6080 default: ShouldNotReachHere(); 6081 } 6082 6083 StubCodeMark mark(this, "StubRoutines", stubname); 6084 6085 __ mov(idx, 0); 6086 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 6087 6088 if (mode == LL) { 6089 __ sve_cntb(vec_len); 6090 } else { 6091 __ sve_cnth(vec_len); 6092 } 6093 6094 __ sub(rscratch1, cnt, vec_len); 6095 6096 __ bind(LOOP); 6097 6098 // main loop 6099 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 6100 __ add(idx, idx, vec_len); 6101 // Compare strings. 6102 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 6103 __ br(__ NE, MISMATCH); 6104 __ cmp(idx, rscratch1); 6105 __ br(__ LT, LOOP); 6106 6107 // post loop, last iteration 6108 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 6109 6110 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 6111 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 6112 __ br(__ EQ, DONE); 6113 6114 __ bind(MISMATCH); 6115 6116 // Crop the vector to find its location. 6117 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 6118 // Extract the first different characters of each string. 
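    // BRKB has left pgtmp2 active only for the lanes strictly before the first
    // mismatch, and LASTA reads the element just past the last active lane,
    // i.e. the first differing character itself (lane 0 when the very first
    // lane already differs). In scalar terms, roughly:
    //
    //   while (i < n && s1[i] == s2[i]) i++;
    //   result = s1[i] - s2[i];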
6119 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 6120 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 6121 6122 // Compute the difference of the first different characters. 6123 __ sub(result, rscratch1, rscratch2); 6124 6125 __ bind(DONE); 6126 __ ret(lr); 6127 #undef LOAD_PAIR 6128 return entry; 6129 } 6130 6131 void generate_compare_long_strings() { 6132 if (UseSVE == 0) { 6133 StubRoutines::aarch64::_compare_long_string_LL 6134 = generate_compare_long_string_same_encoding(true); 6135 StubRoutines::aarch64::_compare_long_string_UU 6136 = generate_compare_long_string_same_encoding(false); 6137 StubRoutines::aarch64::_compare_long_string_LU 6138 = generate_compare_long_string_different_encoding(true); 6139 StubRoutines::aarch64::_compare_long_string_UL 6140 = generate_compare_long_string_different_encoding(false); 6141 } else { 6142 StubRoutines::aarch64::_compare_long_string_LL 6143 = generate_compare_long_string_sve(LL); 6144 StubRoutines::aarch64::_compare_long_string_UU 6145 = generate_compare_long_string_sve(UU); 6146 StubRoutines::aarch64::_compare_long_string_LU 6147 = generate_compare_long_string_sve(LU); 6148 StubRoutines::aarch64::_compare_long_string_UL 6149 = generate_compare_long_string_sve(UL); 6150 } 6151 } 6152 6153 // R0 = result 6154 // R1 = str2 6155 // R2 = cnt1 6156 // R3 = str1 6157 // R4 = cnt2 6158 // Clobbers: rscratch1, rscratch2, v0, v1, rflags 6159 // 6160 // This generic linear code uses a few additional ideas which make it faster: 6161 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8) 6162 // in order to skip the initial load (helps on systems with 1 load pipeline) 6163 // 2) we can use a "fast" algorithm for finding a single character to search for the 6164 // first symbol with fewer branches (1 branch per loaded register instead of a 6165 // branch per symbol); this is where constants like 6166 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from 6167 // 3) after loading and analyzing the 1st register of the source string, it can be 6168 // used to search for every entry of the 1st character, saving a few loads compared 6169 // with a simpler-but-slower implementation 6170 // 4) in order to avoid lots of push/pop operations, the code below heavily 6171 // re-uses/re-initializes/compresses register values, which makes the code 6172 // larger and a bit less readable; however, most of the extra operations are 6173 // issued during loads or branches, so the penalty is minimal 6174 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 6175 const char* stubName = str1_isL 6176 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 6177 : "indexof_linear_uu"; 6178 __ align(CodeEntryAlignment); 6179 StubCodeMark mark(this, "StubRoutines", stubName); 6180 address entry = __ pc(); 6181 6182 int str1_chr_size = str1_isL ? 1 : 2; 6183 int str2_chr_size = str2_isL ? 1 : 2; 6184 int str1_chr_shift = str1_isL ? 0 : 1; 6185 int str2_chr_shift = str2_isL ? 
0 : 1; 6186 bool isL = str1_isL && str2_isL; 6187 // parameters 6188 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 6189 // temporary registers 6190 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 6191 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 6192 // redefinitions 6193 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 6194 6195 __ push(spilled_regs, sp); 6196 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 6197 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 6198 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 6199 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 6200 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 6201 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 6202 // Read whole register from str1. It is safe, because length >=8 here 6203 __ ldr(ch1, Address(str1)); 6204 // Read whole register from str2. It is safe, because length >=8 here 6205 __ ldr(ch2, Address(str2)); 6206 __ sub(cnt2, cnt2, cnt1); 6207 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 6208 if (str1_isL != str2_isL) { 6209 __ eor(v0, __ T16B, v0, v0); 6210 } 6211 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 6212 __ mul(first, first, tmp1); 6213 // check if we have less than 1 register to check 6214 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 6215 if (str1_isL != str2_isL) { 6216 __ fmovd(v1, ch1); 6217 } 6218 __ br(__ LE, L_SMALL); 6219 __ eor(ch2, first, ch2); 6220 if (str1_isL != str2_isL) { 6221 __ zip1(v1, __ T16B, v1, v0); 6222 } 6223 __ sub(tmp2, ch2, tmp1); 6224 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6225 __ bics(tmp2, tmp2, ch2); 6226 if (str1_isL != str2_isL) { 6227 __ fmovd(ch1, v1); 6228 } 6229 __ br(__ NE, L_HAS_ZERO); 6230 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 6231 __ add(result, result, wordSize/str2_chr_size); 6232 __ add(str2, str2, wordSize); 6233 __ br(__ LT, L_POST_LOOP); 6234 __ BIND(L_LOOP); 6235 __ ldr(ch2, Address(str2)); 6236 __ eor(ch2, first, ch2); 6237 __ sub(tmp2, ch2, tmp1); 6238 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6239 __ bics(tmp2, tmp2, ch2); 6240 __ br(__ NE, L_HAS_ZERO); 6241 __ BIND(L_LOOP_PROCEED); 6242 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 6243 __ add(str2, str2, wordSize); 6244 __ add(result, result, wordSize/str2_chr_size); 6245 __ br(__ GE, L_LOOP); 6246 __ BIND(L_POST_LOOP); 6247 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 6248 __ br(__ LE, NOMATCH); 6249 __ ldr(ch2, Address(str2)); 6250 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 6251 __ eor(ch2, first, ch2); 6252 __ sub(tmp2, ch2, tmp1); 6253 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6254 __ mov(tmp4, -1); // all bits set 6255 __ b(L_SMALL_PROCEED); 6256 __ align(OptoLoopAlignment); 6257 __ BIND(L_SMALL); 6258 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 6259 __ eor(ch2, first, ch2); 6260 if (str1_isL != str2_isL) { 6261 __ zip1(v1, __ T16B, v1, v0); 6262 } 6263 __ sub(tmp2, ch2, tmp1); 6264 __ mov(tmp4, -1); // all bits set 6265 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6266 if (str1_isL != str2_isL) { 6267 __ fmovd(ch1, v1); // move converted 4 symbols 6268 } 6269 __ BIND(L_SMALL_PROCEED); 6270 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
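    // The bic/ands below finish the classic byte-wise "find zero" trick
    // (illustrative C for the LL case; UU/UL use the 16-bit constants
    // 0x0001000100010001 and 0x7fff7fff7fff7fff instead):
    //
    //   uint64_t v    = loaded_str2 ^ repeated_first_char;   // zero byte == match
    //   uint64_t mask = (v - 0x0101010101010101) & ~v & 0x8080808080808080;
    //   mask &= tmp4;                // ignore positions beyond the remaining length
    //   if (mask == 0) goto NOMATCH;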
6271 __ bic(tmp2, tmp2, ch2); 6272 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 6273 __ rbit(tmp2, tmp2); 6274 __ br(__ EQ, NOMATCH); 6275 __ BIND(L_SMALL_HAS_ZERO_LOOP); 6276 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 6277 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 6278 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 6279 if (str2_isL) { // LL 6280 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 6281 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 6282 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 6283 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 6284 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6285 } else { 6286 __ mov(ch2, 0xE); // all bits in byte set except last one 6287 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6288 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6289 __ lslv(tmp2, tmp2, tmp4); 6290 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6291 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6292 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6293 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6294 } 6295 __ cmp(ch1, ch2); 6296 __ mov(tmp4, wordSize/str2_chr_size); 6297 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6298 __ BIND(L_SMALL_CMP_LOOP); 6299 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 6300 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 6301 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 6302 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 6303 __ add(tmp4, tmp4, 1); 6304 __ cmp(tmp4, cnt1); 6305 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 6306 __ cmp(first, ch2); 6307 __ br(__ EQ, L_SMALL_CMP_LOOP); 6308 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 6309 __ cbz(tmp2, NOMATCH); // no more matches. exit 6310 __ clz(tmp4, tmp2); 6311 __ add(result, result, 1); // advance index 6312 __ add(str2, str2, str2_chr_size); // advance pointer 6313 __ b(L_SMALL_HAS_ZERO_LOOP); 6314 __ align(OptoLoopAlignment); 6315 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 6316 __ cmp(first, ch2); 6317 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6318 __ b(DONE); 6319 __ align(OptoLoopAlignment); 6320 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 6321 if (str2_isL) { // LL 6322 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 6323 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 6324 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 6325 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 6326 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6327 } else { 6328 __ mov(ch2, 0xE); // all bits in byte set except last one 6329 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6330 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6331 __ lslv(tmp2, tmp2, tmp4); 6332 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6333 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6334 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6335 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6336 } 6337 __ cmp(ch1, ch2); 6338 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6339 __ b(DONE); 6340 __ align(OptoLoopAlignment); 6341 __ BIND(L_HAS_ZERO); 6342 __ rbit(tmp2, tmp2); 6343 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 6344 // Now, perform compression of counters(cnt2 and cnt1) into one register. 6345 // It's fine because both counters are 32bit and are not changed in this 6346 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 6347 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 6348 __ sub(result, result, 1); 6349 __ BIND(L_HAS_ZERO_LOOP); 6350 __ mov(cnt1, wordSize/str2_chr_size); 6351 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6352 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 6353 if (str2_isL) { 6354 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6355 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6356 __ lslv(tmp2, tmp2, tmp4); 6357 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6358 __ add(tmp4, tmp4, 1); 6359 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6360 __ lsl(tmp2, tmp2, 1); 6361 __ mov(tmp4, wordSize/str2_chr_size); 6362 } else { 6363 __ mov(ch2, 0xE); 6364 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6365 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6366 __ lslv(tmp2, tmp2, tmp4); 6367 __ add(tmp4, tmp4, 1); 6368 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6369 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6370 __ lsl(tmp2, tmp2, 1); 6371 __ mov(tmp4, wordSize/str2_chr_size); 6372 __ sub(str2, str2, str2_chr_size); 6373 } 6374 __ cmp(ch1, ch2); 6375 __ mov(tmp4, wordSize/str2_chr_size); 6376 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6377 __ BIND(L_CMP_LOOP); 6378 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 6379 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 6380 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 6381 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 6382 __ add(tmp4, tmp4, 1); 6383 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6384 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 6385 __ cmp(cnt1, ch2); 6386 __ br(__ EQ, L_CMP_LOOP); 6387 __ BIND(L_CMP_LOOP_NOMATCH); 6388 // here we're not matched 6389 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 6390 __ clz(tmp4, tmp2); 6391 __ add(str2, str2, str2_chr_size); // advance pointer 6392 __ b(L_HAS_ZERO_LOOP); 6393 __ align(OptoLoopAlignment); 6394 __ BIND(L_CMP_LOOP_LAST_CMP); 6395 __ cmp(cnt1, ch2); 6396 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6397 __ b(DONE); 6398 __ align(OptoLoopAlignment); 6399 __ BIND(L_CMP_LOOP_LAST_CMP2); 6400 if (str2_isL) { 6401 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6402 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6403 __ lslv(tmp2, tmp2, tmp4); 6404 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6405 __ add(tmp4, tmp4, 1); 6406 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6407 __ lsl(tmp2, tmp2, 1); 6408 } else { 6409 __ mov(ch2, 0xE); 6410 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6411 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
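    // tmp4 still holds the bit index of the current candidate within the
    // bit-reversed match mask; the lslv below plus the extra lsl by 1 retire
    // this candidate from tmp2, so a later clz would find the next candidate,
    // while result and str2 are advanced to the current candidate's position.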
6412 __ lslv(tmp2, tmp2, tmp4); 6413 __ add(tmp4, tmp4, 1); 6414 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6415 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6416 __ lsl(tmp2, tmp2, 1); 6417 __ sub(str2, str2, str2_chr_size); 6418 } 6419 __ cmp(ch1, ch2); 6420 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6421 __ b(DONE); 6422 __ align(OptoLoopAlignment); 6423 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 6424 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 6425 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 6426 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 6427 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 6428 // result by analyzed characters value, so, we can just reset lower bits 6429 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 6430 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 6431 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 6432 // index of last analyzed substring inside current octet. So, str2 in at 6433 // respective start address. We need to advance it to next octet 6434 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 6435 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 6436 __ bfm(result, zr, 0, 2 - str2_chr_shift); 6437 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 6438 __ movw(cnt2, cnt2); 6439 __ b(L_LOOP_PROCEED); 6440 __ align(OptoLoopAlignment); 6441 __ BIND(NOMATCH); 6442 __ mov(result, -1); 6443 __ BIND(DONE); 6444 __ pop(spilled_regs, sp); 6445 __ ret(lr); 6446 return entry; 6447 } 6448 6449 void generate_string_indexof_stubs() { 6450 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 6451 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 6452 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 6453 } 6454 6455 void inflate_and_store_2_fp_registers(bool generatePrfm, 6456 FloatRegister src1, FloatRegister src2) { 6457 Register dst = r1; 6458 __ zip1(v1, __ T16B, src1, v0); 6459 __ zip2(v2, __ T16B, src1, v0); 6460 if (generatePrfm) { 6461 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 6462 } 6463 __ zip1(v3, __ T16B, src2, v0); 6464 __ zip2(v4, __ T16B, src2, v0); 6465 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 6466 } 6467 6468 // R0 = src 6469 // R1 = dst 6470 // R2 = len 6471 // R3 = len >> 3 6472 // V0 = 0 6473 // v1 = loaded 8 bytes 6474 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 6475 address generate_large_byte_array_inflate() { 6476 __ align(CodeEntryAlignment); 6477 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 6478 address entry = __ pc(); 6479 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 6480 Register src = r0, dst = r1, len = r2, octetCounter = r3; 6481 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 6482 6483 // do one more 8-byte read to have address 16-byte aligned in most cases 6484 // also use single store instruction 6485 __ ldrd(v2, __ post(src, 8)); 6486 __ sub(octetCounter, octetCounter, 2); 6487 __ zip1(v1, __ T16B, v1, v0); 6488 __ zip1(v2, __ T16B, v2, v0); 6489 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 6490 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6491 __ subs(rscratch1, octetCounter, large_loop_threshold); 6492 __ br(__ LE, LOOP_START); 6493 __ b(LOOP_PRFM_START); 6494 __ bind(LOOP_PRFM); 
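    // Each LOOP_PRFM / LOOP iteration inflates 64 Latin-1 bytes into 64 UTF-16
    // chars (128 output bytes) by interleaving the source bytes with zeroes
    // from v0 (zip1/zip2). Roughly, in C (illustrative only):
    //
    //   for (int i = 0; i < 64; i++)
    //     dst[i] = (jchar)(src[i] & 0xff);  // jchar* dst, jbyte* src
    //   src += 64; dst += 64; octetCounter -= 8;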
6495 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6496 __ bind(LOOP_PRFM_START); 6497 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 6498 __ sub(octetCounter, octetCounter, 8); 6499 __ subs(rscratch1, octetCounter, large_loop_threshold); 6500 inflate_and_store_2_fp_registers(true, v3, v4); 6501 inflate_and_store_2_fp_registers(true, v5, v6); 6502 __ br(__ GT, LOOP_PRFM); 6503 __ cmp(octetCounter, (u1)8); 6504 __ br(__ LT, DONE); 6505 __ bind(LOOP); 6506 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6507 __ bind(LOOP_START); 6508 __ sub(octetCounter, octetCounter, 8); 6509 __ cmp(octetCounter, (u1)8); 6510 inflate_and_store_2_fp_registers(false, v3, v4); 6511 inflate_and_store_2_fp_registers(false, v5, v6); 6512 __ br(__ GE, LOOP); 6513 __ bind(DONE); 6514 __ ret(lr); 6515 return entry; 6516 } 6517 6518 /** 6519 * Arguments: 6520 * 6521 * Input: 6522 * c_rarg0 - current state address 6523 * c_rarg1 - H key address 6524 * c_rarg2 - data address 6525 * c_rarg3 - number of blocks 6526 * 6527 * Output: 6528 * Updated state at c_rarg0 6529 */ 6530 address generate_ghash_processBlocks() { 6531 // Bafflingly, GCM uses little-endian for the byte order, but 6532 // big-endian for the bit order. For example, the polynomial 1 is 6533 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 6534 // 6535 // So, we must either reverse the bytes in each word and do 6536 // everything big-endian or reverse the bits in each byte and do 6537 // it little-endian. On AArch64 it's more idiomatic to reverse 6538 // the bits in each byte (we have an instruction, RBIT, to do 6539 // that) and keep the data in little-endian bit order through the 6540 // calculation, bit-reversing the inputs and outputs. 6541 6542 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 6543 __ align(wordSize * 2); 6544 address p = __ pc(); 6545 __ emit_int64(0x87); // The low-order bits of the field 6546 // polynomial (i.e. 
p = z^7+z^2+z+1) 6547 // repeated in the low and high parts of a 6548 // 128-bit vector 6549 __ emit_int64(0x87); 6550 6551 __ align(CodeEntryAlignment); 6552 address start = __ pc(); 6553 6554 Register state = c_rarg0; 6555 Register subkeyH = c_rarg1; 6556 Register data = c_rarg2; 6557 Register blocks = c_rarg3; 6558 6559 FloatRegister vzr = v30; 6560 __ eor(vzr, __ T16B, vzr, vzr); // zero register 6561 6562 __ ldrq(v24, p); // The field polynomial 6563 6564 __ ldrq(v0, Address(state)); 6565 __ ldrq(v1, Address(subkeyH)); 6566 6567 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 6568 __ rbit(v0, __ T16B, v0); 6569 __ rev64(v1, __ T16B, v1); 6570 __ rbit(v1, __ T16B, v1); 6571 6572 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 6573 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 6574 6575 { 6576 Label L_ghash_loop; 6577 __ bind(L_ghash_loop); 6578 6579 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 6580 // reversing each byte 6581 __ rbit(v2, __ T16B, v2); 6582 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 6583 6584 // Multiply state in v2 by subkey in v1 6585 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 6586 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 6587 /*temps*/v6, v3, /*reuse/clobber b*/v2); 6588 // Reduce v7:v5 by the field polynomial 6589 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 6590 6591 __ sub(blocks, blocks, 1); 6592 __ cbnz(blocks, L_ghash_loop); 6593 } 6594 6595 // The bit-reversed result is at this point in v0 6596 __ rev64(v0, __ T16B, v0); 6597 __ rbit(v0, __ T16B, v0); 6598 6599 __ st1(v0, __ T16B, state); 6600 __ ret(lr); 6601 6602 return start; 6603 } 6604 6605 address generate_ghash_processBlocks_wide() { 6606 address small = generate_ghash_processBlocks(); 6607 6608 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); 6609 __ align(wordSize * 2); 6610 address p = __ pc(); 6611 __ emit_int64(0x87); // The low-order bits of the field 6612 // polynomial (i.e. p = z^7+z^2+z+1) 6613 // repeated in the low and high parts of a 6614 // 128-bit vector 6615 __ emit_int64(0x87); 6616 6617 __ align(CodeEntryAlignment); 6618 address start = __ pc(); 6619 6620 Register state = c_rarg0; 6621 Register subkeyH = c_rarg1; 6622 Register data = c_rarg2; 6623 Register blocks = c_rarg3; 6624 6625 const int unroll = 4; 6626 6627 __ cmp(blocks, (unsigned char)(unroll * 2)); 6628 __ br(__ LT, small); 6629 6630 if (unroll > 1) { 6631 // Save state before entering routine 6632 __ sub(sp, sp, 4 * 16); 6633 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 6634 __ sub(sp, sp, 4 * 16); 6635 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 6636 } 6637 6638 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 6639 6640 if (unroll > 1) { 6641 // And restore state 6642 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 6643 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 6644 } 6645 6646 __ cmp(blocks, (unsigned char)0); 6647 __ br(__ GT, small); 6648 6649 __ ret(lr); 6650 6651 return start; 6652 } 6653 6654 void generate_base64_encode_simdround(Register src, Register dst, 6655 FloatRegister codec, u8 size) { 6656 6657 FloatRegister in0 = v4, in1 = v5, in2 = v6; 6658 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 6659 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 6660 6661 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 6662 6663 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 6664 6665 __ ushr(ind0, arrangement, in0, 2); 6666 6667 __ ushr(ind1, arrangement, in1, 2); 6668 __ shl(in0, arrangement, in0, 6); 6669 __ orr(ind1, arrangement, ind1, in0); 6670 __ ushr(ind1, arrangement, ind1, 2); 6671 6672 __ ushr(ind2, arrangement, in2, 4); 6673 __ shl(in1, arrangement, in1, 4); 6674 __ orr(ind2, arrangement, in1, ind2); 6675 __ ushr(ind2, arrangement, ind2, 2); 6676 6677 __ shl(ind3, arrangement, in2, 2); 6678 __ ushr(ind3, arrangement, ind3, 2); 6679 6680 __ tbl(out0, arrangement, codec, 4, ind0); 6681 __ tbl(out1, arrangement, codec, 4, ind1); 6682 __ tbl(out2, arrangement, codec, 4, ind2); 6683 __ tbl(out3, arrangement, codec, 4, ind3); 6684 6685 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 6686 } 6687 6688 /** 6689 * Arguments: 6690 * 6691 * Input: 6692 * c_rarg0 - src_start 6693 * c_rarg1 - src_offset 6694 * c_rarg2 - src_length 6695 * c_rarg3 - dest_start 6696 * c_rarg4 - dest_offset 6697 * c_rarg5 - isURL 6698 * 6699 */ 6700 address generate_base64_encodeBlock() { 6701 6702 static const char toBase64[64] = { 6703 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6704 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6705 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6706 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6707 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 6708 }; 6709 6710 static const char toBase64URL[64] = { 6711 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6712 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6713 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6714 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6715 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 6716 }; 6717 6718 __ align(CodeEntryAlignment); 6719 StubCodeMark mark(this, "StubRoutines", "encodeBlock"); 6720 address start = __ pc(); 6721 6722 Register src = c_rarg0; // source array 6723 Register soff = c_rarg1; // source start offset 6724 Register send = c_rarg2; // source end offset 6725 Register dst = c_rarg3; // dest array 6726 Register doff = c_rarg4; // position for writing to dest array 6727 Register isURL = c_rarg5; // Base64 or URL character set 6728 6729 // c_rarg6 and c_rarg7 are free to use as temps 6730 Register codec = c_rarg6; 6731 Register length = c_rarg7; 6732 6733 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 6734 6735 __ add(src, src, soff); 6736 __ add(dst, dst, doff); 6737 __ sub(length, send, soff); 6738 6739 // load the codec base address 6740 __ lea(codec, ExternalAddress((address) toBase64)); 6741 __ cbz(isURL, ProcessData); 6742 __ lea(codec, ExternalAddress((address) toBase64URL)); 6743 6744 __ BIND(ProcessData); 6745 6746 // too short to formup a SIMD loop, roll back 6747 __ cmp(length, (u1)24); 6748 __ br(Assembler::LT, Process3B); 6749 6750 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 6751 6752 __ BIND(Process48B); 6753 __ cmp(length, (u1)48); 6754 __ br(Assembler::LT, Process24B); 6755 generate_base64_encode_simdround(src, dst, v0, 16); 6756 __ sub(length, length, 48); 6757 __ b(Process48B); 6758 6759 __ BIND(Process24B); 6760 __ cmp(length, (u1)24); 6761 __ br(Assembler::LT, SIMDExit); 6762 generate_base64_encode_simdround(src, dst, v0, 8); 6763 __ sub(length, length, 24); 6764 6765 __ BIND(SIMDExit); 6766 __ cbz(length, Exit); 6767 6768 __ 
BIND(Process3B); 6769 // 3 src bytes, 24 bits 6770 __ ldrb(r10, __ post(src, 1)); 6771 __ ldrb(r11, __ post(src, 1)); 6772 __ ldrb(r12, __ post(src, 1)); 6773 __ orrw(r11, r11, r10, Assembler::LSL, 8); 6774 __ orrw(r12, r12, r11, Assembler::LSL, 8); 6775 // codec index 6776 __ ubfmw(r15, r12, 18, 23); 6777 __ ubfmw(r14, r12, 12, 17); 6778 __ ubfmw(r13, r12, 6, 11); 6779 __ andw(r12, r12, 63); 6780 // get the code based on the codec 6781 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 6782 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 6783 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 6784 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 6785 __ strb(r15, __ post(dst, 1)); 6786 __ strb(r14, __ post(dst, 1)); 6787 __ strb(r13, __ post(dst, 1)); 6788 __ strb(r12, __ post(dst, 1)); 6789 __ sub(length, length, 3); 6790 __ cbnz(length, Process3B); 6791 6792 __ BIND(Exit); 6793 __ ret(lr); 6794 6795 return start; 6796 } 6797 6798 void generate_base64_decode_simdround(Register src, Register dst, 6799 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 6800 6801 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 6802 FloatRegister out0 = v20, out1 = v21, out2 = v22; 6803 6804 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 6805 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 6806 6807 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 6808 6809 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 6810 6811 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 6812 6813 // we need unsigned saturating subtract, to make sure all input values 6814 // in range [0, 63] will have 0U value in the higher half lookup 6815 __ uqsubv(decH0, __ T16B, in0, v27); 6816 __ uqsubv(decH1, __ T16B, in1, v27); 6817 __ uqsubv(decH2, __ T16B, in2, v27); 6818 __ uqsubv(decH3, __ T16B, in3, v27); 6819 6820 // lower half lookup 6821 __ tbl(decL0, arrangement, codecL, 4, in0); 6822 __ tbl(decL1, arrangement, codecL, 4, in1); 6823 __ tbl(decL2, arrangement, codecL, 4, in2); 6824 __ tbl(decL3, arrangement, codecL, 4, in3); 6825 6826 // higher half lookup 6827 __ tbx(decH0, arrangement, codecH, 4, decH0); 6828 __ tbx(decH1, arrangement, codecH, 4, decH1); 6829 __ tbx(decH2, arrangement, codecH, 4, decH2); 6830 __ tbx(decH3, arrangement, codecH, 4, decH3); 6831 6832 // combine lower and higher 6833 __ orr(decL0, arrangement, decL0, decH0); 6834 __ orr(decL1, arrangement, decL1, decH1); 6835 __ orr(decL2, arrangement, decL2, decH2); 6836 __ orr(decL3, arrangement, decL3, decH3); 6837 6838 // check illegal inputs, value larger than 63 (maximum of 6 bits) 6839 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 6840 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 6841 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 6842 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 6843 __ orr(in0, arrangement, decH0, decH1); 6844 __ orr(in1, arrangement, decH2, decH3); 6845 __ orr(in2, arrangement, in0, in1); 6846 __ umaxv(in3, arrangement, in2); 6847 __ umov(rscratch2, in3, __ B, 0); 6848 6849 // get the data to output 6850 __ shl(out0, arrangement, decL0, 2); 6851 __ ushr(out1, arrangement, decL1, 4); 6852 __ orr(out0, arrangement, out0, out1); 6853 __ shl(out1, arrangement, decL1, 4); 6854 __ ushr(out2, arrangement, decL2, 2); 6855 __ orr(out1, arrangement, out1, out2); 6856 __ shl(out2, arrangement, decL2, 6); 6857 __ orr(out2, arrangement, out2, decL3); 6858 6859 __ cbz(rscratch2, NoIllegalData); 6860 
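    // At this point each byte lane holds
    //   out0 = decL0<<2 | decL1>>4
    //   out1 = decL1<<4 | decL2>>2
    //   out2 = decL2<<6 | decL3
    // i.e. the four 6-bit values repacked into three output bytes, and
    // rscratch2 is non-zero iff some lane decoded to an illegal value (255).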
6861 // handle illegal input 6862 __ umov(r10, in2, __ D, 0); 6863 if (size == 16) { 6864 __ cbnz(r10, ErrorInLowerHalf); 6865 6866 // illegal input is in higher half, store the lower half now. 6867 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 6868 6869 __ umov(r10, in2, __ D, 1); 6870 __ umov(r11, out0, __ D, 1); 6871 __ umov(r12, out1, __ D, 1); 6872 __ umov(r13, out2, __ D, 1); 6873 __ b(StoreLegalData); 6874 6875 __ BIND(ErrorInLowerHalf); 6876 } 6877 __ umov(r11, out0, __ D, 0); 6878 __ umov(r12, out1, __ D, 0); 6879 __ umov(r13, out2, __ D, 0); 6880 6881 __ BIND(StoreLegalData); 6882 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 6883 __ strb(r11, __ post(dst, 1)); 6884 __ strb(r12, __ post(dst, 1)); 6885 __ strb(r13, __ post(dst, 1)); 6886 __ lsr(r10, r10, 8); 6887 __ lsr(r11, r11, 8); 6888 __ lsr(r12, r12, 8); 6889 __ lsr(r13, r13, 8); 6890 __ b(StoreLegalData); 6891 6892 __ BIND(NoIllegalData); 6893 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 6894 } 6895 6896 6897 /** 6898 * Arguments: 6899 * 6900 * Input: 6901 * c_rarg0 - src_start 6902 * c_rarg1 - src_offset 6903 * c_rarg2 - src_length 6904 * c_rarg3 - dest_start 6905 * c_rarg4 - dest_offset 6906 * c_rarg5 - isURL 6907 * c_rarg6 - isMIME 6908 * 6909 */ 6910 address generate_base64_decodeBlock() { 6911 6912 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 6913 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 6914 // titled "Base64 decoding". 6915 6916 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 6917 // except the trailing character '=' is also treated illegal value in this intrinsic. That 6918 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
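    // A few concrete entries of the table below, for orientation:
    //   fromBase64ForNoSIMD['A'] == 0,  ['Z'] == 25, ['a'] == 26,
    //   fromBase64ForNoSIMD['0'] == 52, ['+'] == 62, ['/'] == 63,
    //   and fromBase64ForNoSIMD['='] == 255 (illegal, as noted above).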
6919 static const uint8_t fromBase64ForNoSIMD[256] = { 6920 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6921 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6922 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6923 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6924 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6925 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 6926 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6927 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6928 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6929 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6930 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6931 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6932 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6933 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6934 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6935 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6936 }; 6937 6938 static const uint8_t fromBase64URLForNoSIMD[256] = { 6939 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6940 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6941 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6942 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6943 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6944 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 6945 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6946 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6947 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6948 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6949 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6950 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6951 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6952 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6953 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6954 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6955 }; 6956 6957 // A legal value of base64 code is in range [0, 127]. We need two lookups 6958 // with tbl/tbx and combine them to get the decode data. The 1st table vector 6959 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 6960 // table vector lookup use tbx, out of range indices are unchanged in 6961 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 6962 // The value of index 64 is set to 0, so that we know that we already get the 6963 // decoded data with the 1st lookup. 6964 static const uint8_t fromBase64ForSIMD[128] = { 6965 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6966 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6967 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6968 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6969 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6970 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6971 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6972 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6973 }; 6974 6975 static const uint8_t fromBase64URLForSIMD[128] = { 6976 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6977 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6978 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6979 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6980 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6981 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6982 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6983 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6984 }; 6985 6986 __ align(CodeEntryAlignment); 6987 StubCodeMark mark(this, "StubRoutines", "decodeBlock"); 6988 address start = __ pc(); 6989 6990 Register src = c_rarg0; // source array 6991 Register soff = c_rarg1; // source start offset 6992 Register send = c_rarg2; // source end offset 6993 Register dst = c_rarg3; // dest array 6994 Register doff = c_rarg4; // position for writing to dest array 6995 Register isURL = c_rarg5; // Base64 or URL character set 6996 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 6997 6998 Register length = send; // reuse send as length of source data to process 6999 7000 Register simd_codec = c_rarg6; 7001 Register nosimd_codec = c_rarg7; 7002 7003 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 7004 7005 __ enter(); 7006 7007 __ add(src, src, soff); 7008 __ add(dst, dst, doff); 7009 7010 __ mov(doff, dst); 7011 7012 __ sub(length, send, soff); 7013 __ bfm(length, zr, 0, 1); 7014 7015 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 7016 __ cbz(isURL, ProcessData); 7017 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 7018 7019 __ BIND(ProcessData); 7020 __ mov(rscratch1, length); 7021 __ cmp(length, (u1)144); // 144 = 80 + 64 7022 __ br(Assembler::LT, Process4B); 7023 7024 // In the MIME case, the line length cannot be more than 76 7025 // bytes (see RFC 2045). This is too short a block for SIMD 7026 // to be worthwhile, so we use non-SIMD here. 
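    // 79 here is 80 - 1: the Process4B loop below subtracts 4 per iteration
    // and continues while the result is positive, so starting from 79 it
    // runs exactly 20 times (80 bytes) and leaves rscratch1 == -1 before we
    // fall through to the SIMD setup.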
7027 __ movw(rscratch1, 79); 7028 7029 __ BIND(Process4B); 7030 __ ldrw(r14, __ post(src, 4)); 7031 __ ubfxw(r10, r14, 0, 8); 7032 __ ubfxw(r11, r14, 8, 8); 7033 __ ubfxw(r12, r14, 16, 8); 7034 __ ubfxw(r13, r14, 24, 8); 7035 // get the de-code 7036 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 7037 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 7038 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 7039 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 7040 // error detection, 255u indicates an illegal input 7041 __ orrw(r14, r10, r11); 7042 __ orrw(r15, r12, r13); 7043 __ orrw(r14, r14, r15); 7044 __ tbnz(r14, 7, Exit); 7045 // recover the data 7046 __ lslw(r14, r10, 10); 7047 __ bfiw(r14, r11, 4, 6); 7048 __ bfmw(r14, r12, 2, 5); 7049 __ rev16w(r14, r14); 7050 __ bfiw(r13, r12, 6, 2); 7051 __ strh(r14, __ post(dst, 2)); 7052 __ strb(r13, __ post(dst, 1)); 7053 // non-simd loop 7054 __ subsw(rscratch1, rscratch1, 4); 7055 __ br(Assembler::GT, Process4B); 7056 7057 // if exiting from PreProcess80B, rscratch1 == -1; 7058 // otherwise, rscratch1 == 0. 7059 __ cbzw(rscratch1, Exit); 7060 __ sub(length, length, 80); 7061 7062 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 7063 __ cbz(isURL, SIMDEnter); 7064 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 7065 7066 __ BIND(SIMDEnter); 7067 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 7068 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 7069 __ mov(rscratch1, 63); 7070 __ dup(v27, __ T16B, rscratch1); 7071 7072 __ BIND(Process64B); 7073 __ cmp(length, (u1)64); 7074 __ br(Assembler::LT, Process32B); 7075 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 7076 __ sub(length, length, 64); 7077 __ b(Process64B); 7078 7079 __ BIND(Process32B); 7080 __ cmp(length, (u1)32); 7081 __ br(Assembler::LT, SIMDExit); 7082 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 7083 __ sub(length, length, 32); 7084 __ b(Process32B); 7085 7086 __ BIND(SIMDExit); 7087 __ cbz(length, Exit); 7088 __ movw(rscratch1, length); 7089 __ b(Process4B); 7090 7091 __ BIND(Exit); 7092 __ sub(c_rarg0, dst, doff); 7093 7094 __ leave(); 7095 __ ret(lr); 7096 7097 return start; 7098 } 7099 7100 // Support for spin waits. 7101 address generate_spin_wait() { 7102 __ align(CodeEntryAlignment); 7103 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 7104 address start = __ pc(); 7105 7106 __ spin_wait(); 7107 __ ret(lr); 7108 7109 return start; 7110 } 7111 7112 address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) { 7113 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table"); 7114 7115 address start = __ pc(); 7116 const Register 7117 r_super_klass = r0, 7118 r_array_base = r1, 7119 r_array_length = r2, 7120 r_array_index = r3, 7121 r_sub_klass = r4, 7122 r_bitmap = rscratch2, 7123 result = r5; 7124 const FloatRegister 7125 vtemp = v0; 7126 7127 Label L_success; 7128 __ enter(); 7129 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 7130 r_array_base, r_array_length, r_array_index, 7131 vtemp, result, super_klass_index, 7132 /*stub_is_near*/true); 7133 __ leave(); 7134 __ ret(lr); 7135 7136 return start; 7137 } 7138 7139 // Slow path implementation for UseSecondarySupersTable. 
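  // It continues the search of the hashed secondary-supers array when the
  // initial probe in the stub above is inconclusive (see
  // MacroAssembler::lookup_secondary_supers_table_slow_path).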
7140 address generate_lookup_secondary_supers_table_slow_path_stub() { 7141 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path"); 7142 7143 address start = __ pc(); 7144 const Register 7145 r_super_klass = r0, // argument 7146 r_array_base = r1, // argument 7147 temp1 = r2, // temp 7148 r_array_index = r3, // argument 7149 r_bitmap = rscratch2, // argument 7150 result = r5; // argument 7151 7152 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 7153 __ ret(lr); 7154 7155 return start; 7156 } 7157 7158 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 7159 7160 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 7161 // 7162 // If LSE is in use, generate LSE versions of all the stubs. The 7163 // non-LSE versions are in atomic_aarch64.S. 7164 7165 // class AtomicStubMark records the entry point of a stub and the 7166 // stub pointer which will point to it. The stub pointer is set to 7167 // the entry point when ~AtomicStubMark() is called, which must be 7168 // after ICache::invalidate_range. This ensures safe publication of 7169 // the generated code. 7170 class AtomicStubMark { 7171 address _entry_point; 7172 aarch64_atomic_stub_t *_stub; 7173 MacroAssembler *_masm; 7174 public: 7175 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 7176 _masm = masm; 7177 __ align(32); 7178 _entry_point = __ pc(); 7179 _stub = stub; 7180 } 7181 ~AtomicStubMark() { 7182 *_stub = (aarch64_atomic_stub_t)_entry_point; 7183 } 7184 }; 7185 7186 // NB: For memory_order_conservative we need a trailing membar after 7187 // LSE atomic operations but not a leading membar. 7188 // 7189 // We don't need a leading membar because a clause in the Arm ARM 7190 // says: 7191 // 7192 // Barrier-ordered-before 7193 // 7194 // Barrier instructions order prior Memory effects before subsequent 7195 // Memory effects generated by the same Observer. A read or a write 7196 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 7197 // Observer if and only if RW1 appears in program order before RW 2 7198 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 7199 // instruction with both Acquire and Release semantics. 7200 // 7201 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 7202 // and Release semantics, therefore we don't need a leading 7203 // barrier. However, there is no corresponding Barrier-ordered-after 7204 // relationship, therefore we need a trailing membar to prevent a 7205 // later store or load from being reordered with the store in an 7206 // atomic instruction. 7207 // 7208 // This was checked by using the herd7 consistency model simulator 7209 // (http://diy.inria.fr/) with this test case: 7210 // 7211 // AArch64 LseCas 7212 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 7213 // P0 | P1; 7214 // LDR W4, [X2] | MOV W3, #0; 7215 // DMB LD | MOV W4, #1; 7216 // LDR W3, [X1] | CASAL W3, W4, [X1]; 7217 // | DMB ISH; 7218 // | STR W4, [X2]; 7219 // exists 7220 // (0:X3=0 /\ 0:X4=1) 7221 // 7222 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 7223 // with the store to x in P1. Without the DMB in P1 this may happen. 7224 // 7225 // At the time of writing we don't know of any AArch64 hardware that 7226 // reorders stores in this way, but the Reference Manual permits it. 
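  // As a rough sketch (illustrative pseudocode, not the actual stub names),
  // a memory_order_conservative CAS entry generated below amounts to:
  //
  //   prev = compare_val;
  //   CASAL prev, exchange_val, [ptr]   // LSE CAS with Acquire+Release
  //   DMB ISH                           // trailing full barrier (see above)
  //   return prev;                      // value previously at *ptr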
7227 7228 void gen_cas_entry(Assembler::operand_size size, 7229 atomic_memory_order order) { 7230 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 7231 exchange_val = c_rarg2; 7232 bool acquire, release; 7233 switch (order) { 7234 case memory_order_relaxed: 7235 acquire = false; 7236 release = false; 7237 break; 7238 case memory_order_release: 7239 acquire = false; 7240 release = true; 7241 break; 7242 default: 7243 acquire = true; 7244 release = true; 7245 break; 7246 } 7247 __ mov(prev, compare_val); 7248 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 7249 if (order == memory_order_conservative) { 7250 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 7251 } 7252 if (size == Assembler::xword) { 7253 __ mov(r0, prev); 7254 } else { 7255 __ movw(r0, prev); 7256 } 7257 __ ret(lr); 7258 } 7259 7260 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 7261 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 7262 // If not relaxed, then default to conservative. Relaxed is the only 7263 // case we use enough to be worth specializing. 7264 if (order == memory_order_relaxed) { 7265 __ ldadd(size, incr, prev, addr); 7266 } else { 7267 __ ldaddal(size, incr, prev, addr); 7268 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 7269 } 7270 if (size == Assembler::xword) { 7271 __ mov(r0, prev); 7272 } else { 7273 __ movw(r0, prev); 7274 } 7275 __ ret(lr); 7276 } 7277 7278 void gen_swpal_entry(Assembler::operand_size size) { 7279 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 7280 __ swpal(size, incr, prev, addr); 7281 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 7282 if (size == Assembler::xword) { 7283 __ mov(r0, prev); 7284 } else { 7285 __ movw(r0, prev); 7286 } 7287 __ ret(lr); 7288 } 7289 7290 void generate_atomic_entry_points() { 7291 if (! 
UseLSE) { 7292 return; 7293 } 7294 7295 __ align(CodeEntryAlignment); 7296 StubCodeMark mark(this, "StubRoutines", "atomic entry points"); 7297 address first_entry = __ pc(); 7298 7299 // ADD, memory_order_conservative 7300 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 7301 gen_ldadd_entry(Assembler::word, memory_order_conservative); 7302 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 7303 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 7304 7305 // ADD, memory_order_relaxed 7306 AtomicStubMark mark_fetch_add_4_relaxed 7307 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 7308 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 7309 AtomicStubMark mark_fetch_add_8_relaxed 7310 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 7311 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 7312 7313 // XCHG, memory_order_conservative 7314 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 7315 gen_swpal_entry(Assembler::word); 7316 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 7317 gen_swpal_entry(Assembler::xword); 7318 7319 // CAS, memory_order_conservative 7320 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 7321 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 7322 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 7323 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 7324 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 7325 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 7326 7327 // CAS, memory_order_relaxed 7328 AtomicStubMark mark_cmpxchg_1_relaxed 7329 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 7330 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 7331 AtomicStubMark mark_cmpxchg_4_relaxed 7332 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 7333 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 7334 AtomicStubMark mark_cmpxchg_8_relaxed 7335 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 7336 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 7337 7338 AtomicStubMark mark_cmpxchg_4_release 7339 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 7340 gen_cas_entry(MacroAssembler::word, memory_order_release); 7341 AtomicStubMark mark_cmpxchg_8_release 7342 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 7343 gen_cas_entry(MacroAssembler::xword, memory_order_release); 7344 7345 AtomicStubMark mark_cmpxchg_4_seq_cst 7346 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 7347 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 7348 AtomicStubMark mark_cmpxchg_8_seq_cst 7349 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 7350 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 7351 7352 ICache::invalidate_range(first_entry, __ pc() - first_entry); 7353 } 7354 #endif // LINUX 7355 7356 address generate_cont_thaw(Continuation::thaw_kind kind) { 7357 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 7358 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 7359 7360 address start = __ pc(); 7361 7362 if (return_barrier) { 7363 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 7364 __ mov(sp, rscratch1); 7365 } 7366 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7367 7368 if (return_barrier) { 7369 // preserve possible return value from a method 
returning to the return barrier 7370 __ fmovd(rscratch1, v0); 7371 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7372 } 7373 7374 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 7375 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 7376 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 7377 7378 if (return_barrier) { 7379 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7380 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7381 __ fmovd(v0, rscratch1); 7382 } 7383 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7384 7385 7386 Label thaw_success; 7387 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 7388 __ cbnz(rscratch2, thaw_success); 7389 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 7390 __ br(rscratch1); 7391 __ bind(thaw_success); 7392 7393 // make room for the thawed frames 7394 __ sub(rscratch1, sp, rscratch2); 7395 __ andr(rscratch1, rscratch1, -16); // align 7396 __ mov(sp, rscratch1); 7397 7398 if (return_barrier) { 7399 // save original return value -- again 7400 __ fmovd(rscratch1, v0); 7401 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7402 } 7403 7404 // If we want, we can templatize thaw by kind, and have three different entries 7405 __ movw(c_rarg1, (uint32_t)kind); 7406 7407 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 7408 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 7409 7410 if (return_barrier) { 7411 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7412 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7413 __ fmovd(v0, rscratch1); 7414 } else { 7415 __ mov(r0, zr); // return 0 (success) from doYield 7416 } 7417 7418 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 7419 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 7420 __ mov(rfp, sp); 7421 7422 if (return_barrier_exception) { 7423 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 7424 __ authenticate_return_address(c_rarg1); 7425 __ verify_oop(r0); 7426 // save return value containing the exception oop in callee-saved R19 7427 __ mov(r19, r0); 7428 7429 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 7430 7431 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
7432 // __ reinitialize_ptrue(); 7433 7434 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 7435 7436 __ mov(r1, r0); // the exception handler 7437 __ mov(r0, r19); // restore return value containing the exception oop 7438 __ verify_oop(r0); 7439 7440 __ leave(); 7441 __ mov(r3, lr); 7442 __ br(r1); // the exception handler 7443 } else { 7444 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 7445 __ leave(); 7446 __ ret(lr); 7447 } 7448 7449 return start; 7450 } 7451 7452 address generate_cont_thaw() { 7453 if (!Continuations::enabled()) return nullptr; 7454 7455 StubCodeMark mark(this, "StubRoutines", "Cont thaw"); 7456 address start = __ pc(); 7457 generate_cont_thaw(Continuation::thaw_top); 7458 return start; 7459 } 7460 7461 address generate_cont_returnBarrier() { 7462 if (!Continuations::enabled()) return nullptr; 7463 7464 // TODO: will probably need multiple return barriers depending on return type 7465 StubCodeMark mark(this, "StubRoutines", "cont return barrier"); 7466 address start = __ pc(); 7467 7468 generate_cont_thaw(Continuation::thaw_return_barrier); 7469 7470 return start; 7471 } 7472 7473 address generate_cont_returnBarrier_exception() { 7474 if (!Continuations::enabled()) return nullptr; 7475 7476 StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler"); 7477 address start = __ pc(); 7478 7479 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 7480 7481 return start; 7482 } 7483 7484 address generate_cont_preempt_stub() { 7485 if (!Continuations::enabled()) return nullptr; 7486 StubCodeMark mark(this, "StubRoutines","Continuation preempt stub"); 7487 address start = __ pc(); 7488 7489 __ reset_last_Java_frame(true); 7490 7491 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 7492 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 7493 __ mov(sp, rscratch2); 7494 7495 Label preemption_cancelled; 7496 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 7497 __ cbnz(rscratch1, preemption_cancelled); 7498 7499 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 7500 SharedRuntime::continuation_enter_cleanup(_masm); 7501 __ leave(); 7502 __ ret(lr); 7503 7504 // We acquired the monitor after freezing the frames so call thaw to continue execution. 7505 __ bind(preemption_cancelled); 7506 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 7507 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 7508 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 7509 __ ldr(rscratch1, Address(rscratch1)); 7510 __ br(rscratch1); 7511 7512 return start; 7513 } 7514 7515 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 7516 // are represented as long[5], with BITS_PER_LIMB = 26. 7517 // Pack five 26-bit limbs into three 64-bit registers. 
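  // Layout sketch: the long[5] at src represents the 130-bit value
  //   V = L0 + L1*2^26 + L2*2^52 + L3*2^78 + L4*2^104
  // (assuming each Ln fits in 26 bits, per BITS_PER_LIMB = 26), and the
  // packed result is
  //   dest0 = V[ 63:  0] = L0 | L1 << 26 | (L2 & 0xfff) << 52
  //   dest1 = V[127: 64] = L2 >> 12 | L3 << 14 | (L4 & 0xffffff) << 40
  //   dest2 = V[129:128] = L4 >> 24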
7518 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 7519 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 7520 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 7521 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 7522 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 7523 7524 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 7525 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 7526 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 7527 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 7528 7529 if (dest2->is_valid()) { 7530 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 7531 } else { 7532 #ifdef ASSERT 7533 Label OK; 7534 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 7535 __ br(__ EQ, OK); 7536 __ stop("high bits of Poly1305 integer should be zero"); 7537 __ should_not_reach_here(); 7538 __ bind(OK); 7539 #endif 7540 } 7541 } 7542 7543 // As above, but return only a 128-bit integer, packed into two 7544 // 64-bit registers. 7545 void pack_26(Register dest0, Register dest1, Register src) { 7546 pack_26(dest0, dest1, noreg, src); 7547 } 7548 7549 // Multiply and multiply-accumulate unsigned 64-bit registers. 7550 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 7551 __ mul(prod_lo, n, m); 7552 __ umulh(prod_hi, n, m); 7553 } 7554 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 7555 wide_mul(rscratch1, rscratch2, n, m); 7556 __ adds(sum_lo, sum_lo, rscratch1); 7557 __ adc(sum_hi, sum_hi, rscratch2); 7558 } 7559 7560 // Poly1305, RFC 7539 7561 7562 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 7563 // description of the tricks used to simplify and accelerate this 7564 // computation. 7565 7566 address generate_poly1305_processBlocks() { 7567 __ align(CodeEntryAlignment); 7568 StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); 7569 address start = __ pc(); 7570 Label here; 7571 __ enter(); 7572 RegSet callee_saved = RegSet::range(r19, r28); 7573 __ push(callee_saved, sp); 7574 7575 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 7576 7577 // Arguments 7578 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 7579 7580 // R_n is the 128-bit randomly-generated key, packed into two 7581 // registers. The caller passes this key to us as long[5], with 7582 // BITS_PER_LIMB = 26. 
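    // A note on the reduction trick used below (explanatory only): we work
    // modulo p = 2^130 - 5, so 2^130 == 5 (mod p). A product term X*2^128
    // can be split as (X>>2)*2^130 + (X&3)*2^128 == 5*(X>>2) + (X&3)*2^128.
    // Key clamping guarantees the low two bits of R_1 are zero, so terms
    // involving R_1 that land at or above the 2^128 column fold completely
    // into multiplies by RR_1 = (R_1 >> 2) * 5; for R_0 the small leftover
    // S_2 * (R_0 & 3) stays in the 2^128 column (the andr/mul into U_2 in
    // the loop below).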
7583 const Register R_0 = *++regs, R_1 = *++regs; 7584 pack_26(R_0, R_1, r_start); 7585 7586 // RR_n is (R_n >> 2) * 5 7587 const Register RR_0 = *++regs, RR_1 = *++regs; 7588 __ lsr(RR_0, R_0, 2); 7589 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 7590 __ lsr(RR_1, R_1, 2); 7591 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 7592 7593 // U_n is the current checksum 7594 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 7595 pack_26(U_0, U_1, U_2, acc_start); 7596 7597 static constexpr int BLOCK_LENGTH = 16; 7598 Label DONE, LOOP; 7599 7600 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7601 __ br(Assembler::LT, DONE); { 7602 __ bind(LOOP); 7603 7604 // S_n is to be the sum of U_n and the next block of data 7605 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 7606 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 7607 __ adds(S_0, U_0, S_0); 7608 __ adcs(S_1, U_1, S_1); 7609 __ adc(S_2, U_2, zr); 7610 __ add(S_2, S_2, 1); 7611 7612 const Register U_0HI = *++regs, U_1HI = *++regs; 7613 7614 // NB: this logic depends on some of the special properties of 7615 // Poly1305 keys. In particular, because we know that the top 7616 // four bits of R_0 and R_1 are zero, we can add together 7617 // partial products without any risk of needing to propagate a 7618 // carry out. 7619 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 7620 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 7621 __ andr(U_2, R_0, 3); 7622 __ mul(U_2, S_2, U_2); 7623 7624 // Recycle registers S_0, S_1, S_2 7625 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 7626 7627 // Partial reduction mod 2**130 - 5 7628 __ adds(U_1, U_0HI, U_1); 7629 __ adc(U_2, U_1HI, U_2); 7630 // Sum now in U_2:U_1:U_0. 7631 // Dead: U_0HI, U_1HI. 7632 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 7633 7634 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 7635 7636 // First, U_2:U_1:U_0 += (U_2 >> 2) 7637 __ lsr(rscratch1, U_2, 2); 7638 __ andr(U_2, U_2, (u8)3); 7639 __ adds(U_0, U_0, rscratch1); 7640 __ adcs(U_1, U_1, zr); 7641 __ adc(U_2, U_2, zr); 7642 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 7643 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 7644 __ adcs(U_1, U_1, zr); 7645 __ adc(U_2, U_2, zr); 7646 7647 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 7648 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7649 __ br(~ Assembler::LT, LOOP); 7650 } 7651 7652 // Further reduce modulo 2^130 - 5 7653 __ lsr(rscratch1, U_2, 2); 7654 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 7655 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 7656 __ adcs(U_1, U_1, zr); 7657 __ andr(U_2, U_2, (u1)3); 7658 __ adc(U_2, U_2, zr); 7659 7660 // Unpack the sum into five 26-bit limbs and write to memory. 
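    // That is, writing the 130-bit sum as U_2:U_1:U_0, the limbs stored back
    // to acc_start are
    //   L0 = U_0[25: 0]            L1 = U_0[51:26]
    //   L2 = U_0[63:52] | U_1[13: 0] << 12
    //   L3 = U_1[39:14]
    //   L4 = U_1[63:40] | U_2[ 2: 0] << 24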
7661 __ ubfiz(rscratch1, U_0, 0, 26); 7662 __ ubfx(rscratch2, U_0, 26, 26); 7663 __ stp(rscratch1, rscratch2, Address(acc_start)); 7664 __ ubfx(rscratch1, U_0, 52, 12); 7665 __ bfi(rscratch1, U_1, 12, 14); 7666 __ ubfx(rscratch2, U_1, 14, 26); 7667 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 7668 __ ubfx(rscratch1, U_1, 40, 24); 7669 __ bfi(rscratch1, U_2, 24, 3); 7670 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 7671 7672 __ bind(DONE); 7673 __ pop(callee_saved, sp); 7674 __ leave(); 7675 __ ret(lr); 7676 7677 return start; 7678 } 7679 7680 // exception handler for upcall stubs 7681 address generate_upcall_stub_exception_handler() { 7682 StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler"); 7683 address start = __ pc(); 7684 7685 // Native caller has no idea how to handle exceptions, 7686 // so we just crash here. Up to callee to catch exceptions. 7687 __ verify_oop(r0); 7688 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 7689 __ blr(rscratch1); 7690 __ should_not_reach_here(); 7691 7692 return start; 7693 } 7694 7695 // load Method* target of MethodHandle 7696 // j_rarg0 = jobject receiver 7697 // rmethod = result 7698 address generate_upcall_stub_load_target() { 7699 StubCodeMark mark(this, "StubRoutines", "upcall_stub_load_target"); 7700 address start = __ pc(); 7701 7702 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 7703 // Load target method from receiver 7704 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 7705 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 7706 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 7707 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 7708 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 7709 noreg, noreg); 7710 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 7711 7712 __ ret(lr); 7713 7714 return start; 7715 } 7716 7717 #undef __ 7718 #define __ masm-> 7719 7720 class MontgomeryMultiplyGenerator : public MacroAssembler { 7721 7722 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7723 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7724 7725 RegSet _toSave; 7726 bool _squaring; 7727 7728 public: 7729 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7730 : MacroAssembler(as->code()), _squaring(squaring) { 7731 7732 // Register allocation 7733 7734 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7735 Pa_base = *regs; // Argument registers 7736 if (squaring) 7737 Pb_base = Pa_base; 7738 else 7739 Pb_base = *++regs; 7740 Pn_base = *++regs; 7741 Rlen= *++regs; 7742 inv = *++regs; 7743 Pm_base = *++regs; 7744 7745 // Working registers: 7746 Ra = *++regs; // The current digit of a, b, n, and m. 7747 Rb = *++regs; 7748 Rm = *++regs; 7749 Rn = *++regs; 7750 7751 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 7752 Pb = *++regs; 7753 Pm = *++regs; 7754 Pn = *++regs; 7755 7756 t0 = *++regs; // Three registers which form a 7757 t1 = *++regs; // triple-precision accumuator. 7758 t2 = *++regs; 7759 7760 Ri = *++regs; // Inner and outer loop indexes. 7761 Rj = *++regs; 7762 7763 Rhi_ab = *++regs; // Product registers: low and high parts 7764 Rlo_ab = *++regs; // of a*b and m*n. 
7765 Rhi_mn = *++regs; 7766 Rlo_mn = *++regs; 7767 7768 // r19 and up are callee-saved. 7769 _toSave = RegSet::range(r19, *regs) + Pm_base; 7770 } 7771 7772 private: 7773 void save_regs() { 7774 push(_toSave, sp); 7775 } 7776 7777 void restore_regs() { 7778 pop(_toSave, sp); 7779 } 7780 7781 template <typename T> 7782 void unroll_2(Register count, T block) { 7783 Label loop, end, odd; 7784 tbnz(count, 0, odd); 7785 cbz(count, end); 7786 align(16); 7787 bind(loop); 7788 (this->*block)(); 7789 bind(odd); 7790 (this->*block)(); 7791 subs(count, count, 2); 7792 br(Assembler::GT, loop); 7793 bind(end); 7794 } 7795 7796 template <typename T> 7797 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 7798 Label loop, end, odd; 7799 tbnz(count, 0, odd); 7800 cbz(count, end); 7801 align(16); 7802 bind(loop); 7803 (this->*block)(d, s, tmp); 7804 bind(odd); 7805 (this->*block)(d, s, tmp); 7806 subs(count, count, 2); 7807 br(Assembler::GT, loop); 7808 bind(end); 7809 } 7810 7811 void pre1(RegisterOrConstant i) { 7812 block_comment("pre1"); 7813 // Pa = Pa_base; 7814 // Pb = Pb_base + i; 7815 // Pm = Pm_base; 7816 // Pn = Pn_base + i; 7817 // Ra = *Pa; 7818 // Rb = *Pb; 7819 // Rm = *Pm; 7820 // Rn = *Pn; 7821 ldr(Ra, Address(Pa_base)); 7822 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7823 ldr(Rm, Address(Pm_base)); 7824 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7825 lea(Pa, Address(Pa_base)); 7826 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7827 lea(Pm, Address(Pm_base)); 7828 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7829 7830 // Zero the m*n result. 7831 mov(Rhi_mn, zr); 7832 mov(Rlo_mn, zr); 7833 } 7834 7835 // The core multiply-accumulate step of a Montgomery 7836 // multiplication. The idea is to schedule operations as a 7837 // pipeline so that instructions with long latencies (loads and 7838 // multiplies) have time to complete before their results are 7839 // used. This most benefits in-order implementations of the 7840 // architecture but out-of-order ones also benefit. 7841 void step() { 7842 block_comment("step"); 7843 // MACC(Ra, Rb, t0, t1, t2); 7844 // Ra = *++Pa; 7845 // Rb = *--Pb; 7846 umulh(Rhi_ab, Ra, Rb); 7847 mul(Rlo_ab, Ra, Rb); 7848 ldr(Ra, pre(Pa, wordSize)); 7849 ldr(Rb, pre(Pb, -wordSize)); 7850 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 7851 // previous iteration. 
7852 // MACC(Rm, Rn, t0, t1, t2); 7853 // Rm = *++Pm; 7854 // Rn = *--Pn; 7855 umulh(Rhi_mn, Rm, Rn); 7856 mul(Rlo_mn, Rm, Rn); 7857 ldr(Rm, pre(Pm, wordSize)); 7858 ldr(Rn, pre(Pn, -wordSize)); 7859 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7860 } 7861 7862 void post1() { 7863 block_comment("post1"); 7864 7865 // MACC(Ra, Rb, t0, t1, t2); 7866 // Ra = *++Pa; 7867 // Rb = *--Pb; 7868 umulh(Rhi_ab, Ra, Rb); 7869 mul(Rlo_ab, Ra, Rb); 7870 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7871 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7872 7873 // *Pm = Rm = t0 * inv; 7874 mul(Rm, t0, inv); 7875 str(Rm, Address(Pm)); 7876 7877 // MACC(Rm, Rn, t0, t1, t2); 7878 // t0 = t1; t1 = t2; t2 = 0; 7879 umulh(Rhi_mn, Rm, Rn); 7880 7881 #ifndef PRODUCT 7882 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7883 { 7884 mul(Rlo_mn, Rm, Rn); 7885 add(Rlo_mn, t0, Rlo_mn); 7886 Label ok; 7887 cbz(Rlo_mn, ok); { 7888 stop("broken Montgomery multiply"); 7889 } bind(ok); 7890 } 7891 #endif 7892 // We have very carefully set things up so that 7893 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7894 // the lower half of Rm * Rn because we know the result already: 7895 // it must be -t0. t0 + (-t0) must generate a carry iff 7896 // t0 != 0. So, rather than do a mul and an adds we just set 7897 // the carry flag iff t0 is nonzero. 7898 // 7899 // mul(Rlo_mn, Rm, Rn); 7900 // adds(zr, t0, Rlo_mn); 7901 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7902 adcs(t0, t1, Rhi_mn); 7903 adc(t1, t2, zr); 7904 mov(t2, zr); 7905 } 7906 7907 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 7908 block_comment("pre2"); 7909 // Pa = Pa_base + i-len; 7910 // Pb = Pb_base + len; 7911 // Pm = Pm_base + i-len; 7912 // Pn = Pn_base + len; 7913 7914 if (i.is_register()) { 7915 sub(Rj, i.as_register(), len); 7916 } else { 7917 mov(Rj, i.as_constant()); 7918 sub(Rj, Rj, len); 7919 } 7920 // Rj == i-len 7921 7922 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 7923 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 7924 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7925 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 7926 7927 // Ra = *++Pa; 7928 // Rb = *--Pb; 7929 // Rm = *++Pm; 7930 // Rn = *--Pn; 7931 ldr(Ra, pre(Pa, wordSize)); 7932 ldr(Rb, pre(Pb, -wordSize)); 7933 ldr(Rm, pre(Pm, wordSize)); 7934 ldr(Rn, pre(Pn, -wordSize)); 7935 7936 mov(Rhi_mn, zr); 7937 mov(Rlo_mn, zr); 7938 } 7939 7940 void post2(RegisterOrConstant i, RegisterOrConstant len) { 7941 block_comment("post2"); 7942 if (i.is_constant()) { 7943 mov(Rj, i.as_constant()-len.as_constant()); 7944 } else { 7945 sub(Rj, i.as_register(), len); 7946 } 7947 7948 adds(t0, t0, Rlo_mn); // The pending m*n, low part 7949 7950 // As soon as we know the least significant digit of our result, 7951 // store it. 7952 // Pm_base[i-len] = t0; 7953 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7954 7955 // t0 = t1; t1 = t2; t2 = 0; 7956 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 7957 adc(t1, t2, zr); 7958 mov(t2, zr); 7959 } 7960 7961 // A carry in t0 after Montgomery multiplication means that we 7962 // should subtract multiples of n from our result in m. We'll 7963 // keep doing that until there is no carry. 
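  // In C, approximately (illustrative; the borrow propagation is done with
  // the carry flag in the generated code below):
  //
  //   while (t0) {
  //     julong borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       unsigned __int128 d = (unsigned __int128)Pm_base[i] - Pn_base[i] - borrow;
  //       Pm_base[i] = (julong)d;
  //       borrow = (julong)(d >> 64) & 1;   // 1 if the subtraction borrowed
  //     }
  //     t0 -= borrow;
  //   }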
7964 void normalize(RegisterOrConstant len) { 7965 block_comment("normalize"); 7966 // while (t0) 7967 // t0 = sub(Pm_base, Pn_base, t0, len); 7968 Label loop, post, again; 7969 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 7970 cbz(t0, post); { 7971 bind(again); { 7972 mov(i, zr); 7973 mov(cnt, len); 7974 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7975 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7976 subs(zr, zr, zr); // set carry flag, i.e. no borrow 7977 align(16); 7978 bind(loop); { 7979 sbcs(Rm, Rm, Rn); 7980 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7981 add(i, i, 1); 7982 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7983 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7984 sub(cnt, cnt, 1); 7985 } cbnz(cnt, loop); 7986 sbc(t0, t0, zr); 7987 } cbnz(t0, again); 7988 } bind(post); 7989 } 7990 7991 // Move memory at s to d, reversing words. 7992 // Increments d to end of copied memory 7993 // Destroys tmp1, tmp2 7994 // Preserves len 7995 // Leaves s pointing to the address which was in d at start 7996 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 7997 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 7998 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 7999 8000 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 8001 mov(tmp1, len); 8002 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 8003 sub(s, d, len, ext::uxtw, LogBytesPerWord); 8004 } 8005 // where 8006 void reverse1(Register d, Register s, Register tmp) { 8007 ldr(tmp, pre(s, -wordSize)); 8008 ror(tmp, tmp, 32); 8009 str(tmp, post(d, wordSize)); 8010 } 8011 8012 void step_squaring() { 8013 // An extra ACC 8014 step(); 8015 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 8016 } 8017 8018 void last_squaring(RegisterOrConstant i) { 8019 Label dont; 8020 // if ((i & 1) == 0) { 8021 tbnz(i.as_register(), 0, dont); { 8022 // MACC(Ra, Rb, t0, t1, t2); 8023 // Ra = *++Pa; 8024 // Rb = *--Pb; 8025 umulh(Rhi_ab, Ra, Rb); 8026 mul(Rlo_ab, Ra, Rb); 8027 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 8028 } bind(dont); 8029 } 8030 8031 void extra_step_squaring() { 8032 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 8033 8034 // MACC(Rm, Rn, t0, t1, t2); 8035 // Rm = *++Pm; 8036 // Rn = *--Pn; 8037 umulh(Rhi_mn, Rm, Rn); 8038 mul(Rlo_mn, Rm, Rn); 8039 ldr(Rm, pre(Pm, wordSize)); 8040 ldr(Rn, pre(Pn, -wordSize)); 8041 } 8042 8043 void post1_squaring() { 8044 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 8045 8046 // *Pm = Rm = t0 * inv; 8047 mul(Rm, t0, inv); 8048 str(Rm, Address(Pm)); 8049 8050 // MACC(Rm, Rn, t0, t1, t2); 8051 // t0 = t1; t1 = t2; t2 = 0; 8052 umulh(Rhi_mn, Rm, Rn); 8053 8054 #ifndef PRODUCT 8055 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 8056 { 8057 mul(Rlo_mn, Rm, Rn); 8058 add(Rlo_mn, t0, Rlo_mn); 8059 Label ok; 8060 cbz(Rlo_mn, ok); { 8061 stop("broken Montgomery multiply"); 8062 } bind(ok); 8063 } 8064 #endif 8065 // We have very carefully set things up so that 8066 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 8067 // the lower half of Rm * Rn because we know the result already: 8068 // it must be -t0. t0 + (-t0) must generate a carry iff 8069 // t0 != 0. So, rather than do a mul and an adds we just set 8070 // the carry flag iff t0 is nonzero. 
8071 // 8072 // mul(Rlo_mn, Rm, Rn); 8073 // adds(zr, t0, Rlo_mn); 8074 subs(zr, t0, 1); // Set carry iff t0 is nonzero 8075 adcs(t0, t1, Rhi_mn); 8076 adc(t1, t2, zr); 8077 mov(t2, zr); 8078 } 8079 8080 void acc(Register Rhi, Register Rlo, 8081 Register t0, Register t1, Register t2) { 8082 adds(t0, t0, Rlo); 8083 adcs(t1, t1, Rhi); 8084 adc(t2, t2, zr); 8085 } 8086 8087 public: 8088 /** 8089 * Fast Montgomery multiplication. The derivation of the 8090 * algorithm is in A Cryptographic Library for the Motorola 8091 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 8092 * 8093 * Arguments: 8094 * 8095 * Inputs for multiplication: 8096 * c_rarg0 - int array elements a 8097 * c_rarg1 - int array elements b 8098 * c_rarg2 - int array elements n (the modulus) 8099 * c_rarg3 - int length 8100 * c_rarg4 - int inv 8101 * c_rarg5 - int array elements m (the result) 8102 * 8103 * Inputs for squaring: 8104 * c_rarg0 - int array elements a 8105 * c_rarg1 - int array elements n (the modulus) 8106 * c_rarg2 - int length 8107 * c_rarg3 - int inv 8108 * c_rarg4 - int array elements m (the result) 8109 * 8110 */ 8111 address generate_multiply() { 8112 Label argh, nothing; 8113 bind(argh); 8114 stop("MontgomeryMultiply total_allocation must be <= 8192"); 8115 8116 align(CodeEntryAlignment); 8117 address entry = pc(); 8118 8119 cbzw(Rlen, nothing); 8120 8121 enter(); 8122 8123 // Make room. 8124 cmpw(Rlen, 512); 8125 br(Assembler::HI, argh); 8126 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 8127 andr(sp, Ra, -2 * wordSize); 8128 8129 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 8130 8131 { 8132 // Copy input args, reversing as we go. We use Ra as a 8133 // temporary variable. 8134 reverse(Ra, Pa_base, Rlen, t0, t1); 8135 if (!_squaring) 8136 reverse(Ra, Pb_base, Rlen, t0, t1); 8137 reverse(Ra, Pn_base, Rlen, t0, t1); 8138 } 8139 8140 // Push all call-saved registers and also Pm_base which we'll need 8141 // at the end. 
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        subs(zr, Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment("  for (j = i; j; j--) {"); {
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment("  for (j = len*2-i-1; j; j--) {"); {
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mov(Ra, Pm_base); // Save Pm_base in Ra
      restore_regs();   // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_multiply(julong Pa_base[], julong Pb_base[],
    //                     julong Pn_base[], julong Pm_base[],
    //                     julong inv, int len) {
    //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   julong *Pa, *Pb, *Pn, *Pm;
    //   julong Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pb_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = i;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
    //     MACC(Ra, Rb, t0, t1, t2);
    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pb_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = len*2-i-1;
    //     for (j = i-len+1; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }

    /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster.  However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mov(Ra, Pm_base); // Save Pm_base in Ra
      restore_regs();   // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(julong Pa_base[], julong Pn_base[],
    //                   julong Pm_base[], julong inv, int len) {
    //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   julong *Pa, *Pb, *Pn, *Pm;
    //   julong Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };

  void generate_vector_math_stubs() {
    // Get native vector math stub routine addresses
    void* libsleef = nullptr;
    char ebuf[1024];
    char dll_name[JVM_MAXPATHLEN];
    if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
      libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
    }
    if (libsleef == nullptr) {
      log_info(library)("Failed to load native vector math library, %s!", ebuf);
      return;
    }
    // Method naming convention
    //   All the methods are named as <OP><T><N>_<U><suffix>
    //   Where:
    //     <OP>     is the operation name, e.g. sin
    //     <T>      is optional to indicate float/double
    //              "f/d" for vector float/double operation
    //     <N>      is the number of elements in the vector
    //              "2/4" for neon, and "x" for sve
    //     <U>      is the precision level
    //              "u10/u05" represents 1.0/0.5 ULP error bounds
    //              We use "u10" for all operations by default,
    //              but for those functions that do not have u10 support, we use "u05" instead
    //     <suffix> indicates neon/sve
    //              "sve/advsimd" for sve/neon implementations
    //     e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions
    //          cosd2_u10advsimd is the method for computing a 2-element vector double cos using NEON instructions
    //
    log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));

    // Math vector stubs implemented with SVE for scalable vector size.
    if (UseSVE > 0) {
      for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
        int vop = VectorSupport::VECTOR_OP_MATH_START + op;
        // Skip "tanh" because there is performance regression
        if (vop == VectorSupport::VECTOR_OP_TANH) {
          continue;
        }

        // The native library does not support u10 level of "hypot".
        const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";

        snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
        StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);

        snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
        StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
      }
    }

    // Math vector stubs implemented with NEON for 64/128 bits vector size.
    for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
      int vop = VectorSupport::VECTOR_OP_MATH_START + op;
      // Skip "tanh" because there is performance regression
      if (vop == VectorSupport::VECTOR_OP_TANH) {
        continue;
      }

      // The native library does not support u10 level of "hypot".
      const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";

      snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
      StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);

      snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
      StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);

      snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
      StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
    }
  }

  // Initialization
  void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist in all platforms. Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Initialize table for copy memory (arraycopy) check.
    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    if (UseCRC32Intrinsics) {
      // set table address before stub generation which uses it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }
  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
  }

  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != nullptr) {
      StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
    }

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (!InlineSecondarySupersTest) {
        for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
          StubRoutines::_lookup_secondary_supers_table_stubs[slot]
            = generate_lookup_secondary_supers_table_stub(slot);
        }
      }
    }
#endif

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // arrays_hashcode stub for large arrays.
    StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
    StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
    StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
    StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
    StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

    generate_vector_math_stubs();

#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress");
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
    switch(kind) {
    case Initial_stubs:
      generate_initial_stubs();
      break;
    case Continuation_stubs:
      generate_continuation_stubs();
      break;
    case Compiler_stubs:
      generate_compiler_stubs();
      break;
    case Final_stubs:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected stubs kind: %d", kind);
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
  StubGenerator g(code, kind);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                         \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX
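
// For reference, a sketch (comment only, not compiled) of what one expansion
// of DEFAULT_ATOMIC_OP above produces.  For DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
// the preprocessor emits, approximately:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_relaxed_impl
//     = aarch64_atomic_fetch_add_4_relaxed_default_impl;
//
// i.e. each invocation declares the default implementation provided by
// atomic_aarch64.S and defines a stub pointer initialized to that default,
// which generate_atomic_entry_points() can later repoint at generated code.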