1 /* 2 * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "asm/register.hpp" 30 #include "atomic_aarch64.hpp" 31 #include "compiler/oopMap.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/barrierSetAssembler.hpp" 34 #include "gc/shared/gc_globals.hpp" 35 #include "gc/shared/tlab_globals.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "memory/universe.hpp" 38 #include "nativeInst_aarch64.hpp" 39 #include "oops/instanceOop.hpp" 40 #include "oops/method.hpp" 41 #include "oops/objArrayKlass.hpp" 42 #include "oops/oop.inline.hpp" 43 #include "prims/methodHandles.hpp" 44 #include "runtime/atomic.hpp" 45 #include "runtime/continuation.hpp" 46 #include "runtime/continuationEntry.inline.hpp" 47 #include "runtime/frame.inline.hpp" 48 #include "runtime/handles.inline.hpp" 49 #include "runtime/javaThread.hpp" 50 #include "runtime/sharedRuntime.hpp" 51 #include "runtime/stubCodeGenerator.hpp" 52 #include "runtime/stubRoutines.hpp" 53 #include "utilities/align.hpp" 54 #include "utilities/globalDefinitions.hpp" 55 #include "utilities/powerOfTwo.hpp" 56 #ifdef COMPILER2 57 #include "opto/runtime.hpp" 58 #endif 59 #if INCLUDE_ZGC 60 #include "gc/z/zThreadLocalData.hpp" 61 #endif 62 63 // Declaration and definition of StubGenerator (no .hpp file). 
64 // For a more detailed description of the stub routine structure 65 // see the comment in stubRoutines.hpp 66 67 #undef __ 68 #define __ _masm-> 69 70 #ifdef PRODUCT 71 #define BLOCK_COMMENT(str) /* nothing */ 72 #else 73 #define BLOCK_COMMENT(str) __ block_comment(str) 74 #endif 75 76 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 77 78 // Stub Code definitions 79 80 class StubGenerator: public StubCodeGenerator { 81 private: 82 83 #ifdef PRODUCT 84 #define inc_counter_np(counter) ((void)0) 85 #else 86 void inc_counter_np_(int& counter) { 87 __ lea(rscratch2, ExternalAddress((address)&counter)); 88 __ ldrw(rscratch1, Address(rscratch2)); 89 __ addw(rscratch1, rscratch1, 1); 90 __ strw(rscratch1, Address(rscratch2)); 91 } 92 #define inc_counter_np(counter) \ 93 BLOCK_COMMENT("inc_counter " #counter); \ 94 inc_counter_np_(counter); 95 #endif 96 97 // Call stubs are used to call Java from C 98 // 99 // Arguments: 100 // c_rarg0: call wrapper address address 101 // c_rarg1: result address 102 // c_rarg2: result type BasicType 103 // c_rarg3: method Method* 104 // c_rarg4: (interpreter) entry point address 105 // c_rarg5: parameters intptr_t* 106 // c_rarg6: parameter size (in words) int 107 // c_rarg7: thread Thread* 108 // 109 // There is no return from the stub itself as any Java result 110 // is written to result 111 // 112 // we save r30 (lr) as the return PC at the base of the frame and 113 // link r29 (fp) below it as the frame pointer installing sp (r31) 114 // into fp. 115 // 116 // we save r0-r7, which accounts for all the c arguments. 117 // 118 // TODO: strictly do we need to save them all? they are treated as 119 // volatile by C so could we omit saving the ones we are going to 120 // place in global registers (thread? method?) or those we only use 121 // during setup of the Java call? 122 // 123 // we don't need to save r8 which C uses as an indirect result location 124 // return register. 125 // 126 // we don't need to save r9-r15 which both C and Java treat as 127 // volatile 128 // 129 // we don't need to save r16-18 because Java does not use them 130 // 131 // we save r19-r28 which Java uses as scratch registers and C 132 // expects to be callee-save 133 // 134 // we save the bottom 64 bits of each value stored in v8-v15; it is 135 // the responsibility of the caller to preserve larger values. 136 // 137 // so the stub frame looks like this when we enter Java code 138 // 139 // [ return_from_Java ] <--- sp 140 // [ argument word n ] 141 // ... 
142 // -27 [ argument word 1 ] 143 // -26 [ saved v15 ] <--- sp_after_call 144 // -25 [ saved v14 ] 145 // -24 [ saved v13 ] 146 // -23 [ saved v12 ] 147 // -22 [ saved v11 ] 148 // -21 [ saved v10 ] 149 // -20 [ saved v9 ] 150 // -19 [ saved v8 ] 151 // -18 [ saved r28 ] 152 // -17 [ saved r27 ] 153 // -16 [ saved r26 ] 154 // -15 [ saved r25 ] 155 // -14 [ saved r24 ] 156 // -13 [ saved r23 ] 157 // -12 [ saved r22 ] 158 // -11 [ saved r21 ] 159 // -10 [ saved r20 ] 160 // -9 [ saved r19 ] 161 // -8 [ call wrapper (r0) ] 162 // -7 [ result (r1) ] 163 // -6 [ result type (r2) ] 164 // -5 [ method (r3) ] 165 // -4 [ entry point (r4) ] 166 // -3 [ parameters (r5) ] 167 // -2 [ parameter size (r6) ] 168 // -1 [ thread (r7) ] 169 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 170 // 1 [ saved lr (r30) ] 171 172 // Call stub stack layout word offsets from fp 173 enum call_stub_layout { 174 sp_after_call_off = -26, 175 176 d15_off = -26, 177 d13_off = -24, 178 d11_off = -22, 179 d9_off = -20, 180 181 r28_off = -18, 182 r26_off = -16, 183 r24_off = -14, 184 r22_off = -12, 185 r20_off = -10, 186 call_wrapper_off = -8, 187 result_off = -7, 188 result_type_off = -6, 189 method_off = -5, 190 entry_point_off = -4, 191 parameter_size_off = -2, 192 thread_off = -1, 193 fp_f = 0, 194 retaddr_off = 1, 195 }; 196 197 address generate_call_stub(address& return_address) { 198 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 199 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 200 "adjust this code"); 201 202 StubCodeMark mark(this, "StubRoutines", "call_stub"); 203 address start = __ pc(); 204 205 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 206 207 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 208 const Address result (rfp, result_off * wordSize); 209 const Address result_type (rfp, result_type_off * wordSize); 210 const Address method (rfp, method_off * wordSize); 211 const Address entry_point (rfp, entry_point_off * wordSize); 212 const Address parameter_size(rfp, parameter_size_off * wordSize); 213 214 const Address thread (rfp, thread_off * wordSize); 215 216 const Address d15_save (rfp, d15_off * wordSize); 217 const Address d13_save (rfp, d13_off * wordSize); 218 const Address d11_save (rfp, d11_off * wordSize); 219 const Address d9_save (rfp, d9_off * wordSize); 220 221 const Address r28_save (rfp, r28_off * wordSize); 222 const Address r26_save (rfp, r26_off * wordSize); 223 const Address r24_save (rfp, r24_off * wordSize); 224 const Address r22_save (rfp, r22_off * wordSize); 225 const Address r20_save (rfp, r20_off * wordSize); 226 227 // stub code 228 229 address aarch64_entry = __ pc(); 230 231 // set up frame and move sp to end of save area 232 __ enter(); 233 __ sub(sp, rfp, -sp_after_call_off * wordSize); 234 235 // save register parameters and Java scratch/global registers 236 // n.b. 
    // we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7, thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9, v8, d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // install Java thread in global register now that we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_PRIMITIVE_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    // All of j_rargN may be used to return inline type fields so be careful
    // not to clobber those.
    // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
    // assignment of Rresult below.
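    // Illustrative sketch only (not generated code): the dispatch emitted
    // below is roughly equivalent to the following C++, where 'result' is the
    // caller-supplied result slot and r0/j_farg0 hold the Java return value.
    //
    //   switch (result_type) {
    //     case T_OBJECT:                           // may route via the pack handler
    //     case T_PRIMITIVE_OBJECT:                 // (scalarized inline types)
    //     case T_LONG:   *(jlong*)result   = r0;        break;
    //     case T_FLOAT:  *(jfloat*)result  = j_farg0;   break;
    //     case T_DOUBLE: *(jdouble*)result = j_farg0;   break;
    //     default:       *(jint*)result    = (jint)r0;  break; // treated as T_INT
    //   }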
322 Register Rresult = r14, Rresult_type = r15; 323 __ ldr(Rresult, result); 324 Label is_long, is_float, is_double, check_prim, exit; 325 __ ldr(Rresult_type, result_type); 326 __ cmp(Rresult_type, (u1)T_OBJECT); 327 __ br(Assembler::EQ, check_prim); 328 __ cmp(Rresult_type, (u1)T_PRIMITIVE_OBJECT); 329 __ br(Assembler::EQ, check_prim); 330 __ cmp(Rresult_type, (u1)T_LONG); 331 __ br(Assembler::EQ, is_long); 332 __ cmp(Rresult_type, (u1)T_FLOAT); 333 __ br(Assembler::EQ, is_float); 334 __ cmp(Rresult_type, (u1)T_DOUBLE); 335 __ br(Assembler::EQ, is_double); 336 337 // handle T_INT case 338 __ strw(r0, Address(Rresult)); 339 340 __ BIND(exit); 341 342 // pop parameters 343 __ sub(esp, rfp, -sp_after_call_off * wordSize); 344 345 #ifdef ASSERT 346 // verify that threads correspond 347 { 348 Label L, S; 349 __ ldr(rscratch1, thread); 350 __ cmp(rthread, rscratch1); 351 __ br(Assembler::NE, S); 352 __ get_thread(rscratch1); 353 __ cmp(rthread, rscratch1); 354 __ br(Assembler::EQ, L); 355 __ BIND(S); 356 __ stop("StubRoutines::call_stub: threads must correspond"); 357 __ BIND(L); 358 } 359 #endif 360 361 __ pop_cont_fastpath(rthread); 362 363 // restore callee-save registers 364 __ ldpd(v15, v14, d15_save); 365 __ ldpd(v13, v12, d13_save); 366 __ ldpd(v11, v10, d11_save); 367 __ ldpd(v9, v8, d9_save); 368 369 __ ldp(r28, r27, r28_save); 370 __ ldp(r26, r25, r26_save); 371 __ ldp(r24, r23, r24_save); 372 __ ldp(r22, r21, r22_save); 373 __ ldp(r20, r19, r20_save); 374 375 __ ldp(c_rarg0, c_rarg1, call_wrapper); 376 __ ldrw(c_rarg2, result_type); 377 __ ldr(c_rarg3, method); 378 __ ldp(c_rarg4, c_rarg5, entry_point); 379 __ ldp(c_rarg6, c_rarg7, parameter_size); 380 381 // leave frame and return to caller 382 __ leave(); 383 __ ret(lr); 384 385 // handle return types different from T_INT 386 __ BIND(check_prim); 387 if (InlineTypeReturnedAsFields) { 388 // Check for scalarized return value 389 __ tbz(r0, 0, is_long); 390 // Load pack handler address 391 __ andr(rscratch1, r0, -2); 392 __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset())); 393 __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset())); 394 __ blr(rscratch1); 395 __ b(exit); 396 } 397 398 __ BIND(is_long); 399 __ str(r0, Address(Rresult, 0)); 400 __ br(Assembler::AL, exit); 401 402 __ BIND(is_float); 403 __ strs(j_farg0, Address(Rresult, 0)); 404 __ br(Assembler::AL, exit); 405 406 __ BIND(is_double); 407 __ strd(j_farg0, Address(Rresult, 0)); 408 __ br(Assembler::AL, exit); 409 410 return start; 411 } 412 413 // Return point for a Java call if there's an exception thrown in 414 // Java code. The exception is caught and transformed into a 415 // pending exception stored in JavaThread that can be tested from 416 // within the VM. 417 // 418 // Note: Usually the parameters are removed by the callee. In case 419 // of an exception crossing an activation frame boundary, that is 420 // not the case if the callee is compiled code => need to setup the 421 // rsp. 
422 // 423 // r0: exception oop 424 425 address generate_catch_exception() { 426 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 427 address start = __ pc(); 428 429 // same as in generate_call_stub(): 430 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 431 const Address thread (rfp, thread_off * wordSize); 432 433 #ifdef ASSERT 434 // verify that threads correspond 435 { 436 Label L, S; 437 __ ldr(rscratch1, thread); 438 __ cmp(rthread, rscratch1); 439 __ br(Assembler::NE, S); 440 __ get_thread(rscratch1); 441 __ cmp(rthread, rscratch1); 442 __ br(Assembler::EQ, L); 443 __ bind(S); 444 __ stop("StubRoutines::catch_exception: threads must correspond"); 445 __ bind(L); 446 } 447 #endif 448 449 // set pending exception 450 __ verify_oop(r0); 451 452 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 453 __ mov(rscratch1, (address)__FILE__); 454 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 455 __ movw(rscratch1, (int)__LINE__); 456 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 457 458 // complete return to VM 459 assert(StubRoutines::_call_stub_return_address != nullptr, 460 "_call_stub_return_address must have been generated before"); 461 __ b(StubRoutines::_call_stub_return_address); 462 463 return start; 464 } 465 466 // Continuation point for runtime calls returning with a pending 467 // exception. The pending exception check happened in the runtime 468 // or native call stub. The pending exception in Thread is 469 // converted into a Java-level exception. 470 // 471 // Contract with Java-level exception handlers: 472 // r0: exception 473 // r3: throwing pc 474 // 475 // NOTE: At entry of this stub, exception-pc must be in LR !! 476 477 // NOTE: this is always used as a jump target within generated code 478 // so it just needs to be generated code with no x86 prolog 479 480 address generate_forward_exception() { 481 StubCodeMark mark(this, "StubRoutines", "forward exception"); 482 address start = __ pc(); 483 484 // Upon entry, LR points to the return address returning into 485 // Java (interpreted or compiled) code; i.e., the return address 486 // becomes the throwing pc. 487 // 488 // Arguments pushed before the runtime call are still on the stack 489 // but the exception handler will reset the stack pointer -> 490 // ignore them. A potential result in registers can be ignored as 491 // well. 492 493 #ifdef ASSERT 494 // make sure this code is only executed if there is a pending exception 495 { 496 Label L; 497 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 498 __ cbnz(rscratch1, L); 499 __ stop("StubRoutines::forward exception: no pending exception (1)"); 500 __ bind(L); 501 } 502 #endif 503 504 // compute exception handler into r19 505 506 // call the VM to find the handler address associated with the 507 // caller address. pass thread in r0 and caller pc (ret address) 508 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 509 // the stack. 510 __ mov(c_rarg1, lr); 511 // lr will be trashed by the VM call so we move it to R19 512 // (callee-saved) because we also need to pass it to the handler 513 // returned by this call. 
514 __ mov(r19, lr); 515 BLOCK_COMMENT("call exception_handler_for_return_address"); 516 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 517 SharedRuntime::exception_handler_for_return_address), 518 rthread, c_rarg1); 519 // Reinitialize the ptrue predicate register, in case the external runtime 520 // call clobbers ptrue reg, as we may return to SVE compiled code. 521 __ reinitialize_ptrue(); 522 523 // we should not really care that lr is no longer the callee 524 // address. we saved the value the handler needs in r19 so we can 525 // just copy it to r3. however, the C2 handler will push its own 526 // frame and then calls into the VM and the VM code asserts that 527 // the PC for the frame above the handler belongs to a compiled 528 // Java method. So, we restore lr here to satisfy that assert. 529 __ mov(lr, r19); 530 // setup r0 & r3 & clear pending exception 531 __ mov(r3, r19); 532 __ mov(r19, r0); 533 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 534 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 535 536 #ifdef ASSERT 537 // make sure exception is set 538 { 539 Label L; 540 __ cbnz(r0, L); 541 __ stop("StubRoutines::forward exception: no pending exception (2)"); 542 __ bind(L); 543 } 544 #endif 545 546 // continue at exception handler 547 // r0: exception 548 // r3: throwing pc 549 // r19: exception handler 550 __ verify_oop(r0); 551 __ br(r19); 552 553 return start; 554 } 555 556 // Non-destructive plausibility checks for oops 557 // 558 // Arguments: 559 // r0: oop to verify 560 // rscratch1: error message 561 // 562 // Stack after saving c_rarg3: 563 // [tos + 0]: saved c_rarg3 564 // [tos + 1]: saved c_rarg2 565 // [tos + 2]: saved lr 566 // [tos + 3]: saved rscratch2 567 // [tos + 4]: saved r0 568 // [tos + 5]: saved rscratch1 569 address generate_verify_oop() { 570 571 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 572 address start = __ pc(); 573 574 Label exit, error; 575 576 // save c_rarg2 and c_rarg3 577 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 578 579 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 580 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 581 __ ldr(c_rarg3, Address(c_rarg2)); 582 __ add(c_rarg3, c_rarg3, 1); 583 __ str(c_rarg3, Address(c_rarg2)); 584 585 // object is in r0 586 // make sure object is 'reasonable' 587 __ cbz(r0, exit); // if obj is null it is OK 588 589 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 590 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 591 592 // return if everything seems ok 593 __ bind(exit); 594 595 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 596 __ ret(lr); 597 598 // handle errors 599 __ bind(error); 600 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 601 602 __ push(RegSet::range(r0, r29), sp); 603 // debug(char* msg, int64_t pc, int64_t regs[]) 604 __ mov(c_rarg0, rscratch1); // pass address of error message 605 __ mov(c_rarg1, lr); // pass return address 606 __ mov(c_rarg2, sp); // pass address of regs on stack 607 #ifndef PRODUCT 608 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 609 #endif 610 BLOCK_COMMENT("call MacroAssembler::debug"); 611 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 612 __ blr(rscratch1); 613 __ hlt(0); 614 615 return start; 616 } 617 618 // Generate indices for iota vector. 
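  // The table below packs ascending lane indices for each element size
  // (B, H, S, D) plus float and double variants, 16 bytes per entry,
  // little-endian. Illustrative sketch only (not generated code): the byte
  // entry is equivalent to
  //
  //   jbyte iota_b[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  //
  // emitted as the two 64-bit literals 0x0706050403020100 and 0x0F0E0D0C0B0A0908.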
619 address generate_iota_indices(const char *stub_name) { 620 __ align(CodeEntryAlignment); 621 StubCodeMark mark(this, "StubRoutines", stub_name); 622 address start = __ pc(); 623 // B 624 __ emit_data64(0x0706050403020100, relocInfo::none); 625 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 626 // H 627 __ emit_data64(0x0003000200010000, relocInfo::none); 628 __ emit_data64(0x0007000600050004, relocInfo::none); 629 // S 630 __ emit_data64(0x0000000100000000, relocInfo::none); 631 __ emit_data64(0x0000000300000002, relocInfo::none); 632 // D 633 __ emit_data64(0x0000000000000000, relocInfo::none); 634 __ emit_data64(0x0000000000000001, relocInfo::none); 635 // S - FP 636 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 637 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 638 // D - FP 639 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 640 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 641 return start; 642 } 643 644 // The inner part of zero_words(). This is the bulk operation, 645 // zeroing words in blocks, possibly using DC ZVA to do it. The 646 // caller is responsible for zeroing the last few words. 647 // 648 // Inputs: 649 // r10: the HeapWord-aligned base address of an array to zero. 650 // r11: the count in HeapWords, r11 > 0. 651 // 652 // Returns r10 and r11, adjusted for the caller to clear. 653 // r10: the base address of the tail of words left to clear. 654 // r11: the number of words in the tail. 655 // r11 < MacroAssembler::zero_words_block_size. 656 657 address generate_zero_blocks() { 658 Label done; 659 Label base_aligned; 660 661 Register base = r10, cnt = r11; 662 663 __ align(CodeEntryAlignment); 664 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 665 address start = __ pc(); 666 667 if (UseBlockZeroing) { 668 int zva_length = VM_Version::zva_length(); 669 670 // Ensure ZVA length can be divided by 16. This is required by 671 // the subsequent operations. 672 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 673 674 __ tbz(base, 3, base_aligned); 675 __ str(zr, Address(__ post(base, 8))); 676 __ sub(cnt, cnt, 1); 677 __ bind(base_aligned); 678 679 // Ensure count >= zva_length * 2 so that it still deserves a zva after 680 // alignment. 681 Label small; 682 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 683 __ subs(rscratch1, cnt, low_limit >> 3); 684 __ br(Assembler::LT, small); 685 __ zero_dcache_blocks(base, cnt); 686 __ bind(small); 687 } 688 689 { 690 // Number of stp instructions we'll unroll 691 const int unroll = 692 MacroAssembler::zero_words_block_size / 2; 693 // Clear the remaining blocks. 694 Label loop; 695 __ subs(cnt, cnt, unroll * 2); 696 __ br(Assembler::LT, done); 697 __ bind(loop); 698 for (int i = 0; i < unroll; i++) 699 __ stp(zr, zr, __ post(base, 16)); 700 __ subs(cnt, cnt, unroll * 2); 701 __ br(Assembler::GE, loop); 702 __ bind(done); 703 __ add(cnt, cnt, unroll * 2); 704 } 705 706 __ ret(lr); 707 708 return start; 709 } 710 711 712 typedef enum { 713 copy_forwards = 1, 714 copy_backwards = -1 715 } copy_direction; 716 717 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 718 // for arraycopy stubs. 
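  // Minimal usage sketch (illustrative only, mirroring how the arraycopy
  // stubs below use it): construct one helper per stub with that stub's
  // decorators and element type, then route loads and stores through it so
  // the GC's BarrierSetAssembler can instrument each access, e.g.
  //
  //   ArrayCopyBarrierSetHelper bs(_masm, decorators, type,
  //                                gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  //   bs.copy_load_at_16(t0, t1, Address(s, 0));
  //   bs.copy_store_at_16(Address(d, 0), t0, t1);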
719 class ArrayCopyBarrierSetHelper : StackObj { 720 BarrierSetAssembler* _bs_asm; 721 MacroAssembler* _masm; 722 DecoratorSet _decorators; 723 BasicType _type; 724 Register _gct1; 725 Register _gct2; 726 Register _gct3; 727 FloatRegister _gcvt1; 728 FloatRegister _gcvt2; 729 FloatRegister _gcvt3; 730 731 public: 732 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 733 DecoratorSet decorators, 734 BasicType type, 735 Register gct1, 736 Register gct2, 737 Register gct3, 738 FloatRegister gcvt1, 739 FloatRegister gcvt2, 740 FloatRegister gcvt3) 741 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 742 _masm(masm), 743 _decorators(decorators), 744 _type(type), 745 _gct1(gct1), 746 _gct2(gct2), 747 _gct3(gct3), 748 _gcvt1(gcvt1), 749 _gcvt2(gcvt2), 750 _gcvt3(gcvt3) { 751 } 752 753 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 754 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 755 dst1, dst2, src, 756 _gct1, _gct2, _gcvt1); 757 } 758 759 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 760 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 761 dst, src1, src2, 762 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 763 } 764 765 void copy_load_at_16(Register dst1, Register dst2, Address src) { 766 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 767 dst1, dst2, src, 768 _gct1); 769 } 770 771 void copy_store_at_16(Address dst, Register src1, Register src2) { 772 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 773 dst, src1, src2, 774 _gct1, _gct2, _gct3); 775 } 776 777 void copy_load_at_8(Register dst, Address src) { 778 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 779 dst, noreg, src, 780 _gct1); 781 } 782 783 void copy_store_at_8(Address dst, Register src) { 784 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 785 dst, src, noreg, 786 _gct1, _gct2, _gct3); 787 } 788 }; 789 790 // Bulk copy of blocks of 8 words. 791 // 792 // count is a count of words. 793 // 794 // Precondition: count >= 8 795 // 796 // Postconditions: 797 // 798 // The least significant bit of count contains the remaining count 799 // of words to copy. The rest of count is trash. 800 // 801 // s and d are adjusted to point to the remaining words to copy 802 // 803 void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count, 804 copy_direction direction) { 805 int unit = wordSize * direction; 806 int bias = (UseSIMDForMemoryOps ? 
                                     4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ?
              Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
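      // Worked example (forward copy, unit == 1 word): after the adjustments
      // below (s -= 16 bytes, d -= 8 bytes) the first 64-byte block is read
      // from the adjusted s at word offsets {2, 4, 6, 8} (the original
      // {0, 2, 4, 6}) and written to the adjusted d at word offsets
      // {1, 2, 4, 6, 8} (the original {0, 1, 3, 5, 7}): one str, three stp,
      // then one str, matching the offset lists above.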
960 961 if (direction == copy_forwards) { 962 __ sub(s, s, 16); 963 __ sub(d, d, 8); 964 } 965 966 // Fill 8 registers 967 // 968 // for forwards copy s was offset by -16 from the original input 969 // value of s so the register contents are at these offsets 970 // relative to the 64 bit block addressed by that original input 971 // and so on for each successive 64 byte block when s is updated 972 // 973 // t0 at offset 0, t1 at offset 8 974 // t2 at offset 16, t3 at offset 24 975 // t4 at offset 32, t5 at offset 40 976 // t6 at offset 48, t7 at offset 56 977 978 // for backwards copy s was not offset so the register contents 979 // are at these offsets into the preceding 64 byte block 980 // relative to that original input and so on for each successive 981 // preceding 64 byte block when s is updated. this explains the 982 // slightly counter-intuitive looking pattern of register usage 983 // in the stp instructions for backwards copy. 984 // 985 // t0 at offset -16, t1 at offset -8 986 // t2 at offset -32, t3 at offset -24 987 // t4 at offset -48, t5 at offset -40 988 // t6 at offset -64, t7 at offset -56 989 990 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 991 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 992 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 993 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 994 995 __ subs(count, count, 16); 996 __ br(Assembler::LO, drain); 997 998 int prefetch = PrefetchCopyIntervalInBytes; 999 bool use_stride = false; 1000 if (direction == copy_backwards) { 1001 use_stride = prefetch > 256; 1002 prefetch = -prefetch; 1003 if (use_stride) __ mov(stride, prefetch); 1004 } 1005 1006 __ bind(again); 1007 1008 if (PrefetchCopyIntervalInBytes > 0) 1009 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1010 1011 if (direction == copy_forwards) { 1012 // allowing for the offset of -8 the store instructions place 1013 // registers into the target 64 bit block at the following 1014 // offsets 1015 // 1016 // t0 at offset 0 1017 // t1 at offset 8, t2 at offset 16 1018 // t3 at offset 24, t4 at offset 32 1019 // t5 at offset 40, t6 at offset 48 1020 // t7 at offset 56 1021 1022 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1023 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1024 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1025 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1026 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1027 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1028 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1029 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1030 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1031 } else { 1032 // d was not offset when we started so the registers are 1033 // written into the 64 bit block preceding d with the following 1034 // offsets 1035 // 1036 // t1 at offset -8 1037 // t3 at offset -24, t0 at offset -16 1038 // t5 at offset -48, t2 at offset -32 1039 // t7 at offset -56, t4 at offset -48 1040 // t6 at offset -64 1041 // 1042 // note that this matches the offsets previously noted for the 1043 // loads 1044 1045 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1046 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1047 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1048 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1049 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1050 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1051 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1052 bs.copy_store_at_8(Address(__ 
                           pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }
      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ?
-1 : 1; 1139 1140 Label Lword, Lint, Lshort, Lbyte; 1141 1142 assert(granularity 1143 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1144 1145 const Register t0 = r3; 1146 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1147 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg); 1148 1149 // ??? I don't know if this bit-test-and-branch is the right thing 1150 // to do. It does a lot of jumping, resulting in several 1151 // mispredicted branches. It might make more sense to do this 1152 // with something like Duff's device with a single computed branch. 1153 1154 __ tbz(count, 3 - exact_log2(granularity), Lword); 1155 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1156 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1157 __ bind(Lword); 1158 1159 if (granularity <= sizeof (jint)) { 1160 __ tbz(count, 2 - exact_log2(granularity), Lint); 1161 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1162 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1163 __ bind(Lint); 1164 } 1165 1166 if (granularity <= sizeof (jshort)) { 1167 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1168 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1169 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1170 __ bind(Lshort); 1171 } 1172 1173 if (granularity <= sizeof (jbyte)) { 1174 __ tbz(count, 0, Lbyte); 1175 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1176 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1177 __ bind(Lbyte); 1178 } 1179 } 1180 1181 Label copy_f, copy_b; 1182 Label copy_obj_f, copy_obj_b; 1183 Label copy_obj_uninit_f, copy_obj_uninit_b; 1184 1185 // All-singing all-dancing memory copy. 1186 // 1187 // Copy count units of memory from s to d. The size of a unit is 1188 // step, which can be positive or negative depending on the direction 1189 // of copy. If is_aligned is false, we align the source address. 1190 // 1191 1192 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1193 Register s, Register d, Register count, int step) { 1194 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1195 bool is_backwards = step < 0; 1196 unsigned int granularity = uabs(step); 1197 const Register t0 = r3, t1 = r4; 1198 1199 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1200 // load all the data before writing anything 1201 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1202 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1203 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1204 const Register send = r17, dend = r16; 1205 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1206 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1207 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1208 1209 if (PrefetchCopyIntervalInBytes > 0) 1210 __ prfm(Address(s, 0), PLDL1KEEP); 1211 __ cmp(count, u1((UseSIMDForMemoryOps ? 
                            96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)): jint array pointers are at least
      // 4 byte aligned and jlong array pointers are 8 byte aligned.
      // The largest performance drop has been seen for the range 65..80 bytes.
      // For such cases, using a ldp/stp pair instead of a third
      // ldpq/stpq pair fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the
          // same byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now that we've got the small case out of the way, we can align the
    // source address on a 2-word boundary.

    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // cannot be used as a temp register, as it contains the count.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
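      // Illustrative example (not generated code): with wordSize == 8, if s
      // ends in 0x8 the tbz below falls through, one word is copied and s
      // becomes 16-byte aligned; if bit 3 of s is already clear we branch
      // straight to 'aligned'. count is reduced by wordSize/granularity
      // elements to account for the word copied here.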
1367 __ tbz(s, exact_log2(wordSize), aligned); 1368 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1369 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1370 __ sub(count, count, wordSize/granularity); 1371 } else { 1372 if (is_backwards) { 1373 __ andr(r15, s, 2 * wordSize - 1); 1374 } else { 1375 __ neg(r15, s); 1376 __ andr(r15, r15, 2 * wordSize - 1); 1377 } 1378 // r15 is the byte adjustment needed to align s. 1379 __ cbz(r15, aligned); 1380 int shift = exact_log2(granularity); 1381 if (shift) __ lsr(r15, r15, shift); 1382 __ sub(count, count, r15); 1383 1384 #if 0 1385 // ?? This code is only correct for a disjoint copy. It may or 1386 // may not make sense to use it in that case. 1387 1388 // Copy the first pair; s and d may not be aligned. 1389 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1390 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1391 1392 // Align s and d, adjust count 1393 if (is_backwards) { 1394 __ sub(s, s, r15); 1395 __ sub(d, d, r15); 1396 } else { 1397 __ add(s, s, r15); 1398 __ add(d, d, r15); 1399 } 1400 #else 1401 copy_memory_small(decorators, type, s, d, r15, step); 1402 #endif 1403 } 1404 1405 __ bind(aligned); 1406 1407 // s is now 2-word-aligned. 1408 1409 // We have a count of units and some trailing bytes. Adjust the 1410 // count and do a bulk copy of words. 1411 __ lsr(r15, count, exact_log2(wordSize/granularity)); 1412 if (direction == copy_forwards) { 1413 if (type != T_OBJECT) { 1414 __ bl(copy_f); 1415 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1416 __ bl(copy_obj_uninit_f); 1417 } else { 1418 __ bl(copy_obj_f); 1419 } 1420 } else { 1421 if (type != T_OBJECT) { 1422 __ bl(copy_b); 1423 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1424 __ bl(copy_obj_uninit_b); 1425 } else { 1426 __ bl(copy_obj_b); 1427 } 1428 } 1429 1430 // And the tail. 1431 copy_memory_small(decorators, type, s, d, count, step); 1432 1433 if (granularity >= 8) __ bind(copy8); 1434 if (granularity >= 4) __ bind(copy4); 1435 __ bind(finish); 1436 } 1437 1438 1439 void clobber_registers() { 1440 #ifdef ASSERT 1441 RegSet clobbered 1442 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1443 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1444 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1445 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1446 __ mov(*it, rscratch1); 1447 } 1448 #endif 1449 1450 } 1451 1452 // Scan over array at a for count oops, verifying each one. 1453 // Preserves a and count, clobbers rscratch1 and rscratch2. 
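  // Roughly equivalent pseudo-C++ (illustrative sketch only; 'size' is the
  // element size in bytes, and narrow oops are decoded before verification):
  //
  //   for (uint64_t i = 0; i < count; i++) {
  //     oop o = (size == wordSize) ? ((oop*)a)[i]
  //                                : decode_heap_oop(((narrowOop*)a)[i]);
  //     verify_oop(o);
  //   }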
1454 void verify_oop_array (int size, Register a, Register count, Register temp) { 1455 Label loop, end; 1456 __ mov(rscratch1, a); 1457 __ mov(rscratch2, zr); 1458 __ bind(loop); 1459 __ cmp(rscratch2, count); 1460 __ br(Assembler::HS, end); 1461 if (size == wordSize) { 1462 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1463 __ verify_oop(temp); 1464 } else { 1465 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1466 __ decode_heap_oop(temp); // calls verify_oop 1467 } 1468 __ add(rscratch2, rscratch2, 1); 1469 __ b(loop); 1470 __ bind(end); 1471 } 1472 1473 // Arguments: 1474 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1475 // ignored 1476 // is_oop - true => oop array, so generate store check code 1477 // name - stub name string 1478 // 1479 // Inputs: 1480 // c_rarg0 - source array address 1481 // c_rarg1 - destination array address 1482 // c_rarg2 - element count, treated as ssize_t, can be zero 1483 // 1484 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1485 // the hardware handle it. The two dwords within qwords that span 1486 // cache line boundaries will still be loaded and stored atomically. 1487 // 1488 // Side Effects: 1489 // disjoint_int_copy_entry is set to the no-overlap entry point 1490 // used by generate_conjoint_int_oop_copy(). 1491 // 1492 address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry, 1493 const char *name, bool dest_uninitialized = false) { 1494 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1495 RegSet saved_reg = RegSet::of(s, d, count); 1496 __ align(CodeEntryAlignment); 1497 StubCodeMark mark(this, "StubRoutines", name); 1498 address start = __ pc(); 1499 __ enter(); 1500 1501 if (entry != nullptr) { 1502 *entry = __ pc(); 1503 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1504 BLOCK_COMMENT("Entry:"); 1505 } 1506 1507 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1508 if (dest_uninitialized) { 1509 decorators |= IS_DEST_UNINITIALIZED; 1510 } 1511 if (aligned) { 1512 decorators |= ARRAYCOPY_ALIGNED; 1513 } 1514 1515 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1516 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1517 1518 if (is_oop) { 1519 // save regs before copy_memory 1520 __ push(RegSet::of(d, count), sp); 1521 } 1522 { 1523 // UnsafeCopyMemory page error: continue after ucm 1524 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1525 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1526 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1527 } 1528 1529 if (is_oop) { 1530 __ pop(RegSet::of(d, count), sp); 1531 if (VerifyOops) 1532 verify_oop_array(size, d, count, r16); 1533 } 1534 1535 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1536 1537 __ leave(); 1538 __ mov(r0, zr); // return 0 1539 __ ret(lr); 1540 return start; 1541 } 1542 1543 // Arguments: 1544 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1545 // ignored 1546 // is_oop - true => oop array, so generate store check code 1547 // name - stub name string 1548 // 1549 // Inputs: 1550 // c_rarg0 - source array address 1551 // c_rarg1 - destination array address 1552 // c_rarg2 - element count, treated as ssize_t, can be zero 1553 // 1554 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1555 // the hardware handle it. 
  // The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it. The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
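  // The byte/short/int/long/oop entry points that follow are thin wrappers
  // which funnel into generate_disjoint_copy()/generate_conjoint_copy() with
  // the element size baked in, e.g. (matching the code below)
  // generate_disjoint_byte_copy() -> generate_disjoint_copy(sizeof (jbyte), ...).
  // A conjoint stub branches to its disjoint (no-overlap) counterpart when
  // (d - s), compared unsigned, is at least count * size. Worked example
  // (hypothetical addresses): copying 10 jints with s == 0x1000, d == 0x1028
  // gives d - s == 0x28 == 40 == count * size, so the forward copy is taken;
  // with d == 0x1010 the difference is 16 < 40 and the backwards copy runs.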
1636 // 1637 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1638 const bool not_oop = false; 1639 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1640 } 1641 1642 // Arguments: 1643 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1644 // ignored 1645 // name - stub name string 1646 // 1647 // Inputs: 1648 // c_rarg0 - source array address 1649 // c_rarg1 - destination array address 1650 // c_rarg2 - element count, treated as ssize_t, can be zero 1651 // 1652 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1653 // we let the hardware handle it. The one to eight bytes within words, 1654 // dwords or qwords that span cache line boundaries will still be loaded 1655 // and stored atomically. 1656 // 1657 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1658 address* entry, const char *name) { 1659 const bool not_oop = false; 1660 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1661 } 1662 1663 // Arguments: 1664 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1665 // ignored 1666 // name - stub name string 1667 // 1668 // Inputs: 1669 // c_rarg0 - source array address 1670 // c_rarg1 - destination array address 1671 // c_rarg2 - element count, treated as ssize_t, can be zero 1672 // 1673 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1674 // let the hardware handle it. The two or four words within dwords 1675 // or qwords that span cache line boundaries will still be loaded 1676 // and stored atomically. 1677 // 1678 // Side Effects: 1679 // disjoint_short_copy_entry is set to the no-overlap entry point 1680 // used by generate_conjoint_short_copy(). 1681 // 1682 address generate_disjoint_short_copy(bool aligned, 1683 address* entry, const char *name) { 1684 const bool not_oop = false; 1685 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1686 } 1687 1688 // Arguments: 1689 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1690 // ignored 1691 // name - stub name string 1692 // 1693 // Inputs: 1694 // c_rarg0 - source array address 1695 // c_rarg1 - destination array address 1696 // c_rarg2 - element count, treated as ssize_t, can be zero 1697 // 1698 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1699 // let the hardware handle it. The two or four words within dwords 1700 // or qwords that span cache line boundaries will still be loaded 1701 // and stored atomically. 1702 // 1703 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1704 address *entry, const char *name) { 1705 const bool not_oop = false; 1706 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1707 1708 } 1709 // Arguments: 1710 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1711 // ignored 1712 // name - stub name string 1713 // 1714 // Inputs: 1715 // c_rarg0 - source array address 1716 // c_rarg1 - destination array address 1717 // c_rarg2 - element count, treated as ssize_t, can be zero 1718 // 1719 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1720 // the hardware handle it. The two dwords within qwords that span 1721 // cache line boundaries will still be loaded and stored atomically. 
1722 // 1723 // Side Effects: 1724 // disjoint_int_copy_entry is set to the no-overlap entry point 1725 // used by generate_conjoint_int_oop_copy(). 1726 // 1727 address generate_disjoint_int_copy(bool aligned, address *entry, 1728 const char *name, bool dest_uninitialized = false) { 1729 const bool not_oop = false; 1730 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1731 } 1732 1733 // Arguments: 1734 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1735 // ignored 1736 // name - stub name string 1737 // 1738 // Inputs: 1739 // c_rarg0 - source array address 1740 // c_rarg1 - destination array address 1741 // c_rarg2 - element count, treated as ssize_t, can be zero 1742 // 1743 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1744 // the hardware handle it. The two dwords within qwords that span 1745 // cache line boundaries will still be loaded and stored atomically. 1746 // 1747 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1748 address *entry, const char *name, 1749 bool dest_uninitialized = false) { 1750 const bool not_oop = false; 1751 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1752 } 1753 1754 1755 // Arguments: 1756 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1757 // ignored 1758 // name - stub name string 1759 // 1760 // Inputs: 1761 // c_rarg0 - source array address 1762 // c_rarg1 - destination array address 1763 // c_rarg2 - element count, treated as size_t, can be zero 1764 // 1765 // Side Effects: 1766 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1767 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1768 // 1769 address generate_disjoint_long_copy(bool aligned, address *entry, 1770 const char *name, bool dest_uninitialized = false) { 1771 const bool not_oop = false; 1772 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1773 } 1774 1775 // Arguments: 1776 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1777 // ignored 1778 // name - stub name string 1779 // 1780 // Inputs: 1781 // c_rarg0 - source array address 1782 // c_rarg1 - destination array address 1783 // c_rarg2 - element count, treated as size_t, can be zero 1784 // 1785 address generate_conjoint_long_copy(bool aligned, 1786 address nooverlap_target, address *entry, 1787 const char *name, bool dest_uninitialized = false) { 1788 const bool not_oop = false; 1789 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1790 } 1791 1792 // Arguments: 1793 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1794 // ignored 1795 // name - stub name string 1796 // 1797 // Inputs: 1798 // c_rarg0 - source array address 1799 // c_rarg1 - destination array address 1800 // c_rarg2 - element count, treated as size_t, can be zero 1801 // 1802 // Side Effects: 1803 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1804 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1805 // 1806 address generate_disjoint_oop_copy(bool aligned, address *entry, 1807 const char *name, bool dest_uninitialized) { 1808 const bool is_oop = true; 1809 const int size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1810 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1811 } 1812 1813 // Arguments: 1814 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1815 // ignored 1816 // name - stub name string 1817 // 1818 // Inputs: 1819 // c_rarg0 - source array address 1820 // c_rarg1 - destination array address 1821 // c_rarg2 - element count, treated as size_t, can be zero 1822 // 1823 address generate_conjoint_oop_copy(bool aligned, 1824 address nooverlap_target, address *entry, 1825 const char *name, bool dest_uninitialized) { 1826 const bool is_oop = true; 1827 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1828 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1829 name, dest_uninitialized); 1830 } 1831 1832 1833 // Helper for generating a dynamic type check. 1834 // Smashes rscratch1, rscratch2. 1835 void generate_type_check(Register sub_klass, 1836 Register super_check_offset, 1837 Register super_klass, 1838 Label& L_success) { 1839 assert_different_registers(sub_klass, super_check_offset, super_klass); 1840 1841 BLOCK_COMMENT("type_check:"); 1842 1843 Label L_miss; 1844 1845 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1846 super_check_offset); 1847 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr); 1848 1849 // Fall through on failure! 1850 __ BIND(L_miss); 1851 } 1852 1853 // 1854 // Generate checkcasting array copy stub 1855 // 1856 // Input: 1857 // c_rarg0 - source array address 1858 // c_rarg1 - destination array address 1859 // c_rarg2 - element count, treated as ssize_t, can be zero 1860 // c_rarg3 - size_t ckoff (super_check_offset) 1861 // c_rarg4 - oop ckval (super_klass) 1862 // 1863 // Output: 1864 // r0 == 0 - success 1865 // r0 == -1^K - failure, where K is partial transfer count 1866 // 1867 address generate_checkcast_copy(const char *name, address *entry, 1868 bool dest_uninitialized = false) { 1869 1870 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1871 1872 // Input registers (after setup_arg_regs) 1873 const Register from = c_rarg0; // source array address 1874 const Register to = c_rarg1; // destination array address 1875 const Register count = c_rarg2; // elementscount 1876 const Register ckoff = c_rarg3; // super_check_offset 1877 const Register ckval = c_rarg4; // super_klass 1878 1879 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1880 RegSet wb_post_saved_regs = RegSet::of(count); 1881 1882 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1883 const Register copied_oop = r22; // actual oop copied 1884 const Register count_save = r21; // orig elementscount 1885 const Register start_to = r20; // destination array start address 1886 const Register r19_klass = r19; // oop._klass 1887 1888 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1889 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1890 1891 //--------------------------------------------------------------- 1892 // Assembler stub will be used for this call to arraycopy 1893 // if the two arrays are subtypes of Object[] but the 1894 // destination array type is not equal to or a supertype 1895 // of the source type. Each element must be separately 1896 // checked. 
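    //
    // A rough Java-level sketch of what the stub implements (illustrative
    // only; the helper names below are hypothetical, not HotSpot APIs):
    //
    //   int checkcast_copy(Object[] from, Object[] to, int count,
    //                      int ckoff, Klass* ckval) {
    //     for (int i = 0; i < count; i++) {
    //       Object o = from[i];
    //       if (o != null && !is_subtype_of(klass_of(o), ckoff, ckval))
    //         return ~i;          // -1 ^ K, K == elements already copied
    //       to[i] = o;            // with the usual GC store barriers
    //     }
    //     return 0;
    //   }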
1897 1898 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1899 copied_oop, r19_klass, count_save); 1900 1901 __ align(CodeEntryAlignment); 1902 StubCodeMark mark(this, "StubRoutines", name); 1903 address start = __ pc(); 1904 1905 __ enter(); // required for proper stackwalking of RuntimeStub frame 1906 1907 #ifdef ASSERT 1908 // caller guarantees that the arrays really are different 1909 // otherwise, we would have to make conjoint checks 1910 { Label L; 1911 __ b(L); // conjoint check not yet implemented 1912 __ stop("checkcast_copy within a single array"); 1913 __ bind(L); 1914 } 1915 #endif //ASSERT 1916 1917 // Caller of this entry point must set up the argument registers. 1918 if (entry != nullptr) { 1919 *entry = __ pc(); 1920 BLOCK_COMMENT("Entry:"); 1921 } 1922 1923 // Empty array: Nothing to do. 1924 __ cbz(count, L_done); 1925 __ push(RegSet::of(r19, r20, r21, r22), sp); 1926 1927 #ifdef ASSERT 1928 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1929 // The ckoff and ckval must be mutually consistent, 1930 // even though caller generates both. 1931 { Label L; 1932 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1933 __ ldrw(start_to, Address(ckval, sco_offset)); 1934 __ cmpw(ckoff, start_to); 1935 __ br(Assembler::EQ, L); 1936 __ stop("super_check_offset inconsistent"); 1937 __ bind(L); 1938 } 1939 #endif //ASSERT 1940 1941 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1942 bool is_oop = true; 1943 int element_size = UseCompressedOops ? 4 : 8; 1944 if (dest_uninitialized) { 1945 decorators |= IS_DEST_UNINITIALIZED; 1946 } 1947 1948 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1949 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1950 1951 // save the original count 1952 __ mov(count_save, count); 1953 1954 // Copy from low to high addresses 1955 __ mov(start_to, to); // Save destination array start address 1956 __ b(L_load_element); 1957 1958 // ======== begin loop ======== 1959 // (Loop is rotated; its entry is L_load_element.) 1960 // Loop control: 1961 // for (; count != 0; count--) { 1962 // copied_oop = load_heap_oop(from++); 1963 // ... generate_type_check ...; 1964 // store_heap_oop(to++, copied_oop); 1965 // } 1966 __ align(OptoLoopAlignment); 1967 1968 __ BIND(L_store_element); 1969 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1970 __ post(to, element_size), copied_oop, noreg, 1971 gct1, gct2, gct3); 1972 __ sub(count, count, 1); 1973 __ cbz(count, L_do_card_marks); 1974 1975 // ======== loop entry is here ======== 1976 __ BIND(L_load_element); 1977 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1978 copied_oop, noreg, __ post(from, element_size), 1979 gct1); 1980 __ cbz(copied_oop, L_store_element); 1981 1982 __ load_klass(r19_klass, copied_oop);// query the object klass 1983 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1984 // ======== end loop ======== 1985 1986 // It was a real error; we must depend on the caller to finish the job. 1987 // Register count = remaining oops, count_orig = total oops. 1988 // Emit GC store barriers for the oops we have copied and report 1989 // their number to the caller. 
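    // (Worked example, for illustration: if the type check fails after 3 oops
    //  have been stored, count_save - count == 3 and the eon below yields
    //  -1 ^ 3 == -4 in r0, so the caller can recover K as ~r0.)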
    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);              // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r19, r20, r21, r22), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length()) FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
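  //
  // Dispatch sketch (illustrative, not the emitted code): the source address,
  // destination address and byte count are OR-ed together, so the low bits of
  // the result reflect the coarsest alignment common to all three:
  //
  //   bits = s | d | count;
  //   if ((bits & (BytesPerLong - 1)) == 0)      goto long_copy;   // count >>= 3
  //   else if ((bits & (BytesPerInt - 1)) == 0)  goto int_copy;    // count >>= 2
  //   else if ((bits & 1) == 0)                  goto short_copy;  // count >>= 1
  //   else                                       goto byte_copy;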
2063 // 2064 address generate_unsafe_copy(const char *name, 2065 address byte_copy_entry, 2066 address short_copy_entry, 2067 address int_copy_entry, 2068 address long_copy_entry) { 2069 Label L_long_aligned, L_int_aligned, L_short_aligned; 2070 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2071 2072 __ align(CodeEntryAlignment); 2073 StubCodeMark mark(this, "StubRoutines", name); 2074 address start = __ pc(); 2075 __ enter(); // required for proper stackwalking of RuntimeStub frame 2076 2077 // bump this on entry, not on exit: 2078 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2079 2080 __ orr(rscratch1, s, d); 2081 __ orr(rscratch1, rscratch1, count); 2082 2083 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2084 __ cbz(rscratch1, L_long_aligned); 2085 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2086 __ cbz(rscratch1, L_int_aligned); 2087 __ tbz(rscratch1, 0, L_short_aligned); 2088 __ b(RuntimeAddress(byte_copy_entry)); 2089 2090 __ BIND(L_short_aligned); 2091 __ lsr(count, count, LogBytesPerShort); // size => short_count 2092 __ b(RuntimeAddress(short_copy_entry)); 2093 __ BIND(L_int_aligned); 2094 __ lsr(count, count, LogBytesPerInt); // size => int_count 2095 __ b(RuntimeAddress(int_copy_entry)); 2096 __ BIND(L_long_aligned); 2097 __ lsr(count, count, LogBytesPerLong); // size => long_count 2098 __ b(RuntimeAddress(long_copy_entry)); 2099 2100 return start; 2101 } 2102 2103 // 2104 // Generate generic array copy stubs 2105 // 2106 // Input: 2107 // c_rarg0 - src oop 2108 // c_rarg1 - src_pos (32-bits) 2109 // c_rarg2 - dst oop 2110 // c_rarg3 - dst_pos (32-bits) 2111 // c_rarg4 - element count (32-bits) 2112 // 2113 // Output: 2114 // r0 == 0 - success 2115 // r0 == -1^K - failure, where K is partial transfer count 2116 // 2117 address generate_generic_copy(const char *name, 2118 address byte_copy_entry, address short_copy_entry, 2119 address int_copy_entry, address oop_copy_entry, 2120 address long_copy_entry, address checkcast_copy_entry) { 2121 2122 Label L_failed, L_objArray; 2123 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2124 2125 // Input registers 2126 const Register src = c_rarg0; // source array oop 2127 const Register src_pos = c_rarg1; // source position 2128 const Register dst = c_rarg2; // destination array oop 2129 const Register dst_pos = c_rarg3; // destination position 2130 const Register length = c_rarg4; 2131 2132 2133 // Registers used as temps 2134 const Register dst_klass = c_rarg5; 2135 2136 __ align(CodeEntryAlignment); 2137 2138 StubCodeMark mark(this, "StubRoutines", name); 2139 2140 address start = __ pc(); 2141 2142 __ enter(); // required for proper stackwalking of RuntimeStub frame 2143 2144 // bump this on entry, not on exit: 2145 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2146 2147 //----------------------------------------------------------------------- 2148 // Assembler stub will be used for this call to arraycopy 2149 // if the following conditions are met: 2150 // 2151 // (1) src and dst must not be null. 2152 // (2) src_pos must not be negative. 2153 // (3) dst_pos must not be negative. 2154 // (4) length must not be negative. 2155 // (5) src klass and dst klass should be the same and not null. 2156 // (6) src and dst should be arrays. 2157 // (7) src_pos + length must not exceed length of src. 2158 // (8) dst_pos + length must not exceed length of dst. 
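    //
    // Roughly, at the Java level (illustrative sketch only; the objArray path
    // below relaxes condition (5) by falling back to a checkcast copy):
    //
    //   if (src == null || dst == null ||
    //       src_pos < 0 || dst_pos < 0 || length < 0 ||
    //       src_pos + length > src.length ||
    //       dst_pos + length > dst.length) {
    //     return -1;
    //   }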
2159 // 2160 2161 // if (src == nullptr) return -1; 2162 __ cbz(src, L_failed); 2163 2164 // if (src_pos < 0) return -1; 2165 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2166 2167 // if (dst == nullptr) return -1; 2168 __ cbz(dst, L_failed); 2169 2170 // if (dst_pos < 0) return -1; 2171 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2172 2173 // registers used as temp 2174 const Register scratch_length = r16; // elements count to copy 2175 const Register scratch_src_klass = r17; // array klass 2176 const Register lh = r15; // layout helper 2177 2178 // if (length < 0) return -1; 2179 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2180 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2181 2182 __ load_klass(scratch_src_klass, src); 2183 #ifdef ASSERT 2184 // assert(src->klass() != nullptr); 2185 { 2186 BLOCK_COMMENT("assert klasses not null {"); 2187 Label L1, L2; 2188 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2189 __ bind(L1); 2190 __ stop("broken null klass"); 2191 __ bind(L2); 2192 __ load_klass(rscratch1, dst); 2193 __ cbz(rscratch1, L1); // this would be broken also 2194 BLOCK_COMMENT("} assert klasses not null done"); 2195 } 2196 #endif 2197 2198 // Load layout helper (32-bits) 2199 // 2200 // |array_tag| | header_size | element_type | |log2_element_size| 2201 // 32 30 24 16 8 2 0 2202 // 2203 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2204 // 2205 2206 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2207 2208 // Handle objArrays completely differently... 2209 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2210 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2211 __ movw(rscratch1, objArray_lh); 2212 __ eorw(rscratch2, lh, rscratch1); 2213 __ cbzw(rscratch2, L_objArray); 2214 2215 // if (src->klass() != dst->klass()) return -1; 2216 __ load_klass(rscratch2, dst); 2217 __ eor(rscratch2, rscratch2, scratch_src_klass); 2218 __ cbnz(rscratch2, L_failed); 2219 2220 // Check for flat inline type array -> return -1 2221 __ tst(lh, Klass::_lh_array_tag_flat_value_bit_inplace); 2222 __ br(Assembler::NE, L_failed); 2223 2224 // Check for null-free (non-flat) inline type array -> handle as object array 2225 __ tst(lh, Klass::_lh_null_free_array_bit_inplace); 2226 __ br(Assembler::NE, L_failed); 2227 2228 // if (!src->is_Array()) return -1; 2229 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2230 2231 // At this point, it is known to be a typeArray (array_tag 0x3). 
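    // (Per the layout helper diagram above, both array tags, typeArray (0x3)
    //  and objArray (0x2), live in the top bits and set bit 31, so an lh value
    //  with bit 31 clear (lh >= 0) cannot be an array at all; that is why the
    //  tbz on bit 31 above can reject anything that is not an array.)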
2232 #ifdef ASSERT 2233 { 2234 BLOCK_COMMENT("assert primitive array {"); 2235 Label L; 2236 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2237 __ cmpw(lh, rscratch2); 2238 __ br(Assembler::GE, L); 2239 __ stop("must be a primitive array"); 2240 __ bind(L); 2241 BLOCK_COMMENT("} assert primitive array done"); 2242 } 2243 #endif 2244 2245 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2246 rscratch2, L_failed); 2247 2248 // TypeArrayKlass 2249 // 2250 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2251 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2252 // 2253 2254 const Register rscratch1_offset = rscratch1; // array offset 2255 const Register r15_elsize = lh; // element size 2256 2257 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2258 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2259 __ add(src, src, rscratch1_offset); // src array offset 2260 __ add(dst, dst, rscratch1_offset); // dst array offset 2261 BLOCK_COMMENT("choose copy loop based on element size"); 2262 2263 // next registers should be set before the jump to corresponding stub 2264 const Register from = c_rarg0; // source array address 2265 const Register to = c_rarg1; // destination array address 2266 const Register count = c_rarg2; // elements count 2267 2268 // 'from', 'to', 'count' registers should be set in such order 2269 // since they are the same as 'src', 'src_pos', 'dst'. 2270 2271 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2272 2273 // The possible values of elsize are 0-3, i.e. exact_log2(element 2274 // size in bytes). We do a simple bitwise binary search. 2275 __ BIND(L_copy_bytes); 2276 __ tbnz(r15_elsize, 1, L_copy_ints); 2277 __ tbnz(r15_elsize, 0, L_copy_shorts); 2278 __ lea(from, Address(src, src_pos));// src_addr 2279 __ lea(to, Address(dst, dst_pos));// dst_addr 2280 __ movw(count, scratch_length); // length 2281 __ b(RuntimeAddress(byte_copy_entry)); 2282 2283 __ BIND(L_copy_shorts); 2284 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2285 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2286 __ movw(count, scratch_length); // length 2287 __ b(RuntimeAddress(short_copy_entry)); 2288 2289 __ BIND(L_copy_ints); 2290 __ tbnz(r15_elsize, 0, L_copy_longs); 2291 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2292 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2293 __ movw(count, scratch_length); // length 2294 __ b(RuntimeAddress(int_copy_entry)); 2295 2296 __ BIND(L_copy_longs); 2297 #ifdef ASSERT 2298 { 2299 BLOCK_COMMENT("assert long copy {"); 2300 Label L; 2301 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2302 __ cmpw(r15_elsize, LogBytesPerLong); 2303 __ br(Assembler::EQ, L); 2304 __ stop("must be long copy, but elsize is wrong"); 2305 __ bind(L); 2306 BLOCK_COMMENT("} assert long copy done"); 2307 } 2308 #endif 2309 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2310 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2311 __ movw(count, scratch_length); // length 2312 __ b(RuntimeAddress(long_copy_entry)); 2313 2314 // ObjArrayKlass 2315 __ BIND(L_objArray); 2316 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2317 2318 Label L_plain_copy, L_checkcast_copy; 2319 // test array classes for subtyping 2320 __ load_klass(r15, dst); 2321 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2322 __ br(Assembler::NE, L_checkcast_copy); 2323 2324 // Identically typed arrays can be copied without element-wise checks. 2325 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2326 rscratch2, L_failed); 2327 2328 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2329 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2330 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2331 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2332 __ movw(count, scratch_length); // length 2333 __ BIND(L_plain_copy); 2334 __ b(RuntimeAddress(oop_copy_entry)); 2335 2336 __ BIND(L_checkcast_copy); 2337 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2338 { 2339 // Before looking at dst.length, make sure dst is also an objArray. 2340 __ ldrw(rscratch1, Address(r15, lh_offset)); 2341 __ movw(rscratch2, objArray_lh); 2342 __ eorw(rscratch1, rscratch1, rscratch2); 2343 __ cbnzw(rscratch1, L_failed); 2344 2345 // It is safe to examine both src.length and dst.length. 2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2347 r15, L_failed); 2348 2349 __ load_klass(dst_klass, dst); // reload 2350 2351 // Marshal the base address arguments now, freeing registers. 2352 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2353 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2354 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2355 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2356 __ movw(count, length); // length (reloaded) 2357 Register sco_temp = c_rarg3; // this register is free now 2358 assert_different_registers(from, to, count, sco_temp, 2359 dst_klass, scratch_src_klass); 2360 // assert_clean_int(count, sco_temp); 2361 2362 // Generate the type check. 2363 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2364 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2365 2366 // Smashes rscratch1, rscratch2 2367 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2368 2369 // Fetch destination element klass from the ObjArrayKlass header. 2370 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2371 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2372 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2373 2374 // the checkcast_copy loop needs two extra arguments: 2375 assert(c_rarg3 == sco_temp, "#3 already in place"); 2376 // Set up arguments for checkcast_copy_entry. 2377 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2378 __ b(RuntimeAddress(checkcast_copy_entry)); 2379 } 2380 2381 __ BIND(L_failed); 2382 __ mov(r0, -1); 2383 __ leave(); // required for proper stackwalking of RuntimeStub frame 2384 __ ret(lr); 2385 2386 return start; 2387 } 2388 2389 // 2390 // Generate stub for array fill. If "aligned" is true, the 2391 // "to" address is assumed to be heapword aligned. 
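  //
  // Semantically the stub is just (illustrative sketch, shown for t == T_INT):
  //
  //   void int_fill(jint* to, jint value, int count) {
  //     for (int i = 0; i < count; i++) to[i] = value;
  //   }
  //
  // but the value is first replicated into a 64-bit pattern so large chunks
  // can be written a doubleword at a time (or handed to zero_words() when the
  // value is zero and UseBlockZeroing is set).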
  //
  // Arguments for generated stub:
  //   to:    c_rarg0
  //   value: c_rarg1
  //   count: c_rarg2 treated as signed
  //
  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    const Register to        = c_rarg0;  // destination array address
    const Register value     = c_rarg1;  // value
    const Register count     = c_rarg2;  // elements count

    const Register bz_base   = r10;      // base for block_zero routine
    const Register cnt_words = r11;      // temp register

    __ enter();

    Label L_fill_elements, L_exit1;

    int shift = -1;
    switch (t) {
      case T_BYTE:
        shift = 0;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_SHORT:
        shift = 1;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_INT:
        shift = 2;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ br(Assembler::LO, L_fill_elements);
        break;
      default: ShouldNotReachHere();
    }

    // Align the destination address to an 8-byte boundary.
    Label L_skip_align1, L_skip_align2, L_skip_align4;
    if (!aligned) {
      switch (t) {
        case T_BYTE:
          // One byte misalignment happens only for byte arrays.
          __ tbz(to, 0, L_skip_align1);
          __ strb(value, Address(__ post(to, 1)));
          __ subw(count, count, 1);
          __ bind(L_skip_align1);
          // Fallthrough
        case T_SHORT:
          // Two bytes misalignment happens only for byte and short (char) arrays.
          __ tbz(to, 1, L_skip_align2);
          __ strh(value, Address(__ post(to, 2)));
          __ subw(count, count, 2 >> shift);
          __ bind(L_skip_align2);
          // Fallthrough
        case T_INT:
          // Align to 8 bytes, we know we are 4 byte aligned to start.
          __ tbz(to, 2, L_skip_align4);
          __ strw(value, Address(__ post(to, 4)));
          __ subw(count, count, 4 >> shift);
          __ bind(L_skip_align4);
          break;
        default: ShouldNotReachHere();
      }
    }

    //
    //  Fill large chunks
    //
    __ lsrw(cnt_words, count, 3 - shift); // number of words
    __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
    __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
    if (UseBlockZeroing) {
      Label non_block_zeroing, rest;
      // If the fill value is zero we can use the fast zero_words().
      __ cbnz(value, non_block_zeroing);
      __ mov(bz_base, to);
      __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
      address tpc = __ zero_words(bz_base, cnt_words);
      if (tpc == nullptr) {
        fatal("CodeCache is full at generate_fill");
      }
      __ b(rest);
      __ bind(non_block_zeroing);
      __ fill_words(to, cnt_words, value);
      __ bind(rest);
    } else {
      __ fill_words(to, cnt_words, value);
    }

    // Remaining count is less than 8 bytes. Fill it by a single store.
    // Note that the total length is no less than 8 bytes.
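    // (Illustrative example: for T_BYTE with 3 trailing bytes left, 'to' is
    //  advanced to the end of the region and the single 8-byte store at
    //  to - 8 rewrites the last 8 bytes; the leading 5 of those already hold
    //  the replicated fill value, so overwriting them is harmless.)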
2494 if (t == T_BYTE || t == T_SHORT) { 2495 Label L_exit1; 2496 __ cbzw(count, L_exit1); 2497 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2498 __ str(value, Address(to, -8)); // overwrite some elements 2499 __ bind(L_exit1); 2500 __ leave(); 2501 __ ret(lr); 2502 } 2503 2504 // Handle copies less than 8 bytes. 2505 Label L_fill_2, L_fill_4, L_exit2; 2506 __ bind(L_fill_elements); 2507 switch (t) { 2508 case T_BYTE: 2509 __ tbz(count, 0, L_fill_2); 2510 __ strb(value, Address(__ post(to, 1))); 2511 __ bind(L_fill_2); 2512 __ tbz(count, 1, L_fill_4); 2513 __ strh(value, Address(__ post(to, 2))); 2514 __ bind(L_fill_4); 2515 __ tbz(count, 2, L_exit2); 2516 __ strw(value, Address(to)); 2517 break; 2518 case T_SHORT: 2519 __ tbz(count, 0, L_fill_4); 2520 __ strh(value, Address(__ post(to, 2))); 2521 __ bind(L_fill_4); 2522 __ tbz(count, 1, L_exit2); 2523 __ strw(value, Address(to)); 2524 break; 2525 case T_INT: 2526 __ cbzw(count, L_exit2); 2527 __ strw(value, Address(to)); 2528 break; 2529 default: ShouldNotReachHere(); 2530 } 2531 __ bind(L_exit2); 2532 __ leave(); 2533 __ ret(lr); 2534 return start; 2535 } 2536 2537 address generate_data_cache_writeback() { 2538 const Register line = c_rarg0; // address of line to write back 2539 2540 __ align(CodeEntryAlignment); 2541 2542 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2543 2544 address start = __ pc(); 2545 __ enter(); 2546 __ cache_wb(Address(line, 0)); 2547 __ leave(); 2548 __ ret(lr); 2549 2550 return start; 2551 } 2552 2553 address generate_data_cache_writeback_sync() { 2554 const Register is_pre = c_rarg0; // pre or post sync 2555 2556 __ align(CodeEntryAlignment); 2557 2558 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2559 2560 // pre wbsync is a no-op 2561 // post wbsync translates to an sfence 2562 2563 Label skip; 2564 address start = __ pc(); 2565 __ enter(); 2566 __ cbnz(is_pre, skip); 2567 __ cache_wbsync(false); 2568 __ bind(skip); 2569 __ leave(); 2570 __ ret(lr); 2571 2572 return start; 2573 } 2574 2575 void generate_arraycopy_stubs() { 2576 address entry; 2577 address entry_jbyte_arraycopy; 2578 address entry_jshort_arraycopy; 2579 address entry_jint_arraycopy; 2580 address entry_oop_arraycopy; 2581 address entry_jlong_arraycopy; 2582 address entry_checkcast_arraycopy; 2583 2584 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards); 2585 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards); 2586 2587 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards); 2588 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards); 2589 2590 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards); 2591 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards); 2592 2593 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2594 2595 //*** jbyte 2596 // Always need aligned and unaligned versions 2597 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2598 "jbyte_disjoint_arraycopy"); 2599 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2600 &entry_jbyte_arraycopy, 2601 "jbyte_arraycopy"); 2602 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2603 "arrayof_jbyte_disjoint_arraycopy"); 2604 
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2605 "arrayof_jbyte_arraycopy"); 2606 2607 //*** jshort 2608 // Always need aligned and unaligned versions 2609 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2610 "jshort_disjoint_arraycopy"); 2611 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2612 &entry_jshort_arraycopy, 2613 "jshort_arraycopy"); 2614 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2615 "arrayof_jshort_disjoint_arraycopy"); 2616 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2617 "arrayof_jshort_arraycopy"); 2618 2619 //*** jint 2620 // Aligned versions 2621 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2622 "arrayof_jint_disjoint_arraycopy"); 2623 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2624 "arrayof_jint_arraycopy"); 2625 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2626 // entry_jint_arraycopy always points to the unaligned version 2627 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2628 "jint_disjoint_arraycopy"); 2629 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2630 &entry_jint_arraycopy, 2631 "jint_arraycopy"); 2632 2633 //*** jlong 2634 // It is always aligned 2635 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2636 "arrayof_jlong_disjoint_arraycopy"); 2637 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2638 "arrayof_jlong_arraycopy"); 2639 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2640 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2641 2642 //*** oops 2643 { 2644 // With compressed oops we need unaligned versions; notice that 2645 // we overwrite entry_oop_arraycopy. 
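      // (Note, for illustration: with compressed oops each element is a
      //  4-byte narrowOop, so element addresses are only guaranteed 4-byte
      //  aligned; hence 'aligned' is only true in the uncompressed case below.)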
2646 bool aligned = !UseCompressedOops; 2647 2648 StubRoutines::_arrayof_oop_disjoint_arraycopy 2649 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2650 /*dest_uninitialized*/false); 2651 StubRoutines::_arrayof_oop_arraycopy 2652 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2653 /*dest_uninitialized*/false); 2654 // Aligned versions without pre-barriers 2655 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2656 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2657 /*dest_uninitialized*/true); 2658 StubRoutines::_arrayof_oop_arraycopy_uninit 2659 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2660 /*dest_uninitialized*/true); 2661 } 2662 2663 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2664 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2665 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2666 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2667 2668 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2669 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2670 /*dest_uninitialized*/true); 2671 2672 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2673 entry_jbyte_arraycopy, 2674 entry_jshort_arraycopy, 2675 entry_jint_arraycopy, 2676 entry_jlong_arraycopy); 2677 2678 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2679 entry_jbyte_arraycopy, 2680 entry_jshort_arraycopy, 2681 entry_jint_arraycopy, 2682 entry_oop_arraycopy, 2683 entry_jlong_arraycopy, 2684 entry_checkcast_arraycopy); 2685 2686 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2687 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2688 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2689 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2690 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2691 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2692 } 2693 2694 void generate_math_stubs() { Unimplemented(); } 2695 2696 // Arguments: 2697 // 2698 // Inputs: 2699 // c_rarg0 - source byte array address 2700 // c_rarg1 - destination byte array address 2701 // c_rarg2 - K (key) in little endian int array 2702 // 2703 address generate_aescrypt_encryptBlock() { 2704 __ align(CodeEntryAlignment); 2705 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2706 2707 const Register from = c_rarg0; // source array address 2708 const Register to = c_rarg1; // destination array address 2709 const Register key = c_rarg2; // key array address 2710 const Register keylen = rscratch1; 2711 2712 address start = __ pc(); 2713 __ enter(); 2714 2715 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2716 2717 __ aesenc_loadkeys(key, keylen); 2718 __ aesecb_encrypt(from, to, keylen); 2719 2720 __ mov(r0, 0); 2721 2722 __ leave(); 2723 __ ret(lr); 2724 2725 return start; 2726 } 2727 2728 // Arguments: 2729 // 2730 // Inputs: 2731 // c_rarg0 - source byte array address 2732 // c_rarg1 - destination byte array address 2733 // 
c_rarg2 - K (key) in little endian int array 2734 // 2735 address generate_aescrypt_decryptBlock() { 2736 assert(UseAES, "need AES cryptographic extension support"); 2737 __ align(CodeEntryAlignment); 2738 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2739 Label L_doLast; 2740 2741 const Register from = c_rarg0; // source array address 2742 const Register to = c_rarg1; // destination array address 2743 const Register key = c_rarg2; // key array address 2744 const Register keylen = rscratch1; 2745 2746 address start = __ pc(); 2747 __ enter(); // required for proper stackwalking of RuntimeStub frame 2748 2749 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2750 2751 __ aesecb_decrypt(from, to, key, keylen); 2752 2753 __ mov(r0, 0); 2754 2755 __ leave(); 2756 __ ret(lr); 2757 2758 return start; 2759 } 2760 2761 // Arguments: 2762 // 2763 // Inputs: 2764 // c_rarg0 - source byte array address 2765 // c_rarg1 - destination byte array address 2766 // c_rarg2 - K (key) in little endian int array 2767 // c_rarg3 - r vector byte array address 2768 // c_rarg4 - input length 2769 // 2770 // Output: 2771 // x0 - input length 2772 // 2773 address generate_cipherBlockChaining_encryptAESCrypt() { 2774 assert(UseAES, "need AES cryptographic extension support"); 2775 __ align(CodeEntryAlignment); 2776 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2777 2778 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2779 2780 const Register from = c_rarg0; // source array address 2781 const Register to = c_rarg1; // destination array address 2782 const Register key = c_rarg2; // key array address 2783 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2784 // and left with the results of the last encryption block 2785 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2786 const Register keylen = rscratch1; 2787 2788 address start = __ pc(); 2789 2790 __ enter(); 2791 2792 __ movw(rscratch2, len_reg); 2793 2794 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2795 2796 __ ld1(v0, __ T16B, rvec); 2797 2798 __ cmpw(keylen, 52); 2799 __ br(Assembler::CC, L_loadkeys_44); 2800 __ br(Assembler::EQ, L_loadkeys_52); 2801 2802 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2803 __ rev32(v17, __ T16B, v17); 2804 __ rev32(v18, __ T16B, v18); 2805 __ BIND(L_loadkeys_52); 2806 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2807 __ rev32(v19, __ T16B, v19); 2808 __ rev32(v20, __ T16B, v20); 2809 __ BIND(L_loadkeys_44); 2810 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2811 __ rev32(v21, __ T16B, v21); 2812 __ rev32(v22, __ T16B, v22); 2813 __ rev32(v23, __ T16B, v23); 2814 __ rev32(v24, __ T16B, v24); 2815 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2816 __ rev32(v25, __ T16B, v25); 2817 __ rev32(v26, __ T16B, v26); 2818 __ rev32(v27, __ T16B, v27); 2819 __ rev32(v28, __ T16B, v28); 2820 __ ld1(v29, v30, v31, __ T16B, key); 2821 __ rev32(v29, __ T16B, v29); 2822 __ rev32(v30, __ T16B, v30); 2823 __ rev32(v31, __ T16B, v31); 2824 2825 __ BIND(L_aes_loop); 2826 __ ld1(v1, __ T16B, __ post(from, 16)); 2827 __ eor(v0, __ T16B, v0, v1); 2828 2829 __ br(Assembler::CC, L_rounds_44); 2830 __ br(Assembler::EQ, L_rounds_52); 2831 2832 __ aese(v0, v17); __ aesmc(v0, v0); 2833 __ aese(v0, v18); __ aesmc(v0, v0); 2834 __ BIND(L_rounds_52); 2835 __ 
aese(v0, v19); __ aesmc(v0, v0); 2836 __ aese(v0, v20); __ aesmc(v0, v0); 2837 __ BIND(L_rounds_44); 2838 __ aese(v0, v21); __ aesmc(v0, v0); 2839 __ aese(v0, v22); __ aesmc(v0, v0); 2840 __ aese(v0, v23); __ aesmc(v0, v0); 2841 __ aese(v0, v24); __ aesmc(v0, v0); 2842 __ aese(v0, v25); __ aesmc(v0, v0); 2843 __ aese(v0, v26); __ aesmc(v0, v0); 2844 __ aese(v0, v27); __ aesmc(v0, v0); 2845 __ aese(v0, v28); __ aesmc(v0, v0); 2846 __ aese(v0, v29); __ aesmc(v0, v0); 2847 __ aese(v0, v30); 2848 __ eor(v0, __ T16B, v0, v31); 2849 2850 __ st1(v0, __ T16B, __ post(to, 16)); 2851 2852 __ subw(len_reg, len_reg, 16); 2853 __ cbnzw(len_reg, L_aes_loop); 2854 2855 __ st1(v0, __ T16B, rvec); 2856 2857 __ mov(r0, rscratch2); 2858 2859 __ leave(); 2860 __ ret(lr); 2861 2862 return start; 2863 } 2864 2865 // Arguments: 2866 // 2867 // Inputs: 2868 // c_rarg0 - source byte array address 2869 // c_rarg1 - destination byte array address 2870 // c_rarg2 - K (key) in little endian int array 2871 // c_rarg3 - r vector byte array address 2872 // c_rarg4 - input length 2873 // 2874 // Output: 2875 // r0 - input length 2876 // 2877 address generate_cipherBlockChaining_decryptAESCrypt() { 2878 assert(UseAES, "need AES cryptographic extension support"); 2879 __ align(CodeEntryAlignment); 2880 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2881 2882 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2883 2884 const Register from = c_rarg0; // source array address 2885 const Register to = c_rarg1; // destination array address 2886 const Register key = c_rarg2; // key array address 2887 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2888 // and left with the results of the last encryption block 2889 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2890 const Register keylen = rscratch1; 2891 2892 address start = __ pc(); 2893 2894 __ enter(); 2895 2896 __ movw(rscratch2, len_reg); 2897 2898 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2899 2900 __ ld1(v2, __ T16B, rvec); 2901 2902 __ ld1(v31, __ T16B, __ post(key, 16)); 2903 __ rev32(v31, __ T16B, v31); 2904 2905 __ cmpw(keylen, 52); 2906 __ br(Assembler::CC, L_loadkeys_44); 2907 __ br(Assembler::EQ, L_loadkeys_52); 2908 2909 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2910 __ rev32(v17, __ T16B, v17); 2911 __ rev32(v18, __ T16B, v18); 2912 __ BIND(L_loadkeys_52); 2913 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2914 __ rev32(v19, __ T16B, v19); 2915 __ rev32(v20, __ T16B, v20); 2916 __ BIND(L_loadkeys_44); 2917 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2918 __ rev32(v21, __ T16B, v21); 2919 __ rev32(v22, __ T16B, v22); 2920 __ rev32(v23, __ T16B, v23); 2921 __ rev32(v24, __ T16B, v24); 2922 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2923 __ rev32(v25, __ T16B, v25); 2924 __ rev32(v26, __ T16B, v26); 2925 __ rev32(v27, __ T16B, v27); 2926 __ rev32(v28, __ T16B, v28); 2927 __ ld1(v29, v30, __ T16B, key); 2928 __ rev32(v29, __ T16B, v29); 2929 __ rev32(v30, __ T16B, v30); 2930 2931 __ BIND(L_aes_loop); 2932 __ ld1(v0, __ T16B, __ post(from, 16)); 2933 __ orr(v1, __ T16B, v0, v0); 2934 2935 __ br(Assembler::CC, L_rounds_44); 2936 __ br(Assembler::EQ, L_rounds_52); 2937 2938 __ aesd(v0, v17); __ aesimc(v0, v0); 2939 __ aesd(v0, v18); __ aesimc(v0, v0); 2940 __ BIND(L_rounds_52); 2941 __ aesd(v0, v19); __ aesimc(v0, v0); 2942 __ aesd(v0, v20); __ 
aesimc(v0, v0); 2943 __ BIND(L_rounds_44); 2944 __ aesd(v0, v21); __ aesimc(v0, v0); 2945 __ aesd(v0, v22); __ aesimc(v0, v0); 2946 __ aesd(v0, v23); __ aesimc(v0, v0); 2947 __ aesd(v0, v24); __ aesimc(v0, v0); 2948 __ aesd(v0, v25); __ aesimc(v0, v0); 2949 __ aesd(v0, v26); __ aesimc(v0, v0); 2950 __ aesd(v0, v27); __ aesimc(v0, v0); 2951 __ aesd(v0, v28); __ aesimc(v0, v0); 2952 __ aesd(v0, v29); __ aesimc(v0, v0); 2953 __ aesd(v0, v30); 2954 __ eor(v0, __ T16B, v0, v31); 2955 __ eor(v0, __ T16B, v0, v2); 2956 2957 __ st1(v0, __ T16B, __ post(to, 16)); 2958 __ orr(v2, __ T16B, v1, v1); 2959 2960 __ subw(len_reg, len_reg, 16); 2961 __ cbnzw(len_reg, L_aes_loop); 2962 2963 __ st1(v2, __ T16B, rvec); 2964 2965 __ mov(r0, rscratch2); 2966 2967 __ leave(); 2968 __ ret(lr); 2969 2970 return start; 2971 } 2972 2973 // CTR AES crypt. 2974 // Arguments: 2975 // 2976 // Inputs: 2977 // c_rarg0 - source byte array address 2978 // c_rarg1 - destination byte array address 2979 // c_rarg2 - K (key) in little endian int array 2980 // c_rarg3 - counter vector byte array address 2981 // c_rarg4 - input length 2982 // c_rarg5 - saved encryptedCounter start 2983 // c_rarg6 - saved used length 2984 // 2985 // Output: 2986 // r0 - input length 2987 // 2988 address generate_counterMode_AESCrypt() { 2989 const Register in = c_rarg0; 2990 const Register out = c_rarg1; 2991 const Register key = c_rarg2; 2992 const Register counter = c_rarg3; 2993 const Register saved_len = c_rarg4, len = r10; 2994 const Register saved_encrypted_ctr = c_rarg5; 2995 const Register used_ptr = c_rarg6, used = r12; 2996 2997 const Register offset = r7; 2998 const Register keylen = r11; 2999 3000 const unsigned char block_size = 16; 3001 const int bulk_width = 4; 3002 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3003 // performance with larger data sizes, but it also means that the 3004 // fast path isn't used until you have at least 8 blocks, and up 3005 // to 127 bytes of data will be executed on the slow path. For 3006 // that reason, and also so as not to blow away too much icache, 4 3007 // blocks seems like a sensible compromise. 3008 3009 // Algorithm: 3010 // 3011 // if (len == 0) { 3012 // goto DONE; 3013 // } 3014 // int result = len; 3015 // do { 3016 // if (used >= blockSize) { 3017 // if (len >= bulk_width * blockSize) { 3018 // CTR_large_block(); 3019 // if (len == 0) 3020 // goto DONE; 3021 // } 3022 // for (;;) { 3023 // 16ByteVector v0 = counter; 3024 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3025 // used = 0; 3026 // if (len < blockSize) 3027 // break; /* goto NEXT */ 3028 // 16ByteVector v1 = load16Bytes(in, offset); 3029 // v1 = v1 ^ encryptedCounter; 3030 // store16Bytes(out, offset); 3031 // used = blockSize; 3032 // offset += blockSize; 3033 // len -= blockSize; 3034 // if (len == 0) 3035 // goto DONE; 3036 // } 3037 // } 3038 // NEXT: 3039 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3040 // len--; 3041 // } while (len != 0); 3042 // DONE: 3043 // return result; 3044 // 3045 // CTR_large_block() 3046 // Wide bulk encryption of whole blocks. 
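    //
    // (Counter increment note, illustrative: the code below works on a
    //  byte-reversed copy of the counter block: rev32 byte-swaps each 32-bit
    //  lane into v16, addv adds {0, 0, 0, 1} so only the last lane is bumped,
    //  and a second rev32 restores byte order before the counter is stored
    //  back.)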
3047 3048 __ align(CodeEntryAlignment); 3049 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 3050 const address start = __ pc(); 3051 __ enter(); 3052 3053 Label DONE, CTR_large_block, large_block_return; 3054 __ ldrw(used, Address(used_ptr)); 3055 __ cbzw(saved_len, DONE); 3056 3057 __ mov(len, saved_len); 3058 __ mov(offset, 0); 3059 3060 // Compute #rounds for AES based on the length of the key array 3061 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3062 3063 __ aesenc_loadkeys(key, keylen); 3064 3065 { 3066 Label L_CTR_loop, NEXT; 3067 3068 __ bind(L_CTR_loop); 3069 3070 __ cmp(used, block_size); 3071 __ br(__ LO, NEXT); 3072 3073 // Maybe we have a lot of data 3074 __ subsw(rscratch1, len, bulk_width * block_size); 3075 __ br(__ HS, CTR_large_block); 3076 __ BIND(large_block_return); 3077 __ cbzw(len, DONE); 3078 3079 // Setup the counter 3080 __ movi(v4, __ T4S, 0); 3081 __ movi(v5, __ T4S, 1); 3082 __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 } 3083 3084 __ ld1(v0, __ T16B, counter); // Load the counter into v0 3085 __ rev32(v16, __ T16B, v0); 3086 __ addv(v16, __ T4S, v16, v4); 3087 __ rev32(v16, __ T16B, v16); 3088 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3089 3090 { 3091 // We have fewer than bulk_width blocks of data left. Encrypt 3092 // them one by one until there is less than a full block 3093 // remaining, being careful to save both the encrypted counter 3094 // and the counter. 3095 3096 Label inner_loop; 3097 __ bind(inner_loop); 3098 // Counter to encrypt is in v0 3099 __ aesecb_encrypt(noreg, noreg, keylen); 3100 __ st1(v0, __ T16B, saved_encrypted_ctr); 3101 3102 // Do we have a remaining full block? 3103 3104 __ mov(used, 0); 3105 __ cmp(len, block_size); 3106 __ br(__ LO, NEXT); 3107 3108 // Yes, we have a full block 3109 __ ldrq(v1, Address(in, offset)); 3110 __ eor(v1, __ T16B, v1, v0); 3111 __ strq(v1, Address(out, offset)); 3112 __ mov(used, block_size); 3113 __ add(offset, offset, block_size); 3114 3115 __ subw(len, len, block_size); 3116 __ cbzw(len, DONE); 3117 3118 // Increment the counter, store it back 3119 __ orr(v0, __ T16B, v16, v16); 3120 __ rev32(v16, __ T16B, v16); 3121 __ addv(v16, __ T4S, v16, v4); 3122 __ rev32(v16, __ T16B, v16); 3123 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3124 3125 __ b(inner_loop); 3126 } 3127 3128 __ BIND(NEXT); 3129 3130 // Encrypt a single byte, and loop. 3131 // We expect this to be a rare event. 
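      // (This is the NEXT step of the algorithm sketch above, i.e. roughly:
      //    out[offset] = (byte)(in[offset] ^ encryptedCounter[used]);
      //    offset++; used++; len--; )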
3132 __ ldrb(rscratch1, Address(in, offset)); 3133 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3134 __ eor(rscratch1, rscratch1, rscratch2); 3135 __ strb(rscratch1, Address(out, offset)); 3136 __ add(offset, offset, 1); 3137 __ add(used, used, 1); 3138 __ subw(len, len,1); 3139 __ cbnzw(len, L_CTR_loop); 3140 } 3141 3142 __ bind(DONE); 3143 __ strw(used, Address(used_ptr)); 3144 __ mov(r0, saved_len); 3145 3146 __ leave(); // required for proper stackwalking of RuntimeStub frame 3147 __ ret(lr); 3148 3149 // Bulk encryption 3150 3151 __ BIND (CTR_large_block); 3152 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3153 3154 if (bulk_width == 8) { 3155 __ sub(sp, sp, 4 * 16); 3156 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3157 } 3158 __ sub(sp, sp, 4 * 16); 3159 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3160 RegSet saved_regs = (RegSet::of(in, out, offset) 3161 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3162 __ push(saved_regs, sp); 3163 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3164 __ add(in, in, offset); 3165 __ add(out, out, offset); 3166 3167 // Keys should already be loaded into the correct registers 3168 3169 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3170 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3171 3172 // AES/CTR loop 3173 { 3174 Label L_CTR_loop; 3175 __ BIND(L_CTR_loop); 3176 3177 // Setup the counters 3178 __ movi(v8, __ T4S, 0); 3179 __ movi(v9, __ T4S, 1); 3180 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3181 3182 for (int i = 0; i < bulk_width; i++) { 3183 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3184 __ rev32(v0_ofs, __ T16B, v16); 3185 __ addv(v16, __ T4S, v16, v8); 3186 } 3187 3188 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3189 3190 // Encrypt the counters 3191 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3192 3193 if (bulk_width == 8) { 3194 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3195 } 3196 3197 // XOR the encrypted counters with the inputs 3198 for (int i = 0; i < bulk_width; i++) { 3199 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3200 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3201 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3202 } 3203 3204 // Write the encrypted data 3205 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3206 if (bulk_width == 8) { 3207 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3208 } 3209 3210 __ subw(len, len, 16 * bulk_width); 3211 __ cbnzw(len, L_CTR_loop); 3212 } 3213 3214 // Save the counter back where it goes 3215 __ rev32(v16, __ T16B, v16); 3216 __ st1(v16, __ T16B, counter); 3217 3218 __ pop(saved_regs, sp); 3219 3220 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3221 if (bulk_width == 8) { 3222 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3223 } 3224 3225 __ andr(rscratch1, len, -16 * bulk_width); 3226 __ sub(len, len, rscratch1); 3227 __ add(offset, offset, rscratch1); 3228 __ mov(used, 16); 3229 __ strw(used, Address(used_ptr)); 3230 __ b(large_block_return); 3231 3232 return start; 3233 } 3234 3235 // Vector AES Galois Counter Mode implementation. 
Parameters: 3236 // 3237 // in = c_rarg0 3238 // len = c_rarg1 3239 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3240 // out = c_rarg3 3241 // key = c_rarg4 3242 // state = c_rarg5 - GHASH.state 3243 // subkeyHtbl = c_rarg6 - powers of H 3244 // counter = c_rarg7 - 16 bytes of CTR 3245 // return - number of processed bytes 3246 address generate_galoisCounterMode_AESCrypt() { 3247 address ghash_polynomial = __ pc(); 3248 __ emit_int64(0x87); // The low-order bits of the field 3249 // polynomial (i.e. p = z^7+z^2+z+1) 3250 // repeated in the low and high parts of a 3251 // 128-bit vector 3252 __ emit_int64(0x87); 3253 3254 __ align(CodeEntryAlignment); 3255 StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt"); 3256 address start = __ pc(); 3257 __ enter(); 3258 3259 const Register in = c_rarg0; 3260 const Register len = c_rarg1; 3261 const Register ct = c_rarg2; 3262 const Register out = c_rarg3; 3263 // and updated with the incremented counter in the end 3264 3265 const Register key = c_rarg4; 3266 const Register state = c_rarg5; 3267 3268 const Register subkeyHtbl = c_rarg6; 3269 3270 const Register counter = c_rarg7; 3271 3272 const Register keylen = r10; 3273 // Save state before entering routine 3274 __ sub(sp, sp, 4 * 16); 3275 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3276 __ sub(sp, sp, 4 * 16); 3277 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3278 3279 // __ andr(len, len, -512); 3280 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3281 __ str(len, __ pre(sp, -2 * wordSize)); 3282 3283 Label DONE; 3284 __ cbz(len, DONE); 3285 3286 // Compute #rounds for AES based on the length of the key array 3287 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3288 3289 __ aesenc_loadkeys(key, keylen); 3290 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3291 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3292 3293 // AES/CTR loop 3294 { 3295 Label L_CTR_loop; 3296 __ BIND(L_CTR_loop); 3297 3298 // Setup the counters 3299 __ movi(v8, __ T4S, 0); 3300 __ movi(v9, __ T4S, 1); 3301 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3302 3303 assert(v0->encoding() < v8->encoding(), ""); 3304 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3305 FloatRegister f = as_FloatRegister(i); 3306 __ rev32(f, __ T16B, v16); 3307 __ addv(v16, __ T4S, v16, v8); 3308 } 3309 3310 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3311 3312 // Encrypt the counters 3313 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3314 3315 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3316 3317 // XOR the encrypted counters with the inputs 3318 for (int i = 0; i < 8; i++) { 3319 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3320 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3321 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3322 } 3323 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3324 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3325 3326 __ subw(len, len, 16 * 8); 3327 __ cbnzw(len, L_CTR_loop); 3328 } 3329 3330 __ rev32(v16, __ T16B, v16); 3331 __ st1(v16, __ T16B, counter); 3332 3333 __ ldr(len, Address(sp)); 3334 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3335 3336 // GHASH/CTR loop 3337 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3338 len, /*unrolls*/4); 3339 3340 #ifdef ASSERT 3341 { Label L; 3342 __ 
cmp(len, (unsigned char)0); 3343 __ br(Assembler::EQ, L); 3344 __ stop("stubGenerator: abort"); 3345 __ bind(L); 3346 } 3347 #endif 3348 3349 __ bind(DONE); 3350 // Return the number of bytes processed 3351 __ ldr(r0, __ post(sp, 2 * wordSize)); 3352 3353 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3354 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3355 3356 __ leave(); // required for proper stackwalking of RuntimeStub frame 3357 __ ret(lr); 3358 return start; 3359 } 3360 3361 class Cached64Bytes { 3362 private: 3363 MacroAssembler *_masm; 3364 Register _regs[8]; 3365 3366 public: 3367 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3368 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3369 auto it = rs.begin(); 3370 for (auto &r: _regs) { 3371 r = *it; 3372 ++it; 3373 } 3374 } 3375 3376 void gen_loads(Register base) { 3377 for (int i = 0; i < 8; i += 2) { 3378 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3379 } 3380 } 3381 3382 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3383 void extract_u32(Register dest, int i) { 3384 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3385 } 3386 }; 3387 3388 // Utility routines for md5. 3389 // Clobbers r10 and r11. 3390 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3391 int k, int s, int t) { 3392 Register rscratch3 = r10; 3393 Register rscratch4 = r11; 3394 3395 __ eorw(rscratch3, r3, r4); 3396 __ movw(rscratch2, t); 3397 __ andw(rscratch3, rscratch3, r2); 3398 __ addw(rscratch4, r1, rscratch2); 3399 reg_cache.extract_u32(rscratch1, k); 3400 __ eorw(rscratch3, rscratch3, r4); 3401 __ addw(rscratch4, rscratch4, rscratch1); 3402 __ addw(rscratch3, rscratch3, rscratch4); 3403 __ rorw(rscratch2, rscratch3, 32 - s); 3404 __ addw(r1, rscratch2, r2); 3405 } 3406 3407 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3408 int k, int s, int t) { 3409 Register rscratch3 = r10; 3410 Register rscratch4 = r11; 3411 3412 __ andw(rscratch3, r2, r4); 3413 __ bicw(rscratch4, r3, r4); 3414 reg_cache.extract_u32(rscratch1, k); 3415 __ movw(rscratch2, t); 3416 __ orrw(rscratch3, rscratch3, rscratch4); 3417 __ addw(rscratch4, r1, rscratch2); 3418 __ addw(rscratch4, rscratch4, rscratch1); 3419 __ addw(rscratch3, rscratch3, rscratch4); 3420 __ rorw(rscratch2, rscratch3, 32 - s); 3421 __ addw(r1, rscratch2, r2); 3422 } 3423 3424 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3425 int k, int s, int t) { 3426 Register rscratch3 = r10; 3427 Register rscratch4 = r11; 3428 3429 __ eorw(rscratch3, r3, r4); 3430 __ movw(rscratch2, t); 3431 __ addw(rscratch4, r1, rscratch2); 3432 reg_cache.extract_u32(rscratch1, k); 3433 __ eorw(rscratch3, rscratch3, r2); 3434 __ addw(rscratch4, rscratch4, rscratch1); 3435 __ addw(rscratch3, rscratch3, rscratch4); 3436 __ rorw(rscratch2, rscratch3, 32 - s); 3437 __ addw(r1, rscratch2, r2); 3438 } 3439 3440 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3441 int k, int s, int t) { 3442 Register rscratch3 = r10; 3443 Register rscratch4 = r11; 3444 3445 __ movw(rscratch3, t); 3446 __ ornw(rscratch2, r2, r4); 3447 __ addw(rscratch4, r1, rscratch3); 3448 reg_cache.extract_u32(rscratch1, k); 3449 __ eorw(rscratch3, rscratch2, r3); 3450 __ addw(rscratch4, rscratch4, rscratch1); 3451 __ addw(rscratch3, rscratch3, rscratch4); 3452 __ rorw(rscratch2, rscratch3, 32 - s); 3453 __ 
addw(r1, rscratch2, r2); 3454 } 3455 3456 // Arguments: 3457 // 3458 // Inputs: 3459 // c_rarg0 - byte[] source+offset 3460 // c_rarg1 - int[] SHA.state 3461 // c_rarg2 - int offset 3462 // c_rarg3 - int limit 3463 // 3464 address generate_md5_implCompress(bool multi_block, const char *name) { 3465 __ align(CodeEntryAlignment); 3466 StubCodeMark mark(this, "StubRoutines", name); 3467 address start = __ pc(); 3468 3469 Register buf = c_rarg0; 3470 Register state = c_rarg1; 3471 Register ofs = c_rarg2; 3472 Register limit = c_rarg3; 3473 Register a = r4; 3474 Register b = r5; 3475 Register c = r6; 3476 Register d = r7; 3477 Register rscratch3 = r10; 3478 Register rscratch4 = r11; 3479 3480 Register state_regs[2] = { r12, r13 }; 3481 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3482 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3483 3484 __ push(saved_regs, sp); 3485 3486 __ ldp(state_regs[0], state_regs[1], Address(state)); 3487 __ ubfx(a, state_regs[0], 0, 32); 3488 __ ubfx(b, state_regs[0], 32, 32); 3489 __ ubfx(c, state_regs[1], 0, 32); 3490 __ ubfx(d, state_regs[1], 32, 32); 3491 3492 Label md5_loop; 3493 __ BIND(md5_loop); 3494 3495 reg_cache.gen_loads(buf); 3496 3497 // Round 1 3498 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3499 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3500 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3501 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3502 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3503 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3504 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3505 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3506 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3507 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3508 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3509 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3510 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3511 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3512 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3513 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3514 3515 // Round 2 3516 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3517 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3518 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3519 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3520 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3521 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3522 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3523 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3524 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3525 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3526 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3527 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3528 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3529 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3530 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3531 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3532 3533 // Round 3 3534 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3535 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3536 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3537 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3538 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3539 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3540 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3541 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3542 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 
3543 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3544 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3545 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3546 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3547 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3548 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3549 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3550 3551 // Round 4 3552 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3553 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3554 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3555 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3556 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3557 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3558 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3559 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3560 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3561 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3562 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3563 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3564 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3565 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3566 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3567 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3568 3569 __ addw(a, state_regs[0], a); 3570 __ ubfx(rscratch2, state_regs[0], 32, 32); 3571 __ addw(b, rscratch2, b); 3572 __ addw(c, state_regs[1], c); 3573 __ ubfx(rscratch4, state_regs[1], 32, 32); 3574 __ addw(d, rscratch4, d); 3575 3576 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3577 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3578 3579 if (multi_block) { 3580 __ add(buf, buf, 64); 3581 __ add(ofs, ofs, 64); 3582 __ cmp(ofs, limit); 3583 __ br(Assembler::LE, md5_loop); 3584 __ mov(c_rarg0, ofs); // return ofs 3585 } 3586 3587 // write hash values back in the correct order 3588 __ stp(state_regs[0], state_regs[1], Address(state)); 3589 3590 __ pop(saved_regs, sp); 3591 3592 __ ret(lr); 3593 3594 return start; 3595 } 3596 3597 // Arguments: 3598 // 3599 // Inputs: 3600 // c_rarg0 - byte[] source+offset 3601 // c_rarg1 - int[] SHA.state 3602 // c_rarg2 - int offset 3603 // c_rarg3 - int limit 3604 // 3605 address generate_sha1_implCompress(bool multi_block, const char *name) { 3606 __ align(CodeEntryAlignment); 3607 StubCodeMark mark(this, "StubRoutines", name); 3608 address start = __ pc(); 3609 3610 Register buf = c_rarg0; 3611 Register state = c_rarg1; 3612 Register ofs = c_rarg2; 3613 Register limit = c_rarg3; 3614 3615 Label keys; 3616 Label sha1_loop; 3617 3618 // load the keys into v0..v3 3619 __ adr(rscratch1, keys); 3620 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3621 // load 5 words state into v6, v7 3622 __ ldrq(v6, Address(state, 0)); 3623 __ ldrs(v7, Address(state, 16)); 3624 3625 3626 __ BIND(sha1_loop); 3627 // load 64 bytes of data into v16..v19 3628 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3629 __ rev32(v16, __ T16B, v16); 3630 __ rev32(v17, __ T16B, v17); 3631 __ rev32(v18, __ T16B, v18); 3632 __ rev32(v19, __ T16B, v19); 3633 3634 // do the sha1 3635 __ addv(v4, __ T4S, v16, v0); 3636 __ orr(v20, __ T16B, v6, v6); 3637 3638 FloatRegister d0 = v16; 3639 FloatRegister d1 = v17; 3640 FloatRegister d2 = v18; 3641 FloatRegister d3 = v19; 3642 3643 for (int round = 0; round < 20; round++) { 3644 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3645 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3646 FloatRegister tmp3 = round ? ((round & 1) ? 
v22 : v21) : v7; 3647 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3648 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3649 3650 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3651 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3652 __ sha1h(tmp2, __ T4S, v20); 3653 if (round < 5) 3654 __ sha1c(v20, __ T4S, tmp3, tmp4); 3655 else if (round < 10 || round >= 15) 3656 __ sha1p(v20, __ T4S, tmp3, tmp4); 3657 else 3658 __ sha1m(v20, __ T4S, tmp3, tmp4); 3659 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3660 3661 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3662 } 3663 3664 __ addv(v7, __ T2S, v7, v21); 3665 __ addv(v6, __ T4S, v6, v20); 3666 3667 if (multi_block) { 3668 __ add(ofs, ofs, 64); 3669 __ cmp(ofs, limit); 3670 __ br(Assembler::LE, sha1_loop); 3671 __ mov(c_rarg0, ofs); // return ofs 3672 } 3673 3674 __ strq(v6, Address(state, 0)); 3675 __ strs(v7, Address(state, 16)); 3676 3677 __ ret(lr); 3678 3679 __ bind(keys); 3680 __ emit_int32(0x5a827999); 3681 __ emit_int32(0x6ed9eba1); 3682 __ emit_int32(0x8f1bbcdc); 3683 __ emit_int32(0xca62c1d6); 3684 3685 return start; 3686 } 3687 3688 3689 // Arguments: 3690 // 3691 // Inputs: 3692 // c_rarg0 - byte[] source+offset 3693 // c_rarg1 - int[] SHA.state 3694 // c_rarg2 - int offset 3695 // c_rarg3 - int limit 3696 // 3697 address generate_sha256_implCompress(bool multi_block, const char *name) { 3698 static const uint32_t round_consts[64] = { 3699 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3700 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3701 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3702 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3703 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3704 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3705 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3706 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3707 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3708 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3709 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3710 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3711 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3712 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3713 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3714 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3715 }; 3716 __ align(CodeEntryAlignment); 3717 StubCodeMark mark(this, "StubRoutines", name); 3718 address start = __ pc(); 3719 3720 Register buf = c_rarg0; 3721 Register state = c_rarg1; 3722 Register ofs = c_rarg2; 3723 Register limit = c_rarg3; 3724 3725 Label sha1_loop; 3726 3727 __ stpd(v8, v9, __ pre(sp, -32)); 3728 __ stpd(v10, v11, Address(sp, 16)); 3729 3730 // dga == v0 3731 // dgb == v1 3732 // dg0 == v2 3733 // dg1 == v3 3734 // dg2 == v4 3735 // t0 == v6 3736 // t1 == v7 3737 3738 // load 16 keys to v16..v31 3739 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3740 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3741 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3742 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3743 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3744 3745 // load 8 words (256 bits) state 3746 __ ldpq(v0, v1, state); 3747 3748 __ BIND(sha1_loop); 3749 // load 64 bytes of data into v8..v11 3750 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3751 __ rev32(v8, __ T16B, v8); 3752 __ rev32(v9, __ T16B, v9); 3753 __ rev32(v10, __ T16B, v10); 3754 __ rev32(v11, __ T16B, v11); 3755 3756 __ addv(v6, __ T4S, v8, v16); 3757 __ orr(v2, __ T16B, v0, v0); 3758 __ orr(v3, __ T16B, v1, v1); 3759 3760 FloatRegister d0 = v8; 3761 FloatRegister d1 = v9; 3762 FloatRegister d2 = v10; 3763 FloatRegister d3 = v11; 3764 3765 3766 for (int round = 0; round < 16; round++) { 3767 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3768 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3769 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3770 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3771 3772 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3773 __ orr(v4, __ T16B, v2, v2); 3774 if (round < 15) 3775 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3776 __ sha256h(v2, __ T4S, v3, tmp2); 3777 __ sha256h2(v3, __ T4S, v4, tmp2); 3778 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3779 3780 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3781 } 3782 3783 __ addv(v0, __ T4S, v0, v2); 3784 __ addv(v1, __ T4S, v1, v3); 3785 3786 if (multi_block) { 3787 __ add(ofs, ofs, 64); 3788 __ cmp(ofs, limit); 3789 __ br(Assembler::LE, sha1_loop); 3790 __ mov(c_rarg0, ofs); // return ofs 3791 } 3792 3793 __ ldpd(v10, v11, Address(sp, 16)); 3794 __ ldpd(v8, v9, __ post(sp, 32)); 3795 3796 __ stpq(v0, v1, state); 3797 3798 __ ret(lr); 3799 3800 return start; 3801 } 3802 3803 // Double rounds for sha512. 3804 void sha512_dround(int dr, 3805 FloatRegister vi0, FloatRegister vi1, 3806 FloatRegister vi2, FloatRegister vi3, 3807 FloatRegister vi4, FloatRegister vrc0, 3808 FloatRegister vrc1, FloatRegister vin0, 3809 FloatRegister vin1, FloatRegister vin2, 3810 FloatRegister vin3, FloatRegister vin4) { 3811 if (dr < 36) { 3812 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3813 } 3814 __ addv(v5, __ T2D, vrc0, vin0); 3815 __ ext(v6, __ T16B, vi2, vi3, 8); 3816 __ ext(v5, __ T16B, v5, v5, 8); 3817 __ ext(v7, __ T16B, vi1, vi2, 8); 3818 __ addv(vi3, __ T2D, vi3, v5); 3819 if (dr < 32) { 3820 __ ext(v5, __ T16B, vin3, vin4, 8); 3821 __ sha512su0(vin0, __ T2D, vin1); 3822 } 3823 __ sha512h(vi3, __ T2D, v6, v7); 3824 if (dr < 32) { 3825 __ sha512su1(vin0, __ T2D, vin2, v5); 3826 } 3827 __ addv(vi4, __ T2D, vi1, vi3); 3828 __ sha512h2(vi3, __ T2D, vi1, vi0); 3829 } 3830 3831 // Arguments: 3832 // 3833 // Inputs: 3834 // c_rarg0 - byte[] source+offset 3835 // c_rarg1 - int[] SHA.state 3836 // c_rarg2 - int offset 3837 // c_rarg3 - int limit 3838 // 3839 address generate_sha512_implCompress(bool multi_block, const char *name) { 3840 static const uint64_t round_consts[80] = { 3841 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3842 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3843 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3844 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3845 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3846 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3847 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3848 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3849 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3850 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3851 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3852 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3853 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3854 0x92722C851482353BL, 
0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3855 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3856 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3857 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3858 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3859 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3860 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3861 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3862 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3863 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3864 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3865 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3866 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3867 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3868 }; 3869 3870 __ align(CodeEntryAlignment); 3871 StubCodeMark mark(this, "StubRoutines", name); 3872 address start = __ pc(); 3873 3874 Register buf = c_rarg0; 3875 Register state = c_rarg1; 3876 Register ofs = c_rarg2; 3877 Register limit = c_rarg3; 3878 3879 __ stpd(v8, v9, __ pre(sp, -64)); 3880 __ stpd(v10, v11, Address(sp, 16)); 3881 __ stpd(v12, v13, Address(sp, 32)); 3882 __ stpd(v14, v15, Address(sp, 48)); 3883 3884 Label sha512_loop; 3885 3886 // load state 3887 __ ld1(v8, v9, v10, v11, __ T2D, state); 3888 3889 // load first 4 round constants 3890 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3891 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3892 3893 __ BIND(sha512_loop); 3894 // load 128B of data into v12..v19 3895 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3896 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3897 __ rev64(v12, __ T16B, v12); 3898 __ rev64(v13, __ T16B, v13); 3899 __ rev64(v14, __ T16B, v14); 3900 __ rev64(v15, __ T16B, v15); 3901 __ rev64(v16, __ T16B, v16); 3902 __ rev64(v17, __ T16B, v17); 3903 __ rev64(v18, __ T16B, v18); 3904 __ rev64(v19, __ T16B, v19); 3905 3906 __ mov(rscratch2, rscratch1); 3907 3908 __ mov(v0, __ T16B, v8); 3909 __ mov(v1, __ T16B, v9); 3910 __ mov(v2, __ T16B, v10); 3911 __ mov(v3, __ T16B, v11); 3912 3913 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 3914 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 3915 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 3916 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 3917 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 3918 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 3919 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 3920 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 3921 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 3922 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 3923 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 3924 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 3925 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 3926 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 3927 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 3928 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 3929 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, 
v13, v19, v16, v17); 3930 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 3931 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 3932 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 3933 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 3934 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 3935 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 3936 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 3937 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 3938 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 3939 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 3940 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 3941 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 3942 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 3943 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 3944 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 3945 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 3946 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 3947 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 3948 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 3949 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 3950 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 3951 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 3952 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 3953 3954 __ addv(v8, __ T2D, v8, v0); 3955 __ addv(v9, __ T2D, v9, v1); 3956 __ addv(v10, __ T2D, v10, v2); 3957 __ addv(v11, __ T2D, v11, v3); 3958 3959 if (multi_block) { 3960 __ add(ofs, ofs, 128); 3961 __ cmp(ofs, limit); 3962 __ br(Assembler::LE, sha512_loop); 3963 __ mov(c_rarg0, ofs); // return ofs 3964 } 3965 3966 __ st1(v8, v9, v10, v11, __ T2D, state); 3967 3968 __ ldpd(v14, v15, Address(sp, 48)); 3969 __ ldpd(v12, v13, Address(sp, 32)); 3970 __ ldpd(v10, v11, Address(sp, 16)); 3971 __ ldpd(v8, v9, __ post(sp, 64)); 3972 3973 __ ret(lr); 3974 3975 return start; 3976 } 3977 3978 // Arguments: 3979 // 3980 // Inputs: 3981 // c_rarg0 - byte[] source+offset 3982 // c_rarg1 - byte[] SHA.state 3983 // c_rarg2 - int block_size 3984 // c_rarg3 - int offset 3985 // c_rarg4 - int limit 3986 // 3987 address generate_sha3_implCompress(bool multi_block, const char *name) { 3988 static const uint64_t round_consts[24] = { 3989 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 3990 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 3991 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 3992 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 3993 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 3994 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 3995 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 3996 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 3997 }; 3998 3999 __ align(CodeEntryAlignment); 4000 StubCodeMark mark(this, "StubRoutines", name); 4001 address start = __ pc(); 4002 4003 Register buf = c_rarg0; 4004 Register state = c_rarg1; 4005 Register block_size = c_rarg2; 4006 Register ofs = c_rarg3; 4007 Register 
limit = c_rarg4; 4008 4009 Label sha3_loop, rounds24_loop; 4010 Label sha3_512_or_sha3_384, shake128; 4011 4012 __ stpd(v8, v9, __ pre(sp, -64)); 4013 __ stpd(v10, v11, Address(sp, 16)); 4014 __ stpd(v12, v13, Address(sp, 32)); 4015 __ stpd(v14, v15, Address(sp, 48)); 4016 4017 // load state 4018 __ add(rscratch1, state, 32); 4019 __ ld1(v0, v1, v2, v3, __ T1D, state); 4020 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4021 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4022 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4023 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4024 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4025 __ ld1(v24, __ T1D, rscratch1); 4026 4027 __ BIND(sha3_loop); 4028 4029 // 24 keccak rounds 4030 __ movw(rscratch2, 24); 4031 4032 // load round_constants base 4033 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4034 4035 // load input 4036 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4037 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4038 __ eor(v0, __ T8B, v0, v25); 4039 __ eor(v1, __ T8B, v1, v26); 4040 __ eor(v2, __ T8B, v2, v27); 4041 __ eor(v3, __ T8B, v3, v28); 4042 __ eor(v4, __ T8B, v4, v29); 4043 __ eor(v5, __ T8B, v5, v30); 4044 __ eor(v6, __ T8B, v6, v31); 4045 4046 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4047 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4048 4049 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4050 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4051 __ eor(v7, __ T8B, v7, v25); 4052 __ eor(v8, __ T8B, v8, v26); 4053 __ eor(v9, __ T8B, v9, v27); 4054 __ eor(v10, __ T8B, v10, v28); 4055 __ eor(v11, __ T8B, v11, v29); 4056 __ eor(v12, __ T8B, v12, v30); 4057 __ eor(v13, __ T8B, v13, v31); 4058 4059 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4060 __ eor(v14, __ T8B, v14, v25); 4061 __ eor(v15, __ T8B, v15, v26); 4062 __ eor(v16, __ T8B, v16, v27); 4063 4064 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4065 __ andw(c_rarg5, block_size, 48); 4066 __ cbzw(c_rarg5, rounds24_loop); 4067 4068 __ tbnz(block_size, 5, shake128); 4069 // block_size == 144, bit5 == 0, SHA3-224 4070 __ ldrd(v28, __ post(buf, 8)); 4071 __ eor(v17, __ T8B, v17, v28); 4072 __ b(rounds24_loop); 4073 4074 __ BIND(shake128); 4075 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4076 __ eor(v17, __ T8B, v17, v28); 4077 __ eor(v18, __ T8B, v18, v29); 4078 __ eor(v19, __ T8B, v19, v30); 4079 __ eor(v20, __ T8B, v20, v31); 4080 __ b(rounds24_loop); // block_size == 168, SHAKE128 4081 4082 __ BIND(sha3_512_or_sha3_384); 4083 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4084 __ eor(v7, __ T8B, v7, v25); 4085 __ eor(v8, __ T8B, v8, v26); 4086 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4087 4088 // SHA3-384 4089 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4090 __ eor(v9, __ T8B, v9, v27); 4091 __ eor(v10, __ T8B, v10, v28); 4092 __ eor(v11, __ T8B, v11, v29); 4093 __ eor(v12, __ T8B, v12, v30); 4094 4095 __ BIND(rounds24_loop); 4096 __ subw(rscratch2, rscratch2, 1); 4097 4098 __ eor3(v29, __ T16B, v4, v9, v14); 4099 __ eor3(v26, __ T16B, v1, v6, v11); 4100 __ eor3(v28, __ T16B, v3, v8, v13); 4101 __ eor3(v25, __ T16B, v0, v5, v10); 4102 __ eor3(v27, __ T16B, v2, v7, v12); 4103 __ eor3(v29, __ T16B, v29, v19, v24); 4104 __ eor3(v26, __ T16B, v26, v16, v21); 4105 __ eor3(v28, __ T16B, v28, v18, v23); 4106 __ eor3(v25, __ T16B, v25, v15, v20); 4107 __ eor3(v27, __ T16B, v27, v17, v22); 4108 4109 __ rax1(v30, __ T2D, v29, v26); 
4110 __ rax1(v26, __ T2D, v26, v28); 4111 __ rax1(v28, __ T2D, v28, v25); 4112 __ rax1(v25, __ T2D, v25, v27); 4113 __ rax1(v27, __ T2D, v27, v29); 4114 4115 __ eor(v0, __ T16B, v0, v30); 4116 __ xar(v29, __ T2D, v1, v25, (64 - 1)); 4117 __ xar(v1, __ T2D, v6, v25, (64 - 44)); 4118 __ xar(v6, __ T2D, v9, v28, (64 - 20)); 4119 __ xar(v9, __ T2D, v22, v26, (64 - 61)); 4120 __ xar(v22, __ T2D, v14, v28, (64 - 39)); 4121 __ xar(v14, __ T2D, v20, v30, (64 - 18)); 4122 __ xar(v31, __ T2D, v2, v26, (64 - 62)); 4123 __ xar(v2, __ T2D, v12, v26, (64 - 43)); 4124 __ xar(v12, __ T2D, v13, v27, (64 - 25)); 4125 __ xar(v13, __ T2D, v19, v28, (64 - 8)); 4126 __ xar(v19, __ T2D, v23, v27, (64 - 56)); 4127 __ xar(v23, __ T2D, v15, v30, (64 - 41)); 4128 __ xar(v15, __ T2D, v4, v28, (64 - 27)); 4129 __ xar(v28, __ T2D, v24, v28, (64 - 14)); 4130 __ xar(v24, __ T2D, v21, v25, (64 - 2)); 4131 __ xar(v8, __ T2D, v8, v27, (64 - 55)); 4132 __ xar(v4, __ T2D, v16, v25, (64 - 45)); 4133 __ xar(v16, __ T2D, v5, v30, (64 - 36)); 4134 __ xar(v5, __ T2D, v3, v27, (64 - 28)); 4135 __ xar(v27, __ T2D, v18, v27, (64 - 21)); 4136 __ xar(v3, __ T2D, v17, v26, (64 - 15)); 4137 __ xar(v25, __ T2D, v11, v25, (64 - 10)); 4138 __ xar(v26, __ T2D, v7, v26, (64 - 6)); 4139 __ xar(v30, __ T2D, v10, v30, (64 - 3)); 4140 4141 __ bcax(v20, __ T16B, v31, v22, v8); 4142 __ bcax(v21, __ T16B, v8, v23, v22); 4143 __ bcax(v22, __ T16B, v22, v24, v23); 4144 __ bcax(v23, __ T16B, v23, v31, v24); 4145 __ bcax(v24, __ T16B, v24, v8, v31); 4146 4147 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); 4148 4149 __ bcax(v17, __ T16B, v25, v19, v3); 4150 __ bcax(v18, __ T16B, v3, v15, v19); 4151 __ bcax(v19, __ T16B, v19, v16, v15); 4152 __ bcax(v15, __ T16B, v15, v25, v16); 4153 __ bcax(v16, __ T16B, v16, v3, v25); 4154 4155 __ bcax(v10, __ T16B, v29, v12, v26); 4156 __ bcax(v11, __ T16B, v26, v13, v12); 4157 __ bcax(v12, __ T16B, v12, v14, v13); 4158 __ bcax(v13, __ T16B, v13, v29, v14); 4159 __ bcax(v14, __ T16B, v14, v26, v29); 4160 4161 __ bcax(v7, __ T16B, v30, v9, v4); 4162 __ bcax(v8, __ T16B, v4, v5, v9); 4163 __ bcax(v9, __ T16B, v9, v6, v5); 4164 __ bcax(v5, __ T16B, v5, v30, v6); 4165 __ bcax(v6, __ T16B, v6, v4, v30); 4166 4167 __ bcax(v3, __ T16B, v27, v0, v28); 4168 __ bcax(v4, __ T16B, v28, v1, v0); 4169 __ bcax(v0, __ T16B, v0, v2, v1); 4170 __ bcax(v1, __ T16B, v1, v27, v2); 4171 __ bcax(v2, __ T16B, v2, v28, v27); 4172 4173 __ eor(v0, __ T16B, v0, v31); 4174 4175 __ cbnzw(rscratch2, rounds24_loop); 4176 4177 if (multi_block) { 4178 __ add(ofs, ofs, block_size); 4179 __ cmp(ofs, limit); 4180 __ br(Assembler::LE, sha3_loop); 4181 __ mov(c_rarg0, ofs); // return ofs 4182 } 4183 4184 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4185 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4186 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4187 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4188 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4189 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4190 __ st1(v24, __ T1D, state); 4191 4192 __ ldpd(v14, v15, Address(sp, 48)); 4193 __ ldpd(v12, v13, Address(sp, 32)); 4194 __ ldpd(v10, v11, Address(sp, 16)); 4195 __ ldpd(v8, v9, __ post(sp, 64)); 4196 4197 __ ret(lr); 4198 4199 return start; 4200 } 4201 4202 /** 4203 * Arguments: 4204 * 4205 * Inputs: 4206 * c_rarg0 - int crc 4207 * c_rarg1 - byte* buf 4208 * c_rarg2 - int length 4209 * 4210 * Output: 4211 * r0 - int crc result 4212 */ 4213 address generate_updateBytesCRC32() { 4214 assert(UseCRC32Intrinsics, 
"what are we doing here?"); 4215 4216 __ align(CodeEntryAlignment); 4217 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4218 4219 address start = __ pc(); 4220 4221 const Register crc = c_rarg0; // crc 4222 const Register buf = c_rarg1; // source java byte array address 4223 const Register len = c_rarg2; // length 4224 const Register table0 = c_rarg3; // crc_table address 4225 const Register table1 = c_rarg4; 4226 const Register table2 = c_rarg5; 4227 const Register table3 = c_rarg6; 4228 const Register tmp3 = c_rarg7; 4229 4230 BLOCK_COMMENT("Entry:"); 4231 __ enter(); // required for proper stackwalking of RuntimeStub frame 4232 4233 __ kernel_crc32(crc, buf, len, 4234 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4235 4236 __ leave(); // required for proper stackwalking of RuntimeStub frame 4237 __ ret(lr); 4238 4239 return start; 4240 } 4241 4242 // ChaCha20 block function. This version parallelizes by loading 4243 // individual 32-bit state elements into vectors for four blocks 4244 // (e.g. all four blocks' worth of state[0] in one register, etc.) 4245 // 4246 // state (int[16]) = c_rarg0 4247 // keystream (byte[1024]) = c_rarg1 4248 // return - number of bytes of keystream (always 256) 4249 address generate_chacha20Block_blockpar() { 4250 Label L_twoRounds, L_cc20_const; 4251 // The constant data is broken into two 128-bit segments to be loaded 4252 // onto FloatRegisters. The first 128 bits are a counter add overlay 4253 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4254 // The second 128-bits is a table constant used for 8-bit left rotations. 4255 __ BIND(L_cc20_const); 4256 __ emit_int64(0x0000000100000000UL); 4257 __ emit_int64(0x0000000300000002UL); 4258 __ emit_int64(0x0605040702010003UL); 4259 __ emit_int64(0x0E0D0C0F0A09080BUL); 4260 4261 __ align(CodeEntryAlignment); 4262 StubCodeMark mark(this, "StubRoutines", "chacha20Block"); 4263 address start = __ pc(); 4264 __ enter(); 4265 4266 int i, j; 4267 const Register state = c_rarg0; 4268 const Register keystream = c_rarg1; 4269 const Register loopCtr = r10; 4270 const Register tmpAddr = r11; 4271 4272 const FloatRegister stateFirst = v0; 4273 const FloatRegister stateSecond = v1; 4274 const FloatRegister stateThird = v2; 4275 const FloatRegister stateFourth = v3; 4276 const FloatRegister origCtrState = v28; 4277 const FloatRegister scratch = v29; 4278 const FloatRegister lrot8Tbl = v30; 4279 4280 // Organize SIMD registers in an array that facilitates 4281 // putting repetitive opcodes into loop structures. It is 4282 // important that each grouping of 4 registers is monotonically 4283 // increasing to support the requirements of multi-register 4284 // instructions (e.g. ld4r, st4, etc.) 4285 const FloatRegister workSt[16] = { 4286 v4, v5, v6, v7, v16, v17, v18, v19, 4287 v20, v21, v22, v23, v24, v25, v26, v27 4288 }; 4289 4290 // Load from memory and interlace across 16 SIMD registers, 4291 // With each word from memory being broadcast to all lanes of 4292 // each successive SIMD register. 4293 // Addr(0) -> All lanes in workSt[i] 4294 // Addr(4) -> All lanes workSt[i + 1], etc. 4295 __ mov(tmpAddr, state); 4296 for (i = 0; i < 16; i += 4) { 4297 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4298 __ post(tmpAddr, 16)); 4299 } 4300 4301 // Pull in constant data. The first 16 bytes are the add overlay 4302 // which is applied to the vector holding the counter (state[12]). 
4303 // The second 16 bytes is the index register for the 8-bit left 4304 // rotation tbl instruction. 4305 __ adr(tmpAddr, L_cc20_const); 4306 __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr)); 4307 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); 4308 4309 // Set up the 10 iteration loop and perform all 8 quarter round ops 4310 __ mov(loopCtr, 10); 4311 __ BIND(L_twoRounds); 4312 4313 __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12], 4314 scratch, lrot8Tbl); 4315 __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13], 4316 scratch, lrot8Tbl); 4317 __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14], 4318 scratch, lrot8Tbl); 4319 __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15], 4320 scratch, lrot8Tbl); 4321 4322 __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15], 4323 scratch, lrot8Tbl); 4324 __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12], 4325 scratch, lrot8Tbl); 4326 __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13], 4327 scratch, lrot8Tbl); 4328 __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14], 4329 scratch, lrot8Tbl); 4330 4331 // Decrement and iterate 4332 __ sub(loopCtr, loopCtr, 1); 4333 __ cbnz(loopCtr, L_twoRounds); 4334 4335 __ mov(tmpAddr, state); 4336 4337 // Add the starting state back to the post-loop keystream 4338 // state. We read/interlace the state array from memory into 4339 // 4 registers similar to what we did in the beginning. Then 4340 // add the counter overlay onto workSt[12] at the end. 4341 for (i = 0; i < 16; i += 4) { 4342 __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S, 4343 __ post(tmpAddr, 16)); 4344 __ addv(workSt[i], __ T4S, workSt[i], stateFirst); 4345 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond); 4346 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird); 4347 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth); 4348 } 4349 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask 4350 4351 // Write to key stream, storing the same element out of workSt[0..15] 4352 // to consecutive 4-byte offsets in the key stream buffer, then repeating 4353 // for the next element position. 
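    // Illustrative scalar model of the st4 store loop below (a hedged sketch,
    // not code that is executed; lane(v, n) is a hypothetical helper that
    // reads 32-bit lane n of SIMD register v):
    //
    //   for (int blk = 0; blk < 4; blk++)      // SIMD lane == keystream block
    //     for (int w = 0; w < 16; w++)         // ChaCha state word index
    //       ((uint32_t *)keystream)[blk * 16 + w] = lane(workSt[w], blk);
    //
    // so lane 0 of workSt[0..15] becomes keystream bytes 0..63, lane 1 becomes
    // bytes 64..127, and so on, yielding the four consecutive 64-byte blocks.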
4354 for (i = 0; i < 4; i++) { 4355 for (j = 0; j < 16; j += 4) { 4356 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4357 __ post(keystream, 16)); 4358 } 4359 } 4360 4361 __ mov(r0, 256); // Return length of output keystream 4362 __ leave(); 4363 __ ret(lr); 4364 4365 return start; 4366 } 4367 4368 /** 4369 * Arguments: 4370 * 4371 * Inputs: 4372 * c_rarg0 - int crc 4373 * c_rarg1 - byte* buf 4374 * c_rarg2 - int length 4375 * c_rarg3 - int* table 4376 * 4377 * Output: 4378 * r0 - int crc result 4379 */ 4380 address generate_updateBytesCRC32C() { 4381 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4382 4383 __ align(CodeEntryAlignment); 4384 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4385 4386 address start = __ pc(); 4387 4388 const Register crc = c_rarg0; // crc 4389 const Register buf = c_rarg1; // source java byte array address 4390 const Register len = c_rarg2; // length 4391 const Register table0 = c_rarg3; // crc_table address 4392 const Register table1 = c_rarg4; 4393 const Register table2 = c_rarg5; 4394 const Register table3 = c_rarg6; 4395 const Register tmp3 = c_rarg7; 4396 4397 BLOCK_COMMENT("Entry:"); 4398 __ enter(); // required for proper stackwalking of RuntimeStub frame 4399 4400 __ kernel_crc32c(crc, buf, len, 4401 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4402 4403 __ leave(); // required for proper stackwalking of RuntimeStub frame 4404 __ ret(lr); 4405 4406 return start; 4407 } 4408 4409 /*** 4410 * Arguments: 4411 * 4412 * Inputs: 4413 * c_rarg0 - int adler 4414 * c_rarg1 - byte* buff 4415 * c_rarg2 - int len 4416 * 4417 * Output: 4418 * c_rarg0 - int adler result 4419 */ 4420 address generate_updateBytesAdler32() { 4421 __ align(CodeEntryAlignment); 4422 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4423 address start = __ pc(); 4424 4425 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4426 4427 // Aliases 4428 Register adler = c_rarg0; 4429 Register s1 = c_rarg0; 4430 Register s2 = c_rarg3; 4431 Register buff = c_rarg1; 4432 Register len = c_rarg2; 4433 Register nmax = r4; 4434 Register base = r5; 4435 Register count = r6; 4436 Register temp0 = rscratch1; 4437 Register temp1 = rscratch2; 4438 FloatRegister vbytes = v0; 4439 FloatRegister vs1acc = v1; 4440 FloatRegister vs2acc = v2; 4441 FloatRegister vtable = v3; 4442 4443 // Max number of bytes we can process before having to take the mod 4444 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4445 uint64_t BASE = 0xfff1; 4446 uint64_t NMAX = 0x15B0; 4447 4448 __ mov(base, BASE); 4449 __ mov(nmax, NMAX); 4450 4451 // Load accumulation coefficients for the upper 16 bits 4452 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4453 __ ld1(vtable, __ T16B, Address(temp0)); 4454 4455 // s1 is initialized to the lower 16 bits of adler 4456 // s2 is initialized to the upper 16 bits of adler 4457 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4458 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4459 4460 // The pipelined loop needs at least 16 elements for 1 iteration 4461 // It does check this, but it is more effective to skip to the cleanup loop 4462 __ cmp(len, (u1)16); 4463 __ br(Assembler::HS, L_nmax); 4464 __ cbz(len, L_combine); 4465 4466 __ bind(L_simple_by1_loop); 4467 __ ldrb(temp0, Address(__ post(buff, 1))); 4468 __ add(s1, s1, temp0); 4469 __ add(s2, s2, s1); 4470 __ subs(len, len, 
1); 4471 __ br(Assembler::HI, L_simple_by1_loop); 4472 4473 // s1 = s1 % BASE 4474 __ subs(temp0, s1, base); 4475 __ csel(s1, temp0, s1, Assembler::HS); 4476 4477 // s2 = s2 % BASE 4478 __ lsr(temp0, s2, 16); 4479 __ lsl(temp1, temp0, 4); 4480 __ sub(temp1, temp1, temp0); 4481 __ add(s2, temp1, s2, ext::uxth); 4482 4483 __ subs(temp0, s2, base); 4484 __ csel(s2, temp0, s2, Assembler::HS); 4485 4486 __ b(L_combine); 4487 4488 __ bind(L_nmax); 4489 __ subs(len, len, nmax); 4490 __ sub(count, nmax, 16); 4491 __ br(Assembler::LO, L_by16); 4492 4493 __ bind(L_nmax_loop); 4494 4495 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4496 vbytes, vs1acc, vs2acc, vtable); 4497 4498 __ subs(count, count, 16); 4499 __ br(Assembler::HS, L_nmax_loop); 4500 4501 // s1 = s1 % BASE 4502 __ lsr(temp0, s1, 16); 4503 __ lsl(temp1, temp0, 4); 4504 __ sub(temp1, temp1, temp0); 4505 __ add(temp1, temp1, s1, ext::uxth); 4506 4507 __ lsr(temp0, temp1, 16); 4508 __ lsl(s1, temp0, 4); 4509 __ sub(s1, s1, temp0); 4510 __ add(s1, s1, temp1, ext:: uxth); 4511 4512 __ subs(temp0, s1, base); 4513 __ csel(s1, temp0, s1, Assembler::HS); 4514 4515 // s2 = s2 % BASE 4516 __ lsr(temp0, s2, 16); 4517 __ lsl(temp1, temp0, 4); 4518 __ sub(temp1, temp1, temp0); 4519 __ add(temp1, temp1, s2, ext::uxth); 4520 4521 __ lsr(temp0, temp1, 16); 4522 __ lsl(s2, temp0, 4); 4523 __ sub(s2, s2, temp0); 4524 __ add(s2, s2, temp1, ext:: uxth); 4525 4526 __ subs(temp0, s2, base); 4527 __ csel(s2, temp0, s2, Assembler::HS); 4528 4529 __ subs(len, len, nmax); 4530 __ sub(count, nmax, 16); 4531 __ br(Assembler::HS, L_nmax_loop); 4532 4533 __ bind(L_by16); 4534 __ adds(len, len, count); 4535 __ br(Assembler::LO, L_by1); 4536 4537 __ bind(L_by16_loop); 4538 4539 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4540 vbytes, vs1acc, vs2acc, vtable); 4541 4542 __ subs(len, len, 16); 4543 __ br(Assembler::HS, L_by16_loop); 4544 4545 __ bind(L_by1); 4546 __ adds(len, len, 15); 4547 __ br(Assembler::LO, L_do_mod); 4548 4549 __ bind(L_by1_loop); 4550 __ ldrb(temp0, Address(__ post(buff, 1))); 4551 __ add(s1, temp0, s1); 4552 __ add(s2, s2, s1); 4553 __ subs(len, len, 1); 4554 __ br(Assembler::HS, L_by1_loop); 4555 4556 __ bind(L_do_mod); 4557 // s1 = s1 % BASE 4558 __ lsr(temp0, s1, 16); 4559 __ lsl(temp1, temp0, 4); 4560 __ sub(temp1, temp1, temp0); 4561 __ add(temp1, temp1, s1, ext::uxth); 4562 4563 __ lsr(temp0, temp1, 16); 4564 __ lsl(s1, temp0, 4); 4565 __ sub(s1, s1, temp0); 4566 __ add(s1, s1, temp1, ext:: uxth); 4567 4568 __ subs(temp0, s1, base); 4569 __ csel(s1, temp0, s1, Assembler::HS); 4570 4571 // s2 = s2 % BASE 4572 __ lsr(temp0, s2, 16); 4573 __ lsl(temp1, temp0, 4); 4574 __ sub(temp1, temp1, temp0); 4575 __ add(temp1, temp1, s2, ext::uxth); 4576 4577 __ lsr(temp0, temp1, 16); 4578 __ lsl(s2, temp0, 4); 4579 __ sub(s2, s2, temp0); 4580 __ add(s2, s2, temp1, ext:: uxth); 4581 4582 __ subs(temp0, s2, base); 4583 __ csel(s2, temp0, s2, Assembler::HS); 4584 4585 // Combine lower bits and higher bits 4586 __ bind(L_combine); 4587 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 4588 4589 __ ret(lr); 4590 4591 return start; 4592 } 4593 4594 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 4595 Register temp0, Register temp1, FloatRegister vbytes, 4596 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 4597 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 
4598 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 4599 // In non-vectorized code, we update s1 and s2 as: 4600 // s1 <- s1 + b1 4601 // s2 <- s2 + s1 4602 // s1 <- s1 + b2 4603 // s2 <- s2 + b1 4604 // ... 4605 // s1 <- s1 + b16 4606 // s2 <- s2 + s1 4607 // Putting above assignments together, we have: 4608 // s1_new = s1 + b1 + b2 + ... + b16 4609 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 4610 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 4611 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 4612 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 4613 4614 // s2 = s2 + s1 * 16 4615 __ add(s2, s2, s1, Assembler::LSL, 4); 4616 4617 // vs1acc = b1 + b2 + b3 + ... + b16 4618 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1) 4619 __ umullv(vs2acc, __ T8B, vtable, vbytes); 4620 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 4621 __ uaddlv(vs1acc, __ T16B, vbytes); 4622 __ uaddlv(vs2acc, __ T8H, vs2acc); 4623 4624 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 4625 __ fmovd(temp0, vs1acc); 4626 __ fmovd(temp1, vs2acc); 4627 __ add(s1, s1, temp0); 4628 __ add(s2, s2, temp1); 4629 } 4630 4631 /** 4632 * Arguments: 4633 * 4634 * Input: 4635 * c_rarg0 - x address 4636 * c_rarg1 - x length 4637 * c_rarg2 - y address 4638 * c_rarg3 - y length 4639 * c_rarg4 - z address 4640 * c_rarg5 - z length 4641 */ 4642 address generate_multiplyToLen() { 4643 __ align(CodeEntryAlignment); 4644 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 4645 4646 address start = __ pc(); 4647 const Register x = r0; 4648 const Register xlen = r1; 4649 const Register y = r2; 4650 const Register ylen = r3; 4651 const Register z = r4; 4652 const Register zlen = r5; 4653 4654 const Register tmp1 = r10; 4655 const Register tmp2 = r11; 4656 const Register tmp3 = r12; 4657 const Register tmp4 = r13; 4658 const Register tmp5 = r14; 4659 const Register tmp6 = r15; 4660 const Register tmp7 = r16; 4661 4662 BLOCK_COMMENT("Entry:"); 4663 __ enter(); // required for proper stackwalking of RuntimeStub frame 4664 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4665 __ leave(); // required for proper stackwalking of RuntimeStub frame 4666 __ ret(lr); 4667 4668 return start; 4669 } 4670 4671 address generate_squareToLen() { 4672 // squareToLen algorithm for sizes 1..127 described in java code works 4673 // faster than multiply_to_len on some CPUs and slower on others, but 4674 // multiply_to_len shows a bit better overall results 4675 __ align(CodeEntryAlignment); 4676 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 4677 address start = __ pc(); 4678 4679 const Register x = r0; 4680 const Register xlen = r1; 4681 const Register z = r2; 4682 const Register zlen = r3; 4683 const Register y = r4; // == x 4684 const Register ylen = r5; // == xlen 4685 4686 const Register tmp1 = r10; 4687 const Register tmp2 = r11; 4688 const Register tmp3 = r12; 4689 const Register tmp4 = r13; 4690 const Register tmp5 = r14; 4691 const Register tmp6 = r15; 4692 const Register tmp7 = r16; 4693 4694 RegSet spilled_regs = RegSet::of(y, ylen); 4695 BLOCK_COMMENT("Entry:"); 4696 __ enter(); 4697 __ push(spilled_regs, sp); 4698 __ mov(y, x); 4699 __ mov(ylen, xlen); 4700 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4701 __ pop(spilled_regs, sp); 4702 __ leave(); 4703 __ ret(lr); 4704 return start; 4705 } 4706 4707 address generate_mulAdd() { 4708 __ align(CodeEntryAlignment); 4709 
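    // Hedged sketch of the operation MacroAssembler::mul_add emits (a
    // conceptual model, not the stub itself): a word-by-word
    // multiply-accumulate of the int array 'in' by the 32-bit scalar 'k'
    // into 'out' at the position selected by 'offset', with the final carry
    // returned in r0, in the spirit of java.math.BigInteger.mulAdd. The
    // exact index direction (BigInteger keeps its magnitude with the
    // most-significant word first) is omitted here.
    //
    //   uint64_t carry = 0;
    //   for (int i = 0; i < len; i++) {                 // one 32-bit limb at a time
    //     uint64_t p = (uint64_t)in[i] * k + out[i] + carry;
    //     out[i] = (uint32_t)p;                         // low word written back
    //     carry  = p >> 32;                             // high word propagates
    //   }
    //   return (uint32_t)carry;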
StubCodeMark mark(this, "StubRoutines", "mulAdd"); 4710 4711 address start = __ pc(); 4712 4713 const Register out = r0; 4714 const Register in = r1; 4715 const Register offset = r2; 4716 const Register len = r3; 4717 const Register k = r4; 4718 4719 BLOCK_COMMENT("Entry:"); 4720 __ enter(); 4721 __ mul_add(out, in, offset, len, k); 4722 __ leave(); 4723 __ ret(lr); 4724 4725 return start; 4726 } 4727 4728 // Arguments: 4729 // 4730 // Input: 4731 // c_rarg0 - newArr address 4732 // c_rarg1 - oldArr address 4733 // c_rarg2 - newIdx 4734 // c_rarg3 - shiftCount 4735 // c_rarg4 - numIter 4736 // 4737 address generate_bigIntegerRightShift() { 4738 __ align(CodeEntryAlignment); 4739 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 4740 address start = __ pc(); 4741 4742 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4743 4744 Register newArr = c_rarg0; 4745 Register oldArr = c_rarg1; 4746 Register newIdx = c_rarg2; 4747 Register shiftCount = c_rarg3; 4748 Register numIter = c_rarg4; 4749 Register idx = numIter; 4750 4751 Register newArrCur = rscratch1; 4752 Register shiftRevCount = rscratch2; 4753 Register oldArrCur = r13; 4754 Register oldArrNext = r14; 4755 4756 FloatRegister oldElem0 = v0; 4757 FloatRegister oldElem1 = v1; 4758 FloatRegister newElem = v2; 4759 FloatRegister shiftVCount = v3; 4760 FloatRegister shiftVRevCount = v4; 4761 4762 __ cbz(idx, Exit); 4763 4764 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4765 4766 // left shift count 4767 __ movw(shiftRevCount, 32); 4768 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4769 4770 // numIter too small to allow a 4-words SIMD loop, rolling back 4771 __ cmp(numIter, (u1)4); 4772 __ br(Assembler::LT, ShiftThree); 4773 4774 __ dup(shiftVCount, __ T4S, shiftCount); 4775 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4776 __ negr(shiftVCount, __ T4S, shiftVCount); 4777 4778 __ BIND(ShiftSIMDLoop); 4779 4780 // Calculate the load addresses 4781 __ sub(idx, idx, 4); 4782 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4783 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4784 __ add(oldArrCur, oldArrNext, 4); 4785 4786 // Load 4 words and process 4787 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4788 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4789 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4790 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4791 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4792 __ st1(newElem, __ T4S, Address(newArrCur)); 4793 4794 __ cmp(idx, (u1)4); 4795 __ br(Assembler::LT, ShiftTwoLoop); 4796 __ b(ShiftSIMDLoop); 4797 4798 __ BIND(ShiftTwoLoop); 4799 __ cbz(idx, Exit); 4800 __ cmp(idx, (u1)1); 4801 __ br(Assembler::EQ, ShiftOne); 4802 4803 // Calculate the load addresses 4804 __ sub(idx, idx, 2); 4805 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4806 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4807 __ add(oldArrCur, oldArrNext, 4); 4808 4809 // Load 2 words and process 4810 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 4811 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4812 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4813 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4814 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4815 __ st1(newElem, __ T2S, Address(newArrCur)); 4816 __ b(ShiftTwoLoop); 4817 4818 __ BIND(ShiftThree); 4819 __ tbz(idx, 1, ShiftOne); 4820 __ tbz(idx, 0, ShiftTwo); 4821 __ ldrw(r10, Address(oldArr, 12)); 4822 __ ldrw(r11, Address(oldArr, 8)); 4823 __ lsrvw(r10, r10, shiftCount); 4824 __ lslvw(r11, 
r11, shiftRevCount); 4825 __ orrw(r12, r10, r11); 4826 __ strw(r12, Address(newArr, 8)); 4827 4828 __ BIND(ShiftTwo); 4829 __ ldrw(r10, Address(oldArr, 8)); 4830 __ ldrw(r11, Address(oldArr, 4)); 4831 __ lsrvw(r10, r10, shiftCount); 4832 __ lslvw(r11, r11, shiftRevCount); 4833 __ orrw(r12, r10, r11); 4834 __ strw(r12, Address(newArr, 4)); 4835 4836 __ BIND(ShiftOne); 4837 __ ldrw(r10, Address(oldArr, 4)); 4838 __ ldrw(r11, Address(oldArr)); 4839 __ lsrvw(r10, r10, shiftCount); 4840 __ lslvw(r11, r11, shiftRevCount); 4841 __ orrw(r12, r10, r11); 4842 __ strw(r12, Address(newArr)); 4843 4844 __ BIND(Exit); 4845 __ ret(lr); 4846 4847 return start; 4848 } 4849 4850 // Arguments: 4851 // 4852 // Input: 4853 // c_rarg0 - newArr address 4854 // c_rarg1 - oldArr address 4855 // c_rarg2 - newIdx 4856 // c_rarg3 - shiftCount 4857 // c_rarg4 - numIter 4858 // 4859 address generate_bigIntegerLeftShift() { 4860 __ align(CodeEntryAlignment); 4861 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 4862 address start = __ pc(); 4863 4864 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4865 4866 Register newArr = c_rarg0; 4867 Register oldArr = c_rarg1; 4868 Register newIdx = c_rarg2; 4869 Register shiftCount = c_rarg3; 4870 Register numIter = c_rarg4; 4871 4872 Register shiftRevCount = rscratch1; 4873 Register oldArrNext = rscratch2; 4874 4875 FloatRegister oldElem0 = v0; 4876 FloatRegister oldElem1 = v1; 4877 FloatRegister newElem = v2; 4878 FloatRegister shiftVCount = v3; 4879 FloatRegister shiftVRevCount = v4; 4880 4881 __ cbz(numIter, Exit); 4882 4883 __ add(oldArrNext, oldArr, 4); 4884 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4885 4886 // right shift count 4887 __ movw(shiftRevCount, 32); 4888 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4889 4890 // numIter too small to allow a 4-words SIMD loop, rolling back 4891 __ cmp(numIter, (u1)4); 4892 __ br(Assembler::LT, ShiftThree); 4893 4894 __ dup(shiftVCount, __ T4S, shiftCount); 4895 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4896 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 4897 4898 __ BIND(ShiftSIMDLoop); 4899 4900 // load 4 words and process 4901 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 4902 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 4903 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4904 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4905 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4906 __ st1(newElem, __ T4S, __ post(newArr, 16)); 4907 __ sub(numIter, numIter, 4); 4908 4909 __ cmp(numIter, (u1)4); 4910 __ br(Assembler::LT, ShiftTwoLoop); 4911 __ b(ShiftSIMDLoop); 4912 4913 __ BIND(ShiftTwoLoop); 4914 __ cbz(numIter, Exit); 4915 __ cmp(numIter, (u1)1); 4916 __ br(Assembler::EQ, ShiftOne); 4917 4918 // load 2 words and process 4919 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 4920 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 4921 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4922 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4923 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4924 __ st1(newElem, __ T2S, __ post(newArr, 8)); 4925 __ sub(numIter, numIter, 2); 4926 __ b(ShiftTwoLoop); 4927 4928 __ BIND(ShiftThree); 4929 __ ldrw(r10, __ post(oldArr, 4)); 4930 __ ldrw(r11, __ post(oldArrNext, 4)); 4931 __ lslvw(r10, r10, shiftCount); 4932 __ lsrvw(r11, r11, shiftRevCount); 4933 __ orrw(r12, r10, r11); 4934 __ strw(r12, __ post(newArr, 4)); 4935 __ tbz(numIter, 1, Exit); 4936 __ tbz(numIter, 0, ShiftOne); 4937 4938 __ BIND(ShiftTwo); 4939 __ 
ldrw(r10, __ post(oldArr, 4)); 4940 __ ldrw(r11, __ post(oldArrNext, 4)); 4941 __ lslvw(r10, r10, shiftCount); 4942 __ lsrvw(r11, r11, shiftRevCount); 4943 __ orrw(r12, r10, r11); 4944 __ strw(r12, __ post(newArr, 4)); 4945 4946 __ BIND(ShiftOne); 4947 __ ldrw(r10, Address(oldArr)); 4948 __ ldrw(r11, Address(oldArrNext)); 4949 __ lslvw(r10, r10, shiftCount); 4950 __ lsrvw(r11, r11, shiftRevCount); 4951 __ orrw(r12, r10, r11); 4952 __ strw(r12, Address(newArr)); 4953 4954 __ BIND(Exit); 4955 __ ret(lr); 4956 4957 return start; 4958 } 4959 4960 address generate_count_positives(address &count_positives_long) { 4961 const u1 large_loop_size = 64; 4962 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 4963 int dcache_line = VM_Version::dcache_line_size(); 4964 4965 Register ary1 = r1, len = r2, result = r0; 4966 4967 __ align(CodeEntryAlignment); 4968 4969 StubCodeMark mark(this, "StubRoutines", "count_positives"); 4970 4971 address entry = __ pc(); 4972 4973 __ enter(); 4974 // precondition: a copy of len is already in result 4975 // __ mov(result, len); 4976 4977 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 4978 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 4979 4980 __ cmp(len, (u1)15); 4981 __ br(Assembler::GT, LEN_OVER_15); 4982 // The only case when execution falls into this code is when pointer is near 4983 // the end of memory page and we have to avoid reading next page 4984 __ add(ary1, ary1, len); 4985 __ subs(len, len, 8); 4986 __ br(Assembler::GT, LEN_OVER_8); 4987 __ ldr(rscratch2, Address(ary1, -8)); 4988 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 4989 __ lsrv(rscratch2, rscratch2, rscratch1); 4990 __ tst(rscratch2, UPPER_BIT_MASK); 4991 __ csel(result, zr, result, Assembler::NE); 4992 __ leave(); 4993 __ ret(lr); 4994 __ bind(LEN_OVER_8); 4995 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 4996 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 4997 __ tst(rscratch2, UPPER_BIT_MASK); 4998 __ br(Assembler::NE, RET_NO_POP); 4999 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 5000 __ lsrv(rscratch1, rscratch1, rscratch2); 5001 __ tst(rscratch1, UPPER_BIT_MASK); 5002 __ bind(RET_NO_POP); 5003 __ csel(result, zr, result, Assembler::NE); 5004 __ leave(); 5005 __ ret(lr); 5006 5007 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 5008 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 5009 5010 count_positives_long = __ pc(); // 2nd entry point 5011 5012 __ enter(); 5013 5014 __ bind(LEN_OVER_15); 5015 __ push(spilled_regs, sp); 5016 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 5017 __ cbz(rscratch2, ALIGNED); 5018 __ ldp(tmp6, tmp1, Address(ary1)); 5019 __ mov(tmp5, 16); 5020 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 5021 __ add(ary1, ary1, rscratch1); 5022 __ orr(tmp6, tmp6, tmp1); 5023 __ tst(tmp6, UPPER_BIT_MASK); 5024 __ br(Assembler::NE, RET_ADJUST); 5025 __ sub(len, len, rscratch1); 5026 5027 __ bind(ALIGNED); 5028 __ cmp(len, large_loop_size); 5029 __ br(Assembler::LT, CHECK_16); 5030 // Perform 16-byte load as early return in pre-loop to handle situation 5031 // when initially aligned large array has negative values at starting bytes, 5032 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 5033 // slower. Cases with negative bytes further ahead won't be affected that 5034 // much. 
In fact, it'll be faster due to early loads, less instructions and 5035 // less branches in LARGE_LOOP. 5036 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 5037 __ sub(len, len, 16); 5038 __ orr(tmp6, tmp6, tmp1); 5039 __ tst(tmp6, UPPER_BIT_MASK); 5040 __ br(Assembler::NE, RET_ADJUST_16); 5041 __ cmp(len, large_loop_size); 5042 __ br(Assembler::LT, CHECK_16); 5043 5044 if (SoftwarePrefetchHintDistance >= 0 5045 && SoftwarePrefetchHintDistance >= dcache_line) { 5046 // initial prefetch 5047 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 5048 } 5049 __ bind(LARGE_LOOP); 5050 if (SoftwarePrefetchHintDistance >= 0) { 5051 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 5052 } 5053 // Issue load instructions first, since it can save few CPU/MEM cycles, also 5054 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 5055 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 5056 // instructions per cycle and have less branches, but this approach disables 5057 // early return, thus, all 64 bytes are loaded and checked every time. 5058 __ ldp(tmp2, tmp3, Address(ary1)); 5059 __ ldp(tmp4, tmp5, Address(ary1, 16)); 5060 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 5061 __ ldp(tmp6, tmp1, Address(ary1, 48)); 5062 __ add(ary1, ary1, large_loop_size); 5063 __ sub(len, len, large_loop_size); 5064 __ orr(tmp2, tmp2, tmp3); 5065 __ orr(tmp4, tmp4, tmp5); 5066 __ orr(rscratch1, rscratch1, rscratch2); 5067 __ orr(tmp6, tmp6, tmp1); 5068 __ orr(tmp2, tmp2, tmp4); 5069 __ orr(rscratch1, rscratch1, tmp6); 5070 __ orr(tmp2, tmp2, rscratch1); 5071 __ tst(tmp2, UPPER_BIT_MASK); 5072 __ br(Assembler::NE, RET_ADJUST_LONG); 5073 __ cmp(len, large_loop_size); 5074 __ br(Assembler::GE, LARGE_LOOP); 5075 5076 __ bind(CHECK_16); // small 16-byte load pre-loop 5077 __ cmp(len, (u1)16); 5078 __ br(Assembler::LT, POST_LOOP16); 5079 5080 __ bind(LOOP16); // small 16-byte load loop 5081 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 5082 __ sub(len, len, 16); 5083 __ orr(tmp2, tmp2, tmp3); 5084 __ tst(tmp2, UPPER_BIT_MASK); 5085 __ br(Assembler::NE, RET_ADJUST_16); 5086 __ cmp(len, (u1)16); 5087 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 5088 5089 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 5090 __ cmp(len, (u1)8); 5091 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 5092 __ ldr(tmp3, Address(__ post(ary1, 8))); 5093 __ tst(tmp3, UPPER_BIT_MASK); 5094 __ br(Assembler::NE, RET_ADJUST); 5095 __ sub(len, len, 8); 5096 5097 __ bind(POST_LOOP16_LOAD_TAIL); 5098 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 5099 __ ldr(tmp1, Address(ary1)); 5100 __ mov(tmp2, 64); 5101 __ sub(tmp4, tmp2, len, __ LSL, 3); 5102 __ lslv(tmp1, tmp1, tmp4); 5103 __ tst(tmp1, UPPER_BIT_MASK); 5104 __ br(Assembler::NE, RET_ADJUST); 5105 // Fallthrough 5106 5107 __ bind(RET_LEN); 5108 __ pop(spilled_regs, sp); 5109 __ leave(); 5110 __ ret(lr); 5111 5112 // difference result - len is the count of guaranteed to be 5113 // positive bytes 5114 5115 __ bind(RET_ADJUST_LONG); 5116 __ add(len, len, (u1)(large_loop_size - 16)); 5117 __ bind(RET_ADJUST_16); 5118 __ add(len, len, 16); 5119 __ bind(RET_ADJUST); 5120 __ pop(spilled_regs, sp); 5121 __ leave(); 5122 __ sub(result, result, len); 5123 __ ret(lr); 5124 5125 return entry; 5126 } 5127 5128 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 5129 bool usePrefetch, Label &NOT_EQUAL) { 5130 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5131 tmp2 = 
rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5132 tmp7 = r12, tmp8 = r13; 5133 Label LOOP; 5134 5135 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5136 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5137 __ bind(LOOP); 5138 if (usePrefetch) { 5139 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5140 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5141 } 5142 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5143 __ eor(tmp1, tmp1, tmp2); 5144 __ eor(tmp3, tmp3, tmp4); 5145 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5146 __ orr(tmp1, tmp1, tmp3); 5147 __ cbnz(tmp1, NOT_EQUAL); 5148 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5149 __ eor(tmp5, tmp5, tmp6); 5150 __ eor(tmp7, tmp7, tmp8); 5151 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5152 __ orr(tmp5, tmp5, tmp7); 5153 __ cbnz(tmp5, NOT_EQUAL); 5154 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5155 __ eor(tmp1, tmp1, tmp2); 5156 __ eor(tmp3, tmp3, tmp4); 5157 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5158 __ orr(tmp1, tmp1, tmp3); 5159 __ cbnz(tmp1, NOT_EQUAL); 5160 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5161 __ eor(tmp5, tmp5, tmp6); 5162 __ sub(cnt1, cnt1, 8 * wordSize); 5163 __ eor(tmp7, tmp7, tmp8); 5164 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5165 // tmp6 is not used. MacroAssembler::subs is used here (rather than 5166 // cmp) because subs allows an unlimited range of immediate operand. 5167 __ subs(tmp6, cnt1, loopThreshold); 5168 __ orr(tmp5, tmp5, tmp7); 5169 __ cbnz(tmp5, NOT_EQUAL); 5170 __ br(__ GE, LOOP); 5171 // post-loop 5172 __ eor(tmp1, tmp1, tmp2); 5173 __ eor(tmp3, tmp3, tmp4); 5174 __ orr(tmp1, tmp1, tmp3); 5175 __ sub(cnt1, cnt1, 2 * wordSize); 5176 __ cbnz(tmp1, NOT_EQUAL); 5177 } 5178 5179 void generate_large_array_equals_loop_simd(int loopThreshold, 5180 bool usePrefetch, Label &NOT_EQUAL) { 5181 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5182 tmp2 = rscratch2; 5183 Label LOOP; 5184 5185 __ bind(LOOP); 5186 if (usePrefetch) { 5187 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5188 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5189 } 5190 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 5191 __ sub(cnt1, cnt1, 8 * wordSize); 5192 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 5193 __ subs(tmp1, cnt1, loopThreshold); 5194 __ eor(v0, __ T16B, v0, v4); 5195 __ eor(v1, __ T16B, v1, v5); 5196 __ eor(v2, __ T16B, v2, v6); 5197 __ eor(v3, __ T16B, v3, v7); 5198 __ orr(v0, __ T16B, v0, v1); 5199 __ orr(v1, __ T16B, v2, v3); 5200 __ orr(v0, __ T16B, v0, v1); 5201 __ umov(tmp1, v0, __ D, 0); 5202 __ umov(tmp2, v0, __ D, 1); 5203 __ orr(tmp1, tmp1, tmp2); 5204 __ cbnz(tmp1, NOT_EQUAL); 5205 __ br(__ GE, LOOP); 5206 } 5207 5208 // a1 = r1 - array1 address 5209 // a2 = r2 - array2 address 5210 // result = r0 - return value. Already contains "false" 5211 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 5212 // r3-r5 are reserved temporary registers 5213 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 5214 address generate_large_array_equals() { 5215 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5216 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5217 tmp7 = r12, tmp8 = r13; 5218 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 5219 SMALL_LOOP, POST_LOOP; 5220 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 5221 // calculate if at least 32 prefetched bytes are used 5222 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 5223 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 5224 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 5225 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 5226 tmp5, tmp6, tmp7, tmp8); 5227 5228 __ align(CodeEntryAlignment); 5229 5230 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 5231 5232 address entry = __ pc(); 5233 __ enter(); 5234 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 5235 // also advance pointers to use post-increment instead of pre-increment 5236 __ add(a1, a1, wordSize); 5237 __ add(a2, a2, wordSize); 5238 if (AvoidUnalignedAccesses) { 5239 // both implementations (SIMD/nonSIMD) are using relatively large load 5240 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 5241 // on some CPUs in case of address is not at least 16-byte aligned. 5242 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 5243 // load if needed at least for 1st address and make if 16-byte aligned. 5244 Label ALIGNED16; 5245 __ tbz(a1, 3, ALIGNED16); 5246 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5247 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5248 __ sub(cnt1, cnt1, wordSize); 5249 __ eor(tmp1, tmp1, tmp2); 5250 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 5251 __ bind(ALIGNED16); 5252 } 5253 if (UseSIMDForArrayEquals) { 5254 if (SoftwarePrefetchHintDistance >= 0) { 5255 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5256 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5257 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 5258 /* prfm = */ true, NOT_EQUAL); 5259 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5260 __ br(__ LT, TAIL); 5261 } 5262 __ bind(NO_PREFETCH_LARGE_LOOP); 5263 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 5264 /* prfm = */ false, NOT_EQUAL); 5265 } else { 5266 __ push(spilled_regs, sp); 5267 if (SoftwarePrefetchHintDistance >= 0) { 5268 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5269 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5270 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 5271 /* prfm = */ true, NOT_EQUAL); 5272 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5273 __ br(__ LT, TAIL); 5274 } 5275 __ bind(NO_PREFETCH_LARGE_LOOP); 5276 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 5277 /* prfm = */ false, NOT_EQUAL); 5278 } 5279 __ bind(TAIL); 5280 __ cbz(cnt1, EQUAL); 5281 __ subs(cnt1, cnt1, wordSize); 5282 __ br(__ LE, POST_LOOP); 5283 __ bind(SMALL_LOOP); 5284 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5285 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5286 __ subs(cnt1, cnt1, wordSize); 5287 __ eor(tmp1, tmp1, tmp2); 5288 __ cbnz(tmp1, NOT_EQUAL); 5289 __ br(__ GT, SMALL_LOOP); 5290 __ bind(POST_LOOP); 5291 __ ldr(tmp1, Address(a1, cnt1)); 5292 __ ldr(tmp2, Address(a2, cnt1)); 5293 __ eor(tmp1, tmp1, tmp2); 5294 __ cbnz(tmp1, NOT_EQUAL); 5295 __ bind(EQUAL); 5296 __ mov(result, true); 5297 __ bind(NOT_EQUAL); 5298 if (!UseSIMDForArrayEquals) { 5299 __ pop(spilled_regs, sp); 5300 } 5301 __ bind(NOT_EQUAL_NO_POP); 5302 __ leave(); 5303 __ ret(lr); 5304 return entry; 5305 } 5306 5307 address generate_dsin_dcos(bool isCos) { 5308 __ align(CodeEntryAlignment); 5309 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 5310 address start = __ pc(); 5311 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5312 (address)StubRoutines::aarch64::_two_over_pi, 5313 (address)StubRoutines::aarch64::_pio2, 5314 (address)StubRoutines::aarch64::_dsin_coef, 5315 (address)StubRoutines::aarch64::_dcos_coef); 5316 return start; 5317 } 5318 5319 address generate_dlog() { 5320 __ align(CodeEntryAlignment); 5321 StubCodeMark mark(this, "StubRoutines", "dlog"); 5322 address entry = __ pc(); 5323 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 5324 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 5325 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 5326 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 5327 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 5328 return entry; 5329 } 5330 5331 5332 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5333 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5334 Label &DIFF2) { 5335 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5336 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5337 5338 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5339 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5340 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5341 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5342 5343 __ fmovd(tmpL, vtmp3); 5344 __ eor(rscratch2, tmp3, tmpL); 5345 __ cbnz(rscratch2, DIFF2); 5346 5347 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5348 __ umov(tmpL, vtmp3, __ D, 1); 5349 __ eor(rscratch2, tmpU, tmpL); 5350 __ cbnz(rscratch2, DIFF1); 5351 5352 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5353 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5354 __ fmovd(tmpL, vtmp); 5355 __ eor(rscratch2, tmp3, tmpL); 5356 __ cbnz(rscratch2, DIFF2); 5357 5358 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5359 __ umov(tmpL, vtmp, __ D, 1); 5360 __ eor(rscratch2, tmpU, tmpL); 5361 __ cbnz(rscratch2, DIFF1); 5362 } 5363 5364 // r0 = result 5365 // r1 = str1 5366 // r2 = cnt1 5367 // r3 = str2 5368 // r4 = cnt2 5369 // r10 = tmp1 5370 // r11 = tmp2 5371 address generate_compare_long_string_different_encoding(bool isLU) { 5372 __ align(CodeEntryAlignment); 5373 StubCodeMark mark(this, "StubRoutines", isLU 5374 ? "compare_long_string_different_encoding LU" 5375 : "compare_long_string_different_encoding UL"); 5376 address entry = __ pc(); 5377 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5378 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5379 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5380 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5381 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5382 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5383 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5384 5385 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5386 5387 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5388 // cnt2 == amount of characters left to compare 5389 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5390 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5391 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5392 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5393 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5394 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5395 __ eor(rscratch2, tmp1, tmp2); 5396 __ mov(rscratch1, tmp2); 5397 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5398 Register tmpU = isLU ? 
rscratch1 : tmp1, // where to keep U for comparison 5399 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5400 __ push(spilled_regs, sp); 5401 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5402 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 5403 5404 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5405 5406 if (SoftwarePrefetchHintDistance >= 0) { 5407 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5408 __ br(__ LT, NO_PREFETCH); 5409 __ bind(LARGE_LOOP_PREFETCH); 5410 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 5411 __ mov(tmp4, 2); 5412 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5413 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 5414 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5415 __ subs(tmp4, tmp4, 1); 5416 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 5417 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5418 __ mov(tmp4, 2); 5419 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 5420 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5421 __ subs(tmp4, tmp4, 1); 5422 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 5423 __ sub(cnt2, cnt2, 64); 5424 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5425 __ br(__ GE, LARGE_LOOP_PREFETCH); 5426 } 5427 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 5428 __ bind(NO_PREFETCH); 5429 __ subs(cnt2, cnt2, 16); 5430 __ br(__ LT, TAIL); 5431 __ align(OptoLoopAlignment); 5432 __ bind(SMALL_LOOP); // smaller loop 5433 __ subs(cnt2, cnt2, 16); 5434 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5435 __ br(__ GE, SMALL_LOOP); 5436 __ cmn(cnt2, (u1)16); 5437 __ br(__ EQ, LOAD_LAST); 5438 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 5439 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 5440 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 5441 __ ldr(tmp3, Address(cnt1, -8)); 5442 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 5443 __ b(LOAD_LAST); 5444 __ bind(DIFF2); 5445 __ mov(tmpU, tmp3); 5446 __ bind(DIFF1); 5447 __ pop(spilled_regs, sp); 5448 __ b(CALCULATE_DIFFERENCE); 5449 __ bind(LOAD_LAST); 5450 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 5451 // No need to load it again 5452 __ mov(tmpU, tmp3); 5453 __ pop(spilled_regs, sp); 5454 5455 // tmp2 points to the address of the last 4 Latin1 characters right now 5456 __ ldrs(vtmp, Address(tmp2)); 5457 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5458 __ fmovd(tmpL, vtmp); 5459 5460 __ eor(rscratch2, tmpU, tmpL); 5461 __ cbz(rscratch2, DONE); 5462 5463 // Find the first different characters in the longwords and 5464 // compute their difference. 
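// In more detail (illustrative note, not generated code): at this point
// rscratch2 holds the XOR of the two 8-byte chunks that were found to
// differ, and each chunk holds four UTF-16 chars in little-endian lanes
// (char k in bits [16*k, 16*k + 15]).  rev byte-reverses the XOR so that
// clz effectively counts from the low end: it yields 8 * (index of the
// lowest non-zero byte) plus a 0..7 remainder, and and-ing with -16
// rounds that down to 16*k, the bit offset of the first differing char
// lane (e.g. a first mismatch at char index 2 gives a shift of 32).
// lsrv/uxthw then extract exactly that char from each side, and subw
// produces the signed difference this stub returns for a mismatch.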
5465 __ bind(CALCULATE_DIFFERENCE); 5466 __ rev(rscratch2, rscratch2); 5467 __ clz(rscratch2, rscratch2); 5468 __ andr(rscratch2, rscratch2, -16); 5469 __ lsrv(tmp1, tmp1, rscratch2); 5470 __ uxthw(tmp1, tmp1); 5471 __ lsrv(rscratch1, rscratch1, rscratch2); 5472 __ uxthw(rscratch1, rscratch1); 5473 __ subw(result, tmp1, rscratch1); 5474 __ bind(DONE); 5475 __ ret(lr); 5476 return entry; 5477 } 5478 5479 address generate_method_entry_barrier() { 5480 __ align(CodeEntryAlignment); 5481 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 5482 5483 Label deoptimize_label; 5484 5485 address start = __ pc(); 5486 5487 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 5488 5489 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 5490 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 5491 // We can get here despite the nmethod being good, if we have not 5492 // yet applied our cross modification fence (or data fence). 5493 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 5494 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 5495 __ ldrw(rscratch2, rscratch2); 5496 __ strw(rscratch2, thread_epoch_addr); 5497 __ isb(); 5498 __ membar(__ LoadLoad); 5499 } 5500 5501 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 5502 5503 __ enter(); 5504 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 5505 5506 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 5507 5508 __ push_call_clobbered_registers(); 5509 5510 __ mov(c_rarg0, rscratch2); 5511 __ call_VM_leaf 5512 (CAST_FROM_FN_PTR 5513 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 5514 5515 __ reset_last_Java_frame(true); 5516 5517 __ mov(rscratch1, r0); 5518 5519 __ pop_call_clobbered_registers(); 5520 5521 __ cbnz(rscratch1, deoptimize_label); 5522 5523 __ leave(); 5524 __ ret(lr); 5525 5526 __ BIND(deoptimize_label); 5527 5528 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 5529 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 5530 5531 __ mov(sp, rscratch1); 5532 __ br(rscratch2); 5533 5534 return start; 5535 } 5536 5537 // r0 = result 5538 // r1 = str1 5539 // r2 = cnt1 5540 // r3 = str2 5541 // r4 = cnt2 5542 // r10 = tmp1 5543 // r11 = tmp2 5544 address generate_compare_long_string_same_encoding(bool isLL) { 5545 __ align(CodeEntryAlignment); 5546 StubCodeMark mark(this, "StubRoutines", isLL 5547 ? "compare_long_string_same_encoding LL" 5548 : "compare_long_string_same_encoding UU"); 5549 address entry = __ pc(); 5550 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5551 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 5552 5553 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 5554 5555 // exit from large loop when less than 64 bytes left to read or we're about 5556 // to prefetch memory behind array border 5557 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 5558 5559 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 5560 __ eor(rscratch2, tmp1, tmp2); 5561 __ cbnz(rscratch2, CAL_DIFFERENCE); 5562 5563 __ sub(cnt2, cnt2, wordSize/(isLL ? 
1 : 2)); 5564 // update pointers, because of previous read 5565 __ add(str1, str1, wordSize); 5566 __ add(str2, str2, wordSize); 5567 if (SoftwarePrefetchHintDistance >= 0) { 5568 __ align(OptoLoopAlignment); 5569 __ bind(LARGE_LOOP_PREFETCH); 5570 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 5571 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 5572 5573 for (int i = 0; i < 4; i++) { 5574 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 5575 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 5576 __ cmp(tmp1, tmp2); 5577 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5578 __ br(Assembler::NE, DIFF); 5579 } 5580 __ sub(cnt2, cnt2, isLL ? 64 : 32); 5581 __ add(str1, str1, 64); 5582 __ add(str2, str2, 64); 5583 __ subs(rscratch2, cnt2, largeLoopExitCondition); 5584 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 5585 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 5586 } 5587 5588 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 5589 __ br(Assembler::LE, LESS16); 5590 __ align(OptoLoopAlignment); 5591 __ bind(LOOP_COMPARE16); 5592 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5593 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5594 __ cmp(tmp1, tmp2); 5595 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5596 __ br(Assembler::NE, DIFF); 5597 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5598 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5599 __ br(Assembler::LT, LESS16); 5600 5601 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5602 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5603 __ cmp(tmp1, tmp2); 5604 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5605 __ br(Assembler::NE, DIFF); 5606 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5607 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5608 __ br(Assembler::GE, LOOP_COMPARE16); 5609 __ cbz(cnt2, LENGTH_DIFF); 5610 5611 __ bind(LESS16); 5612 // each 8 compare 5613 __ subs(cnt2, cnt2, isLL ? 8 : 4); 5614 __ br(Assembler::LE, LESS8); 5615 __ ldr(tmp1, Address(__ post(str1, 8))); 5616 __ ldr(tmp2, Address(__ post(str2, 8))); 5617 __ eor(rscratch2, tmp1, tmp2); 5618 __ cbnz(rscratch2, CAL_DIFFERENCE); 5619 __ sub(cnt2, cnt2, isLL ? 8 : 4); 5620 5621 __ bind(LESS8); // directly load last 8 bytes 5622 if (!isLL) { 5623 __ add(cnt2, cnt2, cnt2); 5624 } 5625 __ ldr(tmp1, Address(str1, cnt2)); 5626 __ ldr(tmp2, Address(str2, cnt2)); 5627 __ eor(rscratch2, tmp1, tmp2); 5628 __ cbz(rscratch2, LENGTH_DIFF); 5629 __ b(CAL_DIFFERENCE); 5630 5631 __ bind(DIFF); 5632 __ cmp(tmp1, tmp2); 5633 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 5634 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 5635 // reuse rscratch2 register for the result of eor instruction 5636 __ eor(rscratch2, tmp1, tmp2); 5637 5638 __ bind(CAL_DIFFERENCE); 5639 __ rev(rscratch2, rscratch2); 5640 __ clz(rscratch2, rscratch2); 5641 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 5642 __ lsrv(tmp1, tmp1, rscratch2); 5643 __ lsrv(tmp2, tmp2, rscratch2); 5644 if (isLL) { 5645 __ uxtbw(tmp1, tmp1); 5646 __ uxtbw(tmp2, tmp2); 5647 } else { 5648 __ uxthw(tmp1, tmp1); 5649 __ uxthw(tmp2, tmp2); 5650 } 5651 __ subw(result, tmp1, tmp2); 5652 5653 __ bind(LENGTH_DIFF); 5654 __ ret(lr); 5655 return entry; 5656 } 5657 5658 enum string_compare_mode { 5659 LL, 5660 LU, 5661 UL, 5662 UU, 5663 }; 5664 5665 // The following registers are declared in aarch64.ad 5666 // r0 = result 5667 // r1 = str1 5668 // r2 = cnt1 5669 // r3 = str2 5670 // r4 = cnt2 5671 // r10 = tmp1 5672 // r11 = tmp2 5673 // z0 = ztmp1 5674 // z1 = ztmp2 5675 // p0 = pgtmp1 5676 // p1 = pgtmp2 5677 address generate_compare_long_string_sve(string_compare_mode mode) { 5678 __ align(CodeEntryAlignment); 5679 address entry = __ pc(); 5680 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5681 tmp1 = r10, tmp2 = r11; 5682 5683 Label LOOP, DONE, MISMATCH; 5684 Register vec_len = tmp1; 5685 Register idx = tmp2; 5686 // The minimum of the string lengths has been stored in cnt2. 5687 Register cnt = cnt2; 5688 FloatRegister ztmp1 = z0, ztmp2 = z1; 5689 PRegister pgtmp1 = p0, pgtmp2 = p1; 5690 5691 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 5692 switch (mode) { \ 5693 case LL: \ 5694 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 5695 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 5696 break; \ 5697 case LU: \ 5698 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 5699 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5700 break; \ 5701 case UL: \ 5702 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5703 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 5704 break; \ 5705 case UU: \ 5706 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5707 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5708 break; \ 5709 default: \ 5710 ShouldNotReachHere(); \ 5711 } 5712 5713 const char* stubname; 5714 switch (mode) { 5715 case LL: stubname = "compare_long_string_same_encoding LL"; break; 5716 case LU: stubname = "compare_long_string_different_encoding LU"; break; 5717 case UL: stubname = "compare_long_string_different_encoding UL"; break; 5718 case UU: stubname = "compare_long_string_same_encoding UU"; break; 5719 default: ShouldNotReachHere(); 5720 } 5721 5722 StubCodeMark mark(this, "StubRoutines", stubname); 5723 5724 __ mov(idx, 0); 5725 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5726 5727 if (mode == LL) { 5728 __ sve_cntb(vec_len); 5729 } else { 5730 __ sve_cnth(vec_len); 5731 } 5732 5733 __ sub(rscratch1, cnt, vec_len); 5734 5735 __ bind(LOOP); 5736 5737 // main loop 5738 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5739 __ add(idx, idx, vec_len); 5740 // Compare strings. 5741 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5742 __ br(__ NE, MISMATCH); 5743 __ cmp(idx, rscratch1); 5744 __ br(__ LT, LOOP); 5745 5746 // post loop, last iteration 5747 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5748 5749 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5750 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5751 __ br(__ EQ, DONE); 5752 5753 __ bind(MISMATCH); 5754 5755 // Crop the vector to find its location. 5756 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 5757 // Extract the first different characters of each string. 
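// In more detail (illustrative note, not generated code): pgtmp2 has a
// true lane at every position where the two vectors differ under the
// governing predicate pgtmp1.  sve_brkb rewrites it so that only the
// lanes strictly before the first mismatch stay active, and sve_lasta
// then reads the element one past the last active lane - i.e. the first
// differing character itself (or lane 0 when no lane is active, which is
// exactly the mismatch-at-lane-0 case).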
5758 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 5759 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 5760 5761 // Compute the difference of the first different characters. 5762 __ sub(result, rscratch1, rscratch2); 5763 5764 __ bind(DONE); 5765 __ ret(lr); 5766 #undef LOAD_PAIR 5767 return entry; 5768 } 5769 5770 void generate_compare_long_strings() { 5771 if (UseSVE == 0) { 5772 StubRoutines::aarch64::_compare_long_string_LL 5773 = generate_compare_long_string_same_encoding(true); 5774 StubRoutines::aarch64::_compare_long_string_UU 5775 = generate_compare_long_string_same_encoding(false); 5776 StubRoutines::aarch64::_compare_long_string_LU 5777 = generate_compare_long_string_different_encoding(true); 5778 StubRoutines::aarch64::_compare_long_string_UL 5779 = generate_compare_long_string_different_encoding(false); 5780 } else { 5781 StubRoutines::aarch64::_compare_long_string_LL 5782 = generate_compare_long_string_sve(LL); 5783 StubRoutines::aarch64::_compare_long_string_UU 5784 = generate_compare_long_string_sve(UU); 5785 StubRoutines::aarch64::_compare_long_string_LU 5786 = generate_compare_long_string_sve(LU); 5787 StubRoutines::aarch64::_compare_long_string_UL 5788 = generate_compare_long_string_sve(UL); 5789 } 5790 } 5791 5792 // R0 = result 5793 // R1 = str2 5794 // R2 = cnt1 5795 // R3 = str1 5796 // R4 = cnt2 5797 // Clobbers: rscratch1, rscratch2, v0, v1, rflags 5798 // 5799 // This generic linear code use few additional ideas, which makes it faster: 5800 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 5801 // in order to skip initial loading(help in systems with 1 ld pipeline) 5802 // 2) we can use "fast" algorithm of finding single character to search for 5803 // first symbol with less branches(1 branch per each loaded register instead 5804 // of branch for each symbol), so, this is where constants like 5805 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 5806 // 3) after loading and analyzing 1st register of source string, it can be 5807 // used to search for every 1st character entry, saving few loads in 5808 // comparison with "simplier-but-slower" implementation 5809 // 4) in order to avoid lots of push/pop operations, code below is heavily 5810 // re-using/re-initializing/compressing register values, which makes code 5811 // larger and a bit less readable, however, most of extra operations are 5812 // issued during loads or branches, so, penalty is minimal 5813 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 5814 const char* stubName = str1_isL 5815 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 5816 : "indexof_linear_uu"; 5817 __ align(CodeEntryAlignment); 5818 StubCodeMark mark(this, "StubRoutines", stubName); 5819 address entry = __ pc(); 5820 5821 int str1_chr_size = str1_isL ? 1 : 2; 5822 int str2_chr_size = str2_isL ? 1 : 2; 5823 int str1_chr_shift = str1_isL ? 0 : 1; 5824 int str2_chr_shift = str2_isL ? 
0 : 1; 5825 bool isL = str1_isL && str2_isL; 5826 // parameters 5827 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 5828 // temporary registers 5829 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 5830 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 5831 // redefinitions 5832 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 5833 5834 __ push(spilled_regs, sp); 5835 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 5836 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 5837 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 5838 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 5839 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 5840 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 5841 // Read whole register from str1. It is safe, because length >=8 here 5842 __ ldr(ch1, Address(str1)); 5843 // Read whole register from str2. It is safe, because length >=8 here 5844 __ ldr(ch2, Address(str2)); 5845 __ sub(cnt2, cnt2, cnt1); 5846 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 5847 if (str1_isL != str2_isL) { 5848 __ eor(v0, __ T16B, v0, v0); 5849 } 5850 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 5851 __ mul(first, first, tmp1); 5852 // check if we have less than 1 register to check 5853 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 5854 if (str1_isL != str2_isL) { 5855 __ fmovd(v1, ch1); 5856 } 5857 __ br(__ LE, L_SMALL); 5858 __ eor(ch2, first, ch2); 5859 if (str1_isL != str2_isL) { 5860 __ zip1(v1, __ T16B, v1, v0); 5861 } 5862 __ sub(tmp2, ch2, tmp1); 5863 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5864 __ bics(tmp2, tmp2, ch2); 5865 if (str1_isL != str2_isL) { 5866 __ fmovd(ch1, v1); 5867 } 5868 __ br(__ NE, L_HAS_ZERO); 5869 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5870 __ add(result, result, wordSize/str2_chr_size); 5871 __ add(str2, str2, wordSize); 5872 __ br(__ LT, L_POST_LOOP); 5873 __ BIND(L_LOOP); 5874 __ ldr(ch2, Address(str2)); 5875 __ eor(ch2, first, ch2); 5876 __ sub(tmp2, ch2, tmp1); 5877 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5878 __ bics(tmp2, tmp2, ch2); 5879 __ br(__ NE, L_HAS_ZERO); 5880 __ BIND(L_LOOP_PROCEED); 5881 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5882 __ add(str2, str2, wordSize); 5883 __ add(result, result, wordSize/str2_chr_size); 5884 __ br(__ GE, L_LOOP); 5885 __ BIND(L_POST_LOOP); 5886 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 5887 __ br(__ LE, NOMATCH); 5888 __ ldr(ch2, Address(str2)); 5889 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5890 __ eor(ch2, first, ch2); 5891 __ sub(tmp2, ch2, tmp1); 5892 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5893 __ mov(tmp4, -1); // all bits set 5894 __ b(L_SMALL_PROCEED); 5895 __ align(OptoLoopAlignment); 5896 __ BIND(L_SMALL); 5897 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5898 __ eor(ch2, first, ch2); 5899 if (str1_isL != str2_isL) { 5900 __ zip1(v1, __ T16B, v1, v0); 5901 } 5902 __ sub(tmp2, ch2, tmp1); 5903 __ mov(tmp4, -1); // all bits set 5904 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5905 if (str1_isL != str2_isL) { 5906 __ fmovd(ch1, v1); // move converted 4 symbols 5907 } 5908 __ BIND(L_SMALL_PROCEED); 5909 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
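// Illustrative note (comment only, not generated code): the eor/sub/orr
// above together with the bic/ands just below form the classic SWAR
// "zero byte" test.  With the first pattern character replicated across
// a register (the mul by 0x0101../0x0001.. above), a plain-C sketch of
// the Latin-1 case is roughly:
//
//   uint64_t x   = chunk ^ replicated_first;       // 0x00 byte <=> match
//   uint64_t hit = (x - 0x0101010101010101ULL)
//                  & ~(x | 0x7f7f7f7f7f7f7f7fULL); // 0x80 marks a hit
//
// (the UTF-16 path uses the 16-bit analogues 0x0001.. and 0x7fff..).
// The lowest 0x80 marker always flags a real first-character match;
// higher markers can be spurious, which is harmless because every
// candidate is re-verified by the compare loops below.  rbit + clz turn
// the lowest marker into a position, and the tmp4 mask built above
// clears marker bits that fall outside the range still to be checked.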
5910 __ bic(tmp2, tmp2, ch2); 5911 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 5912 __ rbit(tmp2, tmp2); 5913 __ br(__ EQ, NOMATCH); 5914 __ BIND(L_SMALL_HAS_ZERO_LOOP); 5915 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 5916 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 5917 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 5918 if (str2_isL) { // LL 5919 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5920 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5921 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5922 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5923 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5924 } else { 5925 __ mov(ch2, 0xE); // all bits in byte set except last one 5926 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5927 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5928 __ lslv(tmp2, tmp2, tmp4); 5929 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5930 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5931 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5932 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5933 } 5934 __ cmp(ch1, ch2); 5935 __ mov(tmp4, wordSize/str2_chr_size); 5936 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5937 __ BIND(L_SMALL_CMP_LOOP); 5938 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5939 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5940 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5941 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5942 __ add(tmp4, tmp4, 1); 5943 __ cmp(tmp4, cnt1); 5944 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 5945 __ cmp(first, ch2); 5946 __ br(__ EQ, L_SMALL_CMP_LOOP); 5947 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 5948 __ cbz(tmp2, NOMATCH); // no more matches. exit 5949 __ clz(tmp4, tmp2); 5950 __ add(result, result, 1); // advance index 5951 __ add(str2, str2, str2_chr_size); // advance pointer 5952 __ b(L_SMALL_HAS_ZERO_LOOP); 5953 __ align(OptoLoopAlignment); 5954 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 5955 __ cmp(first, ch2); 5956 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5957 __ b(DONE); 5958 __ align(OptoLoopAlignment); 5959 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 5960 if (str2_isL) { // LL 5961 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5962 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5963 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5964 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5965 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5966 } else { 5967 __ mov(ch2, 0xE); // all bits in byte set except last one 5968 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5969 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5970 __ lslv(tmp2, tmp2, tmp4); 5971 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5972 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5973 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5974 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5975 } 5976 __ cmp(ch1, ch2); 5977 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5978 __ b(DONE); 5979 __ align(OptoLoopAlignment); 5980 __ BIND(L_HAS_ZERO); 5981 __ rbit(tmp2, tmp2); 5982 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 5983 // Now, perform compression of counters(cnt2 and cnt1) into one register. 5984 // It's fine because both counters are 32bit and are not changed in this 5985 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 5986 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 5987 __ sub(result, result, 1); 5988 __ BIND(L_HAS_ZERO_LOOP); 5989 __ mov(cnt1, wordSize/str2_chr_size); 5990 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 5991 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 5992 if (str2_isL) { 5993 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 5994 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5995 __ lslv(tmp2, tmp2, tmp4); 5996 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5997 __ add(tmp4, tmp4, 1); 5998 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5999 __ lsl(tmp2, tmp2, 1); 6000 __ mov(tmp4, wordSize/str2_chr_size); 6001 } else { 6002 __ mov(ch2, 0xE); 6003 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6004 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6005 __ lslv(tmp2, tmp2, tmp4); 6006 __ add(tmp4, tmp4, 1); 6007 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6008 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6009 __ lsl(tmp2, tmp2, 1); 6010 __ mov(tmp4, wordSize/str2_chr_size); 6011 __ sub(str2, str2, str2_chr_size); 6012 } 6013 __ cmp(ch1, ch2); 6014 __ mov(tmp4, wordSize/str2_chr_size); 6015 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6016 __ BIND(L_CMP_LOOP); 6017 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 6018 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 6019 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 6020 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 6021 __ add(tmp4, tmp4, 1); 6022 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6023 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 6024 __ cmp(cnt1, ch2); 6025 __ br(__ EQ, L_CMP_LOOP); 6026 __ BIND(L_CMP_LOOP_NOMATCH); 6027 // here we're not matched 6028 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 6029 __ clz(tmp4, tmp2); 6030 __ add(str2, str2, str2_chr_size); // advance pointer 6031 __ b(L_HAS_ZERO_LOOP); 6032 __ align(OptoLoopAlignment); 6033 __ BIND(L_CMP_LOOP_LAST_CMP); 6034 __ cmp(cnt1, ch2); 6035 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6036 __ b(DONE); 6037 __ align(OptoLoopAlignment); 6038 __ BIND(L_CMP_LOOP_LAST_CMP2); 6039 if (str2_isL) { 6040 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6041 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6042 __ lslv(tmp2, tmp2, tmp4); 6043 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6044 __ add(tmp4, tmp4, 1); 6045 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6046 __ lsl(tmp2, tmp2, 1); 6047 } else { 6048 __ mov(ch2, 0xE); 6049 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6050 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
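// Illustrative note (comment only, not generated code) on the 0xE mask
// used just above and in the earlier copies of this block: for a UTF-16
// str2 the marker located by rbit/clz sits at bit 15 of its 16-bit lane,
// so tmp4 >> LogBitsPerByte lands on the odd (high) byte of the matching
// character; and-ing that with 0xE (binary 1110) rounds it down to the
// even byte offset where the character starts, which is why the reload
// of str2 begins exactly at the candidate character.  The shifts by
// LogBitsPerByte + str2_chr_shift convert the same bit position into a
// character count when advancing result and str2.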
6051 __ lslv(tmp2, tmp2, tmp4); 6052 __ add(tmp4, tmp4, 1); 6053 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6054 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6055 __ lsl(tmp2, tmp2, 1); 6056 __ sub(str2, str2, str2_chr_size); 6057 } 6058 __ cmp(ch1, ch2); 6059 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6060 __ b(DONE); 6061 __ align(OptoLoopAlignment); 6062 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 6063 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 6064 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 6065 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 6066 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 6067 // result by analyzed characters value, so, we can just reset lower bits 6068 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 6069 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 6070 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 6071 // index of last analyzed substring inside current octet. So, str2 in at 6072 // respective start address. We need to advance it to next octet 6073 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 6074 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 6075 __ bfm(result, zr, 0, 2 - str2_chr_shift); 6076 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 6077 __ movw(cnt2, cnt2); 6078 __ b(L_LOOP_PROCEED); 6079 __ align(OptoLoopAlignment); 6080 __ BIND(NOMATCH); 6081 __ mov(result, -1); 6082 __ BIND(DONE); 6083 __ pop(spilled_regs, sp); 6084 __ ret(lr); 6085 return entry; 6086 } 6087 6088 void generate_string_indexof_stubs() { 6089 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 6090 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 6091 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 6092 } 6093 6094 void inflate_and_store_2_fp_registers(bool generatePrfm, 6095 FloatRegister src1, FloatRegister src2) { 6096 Register dst = r1; 6097 __ zip1(v1, __ T16B, src1, v0); 6098 __ zip2(v2, __ T16B, src1, v0); 6099 if (generatePrfm) { 6100 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 6101 } 6102 __ zip1(v3, __ T16B, src2, v0); 6103 __ zip2(v4, __ T16B, src2, v0); 6104 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 6105 } 6106 6107 // R0 = src 6108 // R1 = dst 6109 // R2 = len 6110 // R3 = len >> 3 6111 // V0 = 0 6112 // v1 = loaded 8 bytes 6113 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 6114 address generate_large_byte_array_inflate() { 6115 __ align(CodeEntryAlignment); 6116 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 6117 address entry = __ pc(); 6118 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 6119 Register src = r0, dst = r1, len = r2, octetCounter = r3; 6120 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 6121 6122 // do one more 8-byte read to have address 16-byte aligned in most cases 6123 // also use single store instruction 6124 __ ldrd(v2, __ post(src, 8)); 6125 __ sub(octetCounter, octetCounter, 2); 6126 __ zip1(v1, __ T16B, v1, v0); 6127 __ zip1(v2, __ T16B, v2, v0); 6128 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 6129 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6130 __ subs(rscratch1, octetCounter, large_loop_threshold); 6131 __ br(__ LE, LOOP_START); 6132 __ b(LOOP_PRFM_START); 6133 __ bind(LOOP_PRFM); 
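// Illustrative note (comment only, not generated code): the inflation in
// this loop is done by inflate_and_store_2_fp_registers() above, where
// zip1/zip2 interleave each source register with v0 (all zeroes).  For
// the Latin-1 input bytes 'J','a','v','a' this produces the byte
// sequence 'J',0,'a',0,'v',0,'a',0, i.e. the same text zero-extended to
// little-endian UTF-16, which is exactly what byte-array inflation has
// to emit.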
6134 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6135 __ bind(LOOP_PRFM_START); 6136 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 6137 __ sub(octetCounter, octetCounter, 8); 6138 __ subs(rscratch1, octetCounter, large_loop_threshold); 6139 inflate_and_store_2_fp_registers(true, v3, v4); 6140 inflate_and_store_2_fp_registers(true, v5, v6); 6141 __ br(__ GT, LOOP_PRFM); 6142 __ cmp(octetCounter, (u1)8); 6143 __ br(__ LT, DONE); 6144 __ bind(LOOP); 6145 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6146 __ bind(LOOP_START); 6147 __ sub(octetCounter, octetCounter, 8); 6148 __ cmp(octetCounter, (u1)8); 6149 inflate_and_store_2_fp_registers(false, v3, v4); 6150 inflate_and_store_2_fp_registers(false, v5, v6); 6151 __ br(__ GE, LOOP); 6152 __ bind(DONE); 6153 __ ret(lr); 6154 return entry; 6155 } 6156 6157 /** 6158 * Arguments: 6159 * 6160 * Input: 6161 * c_rarg0 - current state address 6162 * c_rarg1 - H key address 6163 * c_rarg2 - data address 6164 * c_rarg3 - number of blocks 6165 * 6166 * Output: 6167 * Updated state at c_rarg0 6168 */ 6169 address generate_ghash_processBlocks() { 6170 // Bafflingly, GCM uses little-endian for the byte order, but 6171 // big-endian for the bit order. For example, the polynomial 1 is 6172 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 6173 // 6174 // So, we must either reverse the bytes in each word and do 6175 // everything big-endian or reverse the bits in each byte and do 6176 // it little-endian. On AArch64 it's more idiomatic to reverse 6177 // the bits in each byte (we have an instruction, RBIT, to do 6178 // that) and keep the data in little-endian bit order through the 6179 // calculation, bit-reversing the inputs and outputs. 6180 6181 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 6182 __ align(wordSize * 2); 6183 address p = __ pc(); 6184 __ emit_int64(0x87); // The low-order bits of the field 6185 // polynomial (i.e. 
p = z^7+z^2+z+1) 6186 // repeated in the low and high parts of a 6187 // 128-bit vector 6188 __ emit_int64(0x87); 6189 6190 __ align(CodeEntryAlignment); 6191 address start = __ pc(); 6192 6193 Register state = c_rarg0; 6194 Register subkeyH = c_rarg1; 6195 Register data = c_rarg2; 6196 Register blocks = c_rarg3; 6197 6198 FloatRegister vzr = v30; 6199 __ eor(vzr, __ T16B, vzr, vzr); // zero register 6200 6201 __ ldrq(v24, p); // The field polynomial 6202 6203 __ ldrq(v0, Address(state)); 6204 __ ldrq(v1, Address(subkeyH)); 6205 6206 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 6207 __ rbit(v0, __ T16B, v0); 6208 __ rev64(v1, __ T16B, v1); 6209 __ rbit(v1, __ T16B, v1); 6210 6211 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 6212 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 6213 6214 { 6215 Label L_ghash_loop; 6216 __ bind(L_ghash_loop); 6217 6218 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 6219 // reversing each byte 6220 __ rbit(v2, __ T16B, v2); 6221 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 6222 6223 // Multiply state in v2 by subkey in v1 6224 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 6225 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 6226 /*temps*/v6, v3, /*reuse/clobber b*/v2); 6227 // Reduce v7:v5 by the field polynomial 6228 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 6229 6230 __ sub(blocks, blocks, 1); 6231 __ cbnz(blocks, L_ghash_loop); 6232 } 6233 6234 // The bit-reversed result is at this point in v0 6235 __ rev64(v0, __ T16B, v0); 6236 __ rbit(v0, __ T16B, v0); 6237 6238 __ st1(v0, __ T16B, state); 6239 __ ret(lr); 6240 6241 return start; 6242 } 6243 6244 address generate_ghash_processBlocks_wide() { 6245 address small = generate_ghash_processBlocks(); 6246 6247 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); 6248 __ align(wordSize * 2); 6249 address p = __ pc(); 6250 __ emit_int64(0x87); // The low-order bits of the field 6251 // polynomial (i.e. p = z^7+z^2+z+1) 6252 // repeated in the low and high parts of a 6253 // 128-bit vector 6254 __ emit_int64(0x87); 6255 6256 __ align(CodeEntryAlignment); 6257 address start = __ pc(); 6258 6259 Register state = c_rarg0; 6260 Register subkeyH = c_rarg1; 6261 Register data = c_rarg2; 6262 Register blocks = c_rarg3; 6263 6264 const int unroll = 4; 6265 6266 __ cmp(blocks, (unsigned char)(unroll * 2)); 6267 __ br(__ LT, small); 6268 6269 if (unroll > 1) { 6270 // Save state before entering routine 6271 __ sub(sp, sp, 4 * 16); 6272 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 6273 __ sub(sp, sp, 4 * 16); 6274 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 6275 } 6276 6277 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 6278 6279 if (unroll > 1) { 6280 // And restore state 6281 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 6282 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 6283 } 6284 6285 __ cmp(blocks, (unsigned char)0); 6286 __ br(__ GT, small); 6287 6288 __ ret(lr); 6289 6290 return start; 6291 } 6292 6293 void generate_base64_encode_simdround(Register src, Register dst, 6294 FloatRegister codec, u8 size) { 6295 6296 FloatRegister in0 = v4, in1 = v5, in2 = v6; 6297 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 6298 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 6299 6300 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 6301 6302 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 6303 6304 __ ushr(ind0, arrangement, in0, 2); 6305 6306 __ ushr(ind1, arrangement, in1, 2); 6307 __ shl(in0, arrangement, in0, 6); 6308 __ orr(ind1, arrangement, ind1, in0); 6309 __ ushr(ind1, arrangement, ind1, 2); 6310 6311 __ ushr(ind2, arrangement, in2, 4); 6312 __ shl(in1, arrangement, in1, 4); 6313 __ orr(ind2, arrangement, in1, ind2); 6314 __ ushr(ind2, arrangement, ind2, 2); 6315 6316 __ shl(ind3, arrangement, in2, 2); 6317 __ ushr(ind3, arrangement, ind3, 2); 6318 6319 __ tbl(out0, arrangement, codec, 4, ind0); 6320 __ tbl(out1, arrangement, codec, 4, ind1); 6321 __ tbl(out2, arrangement, codec, 4, ind2); 6322 __ tbl(out3, arrangement, codec, 4, ind3); 6323 6324 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 6325 } 6326 6327 /** 6328 * Arguments: 6329 * 6330 * Input: 6331 * c_rarg0 - src_start 6332 * c_rarg1 - src_offset 6333 * c_rarg2 - src_length 6334 * c_rarg3 - dest_start 6335 * c_rarg4 - dest_offset 6336 * c_rarg5 - isURL 6337 * 6338 */ 6339 address generate_base64_encodeBlock() { 6340 6341 static const char toBase64[64] = { 6342 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6343 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6344 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6345 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6346 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 6347 }; 6348 6349 static const char toBase64URL[64] = { 6350 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6351 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6352 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6353 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6354 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 6355 }; 6356 6357 __ align(CodeEntryAlignment); 6358 StubCodeMark mark(this, "StubRoutines", "encodeBlock"); 6359 address start = __ pc(); 6360 6361 Register src = c_rarg0; // source array 6362 Register soff = c_rarg1; // source start offset 6363 Register send = c_rarg2; // source end offset 6364 Register dst = c_rarg3; // dest array 6365 Register doff = c_rarg4; // position for writing to dest array 6366 Register isURL = c_rarg5; // Base64 or URL character set 6367 6368 // c_rarg6 and c_rarg7 are free to use as temps 6369 Register codec = c_rarg6; 6370 Register length = c_rarg7; 6371 6372 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 6373 6374 __ add(src, src, soff); 6375 __ add(dst, dst, doff); 6376 __ sub(length, send, soff); 6377 6378 // load the codec base address 6379 __ lea(codec, ExternalAddress((address) toBase64)); 6380 __ cbz(isURL, ProcessData); 6381 __ lea(codec, ExternalAddress((address) toBase64URL)); 6382 6383 __ BIND(ProcessData); 6384 6385 // too short to formup a SIMD loop, roll back 6386 __ cmp(length, (u1)24); 6387 __ br(Assembler::LT, Process3B); 6388 6389 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 6390 6391 __ BIND(Process48B); 6392 __ cmp(length, (u1)48); 6393 __ br(Assembler::LT, Process24B); 6394 generate_base64_encode_simdround(src, dst, v0, 16); 6395 __ sub(length, length, 48); 6396 __ b(Process48B); 6397 6398 __ BIND(Process24B); 6399 __ cmp(length, (u1)24); 6400 __ br(Assembler::LT, SIMDExit); 6401 generate_base64_encode_simdround(src, dst, v0, 8); 6402 __ sub(length, length, 24); 6403 6404 __ BIND(SIMDExit); 6405 __ cbz(length, Exit); 6406 6407 __ 
BIND(Process3B); 6408 // 3 src bytes, 24 bits 6409 __ ldrb(r10, __ post(src, 1)); 6410 __ ldrb(r11, __ post(src, 1)); 6411 __ ldrb(r12, __ post(src, 1)); 6412 __ orrw(r11, r11, r10, Assembler::LSL, 8); 6413 __ orrw(r12, r12, r11, Assembler::LSL, 8); 6414 // codec index 6415 __ ubfmw(r15, r12, 18, 23); 6416 __ ubfmw(r14, r12, 12, 17); 6417 __ ubfmw(r13, r12, 6, 11); 6418 __ andw(r12, r12, 63); 6419 // get the code based on the codec 6420 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 6421 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 6422 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 6423 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 6424 __ strb(r15, __ post(dst, 1)); 6425 __ strb(r14, __ post(dst, 1)); 6426 __ strb(r13, __ post(dst, 1)); 6427 __ strb(r12, __ post(dst, 1)); 6428 __ sub(length, length, 3); 6429 __ cbnz(length, Process3B); 6430 6431 __ BIND(Exit); 6432 __ ret(lr); 6433 6434 return start; 6435 } 6436 6437 void generate_base64_decode_simdround(Register src, Register dst, 6438 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 6439 6440 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 6441 FloatRegister out0 = v20, out1 = v21, out2 = v22; 6442 6443 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 6444 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 6445 6446 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 6447 6448 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 6449 6450 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 6451 6452 // we need unsigned saturating subtract, to make sure all input values 6453 // in range [0, 63] will have 0U value in the higher half lookup 6454 __ uqsubv(decH0, __ T16B, in0, v27); 6455 __ uqsubv(decH1, __ T16B, in1, v27); 6456 __ uqsubv(decH2, __ T16B, in2, v27); 6457 __ uqsubv(decH3, __ T16B, in3, v27); 6458 6459 // lower half lookup 6460 __ tbl(decL0, arrangement, codecL, 4, in0); 6461 __ tbl(decL1, arrangement, codecL, 4, in1); 6462 __ tbl(decL2, arrangement, codecL, 4, in2); 6463 __ tbl(decL3, arrangement, codecL, 4, in3); 6464 6465 // higher half lookup 6466 __ tbx(decH0, arrangement, codecH, 4, decH0); 6467 __ tbx(decH1, arrangement, codecH, 4, decH1); 6468 __ tbx(decH2, arrangement, codecH, 4, decH2); 6469 __ tbx(decH3, arrangement, codecH, 4, decH3); 6470 6471 // combine lower and higher 6472 __ orr(decL0, arrangement, decL0, decH0); 6473 __ orr(decL1, arrangement, decL1, decH1); 6474 __ orr(decL2, arrangement, decL2, decH2); 6475 __ orr(decL3, arrangement, decL3, decH3); 6476 6477 // check illegal inputs, value larger than 63 (maximum of 6 bits) 6478 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 6479 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 6480 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 6481 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 6482 __ orr(in0, arrangement, decH0, decH1); 6483 __ orr(in1, arrangement, decH2, decH3); 6484 __ orr(in2, arrangement, in0, in1); 6485 __ umaxv(in3, arrangement, in2); 6486 __ umov(rscratch2, in3, __ B, 0); 6487 6488 // get the data to output 6489 __ shl(out0, arrangement, decL0, 2); 6490 __ ushr(out1, arrangement, decL1, 4); 6491 __ orr(out0, arrangement, out0, out1); 6492 __ shl(out1, arrangement, decL1, 4); 6493 __ ushr(out2, arrangement, decL2, 2); 6494 __ orr(out1, arrangement, out1, out2); 6495 __ shl(out2, arrangement, decL2, 6); 6496 __ orr(out2, arrangement, out2, decL3); 6497 6498 __ cbz(rscratch2, NoIllegalData); 6499 
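// Illustrative note (comment only, not generated code) on the error
// handling below: after the cm/orr/umaxv sequence above, lane i of in2
// is all-ones when any of the four characters of input group i failed
// to decode to a valid 6-bit value, and rscratch2 is non-zero when that
// holds for any group.  The slow path walks group by group: r10 carries
// the flag bytes and r11/r12/r13 the three decoded output bytes of each
// group; every StoreLegalData iteration stores one group's three bytes,
// then shifts all four registers right by 8 to step to the next group,
// and exits as soon as a flagged (0xff) group is reached (the tbnz on
// bit 5 of the marker).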
6500 // handle illegal input 6501 __ umov(r10, in2, __ D, 0); 6502 if (size == 16) { 6503 __ cbnz(r10, ErrorInLowerHalf); 6504 6505 // illegal input is in higher half, store the lower half now. 6506 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 6507 6508 __ umov(r10, in2, __ D, 1); 6509 __ umov(r11, out0, __ D, 1); 6510 __ umov(r12, out1, __ D, 1); 6511 __ umov(r13, out2, __ D, 1); 6512 __ b(StoreLegalData); 6513 6514 __ BIND(ErrorInLowerHalf); 6515 } 6516 __ umov(r11, out0, __ D, 0); 6517 __ umov(r12, out1, __ D, 0); 6518 __ umov(r13, out2, __ D, 0); 6519 6520 __ BIND(StoreLegalData); 6521 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 6522 __ strb(r11, __ post(dst, 1)); 6523 __ strb(r12, __ post(dst, 1)); 6524 __ strb(r13, __ post(dst, 1)); 6525 __ lsr(r10, r10, 8); 6526 __ lsr(r11, r11, 8); 6527 __ lsr(r12, r12, 8); 6528 __ lsr(r13, r13, 8); 6529 __ b(StoreLegalData); 6530 6531 __ BIND(NoIllegalData); 6532 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 6533 } 6534 6535 6536 /** 6537 * Arguments: 6538 * 6539 * Input: 6540 * c_rarg0 - src_start 6541 * c_rarg1 - src_offset 6542 * c_rarg2 - src_length 6543 * c_rarg3 - dest_start 6544 * c_rarg4 - dest_offset 6545 * c_rarg5 - isURL 6546 * c_rarg6 - isMIME 6547 * 6548 */ 6549 address generate_base64_decodeBlock() { 6550 6551 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 6552 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 6553 // titled "Base64 decoding". 6554 6555 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 6556 // except the trailing character '=' is also treated illegal value in this intrinsic. That 6557 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
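// Worked example (illustrative comment only): decoding the 4-byte block
// "QUJD" with the table below gives 'Q'(81)->16, 'U'(85)->20, 'J'(74)->9
// and 'D'(68)->3, i.e. the 6-bit groups 010000 010100 001001 000011,
// which regroup into the three output bytes 0x41 0x42 0x43 ("ABC").  Any
// input byte that maps to 255u - including the '=' padding, as noted
// above - is treated as illegal by this intrinsic.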
6558 static const uint8_t fromBase64ForNoSIMD[256] = { 6559 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6560 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6561 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6562 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6563 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6564 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 6565 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6566 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6567 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6568 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6569 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6570 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6571 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6572 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6573 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6574 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6575 }; 6576 6577 static const uint8_t fromBase64URLForNoSIMD[256] = { 6578 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6579 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6580 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6581 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6582 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6583 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 6584 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6585 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6586 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6587 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6588 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6589 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6590 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6591 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6592 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6593 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6594 }; 6595 6596 // A legal value of base64 code is in range [0, 127]. We need two lookups 6597 // with tbl/tbx and combine them to get the decode data. The 1st table vector 6598 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 6599 // table vector lookup use tbx, out of range indices are unchanged in 6600 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 6601 // The value of index 64 is set to 0, so that we know that we already get the 6602 // decoded data with the 1st lookup. 6603 static const uint8_t fromBase64ForSIMD[128] = { 6604 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6605 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6606 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6607 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6608 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6609 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6610 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6611 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6612 }; 6613 6614 static const uint8_t fromBase64URLForSIMD[128] = { 6615 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6616 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6617 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6618 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6619 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6620 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6621 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6622 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6623 }; 6624 6625 __ align(CodeEntryAlignment); 6626 StubCodeMark mark(this, "StubRoutines", "decodeBlock"); 6627 address start = __ pc(); 6628 6629 Register src = c_rarg0; // source array 6630 Register soff = c_rarg1; // source start offset 6631 Register send = c_rarg2; // source end offset 6632 Register dst = c_rarg3; // dest array 6633 Register doff = c_rarg4; // position for writing to dest array 6634 Register isURL = c_rarg5; // Base64 or URL character set 6635 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 6636 6637 Register length = send; // reuse send as length of source data to process 6638 6639 Register simd_codec = c_rarg6; 6640 Register nosimd_codec = c_rarg7; 6641 6642 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 6643 6644 __ enter(); 6645 6646 __ add(src, src, soff); 6647 __ add(dst, dst, doff); 6648 6649 __ mov(doff, dst); 6650 6651 __ sub(length, send, soff); 6652 __ bfm(length, zr, 0, 1); 6653 6654 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 6655 __ cbz(isURL, ProcessData); 6656 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 6657 6658 __ BIND(ProcessData); 6659 __ mov(rscratch1, length); 6660 __ cmp(length, (u1)144); // 144 = 80 + 64 6661 __ br(Assembler::LT, Process4B); 6662 6663 // In the MIME case, the line length cannot be more than 76 6664 // bytes (see RFC 2045). This is too short a block for SIMD 6665 // to be worthwhile, so we use non-SIMD here. 
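    // rscratch1 is the byte budget of the scalar loop below. Starting it at 79
    // makes the subsw-by-4 loop run 20 times (80 bytes) and exit with
    // rscratch1 == -1, which distinguishes this pre-pass from the SIMD tail
    // path, where rscratch1 holds the exact remaining length and ends at 0.
    // The 144-byte threshold above leaves at least 64 bytes for the SIMD loop
    // after this 80-byte pre-pass.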
6666 __ movw(rscratch1, 79); 6667 6668 __ BIND(Process4B); 6669 __ ldrw(r14, __ post(src, 4)); 6670 __ ubfxw(r10, r14, 0, 8); 6671 __ ubfxw(r11, r14, 8, 8); 6672 __ ubfxw(r12, r14, 16, 8); 6673 __ ubfxw(r13, r14, 24, 8); 6674 // get the de-code 6675 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 6676 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 6677 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 6678 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 6679 // error detection, 255u indicates an illegal input 6680 __ orrw(r14, r10, r11); 6681 __ orrw(r15, r12, r13); 6682 __ orrw(r14, r14, r15); 6683 __ tbnz(r14, 7, Exit); 6684 // recover the data 6685 __ lslw(r14, r10, 10); 6686 __ bfiw(r14, r11, 4, 6); 6687 __ bfmw(r14, r12, 2, 5); 6688 __ rev16w(r14, r14); 6689 __ bfiw(r13, r12, 6, 2); 6690 __ strh(r14, __ post(dst, 2)); 6691 __ strb(r13, __ post(dst, 1)); 6692 // non-simd loop 6693 __ subsw(rscratch1, rscratch1, 4); 6694 __ br(Assembler::GT, Process4B); 6695 6696 // if exiting from PreProcess80B, rscratch1 == -1; 6697 // otherwise, rscratch1 == 0. 6698 __ cbzw(rscratch1, Exit); 6699 __ sub(length, length, 80); 6700 6701 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 6702 __ cbz(isURL, SIMDEnter); 6703 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 6704 6705 __ BIND(SIMDEnter); 6706 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 6707 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 6708 __ mov(rscratch1, 63); 6709 __ dup(v27, __ T16B, rscratch1); 6710 6711 __ BIND(Process64B); 6712 __ cmp(length, (u1)64); 6713 __ br(Assembler::LT, Process32B); 6714 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 6715 __ sub(length, length, 64); 6716 __ b(Process64B); 6717 6718 __ BIND(Process32B); 6719 __ cmp(length, (u1)32); 6720 __ br(Assembler::LT, SIMDExit); 6721 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 6722 __ sub(length, length, 32); 6723 __ b(Process32B); 6724 6725 __ BIND(SIMDExit); 6726 __ cbz(length, Exit); 6727 __ movw(rscratch1, length); 6728 __ b(Process4B); 6729 6730 __ BIND(Exit); 6731 __ sub(c_rarg0, dst, doff); 6732 6733 __ leave(); 6734 __ ret(lr); 6735 6736 return start; 6737 } 6738 6739 // Support for spin waits. 6740 address generate_spin_wait() { 6741 __ align(CodeEntryAlignment); 6742 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 6743 address start = __ pc(); 6744 6745 __ spin_wait(); 6746 __ ret(lr); 6747 6748 return start; 6749 } 6750 6751 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 6752 6753 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 6754 // 6755 // If LSE is in use, generate LSE versions of all the stubs. The 6756 // non-LSE versions are in atomic_aarch64.S. 6757 6758 // class AtomicStubMark records the entry point of a stub and the 6759 // stub pointer which will point to it. The stub pointer is set to 6760 // the entry point when ~AtomicStubMark() is called, which must be 6761 // after ICache::invalidate_range. This ensures safe publication of 6762 // the generated code. 
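  // Typical use (see generate_atomic_entry_points() below):
  //
  //   AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
  //   gen_ldadd_entry(Assembler::word, memory_order_conservative);
  //   ...
  //   ICache::invalidate_range(first_entry, __ pc() - first_entry);
  //   // marks go out of scope here, installing the entry points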
6763 class AtomicStubMark { 6764 address _entry_point; 6765 aarch64_atomic_stub_t *_stub; 6766 MacroAssembler *_masm; 6767 public: 6768 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 6769 _masm = masm; 6770 __ align(32); 6771 _entry_point = __ pc(); 6772 _stub = stub; 6773 } 6774 ~AtomicStubMark() { 6775 *_stub = (aarch64_atomic_stub_t)_entry_point; 6776 } 6777 }; 6778 6779 // NB: For memory_order_conservative we need a trailing membar after 6780 // LSE atomic operations but not a leading membar. 6781 // 6782 // We don't need a leading membar because a clause in the Arm ARM 6783 // says: 6784 // 6785 // Barrier-ordered-before 6786 // 6787 // Barrier instructions order prior Memory effects before subsequent 6788 // Memory effects generated by the same Observer. A read or a write 6789 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 6790 // Observer if and only if RW1 appears in program order before RW 2 6791 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 6792 // instruction with both Acquire and Release semantics. 6793 // 6794 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 6795 // and Release semantics, therefore we don't need a leading 6796 // barrier. However, there is no corresponding Barrier-ordered-after 6797 // relationship, therefore we need a trailing membar to prevent a 6798 // later store or load from being reordered with the store in an 6799 // atomic instruction. 6800 // 6801 // This was checked by using the herd7 consistency model simulator 6802 // (http://diy.inria.fr/) with this test case: 6803 // 6804 // AArch64 LseCas 6805 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 6806 // P0 | P1; 6807 // LDR W4, [X2] | MOV W3, #0; 6808 // DMB LD | MOV W4, #1; 6809 // LDR W3, [X1] | CASAL W3, W4, [X1]; 6810 // | DMB ISH; 6811 // | STR W4, [X2]; 6812 // exists 6813 // (0:X3=0 /\ 0:X4=1) 6814 // 6815 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 6816 // with the store to x in P1. Without the DMB in P1 this may happen. 6817 // 6818 // At the time of writing we don't know of any AArch64 hardware that 6819 // reorders stores in this way, but the Reference Manual permits it. 6820 6821 void gen_cas_entry(Assembler::operand_size size, 6822 atomic_memory_order order) { 6823 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 6824 exchange_val = c_rarg2; 6825 bool acquire, release; 6826 switch (order) { 6827 case memory_order_relaxed: 6828 acquire = false; 6829 release = false; 6830 break; 6831 case memory_order_release: 6832 acquire = false; 6833 release = true; 6834 break; 6835 default: 6836 acquire = true; 6837 release = true; 6838 break; 6839 } 6840 __ mov(prev, compare_val); 6841 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 6842 if (order == memory_order_conservative) { 6843 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6844 } 6845 if (size == Assembler::xword) { 6846 __ mov(r0, prev); 6847 } else { 6848 __ movw(r0, prev); 6849 } 6850 __ ret(lr); 6851 } 6852 6853 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 6854 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6855 // If not relaxed, then default to conservative. Relaxed is the only 6856 // case we use enough to be worth specializing. 
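    // Roughly (illustrative): memory_order_relaxed gets a plain ldadd with no
    // fence; everything else gets ldaddal plus a trailing membar
    // (StoreStore|StoreLoad), per the memory_order_conservative discussion
    // above.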
6857 if (order == memory_order_relaxed) { 6858 __ ldadd(size, incr, prev, addr); 6859 } else { 6860 __ ldaddal(size, incr, prev, addr); 6861 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6862 } 6863 if (size == Assembler::xword) { 6864 __ mov(r0, prev); 6865 } else { 6866 __ movw(r0, prev); 6867 } 6868 __ ret(lr); 6869 } 6870 6871 void gen_swpal_entry(Assembler::operand_size size) { 6872 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6873 __ swpal(size, incr, prev, addr); 6874 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6875 if (size == Assembler::xword) { 6876 __ mov(r0, prev); 6877 } else { 6878 __ movw(r0, prev); 6879 } 6880 __ ret(lr); 6881 } 6882 6883 void generate_atomic_entry_points() { 6884 if (! UseLSE) { 6885 return; 6886 } 6887 6888 __ align(CodeEntryAlignment); 6889 StubCodeMark mark(this, "StubRoutines", "atomic entry points"); 6890 address first_entry = __ pc(); 6891 6892 // ADD, memory_order_conservative 6893 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 6894 gen_ldadd_entry(Assembler::word, memory_order_conservative); 6895 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 6896 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 6897 6898 // ADD, memory_order_relaxed 6899 AtomicStubMark mark_fetch_add_4_relaxed 6900 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 6901 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 6902 AtomicStubMark mark_fetch_add_8_relaxed 6903 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 6904 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 6905 6906 // XCHG, memory_order_conservative 6907 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 6908 gen_swpal_entry(Assembler::word); 6909 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 6910 gen_swpal_entry(Assembler::xword); 6911 6912 // CAS, memory_order_conservative 6913 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 6914 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 6915 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 6916 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 6917 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 6918 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 6919 6920 // CAS, memory_order_relaxed 6921 AtomicStubMark mark_cmpxchg_1_relaxed 6922 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 6923 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 6924 AtomicStubMark mark_cmpxchg_4_relaxed 6925 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 6926 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 6927 AtomicStubMark mark_cmpxchg_8_relaxed 6928 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 6929 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 6930 6931 AtomicStubMark mark_cmpxchg_4_release 6932 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 6933 gen_cas_entry(MacroAssembler::word, memory_order_release); 6934 AtomicStubMark mark_cmpxchg_8_release 6935 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 6936 gen_cas_entry(MacroAssembler::xword, memory_order_release); 6937 6938 AtomicStubMark mark_cmpxchg_4_seq_cst 6939 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 6940 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 6941 AtomicStubMark mark_cmpxchg_8_seq_cst 6942 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 6943 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 6944 
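    // Flush the instruction cache over everything generated above; the
    // AtomicStubMark destructors then publish the entry points (see the class
    // comment).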
6945 ICache::invalidate_range(first_entry, __ pc() - first_entry); 6946 } 6947 #endif // LINUX 6948 6949 address generate_cont_thaw(Continuation::thaw_kind kind) { 6950 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 6951 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 6952 6953 address start = __ pc(); 6954 6955 if (return_barrier) { 6956 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 6957 __ mov(sp, rscratch1); 6958 } 6959 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 6960 6961 if (return_barrier) { 6962 // preserve possible return value from a method returning to the return barrier 6963 __ fmovd(rscratch1, v0); 6964 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 6965 } 6966 6967 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 6968 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 6969 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 6970 6971 if (return_barrier) { 6972 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 6973 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 6974 __ fmovd(v0, rscratch1); 6975 } 6976 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 6977 6978 6979 Label thaw_success; 6980 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 6981 __ cbnz(rscratch2, thaw_success); 6982 __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry())); 6983 __ br(rscratch1); 6984 __ bind(thaw_success); 6985 6986 // make room for the thawed frames 6987 __ sub(rscratch1, sp, rscratch2); 6988 __ andr(rscratch1, rscratch1, -16); // align 6989 __ mov(sp, rscratch1); 6990 6991 if (return_barrier) { 6992 // save original return value -- again 6993 __ fmovd(rscratch1, v0); 6994 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 6995 } 6996 6997 // If we want, we can templatize thaw by kind, and have three different entries 6998 __ movw(c_rarg1, (uint32_t)kind); 6999 7000 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 7001 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 7002 7003 if (return_barrier) { 7004 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7005 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7006 __ fmovd(v0, rscratch1); 7007 } else { 7008 __ mov(r0, zr); // return 0 (success) from doYield 7009 } 7010 7011 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 7012 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 7013 __ mov(rfp, sp); 7014 7015 if (return_barrier_exception) { 7016 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 7017 __ verify_oop(r0); 7018 __ mov(r19, r0); // save return value contaning the exception oop in callee-saved R19 7019 7020 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 7021 7022 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
7023 // __ reinitialize_ptrue(); 7024 7025 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 7026 7027 __ mov(r1, r0); // the exception handler 7028 __ mov(r0, r19); // restore return value contaning the exception oop 7029 __ verify_oop(r0); 7030 7031 __ leave(); 7032 __ mov(r3, lr); 7033 __ br(r1); // the exception handler 7034 } else { 7035 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 7036 __ leave(); 7037 __ ret(lr); 7038 } 7039 7040 return start; 7041 } 7042 7043 address generate_cont_thaw() { 7044 if (!Continuations::enabled()) return nullptr; 7045 7046 StubCodeMark mark(this, "StubRoutines", "Cont thaw"); 7047 address start = __ pc(); 7048 generate_cont_thaw(Continuation::thaw_top); 7049 return start; 7050 } 7051 7052 address generate_cont_returnBarrier() { 7053 if (!Continuations::enabled()) return nullptr; 7054 7055 // TODO: will probably need multiple return barriers depending on return type 7056 StubCodeMark mark(this, "StubRoutines", "cont return barrier"); 7057 address start = __ pc(); 7058 7059 generate_cont_thaw(Continuation::thaw_return_barrier); 7060 7061 return start; 7062 } 7063 7064 address generate_cont_returnBarrier_exception() { 7065 if (!Continuations::enabled()) return nullptr; 7066 7067 StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler"); 7068 address start = __ pc(); 7069 7070 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 7071 7072 return start; 7073 } 7074 7075 #if INCLUDE_JFR 7076 7077 static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) { 7078 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 7079 __ mov(c_rarg0, thread); 7080 } 7081 7082 // The handle is dereferenced through a load barrier. 7083 static void jfr_epilogue(MacroAssembler* _masm) { 7084 __ reset_last_Java_frame(true); 7085 __ resolve_global_jobject(r0, rscratch1, rscratch2); 7086 } 7087 7088 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint. 7089 // It returns a jobject handle to the event writer. 7090 // The handle is dereferenced and the return value is the event writer oop. 7091 static RuntimeStub* generate_jfr_write_checkpoint() { 7092 enum layout { 7093 rbp_off, 7094 rbpH_off, 7095 return_off, 7096 return_off2, 7097 framesize // inclusive of return address 7098 }; 7099 7100 int insts_size = 1024; 7101 int locs_size = 64; 7102 CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size); 7103 OopMapSet* oop_maps = new OopMapSet(); 7104 MacroAssembler* masm = new MacroAssembler(&code); 7105 MacroAssembler* _masm = masm; 7106 7107 address start = __ pc(); 7108 __ enter(); 7109 int frame_complete = __ pc() - start; 7110 address the_pc = __ pc(); 7111 jfr_prologue(the_pc, _masm, rthread); 7112 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1); 7113 jfr_epilogue(_masm); 7114 __ leave(); 7115 __ ret(lr); 7116 7117 OopMap* map = new OopMap(framesize, 1); // rfp 7118 oop_maps->add_gc_map(the_pc - start, map); 7119 7120 RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size) 7121 RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete, 7122 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7123 oop_maps, false); 7124 return stub; 7125 } 7126 7127 #endif // INCLUDE_JFR 7128 7129 // Continuation point for throwing of implicit exceptions that are 7130 // not handled in the current activation. 
Fabricates an exception 7131 // oop and initiates normal exception dispatching in this 7132 // frame. Since we need to preserve callee-saved values (currently 7133 // only for C2, but done for C1 as well) we need a callee-saved oop 7134 // map and therefore have to make these stubs into RuntimeStubs 7135 // rather than BufferBlobs. If the compiler needs all registers to 7136 // be preserved between the fault point and the exception handler 7137 // then it must assume responsibility for that in 7138 // AbstractCompiler::continuation_for_implicit_null_exception or 7139 // continuation_for_implicit_division_by_zero_exception. All other 7140 // implicit exceptions (e.g., NullPointerException or 7141 // AbstractMethodError on entry) are either at call sites or 7142 // otherwise assume that stack unwinding will be initiated, so 7143 // caller saved registers were assumed volatile in the compiler. 7144 7145 #undef __ 7146 #define __ masm-> 7147 7148 address generate_throw_exception(const char* name, 7149 address runtime_entry, 7150 Register arg1 = noreg, 7151 Register arg2 = noreg) { 7152 // Information about frame layout at time of blocking runtime call. 7153 // Note that we only have to preserve callee-saved registers since 7154 // the compilers are responsible for supplying a continuation point 7155 // if they expect all registers to be preserved. 7156 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 7157 enum layout { 7158 rfp_off = 0, 7159 rfp_off2, 7160 return_off, 7161 return_off2, 7162 framesize // inclusive of return address 7163 }; 7164 7165 int insts_size = 512; 7166 int locs_size = 64; 7167 7168 CodeBuffer code(name, insts_size, locs_size); 7169 OopMapSet* oop_maps = new OopMapSet(); 7170 MacroAssembler* masm = new MacroAssembler(&code); 7171 7172 address start = __ pc(); 7173 7174 // This is an inlined and slightly modified version of call_VM 7175 // which has the ability to fetch the return PC out of 7176 // thread-local storage and also sets up last_Java_sp slightly 7177 // differently than the real call_VM 7178 7179 __ enter(); // Save FP and LR before call 7180 7181 assert(is_even(framesize/2), "sp not 16-byte aligned"); 7182 7183 // lr and fp are already in place 7184 __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog 7185 7186 int frame_complete = __ pc() - start; 7187 7188 // Set up last_Java_sp and last_Java_fp 7189 address the_pc = __ pc(); 7190 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 7191 7192 // Call runtime 7193 if (arg1 != noreg) { 7194 assert(arg2 != c_rarg1, "clobbered"); 7195 __ mov(c_rarg1, arg1); 7196 } 7197 if (arg2 != noreg) { 7198 __ mov(c_rarg2, arg2); 7199 } 7200 __ mov(c_rarg0, rthread); 7201 BLOCK_COMMENT("call runtime_entry"); 7202 __ mov(rscratch1, runtime_entry); 7203 __ blr(rscratch1); 7204 7205 // Generate oop map 7206 OopMap* map = new OopMap(framesize, 0); 7207 7208 oop_maps->add_gc_map(the_pc - start, map); 7209 7210 __ reset_last_Java_frame(true); 7211 7212 // Reinitialize the ptrue predicate register, in case the external runtime 7213 // call clobbers ptrue reg, as we may return to SVE compiled code. 
7214 __ reinitialize_ptrue(); 7215 7216 __ leave(); 7217 7218 // check for pending exceptions 7219 #ifdef ASSERT 7220 Label L; 7221 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 7222 __ cbnz(rscratch1, L); 7223 __ should_not_reach_here(); 7224 __ bind(L); 7225 #endif // ASSERT 7226 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 7227 7228 // codeBlob framesize is in words (not VMRegImpl::slot_size) 7229 RuntimeStub* stub = 7230 RuntimeStub::new_runtime_stub(name, 7231 &code, 7232 frame_complete, 7233 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7234 oop_maps, false); 7235 return stub->entry_point(); 7236 } 7237 7238 class MontgomeryMultiplyGenerator : public MacroAssembler { 7239 7240 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7241 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7242 7243 RegSet _toSave; 7244 bool _squaring; 7245 7246 public: 7247 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7248 : MacroAssembler(as->code()), _squaring(squaring) { 7249 7250 // Register allocation 7251 7252 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7253 Pa_base = *regs; // Argument registers 7254 if (squaring) 7255 Pb_base = Pa_base; 7256 else 7257 Pb_base = *++regs; 7258 Pn_base = *++regs; 7259 Rlen= *++regs; 7260 inv = *++regs; 7261 Pm_base = *++regs; 7262 7263 // Working registers: 7264 Ra = *++regs; // The current digit of a, b, n, and m. 7265 Rb = *++regs; 7266 Rm = *++regs; 7267 Rn = *++regs; 7268 7269 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 7270 Pb = *++regs; 7271 Pm = *++regs; 7272 Pn = *++regs; 7273 7274 t0 = *++regs; // Three registers which form a 7275 t1 = *++regs; // triple-precision accumuator. 7276 t2 = *++regs; 7277 7278 Ri = *++regs; // Inner and outer loop indexes. 7279 Rj = *++regs; 7280 7281 Rhi_ab = *++regs; // Product registers: low and high parts 7282 Rlo_ab = *++regs; // of a*b and m*n. 7283 Rhi_mn = *++regs; 7284 Rlo_mn = *++regs; 7285 7286 // r19 and up are callee-saved. 
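    // _toSave therefore covers every callee-saved register allocated above,
    // plus Pm_base; save_regs()/restore_regs() push and pop this set around
    // the main loops in generate_multiply() and generate_square().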
7287 _toSave = RegSet::range(r19, *regs) + Pm_base; 7288 } 7289 7290 private: 7291 void save_regs() { 7292 push(_toSave, sp); 7293 } 7294 7295 void restore_regs() { 7296 pop(_toSave, sp); 7297 } 7298 7299 template <typename T> 7300 void unroll_2(Register count, T block) { 7301 Label loop, end, odd; 7302 tbnz(count, 0, odd); 7303 cbz(count, end); 7304 align(16); 7305 bind(loop); 7306 (this->*block)(); 7307 bind(odd); 7308 (this->*block)(); 7309 subs(count, count, 2); 7310 br(Assembler::GT, loop); 7311 bind(end); 7312 } 7313 7314 template <typename T> 7315 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 7316 Label loop, end, odd; 7317 tbnz(count, 0, odd); 7318 cbz(count, end); 7319 align(16); 7320 bind(loop); 7321 (this->*block)(d, s, tmp); 7322 bind(odd); 7323 (this->*block)(d, s, tmp); 7324 subs(count, count, 2); 7325 br(Assembler::GT, loop); 7326 bind(end); 7327 } 7328 7329 void pre1(RegisterOrConstant i) { 7330 block_comment("pre1"); 7331 // Pa = Pa_base; 7332 // Pb = Pb_base + i; 7333 // Pm = Pm_base; 7334 // Pn = Pn_base + i; 7335 // Ra = *Pa; 7336 // Rb = *Pb; 7337 // Rm = *Pm; 7338 // Rn = *Pn; 7339 ldr(Ra, Address(Pa_base)); 7340 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7341 ldr(Rm, Address(Pm_base)); 7342 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7343 lea(Pa, Address(Pa_base)); 7344 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7345 lea(Pm, Address(Pm_base)); 7346 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7347 7348 // Zero the m*n result. 7349 mov(Rhi_mn, zr); 7350 mov(Rlo_mn, zr); 7351 } 7352 7353 // The core multiply-accumulate step of a Montgomery 7354 // multiplication. The idea is to schedule operations as a 7355 // pipeline so that instructions with long latencies (loads and 7356 // multiplies) have time to complete before their results are 7357 // used. This most benefits in-order implementations of the 7358 // architecture but out-of-order ones also benefit. 7359 void step() { 7360 block_comment("step"); 7361 // MACC(Ra, Rb, t0, t1, t2); 7362 // Ra = *++Pa; 7363 // Rb = *--Pb; 7364 umulh(Rhi_ab, Ra, Rb); 7365 mul(Rlo_ab, Ra, Rb); 7366 ldr(Ra, pre(Pa, wordSize)); 7367 ldr(Rb, pre(Pb, -wordSize)); 7368 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 7369 // previous iteration. 7370 // MACC(Rm, Rn, t0, t1, t2); 7371 // Rm = *++Pm; 7372 // Rn = *--Pn; 7373 umulh(Rhi_mn, Rm, Rn); 7374 mul(Rlo_mn, Rm, Rn); 7375 ldr(Rm, pre(Pm, wordSize)); 7376 ldr(Rn, pre(Pn, -wordSize)); 7377 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7378 } 7379 7380 void post1() { 7381 block_comment("post1"); 7382 7383 // MACC(Ra, Rb, t0, t1, t2); 7384 // Ra = *++Pa; 7385 // Rb = *--Pb; 7386 umulh(Rhi_ab, Ra, Rb); 7387 mul(Rlo_ab, Ra, Rb); 7388 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7389 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7390 7391 // *Pm = Rm = t0 * inv; 7392 mul(Rm, t0, inv); 7393 str(Rm, Address(Pm)); 7394 7395 // MACC(Rm, Rn, t0, t1, t2); 7396 // t0 = t1; t1 = t2; t2 = 0; 7397 umulh(Rhi_mn, Rm, Rn); 7398 7399 #ifndef PRODUCT 7400 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7401 { 7402 mul(Rlo_mn, Rm, Rn); 7403 add(Rlo_mn, t0, Rlo_mn); 7404 Label ok; 7405 cbz(Rlo_mn, ok); { 7406 stop("broken Montgomery multiply"); 7407 } bind(ok); 7408 } 7409 #endif 7410 // We have very carefully set things up so that 7411 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7412 // the lower half of Rm * Rn because we know the result already: 7413 // it must be -t0. 
t0 + (-t0) must generate a carry iff 7414 // t0 != 0. So, rather than do a mul and an adds we just set 7415 // the carry flag iff t0 is nonzero. 7416 // 7417 // mul(Rlo_mn, Rm, Rn); 7418 // adds(zr, t0, Rlo_mn); 7419 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7420 adcs(t0, t1, Rhi_mn); 7421 adc(t1, t2, zr); 7422 mov(t2, zr); 7423 } 7424 7425 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 7426 block_comment("pre2"); 7427 // Pa = Pa_base + i-len; 7428 // Pb = Pb_base + len; 7429 // Pm = Pm_base + i-len; 7430 // Pn = Pn_base + len; 7431 7432 if (i.is_register()) { 7433 sub(Rj, i.as_register(), len); 7434 } else { 7435 mov(Rj, i.as_constant()); 7436 sub(Rj, Rj, len); 7437 } 7438 // Rj == i-len 7439 7440 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 7441 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 7442 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7443 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 7444 7445 // Ra = *++Pa; 7446 // Rb = *--Pb; 7447 // Rm = *++Pm; 7448 // Rn = *--Pn; 7449 ldr(Ra, pre(Pa, wordSize)); 7450 ldr(Rb, pre(Pb, -wordSize)); 7451 ldr(Rm, pre(Pm, wordSize)); 7452 ldr(Rn, pre(Pn, -wordSize)); 7453 7454 mov(Rhi_mn, zr); 7455 mov(Rlo_mn, zr); 7456 } 7457 7458 void post2(RegisterOrConstant i, RegisterOrConstant len) { 7459 block_comment("post2"); 7460 if (i.is_constant()) { 7461 mov(Rj, i.as_constant()-len.as_constant()); 7462 } else { 7463 sub(Rj, i.as_register(), len); 7464 } 7465 7466 adds(t0, t0, Rlo_mn); // The pending m*n, low part 7467 7468 // As soon as we know the least significant digit of our result, 7469 // store it. 7470 // Pm_base[i-len] = t0; 7471 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7472 7473 // t0 = t1; t1 = t2; t2 = 0; 7474 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 7475 adc(t1, t2, zr); 7476 mov(t2, zr); 7477 } 7478 7479 // A carry in t0 after Montgomery multiplication means that we 7480 // should subtract multiples of n from our result in m. We'll 7481 // keep doing that until there is no carry. 7482 void normalize(RegisterOrConstant len) { 7483 block_comment("normalize"); 7484 // while (t0) 7485 // t0 = sub(Pm_base, Pn_base, t0, len); 7486 Label loop, post, again; 7487 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 7488 cbz(t0, post); { 7489 bind(again); { 7490 mov(i, zr); 7491 mov(cnt, len); 7492 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7493 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7494 subs(zr, zr, zr); // set carry flag, i.e. no borrow 7495 align(16); 7496 bind(loop); { 7497 sbcs(Rm, Rm, Rn); 7498 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7499 add(i, i, 1); 7500 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7501 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7502 sub(cnt, cnt, 1); 7503 } cbnz(cnt, loop); 7504 sbc(t0, t0, zr); 7505 } cbnz(t0, again); 7506 } bind(post); 7507 } 7508 7509 // Move memory at s to d, reversing words. 
7510 // Increments d to end of copied memory 7511 // Destroys tmp1, tmp2 7512 // Preserves len 7513 // Leaves s pointing to the address which was in d at start 7514 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 7515 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 7516 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 7517 7518 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 7519 mov(tmp1, len); 7520 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 7521 sub(s, d, len, ext::uxtw, LogBytesPerWord); 7522 } 7523 // where 7524 void reverse1(Register d, Register s, Register tmp) { 7525 ldr(tmp, pre(s, -wordSize)); 7526 ror(tmp, tmp, 32); 7527 str(tmp, post(d, wordSize)); 7528 } 7529 7530 void step_squaring() { 7531 // An extra ACC 7532 step(); 7533 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7534 } 7535 7536 void last_squaring(RegisterOrConstant i) { 7537 Label dont; 7538 // if ((i & 1) == 0) { 7539 tbnz(i.as_register(), 0, dont); { 7540 // MACC(Ra, Rb, t0, t1, t2); 7541 // Ra = *++Pa; 7542 // Rb = *--Pb; 7543 umulh(Rhi_ab, Ra, Rb); 7544 mul(Rlo_ab, Ra, Rb); 7545 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7546 } bind(dont); 7547 } 7548 7549 void extra_step_squaring() { 7550 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7551 7552 // MACC(Rm, Rn, t0, t1, t2); 7553 // Rm = *++Pm; 7554 // Rn = *--Pn; 7555 umulh(Rhi_mn, Rm, Rn); 7556 mul(Rlo_mn, Rm, Rn); 7557 ldr(Rm, pre(Pm, wordSize)); 7558 ldr(Rn, pre(Pn, -wordSize)); 7559 } 7560 7561 void post1_squaring() { 7562 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7563 7564 // *Pm = Rm = t0 * inv; 7565 mul(Rm, t0, inv); 7566 str(Rm, Address(Pm)); 7567 7568 // MACC(Rm, Rn, t0, t1, t2); 7569 // t0 = t1; t1 = t2; t2 = 0; 7570 umulh(Rhi_mn, Rm, Rn); 7571 7572 #ifndef PRODUCT 7573 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7574 { 7575 mul(Rlo_mn, Rm, Rn); 7576 add(Rlo_mn, t0, Rlo_mn); 7577 Label ok; 7578 cbz(Rlo_mn, ok); { 7579 stop("broken Montgomery multiply"); 7580 } bind(ok); 7581 } 7582 #endif 7583 // We have very carefully set things up so that 7584 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7585 // the lower half of Rm * Rn because we know the result already: 7586 // it must be -t0. t0 + (-t0) must generate a carry iff 7587 // t0 != 0. So, rather than do a mul and an adds we just set 7588 // the carry flag iff t0 is nonzero. 7589 // 7590 // mul(Rlo_mn, Rm, Rn); 7591 // adds(zr, t0, Rlo_mn); 7592 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7593 adcs(t0, t1, Rhi_mn); 7594 adc(t1, t2, zr); 7595 mov(t2, zr); 7596 } 7597 7598 void acc(Register Rhi, Register Rlo, 7599 Register t0, Register t1, Register t2) { 7600 adds(t0, t0, Rlo); 7601 adcs(t1, t1, Rhi); 7602 adc(t2, t2, zr); 7603 } 7604 7605 public: 7606 /** 7607 * Fast Montgomery multiplication. The derivation of the 7608 * algorithm is in A Cryptographic Library for the Motorola 7609 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
7610 * 7611 * Arguments: 7612 * 7613 * Inputs for multiplication: 7614 * c_rarg0 - int array elements a 7615 * c_rarg1 - int array elements b 7616 * c_rarg2 - int array elements n (the modulus) 7617 * c_rarg3 - int length 7618 * c_rarg4 - int inv 7619 * c_rarg5 - int array elements m (the result) 7620 * 7621 * Inputs for squaring: 7622 * c_rarg0 - int array elements a 7623 * c_rarg1 - int array elements n (the modulus) 7624 * c_rarg2 - int length 7625 * c_rarg3 - int inv 7626 * c_rarg4 - int array elements m (the result) 7627 * 7628 */ 7629 address generate_multiply() { 7630 Label argh, nothing; 7631 bind(argh); 7632 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7633 7634 align(CodeEntryAlignment); 7635 address entry = pc(); 7636 7637 cbzw(Rlen, nothing); 7638 7639 enter(); 7640 7641 // Make room. 7642 cmpw(Rlen, 512); 7643 br(Assembler::HI, argh); 7644 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7645 andr(sp, Ra, -2 * wordSize); 7646 7647 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7648 7649 { 7650 // Copy input args, reversing as we go. We use Ra as a 7651 // temporary variable. 7652 reverse(Ra, Pa_base, Rlen, t0, t1); 7653 if (!_squaring) 7654 reverse(Ra, Pb_base, Rlen, t0, t1); 7655 reverse(Ra, Pn_base, Rlen, t0, t1); 7656 } 7657 7658 // Push all call-saved registers and also Pm_base which we'll need 7659 // at the end. 7660 save_regs(); 7661 7662 #ifndef PRODUCT 7663 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 7664 { 7665 ldr(Rn, Address(Pn_base, 0)); 7666 mul(Rlo_mn, Rn, inv); 7667 subs(zr, Rlo_mn, -1); 7668 Label ok; 7669 br(EQ, ok); { 7670 stop("broken inverse in Montgomery multiply"); 7671 } bind(ok); 7672 } 7673 #endif 7674 7675 mov(Pm_base, Ra); 7676 7677 mov(t0, zr); 7678 mov(t1, zr); 7679 mov(t2, zr); 7680 7681 block_comment("for (int i = 0; i < len; i++) {"); 7682 mov(Ri, zr); { 7683 Label loop, end; 7684 cmpw(Ri, Rlen); 7685 br(Assembler::GE, end); 7686 7687 bind(loop); 7688 pre1(Ri); 7689 7690 block_comment(" for (j = i; j; j--) {"); { 7691 movw(Rj, Ri); 7692 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7693 } block_comment(" } // j"); 7694 7695 post1(); 7696 addw(Ri, Ri, 1); 7697 cmpw(Ri, Rlen); 7698 br(Assembler::LT, loop); 7699 bind(end); 7700 block_comment("} // i"); 7701 } 7702 7703 block_comment("for (int i = len; i < 2*len; i++) {"); 7704 mov(Ri, Rlen); { 7705 Label loop, end; 7706 cmpw(Ri, Rlen, Assembler::LSL, 1); 7707 br(Assembler::GE, end); 7708 7709 bind(loop); 7710 pre2(Ri, Rlen); 7711 7712 block_comment(" for (j = len*2-i-1; j; j--) {"); { 7713 lslw(Rj, Rlen, 1); 7714 subw(Rj, Rj, Ri); 7715 subw(Rj, Rj, 1); 7716 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7717 } block_comment(" } // j"); 7718 7719 post2(Ri, Rlen); 7720 addw(Ri, Ri, 1); 7721 cmpw(Ri, Rlen, Assembler::LSL, 1); 7722 br(Assembler::LT, loop); 7723 bind(end); 7724 } 7725 block_comment("} // i"); 7726 7727 normalize(Rlen); 7728 7729 mov(Ra, Pm_base); // Save Pm_base in Ra 7730 restore_regs(); // Restore caller's Pm_base 7731 7732 // Copy our result into caller's Pm_base 7733 reverse(Pm_base, Ra, Rlen, t0, t1); 7734 7735 leave(); 7736 bind(nothing); 7737 ret(lr); 7738 7739 return entry; 7740 } 7741 // In C, approximately: 7742 7743 // void 7744 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 7745 // julong Pn_base[], julong Pm_base[], 7746 // julong inv, int len) { 7747 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7748 // julong *Pa, *Pb, *Pn, *Pm; 7749 // julong Ra, Rb, Rn, Rm; 7750 7751 // 
int i; 7752 7753 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7754 7755 // for (i = 0; i < len; i++) { 7756 // int j; 7757 7758 // Pa = Pa_base; 7759 // Pb = Pb_base + i; 7760 // Pm = Pm_base; 7761 // Pn = Pn_base + i; 7762 7763 // Ra = *Pa; 7764 // Rb = *Pb; 7765 // Rm = *Pm; 7766 // Rn = *Pn; 7767 7768 // int iters = i; 7769 // for (j = 0; iters--; j++) { 7770 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7771 // MACC(Ra, Rb, t0, t1, t2); 7772 // Ra = *++Pa; 7773 // Rb = *--Pb; 7774 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7775 // MACC(Rm, Rn, t0, t1, t2); 7776 // Rm = *++Pm; 7777 // Rn = *--Pn; 7778 // } 7779 7780 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 7781 // MACC(Ra, Rb, t0, t1, t2); 7782 // *Pm = Rm = t0 * inv; 7783 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7784 // MACC(Rm, Rn, t0, t1, t2); 7785 7786 // assert(t0 == 0, "broken Montgomery multiply"); 7787 7788 // t0 = t1; t1 = t2; t2 = 0; 7789 // } 7790 7791 // for (i = len; i < 2*len; i++) { 7792 // int j; 7793 7794 // Pa = Pa_base + i-len; 7795 // Pb = Pb_base + len; 7796 // Pm = Pm_base + i-len; 7797 // Pn = Pn_base + len; 7798 7799 // Ra = *++Pa; 7800 // Rb = *--Pb; 7801 // Rm = *++Pm; 7802 // Rn = *--Pn; 7803 7804 // int iters = len*2-i-1; 7805 // for (j = i-len+1; iters--; j++) { 7806 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7807 // MACC(Ra, Rb, t0, t1, t2); 7808 // Ra = *++Pa; 7809 // Rb = *--Pb; 7810 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7811 // MACC(Rm, Rn, t0, t1, t2); 7812 // Rm = *++Pm; 7813 // Rn = *--Pn; 7814 // } 7815 7816 // Pm_base[i-len] = t0; 7817 // t0 = t1; t1 = t2; t2 = 0; 7818 // } 7819 7820 // while (t0) 7821 // t0 = sub(Pm_base, Pn_base, t0, len); 7822 // } 7823 7824 /** 7825 * Fast Montgomery squaring. This uses asymptotically 25% fewer 7826 * multiplies than Montgomery multiplication so it should be up to 7827 * 25% faster. However, its loop control is more complex and it 7828 * may actually run slower on some machines. 7829 * 7830 * Arguments: 7831 * 7832 * Inputs: 7833 * c_rarg0 - int array elements a 7834 * c_rarg1 - int array elements n (the modulus) 7835 * c_rarg2 - int length 7836 * c_rarg3 - int inv 7837 * c_rarg4 - int array elements m (the result) 7838 * 7839 */ 7840 address generate_square() { 7841 Label argh; 7842 bind(argh); 7843 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7844 7845 align(CodeEntryAlignment); 7846 address entry = pc(); 7847 7848 enter(); 7849 7850 // Make room. 7851 cmpw(Rlen, 512); 7852 br(Assembler::HI, argh); 7853 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7854 andr(sp, Ra, -2 * wordSize); 7855 7856 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7857 7858 { 7859 // Copy input args, reversing as we go. We use Ra as a 7860 // temporary variable. 7861 reverse(Ra, Pa_base, Rlen, t0, t1); 7862 reverse(Ra, Pn_base, Rlen, t0, t1); 7863 } 7864 7865 // Push all call-saved registers and also Pm_base which we'll need 7866 // at the end. 
7867 save_regs(); 7868 7869 mov(Pm_base, Ra); 7870 7871 mov(t0, zr); 7872 mov(t1, zr); 7873 mov(t2, zr); 7874 7875 block_comment("for (int i = 0; i < len; i++) {"); 7876 mov(Ri, zr); { 7877 Label loop, end; 7878 bind(loop); 7879 cmp(Ri, Rlen); 7880 br(Assembler::GE, end); 7881 7882 pre1(Ri); 7883 7884 block_comment("for (j = (i+1)/2; j; j--) {"); { 7885 add(Rj, Ri, 1); 7886 lsr(Rj, Rj, 1); 7887 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 7888 } block_comment(" } // j"); 7889 7890 last_squaring(Ri); 7891 7892 block_comment(" for (j = i/2; j; j--) {"); { 7893 lsr(Rj, Ri, 1); 7894 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 7895 } block_comment(" } // j"); 7896 7897 post1_squaring(); 7898 add(Ri, Ri, 1); 7899 cmp(Ri, Rlen); 7900 br(Assembler::LT, loop); 7901 7902 bind(end); 7903 block_comment("} // i"); 7904 } 7905 7906 block_comment("for (int i = len; i < 2*len; i++) {"); 7907 mov(Ri, Rlen); { 7908 Label loop, end; 7909 bind(loop); 7910 cmp(Ri, Rlen, Assembler::LSL, 1); 7911 br(Assembler::GE, end); 7912 7913 pre2(Ri, Rlen); 7914 7915 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 7916 lsl(Rj, Rlen, 1); 7917 sub(Rj, Rj, Ri); 7918 sub(Rj, Rj, 1); 7919 lsr(Rj, Rj, 1); 7920 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 7921 } block_comment(" } // j"); 7922 7923 last_squaring(Ri); 7924 7925 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 7926 lsl(Rj, Rlen, 1); 7927 sub(Rj, Rj, Ri); 7928 lsr(Rj, Rj, 1); 7929 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 7930 } block_comment(" } // j"); 7931 7932 post2(Ri, Rlen); 7933 add(Ri, Ri, 1); 7934 cmp(Ri, Rlen, Assembler::LSL, 1); 7935 7936 br(Assembler::LT, loop); 7937 bind(end); 7938 block_comment("} // i"); 7939 } 7940 7941 normalize(Rlen); 7942 7943 mov(Ra, Pm_base); // Save Pm_base in Ra 7944 restore_regs(); // Restore caller's Pm_base 7945 7946 // Copy our result into caller's Pm_base 7947 reverse(Pm_base, Ra, Rlen, t0, t1); 7948 7949 leave(); 7950 ret(lr); 7951 7952 return entry; 7953 } 7954 // In C, approximately: 7955 7956 // void 7957 // montgomery_square(julong Pa_base[], julong Pn_base[], 7958 // julong Pm_base[], julong inv, int len) { 7959 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7960 // julong *Pa, *Pb, *Pn, *Pm; 7961 // julong Ra, Rb, Rn, Rm; 7962 7963 // int i; 7964 7965 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7966 7967 // for (i = 0; i < len; i++) { 7968 // int j; 7969 7970 // Pa = Pa_base; 7971 // Pb = Pa_base + i; 7972 // Pm = Pm_base; 7973 // Pn = Pn_base + i; 7974 7975 // Ra = *Pa; 7976 // Rb = *Pb; 7977 // Rm = *Pm; 7978 // Rn = *Pn; 7979 7980 // int iters = (i+1)/2; 7981 // for (j = 0; iters--; j++) { 7982 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 7983 // MACC2(Ra, Rb, t0, t1, t2); 7984 // Ra = *++Pa; 7985 // Rb = *--Pb; 7986 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7987 // MACC(Rm, Rn, t0, t1, t2); 7988 // Rm = *++Pm; 7989 // Rn = *--Pn; 7990 // } 7991 // if ((i & 1) == 0) { 7992 // assert(Ra == Pa_base[j], "must be"); 7993 // MACC(Ra, Ra, t0, t1, t2); 7994 // } 7995 // iters = i/2; 7996 // assert(iters == i-j, "must be"); 7997 // for (; iters--; j++) { 7998 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7999 // MACC(Rm, Rn, t0, t1, t2); 8000 // Rm = *++Pm; 8001 // Rn = *--Pn; 8002 // } 8003 8004 // *Pm = Rm = t0 * inv; 8005 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8006 // MACC(Rm, Rn, t0, t1, t2); 8007 8008 // 
assert(t0 == 0, "broken Montgomery multiply"); 8009 8010 // t0 = t1; t1 = t2; t2 = 0; 8011 // } 8012 8013 // for (i = len; i < 2*len; i++) { 8014 // int start = i-len+1; 8015 // int end = start + (len - start)/2; 8016 // int j; 8017 8018 // Pa = Pa_base + i-len; 8019 // Pb = Pa_base + len; 8020 // Pm = Pm_base + i-len; 8021 // Pn = Pn_base + len; 8022 8023 // Ra = *++Pa; 8024 // Rb = *--Pb; 8025 // Rm = *++Pm; 8026 // Rn = *--Pn; 8027 8028 // int iters = (2*len-i-1)/2; 8029 // assert(iters == end-start, "must be"); 8030 // for (j = start; iters--; j++) { 8031 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8032 // MACC2(Ra, Rb, t0, t1, t2); 8033 // Ra = *++Pa; 8034 // Rb = *--Pb; 8035 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8036 // MACC(Rm, Rn, t0, t1, t2); 8037 // Rm = *++Pm; 8038 // Rn = *--Pn; 8039 // } 8040 // if ((i & 1) == 0) { 8041 // assert(Ra == Pa_base[j], "must be"); 8042 // MACC(Ra, Ra, t0, t1, t2); 8043 // } 8044 // iters = (2*len-i)/2; 8045 // assert(iters == len-j, "must be"); 8046 // for (; iters--; j++) { 8047 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8048 // MACC(Rm, Rn, t0, t1, t2); 8049 // Rm = *++Pm; 8050 // Rn = *--Pn; 8051 // } 8052 // Pm_base[i-len] = t0; 8053 // t0 = t1; t1 = t2; t2 = 0; 8054 // } 8055 8056 // while (t0) 8057 // t0 = sub(Pm_base, Pn_base, t0, len); 8058 // } 8059 }; 8060 8061 8062 // Call here from the interpreter or compiled code to either load 8063 // multiple returned values from the inline type instance being 8064 // returned to registers or to store returned values to a newly 8065 // allocated inline type instance. 8066 address generate_return_value_stub(address destination, const char* name, bool has_res) { 8067 // We need to save all registers the calling convention may use so 8068 // the runtime calls read or update those registers. This needs to 8069 // be in sync with SharedRuntime::java_return_convention(). 8070 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0 8071 enum layout { 8072 j_rarg7_off = 0, j_rarg7_2, // j_rarg7 is r0 8073 j_rarg6_off, j_rarg6_2, 8074 j_rarg5_off, j_rarg5_2, 8075 j_rarg4_off, j_rarg4_2, 8076 j_rarg3_off, j_rarg3_2, 8077 j_rarg2_off, j_rarg2_2, 8078 j_rarg1_off, j_rarg1_2, 8079 j_rarg0_off, j_rarg0_2, 8080 8081 j_farg7_off, j_farg7_2, 8082 j_farg6_off, j_farg6_2, 8083 j_farg5_off, j_farg5_2, 8084 j_farg4_off, j_farg4_2, 8085 j_farg3_off, j_farg3_2, 8086 j_farg2_off, j_farg2_2, 8087 j_farg1_off, j_farg1_2, 8088 j_farg0_off, j_farg0_2, 8089 8090 rfp_off, rfp_off2, 8091 return_off, return_off2, 8092 8093 framesize // inclusive of return address 8094 }; 8095 8096 CodeBuffer code(name, 512, 64); 8097 MacroAssembler* masm = new MacroAssembler(&code); 8098 8099 int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16); 8100 assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned"); 8101 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt; 8102 int frame_size_in_words = frame_size_in_bytes / wordSize; 8103 8104 OopMapSet* oop_maps = new OopMapSet(); 8105 OopMap* map = new OopMap(frame_size_in_slots, 0); 8106 8107 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg()); 8108 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg()); 8109 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg()); 8110 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg()); 8111 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg()); 8112 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg()); 8113 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg()); 8114 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg()); 8115 8116 map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg()); 8117 map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg()); 8118 map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg()); 8119 map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg()); 8120 map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg()); 8121 map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg()); 8122 map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg()); 8123 map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg()); 8124 8125 address start = __ pc(); 8126 8127 __ enter(); // Save FP and LR before call 8128 8129 __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize))); 8130 __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize))); 8131 __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize))); 8132 __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize))); 8133 8134 __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize))); 8135 __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize))); 8136 __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize))); 8137 __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize))); 8138 8139 int frame_complete = __ offset(); 8140 8141 // Set up last_Java_sp and last_Java_fp 8142 address the_pc = __ pc(); 8143 __ set_last_Java_frame(sp, noreg, the_pc, rscratch1); 8144 8145 // Call runtime 8146 __ mov(c_rarg1, r0); 8147 __ mov(c_rarg0, rthread); 8148 8149 __ mov(rscratch1, destination); 8150 __ blr(rscratch1); 8151 8152 oop_maps->add_gc_map(the_pc - start, map); 8153 8154 __ 
reset_last_Java_frame(false); 8155 8156 __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize))); 8157 __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize))); 8158 __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize))); 8159 __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize))); 8160 8161 __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize))); 8162 __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize))); 8163 __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize))); 8164 __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize))); 8165 8166 __ leave(); 8167 8168 // check for pending exceptions 8169 Label pending; 8170 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 8171 __ cbnz(rscratch1, pending); 8172 8173 if (has_res) { 8174 __ get_vm_result(r0, rthread); 8175 } 8176 8177 __ ret(lr); 8178 8179 __ bind(pending); 8180 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 8181 8182 // ------------- 8183 // make sure all code is generated 8184 masm->flush(); 8185 8186 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false); 8187 return stub->entry_point(); 8188 } 8189 8190 // Initialization 8191 void generate_initial_stubs() { 8192 // Generate initial stubs and initializes the entry points 8193 8194 // entry points that exist in all platforms Note: This is code 8195 // that could be shared among different platforms - however the 8196 // benefit seems to be smaller than the disadvantage of having a 8197 // much more complicated generator structure. See also comment in 8198 // stubRoutines.hpp. 8199 8200 StubRoutines::_forward_exception_entry = generate_forward_exception(); 8201 8202 StubRoutines::_call_stub_entry = 8203 generate_call_stub(StubRoutines::_call_stub_return_address); 8204 8205 // is referenced by megamorphic call 8206 StubRoutines::_catch_exception_entry = generate_catch_exception(); 8207 8208 // Build this early so it's available for the interpreter. 8209 StubRoutines::_throw_StackOverflowError_entry = 8210 generate_throw_exception("StackOverflowError throw_exception", 8211 CAST_FROM_FN_PTR(address, 8212 SharedRuntime::throw_StackOverflowError)); 8213 StubRoutines::_throw_delayed_StackOverflowError_entry = 8214 generate_throw_exception("delayed StackOverflowError throw_exception", 8215 CAST_FROM_FN_PTR(address, 8216 SharedRuntime::throw_delayed_StackOverflowError)); 8217 8218 // Initialize table for copy memory (arraycopy) check. 
8219 if (UnsafeCopyMemory::_table == nullptr) { 8220 UnsafeCopyMemory::create_table(8); 8221 } 8222 8223 if (UseCRC32Intrinsics) { 8224 // set table address before stub generation which use it 8225 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 8226 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 8227 } 8228 8229 if (UseCRC32CIntrinsics) { 8230 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 8231 } 8232 8233 // Disabled until JDK-8210858 is fixed 8234 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 8235 // StubRoutines::_dlog = generate_dlog(); 8236 // } 8237 8238 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 8239 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 8240 } 8241 8242 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 8243 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 8244 } 8245 8246 if (InlineTypeReturnedAsFields) { 8247 StubRoutines::_load_inline_type_fields_in_regs = 8248 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false); 8249 StubRoutines::_store_inline_type_fields_to_buf = 8250 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true); 8251 } 8252 } 8253 8254 void generate_continuation_stubs() { 8255 // Continuation stubs: 8256 StubRoutines::_cont_thaw = generate_cont_thaw(); 8257 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 8258 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 8259 8260 JFR_ONLY(StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();) 8261 JFR_ONLY(StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();) 8262 } 8263 8264 void generate_final_stubs() { 8265 // support for verify_oop (must happen after universe_init) 8266 if (VerifyOops) { 8267 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 8268 } 8269 StubRoutines::_throw_AbstractMethodError_entry = 8270 generate_throw_exception("AbstractMethodError throw_exception", 8271 CAST_FROM_FN_PTR(address, 8272 SharedRuntime:: 8273 throw_AbstractMethodError)); 8274 8275 StubRoutines::_throw_IncompatibleClassChangeError_entry = 8276 generate_throw_exception("IncompatibleClassChangeError throw_exception", 8277 CAST_FROM_FN_PTR(address, 8278 SharedRuntime:: 8279 throw_IncompatibleClassChangeError)); 8280 8281 StubRoutines::_throw_NullPointerException_at_call_entry = 8282 generate_throw_exception("NullPointerException at call throw_exception", 8283 CAST_FROM_FN_PTR(address, 8284 SharedRuntime:: 8285 throw_NullPointerException_at_call)); 8286 8287 // arraycopy stubs used by compilers 8288 generate_arraycopy_stubs(); 8289 8290 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8291 if (bs_nm != nullptr) { 8292 StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier(); 8293 } 8294 8295 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 8296 8297 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 8298 8299 generate_atomic_entry_points(); 8300 8301 #endif // LINUX 8302 8303 StubRoutines::aarch64::set_completed(); // Inidicate that arraycopy and zero_blocks stubs are generated 8304 } 8305 8306 void generate_compiler_stubs() { 8307 #if COMPILER2_OR_JVMCI 8308 8309 if (UseSVE == 0) { 8310 
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback      = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress   = generate_md5_implCompress(false, "md5_implCompress");
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true,  "md5_implCompressMB");
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true,  "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress   = generate_sha3_implCompress(false, "sha3_implCompress");
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true,  "sha3_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }
#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
    switch (kind) {
    case Initial_stubs:
      generate_initial_stubs();
      break;
    case Continuation_stubs:
      generate_continuation_stubs();
      break;
    case Compiler_stubs:
      generate_compiler_stubs();
      break;
    case Final_stubs:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected stubs kind: %d", kind);
      break;
    };
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
  StubGenerator g(code, kind);
}

#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                          \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                                   \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl        \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX