/*
 * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/atomic.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing the method and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                                     SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if mask is good.
      // verifies that ZAddressBadMask & r0 == 0
      __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
      __ andr(c_rarg2, r0, c_rarg3);
      __ cbnz(c_rarg2, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', i.e. not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  // Generate indices for iota vector.
  address generate_iota_indices(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
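      // (low_limit below is expressed in bytes while cnt counts 8-byte
      // HeapWords, hence the >> 3 in the comparison.)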
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
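    // Each tbz below tests one bit of the element count and, when that
    // bit is set, copies the corresponding 8-, 4-, 2- or 1-byte chunk.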

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r16;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80) / granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The biggest performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        __ ldp(t0, t1, Address(send, -16));

        __ stpq(v0, v1, Address(d, 0));
        __ stpq(v2, v3, Address(d, 32));
        __ stp(t0, t1, Address(dend, -16));
        __ b(finish);

        __ bind(copy96);
      }
      __ ldpq(v4, v5, Address(send, -32));

      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(d, 32));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift) __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    RegSet clobbered
      = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
      __ mov(*it, rscratch1);
    }
#endif

  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (int size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, 1);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
sizeof (jint) : sizeof (jlong); 1710 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1711 } 1712 1713 // Arguments: 1714 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1715 // ignored 1716 // name - stub name string 1717 // 1718 // Inputs: 1719 // c_rarg0 - source array address 1720 // c_rarg1 - destination array address 1721 // c_rarg2 - element count, treated as size_t, can be zero 1722 // 1723 address generate_conjoint_oop_copy(bool aligned, 1724 address nooverlap_target, address *entry, 1725 const char *name, bool dest_uninitialized) { 1726 const bool is_oop = true; 1727 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1728 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1729 name, dest_uninitialized); 1730 } 1731 1732 1733 // Helper for generating a dynamic type check. 1734 // Smashes rscratch1, rscratch2. 1735 void generate_type_check(Register sub_klass, 1736 Register super_check_offset, 1737 Register super_klass, 1738 Label& L_success) { 1739 assert_different_registers(sub_klass, super_check_offset, super_klass); 1740 1741 BLOCK_COMMENT("type_check:"); 1742 1743 Label L_miss; 1744 1745 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1746 super_check_offset); 1747 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1748 1749 // Fall through on failure! 1750 __ BIND(L_miss); 1751 } 1752 1753 // 1754 // Generate checkcasting array copy stub 1755 // 1756 // Input: 1757 // c_rarg0 - source array address 1758 // c_rarg1 - destination array address 1759 // c_rarg2 - element count, treated as ssize_t, can be zero 1760 // c_rarg3 - size_t ckoff (super_check_offset) 1761 // c_rarg4 - oop ckval (super_klass) 1762 // 1763 // Output: 1764 // r0 == 0 - success 1765 // r0 == -1^K - failure, where K is partial transfer count 1766 // 1767 address generate_checkcast_copy(const char *name, address *entry, 1768 bool dest_uninitialized = false) { 1769 1770 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1771 1772 // Input registers (after setup_arg_regs) 1773 const Register from = c_rarg0; // source array address 1774 const Register to = c_rarg1; // destination array address 1775 const Register count = c_rarg2; // elementscount 1776 const Register ckoff = c_rarg3; // super_check_offset 1777 const Register ckval = c_rarg4; // super_klass 1778 1779 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1780 RegSet wb_post_saved_regs = RegSet::of(count); 1781 1782 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1783 const Register copied_oop = r22; // actual oop copied 1784 const Register count_save = r21; // orig elementscount 1785 const Register start_to = r20; // destination array start address 1786 const Register r19_klass = r19; // oop._klass 1787 1788 //--------------------------------------------------------------- 1789 // Assembler stub will be used for this call to arraycopy 1790 // if the two arrays are subtypes of Object[] but the 1791 // destination array type is not equal to or a supertype 1792 // of the source type. Each element must be separately 1793 // checked. 
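    // Added illustration (not part of the original comment): a typical case
    // that reaches this stub is a System.arraycopy from an Object[] source
    // (whose elements may be anything) into a String[] destination. The
    // klass comparison in the generic stub cannot prove that every source
    // element is a String, so each element is type-checked here as it is
    // stored, and the copy stops at the first offending element.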
1794 1795 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1796 copied_oop, r19_klass, count_save); 1797 1798 __ align(CodeEntryAlignment); 1799 StubCodeMark mark(this, "StubRoutines", name); 1800 address start = __ pc(); 1801 1802 __ enter(); // required for proper stackwalking of RuntimeStub frame 1803 1804 #ifdef ASSERT 1805 // caller guarantees that the arrays really are different 1806 // otherwise, we would have to make conjoint checks 1807 { Label L; 1808 __ b(L); // conjoint check not yet implemented 1809 __ stop("checkcast_copy within a single array"); 1810 __ bind(L); 1811 } 1812 #endif //ASSERT 1813 1814 // Caller of this entry point must set up the argument registers. 1815 if (entry != NULL) { 1816 *entry = __ pc(); 1817 BLOCK_COMMENT("Entry:"); 1818 } 1819 1820 // Empty array: Nothing to do. 1821 __ cbz(count, L_done); 1822 __ push(RegSet::of(r19, r20, r21, r22), sp); 1823 1824 #ifdef ASSERT 1825 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1826 // The ckoff and ckval must be mutually consistent, 1827 // even though caller generates both. 1828 { Label L; 1829 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1830 __ ldrw(start_to, Address(ckval, sco_offset)); 1831 __ cmpw(ckoff, start_to); 1832 __ br(Assembler::EQ, L); 1833 __ stop("super_check_offset inconsistent"); 1834 __ bind(L); 1835 } 1836 #endif //ASSERT 1837 1838 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1839 bool is_oop = true; 1840 if (dest_uninitialized) { 1841 decorators |= IS_DEST_UNINITIALIZED; 1842 } 1843 1844 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1845 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1846 1847 // save the original count 1848 __ mov(count_save, count); 1849 1850 // Copy from low to high addresses 1851 __ mov(start_to, to); // Save destination array start address 1852 __ b(L_load_element); 1853 1854 // ======== begin loop ======== 1855 // (Loop is rotated; its entry is L_load_element.) 1856 // Loop control: 1857 // for (; count != 0; count--) { 1858 // copied_oop = load_heap_oop(from++); 1859 // ... generate_type_check ...; 1860 // store_heap_oop(to++, copied_oop); 1861 // } 1862 __ align(OptoLoopAlignment); 1863 1864 __ BIND(L_store_element); 1865 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW); // store the oop 1866 __ sub(count, count, 1); 1867 __ cbz(count, L_do_card_marks); 1868 1869 // ======== loop entry is here ======== 1870 __ BIND(L_load_element); 1871 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1872 __ cbz(copied_oop, L_store_element); 1873 1874 __ load_klass(r19_klass, copied_oop);// query the object klass 1875 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1876 // ======== end loop ======== 1877 1878 // It was a real error; we must depend on the caller to finish the job. 1879 // Register count = remaining oops, count_orig = total oops. 1880 // Emit GC store barriers for the oops we have copied and report 1881 // their number to the caller. 
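    // Added note (not in the original source): the failure report computed
    // below is r0 = -1 ^ K = ~K, where K is the number of oops successfully
    // copied. The subs recovers K = count_save - count (original count minus
    // remaining), and eon(count, count, zr) forms the bitwise NOT of K (EON
    // against the zero register is a NOT). For example, if 2 of 5 elements
    // were stored before a type-check failure, r0 == ~2 == -3; the caller can
    // recover K as ~r0 (typically to raise ArrayStoreException after K
    // elements have been transferred).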
1882 1883 __ subs(count, count_save, count); // K = partially copied oop count 1884 __ eon(count, count, zr); // report (-1^K) to caller 1885 __ br(Assembler::EQ, L_done_pop); 1886 1887 __ BIND(L_do_card_marks); 1888 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1889 1890 __ bind(L_done_pop); 1891 __ pop(RegSet::of(r19, r20, r21, r22), sp); 1892 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1893 1894 __ bind(L_done); 1895 __ mov(r0, count); 1896 __ leave(); 1897 __ ret(lr); 1898 1899 return start; 1900 } 1901 1902 // Perform range checks on the proposed arraycopy. 1903 // Kills temp, but nothing else. 1904 // Also, clean the sign bits of src_pos and dst_pos. 1905 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1906 Register src_pos, // source position (c_rarg1) 1907 Register dst, // destination array oo (c_rarg2) 1908 Register dst_pos, // destination position (c_rarg3) 1909 Register length, 1910 Register temp, 1911 Label& L_failed) { 1912 BLOCK_COMMENT("arraycopy_range_checks:"); 1913 1914 assert_different_registers(rscratch1, temp); 1915 1916 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1917 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1918 __ addw(temp, length, src_pos); 1919 __ cmpw(temp, rscratch1); 1920 __ br(Assembler::HI, L_failed); 1921 1922 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1923 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1924 __ addw(temp, length, dst_pos); 1925 __ cmpw(temp, rscratch1); 1926 __ br(Assembler::HI, L_failed); 1927 1928 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1929 __ movw(src_pos, src_pos); 1930 __ movw(dst_pos, dst_pos); 1931 1932 BLOCK_COMMENT("arraycopy_range_checks done"); 1933 } 1934 1935 // These stubs get called from some dumb test routine. 1936 // I'll write them properly when they're called from 1937 // something that's actually doing something. 1938 static void fake_arraycopy_stub(address src, address dst, int count) { 1939 assert(count == 0, "huh?"); 1940 } 1941 1942 1943 // 1944 // Generate 'unsafe' array copy stub 1945 // Though just as safe as the other stubs, it takes an unscaled 1946 // size_t argument instead of an element count. 1947 // 1948 // Input: 1949 // c_rarg0 - source array address 1950 // c_rarg1 - destination array address 1951 // c_rarg2 - byte count, treated as ssize_t, can be zero 1952 // 1953 // Examines the alignment of the operands and dispatches 1954 // to a long, int, short, or byte copy loop. 
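  // Added sketch (not in the original source): the dispatch emitted below is
  // equivalent to the following C, where s, d and count are the raw byte
  // arguments:
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if      ((bits & (BytesPerLong - 1)) == 0) goto long_aligned;   // 8-byte
  //   else if ((bits & (BytesPerInt  - 1)) == 0) goto int_aligned;    // 4-byte
  //   else if ((bits & 1) == 0)                  goto short_aligned;  // 2-byte
  //   else                                       goto byte_copy;
  //
  // with the byte count scaled down to an element count before each tail call.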
1955 // 1956 address generate_unsafe_copy(const char *name, 1957 address byte_copy_entry, 1958 address short_copy_entry, 1959 address int_copy_entry, 1960 address long_copy_entry) { 1961 Label L_long_aligned, L_int_aligned, L_short_aligned; 1962 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1963 1964 __ align(CodeEntryAlignment); 1965 StubCodeMark mark(this, "StubRoutines", name); 1966 address start = __ pc(); 1967 __ enter(); // required for proper stackwalking of RuntimeStub frame 1968 1969 // bump this on entry, not on exit: 1970 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1971 1972 __ orr(rscratch1, s, d); 1973 __ orr(rscratch1, rscratch1, count); 1974 1975 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1976 __ cbz(rscratch1, L_long_aligned); 1977 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1978 __ cbz(rscratch1, L_int_aligned); 1979 __ tbz(rscratch1, 0, L_short_aligned); 1980 __ b(RuntimeAddress(byte_copy_entry)); 1981 1982 __ BIND(L_short_aligned); 1983 __ lsr(count, count, LogBytesPerShort); // size => short_count 1984 __ b(RuntimeAddress(short_copy_entry)); 1985 __ BIND(L_int_aligned); 1986 __ lsr(count, count, LogBytesPerInt); // size => int_count 1987 __ b(RuntimeAddress(int_copy_entry)); 1988 __ BIND(L_long_aligned); 1989 __ lsr(count, count, LogBytesPerLong); // size => long_count 1990 __ b(RuntimeAddress(long_copy_entry)); 1991 1992 return start; 1993 } 1994 1995 // 1996 // Generate generic array copy stubs 1997 // 1998 // Input: 1999 // c_rarg0 - src oop 2000 // c_rarg1 - src_pos (32-bits) 2001 // c_rarg2 - dst oop 2002 // c_rarg3 - dst_pos (32-bits) 2003 // c_rarg4 - element count (32-bits) 2004 // 2005 // Output: 2006 // r0 == 0 - success 2007 // r0 == -1^K - failure, where K is partial transfer count 2008 // 2009 address generate_generic_copy(const char *name, 2010 address byte_copy_entry, address short_copy_entry, 2011 address int_copy_entry, address oop_copy_entry, 2012 address long_copy_entry, address checkcast_copy_entry) { 2013 2014 Label L_failed, L_objArray; 2015 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2016 2017 // Input registers 2018 const Register src = c_rarg0; // source array oop 2019 const Register src_pos = c_rarg1; // source position 2020 const Register dst = c_rarg2; // destination array oop 2021 const Register dst_pos = c_rarg3; // destination position 2022 const Register length = c_rarg4; 2023 2024 2025 // Registers used as temps 2026 const Register dst_klass = c_rarg5; 2027 2028 __ align(CodeEntryAlignment); 2029 2030 StubCodeMark mark(this, "StubRoutines", name); 2031 2032 address start = __ pc(); 2033 2034 __ enter(); // required for proper stackwalking of RuntimeStub frame 2035 2036 // bump this on entry, not on exit: 2037 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2038 2039 //----------------------------------------------------------------------- 2040 // Assembler stub will be used for this call to arraycopy 2041 // if the following conditions are met: 2042 // 2043 // (1) src and dst must not be null. 2044 // (2) src_pos must not be negative. 2045 // (3) dst_pos must not be negative. 2046 // (4) length must not be negative. 2047 // (5) src klass and dst klass should be the same and not NULL. 2048 // (6) src and dst should be arrays. 2049 // (7) src_pos + length must not exceed length of src. 2050 // (8) dst_pos + length must not exceed length of dst. 
2051 // 2052 2053 // if (src == NULL) return -1; 2054 __ cbz(src, L_failed); 2055 2056 // if (src_pos < 0) return -1; 2057 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2058 2059 // if (dst == NULL) return -1; 2060 __ cbz(dst, L_failed); 2061 2062 // if (dst_pos < 0) return -1; 2063 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2064 2065 // registers used as temp 2066 const Register scratch_length = r16; // elements count to copy 2067 const Register scratch_src_klass = r17; // array klass 2068 const Register lh = r15; // layout helper 2069 2070 // if (length < 0) return -1; 2071 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2072 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2073 2074 __ load_klass(scratch_src_klass, src); 2075 #ifdef ASSERT 2076 // assert(src->klass() != NULL); 2077 { 2078 BLOCK_COMMENT("assert klasses not null {"); 2079 Label L1, L2; 2080 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2081 __ bind(L1); 2082 __ stop("broken null klass"); 2083 __ bind(L2); 2084 __ load_klass(rscratch1, dst); 2085 __ cbz(rscratch1, L1); // this would be broken also 2086 BLOCK_COMMENT("} assert klasses not null done"); 2087 } 2088 #endif 2089 2090 // Load layout helper (32-bits) 2091 // 2092 // |array_tag| | header_size | element_type | |log2_element_size| 2093 // 32 30 24 16 8 2 0 2094 // 2095 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2096 // 2097 2098 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2099 2100 // Handle objArrays completely differently... 2101 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2102 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2103 __ movw(rscratch1, objArray_lh); 2104 __ eorw(rscratch2, lh, rscratch1); 2105 __ cbzw(rscratch2, L_objArray); 2106 2107 // if (src->klass() != dst->klass()) return -1; 2108 __ load_klass(rscratch2, dst); 2109 __ eor(rscratch2, rscratch2, scratch_src_klass); 2110 __ cbnz(rscratch2, L_failed); 2111 2112 // if (!src->is_Array()) return -1; 2113 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2114 2115 // At this point, it is known to be a typeArray (array_tag 0x3). 
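    // Added sketch (not in the original source): in C, the layout helper
    // fields needed below would be decoded as
    //
    //   int header_size = (lh >> Klass::_lh_header_size_shift)
    //                     & Klass::_lh_header_size_mask;          // array base offset, bytes
    //   int log2_esize  = lh & Klass::_lh_log2_element_size_mask; // 0..3 (byte..long)
    //
    // The ubfx below extracts header_size, and because
    // _lh_log2_element_size_shift is zero the element-size bits of lh are
    // simply tested in place (tbnz on bits 0 and 1) to pick the copy loop.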
2116 #ifdef ASSERT 2117 { 2118 BLOCK_COMMENT("assert primitive array {"); 2119 Label L; 2120 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2121 __ cmpw(lh, rscratch2); 2122 __ br(Assembler::GE, L); 2123 __ stop("must be a primitive array"); 2124 __ bind(L); 2125 BLOCK_COMMENT("} assert primitive array done"); 2126 } 2127 #endif 2128 2129 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2130 rscratch2, L_failed); 2131 2132 // TypeArrayKlass 2133 // 2134 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2135 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2136 // 2137 2138 const Register rscratch1_offset = rscratch1; // array offset 2139 const Register r15_elsize = lh; // element size 2140 2141 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2142 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2143 __ add(src, src, rscratch1_offset); // src array offset 2144 __ add(dst, dst, rscratch1_offset); // dst array offset 2145 BLOCK_COMMENT("choose copy loop based on element size"); 2146 2147 // next registers should be set before the jump to corresponding stub 2148 const Register from = c_rarg0; // source array address 2149 const Register to = c_rarg1; // destination array address 2150 const Register count = c_rarg2; // elements count 2151 2152 // 'from', 'to', 'count' registers should be set in such order 2153 // since they are the same as 'src', 'src_pos', 'dst'. 2154 2155 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2156 2157 // The possible values of elsize are 0-3, i.e. exact_log2(element 2158 // size in bytes). We do a simple bitwise binary search. 2159 __ BIND(L_copy_bytes); 2160 __ tbnz(r15_elsize, 1, L_copy_ints); 2161 __ tbnz(r15_elsize, 0, L_copy_shorts); 2162 __ lea(from, Address(src, src_pos));// src_addr 2163 __ lea(to, Address(dst, dst_pos));// dst_addr 2164 __ movw(count, scratch_length); // length 2165 __ b(RuntimeAddress(byte_copy_entry)); 2166 2167 __ BIND(L_copy_shorts); 2168 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2169 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2170 __ movw(count, scratch_length); // length 2171 __ b(RuntimeAddress(short_copy_entry)); 2172 2173 __ BIND(L_copy_ints); 2174 __ tbnz(r15_elsize, 0, L_copy_longs); 2175 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2176 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2177 __ movw(count, scratch_length); // length 2178 __ b(RuntimeAddress(int_copy_entry)); 2179 2180 __ BIND(L_copy_longs); 2181 #ifdef ASSERT 2182 { 2183 BLOCK_COMMENT("assert long copy {"); 2184 Label L; 2185 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2186 __ cmpw(r15_elsize, LogBytesPerLong); 2187 __ br(Assembler::EQ, L); 2188 __ stop("must be long copy, but elsize is wrong"); 2189 __ bind(L); 2190 BLOCK_COMMENT("} assert long copy done"); 2191 } 2192 #endif 2193 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2194 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2195 __ movw(count, scratch_length); // length 2196 __ b(RuntimeAddress(long_copy_entry)); 2197 2198 // ObjArrayKlass 2199 __ BIND(L_objArray); 2200 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2201 2202 Label L_plain_copy, L_checkcast_copy; 2203 // test array classes for subtyping 2204 __ load_klass(r15, dst); 2205 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2206 __ br(Assembler::NE, L_checkcast_copy); 2207 2208 // Identically typed arrays can be copied without element-wise checks. 2209 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2210 rscratch2, L_failed); 2211 2212 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2213 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2214 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2215 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2216 __ movw(count, scratch_length); // length 2217 __ BIND(L_plain_copy); 2218 __ b(RuntimeAddress(oop_copy_entry)); 2219 2220 __ BIND(L_checkcast_copy); 2221 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2222 { 2223 // Before looking at dst.length, make sure dst is also an objArray. 2224 __ ldrw(rscratch1, Address(r15, lh_offset)); 2225 __ movw(rscratch2, objArray_lh); 2226 __ eorw(rscratch1, rscratch1, rscratch2); 2227 __ cbnzw(rscratch1, L_failed); 2228 2229 // It is safe to examine both src.length and dst.length. 2230 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2231 r15, L_failed); 2232 2233 __ load_klass(dst_klass, dst); // reload 2234 2235 // Marshal the base address arguments now, freeing registers. 2236 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2237 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2238 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2239 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2240 __ movw(count, length); // length (reloaded) 2241 Register sco_temp = c_rarg3; // this register is free now 2242 assert_different_registers(from, to, count, sco_temp, 2243 dst_klass, scratch_src_klass); 2244 // assert_clean_int(count, sco_temp); 2245 2246 // Generate the type check. 2247 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2248 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2249 2250 // Smashes rscratch1, rscratch2 2251 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2252 2253 // Fetch destination element klass from the ObjArrayKlass header. 2254 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2255 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2256 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2257 2258 // the checkcast_copy loop needs two extra arguments: 2259 assert(c_rarg3 == sco_temp, "#3 already in place"); 2260 // Set up arguments for checkcast_copy_entry. 2261 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2262 __ b(RuntimeAddress(checkcast_copy_entry)); 2263 } 2264 2265 __ BIND(L_failed); 2266 __ mov(r0, -1); 2267 __ leave(); // required for proper stackwalking of RuntimeStub frame 2268 __ ret(lr); 2269 2270 return start; 2271 } 2272 2273 // 2274 // Generate stub for array fill. If "aligned" is true, the 2275 // "to" address is assumed to be heapword aligned. 
2276 // 2277 // Arguments for generated stub: 2278 // to: c_rarg0 2279 // value: c_rarg1 2280 // count: c_rarg2 treated as signed 2281 // 2282 address generate_fill(BasicType t, bool aligned, const char *name) { 2283 __ align(CodeEntryAlignment); 2284 StubCodeMark mark(this, "StubRoutines", name); 2285 address start = __ pc(); 2286 2287 BLOCK_COMMENT("Entry:"); 2288 2289 const Register to = c_rarg0; // source array address 2290 const Register value = c_rarg1; // value 2291 const Register count = c_rarg2; // elements count 2292 2293 const Register bz_base = r10; // base for block_zero routine 2294 const Register cnt_words = r11; // temp register 2295 2296 __ enter(); 2297 2298 Label L_fill_elements, L_exit1; 2299 2300 int shift = -1; 2301 switch (t) { 2302 case T_BYTE: 2303 shift = 0; 2304 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2305 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2306 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2307 __ br(Assembler::LO, L_fill_elements); 2308 break; 2309 case T_SHORT: 2310 shift = 1; 2311 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2312 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2313 __ br(Assembler::LO, L_fill_elements); 2314 break; 2315 case T_INT: 2316 shift = 2; 2317 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2318 __ br(Assembler::LO, L_fill_elements); 2319 break; 2320 default: ShouldNotReachHere(); 2321 } 2322 2323 // Align source address at 8 bytes address boundary. 2324 Label L_skip_align1, L_skip_align2, L_skip_align4; 2325 if (!aligned) { 2326 switch (t) { 2327 case T_BYTE: 2328 // One byte misalignment happens only for byte arrays. 2329 __ tbz(to, 0, L_skip_align1); 2330 __ strb(value, Address(__ post(to, 1))); 2331 __ subw(count, count, 1); 2332 __ bind(L_skip_align1); 2333 // Fallthrough 2334 case T_SHORT: 2335 // Two bytes misalignment happens only for byte and short (char) arrays. 2336 __ tbz(to, 1, L_skip_align2); 2337 __ strh(value, Address(__ post(to, 2))); 2338 __ subw(count, count, 2 >> shift); 2339 __ bind(L_skip_align2); 2340 // Fallthrough 2341 case T_INT: 2342 // Align to 8 bytes, we know we are 4 byte aligned to start. 2343 __ tbz(to, 2, L_skip_align4); 2344 __ strw(value, Address(__ post(to, 4))); 2345 __ subw(count, count, 4 >> shift); 2346 __ bind(L_skip_align4); 2347 break; 2348 default: ShouldNotReachHere(); 2349 } 2350 } 2351 2352 // 2353 // Fill large chunks 2354 // 2355 __ lsrw(cnt_words, count, 3 - shift); // number of words 2356 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2357 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2358 if (UseBlockZeroing) { 2359 Label non_block_zeroing, rest; 2360 // If the fill value is zero we can use the fast zero_words(). 2361 __ cbnz(value, non_block_zeroing); 2362 __ mov(bz_base, to); 2363 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2364 address tpc = __ zero_words(bz_base, cnt_words); 2365 if (tpc == nullptr) { 2366 fatal("CodeCache is full at generate_fill"); 2367 } 2368 __ b(rest); 2369 __ bind(non_block_zeroing); 2370 __ fill_words(to, cnt_words, value); 2371 __ bind(rest); 2372 } else { 2373 __ fill_words(to, cnt_words, value); 2374 } 2375 2376 // Remaining count is less than 8 bytes. Fill it by a single store. 2377 // Note that the total length is no less than 8 bytes. 
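    // Added worked example (not in the original comment): for a byte fill of
    // length 13, the word loop above writes bytes [0, 8) and leaves
    // count == 5; 'to' is then advanced past the remaining count to the end
    // of the array, and the single 64-bit store below writes bytes [5, 13),
    // harmlessly re-writing bytes [5, 8) with the same replicated value.
    // This is why the total length must be at least 8 bytes on this path.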
2378 if (t == T_BYTE || t == T_SHORT) { 2379 Label L_exit1; 2380 __ cbzw(count, L_exit1); 2381 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2382 __ str(value, Address(to, -8)); // overwrite some elements 2383 __ bind(L_exit1); 2384 __ leave(); 2385 __ ret(lr); 2386 } 2387 2388 // Handle copies less than 8 bytes. 2389 Label L_fill_2, L_fill_4, L_exit2; 2390 __ bind(L_fill_elements); 2391 switch (t) { 2392 case T_BYTE: 2393 __ tbz(count, 0, L_fill_2); 2394 __ strb(value, Address(__ post(to, 1))); 2395 __ bind(L_fill_2); 2396 __ tbz(count, 1, L_fill_4); 2397 __ strh(value, Address(__ post(to, 2))); 2398 __ bind(L_fill_4); 2399 __ tbz(count, 2, L_exit2); 2400 __ strw(value, Address(to)); 2401 break; 2402 case T_SHORT: 2403 __ tbz(count, 0, L_fill_4); 2404 __ strh(value, Address(__ post(to, 2))); 2405 __ bind(L_fill_4); 2406 __ tbz(count, 1, L_exit2); 2407 __ strw(value, Address(to)); 2408 break; 2409 case T_INT: 2410 __ cbzw(count, L_exit2); 2411 __ strw(value, Address(to)); 2412 break; 2413 default: ShouldNotReachHere(); 2414 } 2415 __ bind(L_exit2); 2416 __ leave(); 2417 __ ret(lr); 2418 return start; 2419 } 2420 2421 address generate_data_cache_writeback() { 2422 const Register line = c_rarg0; // address of line to write back 2423 2424 __ align(CodeEntryAlignment); 2425 2426 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2427 2428 address start = __ pc(); 2429 __ enter(); 2430 __ cache_wb(Address(line, 0)); 2431 __ leave(); 2432 __ ret(lr); 2433 2434 return start; 2435 } 2436 2437 address generate_data_cache_writeback_sync() { 2438 const Register is_pre = c_rarg0; // pre or post sync 2439 2440 __ align(CodeEntryAlignment); 2441 2442 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2443 2444 // pre wbsync is a no-op 2445 // post wbsync translates to an sfence 2446 2447 Label skip; 2448 address start = __ pc(); 2449 __ enter(); 2450 __ cbnz(is_pre, skip); 2451 __ cache_wbsync(false); 2452 __ bind(skip); 2453 __ leave(); 2454 __ ret(lr); 2455 2456 return start; 2457 } 2458 2459 void generate_arraycopy_stubs() { 2460 address entry; 2461 address entry_jbyte_arraycopy; 2462 address entry_jshort_arraycopy; 2463 address entry_jint_arraycopy; 2464 address entry_oop_arraycopy; 2465 address entry_jlong_arraycopy; 2466 address entry_checkcast_arraycopy; 2467 2468 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2469 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2470 2471 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2472 2473 //*** jbyte 2474 // Always need aligned and unaligned versions 2475 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2476 "jbyte_disjoint_arraycopy"); 2477 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2478 &entry_jbyte_arraycopy, 2479 "jbyte_arraycopy"); 2480 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2481 "arrayof_jbyte_disjoint_arraycopy"); 2482 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2483 "arrayof_jbyte_arraycopy"); 2484 2485 //*** jshort 2486 // Always need aligned and unaligned versions 2487 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2488 "jshort_disjoint_arraycopy"); 2489 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2490 &entry_jshort_arraycopy, 2491 "jshort_arraycopy"); 2492 StubRoutines::_arrayof_jshort_disjoint_arraycopy = 
generate_disjoint_short_copy(true, &entry, 2493 "arrayof_jshort_disjoint_arraycopy"); 2494 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2495 "arrayof_jshort_arraycopy"); 2496 2497 //*** jint 2498 // Aligned versions 2499 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2500 "arrayof_jint_disjoint_arraycopy"); 2501 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2502 "arrayof_jint_arraycopy"); 2503 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2504 // entry_jint_arraycopy always points to the unaligned version 2505 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2506 "jint_disjoint_arraycopy"); 2507 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2508 &entry_jint_arraycopy, 2509 "jint_arraycopy"); 2510 2511 //*** jlong 2512 // It is always aligned 2513 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2514 "arrayof_jlong_disjoint_arraycopy"); 2515 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2516 "arrayof_jlong_arraycopy"); 2517 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2518 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2519 2520 //*** oops 2521 { 2522 // With compressed oops we need unaligned versions; notice that 2523 // we overwrite entry_oop_arraycopy. 2524 bool aligned = !UseCompressedOops; 2525 2526 StubRoutines::_arrayof_oop_disjoint_arraycopy 2527 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2528 /*dest_uninitialized*/false); 2529 StubRoutines::_arrayof_oop_arraycopy 2530 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2531 /*dest_uninitialized*/false); 2532 // Aligned versions without pre-barriers 2533 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2534 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2535 /*dest_uninitialized*/true); 2536 StubRoutines::_arrayof_oop_arraycopy_uninit 2537 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2538 /*dest_uninitialized*/true); 2539 } 2540 2541 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2542 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2543 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2544 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2545 2546 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2547 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2548 /*dest_uninitialized*/true); 2549 2550 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2551 entry_jbyte_arraycopy, 2552 entry_jshort_arraycopy, 2553 entry_jint_arraycopy, 2554 entry_jlong_arraycopy); 2555 2556 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2557 entry_jbyte_arraycopy, 2558 entry_jshort_arraycopy, 2559 entry_jint_arraycopy, 2560 entry_oop_arraycopy, 2561 entry_jlong_arraycopy, 2562 entry_checkcast_arraycopy); 2563 2564 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 
2565 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2566 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2567 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2568 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2569 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2570 } 2571 2572 void generate_math_stubs() { Unimplemented(); } 2573 2574 // Arguments: 2575 // 2576 // Inputs: 2577 // c_rarg0 - source byte array address 2578 // c_rarg1 - destination byte array address 2579 // c_rarg2 - K (key) in little endian int array 2580 // 2581 address generate_aescrypt_encryptBlock() { 2582 __ align(CodeEntryAlignment); 2583 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2584 2585 const Register from = c_rarg0; // source array address 2586 const Register to = c_rarg1; // destination array address 2587 const Register key = c_rarg2; // key array address 2588 const Register keylen = rscratch1; 2589 2590 address start = __ pc(); 2591 __ enter(); 2592 2593 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2594 2595 __ aesenc_loadkeys(key, keylen); 2596 __ aesecb_encrypt(from, to, keylen); 2597 2598 __ mov(r0, 0); 2599 2600 __ leave(); 2601 __ ret(lr); 2602 2603 return start; 2604 } 2605 2606 // Arguments: 2607 // 2608 // Inputs: 2609 // c_rarg0 - source byte array address 2610 // c_rarg1 - destination byte array address 2611 // c_rarg2 - K (key) in little endian int array 2612 // 2613 address generate_aescrypt_decryptBlock() { 2614 assert(UseAES, "need AES cryptographic extension support"); 2615 __ align(CodeEntryAlignment); 2616 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2617 Label L_doLast; 2618 2619 const Register from = c_rarg0; // source array address 2620 const Register to = c_rarg1; // destination array address 2621 const Register key = c_rarg2; // key array address 2622 const Register keylen = rscratch1; 2623 2624 address start = __ pc(); 2625 __ enter(); // required for proper stackwalking of RuntimeStub frame 2626 2627 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2628 2629 __ aesecb_decrypt(from, to, key, keylen); 2630 2631 __ mov(r0, 0); 2632 2633 __ leave(); 2634 __ ret(lr); 2635 2636 return start; 2637 } 2638 2639 // Arguments: 2640 // 2641 // Inputs: 2642 // c_rarg0 - source byte array address 2643 // c_rarg1 - destination byte array address 2644 // c_rarg2 - K (key) in little endian int array 2645 // c_rarg3 - r vector byte array address 2646 // c_rarg4 - input length 2647 // 2648 // Output: 2649 // x0 - input length 2650 // 2651 address generate_cipherBlockChaining_encryptAESCrypt() { 2652 assert(UseAES, "need AES cryptographic extension support"); 2653 __ align(CodeEntryAlignment); 2654 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2655 2656 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2657 2658 const Register from = c_rarg0; // source array address 2659 const Register to = c_rarg1; // destination array address 2660 const Register key = c_rarg2; // key array address 2661 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2662 // and left with the results of the last encryption block 2663 const Register len_reg = c_rarg4; // src len (must be multiple of 
blocksize 16) 2664 const Register keylen = rscratch1; 2665 2666 address start = __ pc(); 2667 2668 __ enter(); 2669 2670 __ movw(rscratch2, len_reg); 2671 2672 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2673 2674 __ ld1(v0, __ T16B, rvec); 2675 2676 __ cmpw(keylen, 52); 2677 __ br(Assembler::CC, L_loadkeys_44); 2678 __ br(Assembler::EQ, L_loadkeys_52); 2679 2680 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2681 __ rev32(v17, __ T16B, v17); 2682 __ rev32(v18, __ T16B, v18); 2683 __ BIND(L_loadkeys_52); 2684 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2685 __ rev32(v19, __ T16B, v19); 2686 __ rev32(v20, __ T16B, v20); 2687 __ BIND(L_loadkeys_44); 2688 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2689 __ rev32(v21, __ T16B, v21); 2690 __ rev32(v22, __ T16B, v22); 2691 __ rev32(v23, __ T16B, v23); 2692 __ rev32(v24, __ T16B, v24); 2693 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2694 __ rev32(v25, __ T16B, v25); 2695 __ rev32(v26, __ T16B, v26); 2696 __ rev32(v27, __ T16B, v27); 2697 __ rev32(v28, __ T16B, v28); 2698 __ ld1(v29, v30, v31, __ T16B, key); 2699 __ rev32(v29, __ T16B, v29); 2700 __ rev32(v30, __ T16B, v30); 2701 __ rev32(v31, __ T16B, v31); 2702 2703 __ BIND(L_aes_loop); 2704 __ ld1(v1, __ T16B, __ post(from, 16)); 2705 __ eor(v0, __ T16B, v0, v1); 2706 2707 __ br(Assembler::CC, L_rounds_44); 2708 __ br(Assembler::EQ, L_rounds_52); 2709 2710 __ aese(v0, v17); __ aesmc(v0, v0); 2711 __ aese(v0, v18); __ aesmc(v0, v0); 2712 __ BIND(L_rounds_52); 2713 __ aese(v0, v19); __ aesmc(v0, v0); 2714 __ aese(v0, v20); __ aesmc(v0, v0); 2715 __ BIND(L_rounds_44); 2716 __ aese(v0, v21); __ aesmc(v0, v0); 2717 __ aese(v0, v22); __ aesmc(v0, v0); 2718 __ aese(v0, v23); __ aesmc(v0, v0); 2719 __ aese(v0, v24); __ aesmc(v0, v0); 2720 __ aese(v0, v25); __ aesmc(v0, v0); 2721 __ aese(v0, v26); __ aesmc(v0, v0); 2722 __ aese(v0, v27); __ aesmc(v0, v0); 2723 __ aese(v0, v28); __ aesmc(v0, v0); 2724 __ aese(v0, v29); __ aesmc(v0, v0); 2725 __ aese(v0, v30); 2726 __ eor(v0, __ T16B, v0, v31); 2727 2728 __ st1(v0, __ T16B, __ post(to, 16)); 2729 2730 __ subw(len_reg, len_reg, 16); 2731 __ cbnzw(len_reg, L_aes_loop); 2732 2733 __ st1(v0, __ T16B, rvec); 2734 2735 __ mov(r0, rscratch2); 2736 2737 __ leave(); 2738 __ ret(lr); 2739 2740 return start; 2741 } 2742 2743 // Arguments: 2744 // 2745 // Inputs: 2746 // c_rarg0 - source byte array address 2747 // c_rarg1 - destination byte array address 2748 // c_rarg2 - K (key) in little endian int array 2749 // c_rarg3 - r vector byte array address 2750 // c_rarg4 - input length 2751 // 2752 // Output: 2753 // r0 - input length 2754 // 2755 address generate_cipherBlockChaining_decryptAESCrypt() { 2756 assert(UseAES, "need AES cryptographic extension support"); 2757 __ align(CodeEntryAlignment); 2758 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2759 2760 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2761 2762 const Register from = c_rarg0; // source array address 2763 const Register to = c_rarg1; // destination array address 2764 const Register key = c_rarg2; // key array address 2765 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2766 // and left with the results of the last encryption block 2767 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2768 const Register keylen = rscratch1; 2769 2770 address start = __ pc(); 2771 2772 __ enter(); 
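    // Added note (not in the original source): keylen below is the length of
    // the expanded key array in ints, i.e. 4 * (rounds + 1): 44 for AES-128
    // (10 rounds), 52 for AES-192 (12 rounds), 60 for AES-256 (14 rounds).
    // The cmpw(keylen, 52) / CC / EQ branches in this stub, as in the CBC
    // encrypt stub above, select how many round keys to load and how many
    // rounds to apply.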
2773 2774 __ movw(rscratch2, len_reg); 2775 2776 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2777 2778 __ ld1(v2, __ T16B, rvec); 2779 2780 __ ld1(v31, __ T16B, __ post(key, 16)); 2781 __ rev32(v31, __ T16B, v31); 2782 2783 __ cmpw(keylen, 52); 2784 __ br(Assembler::CC, L_loadkeys_44); 2785 __ br(Assembler::EQ, L_loadkeys_52); 2786 2787 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2788 __ rev32(v17, __ T16B, v17); 2789 __ rev32(v18, __ T16B, v18); 2790 __ BIND(L_loadkeys_52); 2791 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2792 __ rev32(v19, __ T16B, v19); 2793 __ rev32(v20, __ T16B, v20); 2794 __ BIND(L_loadkeys_44); 2795 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2796 __ rev32(v21, __ T16B, v21); 2797 __ rev32(v22, __ T16B, v22); 2798 __ rev32(v23, __ T16B, v23); 2799 __ rev32(v24, __ T16B, v24); 2800 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2801 __ rev32(v25, __ T16B, v25); 2802 __ rev32(v26, __ T16B, v26); 2803 __ rev32(v27, __ T16B, v27); 2804 __ rev32(v28, __ T16B, v28); 2805 __ ld1(v29, v30, __ T16B, key); 2806 __ rev32(v29, __ T16B, v29); 2807 __ rev32(v30, __ T16B, v30); 2808 2809 __ BIND(L_aes_loop); 2810 __ ld1(v0, __ T16B, __ post(from, 16)); 2811 __ orr(v1, __ T16B, v0, v0); 2812 2813 __ br(Assembler::CC, L_rounds_44); 2814 __ br(Assembler::EQ, L_rounds_52); 2815 2816 __ aesd(v0, v17); __ aesimc(v0, v0); 2817 __ aesd(v0, v18); __ aesimc(v0, v0); 2818 __ BIND(L_rounds_52); 2819 __ aesd(v0, v19); __ aesimc(v0, v0); 2820 __ aesd(v0, v20); __ aesimc(v0, v0); 2821 __ BIND(L_rounds_44); 2822 __ aesd(v0, v21); __ aesimc(v0, v0); 2823 __ aesd(v0, v22); __ aesimc(v0, v0); 2824 __ aesd(v0, v23); __ aesimc(v0, v0); 2825 __ aesd(v0, v24); __ aesimc(v0, v0); 2826 __ aesd(v0, v25); __ aesimc(v0, v0); 2827 __ aesd(v0, v26); __ aesimc(v0, v0); 2828 __ aesd(v0, v27); __ aesimc(v0, v0); 2829 __ aesd(v0, v28); __ aesimc(v0, v0); 2830 __ aesd(v0, v29); __ aesimc(v0, v0); 2831 __ aesd(v0, v30); 2832 __ eor(v0, __ T16B, v0, v31); 2833 __ eor(v0, __ T16B, v0, v2); 2834 2835 __ st1(v0, __ T16B, __ post(to, 16)); 2836 __ orr(v2, __ T16B, v1, v1); 2837 2838 __ subw(len_reg, len_reg, 16); 2839 __ cbnzw(len_reg, L_aes_loop); 2840 2841 __ st1(v2, __ T16B, rvec); 2842 2843 __ mov(r0, rscratch2); 2844 2845 __ leave(); 2846 __ ret(lr); 2847 2848 return start; 2849 } 2850 2851 // CTR AES crypt. 2852 // Arguments: 2853 // 2854 // Inputs: 2855 // c_rarg0 - source byte array address 2856 // c_rarg1 - destination byte array address 2857 // c_rarg2 - K (key) in little endian int array 2858 // c_rarg3 - counter vector byte array address 2859 // c_rarg4 - input length 2860 // c_rarg5 - saved encryptedCounter start 2861 // c_rarg6 - saved used length 2862 // 2863 // Output: 2864 // r0 - input length 2865 // 2866 address generate_counterMode_AESCrypt() { 2867 const Register in = c_rarg0; 2868 const Register out = c_rarg1; 2869 const Register key = c_rarg2; 2870 const Register counter = c_rarg3; 2871 const Register saved_len = c_rarg4, len = r10; 2872 const Register saved_encrypted_ctr = c_rarg5; 2873 const Register used_ptr = c_rarg6, used = r12; 2874 2875 const Register offset = r7; 2876 const Register keylen = r11; 2877 2878 const unsigned char block_size = 16; 2879 const int bulk_width = 4; 2880 // NB: bulk_width can be 4 or 8. 
8 gives slightly faster 2881 // performance with larger data sizes, but it also means that the 2882 // fast path isn't used until you have at least 8 blocks, and up 2883 // to 127 bytes of data will be executed on the slow path. For 2884 // that reason, and also so as not to blow away too much icache, 4 2885 // blocks seems like a sensible compromise. 2886 2887 // Algorithm: 2888 // 2889 // if (len == 0) { 2890 // goto DONE; 2891 // } 2892 // int result = len; 2893 // do { 2894 // if (used >= blockSize) { 2895 // if (len >= bulk_width * blockSize) { 2896 // CTR_large_block(); 2897 // if (len == 0) 2898 // goto DONE; 2899 // } 2900 // for (;;) { 2901 // 16ByteVector v0 = counter; 2902 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 2903 // used = 0; 2904 // if (len < blockSize) 2905 // break; /* goto NEXT */ 2906 // 16ByteVector v1 = load16Bytes(in, offset); 2907 // v1 = v1 ^ encryptedCounter; 2908 // store16Bytes(out, offset); 2909 // used = blockSize; 2910 // offset += blockSize; 2911 // len -= blockSize; 2912 // if (len == 0) 2913 // goto DONE; 2914 // } 2915 // } 2916 // NEXT: 2917 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 2918 // len--; 2919 // } while (len != 0); 2920 // DONE: 2921 // return result; 2922 // 2923 // CTR_large_block() 2924 // Wide bulk encryption of whole blocks. 2925 2926 __ align(CodeEntryAlignment); 2927 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 2928 const address start = __ pc(); 2929 __ enter(); 2930 2931 Label DONE, CTR_large_block, large_block_return; 2932 __ ldrw(used, Address(used_ptr)); 2933 __ cbzw(saved_len, DONE); 2934 2935 __ mov(len, saved_len); 2936 __ mov(offset, 0); 2937 2938 // Compute #rounds for AES based on the length of the key array 2939 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2940 2941 __ aesenc_loadkeys(key, keylen); 2942 2943 { 2944 Label L_CTR_loop, NEXT; 2945 2946 __ bind(L_CTR_loop); 2947 2948 __ cmp(used, block_size); 2949 __ br(__ LO, NEXT); 2950 2951 // Maybe we have a lot of data 2952 __ subsw(rscratch1, len, bulk_width * block_size); 2953 __ br(__ HS, CTR_large_block); 2954 __ BIND(large_block_return); 2955 __ cbzw(len, DONE); 2956 2957 // Setup the counter 2958 __ movi(v4, __ T4S, 0); 2959 __ movi(v5, __ T4S, 1); 2960 __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 } 2961 2962 __ ld1(v0, __ T16B, counter); // Load the counter into v0 2963 __ rev32(v16, __ T16B, v0); 2964 __ addv(v16, __ T4S, v16, v4); 2965 __ rev32(v16, __ T16B, v16); 2966 __ st1(v16, __ T16B, counter); // Save the incremented counter back 2967 2968 { 2969 // We have fewer than bulk_width blocks of data left. Encrypt 2970 // them one by one until there is less than a full block 2971 // remaining, being careful to save both the encrypted counter 2972 // and the counter. 2973 2974 Label inner_loop; 2975 __ bind(inner_loop); 2976 // Counter to encrypt is in v0 2977 __ aesecb_encrypt(noreg, noreg, keylen); 2978 __ st1(v0, __ T16B, saved_encrypted_ctr); 2979 2980 // Do we have a remaining full block? 
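        // (Added clarification: i.e. is len >= block_size? If len is smaller
        //  we branch to NEXT and finish byte-by-byte; otherwise the code
        //  below XORs a full 16-byte input block with the encrypted counter
        //  held in v0.)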
2981 2982 __ mov(used, 0); 2983 __ cmp(len, block_size); 2984 __ br(__ LO, NEXT); 2985 2986 // Yes, we have a full block 2987 __ ldrq(v1, Address(in, offset)); 2988 __ eor(v1, __ T16B, v1, v0); 2989 __ strq(v1, Address(out, offset)); 2990 __ mov(used, block_size); 2991 __ add(offset, offset, block_size); 2992 2993 __ subw(len, len, block_size); 2994 __ cbzw(len, DONE); 2995 2996 // Increment the counter, store it back 2997 __ orr(v0, __ T16B, v16, v16); 2998 __ rev32(v16, __ T16B, v16); 2999 __ addv(v16, __ T4S, v16, v4); 3000 __ rev32(v16, __ T16B, v16); 3001 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3002 3003 __ b(inner_loop); 3004 } 3005 3006 __ BIND(NEXT); 3007 3008 // Encrypt a single byte, and loop. 3009 // We expect this to be a rare event. 3010 __ ldrb(rscratch1, Address(in, offset)); 3011 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3012 __ eor(rscratch1, rscratch1, rscratch2); 3013 __ strb(rscratch1, Address(out, offset)); 3014 __ add(offset, offset, 1); 3015 __ add(used, used, 1); 3016 __ subw(len, len,1); 3017 __ cbnzw(len, L_CTR_loop); 3018 } 3019 3020 __ bind(DONE); 3021 __ strw(used, Address(used_ptr)); 3022 __ mov(r0, saved_len); 3023 3024 __ leave(); // required for proper stackwalking of RuntimeStub frame 3025 __ ret(lr); 3026 3027 // Bulk encryption 3028 3029 __ BIND (CTR_large_block); 3030 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3031 3032 if (bulk_width == 8) { 3033 __ sub(sp, sp, 4 * 16); 3034 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3035 } 3036 __ sub(sp, sp, 4 * 16); 3037 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3038 RegSet saved_regs = (RegSet::of(in, out, offset) 3039 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3040 __ push(saved_regs, sp); 3041 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3042 __ add(in, in, offset); 3043 __ add(out, out, offset); 3044 3045 // Keys should already be loaded into the correct registers 3046 3047 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3048 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3049 3050 // AES/CTR loop 3051 { 3052 Label L_CTR_loop; 3053 __ BIND(L_CTR_loop); 3054 3055 // Setup the counters 3056 __ movi(v8, __ T4S, 0); 3057 __ movi(v9, __ T4S, 1); 3058 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3059 3060 for (int i = 0; i < bulk_width; i++) { 3061 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3062 __ rev32(v0_ofs, __ T16B, v16); 3063 __ addv(v16, __ T4S, v16, v8); 3064 } 3065 3066 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3067 3068 // Encrypt the counters 3069 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3070 3071 if (bulk_width == 8) { 3072 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3073 } 3074 3075 // XOR the encrypted counters with the inputs 3076 for (int i = 0; i < bulk_width; i++) { 3077 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3078 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3079 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3080 } 3081 3082 // Write the encrypted data 3083 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3084 if (bulk_width == 8) { 3085 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3086 } 3087 3088 __ subw(len, len, 16 * bulk_width); 3089 __ cbnzw(len, L_CTR_loop); 3090 } 3091 3092 // Save the counter back where it goes 3093 __ rev32(v16, __ T16B, v16); 3094 __ st1(v16, __ T16B, counter); 3095 3096 __ pop(saved_regs, sp); 3097 3098 __ 
ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3099 if (bulk_width == 8) { 3100 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3101 } 3102 3103 __ andr(rscratch1, len, -16 * bulk_width); 3104 __ sub(len, len, rscratch1); 3105 __ add(offset, offset, rscratch1); 3106 __ mov(used, 16); 3107 __ strw(used, Address(used_ptr)); 3108 __ b(large_block_return); 3109 3110 return start; 3111 } 3112 3113 // Vector AES Galois Counter Mode implementation. Parameters: 3114 // 3115 // in = c_rarg0 3116 // len = c_rarg1 3117 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3118 // out = c_rarg3 3119 // key = c_rarg4 3120 // state = c_rarg5 - GHASH.state 3121 // subkeyHtbl = c_rarg6 - powers of H 3122 // counter = c_rarg7 - 16 bytes of CTR 3123 // return - number of processed bytes 3124 address generate_galoisCounterMode_AESCrypt() { 3125 address ghash_polynomial = __ pc(); 3126 __ emit_int64(0x87); // The low-order bits of the field 3127 // polynomial (i.e. p = z^7+z^2+z+1) 3128 // repeated in the low and high parts of a 3129 // 128-bit vector 3130 __ emit_int64(0x87); 3131 3132 __ align(CodeEntryAlignment); 3133 StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt"); 3134 address start = __ pc(); 3135 __ enter(); 3136 3137 const Register in = c_rarg0; 3138 const Register len = c_rarg1; 3139 const Register ct = c_rarg2; 3140 const Register out = c_rarg3; 3141 // and updated with the incremented counter in the end 3142 3143 const Register key = c_rarg4; 3144 const Register state = c_rarg5; 3145 3146 const Register subkeyHtbl = c_rarg6; 3147 3148 const Register counter = c_rarg7; 3149 3150 const Register keylen = r10; 3151 // Save state before entering routine 3152 __ sub(sp, sp, 4 * 16); 3153 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3154 __ sub(sp, sp, 4 * 16); 3155 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3156 3157 // __ andr(len, len, -512); 3158 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3159 __ str(len, __ pre(sp, -2 * wordSize)); 3160 3161 Label DONE; 3162 __ cbz(len, DONE); 3163 3164 // Compute #rounds for AES based on the length of the key array 3165 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3166 3167 __ aesenc_loadkeys(key, keylen); 3168 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3169 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3170 3171 // AES/CTR loop 3172 { 3173 Label L_CTR_loop; 3174 __ BIND(L_CTR_loop); 3175 3176 // Setup the counters 3177 __ movi(v8, __ T4S, 0); 3178 __ movi(v9, __ T4S, 1); 3179 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3180 3181 assert(v0->encoding() < v8->encoding(), ""); 3182 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3183 FloatRegister f = as_FloatRegister(i); 3184 __ rev32(f, __ T16B, v16); 3185 __ addv(v16, __ T4S, v16, v8); 3186 } 3187 3188 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3189 3190 // Encrypt the counters 3191 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3192 3193 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3194 3195 // XOR the encrypted counters with the inputs 3196 for (int i = 0; i < 8; i++) { 3197 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3198 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3199 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3200 } 3201 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3202 __ st1(v4, v5, v6, v7, __ 
T16B, __ post(out, 4 * 16)); 3203 3204 __ subw(len, len, 16 * 8); 3205 __ cbnzw(len, L_CTR_loop); 3206 } 3207 3208 __ rev32(v16, __ T16B, v16); 3209 __ st1(v16, __ T16B, counter); 3210 3211 __ ldr(len, Address(sp)); 3212 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3213 3214 // GHASH/CTR loop 3215 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3216 len, /*unrolls*/4); 3217 3218 #ifdef ASSERT 3219 { Label L; 3220 __ cmp(len, (unsigned char)0); 3221 __ br(Assembler::EQ, L); 3222 __ stop("stubGenerator: abort"); 3223 __ bind(L); 3224 } 3225 #endif 3226 3227 __ bind(DONE); 3228 // Return the number of bytes processed 3229 __ ldr(r0, __ post(sp, 2 * wordSize)); 3230 3231 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3232 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3233 3234 __ leave(); // required for proper stackwalking of RuntimeStub frame 3235 __ ret(lr); 3236 return start; 3237 } 3238 3239 // Utility routines for md5. 3240 // Clobbers r10 and r11. 3241 void md5_FF(Register buf, Register r1, Register r2, Register r3, Register r4, 3242 int k, int s, int t) { 3243 Register rscratch3 = r10; 3244 Register rscratch4 = r11; 3245 3246 __ eorw(rscratch3, r3, r4); 3247 __ movw(rscratch2, t); 3248 __ andw(rscratch3, rscratch3, r2); 3249 __ addw(rscratch4, r1, rscratch2); 3250 __ ldrw(rscratch1, Address(buf, k*4)); 3251 __ eorw(rscratch3, rscratch3, r4); 3252 __ addw(rscratch4, rscratch4, rscratch1); 3253 __ addw(rscratch3, rscratch3, rscratch4); 3254 __ rorw(rscratch2, rscratch3, 32 - s); 3255 __ addw(r1, rscratch2, r2); 3256 } 3257 3258 void md5_GG(Register buf, Register r1, Register r2, Register r3, Register r4, 3259 int k, int s, int t) { 3260 Register rscratch3 = r10; 3261 Register rscratch4 = r11; 3262 3263 __ andw(rscratch3, r2, r4); 3264 __ bicw(rscratch4, r3, r4); 3265 __ ldrw(rscratch1, Address(buf, k*4)); 3266 __ movw(rscratch2, t); 3267 __ orrw(rscratch3, rscratch3, rscratch4); 3268 __ addw(rscratch4, r1, rscratch2); 3269 __ addw(rscratch4, rscratch4, rscratch1); 3270 __ addw(rscratch3, rscratch3, rscratch4); 3271 __ rorw(rscratch2, rscratch3, 32 - s); 3272 __ addw(r1, rscratch2, r2); 3273 } 3274 3275 void md5_HH(Register buf, Register r1, Register r2, Register r3, Register r4, 3276 int k, int s, int t) { 3277 Register rscratch3 = r10; 3278 Register rscratch4 = r11; 3279 3280 __ eorw(rscratch3, r3, r4); 3281 __ movw(rscratch2, t); 3282 __ addw(rscratch4, r1, rscratch2); 3283 __ ldrw(rscratch1, Address(buf, k*4)); 3284 __ eorw(rscratch3, rscratch3, r2); 3285 __ addw(rscratch4, rscratch4, rscratch1); 3286 __ addw(rscratch3, rscratch3, rscratch4); 3287 __ rorw(rscratch2, rscratch3, 32 - s); 3288 __ addw(r1, rscratch2, r2); 3289 } 3290 3291 void md5_II(Register buf, Register r1, Register r2, Register r3, Register r4, 3292 int k, int s, int t) { 3293 Register rscratch3 = r10; 3294 Register rscratch4 = r11; 3295 3296 __ movw(rscratch3, t); 3297 __ ornw(rscratch2, r2, r4); 3298 __ addw(rscratch4, r1, rscratch3); 3299 __ ldrw(rscratch1, Address(buf, k*4)); 3300 __ eorw(rscratch3, rscratch2, r3); 3301 __ addw(rscratch4, rscratch4, rscratch1); 3302 __ addw(rscratch3, rscratch3, rscratch4); 3303 __ rorw(rscratch2, rscratch3, 32 - s); 3304 __ addw(r1, rscratch2, r2); 3305 } 3306 3307 // Arguments: 3308 // 3309 // Inputs: 3310 // c_rarg0 - byte[] source+offset 3311 // c_rarg1 - int[] SHA.state 3312 // c_rarg2 - int offset 3313 // c_rarg3 - int limit 3314 // 3315 address generate_md5_implCompress(bool multi_block, const char *name) { 3316 
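    // Added note (not in the original source): the md5_FF/GG/HH/II helpers
    // above each perform one MD5 step (RFC 1321), computing, with
    // x = r2, y = r3, z = r4:
    //
    //   FF: F(x,y,z) = (x & y) | (~x & z)   (computed as ((y ^ z) & x) ^ z)
    //   GG: G(x,y,z) = (x & z) | (y & ~z)
    //   HH: H(x,y,z) = x ^ y ^ z
    //   II: I(x,y,z) = y ^ (x | ~z)
    //
    // followed by r1 = r2 + rotl32(r1 + f(x,y,z) + buf[k] + t, s).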
__ align(CodeEntryAlignment); 3317 StubCodeMark mark(this, "StubRoutines", name); 3318 address start = __ pc(); 3319 3320 Register buf = c_rarg0; 3321 Register state = c_rarg1; 3322 Register ofs = c_rarg2; 3323 Register limit = c_rarg3; 3324 Register a = r4; 3325 Register b = r5; 3326 Register c = r6; 3327 Register d = r7; 3328 Register rscratch3 = r10; 3329 Register rscratch4 = r11; 3330 3331 Label md5_loop; 3332 __ BIND(md5_loop); 3333 3334 // Save hash values for addition after rounds 3335 __ ldrw(a, Address(state, 0)); 3336 __ ldrw(b, Address(state, 4)); 3337 __ ldrw(c, Address(state, 8)); 3338 __ ldrw(d, Address(state, 12)); 3339 3340 // Round 1 3341 md5_FF(buf, a, b, c, d, 0, 7, 0xd76aa478); 3342 md5_FF(buf, d, a, b, c, 1, 12, 0xe8c7b756); 3343 md5_FF(buf, c, d, a, b, 2, 17, 0x242070db); 3344 md5_FF(buf, b, c, d, a, 3, 22, 0xc1bdceee); 3345 md5_FF(buf, a, b, c, d, 4, 7, 0xf57c0faf); 3346 md5_FF(buf, d, a, b, c, 5, 12, 0x4787c62a); 3347 md5_FF(buf, c, d, a, b, 6, 17, 0xa8304613); 3348 md5_FF(buf, b, c, d, a, 7, 22, 0xfd469501); 3349 md5_FF(buf, a, b, c, d, 8, 7, 0x698098d8); 3350 md5_FF(buf, d, a, b, c, 9, 12, 0x8b44f7af); 3351 md5_FF(buf, c, d, a, b, 10, 17, 0xffff5bb1); 3352 md5_FF(buf, b, c, d, a, 11, 22, 0x895cd7be); 3353 md5_FF(buf, a, b, c, d, 12, 7, 0x6b901122); 3354 md5_FF(buf, d, a, b, c, 13, 12, 0xfd987193); 3355 md5_FF(buf, c, d, a, b, 14, 17, 0xa679438e); 3356 md5_FF(buf, b, c, d, a, 15, 22, 0x49b40821); 3357 3358 // Round 2 3359 md5_GG(buf, a, b, c, d, 1, 5, 0xf61e2562); 3360 md5_GG(buf, d, a, b, c, 6, 9, 0xc040b340); 3361 md5_GG(buf, c, d, a, b, 11, 14, 0x265e5a51); 3362 md5_GG(buf, b, c, d, a, 0, 20, 0xe9b6c7aa); 3363 md5_GG(buf, a, b, c, d, 5, 5, 0xd62f105d); 3364 md5_GG(buf, d, a, b, c, 10, 9, 0x02441453); 3365 md5_GG(buf, c, d, a, b, 15, 14, 0xd8a1e681); 3366 md5_GG(buf, b, c, d, a, 4, 20, 0xe7d3fbc8); 3367 md5_GG(buf, a, b, c, d, 9, 5, 0x21e1cde6); 3368 md5_GG(buf, d, a, b, c, 14, 9, 0xc33707d6); 3369 md5_GG(buf, c, d, a, b, 3, 14, 0xf4d50d87); 3370 md5_GG(buf, b, c, d, a, 8, 20, 0x455a14ed); 3371 md5_GG(buf, a, b, c, d, 13, 5, 0xa9e3e905); 3372 md5_GG(buf, d, a, b, c, 2, 9, 0xfcefa3f8); 3373 md5_GG(buf, c, d, a, b, 7, 14, 0x676f02d9); 3374 md5_GG(buf, b, c, d, a, 12, 20, 0x8d2a4c8a); 3375 3376 // Round 3 3377 md5_HH(buf, a, b, c, d, 5, 4, 0xfffa3942); 3378 md5_HH(buf, d, a, b, c, 8, 11, 0x8771f681); 3379 md5_HH(buf, c, d, a, b, 11, 16, 0x6d9d6122); 3380 md5_HH(buf, b, c, d, a, 14, 23, 0xfde5380c); 3381 md5_HH(buf, a, b, c, d, 1, 4, 0xa4beea44); 3382 md5_HH(buf, d, a, b, c, 4, 11, 0x4bdecfa9); 3383 md5_HH(buf, c, d, a, b, 7, 16, 0xf6bb4b60); 3384 md5_HH(buf, b, c, d, a, 10, 23, 0xbebfbc70); 3385 md5_HH(buf, a, b, c, d, 13, 4, 0x289b7ec6); 3386 md5_HH(buf, d, a, b, c, 0, 11, 0xeaa127fa); 3387 md5_HH(buf, c, d, a, b, 3, 16, 0xd4ef3085); 3388 md5_HH(buf, b, c, d, a, 6, 23, 0x04881d05); 3389 md5_HH(buf, a, b, c, d, 9, 4, 0xd9d4d039); 3390 md5_HH(buf, d, a, b, c, 12, 11, 0xe6db99e5); 3391 md5_HH(buf, c, d, a, b, 15, 16, 0x1fa27cf8); 3392 md5_HH(buf, b, c, d, a, 2, 23, 0xc4ac5665); 3393 3394 // Round 4 3395 md5_II(buf, a, b, c, d, 0, 6, 0xf4292244); 3396 md5_II(buf, d, a, b, c, 7, 10, 0x432aff97); 3397 md5_II(buf, c, d, a, b, 14, 15, 0xab9423a7); 3398 md5_II(buf, b, c, d, a, 5, 21, 0xfc93a039); 3399 md5_II(buf, a, b, c, d, 12, 6, 0x655b59c3); 3400 md5_II(buf, d, a, b, c, 3, 10, 0x8f0ccc92); 3401 md5_II(buf, c, d, a, b, 10, 15, 0xffeff47d); 3402 md5_II(buf, b, c, d, a, 1, 21, 0x85845dd1); 3403 md5_II(buf, a, b, c, d, 8, 6, 0x6fa87e4f); 3404 md5_II(buf, d, a, b, c, 15, 10, 
0xfe2ce6e0); 3405 md5_II(buf, c, d, a, b, 6, 15, 0xa3014314); 3406 md5_II(buf, b, c, d, a, 13, 21, 0x4e0811a1); 3407 md5_II(buf, a, b, c, d, 4, 6, 0xf7537e82); 3408 md5_II(buf, d, a, b, c, 11, 10, 0xbd3af235); 3409 md5_II(buf, c, d, a, b, 2, 15, 0x2ad7d2bb); 3410 md5_II(buf, b, c, d, a, 9, 21, 0xeb86d391); 3411 3412 // write hash values back in the correct order 3413 __ ldrw(rscratch1, Address(state, 0)); 3414 __ addw(rscratch1, rscratch1, a); 3415 __ strw(rscratch1, Address(state, 0)); 3416 3417 __ ldrw(rscratch2, Address(state, 4)); 3418 __ addw(rscratch2, rscratch2, b); 3419 __ strw(rscratch2, Address(state, 4)); 3420 3421 __ ldrw(rscratch3, Address(state, 8)); 3422 __ addw(rscratch3, rscratch3, c); 3423 __ strw(rscratch3, Address(state, 8)); 3424 3425 __ ldrw(rscratch4, Address(state, 12)); 3426 __ addw(rscratch4, rscratch4, d); 3427 __ strw(rscratch4, Address(state, 12)); 3428 3429 if (multi_block) { 3430 __ add(buf, buf, 64); 3431 __ add(ofs, ofs, 64); 3432 __ cmp(ofs, limit); 3433 __ br(Assembler::LE, md5_loop); 3434 __ mov(c_rarg0, ofs); // return ofs 3435 } 3436 3437 __ ret(lr); 3438 3439 return start; 3440 } 3441 3442 // Arguments: 3443 // 3444 // Inputs: 3445 // c_rarg0 - byte[] source+offset 3446 // c_rarg1 - int[] SHA.state 3447 // c_rarg2 - int offset 3448 // c_rarg3 - int limit 3449 // 3450 address generate_sha1_implCompress(bool multi_block, const char *name) { 3451 __ align(CodeEntryAlignment); 3452 StubCodeMark mark(this, "StubRoutines", name); 3453 address start = __ pc(); 3454 3455 Register buf = c_rarg0; 3456 Register state = c_rarg1; 3457 Register ofs = c_rarg2; 3458 Register limit = c_rarg3; 3459 3460 Label keys; 3461 Label sha1_loop; 3462 3463 // load the keys into v0..v3 3464 __ adr(rscratch1, keys); 3465 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3466 // load 5 words state into v6, v7 3467 __ ldrq(v6, Address(state, 0)); 3468 __ ldrs(v7, Address(state, 16)); 3469 3470 3471 __ BIND(sha1_loop); 3472 // load 64 bytes of data into v16..v19 3473 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3474 __ rev32(v16, __ T16B, v16); 3475 __ rev32(v17, __ T16B, v17); 3476 __ rev32(v18, __ T16B, v18); 3477 __ rev32(v19, __ T16B, v19); 3478 3479 // do the sha1 3480 __ addv(v4, __ T4S, v16, v0); 3481 __ orr(v20, __ T16B, v6, v6); 3482 3483 FloatRegister d0 = v16; 3484 FloatRegister d1 = v17; 3485 FloatRegister d2 = v18; 3486 FloatRegister d3 = v19; 3487 3488 for (int round = 0; round < 20; round++) { 3489 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3490 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3491 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3492 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3493 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3494 3495 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3496 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3497 __ sha1h(tmp2, __ T4S, v20); 3498 if (round < 5) 3499 __ sha1c(v20, __ T4S, tmp3, tmp4); 3500 else if (round < 10 || round >= 15) 3501 __ sha1p(v20, __ T4S, tmp3, tmp4); 3502 else 3503 __ sha1m(v20, __ T4S, tmp3, tmp4); 3504 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3505 3506 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3507 } 3508 3509 __ addv(v7, __ T2S, v7, v21); 3510 __ addv(v6, __ T4S, v6, v20); 3511 3512 if (multi_block) { 3513 __ add(ofs, ofs, 64); 3514 __ cmp(ofs, limit); 3515 __ br(Assembler::LE, sha1_loop); 3516 __ mov(c_rarg0, ofs); // return ofs 3517 } 3518 3519 __ strq(v6, Address(state, 0)); 3520 __ strs(v7, Address(state, 16)); 3521 3522 __ ret(lr); 3523 3524 __ bind(keys); 3525 __ emit_int32(0x5a827999); 3526 __ emit_int32(0x6ed9eba1); 3527 __ emit_int32(0x8f1bbcdc); 3528 __ emit_int32(0xca62c1d6); 3529 3530 return start; 3531 } 3532 3533 3534 // Arguments: 3535 // 3536 // Inputs: 3537 // c_rarg0 - byte[] source+offset 3538 // c_rarg1 - int[] SHA.state 3539 // c_rarg2 - int offset 3540 // c_rarg3 - int limit 3541 // 3542 address generate_sha256_implCompress(bool multi_block, const char *name) { 3543 static const uint32_t round_consts[64] = { 3544 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3545 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3546 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3547 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3548 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3549 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3550 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3551 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3552 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3553 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3554 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3555 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3556 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3557 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3558 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3559 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3560 }; 3561 __ align(CodeEntryAlignment); 3562 StubCodeMark mark(this, "StubRoutines", name); 3563 address start = __ pc(); 3564 3565 Register buf = c_rarg0; 3566 Register state = c_rarg1; 3567 Register ofs = c_rarg2; 3568 Register limit = c_rarg3; 3569 3570 Label sha1_loop; 3571 3572 __ stpd(v8, v9, __ pre(sp, -32)); 3573 __ stpd(v10, v11, Address(sp, 16)); 3574 3575 // dga == v0 3576 // dgb == v1 3577 // dg0 == v2 3578 // dg1 == v3 3579 // dg2 == v4 3580 // t0 == v6 3581 // t1 == v7 3582 3583 // load 16 keys to v16..v31 3584 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3585 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3586 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3587 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3588 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3589 3590 // load 8 words (256 bits) state 3591 __ ldpq(v0, v1, state); 3592 3593 __ BIND(sha1_loop); 3594 // load 64 bytes of data into v8..v11 3595 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3596 __ rev32(v8, __ T16B, v8); 3597 __ rev32(v9, __ T16B, v9); 3598 __ rev32(v10, __ T16B, v10); 3599 __ rev32(v11, __ T16B, v11); 3600 3601 __ addv(v6, __ T4S, v8, v16); 3602 __ orr(v2, __ T16B, v0, v0); 3603 __ orr(v3, __ T16B, v1, v1); 3604 3605 FloatRegister d0 = v8; 3606 FloatRegister d1 = v9; 3607 FloatRegister d2 = v10; 3608 FloatRegister d3 = v11; 3609 3610 3611 for (int round = 0; round < 16; round++) { 3612 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3613 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3614 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3615 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3616 3617 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3618 __ orr(v4, __ T16B, v2, v2); 3619 if (round < 15) 3620 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3621 __ sha256h(v2, __ T4S, v3, tmp2); 3622 __ sha256h2(v3, __ T4S, v4, tmp2); 3623 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3624 3625 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3626 } 3627 3628 __ addv(v0, __ T4S, v0, v2); 3629 __ addv(v1, __ T4S, v1, v3); 3630 3631 if (multi_block) { 3632 __ add(ofs, ofs, 64); 3633 __ cmp(ofs, limit); 3634 __ br(Assembler::LE, sha1_loop); 3635 __ mov(c_rarg0, ofs); // return ofs 3636 } 3637 3638 __ ldpd(v10, v11, Address(sp, 16)); 3639 __ ldpd(v8, v9, __ post(sp, 32)); 3640 3641 __ stpq(v0, v1, state); 3642 3643 __ ret(lr); 3644 3645 return start; 3646 } 3647 3648 // Double rounds for sha512. 3649 void sha512_dround(int dr, 3650 FloatRegister vi0, FloatRegister vi1, 3651 FloatRegister vi2, FloatRegister vi3, 3652 FloatRegister vi4, FloatRegister vrc0, 3653 FloatRegister vrc1, FloatRegister vin0, 3654 FloatRegister vin1, FloatRegister vin2, 3655 FloatRegister vin3, FloatRegister vin4) { 3656 if (dr < 36) { 3657 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3658 } 3659 __ addv(v5, __ T2D, vrc0, vin0); 3660 __ ext(v6, __ T16B, vi2, vi3, 8); 3661 __ ext(v5, __ T16B, v5, v5, 8); 3662 __ ext(v7, __ T16B, vi1, vi2, 8); 3663 __ addv(vi3, __ T2D, vi3, v5); 3664 if (dr < 32) { 3665 __ ext(v5, __ T16B, vin3, vin4, 8); 3666 __ sha512su0(vin0, __ T2D, vin1); 3667 } 3668 __ sha512h(vi3, __ T2D, v6, v7); 3669 if (dr < 32) { 3670 __ sha512su1(vin0, __ T2D, vin2, v5); 3671 } 3672 __ addv(vi4, __ T2D, vi1, vi3); 3673 __ sha512h2(vi3, __ T2D, vi1, vi0); 3674 } 3675 3676 // Arguments: 3677 // 3678 // Inputs: 3679 // c_rarg0 - byte[] source+offset 3680 // c_rarg1 - int[] SHA.state 3681 // c_rarg2 - int offset 3682 // c_rarg3 - int limit 3683 // 3684 address generate_sha512_implCompress(bool multi_block, const char *name) { 3685 static const uint64_t round_consts[80] = { 3686 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3687 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3688 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3689 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3690 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3691 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3692 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3693 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3694 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3695 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3696 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3697 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3698 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3699 0x92722C851482353BL, 
0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3700 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3701 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3702 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3703 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3704 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3705 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3706 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3707 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3708 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3709 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3710 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3711 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3712 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3713 }; 3714 3715 __ align(CodeEntryAlignment); 3716 StubCodeMark mark(this, "StubRoutines", name); 3717 address start = __ pc(); 3718 3719 Register buf = c_rarg0; 3720 Register state = c_rarg1; 3721 Register ofs = c_rarg2; 3722 Register limit = c_rarg3; 3723 3724 __ stpd(v8, v9, __ pre(sp, -64)); 3725 __ stpd(v10, v11, Address(sp, 16)); 3726 __ stpd(v12, v13, Address(sp, 32)); 3727 __ stpd(v14, v15, Address(sp, 48)); 3728 3729 Label sha512_loop; 3730 3731 // load state 3732 __ ld1(v8, v9, v10, v11, __ T2D, state); 3733 3734 // load first 4 round constants 3735 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3736 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3737 3738 __ BIND(sha512_loop); 3739 // load 128B of data into v12..v19 3740 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3741 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3742 __ rev64(v12, __ T16B, v12); 3743 __ rev64(v13, __ T16B, v13); 3744 __ rev64(v14, __ T16B, v14); 3745 __ rev64(v15, __ T16B, v15); 3746 __ rev64(v16, __ T16B, v16); 3747 __ rev64(v17, __ T16B, v17); 3748 __ rev64(v18, __ T16B, v18); 3749 __ rev64(v19, __ T16B, v19); 3750 3751 __ mov(rscratch2, rscratch1); 3752 3753 __ mov(v0, __ T16B, v8); 3754 __ mov(v1, __ T16B, v9); 3755 __ mov(v2, __ T16B, v10); 3756 __ mov(v3, __ T16B, v11); 3757 3758 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 3759 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 3760 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 3761 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 3762 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 3763 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 3764 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 3765 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 3766 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 3767 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 3768 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 3769 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 3770 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 3771 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 3772 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 3773 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 3774 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, 
v13, v19, v16, v17); 3775 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 3776 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 3777 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 3778 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 3779 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 3780 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 3781 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 3782 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 3783 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 3784 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 3785 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 3786 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 3787 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 3788 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 3789 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 3790 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 3791 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 3792 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 3793 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 3794 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 3795 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 3796 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 3797 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 3798 3799 __ addv(v8, __ T2D, v8, v0); 3800 __ addv(v9, __ T2D, v9, v1); 3801 __ addv(v10, __ T2D, v10, v2); 3802 __ addv(v11, __ T2D, v11, v3); 3803 3804 if (multi_block) { 3805 __ add(ofs, ofs, 128); 3806 __ cmp(ofs, limit); 3807 __ br(Assembler::LE, sha512_loop); 3808 __ mov(c_rarg0, ofs); // return ofs 3809 } 3810 3811 __ st1(v8, v9, v10, v11, __ T2D, state); 3812 3813 __ ldpd(v14, v15, Address(sp, 48)); 3814 __ ldpd(v12, v13, Address(sp, 32)); 3815 __ ldpd(v10, v11, Address(sp, 16)); 3816 __ ldpd(v8, v9, __ post(sp, 64)); 3817 3818 __ ret(lr); 3819 3820 return start; 3821 } 3822 3823 // Arguments: 3824 // 3825 // Inputs: 3826 // c_rarg0 - byte[] source+offset 3827 // c_rarg1 - byte[] SHA.state 3828 // c_rarg2 - int block_size 3829 // c_rarg3 - int offset 3830 // c_rarg4 - int limit 3831 // 3832 address generate_sha3_implCompress(bool multi_block, const char *name) { 3833 static const uint64_t round_consts[24] = { 3834 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 3835 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 3836 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 3837 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 3838 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 3839 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 3840 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 3841 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 3842 }; 3843 3844 __ align(CodeEntryAlignment); 3845 StubCodeMark mark(this, "StubRoutines", name); 3846 address start = __ pc(); 3847 3848 Register buf = c_rarg0; 3849 Register state = c_rarg1; 3850 Register block_size = c_rarg2; 3851 Register ofs = c_rarg3; 3852 Register 
limit = c_rarg4;

    Label sha3_loop, rounds24_loop;
    Label sha3_512_or_sha3_384, shake128;

    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    // load state
    __ add(rscratch1, state, 32);
    __ ld1(v0, v1, v2, v3, __ T1D, state);
    __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
    __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
    __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
    __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
    __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
    __ ld1(v24, __ T1D, rscratch1);

    __ BIND(sha3_loop);

    // 24 keccak rounds
    __ movw(rscratch2, 24);

    // load round_constants base
    __ lea(rscratch1, ExternalAddress((address) round_consts));

    // load input
    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
    __ eor(v0, __ T8B, v0, v25);
    __ eor(v1, __ T8B, v1, v26);
    __ eor(v2, __ T8B, v2, v27);
    __ eor(v3, __ T8B, v3, v28);
    __ eor(v4, __ T8B, v4, v29);
    __ eor(v5, __ T8B, v5, v30);
    __ eor(v6, __ T8B, v6, v31);

    // block_size == 72, SHA3-512; block_size == 104, SHA3-384
    __ tbz(block_size, 7, sha3_512_or_sha3_384);

    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
    __ eor(v7, __ T8B, v7, v25);
    __ eor(v8, __ T8B, v8, v26);
    __ eor(v9, __ T8B, v9, v27);
    __ eor(v10, __ T8B, v10, v28);
    __ eor(v11, __ T8B, v11, v29);
    __ eor(v12, __ T8B, v12, v30);
    __ eor(v13, __ T8B, v13, v31);

    __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
    __ eor(v14, __ T8B, v14, v25);
    __ eor(v15, __ T8B, v15, v26);
    __ eor(v16, __ T8B, v16, v27);

    // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
    __ andw(c_rarg5, block_size, 48);
    __ cbzw(c_rarg5, rounds24_loop);

    __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
    __ ldrd(v28, __ post(buf, 8));
    __ eor(v17, __ T8B, v17, v28);
    __ b(rounds24_loop);

    __ BIND(shake128);
    __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
    __ eor(v17, __ T8B, v17, v28);
    __ eor(v18, __ T8B, v18, v29);
    __ eor(v19, __ T8B, v19, v30);
    __ eor(v20, __ T8B, v20, v31);
    __ b(rounds24_loop); // block_size == 168, SHAKE128

    __ BIND(sha3_512_or_sha3_384);
    __ ld1(v25, v26, __ T8B, __ post(buf, 16));
    __ eor(v7, __ T8B, v7, v25);
    __ eor(v8, __ T8B, v8, v26);
    __ tbz(block_size, 5, rounds24_loop); // SHA3-512

    // SHA3-384
    __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
    __ eor(v9, __ T8B, v9, v27);
    __ eor(v10, __ T8B, v10, v28);
    __ eor(v11, __ T8B, v11, v29);
    __ eor(v12, __ T8B, v12, v30);

    __ BIND(rounds24_loop);
    __ subw(rscratch2, rscratch2, 1);

    __ eor3(v29, __ T16B, v4, v9, v14);
    __ eor3(v26, __ T16B, v1, v6, v11);
    __ eor3(v28, __ T16B, v3, v8, v13);
    __ eor3(v25, __ T16B, v0, v5, v10);
    __ eor3(v27, __ T16B, v2, v7, v12);
    __ eor3(v29, __ T16B, v29, v19, v24);
    __ eor3(v26, __ T16B, v26, v16, v21);
    __ eor3(v28, __ T16B, v28, v18, v23);
    __ eor3(v25, __ T16B, v25, v15, v20);
    __ eor3(v27, __ T16B, v27, v17, v22);

    __ rax1(v30, __ T2D, v29, v26);
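    // Annotation: the five rax1 instructions (the one just above and the four
    // that follow) turn the column parities into the Keccak theta offsets.
    // rax1(d, n, m) computes n ^ rol64(m, 1); with v25/v26/v27/v28/v29 holding
    // the parities of columns 0/1/2/3/4, this yields
    //   D[x] = C[x-1] ^ rol64(C[x+1], 1)
    // (v30 = D0, v25 = D1, v26 = D2, v27 = D3, v28 = D4). The xar steps below
    // then fold each D[x] into its column's lanes together with the rho rotations.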
3955 __ rax1(v26, __ T2D, v26, v28); 3956 __ rax1(v28, __ T2D, v28, v25); 3957 __ rax1(v25, __ T2D, v25, v27); 3958 __ rax1(v27, __ T2D, v27, v29); 3959 3960 __ eor(v0, __ T16B, v0, v30); 3961 __ xar(v29, __ T2D, v1, v25, (64 - 1)); 3962 __ xar(v1, __ T2D, v6, v25, (64 - 44)); 3963 __ xar(v6, __ T2D, v9, v28, (64 - 20)); 3964 __ xar(v9, __ T2D, v22, v26, (64 - 61)); 3965 __ xar(v22, __ T2D, v14, v28, (64 - 39)); 3966 __ xar(v14, __ T2D, v20, v30, (64 - 18)); 3967 __ xar(v31, __ T2D, v2, v26, (64 - 62)); 3968 __ xar(v2, __ T2D, v12, v26, (64 - 43)); 3969 __ xar(v12, __ T2D, v13, v27, (64 - 25)); 3970 __ xar(v13, __ T2D, v19, v28, (64 - 8)); 3971 __ xar(v19, __ T2D, v23, v27, (64 - 56)); 3972 __ xar(v23, __ T2D, v15, v30, (64 - 41)); 3973 __ xar(v15, __ T2D, v4, v28, (64 - 27)); 3974 __ xar(v28, __ T2D, v24, v28, (64 - 14)); 3975 __ xar(v24, __ T2D, v21, v25, (64 - 2)); 3976 __ xar(v8, __ T2D, v8, v27, (64 - 55)); 3977 __ xar(v4, __ T2D, v16, v25, (64 - 45)); 3978 __ xar(v16, __ T2D, v5, v30, (64 - 36)); 3979 __ xar(v5, __ T2D, v3, v27, (64 - 28)); 3980 __ xar(v27, __ T2D, v18, v27, (64 - 21)); 3981 __ xar(v3, __ T2D, v17, v26, (64 - 15)); 3982 __ xar(v25, __ T2D, v11, v25, (64 - 10)); 3983 __ xar(v26, __ T2D, v7, v26, (64 - 6)); 3984 __ xar(v30, __ T2D, v10, v30, (64 - 3)); 3985 3986 __ bcax(v20, __ T16B, v31, v22, v8); 3987 __ bcax(v21, __ T16B, v8, v23, v22); 3988 __ bcax(v22, __ T16B, v22, v24, v23); 3989 __ bcax(v23, __ T16B, v23, v31, v24); 3990 __ bcax(v24, __ T16B, v24, v8, v31); 3991 3992 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); 3993 3994 __ bcax(v17, __ T16B, v25, v19, v3); 3995 __ bcax(v18, __ T16B, v3, v15, v19); 3996 __ bcax(v19, __ T16B, v19, v16, v15); 3997 __ bcax(v15, __ T16B, v15, v25, v16); 3998 __ bcax(v16, __ T16B, v16, v3, v25); 3999 4000 __ bcax(v10, __ T16B, v29, v12, v26); 4001 __ bcax(v11, __ T16B, v26, v13, v12); 4002 __ bcax(v12, __ T16B, v12, v14, v13); 4003 __ bcax(v13, __ T16B, v13, v29, v14); 4004 __ bcax(v14, __ T16B, v14, v26, v29); 4005 4006 __ bcax(v7, __ T16B, v30, v9, v4); 4007 __ bcax(v8, __ T16B, v4, v5, v9); 4008 __ bcax(v9, __ T16B, v9, v6, v5); 4009 __ bcax(v5, __ T16B, v5, v30, v6); 4010 __ bcax(v6, __ T16B, v6, v4, v30); 4011 4012 __ bcax(v3, __ T16B, v27, v0, v28); 4013 __ bcax(v4, __ T16B, v28, v1, v0); 4014 __ bcax(v0, __ T16B, v0, v2, v1); 4015 __ bcax(v1, __ T16B, v1, v27, v2); 4016 __ bcax(v2, __ T16B, v2, v28, v27); 4017 4018 __ eor(v0, __ T16B, v0, v31); 4019 4020 __ cbnzw(rscratch2, rounds24_loop); 4021 4022 if (multi_block) { 4023 __ add(ofs, ofs, block_size); 4024 __ cmp(ofs, limit); 4025 __ br(Assembler::LE, sha3_loop); 4026 __ mov(c_rarg0, ofs); // return ofs 4027 } 4028 4029 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4030 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4031 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4032 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4033 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4034 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4035 __ st1(v24, __ T1D, state); 4036 4037 __ ldpd(v14, v15, Address(sp, 48)); 4038 __ ldpd(v12, v13, Address(sp, 32)); 4039 __ ldpd(v10, v11, Address(sp, 16)); 4040 __ ldpd(v8, v9, __ post(sp, 64)); 4041 4042 __ ret(lr); 4043 4044 return start; 4045 } 4046 4047 /** 4048 * Arguments: 4049 * 4050 * Inputs: 4051 * c_rarg0 - int crc 4052 * c_rarg1 - byte* buf 4053 * c_rarg2 - int length 4054 * 4055 * Output: 4056 * rax - int crc result 4057 */ 4058 address generate_updateBytesCRC32() { 4059 assert(UseCRC32Intrinsics, 
"what are we doing here?"); 4060 4061 __ align(CodeEntryAlignment); 4062 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4063 4064 address start = __ pc(); 4065 4066 const Register crc = c_rarg0; // crc 4067 const Register buf = c_rarg1; // source java byte array address 4068 const Register len = c_rarg2; // length 4069 const Register table0 = c_rarg3; // crc_table address 4070 const Register table1 = c_rarg4; 4071 const Register table2 = c_rarg5; 4072 const Register table3 = c_rarg6; 4073 const Register tmp3 = c_rarg7; 4074 4075 BLOCK_COMMENT("Entry:"); 4076 __ enter(); // required for proper stackwalking of RuntimeStub frame 4077 4078 __ kernel_crc32(crc, buf, len, 4079 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4080 4081 __ leave(); // required for proper stackwalking of RuntimeStub frame 4082 __ ret(lr); 4083 4084 return start; 4085 } 4086 4087 // ChaCha20 block function. This version parallelizes by loading 4088 // individual 32-bit state elements into vectors for four blocks 4089 // (e.g. all four blocks' worth of state[0] in one register, etc.) 4090 // 4091 // state (int[16]) = c_rarg0 4092 // keystream (byte[1024]) = c_rarg1 4093 // return - number of bytes of keystream (always 256) 4094 address generate_chacha20Block_blockpar() { 4095 Label L_twoRounds, L_cc20_const; 4096 // The constant data is broken into two 128-bit segments to be loaded 4097 // onto FloatRegisters. The first 128 bits are a counter add overlay 4098 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4099 // The second 128-bits is a table constant used for 8-bit left rotations. 4100 __ BIND(L_cc20_const); 4101 __ emit_int64(0x0000000100000000UL); 4102 __ emit_int64(0x0000000300000002UL); 4103 __ emit_int64(0x0605040702010003UL); 4104 __ emit_int64(0x0E0D0C0F0A09080BUL); 4105 4106 __ align(CodeEntryAlignment); 4107 StubCodeMark mark(this, "StubRoutines", "chacha20Block"); 4108 address start = __ pc(); 4109 __ enter(); 4110 4111 int i, j; 4112 const Register state = c_rarg0; 4113 const Register keystream = c_rarg1; 4114 const Register loopCtr = r10; 4115 const Register tmpAddr = r11; 4116 4117 const FloatRegister stateFirst = v0; 4118 const FloatRegister stateSecond = v1; 4119 const FloatRegister stateThird = v2; 4120 const FloatRegister stateFourth = v3; 4121 const FloatRegister origCtrState = v28; 4122 const FloatRegister scratch = v29; 4123 const FloatRegister lrot8Tbl = v30; 4124 4125 // Organize SIMD registers in an array that facilitates 4126 // putting repetitive opcodes into loop structures. It is 4127 // important that each grouping of 4 registers is monotonically 4128 // increasing to support the requirements of multi-register 4129 // instructions (e.g. ld4r, st4, etc.) 4130 const FloatRegister workSt[16] = { 4131 v4, v5, v6, v7, v16, v17, v18, v19, 4132 v20, v21, v22, v23, v24, v25, v26, v27 4133 }; 4134 4135 // Load from memory and interlace across 16 SIMD registers, 4136 // With each word from memory being broadcast to all lanes of 4137 // each successive SIMD register. 4138 // Addr(0) -> All lanes in workSt[i] 4139 // Addr(4) -> All lanes workSt[i + 1], etc. 4140 __ mov(tmpAddr, state); 4141 for (i = 0; i < 16; i += 4) { 4142 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4143 __ post(tmpAddr, 16)); 4144 } 4145 4146 // Pull in constant data. The first 16 bytes are the add overlay 4147 // which is applied to the vector holding the counter (state[12]). 
4148 // The second 16 bytes is the index register for the 8-bit left 4149 // rotation tbl instruction. 4150 __ adr(tmpAddr, L_cc20_const); 4151 __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr)); 4152 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); 4153 4154 // Set up the 10 iteration loop and perform all 8 quarter round ops 4155 __ mov(loopCtr, 10); 4156 __ BIND(L_twoRounds); 4157 4158 __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12], 4159 scratch, lrot8Tbl); 4160 __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13], 4161 scratch, lrot8Tbl); 4162 __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14], 4163 scratch, lrot8Tbl); 4164 __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15], 4165 scratch, lrot8Tbl); 4166 4167 __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15], 4168 scratch, lrot8Tbl); 4169 __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12], 4170 scratch, lrot8Tbl); 4171 __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13], 4172 scratch, lrot8Tbl); 4173 __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14], 4174 scratch, lrot8Tbl); 4175 4176 // Decrement and iterate 4177 __ sub(loopCtr, loopCtr, 1); 4178 __ cbnz(loopCtr, L_twoRounds); 4179 4180 __ mov(tmpAddr, state); 4181 4182 // Add the starting state back to the post-loop keystream 4183 // state. We read/interlace the state array from memory into 4184 // 4 registers similar to what we did in the beginning. Then 4185 // add the counter overlay onto workSt[12] at the end. 4186 for (i = 0; i < 16; i += 4) { 4187 __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S, 4188 __ post(tmpAddr, 16)); 4189 __ addv(workSt[i], __ T4S, workSt[i], stateFirst); 4190 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond); 4191 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird); 4192 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth); 4193 } 4194 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask 4195 4196 // Write to key stream, storing the same element out of workSt[0..15] 4197 // to consecutive 4-byte offsets in the key stream buffer, then repeating 4198 // for the next element position. 
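    // In effect, lane i of workSt[0..15] becomes the i-th 64-byte block of the
    // keystream. A scalar sketch of the layout produced below (illustration
    // only; lane() is shorthand for extracting one 32-bit lane):
    //
    //   for (int blk = 0; blk < 4; blk++)      // which of the four blocks
    //     for (int w = 0; w < 16; w++)         // which state word
    //       out32[blk * 16 + w] = lane(workSt[w], blk);
    //
    // Each st4 below stores one lane of four consecutive work registers, so the
    // destination pointer advances strictly sequentially.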
4199 for (i = 0; i < 4; i++) { 4200 for (j = 0; j < 16; j += 4) { 4201 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4202 __ post(keystream, 16)); 4203 } 4204 } 4205 4206 __ mov(r0, 256); // Return length of output keystream 4207 __ leave(); 4208 __ ret(lr); 4209 4210 return start; 4211 } 4212 4213 /** 4214 * Arguments: 4215 * 4216 * Inputs: 4217 * c_rarg0 - int crc 4218 * c_rarg1 - byte* buf 4219 * c_rarg2 - int length 4220 * c_rarg3 - int* table 4221 * 4222 * Output: 4223 * r0 - int crc result 4224 */ 4225 address generate_updateBytesCRC32C() { 4226 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4227 4228 __ align(CodeEntryAlignment); 4229 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4230 4231 address start = __ pc(); 4232 4233 const Register crc = c_rarg0; // crc 4234 const Register buf = c_rarg1; // source java byte array address 4235 const Register len = c_rarg2; // length 4236 const Register table0 = c_rarg3; // crc_table address 4237 const Register table1 = c_rarg4; 4238 const Register table2 = c_rarg5; 4239 const Register table3 = c_rarg6; 4240 const Register tmp3 = c_rarg7; 4241 4242 BLOCK_COMMENT("Entry:"); 4243 __ enter(); // required for proper stackwalking of RuntimeStub frame 4244 4245 __ kernel_crc32c(crc, buf, len, 4246 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4247 4248 __ leave(); // required for proper stackwalking of RuntimeStub frame 4249 __ ret(lr); 4250 4251 return start; 4252 } 4253 4254 /*** 4255 * Arguments: 4256 * 4257 * Inputs: 4258 * c_rarg0 - int adler 4259 * c_rarg1 - byte* buff 4260 * c_rarg2 - int len 4261 * 4262 * Output: 4263 * c_rarg0 - int adler result 4264 */ 4265 address generate_updateBytesAdler32() { 4266 __ align(CodeEntryAlignment); 4267 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4268 address start = __ pc(); 4269 4270 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4271 4272 // Aliases 4273 Register adler = c_rarg0; 4274 Register s1 = c_rarg0; 4275 Register s2 = c_rarg3; 4276 Register buff = c_rarg1; 4277 Register len = c_rarg2; 4278 Register nmax = r4; 4279 Register base = r5; 4280 Register count = r6; 4281 Register temp0 = rscratch1; 4282 Register temp1 = rscratch2; 4283 FloatRegister vbytes = v0; 4284 FloatRegister vs1acc = v1; 4285 FloatRegister vs2acc = v2; 4286 FloatRegister vtable = v3; 4287 4288 // Max number of bytes we can process before having to take the mod 4289 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4290 uint64_t BASE = 0xfff1; 4291 uint64_t NMAX = 0x15B0; 4292 4293 __ mov(base, BASE); 4294 __ mov(nmax, NMAX); 4295 4296 // Load accumulation coefficients for the upper 16 bits 4297 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4298 __ ld1(vtable, __ T16B, Address(temp0)); 4299 4300 // s1 is initialized to the lower 16 bits of adler 4301 // s2 is initialized to the upper 16 bits of adler 4302 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4303 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4304 4305 // The pipelined loop needs at least 16 elements for 1 iteration 4306 // It does check this, but it is more effective to skip to the cleanup loop 4307 __ cmp(len, (u1)16); 4308 __ br(Assembler::HS, L_nmax); 4309 __ cbz(len, L_combine); 4310 4311 __ bind(L_simple_by1_loop); 4312 __ ldrb(temp0, Address(__ post(buff, 1))); 4313 __ add(s1, s1, temp0); 4314 __ add(s2, s2, s1); 4315 __ subs(len, len, 
1); 4316 __ br(Assembler::HI, L_simple_by1_loop); 4317 4318 // s1 = s1 % BASE 4319 __ subs(temp0, s1, base); 4320 __ csel(s1, temp0, s1, Assembler::HS); 4321 4322 // s2 = s2 % BASE 4323 __ lsr(temp0, s2, 16); 4324 __ lsl(temp1, temp0, 4); 4325 __ sub(temp1, temp1, temp0); 4326 __ add(s2, temp1, s2, ext::uxth); 4327 4328 __ subs(temp0, s2, base); 4329 __ csel(s2, temp0, s2, Assembler::HS); 4330 4331 __ b(L_combine); 4332 4333 __ bind(L_nmax); 4334 __ subs(len, len, nmax); 4335 __ sub(count, nmax, 16); 4336 __ br(Assembler::LO, L_by16); 4337 4338 __ bind(L_nmax_loop); 4339 4340 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4341 vbytes, vs1acc, vs2acc, vtable); 4342 4343 __ subs(count, count, 16); 4344 __ br(Assembler::HS, L_nmax_loop); 4345 4346 // s1 = s1 % BASE 4347 __ lsr(temp0, s1, 16); 4348 __ lsl(temp1, temp0, 4); 4349 __ sub(temp1, temp1, temp0); 4350 __ add(temp1, temp1, s1, ext::uxth); 4351 4352 __ lsr(temp0, temp1, 16); 4353 __ lsl(s1, temp0, 4); 4354 __ sub(s1, s1, temp0); 4355 __ add(s1, s1, temp1, ext:: uxth); 4356 4357 __ subs(temp0, s1, base); 4358 __ csel(s1, temp0, s1, Assembler::HS); 4359 4360 // s2 = s2 % BASE 4361 __ lsr(temp0, s2, 16); 4362 __ lsl(temp1, temp0, 4); 4363 __ sub(temp1, temp1, temp0); 4364 __ add(temp1, temp1, s2, ext::uxth); 4365 4366 __ lsr(temp0, temp1, 16); 4367 __ lsl(s2, temp0, 4); 4368 __ sub(s2, s2, temp0); 4369 __ add(s2, s2, temp1, ext:: uxth); 4370 4371 __ subs(temp0, s2, base); 4372 __ csel(s2, temp0, s2, Assembler::HS); 4373 4374 __ subs(len, len, nmax); 4375 __ sub(count, nmax, 16); 4376 __ br(Assembler::HS, L_nmax_loop); 4377 4378 __ bind(L_by16); 4379 __ adds(len, len, count); 4380 __ br(Assembler::LO, L_by1); 4381 4382 __ bind(L_by16_loop); 4383 4384 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4385 vbytes, vs1acc, vs2acc, vtable); 4386 4387 __ subs(len, len, 16); 4388 __ br(Assembler::HS, L_by16_loop); 4389 4390 __ bind(L_by1); 4391 __ adds(len, len, 15); 4392 __ br(Assembler::LO, L_do_mod); 4393 4394 __ bind(L_by1_loop); 4395 __ ldrb(temp0, Address(__ post(buff, 1))); 4396 __ add(s1, temp0, s1); 4397 __ add(s2, s2, s1); 4398 __ subs(len, len, 1); 4399 __ br(Assembler::HS, L_by1_loop); 4400 4401 __ bind(L_do_mod); 4402 // s1 = s1 % BASE 4403 __ lsr(temp0, s1, 16); 4404 __ lsl(temp1, temp0, 4); 4405 __ sub(temp1, temp1, temp0); 4406 __ add(temp1, temp1, s1, ext::uxth); 4407 4408 __ lsr(temp0, temp1, 16); 4409 __ lsl(s1, temp0, 4); 4410 __ sub(s1, s1, temp0); 4411 __ add(s1, s1, temp1, ext:: uxth); 4412 4413 __ subs(temp0, s1, base); 4414 __ csel(s1, temp0, s1, Assembler::HS); 4415 4416 // s2 = s2 % BASE 4417 __ lsr(temp0, s2, 16); 4418 __ lsl(temp1, temp0, 4); 4419 __ sub(temp1, temp1, temp0); 4420 __ add(temp1, temp1, s2, ext::uxth); 4421 4422 __ lsr(temp0, temp1, 16); 4423 __ lsl(s2, temp0, 4); 4424 __ sub(s2, s2, temp0); 4425 __ add(s2, s2, temp1, ext:: uxth); 4426 4427 __ subs(temp0, s2, base); 4428 __ csel(s2, temp0, s2, Assembler::HS); 4429 4430 // Combine lower bits and higher bits 4431 __ bind(L_combine); 4432 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 4433 4434 __ ret(lr); 4435 4436 return start; 4437 } 4438 4439 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 4440 Register temp0, Register temp1, FloatRegister vbytes, 4441 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 4442 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 
4443 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 4444 // In non-vectorized code, we update s1 and s2 as: 4445 // s1 <- s1 + b1 4446 // s2 <- s2 + s1 4447 // s1 <- s1 + b2 4448 // s2 <- s2 + b1 4449 // ... 4450 // s1 <- s1 + b16 4451 // s2 <- s2 + s1 4452 // Putting above assignments together, we have: 4453 // s1_new = s1 + b1 + b2 + ... + b16 4454 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 4455 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 4456 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 4457 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 4458 4459 // s2 = s2 + s1 * 16 4460 __ add(s2, s2, s1, Assembler::LSL, 4); 4461 4462 // vs1acc = b1 + b2 + b3 + ... + b16 4463 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1) 4464 __ umullv(vs2acc, __ T8B, vtable, vbytes); 4465 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 4466 __ uaddlv(vs1acc, __ T16B, vbytes); 4467 __ uaddlv(vs2acc, __ T8H, vs2acc); 4468 4469 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 4470 __ fmovd(temp0, vs1acc); 4471 __ fmovd(temp1, vs2acc); 4472 __ add(s1, s1, temp0); 4473 __ add(s2, s2, temp1); 4474 } 4475 4476 /** 4477 * Arguments: 4478 * 4479 * Input: 4480 * c_rarg0 - x address 4481 * c_rarg1 - x length 4482 * c_rarg2 - y address 4483 * c_rarg3 - y length 4484 * c_rarg4 - z address 4485 * c_rarg5 - z length 4486 */ 4487 address generate_multiplyToLen() { 4488 __ align(CodeEntryAlignment); 4489 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 4490 4491 address start = __ pc(); 4492 const Register x = r0; 4493 const Register xlen = r1; 4494 const Register y = r2; 4495 const Register ylen = r3; 4496 const Register z = r4; 4497 const Register zlen = r5; 4498 4499 const Register tmp1 = r10; 4500 const Register tmp2 = r11; 4501 const Register tmp3 = r12; 4502 const Register tmp4 = r13; 4503 const Register tmp5 = r14; 4504 const Register tmp6 = r15; 4505 const Register tmp7 = r16; 4506 4507 BLOCK_COMMENT("Entry:"); 4508 __ enter(); // required for proper stackwalking of RuntimeStub frame 4509 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4510 __ leave(); // required for proper stackwalking of RuntimeStub frame 4511 __ ret(lr); 4512 4513 return start; 4514 } 4515 4516 address generate_squareToLen() { 4517 // squareToLen algorithm for sizes 1..127 described in java code works 4518 // faster than multiply_to_len on some CPUs and slower on others, but 4519 // multiply_to_len shows a bit better overall results 4520 __ align(CodeEntryAlignment); 4521 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 4522 address start = __ pc(); 4523 4524 const Register x = r0; 4525 const Register xlen = r1; 4526 const Register z = r2; 4527 const Register zlen = r3; 4528 const Register y = r4; // == x 4529 const Register ylen = r5; // == xlen 4530 4531 const Register tmp1 = r10; 4532 const Register tmp2 = r11; 4533 const Register tmp3 = r12; 4534 const Register tmp4 = r13; 4535 const Register tmp5 = r14; 4536 const Register tmp6 = r15; 4537 const Register tmp7 = r16; 4538 4539 RegSet spilled_regs = RegSet::of(y, ylen); 4540 BLOCK_COMMENT("Entry:"); 4541 __ enter(); 4542 __ push(spilled_regs, sp); 4543 __ mov(y, x); 4544 __ mov(ylen, xlen); 4545 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4546 __ pop(spilled_regs, sp); 4547 __ leave(); 4548 __ ret(lr); 4549 return start; 4550 } 4551 4552 address generate_mulAdd() { 4553 __ align(CodeEntryAlignment); 4554 
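    // Intrinsic stub behind java.math.BigInteger::implMulAdd: roughly,
    // out += in * k over 'len' 32-bit limbs, with the final carry returned as
    // the stub's int result (see BigInteger.implMulAdd for the exact indexing
    // through 'offset'). The multiply-accumulate loop itself is emitted by
    // MacroAssembler::mul_add; this stub only sets up the frame around it.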
StubCodeMark mark(this, "StubRoutines", "mulAdd"); 4555 4556 address start = __ pc(); 4557 4558 const Register out = r0; 4559 const Register in = r1; 4560 const Register offset = r2; 4561 const Register len = r3; 4562 const Register k = r4; 4563 4564 BLOCK_COMMENT("Entry:"); 4565 __ enter(); 4566 __ mul_add(out, in, offset, len, k); 4567 __ leave(); 4568 __ ret(lr); 4569 4570 return start; 4571 } 4572 4573 // Arguments: 4574 // 4575 // Input: 4576 // c_rarg0 - newArr address 4577 // c_rarg1 - oldArr address 4578 // c_rarg2 - newIdx 4579 // c_rarg3 - shiftCount 4580 // c_rarg4 - numIter 4581 // 4582 address generate_bigIntegerRightShift() { 4583 __ align(CodeEntryAlignment); 4584 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 4585 address start = __ pc(); 4586 4587 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4588 4589 Register newArr = c_rarg0; 4590 Register oldArr = c_rarg1; 4591 Register newIdx = c_rarg2; 4592 Register shiftCount = c_rarg3; 4593 Register numIter = c_rarg4; 4594 Register idx = numIter; 4595 4596 Register newArrCur = rscratch1; 4597 Register shiftRevCount = rscratch2; 4598 Register oldArrCur = r13; 4599 Register oldArrNext = r14; 4600 4601 FloatRegister oldElem0 = v0; 4602 FloatRegister oldElem1 = v1; 4603 FloatRegister newElem = v2; 4604 FloatRegister shiftVCount = v3; 4605 FloatRegister shiftVRevCount = v4; 4606 4607 __ cbz(idx, Exit); 4608 4609 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4610 4611 // left shift count 4612 __ movw(shiftRevCount, 32); 4613 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4614 4615 // numIter too small to allow a 4-words SIMD loop, rolling back 4616 __ cmp(numIter, (u1)4); 4617 __ br(Assembler::LT, ShiftThree); 4618 4619 __ dup(shiftVCount, __ T4S, shiftCount); 4620 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4621 __ negr(shiftVCount, __ T4S, shiftVCount); 4622 4623 __ BIND(ShiftSIMDLoop); 4624 4625 // Calculate the load addresses 4626 __ sub(idx, idx, 4); 4627 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4628 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4629 __ add(oldArrCur, oldArrNext, 4); 4630 4631 // Load 4 words and process 4632 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4633 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4634 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4635 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4636 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4637 __ st1(newElem, __ T4S, Address(newArrCur)); 4638 4639 __ cmp(idx, (u1)4); 4640 __ br(Assembler::LT, ShiftTwoLoop); 4641 __ b(ShiftSIMDLoop); 4642 4643 __ BIND(ShiftTwoLoop); 4644 __ cbz(idx, Exit); 4645 __ cmp(idx, (u1)1); 4646 __ br(Assembler::EQ, ShiftOne); 4647 4648 // Calculate the load addresses 4649 __ sub(idx, idx, 2); 4650 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4651 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4652 __ add(oldArrCur, oldArrNext, 4); 4653 4654 // Load 2 words and process 4655 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 4656 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4657 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4658 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4659 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4660 __ st1(newElem, __ T2S, Address(newArrCur)); 4661 __ b(ShiftTwoLoop); 4662 4663 __ BIND(ShiftThree); 4664 __ tbz(idx, 1, ShiftOne); 4665 __ tbz(idx, 0, ShiftTwo); 4666 __ ldrw(r10, Address(oldArr, 12)); 4667 __ ldrw(r11, Address(oldArr, 8)); 4668 __ lsrvw(r10, r10, shiftCount); 4669 __ lslvw(r11, 
r11, shiftRevCount); 4670 __ orrw(r12, r10, r11); 4671 __ strw(r12, Address(newArr, 8)); 4672 4673 __ BIND(ShiftTwo); 4674 __ ldrw(r10, Address(oldArr, 8)); 4675 __ ldrw(r11, Address(oldArr, 4)); 4676 __ lsrvw(r10, r10, shiftCount); 4677 __ lslvw(r11, r11, shiftRevCount); 4678 __ orrw(r12, r10, r11); 4679 __ strw(r12, Address(newArr, 4)); 4680 4681 __ BIND(ShiftOne); 4682 __ ldrw(r10, Address(oldArr, 4)); 4683 __ ldrw(r11, Address(oldArr)); 4684 __ lsrvw(r10, r10, shiftCount); 4685 __ lslvw(r11, r11, shiftRevCount); 4686 __ orrw(r12, r10, r11); 4687 __ strw(r12, Address(newArr)); 4688 4689 __ BIND(Exit); 4690 __ ret(lr); 4691 4692 return start; 4693 } 4694 4695 // Arguments: 4696 // 4697 // Input: 4698 // c_rarg0 - newArr address 4699 // c_rarg1 - oldArr address 4700 // c_rarg2 - newIdx 4701 // c_rarg3 - shiftCount 4702 // c_rarg4 - numIter 4703 // 4704 address generate_bigIntegerLeftShift() { 4705 __ align(CodeEntryAlignment); 4706 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 4707 address start = __ pc(); 4708 4709 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4710 4711 Register newArr = c_rarg0; 4712 Register oldArr = c_rarg1; 4713 Register newIdx = c_rarg2; 4714 Register shiftCount = c_rarg3; 4715 Register numIter = c_rarg4; 4716 4717 Register shiftRevCount = rscratch1; 4718 Register oldArrNext = rscratch2; 4719 4720 FloatRegister oldElem0 = v0; 4721 FloatRegister oldElem1 = v1; 4722 FloatRegister newElem = v2; 4723 FloatRegister shiftVCount = v3; 4724 FloatRegister shiftVRevCount = v4; 4725 4726 __ cbz(numIter, Exit); 4727 4728 __ add(oldArrNext, oldArr, 4); 4729 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4730 4731 // right shift count 4732 __ movw(shiftRevCount, 32); 4733 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4734 4735 // numIter too small to allow a 4-words SIMD loop, rolling back 4736 __ cmp(numIter, (u1)4); 4737 __ br(Assembler::LT, ShiftThree); 4738 4739 __ dup(shiftVCount, __ T4S, shiftCount); 4740 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4741 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 4742 4743 __ BIND(ShiftSIMDLoop); 4744 4745 // load 4 words and process 4746 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 4747 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 4748 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4749 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4750 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4751 __ st1(newElem, __ T4S, __ post(newArr, 16)); 4752 __ sub(numIter, numIter, 4); 4753 4754 __ cmp(numIter, (u1)4); 4755 __ br(Assembler::LT, ShiftTwoLoop); 4756 __ b(ShiftSIMDLoop); 4757 4758 __ BIND(ShiftTwoLoop); 4759 __ cbz(numIter, Exit); 4760 __ cmp(numIter, (u1)1); 4761 __ br(Assembler::EQ, ShiftOne); 4762 4763 // load 2 words and process 4764 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 4765 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 4766 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4767 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4768 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4769 __ st1(newElem, __ T2S, __ post(newArr, 8)); 4770 __ sub(numIter, numIter, 2); 4771 __ b(ShiftTwoLoop); 4772 4773 __ BIND(ShiftThree); 4774 __ ldrw(r10, __ post(oldArr, 4)); 4775 __ ldrw(r11, __ post(oldArrNext, 4)); 4776 __ lslvw(r10, r10, shiftCount); 4777 __ lsrvw(r11, r11, shiftRevCount); 4778 __ orrw(r12, r10, r11); 4779 __ strw(r12, __ post(newArr, 4)); 4780 __ tbz(numIter, 1, Exit); 4781 __ tbz(numIter, 0, ShiftOne); 4782 4783 __ BIND(ShiftTwo); 4784 __ 
ldrw(r10, __ post(oldArr, 4)); 4785 __ ldrw(r11, __ post(oldArrNext, 4)); 4786 __ lslvw(r10, r10, shiftCount); 4787 __ lsrvw(r11, r11, shiftRevCount); 4788 __ orrw(r12, r10, r11); 4789 __ strw(r12, __ post(newArr, 4)); 4790 4791 __ BIND(ShiftOne); 4792 __ ldrw(r10, Address(oldArr)); 4793 __ ldrw(r11, Address(oldArrNext)); 4794 __ lslvw(r10, r10, shiftCount); 4795 __ lsrvw(r11, r11, shiftRevCount); 4796 __ orrw(r12, r10, r11); 4797 __ strw(r12, Address(newArr)); 4798 4799 __ BIND(Exit); 4800 __ ret(lr); 4801 4802 return start; 4803 } 4804 4805 address generate_count_positives(address &count_positives_long) { 4806 const u1 large_loop_size = 64; 4807 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 4808 int dcache_line = VM_Version::dcache_line_size(); 4809 4810 Register ary1 = r1, len = r2, result = r0; 4811 4812 __ align(CodeEntryAlignment); 4813 4814 StubCodeMark mark(this, "StubRoutines", "count_positives"); 4815 4816 address entry = __ pc(); 4817 4818 __ enter(); 4819 // precondition: a copy of len is already in result 4820 // __ mov(result, len); 4821 4822 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 4823 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 4824 4825 __ cmp(len, (u1)15); 4826 __ br(Assembler::GT, LEN_OVER_15); 4827 // The only case when execution falls into this code is when pointer is near 4828 // the end of memory page and we have to avoid reading next page 4829 __ add(ary1, ary1, len); 4830 __ subs(len, len, 8); 4831 __ br(Assembler::GT, LEN_OVER_8); 4832 __ ldr(rscratch2, Address(ary1, -8)); 4833 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 4834 __ lsrv(rscratch2, rscratch2, rscratch1); 4835 __ tst(rscratch2, UPPER_BIT_MASK); 4836 __ csel(result, zr, result, Assembler::NE); 4837 __ leave(); 4838 __ ret(lr); 4839 __ bind(LEN_OVER_8); 4840 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 4841 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 4842 __ tst(rscratch2, UPPER_BIT_MASK); 4843 __ br(Assembler::NE, RET_NO_POP); 4844 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 4845 __ lsrv(rscratch1, rscratch1, rscratch2); 4846 __ tst(rscratch1, UPPER_BIT_MASK); 4847 __ bind(RET_NO_POP); 4848 __ csel(result, zr, result, Assembler::NE); 4849 __ leave(); 4850 __ ret(lr); 4851 4852 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 4853 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 4854 4855 count_positives_long = __ pc(); // 2nd entry point 4856 4857 __ enter(); 4858 4859 __ bind(LEN_OVER_15); 4860 __ push(spilled_regs, sp); 4861 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 4862 __ cbz(rscratch2, ALIGNED); 4863 __ ldp(tmp6, tmp1, Address(ary1)); 4864 __ mov(tmp5, 16); 4865 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 4866 __ add(ary1, ary1, rscratch1); 4867 __ orr(tmp6, tmp6, tmp1); 4868 __ tst(tmp6, UPPER_BIT_MASK); 4869 __ br(Assembler::NE, RET_ADJUST); 4870 __ sub(len, len, rscratch1); 4871 4872 __ bind(ALIGNED); 4873 __ cmp(len, large_loop_size); 4874 __ br(Assembler::LT, CHECK_16); 4875 // Perform 16-byte load as early return in pre-loop to handle situation 4876 // when initially aligned large array has negative values at starting bytes, 4877 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 4878 // slower. Cases with negative bytes further ahead won't be affected that 4879 // much. 
In fact, it'll be faster due to early loads, less instructions and 4880 // less branches in LARGE_LOOP. 4881 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 4882 __ sub(len, len, 16); 4883 __ orr(tmp6, tmp6, tmp1); 4884 __ tst(tmp6, UPPER_BIT_MASK); 4885 __ br(Assembler::NE, RET_ADJUST_16); 4886 __ cmp(len, large_loop_size); 4887 __ br(Assembler::LT, CHECK_16); 4888 4889 if (SoftwarePrefetchHintDistance >= 0 4890 && SoftwarePrefetchHintDistance >= dcache_line) { 4891 // initial prefetch 4892 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 4893 } 4894 __ bind(LARGE_LOOP); 4895 if (SoftwarePrefetchHintDistance >= 0) { 4896 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 4897 } 4898 // Issue load instructions first, since it can save few CPU/MEM cycles, also 4899 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 4900 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 4901 // instructions per cycle and have less branches, but this approach disables 4902 // early return, thus, all 64 bytes are loaded and checked every time. 4903 __ ldp(tmp2, tmp3, Address(ary1)); 4904 __ ldp(tmp4, tmp5, Address(ary1, 16)); 4905 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 4906 __ ldp(tmp6, tmp1, Address(ary1, 48)); 4907 __ add(ary1, ary1, large_loop_size); 4908 __ sub(len, len, large_loop_size); 4909 __ orr(tmp2, tmp2, tmp3); 4910 __ orr(tmp4, tmp4, tmp5); 4911 __ orr(rscratch1, rscratch1, rscratch2); 4912 __ orr(tmp6, tmp6, tmp1); 4913 __ orr(tmp2, tmp2, tmp4); 4914 __ orr(rscratch1, rscratch1, tmp6); 4915 __ orr(tmp2, tmp2, rscratch1); 4916 __ tst(tmp2, UPPER_BIT_MASK); 4917 __ br(Assembler::NE, RET_ADJUST_LONG); 4918 __ cmp(len, large_loop_size); 4919 __ br(Assembler::GE, LARGE_LOOP); 4920 4921 __ bind(CHECK_16); // small 16-byte load pre-loop 4922 __ cmp(len, (u1)16); 4923 __ br(Assembler::LT, POST_LOOP16); 4924 4925 __ bind(LOOP16); // small 16-byte load loop 4926 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 4927 __ sub(len, len, 16); 4928 __ orr(tmp2, tmp2, tmp3); 4929 __ tst(tmp2, UPPER_BIT_MASK); 4930 __ br(Assembler::NE, RET_ADJUST_16); 4931 __ cmp(len, (u1)16); 4932 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 4933 4934 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 4935 __ cmp(len, (u1)8); 4936 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 4937 __ ldr(tmp3, Address(__ post(ary1, 8))); 4938 __ tst(tmp3, UPPER_BIT_MASK); 4939 __ br(Assembler::NE, RET_ADJUST); 4940 __ sub(len, len, 8); 4941 4942 __ bind(POST_LOOP16_LOAD_TAIL); 4943 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 4944 __ ldr(tmp1, Address(ary1)); 4945 __ mov(tmp2, 64); 4946 __ sub(tmp4, tmp2, len, __ LSL, 3); 4947 __ lslv(tmp1, tmp1, tmp4); 4948 __ tst(tmp1, UPPER_BIT_MASK); 4949 __ br(Assembler::NE, RET_ADJUST); 4950 // Fallthrough 4951 4952 __ bind(RET_LEN); 4953 __ pop(spilled_regs, sp); 4954 __ leave(); 4955 __ ret(lr); 4956 4957 // difference result - len is the count of guaranteed to be 4958 // positive bytes 4959 4960 __ bind(RET_ADJUST_LONG); 4961 __ add(len, len, (u1)(large_loop_size - 16)); 4962 __ bind(RET_ADJUST_16); 4963 __ add(len, len, 16); 4964 __ bind(RET_ADJUST); 4965 __ pop(spilled_regs, sp); 4966 __ leave(); 4967 __ sub(result, result, len); 4968 __ ret(lr); 4969 4970 return entry; 4971 } 4972 4973 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 4974 bool usePrefetch, Label &NOT_EQUAL) { 4975 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 4976 tmp2 = 
rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 4977 tmp7 = r12, tmp8 = r13; 4978 Label LOOP; 4979 4980 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 4981 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 4982 __ bind(LOOP); 4983 if (usePrefetch) { 4984 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 4985 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 4986 } 4987 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 4988 __ eor(tmp1, tmp1, tmp2); 4989 __ eor(tmp3, tmp3, tmp4); 4990 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 4991 __ orr(tmp1, tmp1, tmp3); 4992 __ cbnz(tmp1, NOT_EQUAL); 4993 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 4994 __ eor(tmp5, tmp5, tmp6); 4995 __ eor(tmp7, tmp7, tmp8); 4996 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 4997 __ orr(tmp5, tmp5, tmp7); 4998 __ cbnz(tmp5, NOT_EQUAL); 4999 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5000 __ eor(tmp1, tmp1, tmp2); 5001 __ eor(tmp3, tmp3, tmp4); 5002 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5003 __ orr(tmp1, tmp1, tmp3); 5004 __ cbnz(tmp1, NOT_EQUAL); 5005 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5006 __ eor(tmp5, tmp5, tmp6); 5007 __ sub(cnt1, cnt1, 8 * wordSize); 5008 __ eor(tmp7, tmp7, tmp8); 5009 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5010 // tmp6 is not used. MacroAssembler::subs is used here (rather than 5011 // cmp) because subs allows an unlimited range of immediate operand. 5012 __ subs(tmp6, cnt1, loopThreshold); 5013 __ orr(tmp5, tmp5, tmp7); 5014 __ cbnz(tmp5, NOT_EQUAL); 5015 __ br(__ GE, LOOP); 5016 // post-loop 5017 __ eor(tmp1, tmp1, tmp2); 5018 __ eor(tmp3, tmp3, tmp4); 5019 __ orr(tmp1, tmp1, tmp3); 5020 __ sub(cnt1, cnt1, 2 * wordSize); 5021 __ cbnz(tmp1, NOT_EQUAL); 5022 } 5023 5024 void generate_large_array_equals_loop_simd(int loopThreshold, 5025 bool usePrefetch, Label &NOT_EQUAL) { 5026 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5027 tmp2 = rscratch2; 5028 Label LOOP; 5029 5030 __ bind(LOOP); 5031 if (usePrefetch) { 5032 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5033 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5034 } 5035 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 5036 __ sub(cnt1, cnt1, 8 * wordSize); 5037 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 5038 __ subs(tmp1, cnt1, loopThreshold); 5039 __ eor(v0, __ T16B, v0, v4); 5040 __ eor(v1, __ T16B, v1, v5); 5041 __ eor(v2, __ T16B, v2, v6); 5042 __ eor(v3, __ T16B, v3, v7); 5043 __ orr(v0, __ T16B, v0, v1); 5044 __ orr(v1, __ T16B, v2, v3); 5045 __ orr(v0, __ T16B, v0, v1); 5046 __ umov(tmp1, v0, __ D, 0); 5047 __ umov(tmp2, v0, __ D, 1); 5048 __ orr(tmp1, tmp1, tmp2); 5049 __ cbnz(tmp1, NOT_EQUAL); 5050 __ br(__ GE, LOOP); 5051 } 5052 5053 // a1 = r1 - array1 address 5054 // a2 = r2 - array2 address 5055 // result = r0 - return value. Already contains "false" 5056 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 5057 // r3-r5 are reserved temporary registers 5058 address generate_large_array_equals() { 5059 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5060 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5061 tmp7 = r12, tmp8 = r13; 5062 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 5063 SMALL_LOOP, POST_LOOP; 5064 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 5065 // calculate if at least 32 prefetched bytes are used 5066 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 5067 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 5068 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 5069 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 5070 tmp5, tmp6, tmp7, tmp8); 5071 5072 __ align(CodeEntryAlignment); 5073 5074 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 5075 5076 address entry = __ pc(); 5077 __ enter(); 5078 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 5079 // also advance pointers to use post-increment instead of pre-increment 5080 __ add(a1, a1, wordSize); 5081 __ add(a2, a2, wordSize); 5082 if (AvoidUnalignedAccesses) { 5083 // both implementations (SIMD/nonSIMD) are using relatively large load 5084 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 5085 // on some CPUs in case of address is not at least 16-byte aligned. 5086 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 5087 // load if needed at least for 1st address and make if 16-byte aligned. 5088 Label ALIGNED16; 5089 __ tbz(a1, 3, ALIGNED16); 5090 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5091 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5092 __ sub(cnt1, cnt1, wordSize); 5093 __ eor(tmp1, tmp1, tmp2); 5094 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 5095 __ bind(ALIGNED16); 5096 } 5097 if (UseSIMDForArrayEquals) { 5098 if (SoftwarePrefetchHintDistance >= 0) { 5099 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5100 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5101 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 5102 /* prfm = */ true, NOT_EQUAL); 5103 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5104 __ br(__ LT, TAIL); 5105 } 5106 __ bind(NO_PREFETCH_LARGE_LOOP); 5107 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 5108 /* prfm = */ false, NOT_EQUAL); 5109 } else { 5110 __ push(spilled_regs, sp); 5111 if (SoftwarePrefetchHintDistance >= 0) { 5112 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5113 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5114 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 5115 /* prfm = */ true, NOT_EQUAL); 5116 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5117 __ br(__ LT, TAIL); 5118 } 5119 __ bind(NO_PREFETCH_LARGE_LOOP); 5120 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 5121 /* prfm = */ false, NOT_EQUAL); 5122 } 5123 __ bind(TAIL); 5124 __ cbz(cnt1, EQUAL); 5125 __ subs(cnt1, cnt1, wordSize); 5126 __ br(__ LE, POST_LOOP); 5127 __ bind(SMALL_LOOP); 5128 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5129 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5130 __ subs(cnt1, cnt1, wordSize); 5131 __ eor(tmp1, tmp1, tmp2); 5132 __ cbnz(tmp1, NOT_EQUAL); 5133 __ br(__ GT, SMALL_LOOP); 5134 __ bind(POST_LOOP); 5135 __ ldr(tmp1, Address(a1, cnt1)); 5136 __ ldr(tmp2, Address(a2, cnt1)); 5137 __ eor(tmp1, tmp1, tmp2); 5138 __ cbnz(tmp1, NOT_EQUAL); 5139 __ bind(EQUAL); 5140 __ mov(result, true); 5141 __ bind(NOT_EQUAL); 5142 if (!UseSIMDForArrayEquals) { 5143 __ pop(spilled_regs, sp); 5144 } 5145 __ bind(NOT_EQUAL_NO_POP); 5146 __ leave(); 5147 __ ret(lr); 5148 return entry; 5149 } 5150 5151 address generate_dsin_dcos(bool isCos) { 5152 __ align(CodeEntryAlignment); 5153 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 5154 address start = __ pc(); 5155 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5156 (address)StubRoutines::aarch64::_two_over_pi, 5157 (address)StubRoutines::aarch64::_pio2, 5158 (address)StubRoutines::aarch64::_dsin_coef, 5159 (address)StubRoutines::aarch64::_dcos_coef); 5160 return start; 5161 } 5162 5163 address generate_dlog() { 5164 __ align(CodeEntryAlignment); 5165 StubCodeMark mark(this, "StubRoutines", "dlog"); 5166 address entry = __ pc(); 5167 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 5168 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 5169 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 5170 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 5171 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 5172 return entry; 5173 } 5174 5175 5176 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5177 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5178 Label &DIFF2) { 5179 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5180 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5181 5182 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5183 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5184 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5185 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5186 5187 __ fmovd(tmpL, vtmp3); 5188 __ eor(rscratch2, tmp3, tmpL); 5189 __ cbnz(rscratch2, DIFF2); 5190 5191 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5192 __ umov(tmpL, vtmp3, __ D, 1); 5193 __ eor(rscratch2, tmpU, tmpL); 5194 __ cbnz(rscratch2, DIFF1); 5195 5196 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5197 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5198 __ fmovd(tmpL, vtmp); 5199 __ eor(rscratch2, tmp3, tmpL); 5200 __ cbnz(rscratch2, DIFF2); 5201 5202 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5203 __ umov(tmpL, vtmp, __ D, 1); 5204 __ eor(rscratch2, tmpU, tmpL); 5205 __ cbnz(rscratch2, DIFF1); 5206 } 5207 5208 // r0 = result 5209 // r1 = str1 5210 // r2 = cnt1 5211 // r3 = str2 5212 // r4 = cnt2 5213 // r10 = tmp1 5214 // r11 = tmp2 5215 address generate_compare_long_string_different_encoding(bool isLU) { 5216 __ align(CodeEntryAlignment); 5217 StubCodeMark mark(this, "StubRoutines", isLU 5218 ? "compare_long_string_different_encoding LU" 5219 : "compare_long_string_different_encoding UL"); 5220 address entry = __ pc(); 5221 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5222 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5223 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5224 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5225 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5226 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5227 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5228 5229 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5230 5231 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5232 // cnt2 == amount of characters left to compare 5233 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5234 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5235 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5236 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5237 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5238 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5239 __ eor(rscratch2, tmp1, tmp2); 5240 __ mov(rscratch1, tmp2); 5241 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5242 Register tmpU = isLU ? 
rscratch1 : tmp1, // where to keep U for comparison 5243 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5244 __ push(spilled_regs, sp); 5245 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5246 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 5247 5248 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5249 5250 if (SoftwarePrefetchHintDistance >= 0) { 5251 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5252 __ br(__ LT, NO_PREFETCH); 5253 __ bind(LARGE_LOOP_PREFETCH); 5254 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 5255 __ mov(tmp4, 2); 5256 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5257 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 5258 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5259 __ subs(tmp4, tmp4, 1); 5260 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 5261 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5262 __ mov(tmp4, 2); 5263 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 5264 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5265 __ subs(tmp4, tmp4, 1); 5266 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 5267 __ sub(cnt2, cnt2, 64); 5268 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5269 __ br(__ GE, LARGE_LOOP_PREFETCH); 5270 } 5271 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 5272 __ bind(NO_PREFETCH); 5273 __ subs(cnt2, cnt2, 16); 5274 __ br(__ LT, TAIL); 5275 __ align(OptoLoopAlignment); 5276 __ bind(SMALL_LOOP); // smaller loop 5277 __ subs(cnt2, cnt2, 16); 5278 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5279 __ br(__ GE, SMALL_LOOP); 5280 __ cmn(cnt2, (u1)16); 5281 __ br(__ EQ, LOAD_LAST); 5282 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 5283 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 5284 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 5285 __ ldr(tmp3, Address(cnt1, -8)); 5286 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 5287 __ b(LOAD_LAST); 5288 __ bind(DIFF2); 5289 __ mov(tmpU, tmp3); 5290 __ bind(DIFF1); 5291 __ pop(spilled_regs, sp); 5292 __ b(CALCULATE_DIFFERENCE); 5293 __ bind(LOAD_LAST); 5294 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 5295 // No need to load it again 5296 __ mov(tmpU, tmp3); 5297 __ pop(spilled_regs, sp); 5298 5299 // tmp2 points to the address of the last 4 Latin1 characters right now 5300 __ ldrs(vtmp, Address(tmp2)); 5301 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5302 __ fmovd(tmpL, vtmp); 5303 5304 __ eor(rscratch2, tmpU, tmpL); 5305 __ cbz(rscratch2, DONE); 5306 5307 // Find the first different characters in the longwords and 5308 // compute their difference. 
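    // In outline: rscratch2 holds tmp1 ^ rscratch1, so its lowest non-zero
    // byte marks the first difference in memory order. rev moves that byte to
    // the most significant end, clz counts the bits in front of it, and
    // andr(..., -16) rounds the count down to a 16-bit boundary (both values
    // are in UTF-16 form here). Shifting each value right by that amount and
    // zero-extending the low 16 bits isolates the two differing characters;
    // e.g. a first mismatch in character 2 gives a shift of 32 bits.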
5309 __ bind(CALCULATE_DIFFERENCE); 5310 __ rev(rscratch2, rscratch2); 5311 __ clz(rscratch2, rscratch2); 5312 __ andr(rscratch2, rscratch2, -16); 5313 __ lsrv(tmp1, tmp1, rscratch2); 5314 __ uxthw(tmp1, tmp1); 5315 __ lsrv(rscratch1, rscratch1, rscratch2); 5316 __ uxthw(rscratch1, rscratch1); 5317 __ subw(result, tmp1, rscratch1); 5318 __ bind(DONE); 5319 __ ret(lr); 5320 return entry; 5321 } 5322 5323 address generate_method_entry_barrier() { 5324 __ align(CodeEntryAlignment); 5325 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 5326 5327 Label deoptimize_label; 5328 5329 address start = __ pc(); 5330 5331 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 5332 5333 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 5334 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 5335 // We can get here despite the nmethod being good, if we have not 5336 // yet applied our cross modification fence (or data fence). 5337 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 5338 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 5339 __ ldrw(rscratch2, rscratch2); 5340 __ strw(rscratch2, thread_epoch_addr); 5341 __ isb(); 5342 __ membar(__ LoadLoad); 5343 } 5344 5345 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 5346 5347 __ enter(); 5348 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 5349 5350 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 5351 5352 __ push_call_clobbered_registers(); 5353 5354 __ mov(c_rarg0, rscratch2); 5355 __ call_VM_leaf 5356 (CAST_FROM_FN_PTR 5357 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 5358 5359 __ reset_last_Java_frame(true); 5360 5361 __ mov(rscratch1, r0); 5362 5363 __ pop_call_clobbered_registers(); 5364 5365 __ cbnz(rscratch1, deoptimize_label); 5366 5367 __ leave(); 5368 __ ret(lr); 5369 5370 __ BIND(deoptimize_label); 5371 5372 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 5373 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 5374 5375 __ mov(sp, rscratch1); 5376 __ br(rscratch2); 5377 5378 return start; 5379 } 5380 5381 // r0 = result 5382 // r1 = str1 5383 // r2 = cnt1 5384 // r3 = str2 5385 // r4 = cnt2 5386 // r10 = tmp1 5387 // r11 = tmp2 5388 address generate_compare_long_string_same_encoding(bool isLL) { 5389 __ align(CodeEntryAlignment); 5390 StubCodeMark mark(this, "StubRoutines", isLL 5391 ? "compare_long_string_same_encoding LL" 5392 : "compare_long_string_same_encoding UU"); 5393 address entry = __ pc(); 5394 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5395 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 5396 5397 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 5398 5399 // exit from large loop when less than 64 bytes left to read or we're about 5400 // to prefetch memory behind array border 5401 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 5402 5403 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 5404 __ eor(rscratch2, tmp1, tmp2); 5405 __ cbnz(rscratch2, CAL_DIFFERENCE); 5406 5407 __ sub(cnt2, cnt2, wordSize/(isLL ? 
1 : 2)); 5408 // update pointers, because of previous read 5409 __ add(str1, str1, wordSize); 5410 __ add(str2, str2, wordSize); 5411 if (SoftwarePrefetchHintDistance >= 0) { 5412 __ align(OptoLoopAlignment); 5413 __ bind(LARGE_LOOP_PREFETCH); 5414 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 5415 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 5416 5417 for (int i = 0; i < 4; i++) { 5418 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 5419 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 5420 __ cmp(tmp1, tmp2); 5421 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5422 __ br(Assembler::NE, DIFF); 5423 } 5424 __ sub(cnt2, cnt2, isLL ? 64 : 32); 5425 __ add(str1, str1, 64); 5426 __ add(str2, str2, 64); 5427 __ subs(rscratch2, cnt2, largeLoopExitCondition); 5428 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 5429 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 5430 } 5431 5432 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 5433 __ br(Assembler::LE, LESS16); 5434 __ align(OptoLoopAlignment); 5435 __ bind(LOOP_COMPARE16); 5436 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5437 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5438 __ cmp(tmp1, tmp2); 5439 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5440 __ br(Assembler::NE, DIFF); 5441 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5442 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5443 __ br(Assembler::LT, LESS16); 5444 5445 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5446 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5447 __ cmp(tmp1, tmp2); 5448 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5449 __ br(Assembler::NE, DIFF); 5450 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5451 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5452 __ br(Assembler::GE, LOOP_COMPARE16); 5453 __ cbz(cnt2, LENGTH_DIFF); 5454 5455 __ bind(LESS16); 5456 // each 8 compare 5457 __ subs(cnt2, cnt2, isLL ? 8 : 4); 5458 __ br(Assembler::LE, LESS8); 5459 __ ldr(tmp1, Address(__ post(str1, 8))); 5460 __ ldr(tmp2, Address(__ post(str2, 8))); 5461 __ eor(rscratch2, tmp1, tmp2); 5462 __ cbnz(rscratch2, CAL_DIFFERENCE); 5463 __ sub(cnt2, cnt2, isLL ? 8 : 4); 5464 5465 __ bind(LESS8); // directly load last 8 bytes 5466 if (!isLL) { 5467 __ add(cnt2, cnt2, cnt2); 5468 } 5469 __ ldr(tmp1, Address(str1, cnt2)); 5470 __ ldr(tmp2, Address(str2, cnt2)); 5471 __ eor(rscratch2, tmp1, tmp2); 5472 __ cbz(rscratch2, LENGTH_DIFF); 5473 __ b(CAL_DIFFERENCE); 5474 5475 __ bind(DIFF); 5476 __ cmp(tmp1, tmp2); 5477 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 5478 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 5479 // reuse rscratch2 register for the result of eor instruction 5480 __ eor(rscratch2, tmp1, tmp2); 5481 5482 __ bind(CAL_DIFFERENCE); 5483 __ rev(rscratch2, rscratch2); 5484 __ clz(rscratch2, rscratch2); 5485 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 5486 __ lsrv(tmp1, tmp1, rscratch2); 5487 __ lsrv(tmp2, tmp2, rscratch2); 5488 if (isLL) { 5489 __ uxtbw(tmp1, tmp1); 5490 __ uxtbw(tmp2, tmp2); 5491 } else { 5492 __ uxthw(tmp1, tmp1); 5493 __ uxthw(tmp2, tmp2); 5494 } 5495 __ subw(result, tmp1, tmp2); 5496 5497 __ bind(LENGTH_DIFF); 5498 __ ret(lr); 5499 return entry; 5500 } 5501 5502 enum string_compare_mode { 5503 LL, 5504 LU, 5505 UL, 5506 UU, 5507 }; 5508 5509 // The following registers are declared in aarch64.ad 5510 // r0 = result 5511 // r1 = str1 5512 // r2 = cnt1 5513 // r3 = str2 5514 // r4 = cnt2 5515 // r10 = tmp1 5516 // r11 = tmp2 5517 // z0 = ztmp1 5518 // z1 = ztmp2 5519 // p0 = pgtmp1 5520 // p1 = pgtmp2 5521 address generate_compare_long_string_sve(string_compare_mode mode) { 5522 __ align(CodeEntryAlignment); 5523 address entry = __ pc(); 5524 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5525 tmp1 = r10, tmp2 = r11; 5526 5527 Label LOOP, DONE, MISMATCH; 5528 Register vec_len = tmp1; 5529 Register idx = tmp2; 5530 // The minimum of the string lengths has been stored in cnt2. 5531 Register cnt = cnt2; 5532 FloatRegister ztmp1 = z0, ztmp2 = z1; 5533 PRegister pgtmp1 = p0, pgtmp2 = p1; 5534 5535 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 5536 switch (mode) { \ 5537 case LL: \ 5538 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 5539 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 5540 break; \ 5541 case LU: \ 5542 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 5543 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5544 break; \ 5545 case UL: \ 5546 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5547 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 5548 break; \ 5549 case UU: \ 5550 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5551 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5552 break; \ 5553 default: \ 5554 ShouldNotReachHere(); \ 5555 } 5556 5557 const char* stubname; 5558 switch (mode) { 5559 case LL: stubname = "compare_long_string_same_encoding LL"; break; 5560 case LU: stubname = "compare_long_string_different_encoding LU"; break; 5561 case UL: stubname = "compare_long_string_different_encoding UL"; break; 5562 case UU: stubname = "compare_long_string_same_encoding UU"; break; 5563 default: ShouldNotReachHere(); 5564 } 5565 5566 StubCodeMark mark(this, "StubRoutines", stubname); 5567 5568 __ mov(idx, 0); 5569 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5570 5571 if (mode == LL) { 5572 __ sve_cntb(vec_len); 5573 } else { 5574 __ sve_cnth(vec_len); 5575 } 5576 5577 __ sub(rscratch1, cnt, vec_len); 5578 5579 __ bind(LOOP); 5580 5581 // main loop 5582 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5583 __ add(idx, idx, vec_len); 5584 // Compare strings. 5585 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5586 __ br(__ NE, MISMATCH); 5587 __ cmp(idx, rscratch1); 5588 __ br(__ LT, LOOP); 5589 5590 // post loop, last iteration 5591 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5592 5593 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5594 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5595 __ br(__ EQ, DONE); 5596 5597 __ bind(MISMATCH); 5598 5599 // Crop the vector to find its location. 5600 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 5601 // Extract the first different characters of each string. 
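    // pgtmp2 now covers only the elements before the first mismatch, so
    // sve_lasta, which reads the element just past the last active one (or
    // element 0 if none is active), picks out the first differing character
    // of each string.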
5602 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 5603 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 5604 5605 // Compute the difference of the first different characters. 5606 __ sub(result, rscratch1, rscratch2); 5607 5608 __ bind(DONE); 5609 __ ret(lr); 5610 #undef LOAD_PAIR 5611 return entry; 5612 } 5613 5614 void generate_compare_long_strings() { 5615 if (UseSVE == 0) { 5616 StubRoutines::aarch64::_compare_long_string_LL 5617 = generate_compare_long_string_same_encoding(true); 5618 StubRoutines::aarch64::_compare_long_string_UU 5619 = generate_compare_long_string_same_encoding(false); 5620 StubRoutines::aarch64::_compare_long_string_LU 5621 = generate_compare_long_string_different_encoding(true); 5622 StubRoutines::aarch64::_compare_long_string_UL 5623 = generate_compare_long_string_different_encoding(false); 5624 } else { 5625 StubRoutines::aarch64::_compare_long_string_LL 5626 = generate_compare_long_string_sve(LL); 5627 StubRoutines::aarch64::_compare_long_string_UU 5628 = generate_compare_long_string_sve(UU); 5629 StubRoutines::aarch64::_compare_long_string_LU 5630 = generate_compare_long_string_sve(LU); 5631 StubRoutines::aarch64::_compare_long_string_UL 5632 = generate_compare_long_string_sve(UL); 5633 } 5634 } 5635 5636 // R0 = result 5637 // R1 = str2 5638 // R2 = cnt1 5639 // R3 = str1 5640 // R4 = cnt2 5641 // This generic linear code use few additional ideas, which makes it faster: 5642 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 5643 // in order to skip initial loading(help in systems with 1 ld pipeline) 5644 // 2) we can use "fast" algorithm of finding single character to search for 5645 // first symbol with less branches(1 branch per each loaded register instead 5646 // of branch for each symbol), so, this is where constants like 5647 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 5648 // 3) after loading and analyzing 1st register of source string, it can be 5649 // used to search for every 1st character entry, saving few loads in 5650 // comparison with "simplier-but-slower" implementation 5651 // 4) in order to avoid lots of push/pop operations, code below is heavily 5652 // re-using/re-initializing/compressing register values, which makes code 5653 // larger and a bit less readable, however, most of extra operations are 5654 // issued during loads or branches, so, penalty is minimal 5655 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 5656 const char* stubName = str1_isL 5657 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 5658 : "indexof_linear_uu"; 5659 __ align(CodeEntryAlignment); 5660 StubCodeMark mark(this, "StubRoutines", stubName); 5661 address entry = __ pc(); 5662 5663 int str1_chr_size = str1_isL ? 1 : 2; 5664 int str2_chr_size = str2_isL ? 1 : 2; 5665 int str1_chr_shift = str1_isL ? 0 : 1; 5666 int str2_chr_shift = str2_isL ? 
0 : 1; 5667 bool isL = str1_isL && str2_isL; 5668 // parameters 5669 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 5670 // temporary registers 5671 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 5672 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 5673 // redefinitions 5674 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 5675 5676 __ push(spilled_regs, sp); 5677 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 5678 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 5679 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 5680 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 5681 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 5682 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 5683 // Read whole register from str1. It is safe, because length >=8 here 5684 __ ldr(ch1, Address(str1)); 5685 // Read whole register from str2. It is safe, because length >=8 here 5686 __ ldr(ch2, Address(str2)); 5687 __ sub(cnt2, cnt2, cnt1); 5688 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 5689 if (str1_isL != str2_isL) { 5690 __ eor(v0, __ T16B, v0, v0); 5691 } 5692 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 5693 __ mul(first, first, tmp1); 5694 // check if we have less than 1 register to check 5695 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 5696 if (str1_isL != str2_isL) { 5697 __ fmovd(v1, ch1); 5698 } 5699 __ br(__ LE, L_SMALL); 5700 __ eor(ch2, first, ch2); 5701 if (str1_isL != str2_isL) { 5702 __ zip1(v1, __ T16B, v1, v0); 5703 } 5704 __ sub(tmp2, ch2, tmp1); 5705 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5706 __ bics(tmp2, tmp2, ch2); 5707 if (str1_isL != str2_isL) { 5708 __ fmovd(ch1, v1); 5709 } 5710 __ br(__ NE, L_HAS_ZERO); 5711 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5712 __ add(result, result, wordSize/str2_chr_size); 5713 __ add(str2, str2, wordSize); 5714 __ br(__ LT, L_POST_LOOP); 5715 __ BIND(L_LOOP); 5716 __ ldr(ch2, Address(str2)); 5717 __ eor(ch2, first, ch2); 5718 __ sub(tmp2, ch2, tmp1); 5719 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5720 __ bics(tmp2, tmp2, ch2); 5721 __ br(__ NE, L_HAS_ZERO); 5722 __ BIND(L_LOOP_PROCEED); 5723 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5724 __ add(str2, str2, wordSize); 5725 __ add(result, result, wordSize/str2_chr_size); 5726 __ br(__ GE, L_LOOP); 5727 __ BIND(L_POST_LOOP); 5728 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 5729 __ br(__ LE, NOMATCH); 5730 __ ldr(ch2, Address(str2)); 5731 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5732 __ eor(ch2, first, ch2); 5733 __ sub(tmp2, ch2, tmp1); 5734 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5735 __ mov(tmp4, -1); // all bits set 5736 __ b(L_SMALL_PROCEED); 5737 __ align(OptoLoopAlignment); 5738 __ BIND(L_SMALL); 5739 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5740 __ eor(ch2, first, ch2); 5741 if (str1_isL != str2_isL) { 5742 __ zip1(v1, __ T16B, v1, v0); 5743 } 5744 __ sub(tmp2, ch2, tmp1); 5745 __ mov(tmp4, -1); // all bits set 5746 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5747 if (str1_isL != str2_isL) { 5748 __ fmovd(ch1, v1); // move converted 4 symbols 5749 } 5750 __ BIND(L_SMALL_PROCEED); 5751 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
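    // Note on the candidate bits tested below: eor with the replicated first
    // pattern character zeroes every byte (halfword for UTF-16) of str2 that
    // matches it, and the sub/orr above together with the bic below compute
    // the classic zero detector (v - 0x01..01) & ~v & 0x80..80 (halfword
    // variants of the constants for UTF-16), leaving the top bit set in each
    // matching lane. Bits above the first real match may occasionally be
    // false positives from borrow propagation, which is harmless because
    // every candidate is re-checked by the comparison loops that follow;
    // tmp4 clears candidates lying beyond the end of the searched data.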
5752 __ bic(tmp2, tmp2, ch2); 5753 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 5754 __ rbit(tmp2, tmp2); 5755 __ br(__ EQ, NOMATCH); 5756 __ BIND(L_SMALL_HAS_ZERO_LOOP); 5757 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 5758 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 5759 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 5760 if (str2_isL) { // LL 5761 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5762 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5763 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5764 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5765 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5766 } else { 5767 __ mov(ch2, 0xE); // all bits in byte set except last one 5768 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5769 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5770 __ lslv(tmp2, tmp2, tmp4); 5771 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5772 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5773 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5774 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5775 } 5776 __ cmp(ch1, ch2); 5777 __ mov(tmp4, wordSize/str2_chr_size); 5778 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5779 __ BIND(L_SMALL_CMP_LOOP); 5780 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5781 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5782 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5783 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5784 __ add(tmp4, tmp4, 1); 5785 __ cmp(tmp4, cnt1); 5786 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 5787 __ cmp(first, ch2); 5788 __ br(__ EQ, L_SMALL_CMP_LOOP); 5789 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 5790 __ cbz(tmp2, NOMATCH); // no more matches. exit 5791 __ clz(tmp4, tmp2); 5792 __ add(result, result, 1); // advance index 5793 __ add(str2, str2, str2_chr_size); // advance pointer 5794 __ b(L_SMALL_HAS_ZERO_LOOP); 5795 __ align(OptoLoopAlignment); 5796 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 5797 __ cmp(first, ch2); 5798 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5799 __ b(DONE); 5800 __ align(OptoLoopAlignment); 5801 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 5802 if (str2_isL) { // LL 5803 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5804 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5805 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5806 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5807 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5808 } else { 5809 __ mov(ch2, 0xE); // all bits in byte set except last one 5810 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5811 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5812 __ lslv(tmp2, tmp2, tmp4); 5813 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5814 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5815 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5816 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5817 } 5818 __ cmp(ch1, ch2); 5819 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5820 __ b(DONE); 5821 __ align(OptoLoopAlignment); 5822 __ BIND(L_HAS_ZERO); 5823 __ rbit(tmp2, tmp2); 5824 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 5825 // Now, perform compression of counters(cnt2 and cnt1) into one register. 5826 // It's fine because both counters are 32bit and are not changed in this 5827 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 5828 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 5829 __ sub(result, result, 1); 5830 __ BIND(L_HAS_ZERO_LOOP); 5831 __ mov(cnt1, wordSize/str2_chr_size); 5832 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 5833 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 5834 if (str2_isL) { 5835 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 5836 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5837 __ lslv(tmp2, tmp2, tmp4); 5838 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5839 __ add(tmp4, tmp4, 1); 5840 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5841 __ lsl(tmp2, tmp2, 1); 5842 __ mov(tmp4, wordSize/str2_chr_size); 5843 } else { 5844 __ mov(ch2, 0xE); 5845 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5846 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5847 __ lslv(tmp2, tmp2, tmp4); 5848 __ add(tmp4, tmp4, 1); 5849 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5850 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 5851 __ lsl(tmp2, tmp2, 1); 5852 __ mov(tmp4, wordSize/str2_chr_size); 5853 __ sub(str2, str2, str2_chr_size); 5854 } 5855 __ cmp(ch1, ch2); 5856 __ mov(tmp4, wordSize/str2_chr_size); 5857 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5858 __ BIND(L_CMP_LOOP); 5859 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5860 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5861 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5862 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5863 __ add(tmp4, tmp4, 1); 5864 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 5865 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 5866 __ cmp(cnt1, ch2); 5867 __ br(__ EQ, L_CMP_LOOP); 5868 __ BIND(L_CMP_LOOP_NOMATCH); 5869 // here we're not matched 5870 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 5871 __ clz(tmp4, tmp2); 5872 __ add(str2, str2, str2_chr_size); // advance pointer 5873 __ b(L_HAS_ZERO_LOOP); 5874 __ align(OptoLoopAlignment); 5875 __ BIND(L_CMP_LOOP_LAST_CMP); 5876 __ cmp(cnt1, ch2); 5877 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5878 __ b(DONE); 5879 __ align(OptoLoopAlignment); 5880 __ BIND(L_CMP_LOOP_LAST_CMP2); 5881 if (str2_isL) { 5882 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 5883 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5884 __ lslv(tmp2, tmp2, tmp4); 5885 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5886 __ add(tmp4, tmp4, 1); 5887 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5888 __ lsl(tmp2, tmp2, 1); 5889 } else { 5890 __ mov(ch2, 0xE); 5891 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5892 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
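    // The 0xE mask above is what keeps this load character-aligned: tmp4 is
    // the bit index of the candidate (bit 15 of its halfword lane), so
    // tmp4 >> 3 is an odd byte offset and clearing its low bit rounds it down
    // to the start of the UTF-16 character.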
5893 __ lslv(tmp2, tmp2, tmp4); 5894 __ add(tmp4, tmp4, 1); 5895 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5896 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 5897 __ lsl(tmp2, tmp2, 1); 5898 __ sub(str2, str2, str2_chr_size); 5899 } 5900 __ cmp(ch1, ch2); 5901 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5902 __ b(DONE); 5903 __ align(OptoLoopAlignment); 5904 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 5905 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 5906 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 5907 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 5908 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 5909 // result by analyzed characters value, so, we can just reset lower bits 5910 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 5911 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 5912 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 5913 // index of last analyzed substring inside current octet. So, str2 in at 5914 // respective start address. We need to advance it to next octet 5915 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 5916 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 5917 __ bfm(result, zr, 0, 2 - str2_chr_shift); 5918 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 5919 __ movw(cnt2, cnt2); 5920 __ b(L_LOOP_PROCEED); 5921 __ align(OptoLoopAlignment); 5922 __ BIND(NOMATCH); 5923 __ mov(result, -1); 5924 __ BIND(DONE); 5925 __ pop(spilled_regs, sp); 5926 __ ret(lr); 5927 return entry; 5928 } 5929 5930 void generate_string_indexof_stubs() { 5931 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 5932 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 5933 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 5934 } 5935 5936 void inflate_and_store_2_fp_registers(bool generatePrfm, 5937 FloatRegister src1, FloatRegister src2) { 5938 Register dst = r1; 5939 __ zip1(v1, __ T16B, src1, v0); 5940 __ zip2(v2, __ T16B, src1, v0); 5941 if (generatePrfm) { 5942 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 5943 } 5944 __ zip1(v3, __ T16B, src2, v0); 5945 __ zip2(v4, __ T16B, src2, v0); 5946 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 5947 } 5948 5949 // R0 = src 5950 // R1 = dst 5951 // R2 = len 5952 // R3 = len >> 3 5953 // V0 = 0 5954 // v1 = loaded 8 bytes 5955 address generate_large_byte_array_inflate() { 5956 __ align(CodeEntryAlignment); 5957 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 5958 address entry = __ pc(); 5959 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 5960 Register src = r0, dst = r1, len = r2, octetCounter = r3; 5961 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 5962 5963 // do one more 8-byte read to have address 16-byte aligned in most cases 5964 // also use single store instruction 5965 __ ldrd(v2, __ post(src, 8)); 5966 __ sub(octetCounter, octetCounter, 2); 5967 __ zip1(v1, __ T16B, v1, v0); 5968 __ zip1(v2, __ T16B, v2, v0); 5969 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 5970 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 5971 __ subs(rscratch1, octetCounter, large_loop_threshold); 5972 __ br(__ LE, LOOP_START); 5973 __ b(LOOP_PRFM_START); 5974 __ bind(LOOP_PRFM); 5975 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ 
post(src, 64))); 5976 __ bind(LOOP_PRFM_START); 5977 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 5978 __ sub(octetCounter, octetCounter, 8); 5979 __ subs(rscratch1, octetCounter, large_loop_threshold); 5980 inflate_and_store_2_fp_registers(true, v3, v4); 5981 inflate_and_store_2_fp_registers(true, v5, v6); 5982 __ br(__ GT, LOOP_PRFM); 5983 __ cmp(octetCounter, (u1)8); 5984 __ br(__ LT, DONE); 5985 __ bind(LOOP); 5986 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 5987 __ bind(LOOP_START); 5988 __ sub(octetCounter, octetCounter, 8); 5989 __ cmp(octetCounter, (u1)8); 5990 inflate_and_store_2_fp_registers(false, v3, v4); 5991 inflate_and_store_2_fp_registers(false, v5, v6); 5992 __ br(__ GE, LOOP); 5993 __ bind(DONE); 5994 __ ret(lr); 5995 return entry; 5996 } 5997 5998 /** 5999 * Arguments: 6000 * 6001 * Input: 6002 * c_rarg0 - current state address 6003 * c_rarg1 - H key address 6004 * c_rarg2 - data address 6005 * c_rarg3 - number of blocks 6006 * 6007 * Output: 6008 * Updated state at c_rarg0 6009 */ 6010 address generate_ghash_processBlocks() { 6011 // Bafflingly, GCM uses little-endian for the byte order, but 6012 // big-endian for the bit order. For example, the polynomial 1 is 6013 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 6014 // 6015 // So, we must either reverse the bytes in each word and do 6016 // everything big-endian or reverse the bits in each byte and do 6017 // it little-endian. On AArch64 it's more idiomatic to reverse 6018 // the bits in each byte (we have an instruction, RBIT, to do 6019 // that) and keep the data in little-endian bit order through the 6020 // calculation, bit-reversing the inputs and outputs. 6021 6022 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 6023 __ align(wordSize * 2); 6024 address p = __ pc(); 6025 __ emit_int64(0x87); // The low-order bits of the field 6026 // polynomial (i.e. 
p = z^7+z^2+z+1) 6027 // repeated in the low and high parts of a 6028 // 128-bit vector 6029 __ emit_int64(0x87); 6030 6031 __ align(CodeEntryAlignment); 6032 address start = __ pc(); 6033 6034 Register state = c_rarg0; 6035 Register subkeyH = c_rarg1; 6036 Register data = c_rarg2; 6037 Register blocks = c_rarg3; 6038 6039 FloatRegister vzr = v30; 6040 __ eor(vzr, __ T16B, vzr, vzr); // zero register 6041 6042 __ ldrq(v24, p); // The field polynomial 6043 6044 __ ldrq(v0, Address(state)); 6045 __ ldrq(v1, Address(subkeyH)); 6046 6047 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 6048 __ rbit(v0, __ T16B, v0); 6049 __ rev64(v1, __ T16B, v1); 6050 __ rbit(v1, __ T16B, v1); 6051 6052 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 6053 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 6054 6055 { 6056 Label L_ghash_loop; 6057 __ bind(L_ghash_loop); 6058 6059 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 6060 // reversing each byte 6061 __ rbit(v2, __ T16B, v2); 6062 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 6063 6064 // Multiply state in v2 by subkey in v1 6065 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 6066 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 6067 /*temps*/v6, v3, /*reuse/clobber b*/v2); 6068 // Reduce v7:v5 by the field polynomial 6069 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 6070 6071 __ sub(blocks, blocks, 1); 6072 __ cbnz(blocks, L_ghash_loop); 6073 } 6074 6075 // The bit-reversed result is at this point in v0 6076 __ rev64(v0, __ T16B, v0); 6077 __ rbit(v0, __ T16B, v0); 6078 6079 __ st1(v0, __ T16B, state); 6080 __ ret(lr); 6081 6082 return start; 6083 } 6084 6085 address generate_ghash_processBlocks_wide() { 6086 address small = generate_ghash_processBlocks(); 6087 6088 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); 6089 __ align(wordSize * 2); 6090 address p = __ pc(); 6091 __ emit_int64(0x87); // The low-order bits of the field 6092 // polynomial (i.e. p = z^7+z^2+z+1) 6093 // repeated in the low and high parts of a 6094 // 128-bit vector 6095 __ emit_int64(0x87); 6096 6097 __ align(CodeEntryAlignment); 6098 address start = __ pc(); 6099 6100 Register state = c_rarg0; 6101 Register subkeyH = c_rarg1; 6102 Register data = c_rarg2; 6103 Register blocks = c_rarg3; 6104 6105 const int unroll = 4; 6106 6107 __ cmp(blocks, (unsigned char)(unroll * 2)); 6108 __ br(__ LT, small); 6109 6110 if (unroll > 1) { 6111 // Save state before entering routine 6112 __ sub(sp, sp, 4 * 16); 6113 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 6114 __ sub(sp, sp, 4 * 16); 6115 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 6116 } 6117 6118 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 6119 6120 if (unroll > 1) { 6121 // And restore state 6122 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 6123 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 6124 } 6125 6126 __ cmp(blocks, (unsigned char)0); 6127 __ br(__ GT, small); 6128 6129 __ ret(lr); 6130 6131 return start; 6132 } 6133 6134 void generate_base64_encode_simdround(Register src, Register dst, 6135 FloatRegister codec, u8 size) { 6136 6137 FloatRegister in0 = v4, in1 = v5, in2 = v6; 6138 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 6139 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 6140 6141 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 6142 6143 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 6144 6145 __ ushr(ind0, arrangement, in0, 2); 6146 6147 __ ushr(ind1, arrangement, in1, 2); 6148 __ shl(in0, arrangement, in0, 6); 6149 __ orr(ind1, arrangement, ind1, in0); 6150 __ ushr(ind1, arrangement, ind1, 2); 6151 6152 __ ushr(ind2, arrangement, in2, 4); 6153 __ shl(in1, arrangement, in1, 4); 6154 __ orr(ind2, arrangement, in1, ind2); 6155 __ ushr(ind2, arrangement, ind2, 2); 6156 6157 __ shl(ind3, arrangement, in2, 2); 6158 __ ushr(ind3, arrangement, ind3, 2); 6159 6160 __ tbl(out0, arrangement, codec, 4, ind0); 6161 __ tbl(out1, arrangement, codec, 4, ind1); 6162 __ tbl(out2, arrangement, codec, 4, ind2); 6163 __ tbl(out3, arrangement, codec, 4, ind3); 6164 6165 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 6166 } 6167 6168 /** 6169 * Arguments: 6170 * 6171 * Input: 6172 * c_rarg0 - src_start 6173 * c_rarg1 - src_offset 6174 * c_rarg2 - src_length 6175 * c_rarg3 - dest_start 6176 * c_rarg4 - dest_offset 6177 * c_rarg5 - isURL 6178 * 6179 */ 6180 address generate_base64_encodeBlock() { 6181 6182 static const char toBase64[64] = { 6183 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6184 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6185 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6186 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6187 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 6188 }; 6189 6190 static const char toBase64URL[64] = { 6191 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6192 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6193 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6194 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6195 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 6196 }; 6197 6198 __ align(CodeEntryAlignment); 6199 StubCodeMark mark(this, "StubRoutines", "encodeBlock"); 6200 address start = __ pc(); 6201 6202 Register src = c_rarg0; // source array 6203 Register soff = c_rarg1; // source start offset 6204 Register send = c_rarg2; // source end offset 6205 Register dst = c_rarg3; // dest array 6206 Register doff = c_rarg4; // position for writing to dest array 6207 Register isURL = c_rarg5; // Base64 or URL character set 6208 6209 // c_rarg6 and c_rarg7 are free to use as temps 6210 Register codec = c_rarg6; 6211 Register length = c_rarg7; 6212 6213 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 6214 6215 __ add(src, src, soff); 6216 __ add(dst, dst, doff); 6217 __ sub(length, send, soff); 6218 6219 // load the codec base address 6220 __ lea(codec, ExternalAddress((address) toBase64)); 6221 __ cbz(isURL, ProcessData); 6222 __ lea(codec, ExternalAddress((address) toBase64URL)); 6223 6224 __ BIND(ProcessData); 6225 6226 // too short to formup a SIMD loop, roll back 6227 __ cmp(length, (u1)24); 6228 __ br(Assembler::LT, Process3B); 6229 6230 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 6231 6232 __ BIND(Process48B); 6233 __ cmp(length, (u1)48); 6234 __ br(Assembler::LT, Process24B); 6235 generate_base64_encode_simdround(src, dst, v0, 16); 6236 __ sub(length, length, 48); 6237 __ b(Process48B); 6238 6239 __ BIND(Process24B); 6240 __ cmp(length, (u1)24); 6241 __ br(Assembler::LT, SIMDExit); 6242 generate_base64_encode_simdround(src, dst, v0, 8); 6243 __ sub(length, length, 24); 6244 6245 __ BIND(SIMDExit); 6246 __ cbz(length, Exit); 6247 6248 __ 
BIND(Process3B); 6249 // 3 src bytes, 24 bits 6250 __ ldrb(r10, __ post(src, 1)); 6251 __ ldrb(r11, __ post(src, 1)); 6252 __ ldrb(r12, __ post(src, 1)); 6253 __ orrw(r11, r11, r10, Assembler::LSL, 8); 6254 __ orrw(r12, r12, r11, Assembler::LSL, 8); 6255 // codec index 6256 __ ubfmw(r15, r12, 18, 23); 6257 __ ubfmw(r14, r12, 12, 17); 6258 __ ubfmw(r13, r12, 6, 11); 6259 __ andw(r12, r12, 63); 6260 // get the code based on the codec 6261 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 6262 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 6263 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 6264 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 6265 __ strb(r15, __ post(dst, 1)); 6266 __ strb(r14, __ post(dst, 1)); 6267 __ strb(r13, __ post(dst, 1)); 6268 __ strb(r12, __ post(dst, 1)); 6269 __ sub(length, length, 3); 6270 __ cbnz(length, Process3B); 6271 6272 __ BIND(Exit); 6273 __ ret(lr); 6274 6275 return start; 6276 } 6277 6278 void generate_base64_decode_simdround(Register src, Register dst, 6279 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 6280 6281 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 6282 FloatRegister out0 = v20, out1 = v21, out2 = v22; 6283 6284 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 6285 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 6286 6287 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 6288 6289 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 6290 6291 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 6292 6293 // we need unsigned saturating subtract, to make sure all input values 6294 // in range [0, 63] will have 0U value in the higher half lookup 6295 __ uqsubv(decH0, __ T16B, in0, v27); 6296 __ uqsubv(decH1, __ T16B, in1, v27); 6297 __ uqsubv(decH2, __ T16B, in2, v27); 6298 __ uqsubv(decH3, __ T16B, in3, v27); 6299 6300 // lower half lookup 6301 __ tbl(decL0, arrangement, codecL, 4, in0); 6302 __ tbl(decL1, arrangement, codecL, 4, in1); 6303 __ tbl(decL2, arrangement, codecL, 4, in2); 6304 __ tbl(decL3, arrangement, codecL, 4, in3); 6305 6306 // higher half lookup 6307 __ tbx(decH0, arrangement, codecH, 4, decH0); 6308 __ tbx(decH1, arrangement, codecH, 4, decH1); 6309 __ tbx(decH2, arrangement, codecH, 4, decH2); 6310 __ tbx(decH3, arrangement, codecH, 4, decH3); 6311 6312 // combine lower and higher 6313 __ orr(decL0, arrangement, decL0, decH0); 6314 __ orr(decL1, arrangement, decL1, decH1); 6315 __ orr(decL2, arrangement, decL2, decH2); 6316 __ orr(decL3, arrangement, decL3, decH3); 6317 6318 // check illegal inputs, value larger than 63 (maximum of 6 bits) 6319 __ cmhi(decH0, arrangement, decL0, v27); 6320 __ cmhi(decH1, arrangement, decL1, v27); 6321 __ cmhi(decH2, arrangement, decL2, v27); 6322 __ cmhi(decH3, arrangement, decL3, v27); 6323 __ orr(in0, arrangement, decH0, decH1); 6324 __ orr(in1, arrangement, decH2, decH3); 6325 __ orr(in2, arrangement, in0, in1); 6326 __ umaxv(in3, arrangement, in2); 6327 __ umov(rscratch2, in3, __ B, 0); 6328 6329 // get the data to output 6330 __ shl(out0, arrangement, decL0, 2); 6331 __ ushr(out1, arrangement, decL1, 4); 6332 __ orr(out0, arrangement, out0, out1); 6333 __ shl(out1, arrangement, decL1, 4); 6334 __ ushr(out2, arrangement, decL2, 2); 6335 __ orr(out1, arrangement, out1, out2); 6336 __ shl(out2, arrangement, decL2, 6); 6337 __ orr(out2, arrangement, out2, decL3); 6338 6339 __ cbz(rscratch2, NoIllegalData); 6340 6341 // handle illegal input 6342 __ umov(r10, in2, __ 
D, 0); 6343 if (size == 16) { 6344 __ cbnz(r10, ErrorInLowerHalf); 6345 6346 // illegal input is in higher half, store the lower half now. 6347 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 6348 6349 __ umov(r10, in2, __ D, 1); 6350 __ umov(r11, out0, __ D, 1); 6351 __ umov(r12, out1, __ D, 1); 6352 __ umov(r13, out2, __ D, 1); 6353 __ b(StoreLegalData); 6354 6355 __ BIND(ErrorInLowerHalf); 6356 } 6357 __ umov(r11, out0, __ D, 0); 6358 __ umov(r12, out1, __ D, 0); 6359 __ umov(r13, out2, __ D, 0); 6360 6361 __ BIND(StoreLegalData); 6362 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 6363 __ strb(r11, __ post(dst, 1)); 6364 __ strb(r12, __ post(dst, 1)); 6365 __ strb(r13, __ post(dst, 1)); 6366 __ lsr(r10, r10, 8); 6367 __ lsr(r11, r11, 8); 6368 __ lsr(r12, r12, 8); 6369 __ lsr(r13, r13, 8); 6370 __ b(StoreLegalData); 6371 6372 __ BIND(NoIllegalData); 6373 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 6374 } 6375 6376 6377 /** 6378 * Arguments: 6379 * 6380 * Input: 6381 * c_rarg0 - src_start 6382 * c_rarg1 - src_offset 6383 * c_rarg2 - src_length 6384 * c_rarg3 - dest_start 6385 * c_rarg4 - dest_offset 6386 * c_rarg5 - isURL 6387 * c_rarg6 - isMIME 6388 * 6389 */ 6390 address generate_base64_decodeBlock() { 6391 6392 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 6393 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 6394 // titled "Base64 decoding". 6395 6396 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 6397 // except the trailing character '=' is also treated illegal value in this intrinsic. That 6398 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 6399 static const uint8_t fromBase64ForNoSIMD[256] = { 6400 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6401 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6402 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6403 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6404 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6405 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 6406 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6407 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6408 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6409 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6410 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6411 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6412 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6413 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6414 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6415 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6416 }; 6417 6418 static const uint8_t fromBase64URLForNoSIMD[256] = { 6419 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6420 255u, 255u, 
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6421 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6422 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6423 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6424 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 6425 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6426 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6427 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6428 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6429 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6430 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6431 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6432 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6433 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6434 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6435 }; 6436 6437 // A legal value of base64 code is in range [0, 127]. We need two lookups 6438 // with tbl/tbx and combine them to get the decode data. The 1st table vector 6439 // lookup use tbl, out of range indices are set to 0 in destination. The 2nd 6440 // table vector lookup use tbx, out of range indices are unchanged in 6441 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 6442 // The value of index 64 is set to 0, so that we know that we already get the 6443 // decoded data with the 1st lookup. 
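    // For example, using the tables below: for input byte 'b' (98) the first
    // tbl lookup is out of range and gives 0, the saturating subtract gives
    // 98 - 63 = 35, and the tbx lookup returns entry 64 + 35 = 99, i.e. 27,
    // the decoded value of 'b'. For an in-range byte such as '+' (43) the
    // first lookup already returns 62 and the second lookup hits entry 64,
    // which is 0, so the result is left unchanged.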
6444 static const uint8_t fromBase64ForSIMD[128] = { 6445 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6446 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6447 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6448 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6449 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6450 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6451 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6452 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6453 }; 6454 6455 static const uint8_t fromBase64URLForSIMD[128] = { 6456 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6457 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6458 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6459 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6460 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6461 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6462 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6463 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6464 }; 6465 6466 __ align(CodeEntryAlignment); 6467 StubCodeMark mark(this, "StubRoutines", "decodeBlock"); 6468 address start = __ pc(); 6469 6470 Register src = c_rarg0; // source array 6471 Register soff = c_rarg1; // source start offset 6472 Register send = c_rarg2; // source end offset 6473 Register dst = c_rarg3; // dest array 6474 Register doff = c_rarg4; // position for writing to dest array 6475 Register isURL = c_rarg5; // Base64 or URL character set 6476 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 6477 6478 Register length = send; // reuse send as length of source data to process 6479 6480 Register simd_codec = c_rarg6; 6481 Register nosimd_codec = c_rarg7; 6482 6483 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 6484 6485 __ enter(); 6486 6487 __ add(src, src, soff); 6488 __ add(dst, dst, doff); 6489 6490 __ mov(doff, dst); 6491 6492 __ sub(length, send, soff); 6493 __ bfm(length, zr, 0, 1); 6494 6495 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 6496 __ cbz(isURL, ProcessData); 6497 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 6498 6499 __ BIND(ProcessData); 6500 __ mov(rscratch1, length); 6501 __ cmp(length, (u1)144); // 144 = 80 + 64 6502 __ br(Assembler::LT, Process4B); 6503 6504 // In the MIME case, the line length cannot be more than 76 6505 // bytes (see RFC 2045). This is too short a block for SIMD 6506 // to be worthwhile, so we use non-SIMD here. 
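    // Pre-process the first 80 bytes with the scalar Process4B loop below:
    // starting the counter at 79 and subtracting 4 per iteration runs the
    // loop 20 times (80 bytes) and leaves rscratch1 == -1, which lets the
    // cbzw further down tell this pre-pass apart from the final scalar tail,
    // where the counter reaches exactly 0.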
6507 __ movw(rscratch1, 79); 6508 6509 __ BIND(Process4B); 6510 __ ldrw(r14, __ post(src, 4)); 6511 __ ubfxw(r10, r14, 0, 8); 6512 __ ubfxw(r11, r14, 8, 8); 6513 __ ubfxw(r12, r14, 16, 8); 6514 __ ubfxw(r13, r14, 24, 8); 6515 // get the de-code 6516 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 6517 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 6518 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 6519 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 6520 // error detection, 255u indicates an illegal input 6521 __ orrw(r14, r10, r11); 6522 __ orrw(r15, r12, r13); 6523 __ orrw(r14, r14, r15); 6524 __ tbnz(r14, 7, Exit); 6525 // recover the data 6526 __ lslw(r14, r10, 10); 6527 __ bfiw(r14, r11, 4, 6); 6528 __ bfmw(r14, r12, 2, 5); 6529 __ rev16w(r14, r14); 6530 __ bfiw(r13, r12, 6, 2); 6531 __ strh(r14, __ post(dst, 2)); 6532 __ strb(r13, __ post(dst, 1)); 6533 // non-simd loop 6534 __ subsw(rscratch1, rscratch1, 4); 6535 __ br(Assembler::GT, Process4B); 6536 6537 // if exiting from PreProcess80B, rscratch1 == -1; 6538 // otherwise, rscratch1 == 0. 6539 __ cbzw(rscratch1, Exit); 6540 __ sub(length, length, 80); 6541 6542 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 6543 __ cbz(isURL, SIMDEnter); 6544 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 6545 6546 __ BIND(SIMDEnter); 6547 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 6548 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 6549 __ mov(rscratch1, 63); 6550 __ dup(v27, __ T16B, rscratch1); 6551 6552 __ BIND(Process64B); 6553 __ cmp(length, (u1)64); 6554 __ br(Assembler::LT, Process32B); 6555 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 6556 __ sub(length, length, 64); 6557 __ b(Process64B); 6558 6559 __ BIND(Process32B); 6560 __ cmp(length, (u1)32); 6561 __ br(Assembler::LT, SIMDExit); 6562 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 6563 __ sub(length, length, 32); 6564 __ b(Process32B); 6565 6566 __ BIND(SIMDExit); 6567 __ cbz(length, Exit); 6568 __ movw(rscratch1, length); 6569 __ b(Process4B); 6570 6571 __ BIND(Exit); 6572 __ sub(c_rarg0, dst, doff); 6573 6574 __ leave(); 6575 __ ret(lr); 6576 6577 return start; 6578 } 6579 6580 // Support for spin waits. 6581 address generate_spin_wait() { 6582 __ align(CodeEntryAlignment); 6583 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 6584 address start = __ pc(); 6585 6586 __ spin_wait(); 6587 __ ret(lr); 6588 6589 return start; 6590 } 6591 6592 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 6593 6594 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 6595 // 6596 // If LSE is in use, generate LSE versions of all the stubs. The 6597 // non-LSE versions are in atomic_aarch64.S. 6598 6599 // class AtomicStubMark records the entry point of a stub and the 6600 // stub pointer which will point to it. The stub pointer is set to 6601 // the entry point when ~AtomicStubMark() is called, which must be 6602 // after ICache::invalidate_range. This ensures safe publication of 6603 // the generated code. 
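// A sketch of the intended usage pattern (this is what
// generate_atomic_entry_points() below actually does):
//
//   address first_entry = __ pc();
//   AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
//   gen_ldadd_entry(Assembler::word, memory_order_conservative);
//   ...
//   ICache::invalidate_range(first_entry, __ pc() - first_entry);
//   // The AtomicStubMark destructors run when the marks go out of scope,
//   // i.e. only after the invalidate, so the stub pointers never expose
//   // code that is not yet visible to other CPUs.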
6604 class AtomicStubMark { 6605 address _entry_point; 6606 aarch64_atomic_stub_t *_stub; 6607 MacroAssembler *_masm; 6608 public: 6609 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 6610 _masm = masm; 6611 __ align(32); 6612 _entry_point = __ pc(); 6613 _stub = stub; 6614 } 6615 ~AtomicStubMark() { 6616 *_stub = (aarch64_atomic_stub_t)_entry_point; 6617 } 6618 }; 6619 6620 // NB: For memory_order_conservative we need a trailing membar after 6621 // LSE atomic operations but not a leading membar. 6622 // 6623 // We don't need a leading membar because a clause in the Arm ARM 6624 // says: 6625 // 6626 // Barrier-ordered-before 6627 // 6628 // Barrier instructions order prior Memory effects before subsequent 6629 // Memory effects generated by the same Observer. A read or a write 6630 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 6631 // Observer if and only if RW1 appears in program order before RW 2 6632 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 6633 // instruction with both Acquire and Release semantics. 6634 // 6635 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 6636 // and Release semantics, therefore we don't need a leading 6637 // barrier. However, there is no corresponding Barrier-ordered-after 6638 // relationship, therefore we need a trailing membar to prevent a 6639 // later store or load from being reordered with the store in an 6640 // atomic instruction. 6641 // 6642 // This was checked by using the herd7 consistency model simulator 6643 // (http://diy.inria.fr/) with this test case: 6644 // 6645 // AArch64 LseCas 6646 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 6647 // P0 | P1; 6648 // LDR W4, [X2] | MOV W3, #0; 6649 // DMB LD | MOV W4, #1; 6650 // LDR W3, [X1] | CASAL W3, W4, [X1]; 6651 // | DMB ISH; 6652 // | STR W4, [X2]; 6653 // exists 6654 // (0:X3=0 /\ 0:X4=1) 6655 // 6656 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 6657 // with the store to x in P1. Without the DMB in P1 this may happen. 6658 // 6659 // At the time of writing we don't know of any AArch64 hardware that 6660 // reorders stores in this way, but the Reference Manual permits it. 6661 6662 void gen_cas_entry(Assembler::operand_size size, 6663 atomic_memory_order order) { 6664 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 6665 exchange_val = c_rarg2; 6666 bool acquire, release; 6667 switch (order) { 6668 case memory_order_relaxed: 6669 acquire = false; 6670 release = false; 6671 break; 6672 case memory_order_release: 6673 acquire = false; 6674 release = true; 6675 break; 6676 default: 6677 acquire = true; 6678 release = true; 6679 break; 6680 } 6681 __ mov(prev, compare_val); 6682 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 6683 if (order == memory_order_conservative) { 6684 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6685 } 6686 if (size == Assembler::xword) { 6687 __ mov(r0, prev); 6688 } else { 6689 __ movw(r0, prev); 6690 } 6691 __ ret(lr); 6692 } 6693 6694 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 6695 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6696 // If not relaxed, then default to conservative. Relaxed is the only 6697 // case we use enough to be worth specializing. 
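    // In C terms, roughly (a sketch of the intent, not an exact contract;
    // see the memory-ordering discussion above):
    //
    //   relaxed:      prev = __atomic_fetch_add(addr, incr, __ATOMIC_RELAXED);
    //   conservative: prev = __atomic_fetch_add(addr, incr, __ATOMIC_ACQ_REL);
    //                 followed by a trailing full barrier (the membar below).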
6698 if (order == memory_order_relaxed) { 6699 __ ldadd(size, incr, prev, addr); 6700 } else { 6701 __ ldaddal(size, incr, prev, addr); 6702 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6703 } 6704 if (size == Assembler::xword) { 6705 __ mov(r0, prev); 6706 } else { 6707 __ movw(r0, prev); 6708 } 6709 __ ret(lr); 6710 } 6711 6712 void gen_swpal_entry(Assembler::operand_size size) { 6713 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6714 __ swpal(size, incr, prev, addr); 6715 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6716 if (size == Assembler::xword) { 6717 __ mov(r0, prev); 6718 } else { 6719 __ movw(r0, prev); 6720 } 6721 __ ret(lr); 6722 } 6723 6724 void generate_atomic_entry_points() { 6725 if (! UseLSE) { 6726 return; 6727 } 6728 6729 __ align(CodeEntryAlignment); 6730 StubCodeMark mark(this, "StubRoutines", "atomic entry points"); 6731 address first_entry = __ pc(); 6732 6733 // ADD, memory_order_conservative 6734 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 6735 gen_ldadd_entry(Assembler::word, memory_order_conservative); 6736 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 6737 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 6738 6739 // ADD, memory_order_relaxed 6740 AtomicStubMark mark_fetch_add_4_relaxed 6741 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 6742 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 6743 AtomicStubMark mark_fetch_add_8_relaxed 6744 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 6745 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 6746 6747 // XCHG, memory_order_conservative 6748 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 6749 gen_swpal_entry(Assembler::word); 6750 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 6751 gen_swpal_entry(Assembler::xword); 6752 6753 // CAS, memory_order_conservative 6754 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 6755 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 6756 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 6757 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 6758 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 6759 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 6760 6761 // CAS, memory_order_relaxed 6762 AtomicStubMark mark_cmpxchg_1_relaxed 6763 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 6764 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 6765 AtomicStubMark mark_cmpxchg_4_relaxed 6766 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 6767 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 6768 AtomicStubMark mark_cmpxchg_8_relaxed 6769 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 6770 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 6771 6772 AtomicStubMark mark_cmpxchg_4_release 6773 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 6774 gen_cas_entry(MacroAssembler::word, memory_order_release); 6775 AtomicStubMark mark_cmpxchg_8_release 6776 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 6777 gen_cas_entry(MacroAssembler::xword, memory_order_release); 6778 6779 AtomicStubMark mark_cmpxchg_4_seq_cst 6780 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 6781 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 6782 AtomicStubMark mark_cmpxchg_8_seq_cst 6783 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 6784 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 6785 
6786 ICache::invalidate_range(first_entry, __ pc() - first_entry); 6787 } 6788 #endif // LINUX 6789 6790 address generate_cont_thaw(Continuation::thaw_kind kind) { 6791 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 6792 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 6793 6794 address start = __ pc(); 6795 6796 if (return_barrier) { 6797 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 6798 __ mov(sp, rscratch1); 6799 } 6800 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 6801 6802 if (return_barrier) { 6803 // preserve possible return value from a method returning to the return barrier 6804 __ fmovd(rscratch1, v0); 6805 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 6806 } 6807 6808 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 6809 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 6810 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 6811 6812 if (return_barrier) { 6813 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 6814 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 6815 __ fmovd(v0, rscratch1); 6816 } 6817 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 6818 6819 6820 Label thaw_success; 6821 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 6822 __ cbnz(rscratch2, thaw_success); 6823 __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry())); 6824 __ br(rscratch1); 6825 __ bind(thaw_success); 6826 6827 // make room for the thawed frames 6828 __ sub(rscratch1, sp, rscratch2); 6829 __ andr(rscratch1, rscratch1, -16); // align 6830 __ mov(sp, rscratch1); 6831 6832 if (return_barrier) { 6833 // save original return value -- again 6834 __ fmovd(rscratch1, v0); 6835 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 6836 } 6837 6838 // If we want, we can templatize thaw by kind, and have three different entries 6839 __ movw(c_rarg1, (uint32_t)kind); 6840 6841 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 6842 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 6843 6844 if (return_barrier) { 6845 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 6846 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 6847 __ fmovd(v0, rscratch1); 6848 } else { 6849 __ mov(r0, zr); // return 0 (success) from doYield 6850 } 6851 6852 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 6853 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 6854 __ mov(rfp, sp); 6855 6856 if (return_barrier_exception) { 6857 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 6858 __ verify_oop(r0); 6859 __ mov(r19, r0); // save return value contaning the exception oop in callee-saved R19 6860 6861 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 6862 6863 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
6864 // __ reinitialize_ptrue(); 6865 6866 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 6867 6868 __ mov(r1, r0); // the exception handler 6869 __ mov(r0, r19); // restore return value contaning the exception oop 6870 __ verify_oop(r0); 6871 6872 __ leave(); 6873 __ mov(r3, lr); 6874 __ br(r1); // the exception handler 6875 } else { 6876 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 6877 __ leave(); 6878 __ ret(lr); 6879 } 6880 6881 return start; 6882 } 6883 6884 address generate_cont_thaw() { 6885 if (!Continuations::enabled()) return nullptr; 6886 6887 StubCodeMark mark(this, "StubRoutines", "Cont thaw"); 6888 address start = __ pc(); 6889 generate_cont_thaw(Continuation::thaw_top); 6890 return start; 6891 } 6892 6893 address generate_cont_returnBarrier() { 6894 if (!Continuations::enabled()) return nullptr; 6895 6896 // TODO: will probably need multiple return barriers depending on return type 6897 StubCodeMark mark(this, "StubRoutines", "cont return barrier"); 6898 address start = __ pc(); 6899 6900 generate_cont_thaw(Continuation::thaw_return_barrier); 6901 6902 return start; 6903 } 6904 6905 address generate_cont_returnBarrier_exception() { 6906 if (!Continuations::enabled()) return nullptr; 6907 6908 StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler"); 6909 address start = __ pc(); 6910 6911 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 6912 6913 return start; 6914 } 6915 6916 #if INCLUDE_JFR 6917 6918 static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) { 6919 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 6920 __ mov(c_rarg0, thread); 6921 } 6922 6923 // The handle is dereferenced through a load barrier. 6924 static void jfr_epilogue(MacroAssembler* _masm) { 6925 __ reset_last_Java_frame(true); 6926 Label null_jobject; 6927 __ cbz(r0, null_jobject); 6928 DecoratorSet decorators = ACCESS_READ | IN_NATIVE; 6929 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 6930 bs->load_at(_masm, decorators, T_OBJECT, r0, Address(r0, 0), rscratch1, rscratch2); 6931 __ bind(null_jobject); 6932 } 6933 6934 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint. 6935 // It returns a jobject handle to the event writer. 6936 // The handle is dereferenced and the return value is the event writer oop. 
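  // In C-like pseudocode the stub below is approximately (a sketch only; the
  // dereference really goes through the GC's BarrierSetAssembler load, as in
  // jfr_epilogue above):
  //
  //   oop jfr_write_checkpoint() {
  //     set_last_Java_frame(...);
  //     jobject h = JfrIntrinsicSupport::write_checkpoint(thread);
  //     reset_last_Java_frame();
  //     return (h == nullptr) ? nullptr : *(oop*)h;   // barriered load of the handle
  //   }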
6937 static RuntimeStub* generate_jfr_write_checkpoint() { 6938 enum layout { 6939 rbp_off, 6940 rbpH_off, 6941 return_off, 6942 return_off2, 6943 framesize // inclusive of return address 6944 }; 6945 6946 int insts_size = 512; 6947 int locs_size = 64; 6948 CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size); 6949 OopMapSet* oop_maps = new OopMapSet(); 6950 MacroAssembler* masm = new MacroAssembler(&code); 6951 MacroAssembler* _masm = masm; 6952 6953 address start = __ pc(); 6954 __ enter(); 6955 int frame_complete = __ pc() - start; 6956 address the_pc = __ pc(); 6957 jfr_prologue(the_pc, _masm, rthread); 6958 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1); 6959 jfr_epilogue(_masm); 6960 __ leave(); 6961 __ ret(lr); 6962 6963 OopMap* map = new OopMap(framesize, 1); // rfp 6964 oop_maps->add_gc_map(the_pc - start, map); 6965 6966 RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size) 6967 RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete, 6968 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 6969 oop_maps, false); 6970 return stub; 6971 } 6972 6973 #endif // INCLUDE_JFR 6974 6975 // Continuation point for throwing of implicit exceptions that are 6976 // not handled in the current activation. Fabricates an exception 6977 // oop and initiates normal exception dispatching in this 6978 // frame. Since we need to preserve callee-saved values (currently 6979 // only for C2, but done for C1 as well) we need a callee-saved oop 6980 // map and therefore have to make these stubs into RuntimeStubs 6981 // rather than BufferBlobs. If the compiler needs all registers to 6982 // be preserved between the fault point and the exception handler 6983 // then it must assume responsibility for that in 6984 // AbstractCompiler::continuation_for_implicit_null_exception or 6985 // continuation_for_implicit_division_by_zero_exception. All other 6986 // implicit exceptions (e.g., NullPointerException or 6987 // AbstractMethodError on entry) are either at call sites or 6988 // otherwise assume that stack unwinding will be initiated, so 6989 // caller saved registers were assumed volatile in the compiler. 6990 6991 #undef __ 6992 #define __ masm-> 6993 6994 address generate_throw_exception(const char* name, 6995 address runtime_entry, 6996 Register arg1 = noreg, 6997 Register arg2 = noreg) { 6998 // Information about frame layout at time of blocking runtime call. 6999 // Note that we only have to preserve callee-saved registers since 7000 // the compilers are responsible for supplying a continuation point 7001 // if they expect all registers to be preserved. 7002 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0 7003 enum layout { 7004 rfp_off = 0, 7005 rfp_off2, 7006 return_off, 7007 return_off2, 7008 framesize // inclusive of return address 7009 }; 7010 7011 int insts_size = 512; 7012 int locs_size = 64; 7013 7014 CodeBuffer code(name, insts_size, locs_size); 7015 OopMapSet* oop_maps = new OopMapSet(); 7016 MacroAssembler* masm = new MacroAssembler(&code); 7017 7018 address start = __ pc(); 7019 7020 // This is an inlined and slightly modified version of call_VM 7021 // which has the ability to fetch the return PC out of 7022 // thread-local storage and also sets up last_Java_sp slightly 7023 // differently than the real call_VM 7024 7025 __ enter(); // Save FP and LR before call 7026 7027 assert(is_even(framesize/2), "sp not 16-byte aligned"); 7028 7029 // lr and fp are already in place 7030 __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog 7031 7032 int frame_complete = __ pc() - start; 7033 7034 // Set up last_Java_sp and last_Java_fp 7035 address the_pc = __ pc(); 7036 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 7037 7038 // Call runtime 7039 if (arg1 != noreg) { 7040 assert(arg2 != c_rarg1, "clobbered"); 7041 __ mov(c_rarg1, arg1); 7042 } 7043 if (arg2 != noreg) { 7044 __ mov(c_rarg2, arg2); 7045 } 7046 __ mov(c_rarg0, rthread); 7047 BLOCK_COMMENT("call runtime_entry"); 7048 __ mov(rscratch1, runtime_entry); 7049 __ blr(rscratch1); 7050 7051 // Generate oop map 7052 OopMap* map = new OopMap(framesize, 0); 7053 7054 oop_maps->add_gc_map(the_pc - start, map); 7055 7056 __ reset_last_Java_frame(true); 7057 7058 // Reinitialize the ptrue predicate register, in case the external runtime 7059 // call clobbers ptrue reg, as we may return to SVE compiled code. 7060 __ reinitialize_ptrue(); 7061 7062 __ leave(); 7063 7064 // check for pending exceptions 7065 #ifdef ASSERT 7066 Label L; 7067 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 7068 __ cbnz(rscratch1, L); 7069 __ should_not_reach_here(); 7070 __ bind(L); 7071 #endif // ASSERT 7072 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 7073 7074 // codeBlob framesize is in words (not VMRegImpl::slot_size) 7075 RuntimeStub* stub = 7076 RuntimeStub::new_runtime_stub(name, 7077 &code, 7078 frame_complete, 7079 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7080 oop_maps, false); 7081 return stub->entry_point(); 7082 } 7083 7084 class MontgomeryMultiplyGenerator : public MacroAssembler { 7085 7086 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7087 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7088 7089 RegSet _toSave; 7090 bool _squaring; 7091 7092 public: 7093 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7094 : MacroAssembler(as->code()), _squaring(squaring) { 7095 7096 // Register allocation 7097 7098 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7099 Pa_base = *regs; // Argument registers 7100 if (squaring) 7101 Pb_base = Pa_base; 7102 else 7103 Pb_base = *++regs; 7104 Pn_base = *++regs; 7105 Rlen= *++regs; 7106 inv = *++regs; 7107 Pm_base = *++regs; 7108 7109 // Working registers: 7110 Ra = *++regs; // The current digit of a, b, n, and m. 7111 Rb = *++regs; 7112 Rm = *++regs; 7113 Rn = *++regs; 7114 7115 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 
7116 Pb = *++regs; 7117 Pm = *++regs; 7118 Pn = *++regs; 7119 7120 t0 = *++regs; // Three registers which form a 7121 t1 = *++regs; // triple-precision accumuator. 7122 t2 = *++regs; 7123 7124 Ri = *++regs; // Inner and outer loop indexes. 7125 Rj = *++regs; 7126 7127 Rhi_ab = *++regs; // Product registers: low and high parts 7128 Rlo_ab = *++regs; // of a*b and m*n. 7129 Rhi_mn = *++regs; 7130 Rlo_mn = *++regs; 7131 7132 // r19 and up are callee-saved. 7133 _toSave = RegSet::range(r19, *regs) + Pm_base; 7134 } 7135 7136 private: 7137 void save_regs() { 7138 push(_toSave, sp); 7139 } 7140 7141 void restore_regs() { 7142 pop(_toSave, sp); 7143 } 7144 7145 template <typename T> 7146 void unroll_2(Register count, T block) { 7147 Label loop, end, odd; 7148 tbnz(count, 0, odd); 7149 cbz(count, end); 7150 align(16); 7151 bind(loop); 7152 (this->*block)(); 7153 bind(odd); 7154 (this->*block)(); 7155 subs(count, count, 2); 7156 br(Assembler::GT, loop); 7157 bind(end); 7158 } 7159 7160 template <typename T> 7161 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 7162 Label loop, end, odd; 7163 tbnz(count, 0, odd); 7164 cbz(count, end); 7165 align(16); 7166 bind(loop); 7167 (this->*block)(d, s, tmp); 7168 bind(odd); 7169 (this->*block)(d, s, tmp); 7170 subs(count, count, 2); 7171 br(Assembler::GT, loop); 7172 bind(end); 7173 } 7174 7175 void pre1(RegisterOrConstant i) { 7176 block_comment("pre1"); 7177 // Pa = Pa_base; 7178 // Pb = Pb_base + i; 7179 // Pm = Pm_base; 7180 // Pn = Pn_base + i; 7181 // Ra = *Pa; 7182 // Rb = *Pb; 7183 // Rm = *Pm; 7184 // Rn = *Pn; 7185 ldr(Ra, Address(Pa_base)); 7186 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7187 ldr(Rm, Address(Pm_base)); 7188 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7189 lea(Pa, Address(Pa_base)); 7190 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7191 lea(Pm, Address(Pm_base)); 7192 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7193 7194 // Zero the m*n result. 7195 mov(Rhi_mn, zr); 7196 mov(Rlo_mn, zr); 7197 } 7198 7199 // The core multiply-accumulate step of a Montgomery 7200 // multiplication. The idea is to schedule operations as a 7201 // pipeline so that instructions with long latencies (loads and 7202 // multiplies) have time to complete before their results are 7203 // used. This most benefits in-order implementations of the 7204 // architecture but out-of-order ones also benefit. 7205 void step() { 7206 block_comment("step"); 7207 // MACC(Ra, Rb, t0, t1, t2); 7208 // Ra = *++Pa; 7209 // Rb = *--Pb; 7210 umulh(Rhi_ab, Ra, Rb); 7211 mul(Rlo_ab, Ra, Rb); 7212 ldr(Ra, pre(Pa, wordSize)); 7213 ldr(Rb, pre(Pb, -wordSize)); 7214 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 7215 // previous iteration. 
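    // For reference, what the MACC comments in this class denote, written as a
    // C sketch with a hypothetical 128-bit intermediate (the generated code
    // uses the umulh/mul pairs plus acc() instead):
    //
    //   // MACC(A, B, t0, t1, t2):
    //   unsigned __int128 p = (unsigned __int128)A * B;
    //   // add p into the 192-bit accumulator t2:t1:t0, propagating carries
    //
    // MACC2, used in the squaring pseudocode further down, accumulates the
    // same product twice.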
7216 // MACC(Rm, Rn, t0, t1, t2); 7217 // Rm = *++Pm; 7218 // Rn = *--Pn; 7219 umulh(Rhi_mn, Rm, Rn); 7220 mul(Rlo_mn, Rm, Rn); 7221 ldr(Rm, pre(Pm, wordSize)); 7222 ldr(Rn, pre(Pn, -wordSize)); 7223 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7224 } 7225 7226 void post1() { 7227 block_comment("post1"); 7228 7229 // MACC(Ra, Rb, t0, t1, t2); 7230 // Ra = *++Pa; 7231 // Rb = *--Pb; 7232 umulh(Rhi_ab, Ra, Rb); 7233 mul(Rlo_ab, Ra, Rb); 7234 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7235 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7236 7237 // *Pm = Rm = t0 * inv; 7238 mul(Rm, t0, inv); 7239 str(Rm, Address(Pm)); 7240 7241 // MACC(Rm, Rn, t0, t1, t2); 7242 // t0 = t1; t1 = t2; t2 = 0; 7243 umulh(Rhi_mn, Rm, Rn); 7244 7245 #ifndef PRODUCT 7246 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7247 { 7248 mul(Rlo_mn, Rm, Rn); 7249 add(Rlo_mn, t0, Rlo_mn); 7250 Label ok; 7251 cbz(Rlo_mn, ok); { 7252 stop("broken Montgomery multiply"); 7253 } bind(ok); 7254 } 7255 #endif 7256 // We have very carefully set things up so that 7257 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7258 // the lower half of Rm * Rn because we know the result already: 7259 // it must be -t0. t0 + (-t0) must generate a carry iff 7260 // t0 != 0. So, rather than do a mul and an adds we just set 7261 // the carry flag iff t0 is nonzero. 7262 // 7263 // mul(Rlo_mn, Rm, Rn); 7264 // adds(zr, t0, Rlo_mn); 7265 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7266 adcs(t0, t1, Rhi_mn); 7267 adc(t1, t2, zr); 7268 mov(t2, zr); 7269 } 7270 7271 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 7272 block_comment("pre2"); 7273 // Pa = Pa_base + i-len; 7274 // Pb = Pb_base + len; 7275 // Pm = Pm_base + i-len; 7276 // Pn = Pn_base + len; 7277 7278 if (i.is_register()) { 7279 sub(Rj, i.as_register(), len); 7280 } else { 7281 mov(Rj, i.as_constant()); 7282 sub(Rj, Rj, len); 7283 } 7284 // Rj == i-len 7285 7286 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 7287 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 7288 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7289 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 7290 7291 // Ra = *++Pa; 7292 // Rb = *--Pb; 7293 // Rm = *++Pm; 7294 // Rn = *--Pn; 7295 ldr(Ra, pre(Pa, wordSize)); 7296 ldr(Rb, pre(Pb, -wordSize)); 7297 ldr(Rm, pre(Pm, wordSize)); 7298 ldr(Rn, pre(Pn, -wordSize)); 7299 7300 mov(Rhi_mn, zr); 7301 mov(Rlo_mn, zr); 7302 } 7303 7304 void post2(RegisterOrConstant i, RegisterOrConstant len) { 7305 block_comment("post2"); 7306 if (i.is_constant()) { 7307 mov(Rj, i.as_constant()-len.as_constant()); 7308 } else { 7309 sub(Rj, i.as_register(), len); 7310 } 7311 7312 adds(t0, t0, Rlo_mn); // The pending m*n, low part 7313 7314 // As soon as we know the least significant digit of our result, 7315 // store it. 7316 // Pm_base[i-len] = t0; 7317 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7318 7319 // t0 = t1; t1 = t2; t2 = 0; 7320 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 7321 adc(t1, t2, zr); 7322 mov(t2, zr); 7323 } 7324 7325 // A carry in t0 after Montgomery multiplication means that we 7326 // should subtract multiples of n from our result in m. We'll 7327 // keep doing that until there is no carry. 
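  // In C, the sub() helper referenced by the commented pseudocode later in
  // this file ("t0 = sub(Pm_base, Pn_base, t0, len)") is approximately the
  // following sketch; normalize() below generates exactly this kind of
  // borrow-propagating loop:
  //
  //   static julong sub(julong Pm_base[], julong Pn_base[], julong t0, int len) {
  //     julong borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       julong n = Pn_base[i] + borrow;             // may wrap when borrow == 1
  //       borrow = (n < borrow) || (Pm_base[i] < n);  // borrow out of this digit
  //       Pm_base[i] = Pm_base[i] - n;
  //     }
  //     return t0 - borrow;                           // the remaining carry
  //   }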
7328 void normalize(RegisterOrConstant len) { 7329 block_comment("normalize"); 7330 // while (t0) 7331 // t0 = sub(Pm_base, Pn_base, t0, len); 7332 Label loop, post, again; 7333 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 7334 cbz(t0, post); { 7335 bind(again); { 7336 mov(i, zr); 7337 mov(cnt, len); 7338 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7339 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7340 subs(zr, zr, zr); // set carry flag, i.e. no borrow 7341 align(16); 7342 bind(loop); { 7343 sbcs(Rm, Rm, Rn); 7344 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7345 add(i, i, 1); 7346 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7347 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7348 sub(cnt, cnt, 1); 7349 } cbnz(cnt, loop); 7350 sbc(t0, t0, zr); 7351 } cbnz(t0, again); 7352 } bind(post); 7353 } 7354 7355 // Move memory at s to d, reversing words. 7356 // Increments d to end of copied memory 7357 // Destroys tmp1, tmp2 7358 // Preserves len 7359 // Leaves s pointing to the address which was in d at start 7360 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 7361 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 7362 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 7363 7364 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 7365 mov(tmp1, len); 7366 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 7367 sub(s, d, len, ext::uxtw, LogBytesPerWord); 7368 } 7369 // where 7370 void reverse1(Register d, Register s, Register tmp) { 7371 ldr(tmp, pre(s, -wordSize)); 7372 ror(tmp, tmp, 32); 7373 str(tmp, post(d, wordSize)); 7374 } 7375 7376 void step_squaring() { 7377 // An extra ACC 7378 step(); 7379 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7380 } 7381 7382 void last_squaring(RegisterOrConstant i) { 7383 Label dont; 7384 // if ((i & 1) == 0) { 7385 tbnz(i.as_register(), 0, dont); { 7386 // MACC(Ra, Rb, t0, t1, t2); 7387 // Ra = *++Pa; 7388 // Rb = *--Pb; 7389 umulh(Rhi_ab, Ra, Rb); 7390 mul(Rlo_ab, Ra, Rb); 7391 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7392 } bind(dont); 7393 } 7394 7395 void extra_step_squaring() { 7396 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7397 7398 // MACC(Rm, Rn, t0, t1, t2); 7399 // Rm = *++Pm; 7400 // Rn = *--Pn; 7401 umulh(Rhi_mn, Rm, Rn); 7402 mul(Rlo_mn, Rm, Rn); 7403 ldr(Rm, pre(Pm, wordSize)); 7404 ldr(Rn, pre(Pn, -wordSize)); 7405 } 7406 7407 void post1_squaring() { 7408 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7409 7410 // *Pm = Rm = t0 * inv; 7411 mul(Rm, t0, inv); 7412 str(Rm, Address(Pm)); 7413 7414 // MACC(Rm, Rn, t0, t1, t2); 7415 // t0 = t1; t1 = t2; t2 = 0; 7416 umulh(Rhi_mn, Rm, Rn); 7417 7418 #ifndef PRODUCT 7419 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7420 { 7421 mul(Rlo_mn, Rm, Rn); 7422 add(Rlo_mn, t0, Rlo_mn); 7423 Label ok; 7424 cbz(Rlo_mn, ok); { 7425 stop("broken Montgomery multiply"); 7426 } bind(ok); 7427 } 7428 #endif 7429 // We have very carefully set things up so that 7430 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7431 // the lower half of Rm * Rn because we know the result already: 7432 // it must be -t0. t0 + (-t0) must generate a carry iff 7433 // t0 != 0. So, rather than do a mul and an adds we just set 7434 // the carry flag iff t0 is nonzero. 
7435 // 7436 // mul(Rlo_mn, Rm, Rn); 7437 // adds(zr, t0, Rlo_mn); 7438 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7439 adcs(t0, t1, Rhi_mn); 7440 adc(t1, t2, zr); 7441 mov(t2, zr); 7442 } 7443 7444 void acc(Register Rhi, Register Rlo, 7445 Register t0, Register t1, Register t2) { 7446 adds(t0, t0, Rlo); 7447 adcs(t1, t1, Rhi); 7448 adc(t2, t2, zr); 7449 } 7450 7451 public: 7452 /** 7453 * Fast Montgomery multiplication. The derivation of the 7454 * algorithm is in A Cryptographic Library for the Motorola 7455 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 7456 * 7457 * Arguments: 7458 * 7459 * Inputs for multiplication: 7460 * c_rarg0 - int array elements a 7461 * c_rarg1 - int array elements b 7462 * c_rarg2 - int array elements n (the modulus) 7463 * c_rarg3 - int length 7464 * c_rarg4 - int inv 7465 * c_rarg5 - int array elements m (the result) 7466 * 7467 * Inputs for squaring: 7468 * c_rarg0 - int array elements a 7469 * c_rarg1 - int array elements n (the modulus) 7470 * c_rarg2 - int length 7471 * c_rarg3 - int inv 7472 * c_rarg4 - int array elements m (the result) 7473 * 7474 */ 7475 address generate_multiply() { 7476 Label argh, nothing; 7477 bind(argh); 7478 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7479 7480 align(CodeEntryAlignment); 7481 address entry = pc(); 7482 7483 cbzw(Rlen, nothing); 7484 7485 enter(); 7486 7487 // Make room. 7488 cmpw(Rlen, 512); 7489 br(Assembler::HI, argh); 7490 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7491 andr(sp, Ra, -2 * wordSize); 7492 7493 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7494 7495 { 7496 // Copy input args, reversing as we go. We use Ra as a 7497 // temporary variable. 7498 reverse(Ra, Pa_base, Rlen, t0, t1); 7499 if (!_squaring) 7500 reverse(Ra, Pb_base, Rlen, t0, t1); 7501 reverse(Ra, Pn_base, Rlen, t0, t1); 7502 } 7503 7504 // Push all call-saved registers and also Pm_base which we'll need 7505 // at the end. 
7506 save_regs(); 7507 7508 #ifndef PRODUCT 7509 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 7510 { 7511 ldr(Rn, Address(Pn_base, 0)); 7512 mul(Rlo_mn, Rn, inv); 7513 subs(zr, Rlo_mn, -1); 7514 Label ok; 7515 br(EQ, ok); { 7516 stop("broken inverse in Montgomery multiply"); 7517 } bind(ok); 7518 } 7519 #endif 7520 7521 mov(Pm_base, Ra); 7522 7523 mov(t0, zr); 7524 mov(t1, zr); 7525 mov(t2, zr); 7526 7527 block_comment("for (int i = 0; i < len; i++) {"); 7528 mov(Ri, zr); { 7529 Label loop, end; 7530 cmpw(Ri, Rlen); 7531 br(Assembler::GE, end); 7532 7533 bind(loop); 7534 pre1(Ri); 7535 7536 block_comment(" for (j = i; j; j--) {"); { 7537 movw(Rj, Ri); 7538 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7539 } block_comment(" } // j"); 7540 7541 post1(); 7542 addw(Ri, Ri, 1); 7543 cmpw(Ri, Rlen); 7544 br(Assembler::LT, loop); 7545 bind(end); 7546 block_comment("} // i"); 7547 } 7548 7549 block_comment("for (int i = len; i < 2*len; i++) {"); 7550 mov(Ri, Rlen); { 7551 Label loop, end; 7552 cmpw(Ri, Rlen, Assembler::LSL, 1); 7553 br(Assembler::GE, end); 7554 7555 bind(loop); 7556 pre2(Ri, Rlen); 7557 7558 block_comment(" for (j = len*2-i-1; j; j--) {"); { 7559 lslw(Rj, Rlen, 1); 7560 subw(Rj, Rj, Ri); 7561 subw(Rj, Rj, 1); 7562 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7563 } block_comment(" } // j"); 7564 7565 post2(Ri, Rlen); 7566 addw(Ri, Ri, 1); 7567 cmpw(Ri, Rlen, Assembler::LSL, 1); 7568 br(Assembler::LT, loop); 7569 bind(end); 7570 } 7571 block_comment("} // i"); 7572 7573 normalize(Rlen); 7574 7575 mov(Ra, Pm_base); // Save Pm_base in Ra 7576 restore_regs(); // Restore caller's Pm_base 7577 7578 // Copy our result into caller's Pm_base 7579 reverse(Pm_base, Ra, Rlen, t0, t1); 7580 7581 leave(); 7582 bind(nothing); 7583 ret(lr); 7584 7585 return entry; 7586 } 7587 // In C, approximately: 7588 7589 // void 7590 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 7591 // julong Pn_base[], julong Pm_base[], 7592 // julong inv, int len) { 7593 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7594 // julong *Pa, *Pb, *Pn, *Pm; 7595 // julong Ra, Rb, Rn, Rm; 7596 7597 // int i; 7598 7599 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7600 7601 // for (i = 0; i < len; i++) { 7602 // int j; 7603 7604 // Pa = Pa_base; 7605 // Pb = Pb_base + i; 7606 // Pm = Pm_base; 7607 // Pn = Pn_base + i; 7608 7609 // Ra = *Pa; 7610 // Rb = *Pb; 7611 // Rm = *Pm; 7612 // Rn = *Pn; 7613 7614 // int iters = i; 7615 // for (j = 0; iters--; j++) { 7616 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7617 // MACC(Ra, Rb, t0, t1, t2); 7618 // Ra = *++Pa; 7619 // Rb = *--Pb; 7620 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7621 // MACC(Rm, Rn, t0, t1, t2); 7622 // Rm = *++Pm; 7623 // Rn = *--Pn; 7624 // } 7625 7626 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 7627 // MACC(Ra, Rb, t0, t1, t2); 7628 // *Pm = Rm = t0 * inv; 7629 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7630 // MACC(Rm, Rn, t0, t1, t2); 7631 7632 // assert(t0 == 0, "broken Montgomery multiply"); 7633 7634 // t0 = t1; t1 = t2; t2 = 0; 7635 // } 7636 7637 // for (i = len; i < 2*len; i++) { 7638 // int j; 7639 7640 // Pa = Pa_base + i-len; 7641 // Pb = Pb_base + len; 7642 // Pm = Pm_base + i-len; 7643 // Pn = Pn_base + len; 7644 7645 // Ra = *++Pa; 7646 // Rb = *--Pb; 7647 // Rm = *++Pm; 7648 // Rn = *--Pn; 7649 7650 // int iters = len*2-i-1; 7651 // for (j = i-len+1; iters--; j++) { 7652 // 
assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7653 // MACC(Ra, Rb, t0, t1, t2); 7654 // Ra = *++Pa; 7655 // Rb = *--Pb; 7656 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7657 // MACC(Rm, Rn, t0, t1, t2); 7658 // Rm = *++Pm; 7659 // Rn = *--Pn; 7660 // } 7661 7662 // Pm_base[i-len] = t0; 7663 // t0 = t1; t1 = t2; t2 = 0; 7664 // } 7665 7666 // while (t0) 7667 // t0 = sub(Pm_base, Pn_base, t0, len); 7668 // } 7669 7670 /** 7671 * Fast Montgomery squaring. This uses asymptotically 25% fewer 7672 * multiplies than Montgomery multiplication so it should be up to 7673 * 25% faster. However, its loop control is more complex and it 7674 * may actually run slower on some machines. 7675 * 7676 * Arguments: 7677 * 7678 * Inputs: 7679 * c_rarg0 - int array elements a 7680 * c_rarg1 - int array elements n (the modulus) 7681 * c_rarg2 - int length 7682 * c_rarg3 - int inv 7683 * c_rarg4 - int array elements m (the result) 7684 * 7685 */ 7686 address generate_square() { 7687 Label argh; 7688 bind(argh); 7689 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7690 7691 align(CodeEntryAlignment); 7692 address entry = pc(); 7693 7694 enter(); 7695 7696 // Make room. 7697 cmpw(Rlen, 512); 7698 br(Assembler::HI, argh); 7699 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7700 andr(sp, Ra, -2 * wordSize); 7701 7702 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7703 7704 { 7705 // Copy input args, reversing as we go. We use Ra as a 7706 // temporary variable. 7707 reverse(Ra, Pa_base, Rlen, t0, t1); 7708 reverse(Ra, Pn_base, Rlen, t0, t1); 7709 } 7710 7711 // Push all call-saved registers and also Pm_base which we'll need 7712 // at the end. 7713 save_regs(); 7714 7715 mov(Pm_base, Ra); 7716 7717 mov(t0, zr); 7718 mov(t1, zr); 7719 mov(t2, zr); 7720 7721 block_comment("for (int i = 0; i < len; i++) {"); 7722 mov(Ri, zr); { 7723 Label loop, end; 7724 bind(loop); 7725 cmp(Ri, Rlen); 7726 br(Assembler::GE, end); 7727 7728 pre1(Ri); 7729 7730 block_comment("for (j = (i+1)/2; j; j--) {"); { 7731 add(Rj, Ri, 1); 7732 lsr(Rj, Rj, 1); 7733 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 7734 } block_comment(" } // j"); 7735 7736 last_squaring(Ri); 7737 7738 block_comment(" for (j = i/2; j; j--) {"); { 7739 lsr(Rj, Ri, 1); 7740 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 7741 } block_comment(" } // j"); 7742 7743 post1_squaring(); 7744 add(Ri, Ri, 1); 7745 cmp(Ri, Rlen); 7746 br(Assembler::LT, loop); 7747 7748 bind(end); 7749 block_comment("} // i"); 7750 } 7751 7752 block_comment("for (int i = len; i < 2*len; i++) {"); 7753 mov(Ri, Rlen); { 7754 Label loop, end; 7755 bind(loop); 7756 cmp(Ri, Rlen, Assembler::LSL, 1); 7757 br(Assembler::GE, end); 7758 7759 pre2(Ri, Rlen); 7760 7761 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 7762 lsl(Rj, Rlen, 1); 7763 sub(Rj, Rj, Ri); 7764 sub(Rj, Rj, 1); 7765 lsr(Rj, Rj, 1); 7766 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 7767 } block_comment(" } // j"); 7768 7769 last_squaring(Ri); 7770 7771 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 7772 lsl(Rj, Rlen, 1); 7773 sub(Rj, Rj, Ri); 7774 lsr(Rj, Rj, 1); 7775 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 7776 } block_comment(" } // j"); 7777 7778 post2(Ri, Rlen); 7779 add(Ri, Ri, 1); 7780 cmp(Ri, Rlen, Assembler::LSL, 1); 7781 7782 br(Assembler::LT, loop); 7783 bind(end); 7784 block_comment("} // i"); 7785 } 7786 7787 normalize(Rlen); 7788 7789 mov(Ra, Pm_base); // Save Pm_base in Ra 7790 
restore_regs(); // Restore caller's Pm_base 7791 7792 // Copy our result into caller's Pm_base 7793 reverse(Pm_base, Ra, Rlen, t0, t1); 7794 7795 leave(); 7796 ret(lr); 7797 7798 return entry; 7799 } 7800 // In C, approximately: 7801 7802 // void 7803 // montgomery_square(julong Pa_base[], julong Pn_base[], 7804 // julong Pm_base[], julong inv, int len) { 7805 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7806 // julong *Pa, *Pb, *Pn, *Pm; 7807 // julong Ra, Rb, Rn, Rm; 7808 7809 // int i; 7810 7811 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7812 7813 // for (i = 0; i < len; i++) { 7814 // int j; 7815 7816 // Pa = Pa_base; 7817 // Pb = Pa_base + i; 7818 // Pm = Pm_base; 7819 // Pn = Pn_base + i; 7820 7821 // Ra = *Pa; 7822 // Rb = *Pb; 7823 // Rm = *Pm; 7824 // Rn = *Pn; 7825 7826 // int iters = (i+1)/2; 7827 // for (j = 0; iters--; j++) { 7828 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 7829 // MACC2(Ra, Rb, t0, t1, t2); 7830 // Ra = *++Pa; 7831 // Rb = *--Pb; 7832 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7833 // MACC(Rm, Rn, t0, t1, t2); 7834 // Rm = *++Pm; 7835 // Rn = *--Pn; 7836 // } 7837 // if ((i & 1) == 0) { 7838 // assert(Ra == Pa_base[j], "must be"); 7839 // MACC(Ra, Ra, t0, t1, t2); 7840 // } 7841 // iters = i/2; 7842 // assert(iters == i-j, "must be"); 7843 // for (; iters--; j++) { 7844 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7845 // MACC(Rm, Rn, t0, t1, t2); 7846 // Rm = *++Pm; 7847 // Rn = *--Pn; 7848 // } 7849 7850 // *Pm = Rm = t0 * inv; 7851 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7852 // MACC(Rm, Rn, t0, t1, t2); 7853 7854 // assert(t0 == 0, "broken Montgomery multiply"); 7855 7856 // t0 = t1; t1 = t2; t2 = 0; 7857 // } 7858 7859 // for (i = len; i < 2*len; i++) { 7860 // int start = i-len+1; 7861 // int end = start + (len - start)/2; 7862 // int j; 7863 7864 // Pa = Pa_base + i-len; 7865 // Pb = Pa_base + len; 7866 // Pm = Pm_base + i-len; 7867 // Pn = Pn_base + len; 7868 7869 // Ra = *++Pa; 7870 // Rb = *--Pb; 7871 // Rm = *++Pm; 7872 // Rn = *--Pn; 7873 7874 // int iters = (2*len-i-1)/2; 7875 // assert(iters == end-start, "must be"); 7876 // for (j = start; iters--; j++) { 7877 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 7878 // MACC2(Ra, Rb, t0, t1, t2); 7879 // Ra = *++Pa; 7880 // Rb = *--Pb; 7881 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7882 // MACC(Rm, Rn, t0, t1, t2); 7883 // Rm = *++Pm; 7884 // Rn = *--Pn; 7885 // } 7886 // if ((i & 1) == 0) { 7887 // assert(Ra == Pa_base[j], "must be"); 7888 // MACC(Ra, Ra, t0, t1, t2); 7889 // } 7890 // iters = (2*len-i)/2; 7891 // assert(iters == len-j, "must be"); 7892 // for (; iters--; j++) { 7893 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7894 // MACC(Rm, Rn, t0, t1, t2); 7895 // Rm = *++Pm; 7896 // Rn = *--Pn; 7897 // } 7898 // Pm_base[i-len] = t0; 7899 // t0 = t1; t1 = t2; t2 = 0; 7900 // } 7901 7902 // while (t0) 7903 // t0 = sub(Pm_base, Pn_base, t0, len); 7904 // } 7905 }; 7906 7907 7908 // Initialization 7909 void generate_initial() { 7910 // Generate initial stubs and initializes the entry points 7911 7912 // entry points that exist in all platforms Note: This is code 7913 // that could be shared among different platforms - however the 7914 // benefit seems to be smaller than the disadvantage of having a 7915 // much more complicated generator structure. See also comment in 7916 // stubRoutines.hpp. 
7917 7918 StubRoutines::_forward_exception_entry = generate_forward_exception(); 7919 7920 StubRoutines::_call_stub_entry = 7921 generate_call_stub(StubRoutines::_call_stub_return_address); 7922 7923 // is referenced by megamorphic call 7924 StubRoutines::_catch_exception_entry = generate_catch_exception(); 7925 7926 // Build this early so it's available for the interpreter. 7927 StubRoutines::_throw_StackOverflowError_entry = 7928 generate_throw_exception("StackOverflowError throw_exception", 7929 CAST_FROM_FN_PTR(address, 7930 SharedRuntime::throw_StackOverflowError)); 7931 StubRoutines::_throw_delayed_StackOverflowError_entry = 7932 generate_throw_exception("delayed StackOverflowError throw_exception", 7933 CAST_FROM_FN_PTR(address, 7934 SharedRuntime::throw_delayed_StackOverflowError)); 7935 if (UseCRC32Intrinsics) { 7936 // set table address before stub generation which use it 7937 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 7938 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 7939 } 7940 7941 if (UseCRC32CIntrinsics) { 7942 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 7943 } 7944 7945 // Disabled until JDK-8210858 is fixed 7946 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 7947 // StubRoutines::_dlog = generate_dlog(); 7948 // } 7949 7950 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 7951 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 7952 } 7953 7954 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 7955 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 7956 } 7957 } 7958 7959 void generate_phase1() { 7960 // Continuation stubs: 7961 StubRoutines::_cont_thaw = generate_cont_thaw(); 7962 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 7963 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 7964 7965 JFR_ONLY(StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();) 7966 JFR_ONLY(StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();) 7967 } 7968 7969 void generate_all() { 7970 // support for verify_oop (must happen after universe_init) 7971 if (VerifyOops) { 7972 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 7973 } 7974 StubRoutines::_throw_AbstractMethodError_entry = 7975 generate_throw_exception("AbstractMethodError throw_exception", 7976 CAST_FROM_FN_PTR(address, 7977 SharedRuntime:: 7978 throw_AbstractMethodError)); 7979 7980 StubRoutines::_throw_IncompatibleClassChangeError_entry = 7981 generate_throw_exception("IncompatibleClassChangeError throw_exception", 7982 CAST_FROM_FN_PTR(address, 7983 SharedRuntime:: 7984 throw_IncompatibleClassChangeError)); 7985 7986 StubRoutines::_throw_NullPointerException_at_call_entry = 7987 generate_throw_exception("NullPointerException at call throw_exception", 7988 CAST_FROM_FN_PTR(address, 7989 SharedRuntime:: 7990 throw_NullPointerException_at_call)); 7991 7992 if (UseSVE == 0) { 7993 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices"); 7994 } 7995 7996 // arraycopy stubs used by compilers 7997 generate_arraycopy_stubs(); 7998 7999 // countPositives stub for large arrays. 8000 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long); 8001 8002 // array equals stub for large arrays. 
8003 if (!UseSimpleArrayEquals) { 8004 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 8005 } 8006 8007 generate_compare_long_strings(); 8008 8009 generate_string_indexof_stubs(); 8010 8011 // byte_array_inflate stub for large arrays. 8012 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 8013 8014 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8015 if (bs_nm != NULL) { 8016 StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier(); 8017 } 8018 #ifdef COMPILER2 8019 if (UseMultiplyToLenIntrinsic) { 8020 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 8021 } 8022 8023 if (UseSquareToLenIntrinsic) { 8024 StubRoutines::_squareToLen = generate_squareToLen(); 8025 } 8026 8027 if (UseMulAddIntrinsic) { 8028 StubRoutines::_mulAdd = generate_mulAdd(); 8029 } 8030 8031 if (UseSIMDForBigIntegerShiftIntrinsics) { 8032 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 8033 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 8034 } 8035 8036 if (UseMontgomeryMultiplyIntrinsic) { 8037 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 8038 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 8039 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 8040 } 8041 8042 if (UseMontgomerySquareIntrinsic) { 8043 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 8044 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 8045 // We use generate_multiply() rather than generate_square() 8046 // because it's faster for the sizes of modulus we care about. 8047 StubRoutines::_montgomerySquare = g.generate_multiply(); 8048 } 8049 #endif // COMPILER2 8050 8051 if (UseChaCha20Intrinsics) { 8052 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar(); 8053 } 8054 8055 if (UseBASE64Intrinsics) { 8056 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 8057 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 8058 } 8059 8060 // data cache line writeback 8061 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 8062 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 8063 8064 if (UseAESIntrinsics) { 8065 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 8066 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 8067 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 8068 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 8069 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 8070 } 8071 if (UseGHASHIntrinsics) { 8072 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 8073 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 8074 } 8075 if (UseAESIntrinsics && UseGHASHIntrinsics) { 8076 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 8077 } 8078 8079 if (UseMD5Intrinsics) { 8080 StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress"); 8081 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB"); 8082 } 8083 if (UseSHA1Intrinsics) { 8084 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 8085 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 8086 } 
8087 if (UseSHA256Intrinsics) { 8088 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 8089 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 8090 } 8091 if (UseSHA512Intrinsics) { 8092 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); 8093 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); 8094 } 8095 if (UseSHA3Intrinsics) { 8096 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress"); 8097 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB"); 8098 } 8099 8100 // generate Adler32 intrinsics code 8101 if (UseAdler32Intrinsics) { 8102 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 8103 } 8104 8105 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 8106 8107 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 8108 8109 generate_atomic_entry_points(); 8110 8111 #endif // LINUX 8112 8113 StubRoutines::aarch64::set_completed(); 8114 } 8115 8116 public: 8117 StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) { 8118 if (phase == 0) { 8119 generate_initial(); 8120 } else if (phase == 1) { 8121 generate_phase1(); // stubs that must be available for the interpreter 8122 } else { 8123 generate_all(); 8124 } 8125 } 8126 }; // end class declaration 8127 8128 #define UCM_TABLE_MAX_ENTRIES 8 8129 void StubGenerator_generate(CodeBuffer* code, int phase) { 8130 if (UnsafeCopyMemory::_table == NULL) { 8131 UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES); 8132 } 8133 StubGenerator g(code, phase); 8134 } 8135 8136 8137 #if defined (LINUX) 8138 8139 // Define pointers to atomic stubs and initialize them to point to the 8140 // code in atomic_aarch64.S. 8141 8142 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 8143 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 8144 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 8145 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 8146 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 8147 8148 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 8149 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 8150 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 8151 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 8152 DEFAULT_ATOMIC_OP(xchg, 4, ) 8153 DEFAULT_ATOMIC_OP(xchg, 8, ) 8154 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 8155 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 8156 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 8157 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 8158 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 8159 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 8160 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 8161 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 8162 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 8163 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 8164 8165 #undef DEFAULT_ATOMIC_OP 8166 8167 #endif // LINUX
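// For reference (a sketch, not part of the build): each DEFAULT_ATOMIC_OP use
// above expands to the declaration of the assembly fallback plus the pointer
// that generate_atomic_entry_points() rebinds. For example,
// DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) expands to
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_relaxed_impl
//     = aarch64_atomic_fetch_add_4_relaxed_default_impl;
//
// Once the LSE stubs have been generated and published, a call through the
// pointer looks like (illustration only; for fetch_add the generated code uses
// ptr and arg1 -- the increment -- and ignores arg2):
//
//   volatile uint32_t v = 0;
//   uint64_t old = aarch64_atomic_fetch_add_4_relaxed_impl(&v, 1, 0);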