1 /* 2 * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "asm/register.hpp" 30 #include "atomic_aarch64.hpp" 31 #include "compiler/oopMap.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/barrierSetAssembler.hpp" 34 #include "gc/shared/gc_globals.hpp" 35 #include "gc/shared/tlab_globals.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "memory/universe.hpp" 38 #include "nativeInst_aarch64.hpp" 39 #include "oops/instanceOop.hpp" 40 #include "oops/method.hpp" 41 #include "oops/objArrayKlass.hpp" 42 #include "oops/oop.inline.hpp" 43 #include "prims/methodHandles.hpp" 44 #include "runtime/atomic.hpp" 45 #include "runtime/continuation.hpp" 46 #include "runtime/continuationEntry.inline.hpp" 47 #include "runtime/frame.inline.hpp" 48 #include "runtime/handles.inline.hpp" 49 #include "runtime/javaThread.hpp" 50 #include "runtime/sharedRuntime.hpp" 51 #include "runtime/stubCodeGenerator.hpp" 52 #include "runtime/stubRoutines.hpp" 53 #include "utilities/align.hpp" 54 #include "utilities/globalDefinitions.hpp" 55 #include "utilities/powerOfTwo.hpp" 56 #ifdef COMPILER2 57 #include "opto/runtime.hpp" 58 #endif 59 #if INCLUDE_ZGC 60 #include "gc/z/zThreadLocalData.hpp" 61 #endif 62 63 // Declaration and definition of StubGenerator (no .hpp file). 
64 // For a more detailed description of the stub routine structure 65 // see the comment in stubRoutines.hpp 66 67 #undef __ 68 #define __ _masm-> 69 70 #ifdef PRODUCT 71 #define BLOCK_COMMENT(str) /* nothing */ 72 #else 73 #define BLOCK_COMMENT(str) __ block_comment(str) 74 #endif 75 76 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 77 78 // Stub Code definitions 79 80 class StubGenerator: public StubCodeGenerator { 81 private: 82 83 #ifdef PRODUCT 84 #define inc_counter_np(counter) ((void)0) 85 #else 86 void inc_counter_np_(int& counter) { 87 __ lea(rscratch2, ExternalAddress((address)&counter)); 88 __ ldrw(rscratch1, Address(rscratch2)); 89 __ addw(rscratch1, rscratch1, 1); 90 __ strw(rscratch1, Address(rscratch2)); 91 } 92 #define inc_counter_np(counter) \ 93 BLOCK_COMMENT("inc_counter " #counter); \ 94 inc_counter_np_(counter); 95 #endif 96 97 // Call stubs are used to call Java from C 98 // 99 // Arguments: 100 // c_rarg0: call wrapper address address 101 // c_rarg1: result address 102 // c_rarg2: result type BasicType 103 // c_rarg3: method Method* 104 // c_rarg4: (interpreter) entry point address 105 // c_rarg5: parameters intptr_t* 106 // c_rarg6: parameter size (in words) int 107 // c_rarg7: thread Thread* 108 // 109 // There is no return from the stub itself as any Java result 110 // is written to result 111 // 112 // we save r30 (lr) as the return PC at the base of the frame and 113 // link r29 (fp) below it as the frame pointer installing sp (r31) 114 // into fp. 115 // 116 // we save r0-r7, which accounts for all the c arguments. 117 // 118 // TODO: strictly do we need to save them all? they are treated as 119 // volatile by C so could we omit saving the ones we are going to 120 // place in global registers (thread? method?) or those we only use 121 // during setup of the Java call? 122 // 123 // we don't need to save r8 which C uses as an indirect result location 124 // return register. 125 // 126 // we don't need to save r9-r15 which both C and Java treat as 127 // volatile 128 // 129 // we don't need to save r16-18 because Java does not use them 130 // 131 // we save r19-r28 which Java uses as scratch registers and C 132 // expects to be callee-save 133 // 134 // we save the bottom 64 bits of each value stored in v8-v15; it is 135 // the responsibility of the caller to preserve larger values. 136 // 137 // so the stub frame looks like this when we enter Java code 138 // 139 // [ return_from_Java ] <--- sp 140 // [ argument word n ] 141 // ... 
142 // -27 [ argument word 1 ] 143 // -26 [ saved v15 ] <--- sp_after_call 144 // -25 [ saved v14 ] 145 // -24 [ saved v13 ] 146 // -23 [ saved v12 ] 147 // -22 [ saved v11 ] 148 // -21 [ saved v10 ] 149 // -20 [ saved v9 ] 150 // -19 [ saved v8 ] 151 // -18 [ saved r28 ] 152 // -17 [ saved r27 ] 153 // -16 [ saved r26 ] 154 // -15 [ saved r25 ] 155 // -14 [ saved r24 ] 156 // -13 [ saved r23 ] 157 // -12 [ saved r22 ] 158 // -11 [ saved r21 ] 159 // -10 [ saved r20 ] 160 // -9 [ saved r19 ] 161 // -8 [ call wrapper (r0) ] 162 // -7 [ result (r1) ] 163 // -6 [ result type (r2) ] 164 // -5 [ method (r3) ] 165 // -4 [ entry point (r4) ] 166 // -3 [ parameters (r5) ] 167 // -2 [ parameter size (r6) ] 168 // -1 [ thread (r7) ] 169 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 170 // 1 [ saved lr (r30) ] 171 172 // Call stub stack layout word offsets from fp 173 enum call_stub_layout { 174 sp_after_call_off = -26, 175 176 d15_off = -26, 177 d13_off = -24, 178 d11_off = -22, 179 d9_off = -20, 180 181 r28_off = -18, 182 r26_off = -16, 183 r24_off = -14, 184 r22_off = -12, 185 r20_off = -10, 186 call_wrapper_off = -8, 187 result_off = -7, 188 result_type_off = -6, 189 method_off = -5, 190 entry_point_off = -4, 191 parameter_size_off = -2, 192 thread_off = -1, 193 fp_f = 0, 194 retaddr_off = 1, 195 }; 196 197 address generate_call_stub(address& return_address) { 198 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 199 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 200 "adjust this code"); 201 202 StubCodeMark mark(this, "StubRoutines", "call_stub"); 203 address start = __ pc(); 204 205 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 206 207 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 208 const Address result (rfp, result_off * wordSize); 209 const Address result_type (rfp, result_type_off * wordSize); 210 const Address method (rfp, method_off * wordSize); 211 const Address entry_point (rfp, entry_point_off * wordSize); 212 const Address parameter_size(rfp, parameter_size_off * wordSize); 213 214 const Address thread (rfp, thread_off * wordSize); 215 216 const Address d15_save (rfp, d15_off * wordSize); 217 const Address d13_save (rfp, d13_off * wordSize); 218 const Address d11_save (rfp, d11_off * wordSize); 219 const Address d9_save (rfp, d9_off * wordSize); 220 221 const Address r28_save (rfp, r28_off * wordSize); 222 const Address r26_save (rfp, r26_off * wordSize); 223 const Address r24_save (rfp, r24_off * wordSize); 224 const Address r22_save (rfp, r22_off * wordSize); 225 const Address r20_save (rfp, r20_off * wordSize); 226 227 // stub code 228 229 address aarch64_entry = __ pc(); 230 231 // set up frame and move sp to end of save area 232 __ enter(); 233 __ sub(sp, rfp, -sp_after_call_off * wordSize); 234 235 // save register parameters and Java scratch/global registers 236 // n.b. 
we save thread even though it gets installed in 237 // rthread because we want to sanity check rthread later 238 __ str(c_rarg7, thread); 239 __ strw(c_rarg6, parameter_size); 240 __ stp(c_rarg4, c_rarg5, entry_point); 241 __ stp(c_rarg2, c_rarg3, result_type); 242 __ stp(c_rarg0, c_rarg1, call_wrapper); 243 244 __ stp(r20, r19, r20_save); 245 __ stp(r22, r21, r22_save); 246 __ stp(r24, r23, r24_save); 247 __ stp(r26, r25, r26_save); 248 __ stp(r28, r27, r28_save); 249 250 __ stpd(v9, v8, d9_save); 251 __ stpd(v11, v10, d11_save); 252 __ stpd(v13, v12, d13_save); 253 __ stpd(v15, v14, d15_save); 254 255 // install Java thread in global register now we have saved 256 // whatever value it held 257 __ mov(rthread, c_rarg7); 258 // And method 259 __ mov(rmethod, c_rarg3); 260 261 // set up the heapbase register 262 __ reinit_heapbase(); 263 264 #ifdef ASSERT 265 // make sure we have no pending exceptions 266 { 267 Label L; 268 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 269 __ cmp(rscratch1, (u1)NULL_WORD); 270 __ br(Assembler::EQ, L); 271 __ stop("StubRoutines::call_stub: entered with pending exception"); 272 __ BIND(L); 273 } 274 #endif 275 // pass parameters if any 276 __ mov(esp, sp); 277 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 278 __ andr(sp, rscratch1, -2 * wordSize); 279 280 BLOCK_COMMENT("pass parameters if any"); 281 Label parameters_done; 282 // parameter count is still in c_rarg6 283 // and parameter pointer identifying param 1 is in c_rarg5 284 __ cbzw(c_rarg6, parameters_done); 285 286 address loop = __ pc(); 287 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 288 __ subsw(c_rarg6, c_rarg6, 1); 289 __ push(rscratch1); 290 __ br(Assembler::GT, loop); 291 292 __ BIND(parameters_done); 293 294 // call Java entry -- passing methdoOop, and current sp 295 // rmethod: Method* 296 // r19_sender_sp: sender sp 297 BLOCK_COMMENT("call Java function"); 298 __ mov(r19_sender_sp, sp); 299 __ blr(c_rarg4); 300 301 // we do this here because the notify will already have been done 302 // if we get to the next instruction via an exception 303 // 304 // n.b. adding this instruction here affects the calculation of 305 // whether or not a routine returns to the call stub (used when 306 // doing stack walks) since the normal test is to check the return 307 // pc against the address saved below. so we may need to allow for 308 // this extra instruction in the check. 309 310 // save current address for use by exception handling code 311 312 return_address = __ pc(); 313 314 // store result depending on type (everything that is not 315 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 316 // n.b. 
this assumes Java returns an integral result in r0 317 // and a floating result in j_farg0 318 __ ldr(j_rarg2, result); 319 Label is_long, is_float, is_double, exit; 320 __ ldr(j_rarg1, result_type); 321 __ cmp(j_rarg1, (u1)T_OBJECT); 322 __ br(Assembler::EQ, is_long); 323 __ cmp(j_rarg1, (u1)T_LONG); 324 __ br(Assembler::EQ, is_long); 325 __ cmp(j_rarg1, (u1)T_FLOAT); 326 __ br(Assembler::EQ, is_float); 327 __ cmp(j_rarg1, (u1)T_DOUBLE); 328 __ br(Assembler::EQ, is_double); 329 330 // handle T_INT case 331 __ strw(r0, Address(j_rarg2)); 332 333 __ BIND(exit); 334 335 // pop parameters 336 __ sub(esp, rfp, -sp_after_call_off * wordSize); 337 338 #ifdef ASSERT 339 // verify that threads correspond 340 { 341 Label L, S; 342 __ ldr(rscratch1, thread); 343 __ cmp(rthread, rscratch1); 344 __ br(Assembler::NE, S); 345 __ get_thread(rscratch1); 346 __ cmp(rthread, rscratch1); 347 __ br(Assembler::EQ, L); 348 __ BIND(S); 349 __ stop("StubRoutines::call_stub: threads must correspond"); 350 __ BIND(L); 351 } 352 #endif 353 354 __ pop_cont_fastpath(rthread); 355 356 // restore callee-save registers 357 __ ldpd(v15, v14, d15_save); 358 __ ldpd(v13, v12, d13_save); 359 __ ldpd(v11, v10, d11_save); 360 __ ldpd(v9, v8, d9_save); 361 362 __ ldp(r28, r27, r28_save); 363 __ ldp(r26, r25, r26_save); 364 __ ldp(r24, r23, r24_save); 365 __ ldp(r22, r21, r22_save); 366 __ ldp(r20, r19, r20_save); 367 368 __ ldp(c_rarg0, c_rarg1, call_wrapper); 369 __ ldrw(c_rarg2, result_type); 370 __ ldr(c_rarg3, method); 371 __ ldp(c_rarg4, c_rarg5, entry_point); 372 __ ldp(c_rarg6, c_rarg7, parameter_size); 373 374 // leave frame and return to caller 375 __ leave(); 376 __ ret(lr); 377 378 // handle return types different from T_INT 379 380 __ BIND(is_long); 381 __ str(r0, Address(j_rarg2, 0)); 382 __ br(Assembler::AL, exit); 383 384 __ BIND(is_float); 385 __ strs(j_farg0, Address(j_rarg2, 0)); 386 __ br(Assembler::AL, exit); 387 388 __ BIND(is_double); 389 __ strd(j_farg0, Address(j_rarg2, 0)); 390 __ br(Assembler::AL, exit); 391 392 return start; 393 } 394 395 // Return point for a Java call if there's an exception thrown in 396 // Java code. The exception is caught and transformed into a 397 // pending exception stored in JavaThread that can be tested from 398 // within the VM. 399 // 400 // Note: Usually the parameters are removed by the callee. In case 401 // of an exception crossing an activation frame boundary, that is 402 // not the case if the callee is compiled code => need to setup the 403 // rsp. 
404 // 405 // r0: exception oop 406 407 address generate_catch_exception() { 408 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 409 address start = __ pc(); 410 411 // same as in generate_call_stub(): 412 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 413 const Address thread (rfp, thread_off * wordSize); 414 415 #ifdef ASSERT 416 // verify that threads correspond 417 { 418 Label L, S; 419 __ ldr(rscratch1, thread); 420 __ cmp(rthread, rscratch1); 421 __ br(Assembler::NE, S); 422 __ get_thread(rscratch1); 423 __ cmp(rthread, rscratch1); 424 __ br(Assembler::EQ, L); 425 __ bind(S); 426 __ stop("StubRoutines::catch_exception: threads must correspond"); 427 __ bind(L); 428 } 429 #endif 430 431 // set pending exception 432 __ verify_oop(r0); 433 434 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 435 __ mov(rscratch1, (address)__FILE__); 436 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 437 __ movw(rscratch1, (int)__LINE__); 438 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 439 440 // complete return to VM 441 assert(StubRoutines::_call_stub_return_address != NULL, 442 "_call_stub_return_address must have been generated before"); 443 __ b(StubRoutines::_call_stub_return_address); 444 445 return start; 446 } 447 448 // Continuation point for runtime calls returning with a pending 449 // exception. The pending exception check happened in the runtime 450 // or native call stub. The pending exception in Thread is 451 // converted into a Java-level exception. 452 // 453 // Contract with Java-level exception handlers: 454 // r0: exception 455 // r3: throwing pc 456 // 457 // NOTE: At entry of this stub, exception-pc must be in LR !! 458 459 // NOTE: this is always used as a jump target within generated code 460 // so it just needs to be generated code with no x86 prolog 461 462 address generate_forward_exception() { 463 StubCodeMark mark(this, "StubRoutines", "forward exception"); 464 address start = __ pc(); 465 466 // Upon entry, LR points to the return address returning into 467 // Java (interpreted or compiled) code; i.e., the return address 468 // becomes the throwing pc. 469 // 470 // Arguments pushed before the runtime call are still on the stack 471 // but the exception handler will reset the stack pointer -> 472 // ignore them. A potential result in registers can be ignored as 473 // well. 474 475 #ifdef ASSERT 476 // make sure this code is only executed if there is a pending exception 477 { 478 Label L; 479 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 480 __ cbnz(rscratch1, L); 481 __ stop("StubRoutines::forward exception: no pending exception (1)"); 482 __ bind(L); 483 } 484 #endif 485 486 // compute exception handler into r19 487 488 // call the VM to find the handler address associated with the 489 // caller address. pass thread in r0 and caller pc (ret address) 490 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 491 // the stack. 492 __ mov(c_rarg1, lr); 493 // lr will be trashed by the VM call so we move it to R19 494 // (callee-saved) because we also need to pass it to the handler 495 // returned by this call. 
496 __ mov(r19, lr); 497 BLOCK_COMMENT("call exception_handler_for_return_address"); 498 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 499 SharedRuntime::exception_handler_for_return_address), 500 rthread, c_rarg1); 501 // Reinitialize the ptrue predicate register, in case the external runtime 502 // call clobbers ptrue reg, as we may return to SVE compiled code. 503 __ reinitialize_ptrue(); 504 505 // we should not really care that lr is no longer the callee 506 // address. we saved the value the handler needs in r19 so we can 507 // just copy it to r3. however, the C2 handler will push its own 508 // frame and then calls into the VM and the VM code asserts that 509 // the PC for the frame above the handler belongs to a compiled 510 // Java method. So, we restore lr here to satisfy that assert. 511 __ mov(lr, r19); 512 // setup r0 & r3 & clear pending exception 513 __ mov(r3, r19); 514 __ mov(r19, r0); 515 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 516 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 517 518 #ifdef ASSERT 519 // make sure exception is set 520 { 521 Label L; 522 __ cbnz(r0, L); 523 __ stop("StubRoutines::forward exception: no pending exception (2)"); 524 __ bind(L); 525 } 526 #endif 527 528 // continue at exception handler 529 // r0: exception 530 // r3: throwing pc 531 // r19: exception handler 532 __ verify_oop(r0); 533 __ br(r19); 534 535 return start; 536 } 537 538 // Non-destructive plausibility checks for oops 539 // 540 // Arguments: 541 // r0: oop to verify 542 // rscratch1: error message 543 // 544 // Stack after saving c_rarg3: 545 // [tos + 0]: saved c_rarg3 546 // [tos + 1]: saved c_rarg2 547 // [tos + 2]: saved lr 548 // [tos + 3]: saved rscratch2 549 // [tos + 4]: saved r0 550 // [tos + 5]: saved rscratch1 551 address generate_verify_oop() { 552 553 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 554 address start = __ pc(); 555 556 Label exit, error; 557 558 // save c_rarg2 and c_rarg3 559 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 560 561 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 562 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 563 __ ldr(c_rarg3, Address(c_rarg2)); 564 __ add(c_rarg3, c_rarg3, 1); 565 __ str(c_rarg3, Address(c_rarg2)); 566 567 // object is in r0 568 // make sure object is 'reasonable' 569 __ cbz(r0, exit); // if obj is NULL it is OK 570 571 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 572 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 573 574 // return if everything seems ok 575 __ bind(exit); 576 577 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 578 __ ret(lr); 579 580 // handle errors 581 __ bind(error); 582 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 583 584 __ push(RegSet::range(r0, r29), sp); 585 // debug(char* msg, int64_t pc, int64_t regs[]) 586 __ mov(c_rarg0, rscratch1); // pass address of error message 587 __ mov(c_rarg1, lr); // pass return address 588 __ mov(c_rarg2, sp); // pass address of regs on stack 589 #ifndef PRODUCT 590 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 591 #endif 592 BLOCK_COMMENT("call MacroAssembler::debug"); 593 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 594 __ blr(rscratch1); 595 __ hlt(0); 596 597 return start; 598 } 599 600 // Generate indices for iota vector. 
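  // For reference, the data words emitted below amount to per-lane index
  // tables for each element size. An illustrative C-level view of the same
  // constants (these array names are not used anywhere in the VM):
  //
  //   uint8_t  iota_b[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  //   uint16_t iota_h[8]  = { 0, 1, 2, 3, 4, 5, 6, 7 };
  //   uint32_t iota_s[4]  = { 0, 1, 2, 3 };
  //   uint64_t iota_d[2]  = { 0, 1 };
  //   float    iota_sf[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
  //   double   iota_df[2] = { 0.0, 1.0 };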
601 address generate_iota_indices(const char *stub_name) { 602 __ align(CodeEntryAlignment); 603 StubCodeMark mark(this, "StubRoutines", stub_name); 604 address start = __ pc(); 605 // B 606 __ emit_data64(0x0706050403020100, relocInfo::none); 607 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 608 // H 609 __ emit_data64(0x0003000200010000, relocInfo::none); 610 __ emit_data64(0x0007000600050004, relocInfo::none); 611 // S 612 __ emit_data64(0x0000000100000000, relocInfo::none); 613 __ emit_data64(0x0000000300000002, relocInfo::none); 614 // D 615 __ emit_data64(0x0000000000000000, relocInfo::none); 616 __ emit_data64(0x0000000000000001, relocInfo::none); 617 // S - FP 618 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 619 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 620 // D - FP 621 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 622 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 623 return start; 624 } 625 626 // The inner part of zero_words(). This is the bulk operation, 627 // zeroing words in blocks, possibly using DC ZVA to do it. The 628 // caller is responsible for zeroing the last few words. 629 // 630 // Inputs: 631 // r10: the HeapWord-aligned base address of an array to zero. 632 // r11: the count in HeapWords, r11 > 0. 633 // 634 // Returns r10 and r11, adjusted for the caller to clear. 635 // r10: the base address of the tail of words left to clear. 636 // r11: the number of words in the tail. 637 // r11 < MacroAssembler::zero_words_block_size. 638 639 address generate_zero_blocks() { 640 Label done; 641 Label base_aligned; 642 643 Register base = r10, cnt = r11; 644 645 __ align(CodeEntryAlignment); 646 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 647 address start = __ pc(); 648 649 if (UseBlockZeroing) { 650 int zva_length = VM_Version::zva_length(); 651 652 // Ensure ZVA length can be divided by 16. This is required by 653 // the subsequent operations. 654 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 655 656 __ tbz(base, 3, base_aligned); 657 __ str(zr, Address(__ post(base, 8))); 658 __ sub(cnt, cnt, 1); 659 __ bind(base_aligned); 660 661 // Ensure count >= zva_length * 2 so that it still deserves a zva after 662 // alignment. 663 Label small; 664 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 665 __ subs(rscratch1, cnt, low_limit >> 3); 666 __ br(Assembler::LT, small); 667 __ zero_dcache_blocks(base, cnt); 668 __ bind(small); 669 } 670 671 { 672 // Number of stp instructions we'll unroll 673 const int unroll = 674 MacroAssembler::zero_words_block_size / 2; 675 // Clear the remaining blocks. 676 Label loop; 677 __ subs(cnt, cnt, unroll * 2); 678 __ br(Assembler::LT, done); 679 __ bind(loop); 680 for (int i = 0; i < unroll; i++) 681 __ stp(zr, zr, __ post(base, 16)); 682 __ subs(cnt, cnt, unroll * 2); 683 __ br(Assembler::GE, loop); 684 __ bind(done); 685 __ add(cnt, cnt, unroll * 2); 686 } 687 688 __ ret(lr); 689 690 return start; 691 } 692 693 694 typedef enum { 695 copy_forwards = 1, 696 copy_backwards = -1 697 } copy_direction; 698 699 // Bulk copy of blocks of 8 words. 700 // 701 // count is a count of words. 702 // 703 // Precondition: count >= 8 704 // 705 // Postconditions: 706 // 707 // The least significant bit of count contains the remaining count 708 // of words to copy. The rest of count is trash. 
709 // 710 // s and d are adjusted to point to the remaining words to copy 711 // 712 void generate_copy_longs(Label &start, Register s, Register d, Register count, 713 copy_direction direction) { 714 int unit = wordSize * direction; 715 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize; 716 717 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 718 t4 = r7, t5 = r10, t6 = r11, t7 = r12; 719 const Register stride = r13; 720 721 assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7); 722 assert_different_registers(s, d, count, rscratch1); 723 724 Label again, drain; 725 const char *stub_name; 726 if (direction == copy_forwards) 727 stub_name = "forward_copy_longs"; 728 else 729 stub_name = "backward_copy_longs"; 730 731 __ align(CodeEntryAlignment); 732 733 StubCodeMark mark(this, "StubRoutines", stub_name); 734 735 __ bind(start); 736 737 Label unaligned_copy_long; 738 if (AvoidUnalignedAccesses) { 739 __ tbnz(d, 3, unaligned_copy_long); 740 } 741 742 if (direction == copy_forwards) { 743 __ sub(s, s, bias); 744 __ sub(d, d, bias); 745 } 746 747 #ifdef ASSERT 748 // Make sure we are never given < 8 words 749 { 750 Label L; 751 __ cmp(count, (u1)8); 752 __ br(Assembler::GE, L); 753 __ stop("generate_copy_longs called with < 8 words"); 754 __ bind(L); 755 } 756 #endif 757 758 // Fill 8 registers 759 if (UseSIMDForMemoryOps) { 760 __ ldpq(v0, v1, Address(s, 4 * unit)); 761 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 762 } else { 763 __ ldp(t0, t1, Address(s, 2 * unit)); 764 __ ldp(t2, t3, Address(s, 4 * unit)); 765 __ ldp(t4, t5, Address(s, 6 * unit)); 766 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 767 } 768 769 __ subs(count, count, 16); 770 __ br(Assembler::LO, drain); 771 772 int prefetch = PrefetchCopyIntervalInBytes; 773 bool use_stride = false; 774 if (direction == copy_backwards) { 775 use_stride = prefetch > 256; 776 prefetch = -prefetch; 777 if (use_stride) __ mov(stride, prefetch); 778 } 779 780 __ bind(again); 781 782 if (PrefetchCopyIntervalInBytes > 0) 783 __ prfm(use_stride ?
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 784 785 if (UseSIMDForMemoryOps) { 786 __ stpq(v0, v1, Address(d, 4 * unit)); 787 __ ldpq(v0, v1, Address(s, 4 * unit)); 788 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 789 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 790 } else { 791 __ stp(t0, t1, Address(d, 2 * unit)); 792 __ ldp(t0, t1, Address(s, 2 * unit)); 793 __ stp(t2, t3, Address(d, 4 * unit)); 794 __ ldp(t2, t3, Address(s, 4 * unit)); 795 __ stp(t4, t5, Address(d, 6 * unit)); 796 __ ldp(t4, t5, Address(s, 6 * unit)); 797 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 798 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 799 } 800 801 __ subs(count, count, 8); 802 __ br(Assembler::HS, again); 803 804 // Drain 805 __ bind(drain); 806 if (UseSIMDForMemoryOps) { 807 __ stpq(v0, v1, Address(d, 4 * unit)); 808 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 809 } else { 810 __ stp(t0, t1, Address(d, 2 * unit)); 811 __ stp(t2, t3, Address(d, 4 * unit)); 812 __ stp(t4, t5, Address(d, 6 * unit)); 813 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 814 } 815 816 { 817 Label L1, L2; 818 __ tbz(count, exact_log2(4), L1); 819 if (UseSIMDForMemoryOps) { 820 __ ldpq(v0, v1, Address(__ pre(s, 4 * unit))); 821 __ stpq(v0, v1, Address(__ pre(d, 4 * unit))); 822 } else { 823 __ ldp(t0, t1, Address(s, 2 * unit)); 824 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 825 __ stp(t0, t1, Address(d, 2 * unit)); 826 __ stp(t2, t3, Address(__ pre(d, 4 * unit))); 827 } 828 __ bind(L1); 829 830 if (direction == copy_forwards) { 831 __ add(s, s, bias); 832 __ add(d, d, bias); 833 } 834 835 __ tbz(count, 1, L2); 836 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 837 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); 838 __ bind(L2); 839 } 840 841 __ ret(lr); 842 843 if (AvoidUnalignedAccesses) { 844 Label drain, again; 845 // Register order for storing. Order is different for backward copy. 846 847 __ bind(unaligned_copy_long); 848 849 // source address is even aligned, target odd aligned 850 // 851 // when forward copying word pairs we read long pairs at offsets 852 // {0, 2, 4, 6} (in long words). when backwards copying we read 853 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 854 // address by -2 in the forwards case so we can compute the 855 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 856 // or -1. 857 // 858 // when forward copying we need to store 1 word, 3 pairs and 859 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 860 // zero offset We adjust the destination by -1 which means we 861 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 862 // 863 // When backwards copyng we need to store 1 word, 3 pairs and 864 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 865 // offsets {1, 3, 5, 7, 8} * unit. 
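    // As a concrete illustration of the offsets (not generated code): in the
    // forwards case unit == 8 and d is biased by -8 below, so
    //   str(t0, Address(d, 1 * unit))          writes original d + 0
    //   stp(t1, t2, Address(d, 2 * unit))      writes original d + 8 and + 16
    //   ...
    //   str(t7, Address(__ pre(d, 8 * unit)))  writes original d + 56
    // and the final pre-increment leaves d pointing at the last word written,
    // ready for the next 64 byte block.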
866 867 if (direction == copy_forwards) { 868 __ sub(s, s, 16); 869 __ sub(d, d, 8); 870 } 871 872 // Fill 8 registers 873 // 874 // for forwards copy s was offset by -16 from the original input 875 // value of s so the register contents are at these offsets 876 // relative to the 64 bit block addressed by that original input 877 // and so on for each successive 64 byte block when s is updated 878 // 879 // t0 at offset 0, t1 at offset 8 880 // t2 at offset 16, t3 at offset 24 881 // t4 at offset 32, t5 at offset 40 882 // t6 at offset 48, t7 at offset 56 883 884 // for backwards copy s was not offset so the register contents 885 // are at these offsets into the preceding 64 byte block 886 // relative to that original input and so on for each successive 887 // preceding 64 byte block when s is updated. this explains the 888 // slightly counter-intuitive looking pattern of register usage 889 // in the stp instructions for backwards copy. 890 // 891 // t0 at offset -16, t1 at offset -8 892 // t2 at offset -32, t3 at offset -24 893 // t4 at offset -48, t5 at offset -40 894 // t6 at offset -64, t7 at offset -56 895 896 __ ldp(t0, t1, Address(s, 2 * unit)); 897 __ ldp(t2, t3, Address(s, 4 * unit)); 898 __ ldp(t4, t5, Address(s, 6 * unit)); 899 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 900 901 __ subs(count, count, 16); 902 __ br(Assembler::LO, drain); 903 904 int prefetch = PrefetchCopyIntervalInBytes; 905 bool use_stride = false; 906 if (direction == copy_backwards) { 907 use_stride = prefetch > 256; 908 prefetch = -prefetch; 909 if (use_stride) __ mov(stride, prefetch); 910 } 911 912 __ bind(again); 913 914 if (PrefetchCopyIntervalInBytes > 0) 915 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 916 917 if (direction == copy_forwards) { 918 // allowing for the offset of -8 the store instructions place 919 // registers into the target 64 bit block at the following 920 // offsets 921 // 922 // t0 at offset 0 923 // t1 at offset 8, t2 at offset 16 924 // t3 at offset 24, t4 at offset 32 925 // t5 at offset 40, t6 at offset 48 926 // t7 at offset 56 927 928 __ str(t0, Address(d, 1 * unit)); 929 __ stp(t1, t2, Address(d, 2 * unit)); 930 __ ldp(t0, t1, Address(s, 2 * unit)); 931 __ stp(t3, t4, Address(d, 4 * unit)); 932 __ ldp(t2, t3, Address(s, 4 * unit)); 933 __ stp(t5, t6, Address(d, 6 * unit)); 934 __ ldp(t4, t5, Address(s, 6 * unit)); 935 __ str(t7, Address(__ pre(d, 8 * unit))); 936 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 937 } else { 938 // d was not offset when we started so the registers are 939 // written into the 64 bit block preceding d with the following 940 // offsets 941 // 942 // t1 at offset -8 943 // t3 at offset -24, t0 at offset -16 944 // t5 at offset -40, t2 at offset -32 945 // t7 at offset -56, t4 at offset -48 946 // t6 at offset -64 947 // 948 // note that this matches the offsets previously noted for the 949 // loads 950 951 __ str(t1, Address(d, 1 * unit)); 952 __ stp(t3, t0, Address(d, 3 * unit)); 953 __ ldp(t0, t1, Address(s, 2 * unit)); 954 __ stp(t5, t2, Address(d, 5 * unit)); 955 __ ldp(t2, t3, Address(s, 4 * unit)); 956 __ stp(t7, t4, Address(d, 7 * unit)); 957 __ ldp(t4, t5, Address(s, 6 * unit)); 958 __ str(t6, Address(__ pre(d, 8 * unit))); 959 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 960 } 961 962 __ subs(count, count, 8); 963 __ br(Assembler::HS, again); 964 965 // Drain 966 // 967 // this uses the same pattern of offsets and register arguments 968 // as above 969 __ bind(drain); 970 if (direction == copy_forwards) {
971 __ str(t0, Address(d, 1 * unit)); 972 __ stp(t1, t2, Address(d, 2 * unit)); 973 __ stp(t3, t4, Address(d, 4 * unit)); 974 __ stp(t5, t6, Address(d, 6 * unit)); 975 __ str(t7, Address(__ pre(d, 8 * unit))); 976 } else { 977 __ str(t1, Address(d, 1 * unit)); 978 __ stp(t3, t0, Address(d, 3 * unit)); 979 __ stp(t5, t2, Address(d, 5 * unit)); 980 __ stp(t7, t4, Address(d, 7 * unit)); 981 __ str(t6, Address(__ pre(d, 8 * unit))); 982 } 983 // now we need to copy any remaining part block which may 984 // include a 4 word subblock and/or a 2 word subblock. 985 // bits 2 and 1 in the count are the tell-tale for whether we 986 // have each such subblock 987 { 988 Label L1, L2; 989 __ tbz(count, exact_log2(4), L1); 990 // this is the same as above but copying only 4 longs hence 991 // with only one intervening stp between the str instructions 992 // but note that the offsets and registers still follow the 993 // same pattern 994 __ ldp(t0, t1, Address(s, 2 * unit)); 995 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 996 if (direction == copy_forwards) { 997 __ str(t0, Address(d, 1 * unit)); 998 __ stp(t1, t2, Address(d, 2 * unit)); 999 __ str(t3, Address(__ pre(d, 4 * unit))); 1000 } else { 1001 __ str(t1, Address(d, 1 * unit)); 1002 __ stp(t3, t0, Address(d, 3 * unit)); 1003 __ str(t2, Address(__ pre(d, 4 * unit))); 1004 } 1005 __ bind(L1); 1006 1007 __ tbz(count, 1, L2); 1008 // this is the same as above but copying only 2 longs hence 1009 // there is no intervening stp between the str instructions 1010 // but note that the offset and register patterns are still 1011 // the same 1012 __ ldp(t0, t1, Address(__ pre(s, 2 * unit))); 1013 if (direction == copy_forwards) { 1014 __ str(t0, Address(d, 1 * unit)); 1015 __ str(t1, Address(__ pre(d, 2 * unit))); 1016 } else { 1017 __ str(t1, Address(d, 1 * unit)); 1018 __ str(t0, Address(__ pre(d, 2 * unit))); 1019 } 1020 __ bind(L2); 1021 1022 // for forwards copy we need to re-adjust the offsets we 1023 // applied so that s and d follow the last words written 1024 1025 if (direction == copy_forwards) { 1026 __ add(s, s, 16); 1027 __ add(d, d, 8); 1028 } 1029 1030 } 1031 1032 __ ret(lr); 1033 } 1034 } 1035 1036 // Small copy: less than 16 bytes. 1037 // 1038 // NB: Ignores all of the bits of count which represent more than 15 1039 // bytes, so a caller doesn't have to mask them. 1040 1041 void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) { 1042 bool is_backwards = step < 0; 1043 size_t granularity = uabs(step); 1044 int direction = is_backwards ? -1 : 1; 1045 int unit = wordSize * direction; 1046 1047 Label Lword, Lint, Lshort, Lbyte; 1048 1049 assert(granularity 1050 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1051 1052 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6; 1053 1054 // ??? I don't know if this bit-test-and-branch is the right thing 1055 // to do. It does a lot of jumping, resulting in several 1056 // mispredicted branches. It might make more sense to do this 1057 // with something like Duff's device with a single computed branch.
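    // As a rough C sketch of the forwards, byte-granularity case (illustrative
    // only; the code below handles all granularities and both directions):
    //
    //   void copy_small_ref(const char* s, char* d, size_t count) {
    //     // each low bit of the residual count selects one power-of-two chunk
    //     if (count & 8) { memcpy(d, s, 8); s += 8; d += 8; }
    //     if (count & 4) { memcpy(d, s, 4); s += 4; d += 4; }
    //     if (count & 2) { memcpy(d, s, 2); s += 2; d += 2; }
    //     if (count & 1) { *d = *s; }
    //   }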
1058 1059 __ tbz(count, 3 - exact_log2(granularity), Lword); 1060 __ ldr(tmp, Address(__ adjust(s, unit, is_backwards))); 1061 __ str(tmp, Address(__ adjust(d, unit, is_backwards))); 1062 __ bind(Lword); 1063 1064 if (granularity <= sizeof (jint)) { 1065 __ tbz(count, 2 - exact_log2(granularity), Lint); 1066 __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1067 __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1068 __ bind(Lint); 1069 } 1070 1071 if (granularity <= sizeof (jshort)) { 1072 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1073 __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1074 __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1075 __ bind(Lshort); 1076 } 1077 1078 if (granularity <= sizeof (jbyte)) { 1079 __ tbz(count, 0, Lbyte); 1080 __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1081 __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1082 __ bind(Lbyte); 1083 } 1084 } 1085 1086 Label copy_f, copy_b; 1087 1088 // All-singing all-dancing memory copy. 1089 // 1090 // Copy count units of memory from s to d. The size of a unit is 1091 // step, which can be positive or negative depending on the direction 1092 // of copy. If is_aligned is false, we align the source address. 1093 // 1094 1095 void copy_memory(bool is_aligned, Register s, Register d, 1096 Register count, Register tmp, int step) { 1097 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1098 bool is_backwards = step < 0; 1099 unsigned int granularity = uabs(step); 1100 const Register t0 = r3, t1 = r4; 1101 1102 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1103 // load all the data before writing anything 1104 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1105 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8; 1106 const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; 1107 const Register send = r17, dend = r16; 1108 1109 if (PrefetchCopyIntervalInBytes > 0) 1110 __ prfm(Address(s, 0), PLDL1KEEP); 1111 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity)); 1112 __ br(Assembler::HI, copy_big); 1113 1114 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity)))); 1115 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity)))); 1116 1117 __ cmp(count, u1(16/granularity)); 1118 __ br(Assembler::LS, copy16); 1119 1120 __ cmp(count, u1(64/granularity)); 1121 __ br(Assembler::HI, copy80); 1122 1123 __ cmp(count, u1(32/granularity)); 1124 __ br(Assembler::LS, copy32); 1125 1126 // 33..64 bytes 1127 if (UseSIMDForMemoryOps) { 1128 __ ldpq(v0, v1, Address(s, 0)); 1129 __ ldpq(v2, v3, Address(send, -32)); 1130 __ stpq(v0, v1, Address(d, 0)); 1131 __ stpq(v2, v3, Address(dend, -32)); 1132 } else { 1133 __ ldp(t0, t1, Address(s, 0)); 1134 __ ldp(t2, t3, Address(s, 16)); 1135 __ ldp(t4, t5, Address(send, -32)); 1136 __ ldp(t6, t7, Address(send, -16)); 1137 1138 __ stp(t0, t1, Address(d, 0)); 1139 __ stp(t2, t3, Address(d, 16)); 1140 __ stp(t4, t5, Address(dend, -32)); 1141 __ stp(t6, t7, Address(dend, -16)); 1142 } 1143 __ b(finish); 1144 1145 // 17..32 bytes 1146 __ bind(copy32); 1147 __ ldp(t0, t1, Address(s, 0)); 1148 __ ldp(t2, t3, Address(send, -16)); 1149 __ stp(t0, t1, Address(d, 0)); 1150 __ stp(t2, t3, Address(dend, -16)); 1151 __ b(finish); 1152 1153 // 65..80/96 bytes 1154 // (96 bytes if SIMD because we do 32 bytes per instruction) 1155 __ bind(copy80); 1156 if (UseSIMDForMemoryOps) { 1157 __ ldpq(v0, v1, Address(s, 0)); 1158 __ ldpq(v2, v3, Address(s, 32)); 1159 // Unaligned pointers can be an issue for copying. 1160 // The issue is more likely when the granularity of the data is 1161 // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least 1162 // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned. 1163 // The largest performance drop has been seen for the range 65-80 bytes. 1164 // For such cases, using a pair of ldp/stp instead of the third pair of 1165 // ldpq/stpq fixes the performance issue.
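      // The overall shape of this 65..96 byte SIMD case, in rough C terms
      // (illustrative sketch only; all loads happen before any store so
      // that overlapping src/dst ranges are still copied correctly):
      //
      //   uint8_t head[64], tail[32];
      //   memcpy(head, s, 64);            // four 16-byte chunks from the front
      //   memcpy(tail, send - 32, 32);    // tail ends exactly at send
      //                                   // (a 16-byte ldp tail for 65..80 bytes)
      //   memcpy(d, head, 64);
      //   memcpy(dend - 32, tail, 32);    // may overlap the head stores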
1166 if (granularity < sizeof (jint)) { 1167 Label copy96; 1168 __ cmp(count, u1(80/granularity)); 1169 __ br(Assembler::HI, copy96); 1170 __ ldp(t0, t1, Address(send, -16)); 1171 1172 __ stpq(v0, v1, Address(d, 0)); 1173 __ stpq(v2, v3, Address(d, 32)); 1174 __ stp(t0, t1, Address(dend, -16)); 1175 __ b(finish); 1176 1177 __ bind(copy96); 1178 } 1179 __ ldpq(v4, v5, Address(send, -32)); 1180 1181 __ stpq(v0, v1, Address(d, 0)); 1182 __ stpq(v2, v3, Address(d, 32)); 1183 __ stpq(v4, v5, Address(dend, -32)); 1184 } else { 1185 __ ldp(t0, t1, Address(s, 0)); 1186 __ ldp(t2, t3, Address(s, 16)); 1187 __ ldp(t4, t5, Address(s, 32)); 1188 __ ldp(t6, t7, Address(s, 48)); 1189 __ ldp(t8, t9, Address(send, -16)); 1190 1191 __ stp(t0, t1, Address(d, 0)); 1192 __ stp(t2, t3, Address(d, 16)); 1193 __ stp(t4, t5, Address(d, 32)); 1194 __ stp(t6, t7, Address(d, 48)); 1195 __ stp(t8, t9, Address(dend, -16)); 1196 } 1197 __ b(finish); 1198 1199 // 0..16 bytes 1200 __ bind(copy16); 1201 __ cmp(count, u1(8/granularity)); 1202 __ br(Assembler::LO, copy8); 1203 1204 // 8..16 bytes 1205 __ ldr(t0, Address(s, 0)); 1206 __ ldr(t1, Address(send, -8)); 1207 __ str(t0, Address(d, 0)); 1208 __ str(t1, Address(dend, -8)); 1209 __ b(finish); 1210 1211 if (granularity < 8) { 1212 // 4..7 bytes 1213 __ bind(copy8); 1214 __ tbz(count, 2 - exact_log2(granularity), copy4); 1215 __ ldrw(t0, Address(s, 0)); 1216 __ ldrw(t1, Address(send, -4)); 1217 __ strw(t0, Address(d, 0)); 1218 __ strw(t1, Address(dend, -4)); 1219 __ b(finish); 1220 if (granularity < 4) { 1221 // 0..3 bytes 1222 __ bind(copy4); 1223 __ cbz(count, finish); // get rid of 0 case 1224 if (granularity == 2) { 1225 __ ldrh(t0, Address(s, 0)); 1226 __ strh(t0, Address(d, 0)); 1227 } else { // granularity == 1 1228 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1229 // the first and last byte. 1230 // Handle the 3 byte case by loading and storing base + count/2 1231 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1232 // This does means in the 1 byte case we load/store the same 1233 // byte 3 times. 1234 __ lsr(count, count, 1); 1235 __ ldrb(t0, Address(s, 0)); 1236 __ ldrb(t1, Address(send, -1)); 1237 __ ldrb(t2, Address(s, count)); 1238 __ strb(t0, Address(d, 0)); 1239 __ strb(t1, Address(dend, -1)); 1240 __ strb(t2, Address(d, count)); 1241 } 1242 __ b(finish); 1243 } 1244 } 1245 1246 __ bind(copy_big); 1247 if (is_backwards) { 1248 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1249 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1250 } 1251 1252 // Now we've got the small case out of the way we can align the 1253 // source address on a 2-word boundary. 1254 1255 Label aligned; 1256 1257 if (is_aligned) { 1258 // We may have to adjust by 1 word to get s 2-word-aligned. 1259 __ tbz(s, exact_log2(wordSize), aligned); 1260 __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards))); 1261 __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards))); 1262 __ sub(count, count, wordSize/granularity); 1263 } else { 1264 if (is_backwards) { 1265 __ andr(rscratch2, s, 2 * wordSize - 1); 1266 } else { 1267 __ neg(rscratch2, s); 1268 __ andr(rscratch2, rscratch2, 2 * wordSize - 1); 1269 } 1270 // rscratch2 is the byte adjustment needed to align s. 1271 __ cbz(rscratch2, aligned); 1272 int shift = exact_log2(granularity); 1273 if (shift) __ lsr(rscratch2, rscratch2, shift); 1274 __ sub(count, count, rscratch2); 1275 1276 #if 0 1277 // ?? This code is only correct for a disjoint copy. 
It may or 1278 // may not make sense to use it in that case. 1279 1280 // Copy the first pair; s and d may not be aligned. 1281 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1282 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1283 1284 // Align s and d, adjust count 1285 if (is_backwards) { 1286 __ sub(s, s, rscratch2); 1287 __ sub(d, d, rscratch2); 1288 } else { 1289 __ add(s, s, rscratch2); 1290 __ add(d, d, rscratch2); 1291 } 1292 #else 1293 copy_memory_small(s, d, rscratch2, rscratch1, step); 1294 #endif 1295 } 1296 1297 __ bind(aligned); 1298 1299 // s is now 2-word-aligned. 1300 1301 // We have a count of units and some trailing bytes. Adjust the 1302 // count and do a bulk copy of words. 1303 __ lsr(rscratch2, count, exact_log2(wordSize/granularity)); 1304 if (direction == copy_forwards) 1305 __ bl(copy_f); 1306 else 1307 __ bl(copy_b); 1308 1309 // And the tail. 1310 copy_memory_small(s, d, count, tmp, step); 1311 1312 if (granularity >= 8) __ bind(copy8); 1313 if (granularity >= 4) __ bind(copy4); 1314 __ bind(finish); 1315 } 1316 1317 1318 void clobber_registers() { 1319 #ifdef ASSERT 1320 RegSet clobbered 1321 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1322 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1323 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1324 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1325 __ mov(*it, rscratch1); 1326 } 1327 #endif 1328 1329 } 1330 1331 // Scan over array at a for count oops, verifying each one. 1332 // Preserves a and count, clobbers rscratch1 and rscratch2. 1333 void verify_oop_array (int size, Register a, Register count, Register temp) { 1334 Label loop, end; 1335 __ mov(rscratch1, a); 1336 __ mov(rscratch2, zr); 1337 __ bind(loop); 1338 __ cmp(rscratch2, count); 1339 __ br(Assembler::HS, end); 1340 if (size == wordSize) { 1341 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1342 __ verify_oop(temp); 1343 } else { 1344 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1345 __ decode_heap_oop(temp); // calls verify_oop 1346 } 1347 __ add(rscratch2, rscratch2, 1); 1348 __ b(loop); 1349 __ bind(end); 1350 } 1351 1352 // Arguments: 1353 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1354 // ignored 1355 // is_oop - true => oop array, so generate store check code 1356 // name - stub name string 1357 // 1358 // Inputs: 1359 // c_rarg0 - source array address 1360 // c_rarg1 - destination array address 1361 // c_rarg2 - element count, treated as ssize_t, can be zero 1362 // 1363 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1364 // the hardware handle it. The two dwords within qwords that span 1365 // cache line boundaries will still be loaded and stored atomically. 1366 // 1367 // Side Effects: 1368 // disjoint_int_copy_entry is set to the no-overlap entry point 1369 // used by generate_conjoint_int_oop_copy(). 
1370 // 1371 address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry, 1372 const char *name, bool dest_uninitialized = false) { 1373 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1374 RegSet saved_reg = RegSet::of(s, d, count); 1375 __ align(CodeEntryAlignment); 1376 StubCodeMark mark(this, "StubRoutines", name); 1377 address start = __ pc(); 1378 __ enter(); 1379 1380 if (entry != NULL) { 1381 *entry = __ pc(); 1382 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1383 BLOCK_COMMENT("Entry:"); 1384 } 1385 1386 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1387 if (dest_uninitialized) { 1388 decorators |= IS_DEST_UNINITIALIZED; 1389 } 1390 if (aligned) { 1391 decorators |= ARRAYCOPY_ALIGNED; 1392 } 1393 1394 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1395 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1396 1397 if (is_oop) { 1398 // save regs before copy_memory 1399 __ push(RegSet::of(d, count), sp); 1400 } 1401 { 1402 // UnsafeCopyMemory page error: continue after ucm 1403 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1404 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1405 copy_memory(aligned, s, d, count, rscratch1, size); 1406 } 1407 1408 if (is_oop) { 1409 __ pop(RegSet::of(d, count), sp); 1410 if (VerifyOops) 1411 verify_oop_array(size, d, count, r16); 1412 } 1413 1414 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1415 1416 __ leave(); 1417 __ mov(r0, zr); // return 0 1418 __ ret(lr); 1419 return start; 1420 } 1421 1422 // Arguments: 1423 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1424 // ignored 1425 // is_oop - true => oop array, so generate store check code 1426 // name - stub name string 1427 // 1428 // Inputs: 1429 // c_rarg0 - source array address 1430 // c_rarg1 - destination array address 1431 // c_rarg2 - element count, treated as ssize_t, can be zero 1432 // 1433 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1434 // the hardware handle it. The two dwords within qwords that span 1435 // cache line boundaries will still be loaded and stored atomically. 
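  // The conjoint stub below dispatches to the corresponding disjoint stub
  // whenever a forward copy is safe. In C terms the test is (illustrative only):
  //
  //   if ((uintptr_t)(d - s) >= (uintptr_t)count * size) {
  //     // no harmful overlap; the unsigned compare also covers d < s,
  //     // where d - s wraps to a large value, so copy forwards
  //   } else {
  //     // destination overlaps the yet-to-be-read source, copy backwards
  //   }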
1436 // 1437 address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target, 1438 address *entry, const char *name, 1439 bool dest_uninitialized = false) { 1440 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1441 RegSet saved_regs = RegSet::of(s, d, count); 1442 StubCodeMark mark(this, "StubRoutines", name); 1443 address start = __ pc(); 1444 __ enter(); 1445 1446 if (entry != NULL) { 1447 *entry = __ pc(); 1448 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1449 BLOCK_COMMENT("Entry:"); 1450 } 1451 1452 // use fwd copy when (d-s) above_equal (count*size) 1453 __ sub(rscratch1, d, s); 1454 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1455 __ br(Assembler::HS, nooverlap_target); 1456 1457 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1458 if (dest_uninitialized) { 1459 decorators |= IS_DEST_UNINITIALIZED; 1460 } 1461 if (aligned) { 1462 decorators |= ARRAYCOPY_ALIGNED; 1463 } 1464 1465 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1466 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1467 1468 if (is_oop) { 1469 // save regs before copy_memory 1470 __ push(RegSet::of(d, count), sp); 1471 } 1472 { 1473 // UnsafeCopyMemory page error: continue after ucm 1474 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1475 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1476 copy_memory(aligned, s, d, count, rscratch1, -size); 1477 } 1478 if (is_oop) { 1479 __ pop(RegSet::of(d, count), sp); 1480 if (VerifyOops) 1481 verify_oop_array(size, d, count, r16); 1482 } 1483 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1484 __ leave(); 1485 __ mov(r0, zr); // return 0 1486 __ ret(lr); 1487 return start; 1488 } 1489 1490 // Arguments: 1491 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1492 // ignored 1493 // name - stub name string 1494 // 1495 // Inputs: 1496 // c_rarg0 - source array address 1497 // c_rarg1 - destination array address 1498 // c_rarg2 - element count, treated as ssize_t, can be zero 1499 // 1507 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1508 // we let the hardware handle it. The one to eight bytes within words, 1509 // dwords or qwords that span cache line boundaries will still be loaded 1510 // and stored atomically. 1511 // 1512 // Side Effects: 1513 // disjoint_byte_copy_entry is set to the no-overlap entry point 1514 // used by generate_conjoint_byte_copy().
1515 // 1516 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1517 const bool not_oop = false; 1518 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1519 } 1520 1521 // Arguments: 1522 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1523 // ignored 1524 // name - stub name string 1525 // 1526 // Inputs: 1527 // c_rarg0 - source array address 1528 // c_rarg1 - destination array address 1529 // c_rarg2 - element count, treated as ssize_t, can be zero 1530 // 1531 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1532 // we let the hardware handle it. The one to eight bytes within words, 1533 // dwords or qwords that span cache line boundaries will still be loaded 1534 // and stored atomically. 1535 // 1536 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1537 address* entry, const char *name) { 1538 const bool not_oop = false; 1539 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1540 } 1541 1542 // Arguments: 1543 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1544 // ignored 1545 // name - stub name string 1546 // 1547 // Inputs: 1548 // c_rarg0 - source array address 1549 // c_rarg1 - destination array address 1550 // c_rarg2 - element count, treated as ssize_t, can be zero 1551 // 1552 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1553 // let the hardware handle it. The two or four words within dwords 1554 // or qwords that span cache line boundaries will still be loaded 1555 // and stored atomically. 1556 // 1557 // Side Effects: 1558 // disjoint_short_copy_entry is set to the no-overlap entry point 1559 // used by generate_conjoint_short_copy(). 1560 // 1561 address generate_disjoint_short_copy(bool aligned, 1562 address* entry, const char *name) { 1563 const bool not_oop = false; 1564 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1565 } 1566 1567 // Arguments: 1568 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1569 // ignored 1570 // name - stub name string 1571 // 1572 // Inputs: 1573 // c_rarg0 - source array address 1574 // c_rarg1 - destination array address 1575 // c_rarg2 - element count, treated as ssize_t, can be zero 1576 // 1577 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1578 // let the hardware handle it. The two or four words within dwords 1579 // or qwords that span cache line boundaries will still be loaded 1580 // and stored atomically. 1581 // 1582 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1583 address *entry, const char *name) { 1584 const bool not_oop = false; 1585 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1586 1587 } 1588 // Arguments: 1589 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1590 // ignored 1591 // name - stub name string 1592 // 1593 // Inputs: 1594 // c_rarg0 - source array address 1595 // c_rarg1 - destination array address 1596 // c_rarg2 - element count, treated as ssize_t, can be zero 1597 // 1598 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1599 // the hardware handle it. The two dwords within qwords that span 1600 // cache line boundaries will still be loaded and stored atomically. 
1601 // 1602 // Side Effects: 1603 // disjoint_int_copy_entry is set to the no-overlap entry point 1604 // used by generate_conjoint_int_oop_copy(). 1605 // 1606 address generate_disjoint_int_copy(bool aligned, address *entry, 1607 const char *name, bool dest_uninitialized = false) { 1608 const bool not_oop = false; 1609 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1610 } 1611 1612 // Arguments: 1613 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1614 // ignored 1615 // name - stub name string 1616 // 1617 // Inputs: 1618 // c_rarg0 - source array address 1619 // c_rarg1 - destination array address 1620 // c_rarg2 - element count, treated as ssize_t, can be zero 1621 // 1622 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1623 // the hardware handle it. The two dwords within qwords that span 1624 // cache line boundaries will still be loaded and stored atomically. 1625 // 1626 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1627 address *entry, const char *name, 1628 bool dest_uninitialized = false) { 1629 const bool not_oop = false; 1630 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1631 } 1632 1633 1634 // Arguments: 1635 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1636 // ignored 1637 // name - stub name string 1638 // 1639 // Inputs: 1640 // c_rarg0 - source array address 1641 // c_rarg1 - destination array address 1642 // c_rarg2 - element count, treated as size_t, can be zero 1643 // 1644 // Side Effects: 1645 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1646 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1647 // 1648 address generate_disjoint_long_copy(bool aligned, address *entry, 1649 const char *name, bool dest_uninitialized = false) { 1650 const bool not_oop = false; 1651 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1652 } 1653 1654 // Arguments: 1655 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1656 // ignored 1657 // name - stub name string 1658 // 1659 // Inputs: 1660 // c_rarg0 - source array address 1661 // c_rarg1 - destination array address 1662 // c_rarg2 - element count, treated as size_t, can be zero 1663 // 1664 address generate_conjoint_long_copy(bool aligned, 1665 address nooverlap_target, address *entry, 1666 const char *name, bool dest_uninitialized = false) { 1667 const bool not_oop = false; 1668 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1669 } 1670 1671 // Arguments: 1672 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1673 // ignored 1674 // name - stub name string 1675 // 1676 // Inputs: 1677 // c_rarg0 - source array address 1678 // c_rarg1 - destination array address 1679 // c_rarg2 - element count, treated as size_t, can be zero 1680 // 1681 // Side Effects: 1682 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1683 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1684 // 1685 address generate_disjoint_oop_copy(bool aligned, address *entry, 1686 const char *name, bool dest_uninitialized) { 1687 const bool is_oop = true; 1688 const int size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1689 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1690 } 1691 1692 // Arguments: 1693 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1694 // ignored 1695 // name - stub name string 1696 // 1697 // Inputs: 1698 // c_rarg0 - source array address 1699 // c_rarg1 - destination array address 1700 // c_rarg2 - element count, treated as size_t, can be zero 1701 // 1702 address generate_conjoint_oop_copy(bool aligned, 1703 address nooverlap_target, address *entry, 1704 const char *name, bool dest_uninitialized) { 1705 const bool is_oop = true; 1706 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1707 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1708 name, dest_uninitialized); 1709 } 1710 1711 1712 // Helper for generating a dynamic type check. 1713 // Smashes rscratch1, rscratch2. 1714 void generate_type_check(Register sub_klass, 1715 Register super_check_offset, 1716 Register super_klass, 1717 Label& L_success) { 1718 assert_different_registers(sub_klass, super_check_offset, super_klass); 1719 1720 BLOCK_COMMENT("type_check:"); 1721 1722 Label L_miss; 1723 1724 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1725 super_check_offset); 1726 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1727 1728 // Fall through on failure! 1729 __ BIND(L_miss); 1730 } 1731 1732 // 1733 // Generate checkcasting array copy stub 1734 // 1735 // Input: 1736 // c_rarg0 - source array address 1737 // c_rarg1 - destination array address 1738 // c_rarg2 - element count, treated as ssize_t, can be zero 1739 // c_rarg3 - size_t ckoff (super_check_offset) 1740 // c_rarg4 - oop ckval (super_klass) 1741 // 1742 // Output: 1743 // r0 == 0 - success 1744 // r0 == -1^K - failure, where K is partial transfer count 1745 // 1746 address generate_checkcast_copy(const char *name, address *entry, 1747 bool dest_uninitialized = false) { 1748 1749 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1750 1751 // Input registers (after setup_arg_regs) 1752 const Register from = c_rarg0; // source array address 1753 const Register to = c_rarg1; // destination array address 1754 const Register count = c_rarg2; // elementscount 1755 const Register ckoff = c_rarg3; // super_check_offset 1756 const Register ckval = c_rarg4; // super_klass 1757 1758 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1759 RegSet wb_post_saved_regs = RegSet::of(count); 1760 1761 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1762 const Register copied_oop = r22; // actual oop copied 1763 const Register count_save = r21; // orig elementscount 1764 const Register start_to = r20; // destination array start address 1765 const Register r19_klass = r19; // oop._klass 1766 1767 //--------------------------------------------------------------- 1768 // Assembler stub will be used for this call to arraycopy 1769 // if the two arrays are subtypes of Object[] but the 1770 // destination array type is not equal to or a supertype 1771 // of the source type. Each element must be separately 1772 // checked. 
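    // A rough sketch (illustrative, not generated code) of the per-element check
    // that generate_type_check() emits for this stub; the real protocol lives in
    // MacroAssembler::check_klass_subtype_fast_path/slow_path:
    //   if (elem_klass == dst_elem_klass)                  goto store;  // exact hit
    //   if (*(elem_klass + ckoff) == dst_elem_klass)       goto store;  // primary/cache hit
    //   if (dst_elem_klass is found in elem_klass's
    //       secondary_supers array)                        goto store;  // slow path
    //   otherwise: stop copying and report -1^K to the caller.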
1773 1774 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1775 copied_oop, r19_klass, count_save); 1776 1777 __ align(CodeEntryAlignment); 1778 StubCodeMark mark(this, "StubRoutines", name); 1779 address start = __ pc(); 1780 1781 __ enter(); // required for proper stackwalking of RuntimeStub frame 1782 1783 #ifdef ASSERT 1784 // caller guarantees that the arrays really are different 1785 // otherwise, we would have to make conjoint checks 1786 { Label L; 1787 __ b(L); // conjoint check not yet implemented 1788 __ stop("checkcast_copy within a single array"); 1789 __ bind(L); 1790 } 1791 #endif //ASSERT 1792 1793 // Caller of this entry point must set up the argument registers. 1794 if (entry != NULL) { 1795 *entry = __ pc(); 1796 BLOCK_COMMENT("Entry:"); 1797 } 1798 1799 // Empty array: Nothing to do. 1800 __ cbz(count, L_done); 1801 __ push(RegSet::of(r19, r20, r21, r22), sp); 1802 1803 #ifdef ASSERT 1804 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1805 // The ckoff and ckval must be mutually consistent, 1806 // even though caller generates both. 1807 { Label L; 1808 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1809 __ ldrw(start_to, Address(ckval, sco_offset)); 1810 __ cmpw(ckoff, start_to); 1811 __ br(Assembler::EQ, L); 1812 __ stop("super_check_offset inconsistent"); 1813 __ bind(L); 1814 } 1815 #endif //ASSERT 1816 1817 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1818 bool is_oop = true; 1819 if (dest_uninitialized) { 1820 decorators |= IS_DEST_UNINITIALIZED; 1821 } 1822 1823 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1824 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1825 1826 // save the original count 1827 __ mov(count_save, count); 1828 1829 // Copy from low to high addresses 1830 __ mov(start_to, to); // Save destination array start address 1831 __ b(L_load_element); 1832 1833 // ======== begin loop ======== 1834 // (Loop is rotated; its entry is L_load_element.) 1835 // Loop control: 1836 // for (; count != 0; count--) { 1837 // copied_oop = load_heap_oop(from++); 1838 // ... generate_type_check ...; 1839 // store_heap_oop(to++, copied_oop); 1840 // } 1841 __ align(OptoLoopAlignment); 1842 1843 __ BIND(L_store_element); 1844 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW); // store the oop 1845 __ sub(count, count, 1); 1846 __ cbz(count, L_do_card_marks); 1847 1848 // ======== loop entry is here ======== 1849 __ BIND(L_load_element); 1850 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1851 __ cbz(copied_oop, L_store_element); 1852 1853 __ load_klass(r19_klass, copied_oop);// query the object klass 1854 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1855 // ======== end loop ======== 1856 1857 // It was a real error; we must depend on the caller to finish the job. 1858 // Register count = remaining oops, count_orig = total oops. 1859 // Emit GC store barriers for the oops we have copied and report 1860 // their number to the caller. 
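    // Worked example of the encoding below (illustrative): if 3 of 10 oops were
    // copied before an element failed the type check, K == 3 and the stub
    // returns ~3 == -4 (i.e. -1^3); the caller recovers K as ~r0. A fully
    // successful copy returns 0 via L_do_card_marks/L_done instead.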
1861 1862 __ subs(count, count_save, count); // K = partially copied oop count 1863 __ eon(count, count, zr); // report (-1^K) to caller 1864 __ br(Assembler::EQ, L_done_pop); 1865 1866 __ BIND(L_do_card_marks); 1867 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1868 1869 __ bind(L_done_pop); 1870 __ pop(RegSet::of(r19, r20, r21, r22), sp); 1871 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1872 1873 __ bind(L_done); 1874 __ mov(r0, count); 1875 __ leave(); 1876 __ ret(lr); 1877 1878 return start; 1879 } 1880 1881 // Perform range checks on the proposed arraycopy. 1882 // Kills temp, but nothing else. 1883 // Also, clean the sign bits of src_pos and dst_pos. 1884 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1885 Register src_pos, // source position (c_rarg1) 1886 Register dst, // destination array oo (c_rarg2) 1887 Register dst_pos, // destination position (c_rarg3) 1888 Register length, 1889 Register temp, 1890 Label& L_failed) { 1891 BLOCK_COMMENT("arraycopy_range_checks:"); 1892 1893 assert_different_registers(rscratch1, temp); 1894 1895 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1896 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1897 __ addw(temp, length, src_pos); 1898 __ cmpw(temp, rscratch1); 1899 __ br(Assembler::HI, L_failed); 1900 1901 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1902 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1903 __ addw(temp, length, dst_pos); 1904 __ cmpw(temp, rscratch1); 1905 __ br(Assembler::HI, L_failed); 1906 1907 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1908 __ movw(src_pos, src_pos); 1909 __ movw(dst_pos, dst_pos); 1910 1911 BLOCK_COMMENT("arraycopy_range_checks done"); 1912 } 1913 1914 // These stubs get called from some dumb test routine. 1915 // I'll write them properly when they're called from 1916 // something that's actually doing something. 1917 static void fake_arraycopy_stub(address src, address dst, int count) { 1918 assert(count == 0, "huh?"); 1919 } 1920 1921 1922 // 1923 // Generate 'unsafe' array copy stub 1924 // Though just as safe as the other stubs, it takes an unscaled 1925 // size_t argument instead of an element count. 1926 // 1927 // Input: 1928 // c_rarg0 - source array address 1929 // c_rarg1 - destination array address 1930 // c_rarg2 - byte count, treated as ssize_t, can be zero 1931 // 1932 // Examines the alignment of the operands and dispatches 1933 // to a long, int, short, or byte copy loop. 
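  // Roughly equivalent C-level dispatch (illustrative only):
  //   size_t bits = (size_t)s | (size_t)d | (size_t)count;
  //   if ((bits & (BytesPerLong - 1)) == 0)      tail-call long_copy  (count >> 3 elements);
  //   else if ((bits & (BytesPerInt - 1)) == 0)  tail-call int_copy   (count >> 2 elements);
  //   else if ((bits & 1) == 0)                  tail-call short_copy (count >> 1 elements);
  //   else                                       tail-call byte_copy  (count byte elements);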
1934 // 1935 address generate_unsafe_copy(const char *name, 1936 address byte_copy_entry, 1937 address short_copy_entry, 1938 address int_copy_entry, 1939 address long_copy_entry) { 1940 Label L_long_aligned, L_int_aligned, L_short_aligned; 1941 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1942 1943 __ align(CodeEntryAlignment); 1944 StubCodeMark mark(this, "StubRoutines", name); 1945 address start = __ pc(); 1946 __ enter(); // required for proper stackwalking of RuntimeStub frame 1947 1948 // bump this on entry, not on exit: 1949 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1950 1951 __ orr(rscratch1, s, d); 1952 __ orr(rscratch1, rscratch1, count); 1953 1954 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1955 __ cbz(rscratch1, L_long_aligned); 1956 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1957 __ cbz(rscratch1, L_int_aligned); 1958 __ tbz(rscratch1, 0, L_short_aligned); 1959 __ b(RuntimeAddress(byte_copy_entry)); 1960 1961 __ BIND(L_short_aligned); 1962 __ lsr(count, count, LogBytesPerShort); // size => short_count 1963 __ b(RuntimeAddress(short_copy_entry)); 1964 __ BIND(L_int_aligned); 1965 __ lsr(count, count, LogBytesPerInt); // size => int_count 1966 __ b(RuntimeAddress(int_copy_entry)); 1967 __ BIND(L_long_aligned); 1968 __ lsr(count, count, LogBytesPerLong); // size => long_count 1969 __ b(RuntimeAddress(long_copy_entry)); 1970 1971 return start; 1972 } 1973 1974 // 1975 // Generate generic array copy stubs 1976 // 1977 // Input: 1978 // c_rarg0 - src oop 1979 // c_rarg1 - src_pos (32-bits) 1980 // c_rarg2 - dst oop 1981 // c_rarg3 - dst_pos (32-bits) 1982 // c_rarg4 - element count (32-bits) 1983 // 1984 // Output: 1985 // r0 == 0 - success 1986 // r0 == -1^K - failure, where K is partial transfer count 1987 // 1988 address generate_generic_copy(const char *name, 1989 address byte_copy_entry, address short_copy_entry, 1990 address int_copy_entry, address oop_copy_entry, 1991 address long_copy_entry, address checkcast_copy_entry) { 1992 1993 Label L_failed, L_objArray; 1994 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1995 1996 // Input registers 1997 const Register src = c_rarg0; // source array oop 1998 const Register src_pos = c_rarg1; // source position 1999 const Register dst = c_rarg2; // destination array oop 2000 const Register dst_pos = c_rarg3; // destination position 2001 const Register length = c_rarg4; 2002 2003 2004 // Registers used as temps 2005 const Register dst_klass = c_rarg5; 2006 2007 __ align(CodeEntryAlignment); 2008 2009 StubCodeMark mark(this, "StubRoutines", name); 2010 2011 address start = __ pc(); 2012 2013 __ enter(); // required for proper stackwalking of RuntimeStub frame 2014 2015 // bump this on entry, not on exit: 2016 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2017 2018 //----------------------------------------------------------------------- 2019 // Assembler stub will be used for this call to arraycopy 2020 // if the following conditions are met: 2021 // 2022 // (1) src and dst must not be null. 2023 // (2) src_pos must not be negative. 2024 // (3) dst_pos must not be negative. 2025 // (4) length must not be negative. 2026 // (5) src klass and dst klass should be the same and not NULL. 2027 // (6) src and dst should be arrays. 2028 // (7) src_pos + length must not exceed length of src. 2029 // (8) dst_pos + length must not exceed length of dst. 
2030 // 2031 2032 // if (src == NULL) return -1; 2033 __ cbz(src, L_failed); 2034 2035 // if (src_pos < 0) return -1; 2036 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2037 2038 // if (dst == NULL) return -1; 2039 __ cbz(dst, L_failed); 2040 2041 // if (dst_pos < 0) return -1; 2042 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2043 2044 // registers used as temp 2045 const Register scratch_length = r16; // elements count to copy 2046 const Register scratch_src_klass = r17; // array klass 2047 const Register lh = r15; // layout helper 2048 2049 // if (length < 0) return -1; 2050 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2051 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2052 2053 __ load_klass(scratch_src_klass, src); 2054 #ifdef ASSERT 2055 // assert(src->klass() != NULL); 2056 { 2057 BLOCK_COMMENT("assert klasses not null {"); 2058 Label L1, L2; 2059 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2060 __ bind(L1); 2061 __ stop("broken null klass"); 2062 __ bind(L2); 2063 __ load_klass(rscratch1, dst); 2064 __ cbz(rscratch1, L1); // this would be broken also 2065 BLOCK_COMMENT("} assert klasses not null done"); 2066 } 2067 #endif 2068 2069 // Load layout helper (32-bits) 2070 // 2071 // |array_tag| | header_size | element_type | |log2_element_size| 2072 // 32 30 24 16 8 2 0 2073 // 2074 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2075 // 2076 2077 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2078 2079 // Handle objArrays completely differently... 2080 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2081 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2082 __ movw(rscratch1, objArray_lh); 2083 __ eorw(rscratch2, lh, rscratch1); 2084 __ cbzw(rscratch2, L_objArray); 2085 2086 // if (src->klass() != dst->klass()) return -1; 2087 __ load_klass(rscratch2, dst); 2088 __ eor(rscratch2, rscratch2, scratch_src_klass); 2089 __ cbnz(rscratch2, L_failed); 2090 2091 // if (!src->is_Array()) return -1; 2092 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2093 2094 // At this point, it is known to be a typeArray (array_tag 0x3). 
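    // Illustrative decode (hedged; exact field values depend on the build):
    //   lh == (array_tag << 30) | (header_size_in_bytes << 16)
    //         | (element_type << 8) | log2_element_size
    // For an int[] array log2_element_size == 2 (bits 1:0 == 0b10), so the
    // bitwise binary search over r15_elsize further below branches to
    // L_copy_ints and then falls into the jint copy stub.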
2095 #ifdef ASSERT 2096 { 2097 BLOCK_COMMENT("assert primitive array {"); 2098 Label L; 2099 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2100 __ cmpw(lh, rscratch2); 2101 __ br(Assembler::GE, L); 2102 __ stop("must be a primitive array"); 2103 __ bind(L); 2104 BLOCK_COMMENT("} assert primitive array done"); 2105 } 2106 #endif 2107 2108 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2109 rscratch2, L_failed); 2110 2111 // TypeArrayKlass 2112 // 2113 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2114 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2115 // 2116 2117 const Register rscratch1_offset = rscratch1; // array offset 2118 const Register r15_elsize = lh; // element size 2119 2120 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2121 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2122 __ add(src, src, rscratch1_offset); // src array offset 2123 __ add(dst, dst, rscratch1_offset); // dst array offset 2124 BLOCK_COMMENT("choose copy loop based on element size"); 2125 2126 // next registers should be set before the jump to corresponding stub 2127 const Register from = c_rarg0; // source array address 2128 const Register to = c_rarg1; // destination array address 2129 const Register count = c_rarg2; // elements count 2130 2131 // 'from', 'to', 'count' registers should be set in such order 2132 // since they are the same as 'src', 'src_pos', 'dst'. 2133 2134 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2135 2136 // The possible values of elsize are 0-3, i.e. exact_log2(element 2137 // size in bytes). We do a simple bitwise binary search. 2138 __ BIND(L_copy_bytes); 2139 __ tbnz(r15_elsize, 1, L_copy_ints); 2140 __ tbnz(r15_elsize, 0, L_copy_shorts); 2141 __ lea(from, Address(src, src_pos));// src_addr 2142 __ lea(to, Address(dst, dst_pos));// dst_addr 2143 __ movw(count, scratch_length); // length 2144 __ b(RuntimeAddress(byte_copy_entry)); 2145 2146 __ BIND(L_copy_shorts); 2147 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2148 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2149 __ movw(count, scratch_length); // length 2150 __ b(RuntimeAddress(short_copy_entry)); 2151 2152 __ BIND(L_copy_ints); 2153 __ tbnz(r15_elsize, 0, L_copy_longs); 2154 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2155 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2156 __ movw(count, scratch_length); // length 2157 __ b(RuntimeAddress(int_copy_entry)); 2158 2159 __ BIND(L_copy_longs); 2160 #ifdef ASSERT 2161 { 2162 BLOCK_COMMENT("assert long copy {"); 2163 Label L; 2164 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2165 __ cmpw(r15_elsize, LogBytesPerLong); 2166 __ br(Assembler::EQ, L); 2167 __ stop("must be long copy, but elsize is wrong"); 2168 __ bind(L); 2169 BLOCK_COMMENT("} assert long copy done"); 2170 } 2171 #endif 2172 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2173 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2174 __ movw(count, scratch_length); // length 2175 __ b(RuntimeAddress(long_copy_entry)); 2176 2177 // ObjArrayKlass 2178 __ BIND(L_objArray); 2179 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2180 2181 Label L_plain_copy, L_checkcast_copy; 2182 // test array classes for subtyping 2183 __ load_klass(r15, dst); 2184 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2185 __ br(Assembler::NE, L_checkcast_copy); 2186 2187 // Identically typed arrays can be copied without element-wise checks. 2188 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2189 rscratch2, L_failed); 2190 2191 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2192 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2193 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2194 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2195 __ movw(count, scratch_length); // length 2196 __ BIND(L_plain_copy); 2197 __ b(RuntimeAddress(oop_copy_entry)); 2198 2199 __ BIND(L_checkcast_copy); 2200 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2201 { 2202 // Before looking at dst.length, make sure dst is also an objArray. 2203 __ ldrw(rscratch1, Address(r15, lh_offset)); 2204 __ movw(rscratch2, objArray_lh); 2205 __ eorw(rscratch1, rscratch1, rscratch2); 2206 __ cbnzw(rscratch1, L_failed); 2207 2208 // It is safe to examine both src.length and dst.length. 2209 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2210 r15, L_failed); 2211 2212 __ load_klass(dst_klass, dst); // reload 2213 2214 // Marshal the base address arguments now, freeing registers. 2215 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2216 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2217 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2218 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2219 __ movw(count, length); // length (reloaded) 2220 Register sco_temp = c_rarg3; // this register is free now 2221 assert_different_registers(from, to, count, sco_temp, 2222 dst_klass, scratch_src_klass); 2223 // assert_clean_int(count, sco_temp); 2224 2225 // Generate the type check. 2226 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2227 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2228 2229 // Smashes rscratch1, rscratch2 2230 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2231 2232 // Fetch destination element klass from the ObjArrayKlass header. 2233 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2234 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2235 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2236 2237 // the checkcast_copy loop needs two extra arguments: 2238 assert(c_rarg3 == sco_temp, "#3 already in place"); 2239 // Set up arguments for checkcast_copy_entry. 2240 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2241 __ b(RuntimeAddress(checkcast_copy_entry)); 2242 } 2243 2244 __ BIND(L_failed); 2245 __ mov(r0, -1); 2246 __ leave(); // required for proper stackwalking of RuntimeStub frame 2247 __ ret(lr); 2248 2249 return start; 2250 } 2251 2252 // 2253 // Generate stub for array fill. If "aligned" is true, the 2254 // "to" address is assumed to be heapword aligned. 
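  // The fill value is widened to 64 bits with successive bfi instructions before
  // the word loop; e.g. (illustrative) a T_BYTE value 0xAB becomes 0xABAB, then
  // 0xABABABAB, and finally 0xABABABABABABABAB for the 8-byte stores done by
  // fill_words().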
2255 // 2256 // Arguments for generated stub: 2257 // to: c_rarg0 2258 // value: c_rarg1 2259 // count: c_rarg2 treated as signed 2260 // 2261 address generate_fill(BasicType t, bool aligned, const char *name) { 2262 __ align(CodeEntryAlignment); 2263 StubCodeMark mark(this, "StubRoutines", name); 2264 address start = __ pc(); 2265 2266 BLOCK_COMMENT("Entry:"); 2267 2268 const Register to = c_rarg0; // source array address 2269 const Register value = c_rarg1; // value 2270 const Register count = c_rarg2; // elements count 2271 2272 const Register bz_base = r10; // base for block_zero routine 2273 const Register cnt_words = r11; // temp register 2274 2275 __ enter(); 2276 2277 Label L_fill_elements, L_exit1; 2278 2279 int shift = -1; 2280 switch (t) { 2281 case T_BYTE: 2282 shift = 0; 2283 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2284 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2285 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2286 __ br(Assembler::LO, L_fill_elements); 2287 break; 2288 case T_SHORT: 2289 shift = 1; 2290 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2291 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2292 __ br(Assembler::LO, L_fill_elements); 2293 break; 2294 case T_INT: 2295 shift = 2; 2296 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2297 __ br(Assembler::LO, L_fill_elements); 2298 break; 2299 default: ShouldNotReachHere(); 2300 } 2301 2302 // Align source address at 8 bytes address boundary. 2303 Label L_skip_align1, L_skip_align2, L_skip_align4; 2304 if (!aligned) { 2305 switch (t) { 2306 case T_BYTE: 2307 // One byte misalignment happens only for byte arrays. 2308 __ tbz(to, 0, L_skip_align1); 2309 __ strb(value, Address(__ post(to, 1))); 2310 __ subw(count, count, 1); 2311 __ bind(L_skip_align1); 2312 // Fallthrough 2313 case T_SHORT: 2314 // Two bytes misalignment happens only for byte and short (char) arrays. 2315 __ tbz(to, 1, L_skip_align2); 2316 __ strh(value, Address(__ post(to, 2))); 2317 __ subw(count, count, 2 >> shift); 2318 __ bind(L_skip_align2); 2319 // Fallthrough 2320 case T_INT: 2321 // Align to 8 bytes, we know we are 4 byte aligned to start. 2322 __ tbz(to, 2, L_skip_align4); 2323 __ strw(value, Address(__ post(to, 4))); 2324 __ subw(count, count, 4 >> shift); 2325 __ bind(L_skip_align4); 2326 break; 2327 default: ShouldNotReachHere(); 2328 } 2329 } 2330 2331 // 2332 // Fill large chunks 2333 // 2334 __ lsrw(cnt_words, count, 3 - shift); // number of words 2335 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2336 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2337 if (UseBlockZeroing) { 2338 Label non_block_zeroing, rest; 2339 // If the fill value is zero we can use the fast zero_words(). 2340 __ cbnz(value, non_block_zeroing); 2341 __ mov(bz_base, to); 2342 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2343 address tpc = __ zero_words(bz_base, cnt_words); 2344 if (tpc == nullptr) { 2345 fatal("CodeCache is full at generate_fill"); 2346 } 2347 __ b(rest); 2348 __ bind(non_block_zeroing); 2349 __ fill_words(to, cnt_words, value); 2350 __ bind(rest); 2351 } else { 2352 __ fill_words(to, cnt_words, value); 2353 } 2354 2355 // Remaining count is less than 8 bytes. Fill it by a single store. 2356 // Note that the total length is no less than 8 bytes. 
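    // Illustrative example (assuming the destination was already 8-byte aligned):
    // filling 13 bytes stores one word via fill_words, leaving count == 5; the
    // code below then writes one more 8-byte word ending exactly at the last
    // element, rewriting 3 already-filled bytes -- harmless, since every byte
    // holds the same value.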
2357 if (t == T_BYTE || t == T_SHORT) { 2358 Label L_exit1; 2359 __ cbzw(count, L_exit1); 2360 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2361 __ str(value, Address(to, -8)); // overwrite some elements 2362 __ bind(L_exit1); 2363 __ leave(); 2364 __ ret(lr); 2365 } 2366 2367 // Handle copies less than 8 bytes. 2368 Label L_fill_2, L_fill_4, L_exit2; 2369 __ bind(L_fill_elements); 2370 switch (t) { 2371 case T_BYTE: 2372 __ tbz(count, 0, L_fill_2); 2373 __ strb(value, Address(__ post(to, 1))); 2374 __ bind(L_fill_2); 2375 __ tbz(count, 1, L_fill_4); 2376 __ strh(value, Address(__ post(to, 2))); 2377 __ bind(L_fill_4); 2378 __ tbz(count, 2, L_exit2); 2379 __ strw(value, Address(to)); 2380 break; 2381 case T_SHORT: 2382 __ tbz(count, 0, L_fill_4); 2383 __ strh(value, Address(__ post(to, 2))); 2384 __ bind(L_fill_4); 2385 __ tbz(count, 1, L_exit2); 2386 __ strw(value, Address(to)); 2387 break; 2388 case T_INT: 2389 __ cbzw(count, L_exit2); 2390 __ strw(value, Address(to)); 2391 break; 2392 default: ShouldNotReachHere(); 2393 } 2394 __ bind(L_exit2); 2395 __ leave(); 2396 __ ret(lr); 2397 return start; 2398 } 2399 2400 address generate_data_cache_writeback() { 2401 const Register line = c_rarg0; // address of line to write back 2402 2403 __ align(CodeEntryAlignment); 2404 2405 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2406 2407 address start = __ pc(); 2408 __ enter(); 2409 __ cache_wb(Address(line, 0)); 2410 __ leave(); 2411 __ ret(lr); 2412 2413 return start; 2414 } 2415 2416 address generate_data_cache_writeback_sync() { 2417 const Register is_pre = c_rarg0; // pre or post sync 2418 2419 __ align(CodeEntryAlignment); 2420 2421 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2422 2423 // pre wbsync is a no-op 2424 // post wbsync translates to an sfence 2425 2426 Label skip; 2427 address start = __ pc(); 2428 __ enter(); 2429 __ cbnz(is_pre, skip); 2430 __ cache_wbsync(false); 2431 __ bind(skip); 2432 __ leave(); 2433 __ ret(lr); 2434 2435 return start; 2436 } 2437 2438 void generate_arraycopy_stubs() { 2439 address entry; 2440 address entry_jbyte_arraycopy; 2441 address entry_jshort_arraycopy; 2442 address entry_jint_arraycopy; 2443 address entry_oop_arraycopy; 2444 address entry_jlong_arraycopy; 2445 address entry_checkcast_arraycopy; 2446 2447 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2448 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2449 2450 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2451 2452 //*** jbyte 2453 // Always need aligned and unaligned versions 2454 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2455 "jbyte_disjoint_arraycopy"); 2456 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2457 &entry_jbyte_arraycopy, 2458 "jbyte_arraycopy"); 2459 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2460 "arrayof_jbyte_disjoint_arraycopy"); 2461 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2462 "arrayof_jbyte_arraycopy"); 2463 2464 //*** jshort 2465 // Always need aligned and unaligned versions 2466 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2467 "jshort_disjoint_arraycopy"); 2468 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2469 &entry_jshort_arraycopy, 2470 "jshort_arraycopy"); 2471 StubRoutines::_arrayof_jshort_disjoint_arraycopy = 
generate_disjoint_short_copy(true, &entry, 2472 "arrayof_jshort_disjoint_arraycopy"); 2473 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2474 "arrayof_jshort_arraycopy"); 2475 2476 //*** jint 2477 // Aligned versions 2478 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2479 "arrayof_jint_disjoint_arraycopy"); 2480 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2481 "arrayof_jint_arraycopy"); 2482 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2483 // entry_jint_arraycopy always points to the unaligned version 2484 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2485 "jint_disjoint_arraycopy"); 2486 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2487 &entry_jint_arraycopy, 2488 "jint_arraycopy"); 2489 2490 //*** jlong 2491 // It is always aligned 2492 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2493 "arrayof_jlong_disjoint_arraycopy"); 2494 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2495 "arrayof_jlong_arraycopy"); 2496 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2497 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2498 2499 //*** oops 2500 { 2501 // With compressed oops we need unaligned versions; notice that 2502 // we overwrite entry_oop_arraycopy. 2503 bool aligned = !UseCompressedOops; 2504 2505 StubRoutines::_arrayof_oop_disjoint_arraycopy 2506 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2507 /*dest_uninitialized*/false); 2508 StubRoutines::_arrayof_oop_arraycopy 2509 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2510 /*dest_uninitialized*/false); 2511 // Aligned versions without pre-barriers 2512 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2513 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2514 /*dest_uninitialized*/true); 2515 StubRoutines::_arrayof_oop_arraycopy_uninit 2516 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2517 /*dest_uninitialized*/true); 2518 } 2519 2520 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2521 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2522 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2523 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2524 2525 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2526 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2527 /*dest_uninitialized*/true); 2528 2529 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2530 entry_jbyte_arraycopy, 2531 entry_jshort_arraycopy, 2532 entry_jint_arraycopy, 2533 entry_jlong_arraycopy); 2534 2535 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2536 entry_jbyte_arraycopy, 2537 entry_jshort_arraycopy, 2538 entry_jint_arraycopy, 2539 entry_oop_arraycopy, 2540 entry_jlong_arraycopy, 2541 entry_checkcast_arraycopy); 2542 2543 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 
2544 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2545 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2546 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2547 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2548 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2549 } 2550 2551 void generate_math_stubs() { Unimplemented(); } 2552 2553 // Arguments: 2554 // 2555 // Inputs: 2556 // c_rarg0 - source byte array address 2557 // c_rarg1 - destination byte array address 2558 // c_rarg2 - K (key) in little endian int array 2559 // 2560 address generate_aescrypt_encryptBlock() { 2561 __ align(CodeEntryAlignment); 2562 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2563 2564 const Register from = c_rarg0; // source array address 2565 const Register to = c_rarg1; // destination array address 2566 const Register key = c_rarg2; // key array address 2567 const Register keylen = rscratch1; 2568 2569 address start = __ pc(); 2570 __ enter(); 2571 2572 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2573 2574 __ aesenc_loadkeys(key, keylen); 2575 __ aesecb_encrypt(from, to, keylen); 2576 2577 __ mov(r0, 0); 2578 2579 __ leave(); 2580 __ ret(lr); 2581 2582 return start; 2583 } 2584 2585 // Arguments: 2586 // 2587 // Inputs: 2588 // c_rarg0 - source byte array address 2589 // c_rarg1 - destination byte array address 2590 // c_rarg2 - K (key) in little endian int array 2591 // 2592 address generate_aescrypt_decryptBlock() { 2593 assert(UseAES, "need AES cryptographic extension support"); 2594 __ align(CodeEntryAlignment); 2595 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2596 Label L_doLast; 2597 2598 const Register from = c_rarg0; // source array address 2599 const Register to = c_rarg1; // destination array address 2600 const Register key = c_rarg2; // key array address 2601 const Register keylen = rscratch1; 2602 2603 address start = __ pc(); 2604 __ enter(); // required for proper stackwalking of RuntimeStub frame 2605 2606 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2607 2608 __ aesecb_decrypt(from, to, key, keylen); 2609 2610 __ mov(r0, 0); 2611 2612 __ leave(); 2613 __ ret(lr); 2614 2615 return start; 2616 } 2617 2618 // Arguments: 2619 // 2620 // Inputs: 2621 // c_rarg0 - source byte array address 2622 // c_rarg1 - destination byte array address 2623 // c_rarg2 - K (key) in little endian int array 2624 // c_rarg3 - r vector byte array address 2625 // c_rarg4 - input length 2626 // 2627 // Output: 2628 // x0 - input length 2629 // 2630 address generate_cipherBlockChaining_encryptAESCrypt() { 2631 assert(UseAES, "need AES cryptographic extension support"); 2632 __ align(CodeEntryAlignment); 2633 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2634 2635 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2636 2637 const Register from = c_rarg0; // source array address 2638 const Register to = c_rarg1; // destination array address 2639 const Register key = c_rarg2; // key array address 2640 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2641 // and left with the results of the last encryption block 2642 const Register len_reg = c_rarg4; // src len (must be multiple of 
blocksize 16) 2643 const Register keylen = rscratch1; 2644 2645 address start = __ pc(); 2646 2647 __ enter(); 2648 2649 __ movw(rscratch2, len_reg); 2650 2651 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2652 2653 __ ld1(v0, __ T16B, rvec); 2654 2655 __ cmpw(keylen, 52); 2656 __ br(Assembler::CC, L_loadkeys_44); 2657 __ br(Assembler::EQ, L_loadkeys_52); 2658 2659 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2660 __ rev32(v17, __ T16B, v17); 2661 __ rev32(v18, __ T16B, v18); 2662 __ BIND(L_loadkeys_52); 2663 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2664 __ rev32(v19, __ T16B, v19); 2665 __ rev32(v20, __ T16B, v20); 2666 __ BIND(L_loadkeys_44); 2667 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2668 __ rev32(v21, __ T16B, v21); 2669 __ rev32(v22, __ T16B, v22); 2670 __ rev32(v23, __ T16B, v23); 2671 __ rev32(v24, __ T16B, v24); 2672 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2673 __ rev32(v25, __ T16B, v25); 2674 __ rev32(v26, __ T16B, v26); 2675 __ rev32(v27, __ T16B, v27); 2676 __ rev32(v28, __ T16B, v28); 2677 __ ld1(v29, v30, v31, __ T16B, key); 2678 __ rev32(v29, __ T16B, v29); 2679 __ rev32(v30, __ T16B, v30); 2680 __ rev32(v31, __ T16B, v31); 2681 2682 __ BIND(L_aes_loop); 2683 __ ld1(v1, __ T16B, __ post(from, 16)); 2684 __ eor(v0, __ T16B, v0, v1); 2685 2686 __ br(Assembler::CC, L_rounds_44); 2687 __ br(Assembler::EQ, L_rounds_52); 2688 2689 __ aese(v0, v17); __ aesmc(v0, v0); 2690 __ aese(v0, v18); __ aesmc(v0, v0); 2691 __ BIND(L_rounds_52); 2692 __ aese(v0, v19); __ aesmc(v0, v0); 2693 __ aese(v0, v20); __ aesmc(v0, v0); 2694 __ BIND(L_rounds_44); 2695 __ aese(v0, v21); __ aesmc(v0, v0); 2696 __ aese(v0, v22); __ aesmc(v0, v0); 2697 __ aese(v0, v23); __ aesmc(v0, v0); 2698 __ aese(v0, v24); __ aesmc(v0, v0); 2699 __ aese(v0, v25); __ aesmc(v0, v0); 2700 __ aese(v0, v26); __ aesmc(v0, v0); 2701 __ aese(v0, v27); __ aesmc(v0, v0); 2702 __ aese(v0, v28); __ aesmc(v0, v0); 2703 __ aese(v0, v29); __ aesmc(v0, v0); 2704 __ aese(v0, v30); 2705 __ eor(v0, __ T16B, v0, v31); 2706 2707 __ st1(v0, __ T16B, __ post(to, 16)); 2708 2709 __ subw(len_reg, len_reg, 16); 2710 __ cbnzw(len_reg, L_aes_loop); 2711 2712 __ st1(v0, __ T16B, rvec); 2713 2714 __ mov(r0, rscratch2); 2715 2716 __ leave(); 2717 __ ret(lr); 2718 2719 return start; 2720 } 2721 2722 // Arguments: 2723 // 2724 // Inputs: 2725 // c_rarg0 - source byte array address 2726 // c_rarg1 - destination byte array address 2727 // c_rarg2 - K (key) in little endian int array 2728 // c_rarg3 - r vector byte array address 2729 // c_rarg4 - input length 2730 // 2731 // Output: 2732 // r0 - input length 2733 // 2734 address generate_cipherBlockChaining_decryptAESCrypt() { 2735 assert(UseAES, "need AES cryptographic extension support"); 2736 __ align(CodeEntryAlignment); 2737 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2738 2739 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2740 2741 const Register from = c_rarg0; // source array address 2742 const Register to = c_rarg1; // destination array address 2743 const Register key = c_rarg2; // key array address 2744 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2745 // and left with the results of the last encryption block 2746 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2747 const Register keylen = rscratch1; 2748 2749 address start = __ pc(); 2750 2751 __ enter(); 
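    // Note (illustrative): the key array holds the expanded AES key schedule, so
    // keylen is 4*(rounds+1) ints -- 44, 52 or 60 for AES-128/-192/-256. The
    // cmpw(keylen, 52) below picks the 10-, 12- or 14-round path via
    // L_loadkeys_44, L_loadkeys_52, or fall-through.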
2752 2753 __ movw(rscratch2, len_reg); 2754 2755 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2756 2757 __ ld1(v2, __ T16B, rvec); 2758 2759 __ ld1(v31, __ T16B, __ post(key, 16)); 2760 __ rev32(v31, __ T16B, v31); 2761 2762 __ cmpw(keylen, 52); 2763 __ br(Assembler::CC, L_loadkeys_44); 2764 __ br(Assembler::EQ, L_loadkeys_52); 2765 2766 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2767 __ rev32(v17, __ T16B, v17); 2768 __ rev32(v18, __ T16B, v18); 2769 __ BIND(L_loadkeys_52); 2770 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2771 __ rev32(v19, __ T16B, v19); 2772 __ rev32(v20, __ T16B, v20); 2773 __ BIND(L_loadkeys_44); 2774 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2775 __ rev32(v21, __ T16B, v21); 2776 __ rev32(v22, __ T16B, v22); 2777 __ rev32(v23, __ T16B, v23); 2778 __ rev32(v24, __ T16B, v24); 2779 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2780 __ rev32(v25, __ T16B, v25); 2781 __ rev32(v26, __ T16B, v26); 2782 __ rev32(v27, __ T16B, v27); 2783 __ rev32(v28, __ T16B, v28); 2784 __ ld1(v29, v30, __ T16B, key); 2785 __ rev32(v29, __ T16B, v29); 2786 __ rev32(v30, __ T16B, v30); 2787 2788 __ BIND(L_aes_loop); 2789 __ ld1(v0, __ T16B, __ post(from, 16)); 2790 __ orr(v1, __ T16B, v0, v0); 2791 2792 __ br(Assembler::CC, L_rounds_44); 2793 __ br(Assembler::EQ, L_rounds_52); 2794 2795 __ aesd(v0, v17); __ aesimc(v0, v0); 2796 __ aesd(v0, v18); __ aesimc(v0, v0); 2797 __ BIND(L_rounds_52); 2798 __ aesd(v0, v19); __ aesimc(v0, v0); 2799 __ aesd(v0, v20); __ aesimc(v0, v0); 2800 __ BIND(L_rounds_44); 2801 __ aesd(v0, v21); __ aesimc(v0, v0); 2802 __ aesd(v0, v22); __ aesimc(v0, v0); 2803 __ aesd(v0, v23); __ aesimc(v0, v0); 2804 __ aesd(v0, v24); __ aesimc(v0, v0); 2805 __ aesd(v0, v25); __ aesimc(v0, v0); 2806 __ aesd(v0, v26); __ aesimc(v0, v0); 2807 __ aesd(v0, v27); __ aesimc(v0, v0); 2808 __ aesd(v0, v28); __ aesimc(v0, v0); 2809 __ aesd(v0, v29); __ aesimc(v0, v0); 2810 __ aesd(v0, v30); 2811 __ eor(v0, __ T16B, v0, v31); 2812 __ eor(v0, __ T16B, v0, v2); 2813 2814 __ st1(v0, __ T16B, __ post(to, 16)); 2815 __ orr(v2, __ T16B, v1, v1); 2816 2817 __ subw(len_reg, len_reg, 16); 2818 __ cbnzw(len_reg, L_aes_loop); 2819 2820 __ st1(v2, __ T16B, rvec); 2821 2822 __ mov(r0, rscratch2); 2823 2824 __ leave(); 2825 __ ret(lr); 2826 2827 return start; 2828 } 2829 2830 // CTR AES crypt. 2831 // Arguments: 2832 // 2833 // Inputs: 2834 // c_rarg0 - source byte array address 2835 // c_rarg1 - destination byte array address 2836 // c_rarg2 - K (key) in little endian int array 2837 // c_rarg3 - counter vector byte array address 2838 // c_rarg4 - input length 2839 // c_rarg5 - saved encryptedCounter start 2840 // c_rarg6 - saved used length 2841 // 2842 // Output: 2843 // r0 - input length 2844 // 2845 address generate_counterMode_AESCrypt() { 2846 const Register in = c_rarg0; 2847 const Register out = c_rarg1; 2848 const Register key = c_rarg2; 2849 const Register counter = c_rarg3; 2850 const Register saved_len = c_rarg4, len = r10; 2851 const Register saved_encrypted_ctr = c_rarg5; 2852 const Register used_ptr = c_rarg6, used = r12; 2853 2854 const Register offset = r7; 2855 const Register keylen = r11; 2856 2857 const unsigned char block_size = 16; 2858 const int bulk_width = 4; 2859 // NB: bulk_width can be 4 or 8. 
8 gives slightly faster 2860 // performance with larger data sizes, but it also means that the 2861 // fast path isn't used until you have at least 8 blocks, and up 2862 // to 127 bytes of data will be executed on the slow path. For 2863 // that reason, and also so as not to blow away too much icache, 4 2864 // blocks seems like a sensible compromise. 2865 2866 // Algorithm: 2867 // 2868 // if (len == 0) { 2869 // goto DONE; 2870 // } 2871 // int result = len; 2872 // do { 2873 // if (used >= blockSize) { 2874 // if (len >= bulk_width * blockSize) { 2875 // CTR_large_block(); 2876 // if (len == 0) 2877 // goto DONE; 2878 // } 2879 // for (;;) { 2880 // 16ByteVector v0 = counter; 2881 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 2882 // used = 0; 2883 // if (len < blockSize) 2884 // break; /* goto NEXT */ 2885 // 16ByteVector v1 = load16Bytes(in, offset); 2886 // v1 = v1 ^ encryptedCounter; 2887 // store16Bytes(out, offset); 2888 // used = blockSize; 2889 // offset += blockSize; 2890 // len -= blockSize; 2891 // if (len == 0) 2892 // goto DONE; 2893 // } 2894 // } 2895 // NEXT: 2896 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 2897 // len--; 2898 // } while (len != 0); 2899 // DONE: 2900 // return result; 2901 // 2902 // CTR_large_block() 2903 // Wide bulk encryption of whole blocks. 2904 2905 __ align(CodeEntryAlignment); 2906 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 2907 const address start = __ pc(); 2908 __ enter(); 2909 2910 Label DONE, CTR_large_block, large_block_return; 2911 __ ldrw(used, Address(used_ptr)); 2912 __ cbzw(saved_len, DONE); 2913 2914 __ mov(len, saved_len); 2915 __ mov(offset, 0); 2916 2917 // Compute #rounds for AES based on the length of the key array 2918 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2919 2920 __ aesenc_loadkeys(key, keylen); 2921 2922 { 2923 Label L_CTR_loop, NEXT; 2924 2925 __ bind(L_CTR_loop); 2926 2927 __ cmp(used, block_size); 2928 __ br(__ LO, NEXT); 2929 2930 // Maybe we have a lot of data 2931 __ subsw(rscratch1, len, bulk_width * block_size); 2932 __ br(__ HS, CTR_large_block); 2933 __ BIND(large_block_return); 2934 __ cbzw(len, DONE); 2935 2936 // Setup the counter 2937 __ movi(v4, __ T4S, 0); 2938 __ movi(v5, __ T4S, 1); 2939 __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 } 2940 2941 __ ld1(v0, __ T16B, counter); // Load the counter into v0 2942 __ rev32(v16, __ T16B, v0); 2943 __ addv(v16, __ T4S, v16, v4); 2944 __ rev32(v16, __ T16B, v16); 2945 __ st1(v16, __ T16B, counter); // Save the incremented counter back 2946 2947 { 2948 // We have fewer than bulk_width blocks of data left. Encrypt 2949 // them one by one until there is less than a full block 2950 // remaining, being careful to save both the encrypted counter 2951 // and the counter. 2952 2953 Label inner_loop; 2954 __ bind(inner_loop); 2955 // Counter to encrypt is in v0 2956 __ aesecb_encrypt(noreg, noreg, keylen); 2957 __ st1(v0, __ T16B, saved_encrypted_ctr); 2958 2959 // Do we have a remaining full block? 
2960 2961 __ mov(used, 0); 2962 __ cmp(len, block_size); 2963 __ br(__ LO, NEXT); 2964 2965 // Yes, we have a full block 2966 __ ldrq(v1, Address(in, offset)); 2967 __ eor(v1, __ T16B, v1, v0); 2968 __ strq(v1, Address(out, offset)); 2969 __ mov(used, block_size); 2970 __ add(offset, offset, block_size); 2971 2972 __ subw(len, len, block_size); 2973 __ cbzw(len, DONE); 2974 2975 // Increment the counter, store it back 2976 __ orr(v0, __ T16B, v16, v16); 2977 __ rev32(v16, __ T16B, v16); 2978 __ addv(v16, __ T4S, v16, v4); 2979 __ rev32(v16, __ T16B, v16); 2980 __ st1(v16, __ T16B, counter); // Save the incremented counter back 2981 2982 __ b(inner_loop); 2983 } 2984 2985 __ BIND(NEXT); 2986 2987 // Encrypt a single byte, and loop. 2988 // We expect this to be a rare event. 2989 __ ldrb(rscratch1, Address(in, offset)); 2990 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 2991 __ eor(rscratch1, rscratch1, rscratch2); 2992 __ strb(rscratch1, Address(out, offset)); 2993 __ add(offset, offset, 1); 2994 __ add(used, used, 1); 2995 __ subw(len, len,1); 2996 __ cbnzw(len, L_CTR_loop); 2997 } 2998 2999 __ bind(DONE); 3000 __ strw(used, Address(used_ptr)); 3001 __ mov(r0, saved_len); 3002 3003 __ leave(); // required for proper stackwalking of RuntimeStub frame 3004 __ ret(lr); 3005 3006 // Bulk encryption 3007 3008 __ BIND (CTR_large_block); 3009 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3010 3011 if (bulk_width == 8) { 3012 __ sub(sp, sp, 4 * 16); 3013 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3014 } 3015 __ sub(sp, sp, 4 * 16); 3016 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3017 RegSet saved_regs = (RegSet::of(in, out, offset) 3018 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3019 __ push(saved_regs, sp); 3020 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3021 __ add(in, in, offset); 3022 __ add(out, out, offset); 3023 3024 // Keys should already be loaded into the correct registers 3025 3026 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3027 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3028 3029 // AES/CTR loop 3030 { 3031 Label L_CTR_loop; 3032 __ BIND(L_CTR_loop); 3033 3034 // Setup the counters 3035 __ movi(v8, __ T4S, 0); 3036 __ movi(v9, __ T4S, 1); 3037 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3038 3039 for (int i = 0; i < bulk_width; i++) { 3040 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3041 __ rev32(v0_ofs, __ T16B, v16); 3042 __ addv(v16, __ T4S, v16, v8); 3043 } 3044 3045 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3046 3047 // Encrypt the counters 3048 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3049 3050 if (bulk_width == 8) { 3051 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3052 } 3053 3054 // XOR the encrypted counters with the inputs 3055 for (int i = 0; i < bulk_width; i++) { 3056 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3057 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3058 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3059 } 3060 3061 // Write the encrypted data 3062 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3063 if (bulk_width == 8) { 3064 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3065 } 3066 3067 __ subw(len, len, 16 * bulk_width); 3068 __ cbnzw(len, L_CTR_loop); 3069 } 3070 3071 // Save the counter back where it goes 3072 __ rev32(v16, __ T16B, v16); 3073 __ st1(v16, __ T16B, counter); 3074 3075 __ pop(saved_regs, sp); 3076 3077 __ 
ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3078 if (bulk_width == 8) { 3079 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3080 } 3081 3082 __ andr(rscratch1, len, -16 * bulk_width); 3083 __ sub(len, len, rscratch1); 3084 __ add(offset, offset, rscratch1); 3085 __ mov(used, 16); 3086 __ strw(used, Address(used_ptr)); 3087 __ b(large_block_return); 3088 3089 return start; 3090 } 3091 3092 // Vector AES Galois Counter Mode implementation. Parameters: 3093 // 3094 // in = c_rarg0 3095 // len = c_rarg1 3096 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3097 // out = c_rarg3 3098 // key = c_rarg4 3099 // state = c_rarg5 - GHASH.state 3100 // subkeyHtbl = c_rarg6 - powers of H 3101 // counter = c_rarg7 - 16 bytes of CTR 3102 // return - number of processed bytes 3103 address generate_galoisCounterMode_AESCrypt() { 3104 address ghash_polynomial = __ pc(); 3105 __ emit_int64(0x87); // The low-order bits of the field 3106 // polynomial (i.e. p = z^7+z^2+z+1) 3107 // repeated in the low and high parts of a 3108 // 128-bit vector 3109 __ emit_int64(0x87); 3110 3111 __ align(CodeEntryAlignment); 3112 StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt"); 3113 address start = __ pc(); 3114 __ enter(); 3115 3116 const Register in = c_rarg0; 3117 const Register len = c_rarg1; 3118 const Register ct = c_rarg2; 3119 const Register out = c_rarg3; 3120 // and updated with the incremented counter in the end 3121 3122 const Register key = c_rarg4; 3123 const Register state = c_rarg5; 3124 3125 const Register subkeyHtbl = c_rarg6; 3126 3127 const Register counter = c_rarg7; 3128 3129 const Register keylen = r10; 3130 // Save state before entering routine 3131 __ sub(sp, sp, 4 * 16); 3132 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3133 __ sub(sp, sp, 4 * 16); 3134 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3135 3136 // __ andr(len, len, -512); 3137 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3138 __ str(len, __ pre(sp, -2 * wordSize)); 3139 3140 Label DONE; 3141 __ cbz(len, DONE); 3142 3143 // Compute #rounds for AES based on the length of the key array 3144 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3145 3146 __ aesenc_loadkeys(key, keylen); 3147 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3148 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3149 3150 // AES/CTR loop 3151 { 3152 Label L_CTR_loop; 3153 __ BIND(L_CTR_loop); 3154 3155 // Setup the counters 3156 __ movi(v8, __ T4S, 0); 3157 __ movi(v9, __ T4S, 1); 3158 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3159 3160 assert(v0->encoding() < v8->encoding(), ""); 3161 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3162 FloatRegister f = as_FloatRegister(i); 3163 __ rev32(f, __ T16B, v16); 3164 __ addv(v16, __ T4S, v16, v8); 3165 } 3166 3167 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3168 3169 // Encrypt the counters 3170 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3171 3172 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3173 3174 // XOR the encrypted counters with the inputs 3175 for (int i = 0; i < 8; i++) { 3176 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3177 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3178 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3179 } 3180 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3181 __ st1(v4, v5, v6, v7, __ 
T16B, __ post(out, 4 * 16)); 3182 3183 __ subw(len, len, 16 * 8); 3184 __ cbnzw(len, L_CTR_loop); 3185 } 3186 3187 __ rev32(v16, __ T16B, v16); 3188 __ st1(v16, __ T16B, counter); 3189 3190 __ ldr(len, Address(sp)); 3191 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3192 3193 // GHASH/CTR loop 3194 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3195 len, /*unrolls*/4); 3196 3197 #ifdef ASSERT 3198 { Label L; 3199 __ cmp(len, (unsigned char)0); 3200 __ br(Assembler::EQ, L); 3201 __ stop("stubGenerator: abort"); 3202 __ bind(L); 3203 } 3204 #endif 3205 3206 __ bind(DONE); 3207 // Return the number of bytes processed 3208 __ ldr(r0, __ post(sp, 2 * wordSize)); 3209 3210 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3211 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3212 3213 __ leave(); // required for proper stackwalking of RuntimeStub frame 3214 __ ret(lr); 3215 return start; 3216 } 3217 3218 // Utility routines for md5. 3219 // Clobbers r10 and r11. 3220 void md5_FF(Register buf, Register r1, Register r2, Register r3, Register r4, 3221 int k, int s, int t) { 3222 Register rscratch3 = r10; 3223 Register rscratch4 = r11; 3224 3225 __ eorw(rscratch3, r3, r4); 3226 __ movw(rscratch2, t); 3227 __ andw(rscratch3, rscratch3, r2); 3228 __ addw(rscratch4, r1, rscratch2); 3229 __ ldrw(rscratch1, Address(buf, k*4)); 3230 __ eorw(rscratch3, rscratch3, r4); 3231 __ addw(rscratch4, rscratch4, rscratch1); 3232 __ addw(rscratch3, rscratch3, rscratch4); 3233 __ rorw(rscratch2, rscratch3, 32 - s); 3234 __ addw(r1, rscratch2, r2); 3235 } 3236 3237 void md5_GG(Register buf, Register r1, Register r2, Register r3, Register r4, 3238 int k, int s, int t) { 3239 Register rscratch3 = r10; 3240 Register rscratch4 = r11; 3241 3242 __ andw(rscratch3, r2, r4); 3243 __ bicw(rscratch4, r3, r4); 3244 __ ldrw(rscratch1, Address(buf, k*4)); 3245 __ movw(rscratch2, t); 3246 __ orrw(rscratch3, rscratch3, rscratch4); 3247 __ addw(rscratch4, r1, rscratch2); 3248 __ addw(rscratch4, rscratch4, rscratch1); 3249 __ addw(rscratch3, rscratch3, rscratch4); 3250 __ rorw(rscratch2, rscratch3, 32 - s); 3251 __ addw(r1, rscratch2, r2); 3252 } 3253 3254 void md5_HH(Register buf, Register r1, Register r2, Register r3, Register r4, 3255 int k, int s, int t) { 3256 Register rscratch3 = r10; 3257 Register rscratch4 = r11; 3258 3259 __ eorw(rscratch3, r3, r4); 3260 __ movw(rscratch2, t); 3261 __ addw(rscratch4, r1, rscratch2); 3262 __ ldrw(rscratch1, Address(buf, k*4)); 3263 __ eorw(rscratch3, rscratch3, r2); 3264 __ addw(rscratch4, rscratch4, rscratch1); 3265 __ addw(rscratch3, rscratch3, rscratch4); 3266 __ rorw(rscratch2, rscratch3, 32 - s); 3267 __ addw(r1, rscratch2, r2); 3268 } 3269 3270 void md5_II(Register buf, Register r1, Register r2, Register r3, Register r4, 3271 int k, int s, int t) { 3272 Register rscratch3 = r10; 3273 Register rscratch4 = r11; 3274 3275 __ movw(rscratch3, t); 3276 __ ornw(rscratch2, r2, r4); 3277 __ addw(rscratch4, r1, rscratch3); 3278 __ ldrw(rscratch1, Address(buf, k*4)); 3279 __ eorw(rscratch3, rscratch2, r3); 3280 __ addw(rscratch4, rscratch4, rscratch1); 3281 __ addw(rscratch3, rscratch3, rscratch4); 3282 __ rorw(rscratch2, rscratch3, 32 - s); 3283 __ addw(r1, rscratch2, r2); 3284 } 3285 3286 // Arguments: 3287 // 3288 // Inputs: 3289 // c_rarg0 - byte[] source+offset 3290 // c_rarg1 - int[] SHA.state 3291 // c_rarg2 - int offset 3292 // c_rarg3 - int limit 3293 // 3294 address generate_md5_implCompress(bool multi_block, const char *name) { 3295 
__ align(CodeEntryAlignment); 3296 StubCodeMark mark(this, "StubRoutines", name); 3297 address start = __ pc(); 3298 3299 Register buf = c_rarg0; 3300 Register state = c_rarg1; 3301 Register ofs = c_rarg2; 3302 Register limit = c_rarg3; 3303 Register a = r4; 3304 Register b = r5; 3305 Register c = r6; 3306 Register d = r7; 3307 Register rscratch3 = r10; 3308 Register rscratch4 = r11; 3309 3310 Label md5_loop; 3311 __ BIND(md5_loop); 3312 3313 // Save hash values for addition after rounds 3314 __ ldrw(a, Address(state, 0)); 3315 __ ldrw(b, Address(state, 4)); 3316 __ ldrw(c, Address(state, 8)); 3317 __ ldrw(d, Address(state, 12)); 3318 3319 // Round 1 3320 md5_FF(buf, a, b, c, d, 0, 7, 0xd76aa478); 3321 md5_FF(buf, d, a, b, c, 1, 12, 0xe8c7b756); 3322 md5_FF(buf, c, d, a, b, 2, 17, 0x242070db); 3323 md5_FF(buf, b, c, d, a, 3, 22, 0xc1bdceee); 3324 md5_FF(buf, a, b, c, d, 4, 7, 0xf57c0faf); 3325 md5_FF(buf, d, a, b, c, 5, 12, 0x4787c62a); 3326 md5_FF(buf, c, d, a, b, 6, 17, 0xa8304613); 3327 md5_FF(buf, b, c, d, a, 7, 22, 0xfd469501); 3328 md5_FF(buf, a, b, c, d, 8, 7, 0x698098d8); 3329 md5_FF(buf, d, a, b, c, 9, 12, 0x8b44f7af); 3330 md5_FF(buf, c, d, a, b, 10, 17, 0xffff5bb1); 3331 md5_FF(buf, b, c, d, a, 11, 22, 0x895cd7be); 3332 md5_FF(buf, a, b, c, d, 12, 7, 0x6b901122); 3333 md5_FF(buf, d, a, b, c, 13, 12, 0xfd987193); 3334 md5_FF(buf, c, d, a, b, 14, 17, 0xa679438e); 3335 md5_FF(buf, b, c, d, a, 15, 22, 0x49b40821); 3336 3337 // Round 2 3338 md5_GG(buf, a, b, c, d, 1, 5, 0xf61e2562); 3339 md5_GG(buf, d, a, b, c, 6, 9, 0xc040b340); 3340 md5_GG(buf, c, d, a, b, 11, 14, 0x265e5a51); 3341 md5_GG(buf, b, c, d, a, 0, 20, 0xe9b6c7aa); 3342 md5_GG(buf, a, b, c, d, 5, 5, 0xd62f105d); 3343 md5_GG(buf, d, a, b, c, 10, 9, 0x02441453); 3344 md5_GG(buf, c, d, a, b, 15, 14, 0xd8a1e681); 3345 md5_GG(buf, b, c, d, a, 4, 20, 0xe7d3fbc8); 3346 md5_GG(buf, a, b, c, d, 9, 5, 0x21e1cde6); 3347 md5_GG(buf, d, a, b, c, 14, 9, 0xc33707d6); 3348 md5_GG(buf, c, d, a, b, 3, 14, 0xf4d50d87); 3349 md5_GG(buf, b, c, d, a, 8, 20, 0x455a14ed); 3350 md5_GG(buf, a, b, c, d, 13, 5, 0xa9e3e905); 3351 md5_GG(buf, d, a, b, c, 2, 9, 0xfcefa3f8); 3352 md5_GG(buf, c, d, a, b, 7, 14, 0x676f02d9); 3353 md5_GG(buf, b, c, d, a, 12, 20, 0x8d2a4c8a); 3354 3355 // Round 3 3356 md5_HH(buf, a, b, c, d, 5, 4, 0xfffa3942); 3357 md5_HH(buf, d, a, b, c, 8, 11, 0x8771f681); 3358 md5_HH(buf, c, d, a, b, 11, 16, 0x6d9d6122); 3359 md5_HH(buf, b, c, d, a, 14, 23, 0xfde5380c); 3360 md5_HH(buf, a, b, c, d, 1, 4, 0xa4beea44); 3361 md5_HH(buf, d, a, b, c, 4, 11, 0x4bdecfa9); 3362 md5_HH(buf, c, d, a, b, 7, 16, 0xf6bb4b60); 3363 md5_HH(buf, b, c, d, a, 10, 23, 0xbebfbc70); 3364 md5_HH(buf, a, b, c, d, 13, 4, 0x289b7ec6); 3365 md5_HH(buf, d, a, b, c, 0, 11, 0xeaa127fa); 3366 md5_HH(buf, c, d, a, b, 3, 16, 0xd4ef3085); 3367 md5_HH(buf, b, c, d, a, 6, 23, 0x04881d05); 3368 md5_HH(buf, a, b, c, d, 9, 4, 0xd9d4d039); 3369 md5_HH(buf, d, a, b, c, 12, 11, 0xe6db99e5); 3370 md5_HH(buf, c, d, a, b, 15, 16, 0x1fa27cf8); 3371 md5_HH(buf, b, c, d, a, 2, 23, 0xc4ac5665); 3372 3373 // Round 4 3374 md5_II(buf, a, b, c, d, 0, 6, 0xf4292244); 3375 md5_II(buf, d, a, b, c, 7, 10, 0x432aff97); 3376 md5_II(buf, c, d, a, b, 14, 15, 0xab9423a7); 3377 md5_II(buf, b, c, d, a, 5, 21, 0xfc93a039); 3378 md5_II(buf, a, b, c, d, 12, 6, 0x655b59c3); 3379 md5_II(buf, d, a, b, c, 3, 10, 0x8f0ccc92); 3380 md5_II(buf, c, d, a, b, 10, 15, 0xffeff47d); 3381 md5_II(buf, b, c, d, a, 1, 21, 0x85845dd1); 3382 md5_II(buf, a, b, c, d, 8, 6, 0x6fa87e4f); 3383 md5_II(buf, d, a, b, c, 15, 10, 
0xfe2ce6e0); 3384 md5_II(buf, c, d, a, b, 6, 15, 0xa3014314); 3385 md5_II(buf, b, c, d, a, 13, 21, 0x4e0811a1); 3386 md5_II(buf, a, b, c, d, 4, 6, 0xf7537e82); 3387 md5_II(buf, d, a, b, c, 11, 10, 0xbd3af235); 3388 md5_II(buf, c, d, a, b, 2, 15, 0x2ad7d2bb); 3389 md5_II(buf, b, c, d, a, 9, 21, 0xeb86d391); 3390 3391 // write hash values back in the correct order 3392 __ ldrw(rscratch1, Address(state, 0)); 3393 __ addw(rscratch1, rscratch1, a); 3394 __ strw(rscratch1, Address(state, 0)); 3395 3396 __ ldrw(rscratch2, Address(state, 4)); 3397 __ addw(rscratch2, rscratch2, b); 3398 __ strw(rscratch2, Address(state, 4)); 3399 3400 __ ldrw(rscratch3, Address(state, 8)); 3401 __ addw(rscratch3, rscratch3, c); 3402 __ strw(rscratch3, Address(state, 8)); 3403 3404 __ ldrw(rscratch4, Address(state, 12)); 3405 __ addw(rscratch4, rscratch4, d); 3406 __ strw(rscratch4, Address(state, 12)); 3407 3408 if (multi_block) { 3409 __ add(buf, buf, 64); 3410 __ add(ofs, ofs, 64); 3411 __ cmp(ofs, limit); 3412 __ br(Assembler::LE, md5_loop); 3413 __ mov(c_rarg0, ofs); // return ofs 3414 } 3415 3416 __ ret(lr); 3417 3418 return start; 3419 } 3420 3421 // Arguments: 3422 // 3423 // Inputs: 3424 // c_rarg0 - byte[] source+offset 3425 // c_rarg1 - int[] SHA.state 3426 // c_rarg2 - int offset 3427 // c_rarg3 - int limit 3428 // 3429 address generate_sha1_implCompress(bool multi_block, const char *name) { 3430 __ align(CodeEntryAlignment); 3431 StubCodeMark mark(this, "StubRoutines", name); 3432 address start = __ pc(); 3433 3434 Register buf = c_rarg0; 3435 Register state = c_rarg1; 3436 Register ofs = c_rarg2; 3437 Register limit = c_rarg3; 3438 3439 Label keys; 3440 Label sha1_loop; 3441 3442 // load the keys into v0..v3 3443 __ adr(rscratch1, keys); 3444 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3445 // load 5 words state into v6, v7 3446 __ ldrq(v6, Address(state, 0)); 3447 __ ldrs(v7, Address(state, 16)); 3448 3449 3450 __ BIND(sha1_loop); 3451 // load 64 bytes of data into v16..v19 3452 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3453 __ rev32(v16, __ T16B, v16); 3454 __ rev32(v17, __ T16B, v17); 3455 __ rev32(v18, __ T16B, v18); 3456 __ rev32(v19, __ T16B, v19); 3457 3458 // do the sha1 3459 __ addv(v4, __ T4S, v16, v0); 3460 __ orr(v20, __ T16B, v6, v6); 3461 3462 FloatRegister d0 = v16; 3463 FloatRegister d1 = v17; 3464 FloatRegister d2 = v18; 3465 FloatRegister d3 = v19; 3466 3467 for (int round = 0; round < 20; round++) { 3468 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3469 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3470 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3471 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3472 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3473 3474 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3475 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3476 __ sha1h(tmp2, __ T4S, v20); 3477 if (round < 5) 3478 __ sha1c(v20, __ T4S, tmp3, tmp4); 3479 else if (round < 10 || round >= 15) 3480 __ sha1p(v20, __ T4S, tmp3, tmp4); 3481 else 3482 __ sha1m(v20, __ T4S, tmp3, tmp4); 3483 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3484 3485 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3486 } 3487 3488 __ addv(v7, __ T2S, v7, v21); 3489 __ addv(v6, __ T4S, v6, v20); 3490 3491 if (multi_block) { 3492 __ add(ofs, ofs, 64); 3493 __ cmp(ofs, limit); 3494 __ br(Assembler::LE, sha1_loop); 3495 __ mov(c_rarg0, ofs); // return ofs 3496 } 3497 3498 __ strq(v6, Address(state, 0)); 3499 __ strs(v7, Address(state, 16)); 3500 3501 __ ret(lr); 3502 3503 __ bind(keys); 3504 __ emit_int32(0x5a827999); 3505 __ emit_int32(0x6ed9eba1); 3506 __ emit_int32(0x8f1bbcdc); 3507 __ emit_int32(0xca62c1d6); 3508 3509 return start; 3510 } 3511 3512 3513 // Arguments: 3514 // 3515 // Inputs: 3516 // c_rarg0 - byte[] source+offset 3517 // c_rarg1 - int[] SHA.state 3518 // c_rarg2 - int offset 3519 // c_rarg3 - int limit 3520 // 3521 address generate_sha256_implCompress(bool multi_block, const char *name) { 3522 static const uint32_t round_consts[64] = { 3523 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3524 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3525 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3526 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3527 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3528 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3529 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3530 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3531 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3532 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3533 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3534 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3535 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3536 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3537 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3538 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3539 }; 3540 __ align(CodeEntryAlignment); 3541 StubCodeMark mark(this, "StubRoutines", name); 3542 address start = __ pc(); 3543 3544 Register buf = c_rarg0; 3545 Register state = c_rarg1; 3546 Register ofs = c_rarg2; 3547 Register limit = c_rarg3; 3548 3549 Label sha1_loop; 3550 3551 __ stpd(v8, v9, __ pre(sp, -32)); 3552 __ stpd(v10, v11, Address(sp, 16)); 3553 3554 // dga == v0 3555 // dgb == v1 3556 // dg0 == v2 3557 // dg1 == v3 3558 // dg2 == v4 3559 // t0 == v6 3560 // t1 == v7 3561 3562 // load 16 keys to v16..v31 3563 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3564 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3565 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3566 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3567 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3568 3569 // load 8 words (256 bits) state 3570 __ ldpq(v0, v1, state); 3571 3572 __ BIND(sha1_loop); 3573 // load 64 bytes of data into v8..v11 3574 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3575 __ rev32(v8, __ T16B, v8); 3576 __ rev32(v9, __ T16B, v9); 3577 __ rev32(v10, __ T16B, v10); 3578 __ rev32(v11, __ T16B, v11); 3579 3580 __ addv(v6, __ T4S, v8, v16); 3581 __ orr(v2, __ T16B, v0, v0); 3582 __ orr(v3, __ T16B, v1, v1); 3583 3584 FloatRegister d0 = v8; 3585 FloatRegister d1 = v9; 3586 FloatRegister d2 = v10; 3587 FloatRegister d3 = v11; 3588 3589 3590 for (int round = 0; round < 16; round++) { 3591 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3592 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3593 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3594 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3595 3596 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3597 __ orr(v4, __ T16B, v2, v2); 3598 if (round < 15) 3599 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3600 __ sha256h(v2, __ T4S, v3, tmp2); 3601 __ sha256h2(v3, __ T4S, v4, tmp2); 3602 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3603 3604 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3605 } 3606 3607 __ addv(v0, __ T4S, v0, v2); 3608 __ addv(v1, __ T4S, v1, v3); 3609 3610 if (multi_block) { 3611 __ add(ofs, ofs, 64); 3612 __ cmp(ofs, limit); 3613 __ br(Assembler::LE, sha1_loop); 3614 __ mov(c_rarg0, ofs); // return ofs 3615 } 3616 3617 __ ldpd(v10, v11, Address(sp, 16)); 3618 __ ldpd(v8, v9, __ post(sp, 32)); 3619 3620 __ stpq(v0, v1, state); 3621 3622 __ ret(lr); 3623 3624 return start; 3625 } 3626 3627 // Double rounds for sha512. 3628 void sha512_dround(int dr, 3629 FloatRegister vi0, FloatRegister vi1, 3630 FloatRegister vi2, FloatRegister vi3, 3631 FloatRegister vi4, FloatRegister vrc0, 3632 FloatRegister vrc1, FloatRegister vin0, 3633 FloatRegister vin1, FloatRegister vin2, 3634 FloatRegister vin3, FloatRegister vin4) { 3635 if (dr < 36) { 3636 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3637 } 3638 __ addv(v5, __ T2D, vrc0, vin0); 3639 __ ext(v6, __ T16B, vi2, vi3, 8); 3640 __ ext(v5, __ T16B, v5, v5, 8); 3641 __ ext(v7, __ T16B, vi1, vi2, 8); 3642 __ addv(vi3, __ T2D, vi3, v5); 3643 if (dr < 32) { 3644 __ ext(v5, __ T16B, vin3, vin4, 8); 3645 __ sha512su0(vin0, __ T2D, vin1); 3646 } 3647 __ sha512h(vi3, __ T2D, v6, v7); 3648 if (dr < 32) { 3649 __ sha512su1(vin0, __ T2D, vin2, v5); 3650 } 3651 __ addv(vi4, __ T2D, vi1, vi3); 3652 __ sha512h2(vi3, __ T2D, vi1, vi0); 3653 } 3654 3655 // Arguments: 3656 // 3657 // Inputs: 3658 // c_rarg0 - byte[] source+offset 3659 // c_rarg1 - int[] SHA.state 3660 // c_rarg2 - int offset 3661 // c_rarg3 - int limit 3662 // 3663 address generate_sha512_implCompress(bool multi_block, const char *name) { 3664 static const uint64_t round_consts[80] = { 3665 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3666 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3667 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3668 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3669 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3670 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3671 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3672 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3673 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3674 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3675 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3676 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3677 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3678 0x92722C851482353BL, 
0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3679 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3680 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3681 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3682 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3683 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3684 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3685 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3686 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3687 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3688 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3689 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3690 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3691 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3692 }; 3693 3694 __ align(CodeEntryAlignment); 3695 StubCodeMark mark(this, "StubRoutines", name); 3696 address start = __ pc(); 3697 3698 Register buf = c_rarg0; 3699 Register state = c_rarg1; 3700 Register ofs = c_rarg2; 3701 Register limit = c_rarg3; 3702 3703 __ stpd(v8, v9, __ pre(sp, -64)); 3704 __ stpd(v10, v11, Address(sp, 16)); 3705 __ stpd(v12, v13, Address(sp, 32)); 3706 __ stpd(v14, v15, Address(sp, 48)); 3707 3708 Label sha512_loop; 3709 3710 // load state 3711 __ ld1(v8, v9, v10, v11, __ T2D, state); 3712 3713 // load first 4 round constants 3714 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3715 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3716 3717 __ BIND(sha512_loop); 3718 // load 128B of data into v12..v19 3719 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3720 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3721 __ rev64(v12, __ T16B, v12); 3722 __ rev64(v13, __ T16B, v13); 3723 __ rev64(v14, __ T16B, v14); 3724 __ rev64(v15, __ T16B, v15); 3725 __ rev64(v16, __ T16B, v16); 3726 __ rev64(v17, __ T16B, v17); 3727 __ rev64(v18, __ T16B, v18); 3728 __ rev64(v19, __ T16B, v19); 3729 3730 __ mov(rscratch2, rscratch1); 3731 3732 __ mov(v0, __ T16B, v8); 3733 __ mov(v1, __ T16B, v9); 3734 __ mov(v2, __ T16B, v10); 3735 __ mov(v3, __ T16B, v11); 3736 3737 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 3738 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 3739 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 3740 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 3741 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 3742 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 3743 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 3744 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 3745 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 3746 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 3747 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 3748 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 3749 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 3750 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 3751 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 3752 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 3753 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, 
v13, v19, v16, v17); 3754 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 3755 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 3756 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 3757 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 3758 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 3759 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 3760 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 3761 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 3762 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 3763 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 3764 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 3765 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 3766 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 3767 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 3768 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 3769 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 3770 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 3771 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 3772 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 3773 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 3774 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 3775 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 3776 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 3777 3778 __ addv(v8, __ T2D, v8, v0); 3779 __ addv(v9, __ T2D, v9, v1); 3780 __ addv(v10, __ T2D, v10, v2); 3781 __ addv(v11, __ T2D, v11, v3); 3782 3783 if (multi_block) { 3784 __ add(ofs, ofs, 128); 3785 __ cmp(ofs, limit); 3786 __ br(Assembler::LE, sha512_loop); 3787 __ mov(c_rarg0, ofs); // return ofs 3788 } 3789 3790 __ st1(v8, v9, v10, v11, __ T2D, state); 3791 3792 __ ldpd(v14, v15, Address(sp, 48)); 3793 __ ldpd(v12, v13, Address(sp, 32)); 3794 __ ldpd(v10, v11, Address(sp, 16)); 3795 __ ldpd(v8, v9, __ post(sp, 64)); 3796 3797 __ ret(lr); 3798 3799 return start; 3800 } 3801 3802 // Arguments: 3803 // 3804 // Inputs: 3805 // c_rarg0 - byte[] source+offset 3806 // c_rarg1 - byte[] SHA.state 3807 // c_rarg2 - int block_size 3808 // c_rarg3 - int offset 3809 // c_rarg4 - int limit 3810 // 3811 address generate_sha3_implCompress(bool multi_block, const char *name) { 3812 static const uint64_t round_consts[24] = { 3813 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 3814 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 3815 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 3816 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 3817 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 3818 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 3819 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 3820 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 3821 }; 3822 3823 __ align(CodeEntryAlignment); 3824 StubCodeMark mark(this, "StubRoutines", name); 3825 address start = __ pc(); 3826 3827 Register buf = c_rarg0; 3828 Register state = c_rarg1; 3829 Register block_size = c_rarg2; 3830 Register ofs = c_rarg3; 3831 Register 
limit = c_rarg4; 3832 3833 Label sha3_loop, rounds24_loop; 3834 Label sha3_512_or_sha3_384, shake128; 3835 3836 __ stpd(v8, v9, __ pre(sp, -64)); 3837 __ stpd(v10, v11, Address(sp, 16)); 3838 __ stpd(v12, v13, Address(sp, 32)); 3839 __ stpd(v14, v15, Address(sp, 48)); 3840 3841 // load state 3842 __ add(rscratch1, state, 32); 3843 __ ld1(v0, v1, v2, v3, __ T1D, state); 3844 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 3845 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 3846 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 3847 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 3848 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 3849 __ ld1(v24, __ T1D, rscratch1); 3850 3851 __ BIND(sha3_loop); 3852 3853 // 24 keccak rounds 3854 __ movw(rscratch2, 24); 3855 3856 // load round_constants base 3857 __ lea(rscratch1, ExternalAddress((address) round_consts)); 3858 3859 // load input 3860 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 3861 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 3862 __ eor(v0, __ T8B, v0, v25); 3863 __ eor(v1, __ T8B, v1, v26); 3864 __ eor(v2, __ T8B, v2, v27); 3865 __ eor(v3, __ T8B, v3, v28); 3866 __ eor(v4, __ T8B, v4, v29); 3867 __ eor(v5, __ T8B, v5, v30); 3868 __ eor(v6, __ T8B, v6, v31); 3869 3870 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 3871 __ tbz(block_size, 7, sha3_512_or_sha3_384); 3872 3873 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 3874 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 3875 __ eor(v7, __ T8B, v7, v25); 3876 __ eor(v8, __ T8B, v8, v26); 3877 __ eor(v9, __ T8B, v9, v27); 3878 __ eor(v10, __ T8B, v10, v28); 3879 __ eor(v11, __ T8B, v11, v29); 3880 __ eor(v12, __ T8B, v12, v30); 3881 __ eor(v13, __ T8B, v13, v31); 3882 3883 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 3884 __ eor(v14, __ T8B, v14, v25); 3885 __ eor(v15, __ T8B, v15, v26); 3886 __ eor(v16, __ T8B, v16, v27); 3887 3888 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 3889 __ andw(c_rarg5, block_size, 48); 3890 __ cbzw(c_rarg5, rounds24_loop); 3891 3892 __ tbnz(block_size, 5, shake128); 3893 // block_size == 144, bit5 == 0, SHA3-244 3894 __ ldrd(v28, __ post(buf, 8)); 3895 __ eor(v17, __ T8B, v17, v28); 3896 __ b(rounds24_loop); 3897 3898 __ BIND(shake128); 3899 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 3900 __ eor(v17, __ T8B, v17, v28); 3901 __ eor(v18, __ T8B, v18, v29); 3902 __ eor(v19, __ T8B, v19, v30); 3903 __ eor(v20, __ T8B, v20, v31); 3904 __ b(rounds24_loop); // block_size == 168, SHAKE128 3905 3906 __ BIND(sha3_512_or_sha3_384); 3907 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 3908 __ eor(v7, __ T8B, v7, v25); 3909 __ eor(v8, __ T8B, v8, v26); 3910 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 3911 3912 // SHA3-384 3913 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 3914 __ eor(v9, __ T8B, v9, v27); 3915 __ eor(v10, __ T8B, v10, v28); 3916 __ eor(v11, __ T8B, v11, v29); 3917 __ eor(v12, __ T8B, v12, v30); 3918 3919 __ BIND(rounds24_loop); 3920 __ subw(rscratch2, rscratch2, 1); 3921 3922 __ eor3(v29, __ T16B, v4, v9, v14); 3923 __ eor3(v26, __ T16B, v1, v6, v11); 3924 __ eor3(v28, __ T16B, v3, v8, v13); 3925 __ eor3(v25, __ T16B, v0, v5, v10); 3926 __ eor3(v27, __ T16B, v2, v7, v12); 3927 __ eor3(v29, __ T16B, v29, v19, v24); 3928 __ eor3(v26, __ T16B, v26, v16, v21); 3929 __ eor3(v28, __ T16B, v28, v18, v23); 3930 __ eor3(v25, __ T16B, v25, v15, v20); 3931 __ eor3(v27, __ T16B, v27, v17, v22); 3932 3933 __ rax1(v30, __ T2D, v29, v26); 
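    // For reference (informal sketch, not generated code): the eor3/rax1
    // sequence around this point is the Keccak theta step -- eor3 forms the
    // five column parities and rax1 (x ^ rol64(y, 1)) forms the D values.
    // The xar block below applies theta plus the rho rotations and the pi
    // lane permutation (xar rotates right, so each immediate is 64 minus the
    // rho left-rotation amount), the bcax groups implement chi (bcax computes
    // x ^ (y & ~z)), and the ld1r plus the final eor on v0 apply iota.
    // Per lane, with indices taken mod 5:
    //   C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4]
    //   D[x] = C[x-1] ^ rol64(C[x+1], 1)
    //   B[y, 2x+3y] = rol64(A[x,y] ^ D[x], rho[x,y])   // theta + rho + pi
    //   A'[x,y]     = B[x,y] ^ (~B[x+1,y] & B[x+2,y])  // chi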
    __ rax1(v26, __ T2D, v26, v28);
    __ rax1(v28, __ T2D, v28, v25);
    __ rax1(v25, __ T2D, v25, v27);
    __ rax1(v27, __ T2D, v27, v29);

    __ eor(v0, __ T16B, v0, v30);
    __ xar(v29, __ T2D, v1, v25, (64 - 1));
    __ xar(v1, __ T2D, v6, v25, (64 - 44));
    __ xar(v6, __ T2D, v9, v28, (64 - 20));
    __ xar(v9, __ T2D, v22, v26, (64 - 61));
    __ xar(v22, __ T2D, v14, v28, (64 - 39));
    __ xar(v14, __ T2D, v20, v30, (64 - 18));
    __ xar(v31, __ T2D, v2, v26, (64 - 62));
    __ xar(v2, __ T2D, v12, v26, (64 - 43));
    __ xar(v12, __ T2D, v13, v27, (64 - 25));
    __ xar(v13, __ T2D, v19, v28, (64 - 8));
    __ xar(v19, __ T2D, v23, v27, (64 - 56));
    __ xar(v23, __ T2D, v15, v30, (64 - 41));
    __ xar(v15, __ T2D, v4, v28, (64 - 27));
    __ xar(v28, __ T2D, v24, v28, (64 - 14));
    __ xar(v24, __ T2D, v21, v25, (64 - 2));
    __ xar(v8, __ T2D, v8, v27, (64 - 55));
    __ xar(v4, __ T2D, v16, v25, (64 - 45));
    __ xar(v16, __ T2D, v5, v30, (64 - 36));
    __ xar(v5, __ T2D, v3, v27, (64 - 28));
    __ xar(v27, __ T2D, v18, v27, (64 - 21));
    __ xar(v3, __ T2D, v17, v26, (64 - 15));
    __ xar(v25, __ T2D, v11, v25, (64 - 10));
    __ xar(v26, __ T2D, v7, v26, (64 - 6));
    __ xar(v30, __ T2D, v10, v30, (64 - 3));

    __ bcax(v20, __ T16B, v31, v22, v8);
    __ bcax(v21, __ T16B, v8, v23, v22);
    __ bcax(v22, __ T16B, v22, v24, v23);
    __ bcax(v23, __ T16B, v23, v31, v24);
    __ bcax(v24, __ T16B, v24, v8, v31);

    __ ld1r(v31, __ T2D, __ post(rscratch1, 8));

    __ bcax(v17, __ T16B, v25, v19, v3);
    __ bcax(v18, __ T16B, v3, v15, v19);
    __ bcax(v19, __ T16B, v19, v16, v15);
    __ bcax(v15, __ T16B, v15, v25, v16);
    __ bcax(v16, __ T16B, v16, v3, v25);

    __ bcax(v10, __ T16B, v29, v12, v26);
    __ bcax(v11, __ T16B, v26, v13, v12);
    __ bcax(v12, __ T16B, v12, v14, v13);
    __ bcax(v13, __ T16B, v13, v29, v14);
    __ bcax(v14, __ T16B, v14, v26, v29);

    __ bcax(v7, __ T16B, v30, v9, v4);
    __ bcax(v8, __ T16B, v4, v5, v9);
    __ bcax(v9, __ T16B, v9, v6, v5);
    __ bcax(v5, __ T16B, v5, v30, v6);
    __ bcax(v6, __ T16B, v6, v4, v30);

    __ bcax(v3, __ T16B, v27, v0, v28);
    __ bcax(v4, __ T16B, v28, v1, v0);
    __ bcax(v0, __ T16B, v0, v2, v1);
    __ bcax(v1, __ T16B, v1, v27, v2);
    __ bcax(v2, __ T16B, v2, v28, v27);

    __ eor(v0, __ T16B, v0, v31);

    __ cbnzw(rscratch2, rounds24_loop);

    if (multi_block) {
      __ add(ofs, ofs, block_size);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha3_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
    __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
    __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
    __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
    __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
    __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
    __ st1(v24, __ T1D, state);

    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ ret(lr);

    return start;
  }

  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0 - int crc
   *   c_rarg1 - byte* buf
   *   c_rarg2 - int length
   *
   * Output:
   *   r0 - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics,
"what are we doing here?"); 4039 4040 __ align(CodeEntryAlignment); 4041 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4042 4043 address start = __ pc(); 4044 4045 const Register crc = c_rarg0; // crc 4046 const Register buf = c_rarg1; // source java byte array address 4047 const Register len = c_rarg2; // length 4048 const Register table0 = c_rarg3; // crc_table address 4049 const Register table1 = c_rarg4; 4050 const Register table2 = c_rarg5; 4051 const Register table3 = c_rarg6; 4052 const Register tmp3 = c_rarg7; 4053 4054 BLOCK_COMMENT("Entry:"); 4055 __ enter(); // required for proper stackwalking of RuntimeStub frame 4056 4057 __ kernel_crc32(crc, buf, len, 4058 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4059 4060 __ leave(); // required for proper stackwalking of RuntimeStub frame 4061 __ ret(lr); 4062 4063 return start; 4064 } 4065 4066 // ChaCha20 block function. This version parallelizes by loading 4067 // individual 32-bit state elements into vectors for four blocks 4068 // (e.g. all four blocks' worth of state[0] in one register, etc.) 4069 // 4070 // state (int[16]) = c_rarg0 4071 // keystream (byte[1024]) = c_rarg1 4072 // return - number of bytes of keystream (always 256) 4073 address generate_chacha20Block_blockpar() { 4074 Label L_twoRounds, L_cc20_const; 4075 // The constant data is broken into two 128-bit segments to be loaded 4076 // onto FloatRegisters. The first 128 bits are a counter add overlay 4077 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4078 // The second 128-bits is a table constant used for 8-bit left rotations. 4079 __ BIND(L_cc20_const); 4080 __ emit_int64(0x0000000100000000UL); 4081 __ emit_int64(0x0000000300000002UL); 4082 __ emit_int64(0x0605040702010003UL); 4083 __ emit_int64(0x0E0D0C0F0A09080BUL); 4084 4085 __ align(CodeEntryAlignment); 4086 StubCodeMark mark(this, "StubRoutines", "chacha20Block"); 4087 address start = __ pc(); 4088 __ enter(); 4089 4090 int i, j; 4091 const Register state = c_rarg0; 4092 const Register keystream = c_rarg1; 4093 const Register loopCtr = r10; 4094 const Register tmpAddr = r11; 4095 4096 const FloatRegister stateFirst = v0; 4097 const FloatRegister stateSecond = v1; 4098 const FloatRegister stateThird = v2; 4099 const FloatRegister stateFourth = v3; 4100 const FloatRegister origCtrState = v28; 4101 const FloatRegister scratch = v29; 4102 const FloatRegister lrot8Tbl = v30; 4103 4104 // Organize SIMD registers in an array that facilitates 4105 // putting repetitive opcodes into loop structures. It is 4106 // important that each grouping of 4 registers is monotonically 4107 // increasing to support the requirements of multi-register 4108 // instructions (e.g. ld4r, st4, etc.) 4109 const FloatRegister workSt[16] = { 4110 v4, v5, v6, v7, v16, v17, v18, v19, 4111 v20, v21, v22, v23, v24, v25, v26, v27 4112 }; 4113 4114 // Load from memory and interlace across 16 SIMD registers, 4115 // With each word from memory being broadcast to all lanes of 4116 // each successive SIMD register. 4117 // Addr(0) -> All lanes in workSt[i] 4118 // Addr(4) -> All lanes workSt[i + 1], etc. 4119 __ mov(tmpAddr, state); 4120 for (i = 0; i < 16; i += 4) { 4121 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4122 __ post(tmpAddr, 16)); 4123 } 4124 4125 // Pull in constant data. The first 16 bytes are the add overlay 4126 // which is applied to the vector holding the counter (state[12]). 
4127 // The second 16 bytes is the index register for the 8-bit left 4128 // rotation tbl instruction. 4129 __ adr(tmpAddr, L_cc20_const); 4130 __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr)); 4131 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); 4132 4133 // Set up the 10 iteration loop and perform all 8 quarter round ops 4134 __ mov(loopCtr, 10); 4135 __ BIND(L_twoRounds); 4136 4137 __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12], 4138 scratch, lrot8Tbl); 4139 __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13], 4140 scratch, lrot8Tbl); 4141 __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14], 4142 scratch, lrot8Tbl); 4143 __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15], 4144 scratch, lrot8Tbl); 4145 4146 __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15], 4147 scratch, lrot8Tbl); 4148 __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12], 4149 scratch, lrot8Tbl); 4150 __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13], 4151 scratch, lrot8Tbl); 4152 __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14], 4153 scratch, lrot8Tbl); 4154 4155 // Decrement and iterate 4156 __ sub(loopCtr, loopCtr, 1); 4157 __ cbnz(loopCtr, L_twoRounds); 4158 4159 __ mov(tmpAddr, state); 4160 4161 // Add the starting state back to the post-loop keystream 4162 // state. We read/interlace the state array from memory into 4163 // 4 registers similar to what we did in the beginning. Then 4164 // add the counter overlay onto workSt[12] at the end. 4165 for (i = 0; i < 16; i += 4) { 4166 __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S, 4167 __ post(tmpAddr, 16)); 4168 __ addv(workSt[i], __ T4S, workSt[i], stateFirst); 4169 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond); 4170 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird); 4171 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth); 4172 } 4173 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask 4174 4175 // Write to key stream, storing the same element out of workSt[0..15] 4176 // to consecutive 4-byte offsets in the key stream buffer, then repeating 4177 // for the next element position. 
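    // Informal sketch of the resulting layout (descriptive only): lane i of
    // workSt[0..15] holds block i, so after this double loop the 256-byte
    // buffer contains the four 64-byte keystream blocks back to back:
    //   keystream[  0.. 63] = { workSt[0][0], workSt[1][0], ..., workSt[15][0] }
    //   keystream[ 64..127] = { workSt[0][1], workSt[1][1], ..., workSt[15][1] }
    //   ... and likewise for lanes 2 and 3.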
4178 for (i = 0; i < 4; i++) { 4179 for (j = 0; j < 16; j += 4) { 4180 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4181 __ post(keystream, 16)); 4182 } 4183 } 4184 4185 __ mov(r0, 256); // Return length of output keystream 4186 __ leave(); 4187 __ ret(lr); 4188 4189 return start; 4190 } 4191 4192 /** 4193 * Arguments: 4194 * 4195 * Inputs: 4196 * c_rarg0 - int crc 4197 * c_rarg1 - byte* buf 4198 * c_rarg2 - int length 4199 * c_rarg3 - int* table 4200 * 4201 * Output: 4202 * r0 - int crc result 4203 */ 4204 address generate_updateBytesCRC32C() { 4205 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4206 4207 __ align(CodeEntryAlignment); 4208 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4209 4210 address start = __ pc(); 4211 4212 const Register crc = c_rarg0; // crc 4213 const Register buf = c_rarg1; // source java byte array address 4214 const Register len = c_rarg2; // length 4215 const Register table0 = c_rarg3; // crc_table address 4216 const Register table1 = c_rarg4; 4217 const Register table2 = c_rarg5; 4218 const Register table3 = c_rarg6; 4219 const Register tmp3 = c_rarg7; 4220 4221 BLOCK_COMMENT("Entry:"); 4222 __ enter(); // required for proper stackwalking of RuntimeStub frame 4223 4224 __ kernel_crc32c(crc, buf, len, 4225 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4226 4227 __ leave(); // required for proper stackwalking of RuntimeStub frame 4228 __ ret(lr); 4229 4230 return start; 4231 } 4232 4233 /*** 4234 * Arguments: 4235 * 4236 * Inputs: 4237 * c_rarg0 - int adler 4238 * c_rarg1 - byte* buff 4239 * c_rarg2 - int len 4240 * 4241 * Output: 4242 * c_rarg0 - int adler result 4243 */ 4244 address generate_updateBytesAdler32() { 4245 __ align(CodeEntryAlignment); 4246 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4247 address start = __ pc(); 4248 4249 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4250 4251 // Aliases 4252 Register adler = c_rarg0; 4253 Register s1 = c_rarg0; 4254 Register s2 = c_rarg3; 4255 Register buff = c_rarg1; 4256 Register len = c_rarg2; 4257 Register nmax = r4; 4258 Register base = r5; 4259 Register count = r6; 4260 Register temp0 = rscratch1; 4261 Register temp1 = rscratch2; 4262 FloatRegister vbytes = v0; 4263 FloatRegister vs1acc = v1; 4264 FloatRegister vs2acc = v2; 4265 FloatRegister vtable = v3; 4266 4267 // Max number of bytes we can process before having to take the mod 4268 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4269 uint64_t BASE = 0xfff1; 4270 uint64_t NMAX = 0x15B0; 4271 4272 __ mov(base, BASE); 4273 __ mov(nmax, NMAX); 4274 4275 // Load accumulation coefficients for the upper 16 bits 4276 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4277 __ ld1(vtable, __ T16B, Address(temp0)); 4278 4279 // s1 is initialized to the lower 16 bits of adler 4280 // s2 is initialized to the upper 16 bits of adler 4281 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4282 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4283 4284 // The pipelined loop needs at least 16 elements for 1 iteration 4285 // It does check this, but it is more effective to skip to the cleanup loop 4286 __ cmp(len, (u1)16); 4287 __ br(Assembler::HS, L_nmax); 4288 __ cbz(len, L_combine); 4289 4290 __ bind(L_simple_by1_loop); 4291 __ ldrb(temp0, Address(__ post(buff, 1))); 4292 __ add(s1, s1, temp0); 4293 __ add(s2, s2, s1); 4294 __ subs(len, len, 
1); 4295 __ br(Assembler::HI, L_simple_by1_loop); 4296 4297 // s1 = s1 % BASE 4298 __ subs(temp0, s1, base); 4299 __ csel(s1, temp0, s1, Assembler::HS); 4300 4301 // s2 = s2 % BASE 4302 __ lsr(temp0, s2, 16); 4303 __ lsl(temp1, temp0, 4); 4304 __ sub(temp1, temp1, temp0); 4305 __ add(s2, temp1, s2, ext::uxth); 4306 4307 __ subs(temp0, s2, base); 4308 __ csel(s2, temp0, s2, Assembler::HS); 4309 4310 __ b(L_combine); 4311 4312 __ bind(L_nmax); 4313 __ subs(len, len, nmax); 4314 __ sub(count, nmax, 16); 4315 __ br(Assembler::LO, L_by16); 4316 4317 __ bind(L_nmax_loop); 4318 4319 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4320 vbytes, vs1acc, vs2acc, vtable); 4321 4322 __ subs(count, count, 16); 4323 __ br(Assembler::HS, L_nmax_loop); 4324 4325 // s1 = s1 % BASE 4326 __ lsr(temp0, s1, 16); 4327 __ lsl(temp1, temp0, 4); 4328 __ sub(temp1, temp1, temp0); 4329 __ add(temp1, temp1, s1, ext::uxth); 4330 4331 __ lsr(temp0, temp1, 16); 4332 __ lsl(s1, temp0, 4); 4333 __ sub(s1, s1, temp0); 4334 __ add(s1, s1, temp1, ext:: uxth); 4335 4336 __ subs(temp0, s1, base); 4337 __ csel(s1, temp0, s1, Assembler::HS); 4338 4339 // s2 = s2 % BASE 4340 __ lsr(temp0, s2, 16); 4341 __ lsl(temp1, temp0, 4); 4342 __ sub(temp1, temp1, temp0); 4343 __ add(temp1, temp1, s2, ext::uxth); 4344 4345 __ lsr(temp0, temp1, 16); 4346 __ lsl(s2, temp0, 4); 4347 __ sub(s2, s2, temp0); 4348 __ add(s2, s2, temp1, ext:: uxth); 4349 4350 __ subs(temp0, s2, base); 4351 __ csel(s2, temp0, s2, Assembler::HS); 4352 4353 __ subs(len, len, nmax); 4354 __ sub(count, nmax, 16); 4355 __ br(Assembler::HS, L_nmax_loop); 4356 4357 __ bind(L_by16); 4358 __ adds(len, len, count); 4359 __ br(Assembler::LO, L_by1); 4360 4361 __ bind(L_by16_loop); 4362 4363 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4364 vbytes, vs1acc, vs2acc, vtable); 4365 4366 __ subs(len, len, 16); 4367 __ br(Assembler::HS, L_by16_loop); 4368 4369 __ bind(L_by1); 4370 __ adds(len, len, 15); 4371 __ br(Assembler::LO, L_do_mod); 4372 4373 __ bind(L_by1_loop); 4374 __ ldrb(temp0, Address(__ post(buff, 1))); 4375 __ add(s1, temp0, s1); 4376 __ add(s2, s2, s1); 4377 __ subs(len, len, 1); 4378 __ br(Assembler::HS, L_by1_loop); 4379 4380 __ bind(L_do_mod); 4381 // s1 = s1 % BASE 4382 __ lsr(temp0, s1, 16); 4383 __ lsl(temp1, temp0, 4); 4384 __ sub(temp1, temp1, temp0); 4385 __ add(temp1, temp1, s1, ext::uxth); 4386 4387 __ lsr(temp0, temp1, 16); 4388 __ lsl(s1, temp0, 4); 4389 __ sub(s1, s1, temp0); 4390 __ add(s1, s1, temp1, ext:: uxth); 4391 4392 __ subs(temp0, s1, base); 4393 __ csel(s1, temp0, s1, Assembler::HS); 4394 4395 // s2 = s2 % BASE 4396 __ lsr(temp0, s2, 16); 4397 __ lsl(temp1, temp0, 4); 4398 __ sub(temp1, temp1, temp0); 4399 __ add(temp1, temp1, s2, ext::uxth); 4400 4401 __ lsr(temp0, temp1, 16); 4402 __ lsl(s2, temp0, 4); 4403 __ sub(s2, s2, temp0); 4404 __ add(s2, s2, temp1, ext:: uxth); 4405 4406 __ subs(temp0, s2, base); 4407 __ csel(s2, temp0, s2, Assembler::HS); 4408 4409 // Combine lower bits and higher bits 4410 __ bind(L_combine); 4411 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 4412 4413 __ ret(lr); 4414 4415 return start; 4416 } 4417 4418 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 4419 Register temp0, Register temp1, FloatRegister vbytes, 4420 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 4421 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 
    // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
    // In non-vectorized code, we update s1 and s2 as:
    //   s1 <- s1 + b1
    //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + s1
    //   ...
    //   s1 <- s1 + b16
    //   s2 <- s2 + s1
    // Putting above assignments together, we have:
    //   s1_new = s1 + b1 + b2 + ... + b16
    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
    //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
    //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
    __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));

    // s2 = s2 + s1 * 16
    __ add(s2, s2, s1, Assembler::LSL, 4);

    // vs1acc = b1 + b2 + b3 + ... + b16
    // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
    __ umullv(vs2acc, __ T8B, vtable, vbytes);
    __ umlalv(vs2acc, __ T16B, vtable, vbytes);
    __ uaddlv(vs1acc, __ T16B, vbytes);
    __ uaddlv(vs2acc, __ T8H, vs2acc);

    // s1 = s1 + vs1acc, s2 = s2 + vs2acc
    __ fmovd(temp0, vs1acc);
    __ fmovd(temp1, vs2acc);
    __ add(s1, s1, temp0);
    __ add(s2, s2, temp1);
  }

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0 - x address
   *   c_rarg1 - x length
   *   c_rarg2 - y address
   *   c_rarg3 - y length
   *   c_rarg4 - z address
   *   c_rarg5 - z length
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x = r0;
    const Register xlen = r1;
    const Register y = r2;
    const Register ylen = r3;
    const Register z = r4;
    const Register zlen = r5;

    const Register tmp1 = r10;
    const Register tmp2 = r11;
    const Register tmp3 = r12;
    const Register tmp4 = r13;
    const Register tmp5 = r14;
    const Register tmp6 = r15;
    const Register tmp7 = r16;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  address generate_squareToLen() {
    // The squareToLen algorithm for sizes 1..127 described in the Java code is
    // faster than multiply_to_len on some CPUs and slower on others, but
    // multiply_to_len shows slightly better results overall.
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "squareToLen");
    address start = __ pc();

    const Register x = r0;
    const Register xlen = r1;
    const Register z = r2;
    const Register zlen = r3;
    const Register y = r4; // == x
    const Register ylen = r5; // == xlen

    const Register tmp1 = r10;
    const Register tmp2 = r11;
    const Register tmp3 = r12;
    const Register tmp4 = r13;
    const Register tmp5 = r14;
    const Register tmp6 = r15;
    const Register tmp7 = r16;

    RegSet spilled_regs = RegSet::of(y, ylen);
    BLOCK_COMMENT("Entry:");
    __ enter();
    __ push(spilled_regs, sp);
    __ mov(y, x);
    __ mov(ylen, xlen);
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);
    return start;
  }

  // Arguments:
  //
  // Input:
  //   c_rarg0 - out address
  //   c_rarg1 - in address
  //   c_rarg2 - offset
  //   c_rarg3 - len
  //   c_rarg4 - k
  //
  address generate_mulAdd() {
    __ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "mulAdd"); 4534 4535 address start = __ pc(); 4536 4537 const Register out = r0; 4538 const Register in = r1; 4539 const Register offset = r2; 4540 const Register len = r3; 4541 const Register k = r4; 4542 4543 BLOCK_COMMENT("Entry:"); 4544 __ enter(); 4545 __ mul_add(out, in, offset, len, k); 4546 __ leave(); 4547 __ ret(lr); 4548 4549 return start; 4550 } 4551 4552 // Arguments: 4553 // 4554 // Input: 4555 // c_rarg0 - newArr address 4556 // c_rarg1 - oldArr address 4557 // c_rarg2 - newIdx 4558 // c_rarg3 - shiftCount 4559 // c_rarg4 - numIter 4560 // 4561 address generate_bigIntegerRightShift() { 4562 __ align(CodeEntryAlignment); 4563 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 4564 address start = __ pc(); 4565 4566 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4567 4568 Register newArr = c_rarg0; 4569 Register oldArr = c_rarg1; 4570 Register newIdx = c_rarg2; 4571 Register shiftCount = c_rarg3; 4572 Register numIter = c_rarg4; 4573 Register idx = numIter; 4574 4575 Register newArrCur = rscratch1; 4576 Register shiftRevCount = rscratch2; 4577 Register oldArrCur = r13; 4578 Register oldArrNext = r14; 4579 4580 FloatRegister oldElem0 = v0; 4581 FloatRegister oldElem1 = v1; 4582 FloatRegister newElem = v2; 4583 FloatRegister shiftVCount = v3; 4584 FloatRegister shiftVRevCount = v4; 4585 4586 __ cbz(idx, Exit); 4587 4588 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4589 4590 // left shift count 4591 __ movw(shiftRevCount, 32); 4592 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4593 4594 // numIter too small to allow a 4-words SIMD loop, rolling back 4595 __ cmp(numIter, (u1)4); 4596 __ br(Assembler::LT, ShiftThree); 4597 4598 __ dup(shiftVCount, __ T4S, shiftCount); 4599 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4600 __ negr(shiftVCount, __ T4S, shiftVCount); 4601 4602 __ BIND(ShiftSIMDLoop); 4603 4604 // Calculate the load addresses 4605 __ sub(idx, idx, 4); 4606 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4607 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4608 __ add(oldArrCur, oldArrNext, 4); 4609 4610 // Load 4 words and process 4611 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4612 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4613 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4614 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4615 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4616 __ st1(newElem, __ T4S, Address(newArrCur)); 4617 4618 __ cmp(idx, (u1)4); 4619 __ br(Assembler::LT, ShiftTwoLoop); 4620 __ b(ShiftSIMDLoop); 4621 4622 __ BIND(ShiftTwoLoop); 4623 __ cbz(idx, Exit); 4624 __ cmp(idx, (u1)1); 4625 __ br(Assembler::EQ, ShiftOne); 4626 4627 // Calculate the load addresses 4628 __ sub(idx, idx, 2); 4629 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4630 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4631 __ add(oldArrCur, oldArrNext, 4); 4632 4633 // Load 2 words and process 4634 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 4635 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4636 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4637 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4638 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4639 __ st1(newElem, __ T2S, Address(newArrCur)); 4640 __ b(ShiftTwoLoop); 4641 4642 __ BIND(ShiftThree); 4643 __ tbz(idx, 1, ShiftOne); 4644 __ tbz(idx, 0, ShiftTwo); 4645 __ ldrw(r10, Address(oldArr, 12)); 4646 __ ldrw(r11, Address(oldArr, 8)); 4647 __ lsrvw(r10, r10, shiftCount); 4648 __ lslvw(r11, 
r11, shiftRevCount); 4649 __ orrw(r12, r10, r11); 4650 __ strw(r12, Address(newArr, 8)); 4651 4652 __ BIND(ShiftTwo); 4653 __ ldrw(r10, Address(oldArr, 8)); 4654 __ ldrw(r11, Address(oldArr, 4)); 4655 __ lsrvw(r10, r10, shiftCount); 4656 __ lslvw(r11, r11, shiftRevCount); 4657 __ orrw(r12, r10, r11); 4658 __ strw(r12, Address(newArr, 4)); 4659 4660 __ BIND(ShiftOne); 4661 __ ldrw(r10, Address(oldArr, 4)); 4662 __ ldrw(r11, Address(oldArr)); 4663 __ lsrvw(r10, r10, shiftCount); 4664 __ lslvw(r11, r11, shiftRevCount); 4665 __ orrw(r12, r10, r11); 4666 __ strw(r12, Address(newArr)); 4667 4668 __ BIND(Exit); 4669 __ ret(lr); 4670 4671 return start; 4672 } 4673 4674 // Arguments: 4675 // 4676 // Input: 4677 // c_rarg0 - newArr address 4678 // c_rarg1 - oldArr address 4679 // c_rarg2 - newIdx 4680 // c_rarg3 - shiftCount 4681 // c_rarg4 - numIter 4682 // 4683 address generate_bigIntegerLeftShift() { 4684 __ align(CodeEntryAlignment); 4685 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 4686 address start = __ pc(); 4687 4688 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4689 4690 Register newArr = c_rarg0; 4691 Register oldArr = c_rarg1; 4692 Register newIdx = c_rarg2; 4693 Register shiftCount = c_rarg3; 4694 Register numIter = c_rarg4; 4695 4696 Register shiftRevCount = rscratch1; 4697 Register oldArrNext = rscratch2; 4698 4699 FloatRegister oldElem0 = v0; 4700 FloatRegister oldElem1 = v1; 4701 FloatRegister newElem = v2; 4702 FloatRegister shiftVCount = v3; 4703 FloatRegister shiftVRevCount = v4; 4704 4705 __ cbz(numIter, Exit); 4706 4707 __ add(oldArrNext, oldArr, 4); 4708 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4709 4710 // right shift count 4711 __ movw(shiftRevCount, 32); 4712 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4713 4714 // numIter too small to allow a 4-words SIMD loop, rolling back 4715 __ cmp(numIter, (u1)4); 4716 __ br(Assembler::LT, ShiftThree); 4717 4718 __ dup(shiftVCount, __ T4S, shiftCount); 4719 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4720 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 4721 4722 __ BIND(ShiftSIMDLoop); 4723 4724 // load 4 words and process 4725 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 4726 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 4727 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4728 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4729 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4730 __ st1(newElem, __ T4S, __ post(newArr, 16)); 4731 __ sub(numIter, numIter, 4); 4732 4733 __ cmp(numIter, (u1)4); 4734 __ br(Assembler::LT, ShiftTwoLoop); 4735 __ b(ShiftSIMDLoop); 4736 4737 __ BIND(ShiftTwoLoop); 4738 __ cbz(numIter, Exit); 4739 __ cmp(numIter, (u1)1); 4740 __ br(Assembler::EQ, ShiftOne); 4741 4742 // load 2 words and process 4743 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 4744 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 4745 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4746 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4747 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4748 __ st1(newElem, __ T2S, __ post(newArr, 8)); 4749 __ sub(numIter, numIter, 2); 4750 __ b(ShiftTwoLoop); 4751 4752 __ BIND(ShiftThree); 4753 __ ldrw(r10, __ post(oldArr, 4)); 4754 __ ldrw(r11, __ post(oldArrNext, 4)); 4755 __ lslvw(r10, r10, shiftCount); 4756 __ lsrvw(r11, r11, shiftRevCount); 4757 __ orrw(r12, r10, r11); 4758 __ strw(r12, __ post(newArr, 4)); 4759 __ tbz(numIter, 1, Exit); 4760 __ tbz(numIter, 0, ShiftOne); 4761 4762 __ BIND(ShiftTwo); 4763 __ 
ldrw(r10, __ post(oldArr, 4)); 4764 __ ldrw(r11, __ post(oldArrNext, 4)); 4765 __ lslvw(r10, r10, shiftCount); 4766 __ lsrvw(r11, r11, shiftRevCount); 4767 __ orrw(r12, r10, r11); 4768 __ strw(r12, __ post(newArr, 4)); 4769 4770 __ BIND(ShiftOne); 4771 __ ldrw(r10, Address(oldArr)); 4772 __ ldrw(r11, Address(oldArrNext)); 4773 __ lslvw(r10, r10, shiftCount); 4774 __ lsrvw(r11, r11, shiftRevCount); 4775 __ orrw(r12, r10, r11); 4776 __ strw(r12, Address(newArr)); 4777 4778 __ BIND(Exit); 4779 __ ret(lr); 4780 4781 return start; 4782 } 4783 4784 address generate_count_positives(address &count_positives_long) { 4785 const u1 large_loop_size = 64; 4786 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 4787 int dcache_line = VM_Version::dcache_line_size(); 4788 4789 Register ary1 = r1, len = r2, result = r0; 4790 4791 __ align(CodeEntryAlignment); 4792 4793 StubCodeMark mark(this, "StubRoutines", "count_positives"); 4794 4795 address entry = __ pc(); 4796 4797 __ enter(); 4798 // precondition: a copy of len is already in result 4799 // __ mov(result, len); 4800 4801 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 4802 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 4803 4804 __ cmp(len, (u1)15); 4805 __ br(Assembler::GT, LEN_OVER_15); 4806 // The only case when execution falls into this code is when pointer is near 4807 // the end of memory page and we have to avoid reading next page 4808 __ add(ary1, ary1, len); 4809 __ subs(len, len, 8); 4810 __ br(Assembler::GT, LEN_OVER_8); 4811 __ ldr(rscratch2, Address(ary1, -8)); 4812 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 4813 __ lsrv(rscratch2, rscratch2, rscratch1); 4814 __ tst(rscratch2, UPPER_BIT_MASK); 4815 __ csel(result, zr, result, Assembler::NE); 4816 __ leave(); 4817 __ ret(lr); 4818 __ bind(LEN_OVER_8); 4819 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 4820 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 4821 __ tst(rscratch2, UPPER_BIT_MASK); 4822 __ br(Assembler::NE, RET_NO_POP); 4823 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 4824 __ lsrv(rscratch1, rscratch1, rscratch2); 4825 __ tst(rscratch1, UPPER_BIT_MASK); 4826 __ bind(RET_NO_POP); 4827 __ csel(result, zr, result, Assembler::NE); 4828 __ leave(); 4829 __ ret(lr); 4830 4831 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 4832 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 4833 4834 count_positives_long = __ pc(); // 2nd entry point 4835 4836 __ enter(); 4837 4838 __ bind(LEN_OVER_15); 4839 __ push(spilled_regs, sp); 4840 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 4841 __ cbz(rscratch2, ALIGNED); 4842 __ ldp(tmp6, tmp1, Address(ary1)); 4843 __ mov(tmp5, 16); 4844 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 4845 __ add(ary1, ary1, rscratch1); 4846 __ orr(tmp6, tmp6, tmp1); 4847 __ tst(tmp6, UPPER_BIT_MASK); 4848 __ br(Assembler::NE, RET_ADJUST); 4849 __ sub(len, len, rscratch1); 4850 4851 __ bind(ALIGNED); 4852 __ cmp(len, large_loop_size); 4853 __ br(Assembler::LT, CHECK_16); 4854 // Perform 16-byte load as early return in pre-loop to handle situation 4855 // when initially aligned large array has negative values at starting bytes, 4856 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 4857 // slower. Cases with negative bytes further ahead won't be affected that 4858 // much. 
In fact, it'll be faster due to early loads, less instructions and 4859 // less branches in LARGE_LOOP. 4860 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 4861 __ sub(len, len, 16); 4862 __ orr(tmp6, tmp6, tmp1); 4863 __ tst(tmp6, UPPER_BIT_MASK); 4864 __ br(Assembler::NE, RET_ADJUST_16); 4865 __ cmp(len, large_loop_size); 4866 __ br(Assembler::LT, CHECK_16); 4867 4868 if (SoftwarePrefetchHintDistance >= 0 4869 && SoftwarePrefetchHintDistance >= dcache_line) { 4870 // initial prefetch 4871 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 4872 } 4873 __ bind(LARGE_LOOP); 4874 if (SoftwarePrefetchHintDistance >= 0) { 4875 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 4876 } 4877 // Issue load instructions first, since it can save few CPU/MEM cycles, also 4878 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 4879 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 4880 // instructions per cycle and have less branches, but this approach disables 4881 // early return, thus, all 64 bytes are loaded and checked every time. 4882 __ ldp(tmp2, tmp3, Address(ary1)); 4883 __ ldp(tmp4, tmp5, Address(ary1, 16)); 4884 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 4885 __ ldp(tmp6, tmp1, Address(ary1, 48)); 4886 __ add(ary1, ary1, large_loop_size); 4887 __ sub(len, len, large_loop_size); 4888 __ orr(tmp2, tmp2, tmp3); 4889 __ orr(tmp4, tmp4, tmp5); 4890 __ orr(rscratch1, rscratch1, rscratch2); 4891 __ orr(tmp6, tmp6, tmp1); 4892 __ orr(tmp2, tmp2, tmp4); 4893 __ orr(rscratch1, rscratch1, tmp6); 4894 __ orr(tmp2, tmp2, rscratch1); 4895 __ tst(tmp2, UPPER_BIT_MASK); 4896 __ br(Assembler::NE, RET_ADJUST_LONG); 4897 __ cmp(len, large_loop_size); 4898 __ br(Assembler::GE, LARGE_LOOP); 4899 4900 __ bind(CHECK_16); // small 16-byte load pre-loop 4901 __ cmp(len, (u1)16); 4902 __ br(Assembler::LT, POST_LOOP16); 4903 4904 __ bind(LOOP16); // small 16-byte load loop 4905 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 4906 __ sub(len, len, 16); 4907 __ orr(tmp2, tmp2, tmp3); 4908 __ tst(tmp2, UPPER_BIT_MASK); 4909 __ br(Assembler::NE, RET_ADJUST_16); 4910 __ cmp(len, (u1)16); 4911 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 4912 4913 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 4914 __ cmp(len, (u1)8); 4915 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 4916 __ ldr(tmp3, Address(__ post(ary1, 8))); 4917 __ tst(tmp3, UPPER_BIT_MASK); 4918 __ br(Assembler::NE, RET_ADJUST); 4919 __ sub(len, len, 8); 4920 4921 __ bind(POST_LOOP16_LOAD_TAIL); 4922 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 4923 __ ldr(tmp1, Address(ary1)); 4924 __ mov(tmp2, 64); 4925 __ sub(tmp4, tmp2, len, __ LSL, 3); 4926 __ lslv(tmp1, tmp1, tmp4); 4927 __ tst(tmp1, UPPER_BIT_MASK); 4928 __ br(Assembler::NE, RET_ADJUST); 4929 // Fallthrough 4930 4931 __ bind(RET_LEN); 4932 __ pop(spilled_regs, sp); 4933 __ leave(); 4934 __ ret(lr); 4935 4936 // difference result - len is the count of guaranteed to be 4937 // positive bytes 4938 4939 __ bind(RET_ADJUST_LONG); 4940 __ add(len, len, (u1)(large_loop_size - 16)); 4941 __ bind(RET_ADJUST_16); 4942 __ add(len, len, 16); 4943 __ bind(RET_ADJUST); 4944 __ pop(spilled_regs, sp); 4945 __ leave(); 4946 __ sub(result, result, len); 4947 __ ret(lr); 4948 4949 return entry; 4950 } 4951 4952 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 4953 bool usePrefetch, Label &NOT_EQUAL) { 4954 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 4955 tmp2 = 
rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 4956 tmp7 = r12, tmp8 = r13; 4957 Label LOOP; 4958 4959 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 4960 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 4961 __ bind(LOOP); 4962 if (usePrefetch) { 4963 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 4964 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 4965 } 4966 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 4967 __ eor(tmp1, tmp1, tmp2); 4968 __ eor(tmp3, tmp3, tmp4); 4969 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 4970 __ orr(tmp1, tmp1, tmp3); 4971 __ cbnz(tmp1, NOT_EQUAL); 4972 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 4973 __ eor(tmp5, tmp5, tmp6); 4974 __ eor(tmp7, tmp7, tmp8); 4975 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 4976 __ orr(tmp5, tmp5, tmp7); 4977 __ cbnz(tmp5, NOT_EQUAL); 4978 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 4979 __ eor(tmp1, tmp1, tmp2); 4980 __ eor(tmp3, tmp3, tmp4); 4981 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 4982 __ orr(tmp1, tmp1, tmp3); 4983 __ cbnz(tmp1, NOT_EQUAL); 4984 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 4985 __ eor(tmp5, tmp5, tmp6); 4986 __ sub(cnt1, cnt1, 8 * wordSize); 4987 __ eor(tmp7, tmp7, tmp8); 4988 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 4989 // tmp6 is not used. MacroAssembler::subs is used here (rather than 4990 // cmp) because subs allows an unlimited range of immediate operand. 4991 __ subs(tmp6, cnt1, loopThreshold); 4992 __ orr(tmp5, tmp5, tmp7); 4993 __ cbnz(tmp5, NOT_EQUAL); 4994 __ br(__ GE, LOOP); 4995 // post-loop 4996 __ eor(tmp1, tmp1, tmp2); 4997 __ eor(tmp3, tmp3, tmp4); 4998 __ orr(tmp1, tmp1, tmp3); 4999 __ sub(cnt1, cnt1, 2 * wordSize); 5000 __ cbnz(tmp1, NOT_EQUAL); 5001 } 5002 5003 void generate_large_array_equals_loop_simd(int loopThreshold, 5004 bool usePrefetch, Label &NOT_EQUAL) { 5005 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5006 tmp2 = rscratch2; 5007 Label LOOP; 5008 5009 __ bind(LOOP); 5010 if (usePrefetch) { 5011 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5012 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5013 } 5014 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 5015 __ sub(cnt1, cnt1, 8 * wordSize); 5016 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 5017 __ subs(tmp1, cnt1, loopThreshold); 5018 __ eor(v0, __ T16B, v0, v4); 5019 __ eor(v1, __ T16B, v1, v5); 5020 __ eor(v2, __ T16B, v2, v6); 5021 __ eor(v3, __ T16B, v3, v7); 5022 __ orr(v0, __ T16B, v0, v1); 5023 __ orr(v1, __ T16B, v2, v3); 5024 __ orr(v0, __ T16B, v0, v1); 5025 __ umov(tmp1, v0, __ D, 0); 5026 __ umov(tmp2, v0, __ D, 1); 5027 __ orr(tmp1, tmp1, tmp2); 5028 __ cbnz(tmp1, NOT_EQUAL); 5029 __ br(__ GE, LOOP); 5030 } 5031 5032 // a1 = r1 - array1 address 5033 // a2 = r2 - array2 address 5034 // result = r0 - return value. Already contains "false" 5035 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 5036 // r3-r5 are reserved temporary registers 5037 address generate_large_array_equals() { 5038 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5039 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5040 tmp7 = r12, tmp8 = r13; 5041 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 5042 SMALL_LOOP, POST_LOOP; 5043 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 5044 // calculate if at least 32 prefetched bytes are used 5045 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 5046 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 5047 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 5048 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 5049 tmp5, tmp6, tmp7, tmp8); 5050 5051 __ align(CodeEntryAlignment); 5052 5053 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 5054 5055 address entry = __ pc(); 5056 __ enter(); 5057 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 5058 // also advance pointers to use post-increment instead of pre-increment 5059 __ add(a1, a1, wordSize); 5060 __ add(a2, a2, wordSize); 5061 if (AvoidUnalignedAccesses) { 5062 // both implementations (SIMD/nonSIMD) are using relatively large load 5063 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 5064 // on some CPUs in case of address is not at least 16-byte aligned. 5065 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 5066 // load if needed at least for 1st address and make if 16-byte aligned. 5067 Label ALIGNED16; 5068 __ tbz(a1, 3, ALIGNED16); 5069 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5070 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5071 __ sub(cnt1, cnt1, wordSize); 5072 __ eor(tmp1, tmp1, tmp2); 5073 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 5074 __ bind(ALIGNED16); 5075 } 5076 if (UseSIMDForArrayEquals) { 5077 if (SoftwarePrefetchHintDistance >= 0) { 5078 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5079 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5080 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 5081 /* prfm = */ true, NOT_EQUAL); 5082 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5083 __ br(__ LT, TAIL); 5084 } 5085 __ bind(NO_PREFETCH_LARGE_LOOP); 5086 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 5087 /* prfm = */ false, NOT_EQUAL); 5088 } else { 5089 __ push(spilled_regs, sp); 5090 if (SoftwarePrefetchHintDistance >= 0) { 5091 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5092 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5093 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 5094 /* prfm = */ true, NOT_EQUAL); 5095 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5096 __ br(__ LT, TAIL); 5097 } 5098 __ bind(NO_PREFETCH_LARGE_LOOP); 5099 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 5100 /* prfm = */ false, NOT_EQUAL); 5101 } 5102 __ bind(TAIL); 5103 __ cbz(cnt1, EQUAL); 5104 __ subs(cnt1, cnt1, wordSize); 5105 __ br(__ LE, POST_LOOP); 5106 __ bind(SMALL_LOOP); 5107 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5108 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5109 __ subs(cnt1, cnt1, wordSize); 5110 __ eor(tmp1, tmp1, tmp2); 5111 __ cbnz(tmp1, NOT_EQUAL); 5112 __ br(__ GT, SMALL_LOOP); 5113 __ bind(POST_LOOP); 5114 __ ldr(tmp1, Address(a1, cnt1)); 5115 __ ldr(tmp2, Address(a2, cnt1)); 5116 __ eor(tmp1, tmp1, tmp2); 5117 __ cbnz(tmp1, NOT_EQUAL); 5118 __ bind(EQUAL); 5119 __ mov(result, true); 5120 __ bind(NOT_EQUAL); 5121 if (!UseSIMDForArrayEquals) { 5122 __ pop(spilled_regs, sp); 5123 } 5124 __ bind(NOT_EQUAL_NO_POP); 5125 __ leave(); 5126 __ ret(lr); 5127 return entry; 5128 } 5129 5130 address generate_dsin_dcos(bool isCos) { 5131 __ align(CodeEntryAlignment); 5132 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 5133 address start = __ pc(); 5134 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5135 (address)StubRoutines::aarch64::_two_over_pi, 5136 (address)StubRoutines::aarch64::_pio2, 5137 (address)StubRoutines::aarch64::_dsin_coef, 5138 (address)StubRoutines::aarch64::_dcos_coef); 5139 return start; 5140 } 5141 5142 address generate_dlog() { 5143 __ align(CodeEntryAlignment); 5144 StubCodeMark mark(this, "StubRoutines", "dlog"); 5145 address entry = __ pc(); 5146 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 5147 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 5148 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 5149 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 5150 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 5151 return entry; 5152 } 5153 5154 5155 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5156 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5157 Label &DIFF2) { 5158 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5159 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5160 5161 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5162 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5163 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5164 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5165 5166 __ fmovd(tmpL, vtmp3); 5167 __ eor(rscratch2, tmp3, tmpL); 5168 __ cbnz(rscratch2, DIFF2); 5169 5170 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5171 __ umov(tmpL, vtmp3, __ D, 1); 5172 __ eor(rscratch2, tmpU, tmpL); 5173 __ cbnz(rscratch2, DIFF1); 5174 5175 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5176 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5177 __ fmovd(tmpL, vtmp); 5178 __ eor(rscratch2, tmp3, tmpL); 5179 __ cbnz(rscratch2, DIFF2); 5180 5181 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5182 __ umov(tmpL, vtmp, __ D, 1); 5183 __ eor(rscratch2, tmpU, tmpL); 5184 __ cbnz(rscratch2, DIFF1); 5185 } 5186 5187 // r0 = result 5188 // r1 = str1 5189 // r2 = cnt1 5190 // r3 = str2 5191 // r4 = cnt2 5192 // r10 = tmp1 5193 // r11 = tmp2 5194 address generate_compare_long_string_different_encoding(bool isLU) { 5195 __ align(CodeEntryAlignment); 5196 StubCodeMark mark(this, "StubRoutines", isLU 5197 ? "compare_long_string_different_encoding LU" 5198 : "compare_long_string_different_encoding UL"); 5199 address entry = __ pc(); 5200 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5201 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5202 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5203 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5204 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5205 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5206 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5207 5208 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5209 5210 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5211 // cnt2 == amount of characters left to compare 5212 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5213 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5214 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5215 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5216 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5217 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5218 __ eor(rscratch2, tmp1, tmp2); 5219 __ mov(rscratch1, tmp2); 5220 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5221 Register tmpU = isLU ? 
rscratch1 : tmp1, // where to keep U for comparison 5222 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5223 __ push(spilled_regs, sp); 5224 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5225 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 5226 5227 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5228 5229 if (SoftwarePrefetchHintDistance >= 0) { 5230 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5231 __ br(__ LT, NO_PREFETCH); 5232 __ bind(LARGE_LOOP_PREFETCH); 5233 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 5234 __ mov(tmp4, 2); 5235 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5236 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 5237 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5238 __ subs(tmp4, tmp4, 1); 5239 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 5240 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5241 __ mov(tmp4, 2); 5242 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 5243 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5244 __ subs(tmp4, tmp4, 1); 5245 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 5246 __ sub(cnt2, cnt2, 64); 5247 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5248 __ br(__ GE, LARGE_LOOP_PREFETCH); 5249 } 5250 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 5251 __ bind(NO_PREFETCH); 5252 __ subs(cnt2, cnt2, 16); 5253 __ br(__ LT, TAIL); 5254 __ align(OptoLoopAlignment); 5255 __ bind(SMALL_LOOP); // smaller loop 5256 __ subs(cnt2, cnt2, 16); 5257 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5258 __ br(__ GE, SMALL_LOOP); 5259 __ cmn(cnt2, (u1)16); 5260 __ br(__ EQ, LOAD_LAST); 5261 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 5262 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 5263 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 5264 __ ldr(tmp3, Address(cnt1, -8)); 5265 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 5266 __ b(LOAD_LAST); 5267 __ bind(DIFF2); 5268 __ mov(tmpU, tmp3); 5269 __ bind(DIFF1); 5270 __ pop(spilled_regs, sp); 5271 __ b(CALCULATE_DIFFERENCE); 5272 __ bind(LOAD_LAST); 5273 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 5274 // No need to load it again 5275 __ mov(tmpU, tmp3); 5276 __ pop(spilled_regs, sp); 5277 5278 // tmp2 points to the address of the last 4 Latin1 characters right now 5279 __ ldrs(vtmp, Address(tmp2)); 5280 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5281 __ fmovd(tmpL, vtmp); 5282 5283 __ eor(rscratch2, tmpU, tmpL); 5284 __ cbz(rscratch2, DONE); 5285 5286 // Find the first different characters in the longwords and 5287 // compute their difference. 
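// For reference, the rev/clz/shift sequence at CALCULATE_DIFFERENCE below is
// equivalent to the following scalar sketch (an illustration only, assuming
// little-endian 8-byte loads and 16-bit characters; 'diff' is the non-zero XOR
// of the two mismatching chunks, i.e. what rscratch2 holds at this point):
//
//   #include <cstdint>
//   static int first_char_difference(uint64_t a, uint64_t b, uint64_t diff) {
//     uint64_t r = __builtin_bswap64(diff);    // rev: first difference moves to the top
//     int shift  = __builtin_clzll(r) & ~15;   // clz + andr(.., -16): char-aligned bit index
//     uint16_t ca = (uint16_t)(a >> shift);    // lsrv + uxthw
//     uint16_t cb = (uint16_t)(b >> shift);
//     return (int)ca - (int)cb;                // subw(result, ...)
//   }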
5288 __ bind(CALCULATE_DIFFERENCE); 5289 __ rev(rscratch2, rscratch2); 5290 __ clz(rscratch2, rscratch2); 5291 __ andr(rscratch2, rscratch2, -16); 5292 __ lsrv(tmp1, tmp1, rscratch2); 5293 __ uxthw(tmp1, tmp1); 5294 __ lsrv(rscratch1, rscratch1, rscratch2); 5295 __ uxthw(rscratch1, rscratch1); 5296 __ subw(result, tmp1, rscratch1); 5297 __ bind(DONE); 5298 __ ret(lr); 5299 return entry; 5300 } 5301 5302 address generate_method_entry_barrier() { 5303 __ align(CodeEntryAlignment); 5304 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 5305 5306 Label deoptimize_label; 5307 5308 address start = __ pc(); 5309 5310 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 5311 5312 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 5313 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 5314 // We can get here despite the nmethod being good, if we have not 5315 // yet applied our cross modification fence (or data fence). 5316 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 5317 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 5318 __ ldrw(rscratch2, rscratch2); 5319 __ strw(rscratch2, thread_epoch_addr); 5320 __ isb(); 5321 __ membar(__ LoadLoad); 5322 } 5323 5324 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 5325 5326 __ enter(); 5327 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 5328 5329 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 5330 5331 __ push_call_clobbered_registers(); 5332 5333 __ mov(c_rarg0, rscratch2); 5334 __ call_VM_leaf 5335 (CAST_FROM_FN_PTR 5336 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 5337 5338 __ reset_last_Java_frame(true); 5339 5340 __ mov(rscratch1, r0); 5341 5342 __ pop_call_clobbered_registers(); 5343 5344 __ cbnz(rscratch1, deoptimize_label); 5345 5346 __ leave(); 5347 __ ret(lr); 5348 5349 __ BIND(deoptimize_label); 5350 5351 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 5352 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 5353 5354 __ mov(sp, rscratch1); 5355 __ br(rscratch2); 5356 5357 return start; 5358 } 5359 5360 address generate_check_lock_stack() { 5361 __ align(CodeEntryAlignment); 5362 StubCodeMark mark(this, "StubRoutines", "check_lock_stack"); 5363 5364 address start = __ pc(); 5365 5366 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 5367 __ enter(); 5368 __ push_call_clobbered_registers(); 5369 5370 __ mov(c_rarg0, r9); 5371 __ call_VM_leaf(CAST_FROM_FN_PTR(address, LockStack::ensure_lock_stack_size), 1); 5372 5373 5374 __ pop_call_clobbered_registers(); 5375 __ leave(); 5376 __ reset_last_Java_frame(true); 5377 5378 __ ret(lr); 5379 5380 return start; 5381 } 5382 5383 // r0 = result 5384 // r1 = str1 5385 // r2 = cnt1 5386 // r3 = str2 5387 // r4 = cnt2 5388 // r10 = tmp1 5389 // r11 = tmp2 5390 address generate_compare_long_string_same_encoding(bool isLL) { 5391 __ align(CodeEntryAlignment); 5392 StubCodeMark mark(this, "StubRoutines", isLL 5393 ? 
"compare_long_string_same_encoding LL" 5394 : "compare_long_string_same_encoding UU"); 5395 address entry = __ pc(); 5396 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5397 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 5398 5399 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 5400 5401 // exit from large loop when less than 64 bytes left to read or we're about 5402 // to prefetch memory behind array border 5403 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 5404 5405 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 5406 __ eor(rscratch2, tmp1, tmp2); 5407 __ cbnz(rscratch2, CAL_DIFFERENCE); 5408 5409 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 5410 // update pointers, because of previous read 5411 __ add(str1, str1, wordSize); 5412 __ add(str2, str2, wordSize); 5413 if (SoftwarePrefetchHintDistance >= 0) { 5414 __ align(OptoLoopAlignment); 5415 __ bind(LARGE_LOOP_PREFETCH); 5416 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 5417 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 5418 5419 for (int i = 0; i < 4; i++) { 5420 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 5421 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 5422 __ cmp(tmp1, tmp2); 5423 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5424 __ br(Assembler::NE, DIFF); 5425 } 5426 __ sub(cnt2, cnt2, isLL ? 64 : 32); 5427 __ add(str1, str1, 64); 5428 __ add(str2, str2, 64); 5429 __ subs(rscratch2, cnt2, largeLoopExitCondition); 5430 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 5431 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 5432 } 5433 5434 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 5435 __ br(Assembler::LE, LESS16); 5436 __ align(OptoLoopAlignment); 5437 __ bind(LOOP_COMPARE16); 5438 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5439 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5440 __ cmp(tmp1, tmp2); 5441 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5442 __ br(Assembler::NE, DIFF); 5443 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5444 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5445 __ br(Assembler::LT, LESS16); 5446 5447 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5448 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5449 __ cmp(tmp1, tmp2); 5450 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5451 __ br(Assembler::NE, DIFF); 5452 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5453 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5454 __ br(Assembler::GE, LOOP_COMPARE16); 5455 __ cbz(cnt2, LENGTH_DIFF); 5456 5457 __ bind(LESS16); 5458 // each 8 compare 5459 __ subs(cnt2, cnt2, isLL ? 8 : 4); 5460 __ br(Assembler::LE, LESS8); 5461 __ ldr(tmp1, Address(__ post(str1, 8))); 5462 __ ldr(tmp2, Address(__ post(str2, 8))); 5463 __ eor(rscratch2, tmp1, tmp2); 5464 __ cbnz(rscratch2, CAL_DIFFERENCE); 5465 __ sub(cnt2, cnt2, isLL ? 8 : 4); 5466 5467 __ bind(LESS8); // directly load last 8 bytes 5468 if (!isLL) { 5469 __ add(cnt2, cnt2, cnt2); 5470 } 5471 __ ldr(tmp1, Address(str1, cnt2)); 5472 __ ldr(tmp2, Address(str2, cnt2)); 5473 __ eor(rscratch2, tmp1, tmp2); 5474 __ cbz(rscratch2, LENGTH_DIFF); 5475 __ b(CAL_DIFFERENCE); 5476 5477 __ bind(DIFF); 5478 __ cmp(tmp1, tmp2); 5479 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 5480 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 5481 // reuse rscratch2 register for the result of eor instruction 5482 __ eor(rscratch2, tmp1, tmp2); 5483 5484 __ bind(CAL_DIFFERENCE); 5485 __ rev(rscratch2, rscratch2); 5486 __ clz(rscratch2, rscratch2); 5487 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 5488 __ lsrv(tmp1, tmp1, rscratch2); 5489 __ lsrv(tmp2, tmp2, rscratch2); 5490 if (isLL) { 5491 __ uxtbw(tmp1, tmp1); 5492 __ uxtbw(tmp2, tmp2); 5493 } else { 5494 __ uxthw(tmp1, tmp1); 5495 __ uxthw(tmp2, tmp2); 5496 } 5497 __ subw(result, tmp1, tmp2); 5498 5499 __ bind(LENGTH_DIFF); 5500 __ ret(lr); 5501 return entry; 5502 } 5503 5504 enum string_compare_mode { 5505 LL, 5506 LU, 5507 UL, 5508 UU, 5509 }; 5510 5511 // The following registers are declared in aarch64.ad 5512 // r0 = result 5513 // r1 = str1 5514 // r2 = cnt1 5515 // r3 = str2 5516 // r4 = cnt2 5517 // r10 = tmp1 5518 // r11 = tmp2 5519 // z0 = ztmp1 5520 // z1 = ztmp2 5521 // p0 = pgtmp1 5522 // p1 = pgtmp2 5523 address generate_compare_long_string_sve(string_compare_mode mode) { 5524 __ align(CodeEntryAlignment); 5525 address entry = __ pc(); 5526 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5527 tmp1 = r10, tmp2 = r11; 5528 5529 Label LOOP, DONE, MISMATCH; 5530 Register vec_len = tmp1; 5531 Register idx = tmp2; 5532 // The minimum of the string lengths has been stored in cnt2. 5533 Register cnt = cnt2; 5534 FloatRegister ztmp1 = z0, ztmp2 = z1; 5535 PRegister pgtmp1 = p0, pgtmp2 = p1; 5536 5537 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 5538 switch (mode) { \ 5539 case LL: \ 5540 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 5541 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 5542 break; \ 5543 case LU: \ 5544 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 5545 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5546 break; \ 5547 case UL: \ 5548 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5549 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 5550 break; \ 5551 case UU: \ 5552 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5553 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5554 break; \ 5555 default: \ 5556 ShouldNotReachHere(); \ 5557 } 5558 5559 const char* stubname; 5560 switch (mode) { 5561 case LL: stubname = "compare_long_string_same_encoding LL"; break; 5562 case LU: stubname = "compare_long_string_different_encoding LU"; break; 5563 case UL: stubname = "compare_long_string_different_encoding UL"; break; 5564 case UU: stubname = "compare_long_string_same_encoding UU"; break; 5565 default: ShouldNotReachHere(); 5566 } 5567 5568 StubCodeMark mark(this, "StubRoutines", stubname); 5569 5570 __ mov(idx, 0); 5571 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5572 5573 if (mode == LL) { 5574 __ sve_cntb(vec_len); 5575 } else { 5576 __ sve_cnth(vec_len); 5577 } 5578 5579 __ sub(rscratch1, cnt, vec_len); 5580 5581 __ bind(LOOP); 5582 5583 // main loop 5584 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5585 __ add(idx, idx, vec_len); 5586 // Compare strings. 5587 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5588 __ br(__ NE, MISMATCH); 5589 __ cmp(idx, rscratch1); 5590 __ br(__ LT, LOOP); 5591 5592 // post loop, last iteration 5593 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5594 5595 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5596 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5597 __ br(__ EQ, DONE); 5598 5599 __ bind(MISMATCH); 5600 5601 // Crop the vector to find its location. 5602 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 5603 // Extract the first different characters of each string. 
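// After BRKB ("break before"), pgtmp2 is active for all lanes preceding the
// first mismatch, so LASTA ("element after the last active one") extracts the
// first pair of characters that differ. As a scalar reference model of the
// mismatch case (illustrative sketch only; uint16_t stands for the widened
// character, uint8_t in the LL case; n is the minimum length held in cnt2):
//
//   static int first_difference(const uint16_t* s1, const uint16_t* s2, size_t n) {
//     for (size_t i = 0; i < n; i++) {
//       if (s1[i] != s2[i]) return (int)s1[i] - (int)s2[i];
//     }
//     return 0; // no mismatch: result is left as set up by the compiled caller
//   }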
5604 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5605 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5606
5607 // Compute the difference of the first different characters.
5608 __ sub(result, rscratch1, rscratch2);
5609
5610 __ bind(DONE);
5611 __ ret(lr);
5612 #undef LOAD_PAIR
5613 return entry;
5614 }
5615
5616 void generate_compare_long_strings() {
5617 if (UseSVE == 0) {
5618 StubRoutines::aarch64::_compare_long_string_LL
5619 = generate_compare_long_string_same_encoding(true);
5620 StubRoutines::aarch64::_compare_long_string_UU
5621 = generate_compare_long_string_same_encoding(false);
5622 StubRoutines::aarch64::_compare_long_string_LU
5623 = generate_compare_long_string_different_encoding(true);
5624 StubRoutines::aarch64::_compare_long_string_UL
5625 = generate_compare_long_string_different_encoding(false);
5626 } else {
5627 StubRoutines::aarch64::_compare_long_string_LL
5628 = generate_compare_long_string_sve(LL);
5629 StubRoutines::aarch64::_compare_long_string_UU
5630 = generate_compare_long_string_sve(UU);
5631 StubRoutines::aarch64::_compare_long_string_LU
5632 = generate_compare_long_string_sve(LU);
5633 StubRoutines::aarch64::_compare_long_string_UL
5634 = generate_compare_long_string_sve(UL);
5635 }
5636 }
5637
5638 // R0 = result
5639 // R1 = str2
5640 // R2 = cnt1
5641 // R3 = str1
5642 // R4 = cnt2
5643 // This generic linear code uses a few additional ideas that make it faster:
5644 // 1) we can safely keep at least the 1st register of the pattern (since
5645 // length >= 8), skipping the initial load (helps systems with 1 ld pipeline)
5646 // 2) we can use a "fast" algorithm for finding the 1st character that needs
5647 // fewer branches (1 branch per loaded register instead of 1 per symbol);
5648 // this is where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f,
5649 // 0x7fff7fff...7fff come from (a scalar sketch of the trick appears below)
5650 // 3) after loading and analyzing the 1st register of the source string, it
5651 // can be reused to search for every occurrence of the 1st character, saving
5652 // a few loads compared with a simpler-but-slower implementation
5653 // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
5654 // re-initializes and compresses register values, which makes the code
5655 // larger and a bit less readable; however, most of the extra operations
5656 // are issued during loads or branches, so the penalty is minimal
5657 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5658 const char* stubName = str1_isL
5659 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5660 : "indexof_linear_uu";
5661 __ align(CodeEntryAlignment);
5662 StubCodeMark mark(this, "StubRoutines", stubName);
5663 address entry = __ pc();
5664
5665 int str1_chr_size = str1_isL ? 1 : 2;
5666 int str2_chr_size = str2_isL ? 1 : 2;
5667 int str1_chr_shift = str1_isL ? 0 : 1;
5668 int str2_chr_shift = str2_isL ?
0 : 1; 5669 bool isL = str1_isL && str2_isL; 5670 // parameters 5671 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 5672 // temporary registers 5673 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 5674 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 5675 // redefinitions 5676 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 5677 5678 __ push(spilled_regs, sp); 5679 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 5680 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 5681 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 5682 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 5683 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 5684 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 5685 // Read whole register from str1. It is safe, because length >=8 here 5686 __ ldr(ch1, Address(str1)); 5687 // Read whole register from str2. It is safe, because length >=8 here 5688 __ ldr(ch2, Address(str2)); 5689 __ sub(cnt2, cnt2, cnt1); 5690 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 5691 if (str1_isL != str2_isL) { 5692 __ eor(v0, __ T16B, v0, v0); 5693 } 5694 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 5695 __ mul(first, first, tmp1); 5696 // check if we have less than 1 register to check 5697 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 5698 if (str1_isL != str2_isL) { 5699 __ fmovd(v1, ch1); 5700 } 5701 __ br(__ LE, L_SMALL); 5702 __ eor(ch2, first, ch2); 5703 if (str1_isL != str2_isL) { 5704 __ zip1(v1, __ T16B, v1, v0); 5705 } 5706 __ sub(tmp2, ch2, tmp1); 5707 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5708 __ bics(tmp2, tmp2, ch2); 5709 if (str1_isL != str2_isL) { 5710 __ fmovd(ch1, v1); 5711 } 5712 __ br(__ NE, L_HAS_ZERO); 5713 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5714 __ add(result, result, wordSize/str2_chr_size); 5715 __ add(str2, str2, wordSize); 5716 __ br(__ LT, L_POST_LOOP); 5717 __ BIND(L_LOOP); 5718 __ ldr(ch2, Address(str2)); 5719 __ eor(ch2, first, ch2); 5720 __ sub(tmp2, ch2, tmp1); 5721 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5722 __ bics(tmp2, tmp2, ch2); 5723 __ br(__ NE, L_HAS_ZERO); 5724 __ BIND(L_LOOP_PROCEED); 5725 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5726 __ add(str2, str2, wordSize); 5727 __ add(result, result, wordSize/str2_chr_size); 5728 __ br(__ GE, L_LOOP); 5729 __ BIND(L_POST_LOOP); 5730 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 5731 __ br(__ LE, NOMATCH); 5732 __ ldr(ch2, Address(str2)); 5733 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5734 __ eor(ch2, first, ch2); 5735 __ sub(tmp2, ch2, tmp1); 5736 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5737 __ mov(tmp4, -1); // all bits set 5738 __ b(L_SMALL_PROCEED); 5739 __ align(OptoLoopAlignment); 5740 __ BIND(L_SMALL); 5741 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5742 __ eor(ch2, first, ch2); 5743 if (str1_isL != str2_isL) { 5744 __ zip1(v1, __ T16B, v1, v0); 5745 } 5746 __ sub(tmp2, ch2, tmp1); 5747 __ mov(tmp4, -1); // all bits set 5748 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5749 if (str1_isL != str2_isL) { 5750 __ fmovd(ch1, v1); // move converted 4 symbols 5751 } 5752 __ BIND(L_SMALL_PROCEED); 5753 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
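// The bic/ands that follow complete the SWAR "find a matching byte" trick that
// the 0x0101...01 and 0x7f7f...7f constants set up (see the comment above
// generate_string_indexof_linear). A scalar sketch of the Latin1 form, for
// illustration only:
//
//   #include <cstdint>
//   // Non-zero iff some byte of 'x' equals 'c'. The lowest set 0x80 bit marks
//   // the first occurrence exactly; higher bytes may be false positives, which
//   // is harmless here because every candidate is re-verified by the compare
//   // loops below.
//   static uint64_t first_match_mask(uint64_t x, uint8_t c) {
//     uint64_t v = x ^ (0x0101010101010101ULL * c);   // matching bytes become 0x00
//     return (v - 0x0101010101010101ULL) & ~(v | 0x7f7f7f7f7f7f7f7fULL);
//   }
//
// The UTF-16 variant works the same way per 16-bit unit, using
// 0x0001000100010001 and 0x7fff7fff7fff7fff.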
5754 __ bic(tmp2, tmp2, ch2); 5755 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 5756 __ rbit(tmp2, tmp2); 5757 __ br(__ EQ, NOMATCH); 5758 __ BIND(L_SMALL_HAS_ZERO_LOOP); 5759 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 5760 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 5761 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 5762 if (str2_isL) { // LL 5763 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5764 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5765 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5766 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5767 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5768 } else { 5769 __ mov(ch2, 0xE); // all bits in byte set except last one 5770 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5771 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5772 __ lslv(tmp2, tmp2, tmp4); 5773 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5774 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5775 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5776 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5777 } 5778 __ cmp(ch1, ch2); 5779 __ mov(tmp4, wordSize/str2_chr_size); 5780 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5781 __ BIND(L_SMALL_CMP_LOOP); 5782 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5783 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5784 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5785 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5786 __ add(tmp4, tmp4, 1); 5787 __ cmp(tmp4, cnt1); 5788 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 5789 __ cmp(first, ch2); 5790 __ br(__ EQ, L_SMALL_CMP_LOOP); 5791 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 5792 __ cbz(tmp2, NOMATCH); // no more matches. exit 5793 __ clz(tmp4, tmp2); 5794 __ add(result, result, 1); // advance index 5795 __ add(str2, str2, str2_chr_size); // advance pointer 5796 __ b(L_SMALL_HAS_ZERO_LOOP); 5797 __ align(OptoLoopAlignment); 5798 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 5799 __ cmp(first, ch2); 5800 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5801 __ b(DONE); 5802 __ align(OptoLoopAlignment); 5803 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 5804 if (str2_isL) { // LL 5805 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5806 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5807 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5808 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5809 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5810 } else { 5811 __ mov(ch2, 0xE); // all bits in byte set except last one 5812 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5813 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5814 __ lslv(tmp2, tmp2, tmp4); 5815 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5816 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5817 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5818 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5819 } 5820 __ cmp(ch1, ch2); 5821 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5822 __ b(DONE); 5823 __ align(OptoLoopAlignment); 5824 __ BIND(L_HAS_ZERO); 5825 __ rbit(tmp2, tmp2); 5826 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 5827 // Now, perform compression of counters(cnt2 and cnt1) into one register. 5828 // It's fine because both counters are 32bit and are not changed in this 5829 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 5830 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 5831 __ sub(result, result, 1); 5832 __ BIND(L_HAS_ZERO_LOOP); 5833 __ mov(cnt1, wordSize/str2_chr_size); 5834 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 5835 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 5836 if (str2_isL) { 5837 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 5838 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5839 __ lslv(tmp2, tmp2, tmp4); 5840 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5841 __ add(tmp4, tmp4, 1); 5842 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5843 __ lsl(tmp2, tmp2, 1); 5844 __ mov(tmp4, wordSize/str2_chr_size); 5845 } else { 5846 __ mov(ch2, 0xE); 5847 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5848 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5849 __ lslv(tmp2, tmp2, tmp4); 5850 __ add(tmp4, tmp4, 1); 5851 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5852 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 5853 __ lsl(tmp2, tmp2, 1); 5854 __ mov(tmp4, wordSize/str2_chr_size); 5855 __ sub(str2, str2, str2_chr_size); 5856 } 5857 __ cmp(ch1, ch2); 5858 __ mov(tmp4, wordSize/str2_chr_size); 5859 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5860 __ BIND(L_CMP_LOOP); 5861 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5862 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5863 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5864 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5865 __ add(tmp4, tmp4, 1); 5866 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 5867 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 5868 __ cmp(cnt1, ch2); 5869 __ br(__ EQ, L_CMP_LOOP); 5870 __ BIND(L_CMP_LOOP_NOMATCH); 5871 // here we're not matched 5872 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 5873 __ clz(tmp4, tmp2); 5874 __ add(str2, str2, str2_chr_size); // advance pointer 5875 __ b(L_HAS_ZERO_LOOP); 5876 __ align(OptoLoopAlignment); 5877 __ BIND(L_CMP_LOOP_LAST_CMP); 5878 __ cmp(cnt1, ch2); 5879 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5880 __ b(DONE); 5881 __ align(OptoLoopAlignment); 5882 __ BIND(L_CMP_LOOP_LAST_CMP2); 5883 if (str2_isL) { 5884 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 5885 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5886 __ lslv(tmp2, tmp2, tmp4); 5887 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5888 __ add(tmp4, tmp4, 1); 5889 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5890 __ lsl(tmp2, tmp2, 1); 5891 } else { 5892 __ mov(ch2, 0xE); 5893 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5894 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
5895 __ lslv(tmp2, tmp2, tmp4); 5896 __ add(tmp4, tmp4, 1); 5897 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5898 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 5899 __ lsl(tmp2, tmp2, 1); 5900 __ sub(str2, str2, str2_chr_size); 5901 } 5902 __ cmp(ch1, ch2); 5903 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5904 __ b(DONE); 5905 __ align(OptoLoopAlignment); 5906 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 5907 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 5908 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 5909 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 5910 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 5911 // result by analyzed characters value, so, we can just reset lower bits 5912 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 5913 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 5914 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 5915 // index of last analyzed substring inside current octet. So, str2 in at 5916 // respective start address. We need to advance it to next octet 5917 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 5918 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 5919 __ bfm(result, zr, 0, 2 - str2_chr_shift); 5920 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 5921 __ movw(cnt2, cnt2); 5922 __ b(L_LOOP_PROCEED); 5923 __ align(OptoLoopAlignment); 5924 __ BIND(NOMATCH); 5925 __ mov(result, -1); 5926 __ BIND(DONE); 5927 __ pop(spilled_regs, sp); 5928 __ ret(lr); 5929 return entry; 5930 } 5931 5932 void generate_string_indexof_stubs() { 5933 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 5934 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 5935 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 5936 } 5937 5938 void inflate_and_store_2_fp_registers(bool generatePrfm, 5939 FloatRegister src1, FloatRegister src2) { 5940 Register dst = r1; 5941 __ zip1(v1, __ T16B, src1, v0); 5942 __ zip2(v2, __ T16B, src1, v0); 5943 if (generatePrfm) { 5944 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 5945 } 5946 __ zip1(v3, __ T16B, src2, v0); 5947 __ zip2(v4, __ T16B, src2, v0); 5948 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 5949 } 5950 5951 // R0 = src 5952 // R1 = dst 5953 // R2 = len 5954 // R3 = len >> 3 5955 // V0 = 0 5956 // v1 = loaded 8 bytes 5957 address generate_large_byte_array_inflate() { 5958 __ align(CodeEntryAlignment); 5959 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 5960 address entry = __ pc(); 5961 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 5962 Register src = r0, dst = r1, len = r2, octetCounter = r3; 5963 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 5964 5965 // do one more 8-byte read to have address 16-byte aligned in most cases 5966 // also use single store instruction 5967 __ ldrd(v2, __ post(src, 8)); 5968 __ sub(octetCounter, octetCounter, 2); 5969 __ zip1(v1, __ T16B, v1, v0); 5970 __ zip1(v2, __ T16B, v2, v0); 5971 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 5972 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 5973 __ subs(rscratch1, octetCounter, large_loop_threshold); 5974 __ br(__ LE, LOOP_START); 5975 __ b(LOOP_PRFM_START); 5976 __ bind(LOOP_PRFM); 5977 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ 
post(src, 64))); 5978 __ bind(LOOP_PRFM_START); 5979 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 5980 __ sub(octetCounter, octetCounter, 8); 5981 __ subs(rscratch1, octetCounter, large_loop_threshold); 5982 inflate_and_store_2_fp_registers(true, v3, v4); 5983 inflate_and_store_2_fp_registers(true, v5, v6); 5984 __ br(__ GT, LOOP_PRFM); 5985 __ cmp(octetCounter, (u1)8); 5986 __ br(__ LT, DONE); 5987 __ bind(LOOP); 5988 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 5989 __ bind(LOOP_START); 5990 __ sub(octetCounter, octetCounter, 8); 5991 __ cmp(octetCounter, (u1)8); 5992 inflate_and_store_2_fp_registers(false, v3, v4); 5993 inflate_and_store_2_fp_registers(false, v5, v6); 5994 __ br(__ GE, LOOP); 5995 __ bind(DONE); 5996 __ ret(lr); 5997 return entry; 5998 } 5999 6000 /** 6001 * Arguments: 6002 * 6003 * Input: 6004 * c_rarg0 - current state address 6005 * c_rarg1 - H key address 6006 * c_rarg2 - data address 6007 * c_rarg3 - number of blocks 6008 * 6009 * Output: 6010 * Updated state at c_rarg0 6011 */ 6012 address generate_ghash_processBlocks() { 6013 // Bafflingly, GCM uses little-endian for the byte order, but 6014 // big-endian for the bit order. For example, the polynomial 1 is 6015 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 6016 // 6017 // So, we must either reverse the bytes in each word and do 6018 // everything big-endian or reverse the bits in each byte and do 6019 // it little-endian. On AArch64 it's more idiomatic to reverse 6020 // the bits in each byte (we have an instruction, RBIT, to do 6021 // that) and keep the data in little-endian bit order through the 6022 // calculation, bit-reversing the inputs and outputs. 6023 6024 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 6025 __ align(wordSize * 2); 6026 address p = __ pc(); 6027 __ emit_int64(0x87); // The low-order bits of the field 6028 // polynomial (i.e. 
p = z^7+z^2+z+1) 6029 // repeated in the low and high parts of a 6030 // 128-bit vector 6031 __ emit_int64(0x87); 6032 6033 __ align(CodeEntryAlignment); 6034 address start = __ pc(); 6035 6036 Register state = c_rarg0; 6037 Register subkeyH = c_rarg1; 6038 Register data = c_rarg2; 6039 Register blocks = c_rarg3; 6040 6041 FloatRegister vzr = v30; 6042 __ eor(vzr, __ T16B, vzr, vzr); // zero register 6043 6044 __ ldrq(v24, p); // The field polynomial 6045 6046 __ ldrq(v0, Address(state)); 6047 __ ldrq(v1, Address(subkeyH)); 6048 6049 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 6050 __ rbit(v0, __ T16B, v0); 6051 __ rev64(v1, __ T16B, v1); 6052 __ rbit(v1, __ T16B, v1); 6053 6054 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 6055 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 6056 6057 { 6058 Label L_ghash_loop; 6059 __ bind(L_ghash_loop); 6060 6061 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 6062 // reversing each byte 6063 __ rbit(v2, __ T16B, v2); 6064 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 6065 6066 // Multiply state in v2 by subkey in v1 6067 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 6068 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 6069 /*temps*/v6, v3, /*reuse/clobber b*/v2); 6070 // Reduce v7:v5 by the field polynomial 6071 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 6072 6073 __ sub(blocks, blocks, 1); 6074 __ cbnz(blocks, L_ghash_loop); 6075 } 6076 6077 // The bit-reversed result is at this point in v0 6078 __ rev64(v0, __ T16B, v0); 6079 __ rbit(v0, __ T16B, v0); 6080 6081 __ st1(v0, __ T16B, state); 6082 __ ret(lr); 6083 6084 return start; 6085 } 6086 6087 address generate_ghash_processBlocks_wide() { 6088 address small = generate_ghash_processBlocks(); 6089 6090 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); 6091 __ align(wordSize * 2); 6092 address p = __ pc(); 6093 __ emit_int64(0x87); // The low-order bits of the field 6094 // polynomial (i.e. p = z^7+z^2+z+1) 6095 // repeated in the low and high parts of a 6096 // 128-bit vector 6097 __ emit_int64(0x87); 6098 6099 __ align(CodeEntryAlignment); 6100 address start = __ pc(); 6101 6102 Register state = c_rarg0; 6103 Register subkeyH = c_rarg1; 6104 Register data = c_rarg2; 6105 Register blocks = c_rarg3; 6106 6107 const int unroll = 4; 6108 6109 __ cmp(blocks, (unsigned char)(unroll * 2)); 6110 __ br(__ LT, small); 6111 6112 if (unroll > 1) { 6113 // Save state before entering routine 6114 __ sub(sp, sp, 4 * 16); 6115 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 6116 __ sub(sp, sp, 4 * 16); 6117 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 6118 } 6119 6120 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 6121 6122 if (unroll > 1) { 6123 // And restore state 6124 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 6125 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 6126 } 6127 6128 __ cmp(blocks, (unsigned char)0); 6129 __ br(__ GT, small); 6130 6131 __ ret(lr); 6132 6133 return start; 6134 } 6135 6136 void generate_base64_encode_simdround(Register src, Register dst, 6137 FloatRegister codec, u8 size) { 6138 6139 FloatRegister in0 = v4, in1 = v5, in2 = v6; 6140 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 6141 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 6142 6143 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 6144 6145 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 6146 6147 __ ushr(ind0, arrangement, in0, 2); 6148 6149 __ ushr(ind1, arrangement, in1, 2); 6150 __ shl(in0, arrangement, in0, 6); 6151 __ orr(ind1, arrangement, ind1, in0); 6152 __ ushr(ind1, arrangement, ind1, 2); 6153 6154 __ ushr(ind2, arrangement, in2, 4); 6155 __ shl(in1, arrangement, in1, 4); 6156 __ orr(ind2, arrangement, in1, ind2); 6157 __ ushr(ind2, arrangement, ind2, 2); 6158 6159 __ shl(ind3, arrangement, in2, 2); 6160 __ ushr(ind3, arrangement, ind3, 2); 6161 6162 __ tbl(out0, arrangement, codec, 4, ind0); 6163 __ tbl(out1, arrangement, codec, 4, ind1); 6164 __ tbl(out2, arrangement, codec, 4, ind2); 6165 __ tbl(out3, arrangement, codec, 4, ind3); 6166 6167 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 6168 } 6169 6170 /** 6171 * Arguments: 6172 * 6173 * Input: 6174 * c_rarg0 - src_start 6175 * c_rarg1 - src_offset 6176 * c_rarg2 - src_length 6177 * c_rarg3 - dest_start 6178 * c_rarg4 - dest_offset 6179 * c_rarg5 - isURL 6180 * 6181 */ 6182 address generate_base64_encodeBlock() { 6183 6184 static const char toBase64[64] = { 6185 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6186 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6187 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6188 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6189 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 6190 }; 6191 6192 static const char toBase64URL[64] = { 6193 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6194 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6195 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6196 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6197 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 6198 }; 6199 6200 __ align(CodeEntryAlignment); 6201 StubCodeMark mark(this, "StubRoutines", "encodeBlock"); 6202 address start = __ pc(); 6203 6204 Register src = c_rarg0; // source array 6205 Register soff = c_rarg1; // source start offset 6206 Register send = c_rarg2; // source end offset 6207 Register dst = c_rarg3; // dest array 6208 Register doff = c_rarg4; // position for writing to dest array 6209 Register isURL = c_rarg5; // Base64 or URL character set 6210 6211 // c_rarg6 and c_rarg7 are free to use as temps 6212 Register codec = c_rarg6; 6213 Register length = c_rarg7; 6214 6215 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 6216 6217 __ add(src, src, soff); 6218 __ add(dst, dst, doff); 6219 __ sub(length, send, soff); 6220 6221 // load the codec base address 6222 __ lea(codec, ExternalAddress((address) toBase64)); 6223 __ cbz(isURL, ProcessData); 6224 __ lea(codec, ExternalAddress((address) toBase64URL)); 6225 6226 __ BIND(ProcessData); 6227 6228 // too short to formup a SIMD loop, roll back 6229 __ cmp(length, (u1)24); 6230 __ br(Assembler::LT, Process3B); 6231 6232 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 6233 6234 __ BIND(Process48B); 6235 __ cmp(length, (u1)48); 6236 __ br(Assembler::LT, Process24B); 6237 generate_base64_encode_simdround(src, dst, v0, 16); 6238 __ sub(length, length, 48); 6239 __ b(Process48B); 6240 6241 __ BIND(Process24B); 6242 __ cmp(length, (u1)24); 6243 __ br(Assembler::LT, SIMDExit); 6244 generate_base64_encode_simdround(src, dst, v0, 8); 6245 __ sub(length, length, 24); 6246 6247 __ BIND(SIMDExit); 6248 __ cbz(length, Exit); 6249 6250 __ 
BIND(Process3B); 6251 // 3 src bytes, 24 bits 6252 __ ldrb(r10, __ post(src, 1)); 6253 __ ldrb(r11, __ post(src, 1)); 6254 __ ldrb(r12, __ post(src, 1)); 6255 __ orrw(r11, r11, r10, Assembler::LSL, 8); 6256 __ orrw(r12, r12, r11, Assembler::LSL, 8); 6257 // codec index 6258 __ ubfmw(r15, r12, 18, 23); 6259 __ ubfmw(r14, r12, 12, 17); 6260 __ ubfmw(r13, r12, 6, 11); 6261 __ andw(r12, r12, 63); 6262 // get the code based on the codec 6263 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 6264 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 6265 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 6266 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 6267 __ strb(r15, __ post(dst, 1)); 6268 __ strb(r14, __ post(dst, 1)); 6269 __ strb(r13, __ post(dst, 1)); 6270 __ strb(r12, __ post(dst, 1)); 6271 __ sub(length, length, 3); 6272 __ cbnz(length, Process3B); 6273 6274 __ BIND(Exit); 6275 __ ret(lr); 6276 6277 return start; 6278 } 6279 6280 void generate_base64_decode_simdround(Register src, Register dst, 6281 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 6282 6283 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 6284 FloatRegister out0 = v20, out1 = v21, out2 = v22; 6285 6286 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 6287 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 6288 6289 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 6290 6291 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 6292 6293 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 6294 6295 // we need unsigned saturating subtract, to make sure all input values 6296 // in range [0, 63] will have 0U value in the higher half lookup 6297 __ uqsubv(decH0, __ T16B, in0, v27); 6298 __ uqsubv(decH1, __ T16B, in1, v27); 6299 __ uqsubv(decH2, __ T16B, in2, v27); 6300 __ uqsubv(decH3, __ T16B, in3, v27); 6301 6302 // lower half lookup 6303 __ tbl(decL0, arrangement, codecL, 4, in0); 6304 __ tbl(decL1, arrangement, codecL, 4, in1); 6305 __ tbl(decL2, arrangement, codecL, 4, in2); 6306 __ tbl(decL3, arrangement, codecL, 4, in3); 6307 6308 // higher half lookup 6309 __ tbx(decH0, arrangement, codecH, 4, decH0); 6310 __ tbx(decH1, arrangement, codecH, 4, decH1); 6311 __ tbx(decH2, arrangement, codecH, 4, decH2); 6312 __ tbx(decH3, arrangement, codecH, 4, decH3); 6313 6314 // combine lower and higher 6315 __ orr(decL0, arrangement, decL0, decH0); 6316 __ orr(decL1, arrangement, decL1, decH1); 6317 __ orr(decL2, arrangement, decL2, decH2); 6318 __ orr(decL3, arrangement, decL3, decH3); 6319 6320 // check illegal inputs, value larger than 63 (maximum of 6 bits) 6321 __ cmhi(decH0, arrangement, decL0, v27); 6322 __ cmhi(decH1, arrangement, decL1, v27); 6323 __ cmhi(decH2, arrangement, decL2, v27); 6324 __ cmhi(decH3, arrangement, decL3, v27); 6325 __ orr(in0, arrangement, decH0, decH1); 6326 __ orr(in1, arrangement, decH2, decH3); 6327 __ orr(in2, arrangement, in0, in1); 6328 __ umaxv(in3, arrangement, in2); 6329 __ umov(rscratch2, in3, __ B, 0); 6330 6331 // get the data to output 6332 __ shl(out0, arrangement, decL0, 2); 6333 __ ushr(out1, arrangement, decL1, 4); 6334 __ orr(out0, arrangement, out0, out1); 6335 __ shl(out1, arrangement, decL1, 4); 6336 __ ushr(out2, arrangement, decL2, 2); 6337 __ orr(out1, arrangement, out1, out2); 6338 __ shl(out2, arrangement, decL2, 6); 6339 __ orr(out2, arrangement, out2, decL3); 6340 6341 __ cbz(rscratch2, NoIllegalData); 6342 6343 // handle illegal input 6344 __ umov(r10, in2, __ 
D, 0); 6345 if (size == 16) { 6346 __ cbnz(r10, ErrorInLowerHalf); 6347 6348 // illegal input is in higher half, store the lower half now. 6349 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 6350 6351 __ umov(r10, in2, __ D, 1); 6352 __ umov(r11, out0, __ D, 1); 6353 __ umov(r12, out1, __ D, 1); 6354 __ umov(r13, out2, __ D, 1); 6355 __ b(StoreLegalData); 6356 6357 __ BIND(ErrorInLowerHalf); 6358 } 6359 __ umov(r11, out0, __ D, 0); 6360 __ umov(r12, out1, __ D, 0); 6361 __ umov(r13, out2, __ D, 0); 6362 6363 __ BIND(StoreLegalData); 6364 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 6365 __ strb(r11, __ post(dst, 1)); 6366 __ strb(r12, __ post(dst, 1)); 6367 __ strb(r13, __ post(dst, 1)); 6368 __ lsr(r10, r10, 8); 6369 __ lsr(r11, r11, 8); 6370 __ lsr(r12, r12, 8); 6371 __ lsr(r13, r13, 8); 6372 __ b(StoreLegalData); 6373 6374 __ BIND(NoIllegalData); 6375 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 6376 } 6377 6378 6379 /** 6380 * Arguments: 6381 * 6382 * Input: 6383 * c_rarg0 - src_start 6384 * c_rarg1 - src_offset 6385 * c_rarg2 - src_length 6386 * c_rarg3 - dest_start 6387 * c_rarg4 - dest_offset 6388 * c_rarg5 - isURL 6389 * c_rarg6 - isMIME 6390 * 6391 */ 6392 address generate_base64_decodeBlock() { 6393 6394 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 6395 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 6396 // titled "Base64 decoding". 6397 6398 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 6399 // except the trailing character '=' is also treated illegal value in this intrinsic. That 6400 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 6401 static const uint8_t fromBase64ForNoSIMD[256] = { 6402 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6403 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6404 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6405 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6406 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6407 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 6408 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6409 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6410 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6411 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6412 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6413 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6414 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6415 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6416 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6417 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6418 }; 6419 6420 static const uint8_t fromBase64URLForNoSIMD[256] = { 6421 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6422 255u, 255u, 
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6423 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6424 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6425 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6426 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 6427 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6428 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6429 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6430 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6431 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6432 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6433 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6434 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6435 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6436 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6437 }; 6438 6439 // A legal value of base64 code is in range [0, 127]. We need two lookups 6440 // with tbl/tbx and combine them to get the decode data. The 1st table vector 6441 // lookup use tbl, out of range indices are set to 0 in destination. The 2nd 6442 // table vector lookup use tbx, out of range indices are unchanged in 6443 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 6444 // The value of index 64 is set to 0, so that we know that we already get the 6445 // decoded data with the 1st lookup. 
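// As a scalar illustration of the two-step lookup described above (a sketch
// only; the stub does this 16 characters at a time with uqsub/tbl/tbx):
//
//   // 'tab' is one of the 128-byte tables below: tab[0..63] is the codecL
//   // half, tab[64..127] the codecH half.
//   static uint8_t decode_one(uint8_t ch, const uint8_t tab[128]) {
//     uint8_t lo  = (ch < 64) ? tab[ch] : 0;            // tbl: out-of-range index -> 0
//     uint8_t idx = (ch > 63) ? (uint8_t)(ch - 63) : 0; // uqsub(ch, 63)
//     uint8_t hi  = (idx < 64) ? tab[64 + idx] : idx;   // tbx: out-of-range index -> unchanged
//     return (uint8_t)(lo | hi);  // anything > 63 is flagged as illegal input later
//   }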
6446 static const uint8_t fromBase64ForSIMD[128] = { 6447 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6448 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6449 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6450 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6451 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6452 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6453 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6454 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6455 }; 6456 6457 static const uint8_t fromBase64URLForSIMD[128] = { 6458 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6459 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6460 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6461 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6462 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6463 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6464 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6465 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6466 }; 6467 6468 __ align(CodeEntryAlignment); 6469 StubCodeMark mark(this, "StubRoutines", "decodeBlock"); 6470 address start = __ pc(); 6471 6472 Register src = c_rarg0; // source array 6473 Register soff = c_rarg1; // source start offset 6474 Register send = c_rarg2; // source end offset 6475 Register dst = c_rarg3; // dest array 6476 Register doff = c_rarg4; // position for writing to dest array 6477 Register isURL = c_rarg5; // Base64 or URL character set 6478 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 6479 6480 Register length = send; // reuse send as length of source data to process 6481 6482 Register simd_codec = c_rarg6; 6483 Register nosimd_codec = c_rarg7; 6484 6485 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 6486 6487 __ enter(); 6488 6489 __ add(src, src, soff); 6490 __ add(dst, dst, doff); 6491 6492 __ mov(doff, dst); 6493 6494 __ sub(length, send, soff); 6495 __ bfm(length, zr, 0, 1); 6496 6497 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 6498 __ cbz(isURL, ProcessData); 6499 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 6500 6501 __ BIND(ProcessData); 6502 __ mov(rscratch1, length); 6503 __ cmp(length, (u1)144); // 144 = 80 + 64 6504 __ br(Assembler::LT, Process4B); 6505 6506 // In the MIME case, the line length cannot be more than 76 6507 // bytes (see RFC 2045). This is too short a block for SIMD 6508 // to be worthwhile, so we use non-SIMD here. 
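// The Process4B loop below decodes 4 characters into 3 bytes; its bfi/bfm/rev16
// bit-twiddling is equivalent to this scalar model (illustrative sketch, not
// the emitted code):
//
//   // d[0..3] are the four decoded 6-bit values (each < 64)
//   static void decode_quad(const uint8_t d[4], uint8_t out[3]) {
//     uint32_t v = (uint32_t)(d[0] << 18 | d[1] << 12 | d[2] << 6 | d[3]);
//     out[0] = (uint8_t)(v >> 16);
//     out[1] = (uint8_t)(v >> 8);
//     out[2] = (uint8_t)v;
//   }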
6509 __ movw(rscratch1, 79); 6510 6511 __ BIND(Process4B); 6512 __ ldrw(r14, __ post(src, 4)); 6513 __ ubfxw(r10, r14, 0, 8); 6514 __ ubfxw(r11, r14, 8, 8); 6515 __ ubfxw(r12, r14, 16, 8); 6516 __ ubfxw(r13, r14, 24, 8); 6517 // get the de-code 6518 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 6519 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 6520 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 6521 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 6522 // error detection, 255u indicates an illegal input 6523 __ orrw(r14, r10, r11); 6524 __ orrw(r15, r12, r13); 6525 __ orrw(r14, r14, r15); 6526 __ tbnz(r14, 7, Exit); 6527 // recover the data 6528 __ lslw(r14, r10, 10); 6529 __ bfiw(r14, r11, 4, 6); 6530 __ bfmw(r14, r12, 2, 5); 6531 __ rev16w(r14, r14); 6532 __ bfiw(r13, r12, 6, 2); 6533 __ strh(r14, __ post(dst, 2)); 6534 __ strb(r13, __ post(dst, 1)); 6535 // non-simd loop 6536 __ subsw(rscratch1, rscratch1, 4); 6537 __ br(Assembler::GT, Process4B); 6538 6539 // if exiting from PreProcess80B, rscratch1 == -1; 6540 // otherwise, rscratch1 == 0. 6541 __ cbzw(rscratch1, Exit); 6542 __ sub(length, length, 80); 6543 6544 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 6545 __ cbz(isURL, SIMDEnter); 6546 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 6547 6548 __ BIND(SIMDEnter); 6549 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 6550 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 6551 __ mov(rscratch1, 63); 6552 __ dup(v27, __ T16B, rscratch1); 6553 6554 __ BIND(Process64B); 6555 __ cmp(length, (u1)64); 6556 __ br(Assembler::LT, Process32B); 6557 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 6558 __ sub(length, length, 64); 6559 __ b(Process64B); 6560 6561 __ BIND(Process32B); 6562 __ cmp(length, (u1)32); 6563 __ br(Assembler::LT, SIMDExit); 6564 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 6565 __ sub(length, length, 32); 6566 __ b(Process32B); 6567 6568 __ BIND(SIMDExit); 6569 __ cbz(length, Exit); 6570 __ movw(rscratch1, length); 6571 __ b(Process4B); 6572 6573 __ BIND(Exit); 6574 __ sub(c_rarg0, dst, doff); 6575 6576 __ leave(); 6577 __ ret(lr); 6578 6579 return start; 6580 } 6581 6582 // Support for spin waits. 6583 address generate_spin_wait() { 6584 __ align(CodeEntryAlignment); 6585 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 6586 address start = __ pc(); 6587 6588 __ spin_wait(); 6589 __ ret(lr); 6590 6591 return start; 6592 } 6593 6594 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 6595 6596 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 6597 // 6598 // If LSE is in use, generate LSE versions of all the stubs. The 6599 // non-LSE versions are in atomic_aarch64.S. 6600 6601 // class AtomicStubMark records the entry point of a stub and the 6602 // stub pointer which will point to it. The stub pointer is set to 6603 // the entry point when ~AtomicStubMark() is called, which must be 6604 // after ICache::invalidate_range. This ensures safe publication of 6605 // the generated code. 
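// A hedged usage sketch, mirroring the calls in generate_atomic_entry_points()
// below:
//
//   {
//     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
//     gen_swpal_entry(Assembler::word);
//     // ... more stubs ...
//     ICache::invalidate_range(first_entry, __ pc() - first_entry);
//   } // ~AtomicStubMark() runs here, so aarch64_atomic_xchg_4_impl is only
//     // published after the instruction cache has been invalidated.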
6606 class AtomicStubMark { 6607 address _entry_point; 6608 aarch64_atomic_stub_t *_stub; 6609 MacroAssembler *_masm; 6610 public: 6611 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 6612 _masm = masm; 6613 __ align(32); 6614 _entry_point = __ pc(); 6615 _stub = stub; 6616 } 6617 ~AtomicStubMark() { 6618 *_stub = (aarch64_atomic_stub_t)_entry_point; 6619 } 6620 }; 6621 6622 // NB: For memory_order_conservative we need a trailing membar after 6623 // LSE atomic operations but not a leading membar. 6624 // 6625 // We don't need a leading membar because a clause in the Arm ARM 6626 // says: 6627 // 6628 // Barrier-ordered-before 6629 // 6630 // Barrier instructions order prior Memory effects before subsequent 6631 // Memory effects generated by the same Observer. A read or a write 6632 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 6633 // Observer if and only if RW1 appears in program order before RW 2 6634 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 6635 // instruction with both Acquire and Release semantics. 6636 // 6637 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 6638 // and Release semantics, therefore we don't need a leading 6639 // barrier. However, there is no corresponding Barrier-ordered-after 6640 // relationship, therefore we need a trailing membar to prevent a 6641 // later store or load from being reordered with the store in an 6642 // atomic instruction. 6643 // 6644 // This was checked by using the herd7 consistency model simulator 6645 // (http://diy.inria.fr/) with this test case: 6646 // 6647 // AArch64 LseCas 6648 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 6649 // P0 | P1; 6650 // LDR W4, [X2] | MOV W3, #0; 6651 // DMB LD | MOV W4, #1; 6652 // LDR W3, [X1] | CASAL W3, W4, [X1]; 6653 // | DMB ISH; 6654 // | STR W4, [X2]; 6655 // exists 6656 // (0:X3=0 /\ 0:X4=1) 6657 // 6658 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 6659 // with the store to x in P1. Without the DMB in P1 this may happen. 6660 // 6661 // At the time of writing we don't know of any AArch64 hardware that 6662 // reorders stores in this way, but the Reference Manual permits it. 6663 6664 void gen_cas_entry(Assembler::operand_size size, 6665 atomic_memory_order order) { 6666 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 6667 exchange_val = c_rarg2; 6668 bool acquire, release; 6669 switch (order) { 6670 case memory_order_relaxed: 6671 acquire = false; 6672 release = false; 6673 break; 6674 case memory_order_release: 6675 acquire = false; 6676 release = true; 6677 break; 6678 default: 6679 acquire = true; 6680 release = true; 6681 break; 6682 } 6683 __ mov(prev, compare_val); 6684 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 6685 if (order == memory_order_conservative) { 6686 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6687 } 6688 if (size == Assembler::xword) { 6689 __ mov(r0, prev); 6690 } else { 6691 __ movw(r0, prev); 6692 } 6693 __ ret(lr); 6694 } 6695 6696 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 6697 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6698 // If not relaxed, then default to conservative. Relaxed is the only 6699 // case we use enough to be worth specializing. 
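    // A hedged C-level sketch of what the generated fetch-and-add stub does
    // (semantics only, not the emitted instructions; full_fence() is a
    // stand-in for the trailing dmb):
    //
    //   uint64_t fetch_add(volatile uint64_t* addr, uint64_t incr) {
    //     uint64_t prev = __atomic_fetch_add(addr, incr,
    //         relaxed ? __ATOMIC_RELAXED : __ATOMIC_ACQ_REL);  // ldadd / ldaddal
    //     if (!relaxed) full_fence();  // see the NB on memory_order_conservative
    //     return prev;                 // returned in r0
    //   }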
6700 if (order == memory_order_relaxed) { 6701 __ ldadd(size, incr, prev, addr); 6702 } else { 6703 __ ldaddal(size, incr, prev, addr); 6704 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6705 } 6706 if (size == Assembler::xword) { 6707 __ mov(r0, prev); 6708 } else { 6709 __ movw(r0, prev); 6710 } 6711 __ ret(lr); 6712 } 6713 6714 void gen_swpal_entry(Assembler::operand_size size) { 6715 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6716 __ swpal(size, incr, prev, addr); 6717 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6718 if (size == Assembler::xword) { 6719 __ mov(r0, prev); 6720 } else { 6721 __ movw(r0, prev); 6722 } 6723 __ ret(lr); 6724 } 6725 6726 void generate_atomic_entry_points() { 6727 if (! UseLSE) { 6728 return; 6729 } 6730 6731 __ align(CodeEntryAlignment); 6732 StubCodeMark mark(this, "StubRoutines", "atomic entry points"); 6733 address first_entry = __ pc(); 6734 6735 // ADD, memory_order_conservative 6736 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 6737 gen_ldadd_entry(Assembler::word, memory_order_conservative); 6738 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 6739 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 6740 6741 // ADD, memory_order_relaxed 6742 AtomicStubMark mark_fetch_add_4_relaxed 6743 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 6744 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 6745 AtomicStubMark mark_fetch_add_8_relaxed 6746 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 6747 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 6748 6749 // XCHG, memory_order_conservative 6750 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 6751 gen_swpal_entry(Assembler::word); 6752 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 6753 gen_swpal_entry(Assembler::xword); 6754 6755 // CAS, memory_order_conservative 6756 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 6757 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 6758 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 6759 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 6760 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 6761 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 6762 6763 // CAS, memory_order_relaxed 6764 AtomicStubMark mark_cmpxchg_1_relaxed 6765 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 6766 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 6767 AtomicStubMark mark_cmpxchg_4_relaxed 6768 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 6769 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 6770 AtomicStubMark mark_cmpxchg_8_relaxed 6771 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 6772 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 6773 6774 AtomicStubMark mark_cmpxchg_4_release 6775 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 6776 gen_cas_entry(MacroAssembler::word, memory_order_release); 6777 AtomicStubMark mark_cmpxchg_8_release 6778 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 6779 gen_cas_entry(MacroAssembler::xword, memory_order_release); 6780 6781 AtomicStubMark mark_cmpxchg_4_seq_cst 6782 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 6783 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 6784 AtomicStubMark mark_cmpxchg_8_seq_cst 6785 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 6786 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 6787 
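    // A hedged C-level sketch of the conservative cmpxchg stubs generated
    // above (semantics only, not the emitted casal/dmb sequence; full_fence()
    // is a stand-in for the trailing barrier):
    //
    //   uint64_t cmpxchg(volatile uint64_t* ptr, uint64_t compare_val,
    //                    uint64_t exchange_val) {
    //     uint64_t prev = compare_val;
    //     __atomic_compare_exchange_n(ptr, &prev, exchange_val, /*weak*/false,
    //                                 __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
    //     full_fence();
    //     return prev;                // previous memory value, in r0
    //   }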
6788 ICache::invalidate_range(first_entry, __ pc() - first_entry); 6789 } 6790 #endif // LINUX 6791 6792 address generate_cont_thaw(Continuation::thaw_kind kind) { 6793 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 6794 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 6795 6796 address start = __ pc(); 6797 6798 if (return_barrier) { 6799 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 6800 __ mov(sp, rscratch1); 6801 } 6802 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 6803 6804 if (return_barrier) { 6805 // preserve possible return value from a method returning to the return barrier 6806 __ fmovd(rscratch1, v0); 6807 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 6808 } 6809 6810 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 6811 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 6812 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 6813 6814 if (return_barrier) { 6815 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 6816 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 6817 __ fmovd(v0, rscratch1); 6818 } 6819 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 6820 6821 6822 Label thaw_success; 6823 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 6824 __ cbnz(rscratch2, thaw_success); 6825 __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry())); 6826 __ br(rscratch1); 6827 __ bind(thaw_success); 6828 6829 // make room for the thawed frames 6830 __ sub(rscratch1, sp, rscratch2); 6831 __ andr(rscratch1, rscratch1, -16); // align 6832 __ mov(sp, rscratch1); 6833 6834 if (return_barrier) { 6835 // save original return value -- again 6836 __ fmovd(rscratch1, v0); 6837 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 6838 } 6839 6840 // If we want, we can templatize thaw by kind, and have three different entries 6841 __ movw(c_rarg1, (uint32_t)kind); 6842 6843 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 6844 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 6845 6846 if (return_barrier) { 6847 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 6848 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 6849 __ fmovd(v0, rscratch1); 6850 } else { 6851 __ mov(r0, zr); // return 0 (success) from doYield 6852 } 6853 6854 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 6855 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 6856 __ mov(rfp, sp); 6857 6858 if (return_barrier_exception) { 6859 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 6860 __ verify_oop(r0); 6861 __ mov(r19, r0); // save return value contaning the exception oop in callee-saved R19 6862 6863 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 6864 6865 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
6866 // __ reinitialize_ptrue(); 6867 6868 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 6869 6870 __ mov(r1, r0); // the exception handler 6871 __ mov(r0, r19); // restore return value contaning the exception oop 6872 __ verify_oop(r0); 6873 6874 __ leave(); 6875 __ mov(r3, lr); 6876 __ br(r1); // the exception handler 6877 } else { 6878 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 6879 __ leave(); 6880 __ ret(lr); 6881 } 6882 6883 return start; 6884 } 6885 6886 address generate_cont_thaw() { 6887 if (!Continuations::enabled()) return nullptr; 6888 6889 StubCodeMark mark(this, "StubRoutines", "Cont thaw"); 6890 address start = __ pc(); 6891 generate_cont_thaw(Continuation::thaw_top); 6892 return start; 6893 } 6894 6895 address generate_cont_returnBarrier() { 6896 if (!Continuations::enabled()) return nullptr; 6897 6898 // TODO: will probably need multiple return barriers depending on return type 6899 StubCodeMark mark(this, "StubRoutines", "cont return barrier"); 6900 address start = __ pc(); 6901 6902 generate_cont_thaw(Continuation::thaw_return_barrier); 6903 6904 return start; 6905 } 6906 6907 address generate_cont_returnBarrier_exception() { 6908 if (!Continuations::enabled()) return nullptr; 6909 6910 StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler"); 6911 address start = __ pc(); 6912 6913 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 6914 6915 return start; 6916 } 6917 6918 #if INCLUDE_JFR 6919 6920 static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) { 6921 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 6922 __ mov(c_rarg0, thread); 6923 } 6924 6925 // The handle is dereferenced through a load barrier. 6926 static void jfr_epilogue(MacroAssembler* _masm) { 6927 __ reset_last_Java_frame(true); 6928 __ resolve_global_jobject(r0, rscratch1, rscratch2); 6929 } 6930 6931 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint. 6932 // It returns a jobject handle to the event writer. 6933 // The handle is dereferenced and the return value is the event writer oop. 6934 static RuntimeStub* generate_jfr_write_checkpoint() { 6935 enum layout { 6936 rbp_off, 6937 rbpH_off, 6938 return_off, 6939 return_off2, 6940 framesize // inclusive of return address 6941 }; 6942 6943 int insts_size = 1024; 6944 int locs_size = 64; 6945 CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size); 6946 OopMapSet* oop_maps = new OopMapSet(); 6947 MacroAssembler* masm = new MacroAssembler(&code); 6948 MacroAssembler* _masm = masm; 6949 6950 address start = __ pc(); 6951 __ enter(); 6952 int frame_complete = __ pc() - start; 6953 address the_pc = __ pc(); 6954 jfr_prologue(the_pc, _masm, rthread); 6955 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1); 6956 jfr_epilogue(_masm); 6957 __ leave(); 6958 __ ret(lr); 6959 6960 OopMap* map = new OopMap(framesize, 1); // rfp 6961 oop_maps->add_gc_map(the_pc - start, map); 6962 6963 RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size) 6964 RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete, 6965 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 6966 oop_maps, false); 6967 return stub; 6968 } 6969 6970 #endif // INCLUDE_JFR 6971 6972 // Continuation point for throwing of implicit exceptions that are 6973 // not handled in the current activation. 
Fabricates an exception 6974 // oop and initiates normal exception dispatching in this 6975 // frame. Since we need to preserve callee-saved values (currently 6976 // only for C2, but done for C1 as well) we need a callee-saved oop 6977 // map and therefore have to make these stubs into RuntimeStubs 6978 // rather than BufferBlobs. If the compiler needs all registers to 6979 // be preserved between the fault point and the exception handler 6980 // then it must assume responsibility for that in 6981 // AbstractCompiler::continuation_for_implicit_null_exception or 6982 // continuation_for_implicit_division_by_zero_exception. All other 6983 // implicit exceptions (e.g., NullPointerException or 6984 // AbstractMethodError on entry) are either at call sites or 6985 // otherwise assume that stack unwinding will be initiated, so 6986 // caller saved registers were assumed volatile in the compiler. 6987 6988 #undef __ 6989 #define __ masm-> 6990 6991 address generate_throw_exception(const char* name, 6992 address runtime_entry, 6993 Register arg1 = noreg, 6994 Register arg2 = noreg) { 6995 // Information about frame layout at time of blocking runtime call. 6996 // Note that we only have to preserve callee-saved registers since 6997 // the compilers are responsible for supplying a continuation point 6998 // if they expect all registers to be preserved. 6999 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 7000 enum layout { 7001 rfp_off = 0, 7002 rfp_off2, 7003 return_off, 7004 return_off2, 7005 framesize // inclusive of return address 7006 }; 7007 7008 int insts_size = 512; 7009 int locs_size = 64; 7010 7011 CodeBuffer code(name, insts_size, locs_size); 7012 OopMapSet* oop_maps = new OopMapSet(); 7013 MacroAssembler* masm = new MacroAssembler(&code); 7014 7015 address start = __ pc(); 7016 7017 // This is an inlined and slightly modified version of call_VM 7018 // which has the ability to fetch the return PC out of 7019 // thread-local storage and also sets up last_Java_sp slightly 7020 // differently than the real call_VM 7021 7022 __ enter(); // Save FP and LR before call 7023 7024 assert(is_even(framesize/2), "sp not 16-byte aligned"); 7025 7026 // lr and fp are already in place 7027 __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog 7028 7029 int frame_complete = __ pc() - start; 7030 7031 // Set up last_Java_sp and last_Java_fp 7032 address the_pc = __ pc(); 7033 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 7034 7035 // Call runtime 7036 if (arg1 != noreg) { 7037 assert(arg2 != c_rarg1, "clobbered"); 7038 __ mov(c_rarg1, arg1); 7039 } 7040 if (arg2 != noreg) { 7041 __ mov(c_rarg2, arg2); 7042 } 7043 __ mov(c_rarg0, rthread); 7044 BLOCK_COMMENT("call runtime_entry"); 7045 __ mov(rscratch1, runtime_entry); 7046 __ blr(rscratch1); 7047 7048 // Generate oop map 7049 OopMap* map = new OopMap(framesize, 0); 7050 7051 oop_maps->add_gc_map(the_pc - start, map); 7052 7053 __ reset_last_Java_frame(true); 7054 7055 // Reinitialize the ptrue predicate register, in case the external runtime 7056 // call clobbers ptrue reg, as we may return to SVE compiled code. 
7057 __ reinitialize_ptrue(); 7058 7059 __ leave(); 7060 7061 // check for pending exceptions 7062 #ifdef ASSERT 7063 Label L; 7064 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 7065 __ cbnz(rscratch1, L); 7066 __ should_not_reach_here(); 7067 __ bind(L); 7068 #endif // ASSERT 7069 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 7070 7071 // codeBlob framesize is in words (not VMRegImpl::slot_size) 7072 RuntimeStub* stub = 7073 RuntimeStub::new_runtime_stub(name, 7074 &code, 7075 frame_complete, 7076 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7077 oop_maps, false); 7078 return stub->entry_point(); 7079 } 7080 7081 class MontgomeryMultiplyGenerator : public MacroAssembler { 7082 7083 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7084 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7085 7086 RegSet _toSave; 7087 bool _squaring; 7088 7089 public: 7090 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7091 : MacroAssembler(as->code()), _squaring(squaring) { 7092 7093 // Register allocation 7094 7095 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7096 Pa_base = *regs; // Argument registers 7097 if (squaring) 7098 Pb_base = Pa_base; 7099 else 7100 Pb_base = *++regs; 7101 Pn_base = *++regs; 7102 Rlen= *++regs; 7103 inv = *++regs; 7104 Pm_base = *++regs; 7105 7106 // Working registers: 7107 Ra = *++regs; // The current digit of a, b, n, and m. 7108 Rb = *++regs; 7109 Rm = *++regs; 7110 Rn = *++regs; 7111 7112 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 7113 Pb = *++regs; 7114 Pm = *++regs; 7115 Pn = *++regs; 7116 7117 t0 = *++regs; // Three registers which form a 7118 t1 = *++regs; // triple-precision accumuator. 7119 t2 = *++regs; 7120 7121 Ri = *++regs; // Inner and outer loop indexes. 7122 Rj = *++regs; 7123 7124 Rhi_ab = *++regs; // Product registers: low and high parts 7125 Rlo_ab = *++regs; // of a*b and m*n. 7126 Rhi_mn = *++regs; 7127 Rlo_mn = *++regs; 7128 7129 // r19 and up are callee-saved. 
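    // For orientation, a hedged sketch of where the names above land in the
    // non-squaring case (the assignment simply follows the register iterator,
    // so treat this as illustrative rather than authoritative):
    //
    //   r0..r5    Pa_base, Pb_base, Pn_base, Rlen, inv, Pm_base
    //   r6..r9    Ra, Rb, Rm, Rn
    //   r10..r13  Pa, Pb, Pm, Pn
    //   r14..r16  t0, t1, t2
    //   r17, r19  Ri, Rj            (r18 is the platform register and skipped)
    //   r20..r23  Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn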
7130 _toSave = RegSet::range(r19, *regs) + Pm_base; 7131 } 7132 7133 private: 7134 void save_regs() { 7135 push(_toSave, sp); 7136 } 7137 7138 void restore_regs() { 7139 pop(_toSave, sp); 7140 } 7141 7142 template <typename T> 7143 void unroll_2(Register count, T block) { 7144 Label loop, end, odd; 7145 tbnz(count, 0, odd); 7146 cbz(count, end); 7147 align(16); 7148 bind(loop); 7149 (this->*block)(); 7150 bind(odd); 7151 (this->*block)(); 7152 subs(count, count, 2); 7153 br(Assembler::GT, loop); 7154 bind(end); 7155 } 7156 7157 template <typename T> 7158 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 7159 Label loop, end, odd; 7160 tbnz(count, 0, odd); 7161 cbz(count, end); 7162 align(16); 7163 bind(loop); 7164 (this->*block)(d, s, tmp); 7165 bind(odd); 7166 (this->*block)(d, s, tmp); 7167 subs(count, count, 2); 7168 br(Assembler::GT, loop); 7169 bind(end); 7170 } 7171 7172 void pre1(RegisterOrConstant i) { 7173 block_comment("pre1"); 7174 // Pa = Pa_base; 7175 // Pb = Pb_base + i; 7176 // Pm = Pm_base; 7177 // Pn = Pn_base + i; 7178 // Ra = *Pa; 7179 // Rb = *Pb; 7180 // Rm = *Pm; 7181 // Rn = *Pn; 7182 ldr(Ra, Address(Pa_base)); 7183 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7184 ldr(Rm, Address(Pm_base)); 7185 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7186 lea(Pa, Address(Pa_base)); 7187 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7188 lea(Pm, Address(Pm_base)); 7189 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7190 7191 // Zero the m*n result. 7192 mov(Rhi_mn, zr); 7193 mov(Rlo_mn, zr); 7194 } 7195 7196 // The core multiply-accumulate step of a Montgomery 7197 // multiplication. The idea is to schedule operations as a 7198 // pipeline so that instructions with long latencies (loads and 7199 // multiplies) have time to complete before their results are 7200 // used. This most benefits in-order implementations of the 7201 // architecture but out-of-order ones also benefit. 7202 void step() { 7203 block_comment("step"); 7204 // MACC(Ra, Rb, t0, t1, t2); 7205 // Ra = *++Pa; 7206 // Rb = *--Pb; 7207 umulh(Rhi_ab, Ra, Rb); 7208 mul(Rlo_ab, Ra, Rb); 7209 ldr(Ra, pre(Pa, wordSize)); 7210 ldr(Rb, pre(Pb, -wordSize)); 7211 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 7212 // previous iteration. 7213 // MACC(Rm, Rn, t0, t1, t2); 7214 // Rm = *++Pm; 7215 // Rn = *--Pn; 7216 umulh(Rhi_mn, Rm, Rn); 7217 mul(Rlo_mn, Rm, Rn); 7218 ldr(Rm, pre(Pm, wordSize)); 7219 ldr(Rn, pre(Pn, -wordSize)); 7220 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7221 } 7222 7223 void post1() { 7224 block_comment("post1"); 7225 7226 // MACC(Ra, Rb, t0, t1, t2); 7227 // Ra = *++Pa; 7228 // Rb = *--Pb; 7229 umulh(Rhi_ab, Ra, Rb); 7230 mul(Rlo_ab, Ra, Rb); 7231 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7232 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7233 7234 // *Pm = Rm = t0 * inv; 7235 mul(Rm, t0, inv); 7236 str(Rm, Address(Pm)); 7237 7238 // MACC(Rm, Rn, t0, t1, t2); 7239 // t0 = t1; t1 = t2; t2 = 0; 7240 umulh(Rhi_mn, Rm, Rn); 7241 7242 #ifndef PRODUCT 7243 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7244 { 7245 mul(Rlo_mn, Rm, Rn); 7246 add(Rlo_mn, t0, Rlo_mn); 7247 Label ok; 7248 cbz(Rlo_mn, ok); { 7249 stop("broken Montgomery multiply"); 7250 } bind(ok); 7251 } 7252 #endif 7253 // We have very carefully set things up so that 7254 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7255 // the lower half of Rm * Rn because we know the result already: 7256 // it must be -t0. 
t0 + (-t0) must generate a carry iff 7257 // t0 != 0. So, rather than do a mul and an adds we just set 7258 // the carry flag iff t0 is nonzero. 7259 // 7260 // mul(Rlo_mn, Rm, Rn); 7261 // adds(zr, t0, Rlo_mn); 7262 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7263 adcs(t0, t1, Rhi_mn); 7264 adc(t1, t2, zr); 7265 mov(t2, zr); 7266 } 7267 7268 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 7269 block_comment("pre2"); 7270 // Pa = Pa_base + i-len; 7271 // Pb = Pb_base + len; 7272 // Pm = Pm_base + i-len; 7273 // Pn = Pn_base + len; 7274 7275 if (i.is_register()) { 7276 sub(Rj, i.as_register(), len); 7277 } else { 7278 mov(Rj, i.as_constant()); 7279 sub(Rj, Rj, len); 7280 } 7281 // Rj == i-len 7282 7283 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 7284 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 7285 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7286 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 7287 7288 // Ra = *++Pa; 7289 // Rb = *--Pb; 7290 // Rm = *++Pm; 7291 // Rn = *--Pn; 7292 ldr(Ra, pre(Pa, wordSize)); 7293 ldr(Rb, pre(Pb, -wordSize)); 7294 ldr(Rm, pre(Pm, wordSize)); 7295 ldr(Rn, pre(Pn, -wordSize)); 7296 7297 mov(Rhi_mn, zr); 7298 mov(Rlo_mn, zr); 7299 } 7300 7301 void post2(RegisterOrConstant i, RegisterOrConstant len) { 7302 block_comment("post2"); 7303 if (i.is_constant()) { 7304 mov(Rj, i.as_constant()-len.as_constant()); 7305 } else { 7306 sub(Rj, i.as_register(), len); 7307 } 7308 7309 adds(t0, t0, Rlo_mn); // The pending m*n, low part 7310 7311 // As soon as we know the least significant digit of our result, 7312 // store it. 7313 // Pm_base[i-len] = t0; 7314 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7315 7316 // t0 = t1; t1 = t2; t2 = 0; 7317 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 7318 adc(t1, t2, zr); 7319 mov(t2, zr); 7320 } 7321 7322 // A carry in t0 after Montgomery multiplication means that we 7323 // should subtract multiples of n from our result in m. We'll 7324 // keep doing that until there is no carry. 7325 void normalize(RegisterOrConstant len) { 7326 block_comment("normalize"); 7327 // while (t0) 7328 // t0 = sub(Pm_base, Pn_base, t0, len); 7329 Label loop, post, again; 7330 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 7331 cbz(t0, post); { 7332 bind(again); { 7333 mov(i, zr); 7334 mov(cnt, len); 7335 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7336 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7337 subs(zr, zr, zr); // set carry flag, i.e. no borrow 7338 align(16); 7339 bind(loop); { 7340 sbcs(Rm, Rm, Rn); 7341 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7342 add(i, i, 1); 7343 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7344 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7345 sub(cnt, cnt, 1); 7346 } cbnz(cnt, loop); 7347 sbc(t0, t0, zr); 7348 } cbnz(t0, again); 7349 } bind(post); 7350 } 7351 7352 // Move memory at s to d, reversing words. 
7353 // Increments d to end of copied memory 7354 // Destroys tmp1, tmp2 7355 // Preserves len 7356 // Leaves s pointing to the address which was in d at start 7357 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 7358 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 7359 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 7360 7361 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 7362 mov(tmp1, len); 7363 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 7364 sub(s, d, len, ext::uxtw, LogBytesPerWord); 7365 } 7366 // where 7367 void reverse1(Register d, Register s, Register tmp) { 7368 ldr(tmp, pre(s, -wordSize)); 7369 ror(tmp, tmp, 32); 7370 str(tmp, post(d, wordSize)); 7371 } 7372 7373 void step_squaring() { 7374 // An extra ACC 7375 step(); 7376 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7377 } 7378 7379 void last_squaring(RegisterOrConstant i) { 7380 Label dont; 7381 // if ((i & 1) == 0) { 7382 tbnz(i.as_register(), 0, dont); { 7383 // MACC(Ra, Rb, t0, t1, t2); 7384 // Ra = *++Pa; 7385 // Rb = *--Pb; 7386 umulh(Rhi_ab, Ra, Rb); 7387 mul(Rlo_ab, Ra, Rb); 7388 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7389 } bind(dont); 7390 } 7391 7392 void extra_step_squaring() { 7393 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7394 7395 // MACC(Rm, Rn, t0, t1, t2); 7396 // Rm = *++Pm; 7397 // Rn = *--Pn; 7398 umulh(Rhi_mn, Rm, Rn); 7399 mul(Rlo_mn, Rm, Rn); 7400 ldr(Rm, pre(Pm, wordSize)); 7401 ldr(Rn, pre(Pn, -wordSize)); 7402 } 7403 7404 void post1_squaring() { 7405 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7406 7407 // *Pm = Rm = t0 * inv; 7408 mul(Rm, t0, inv); 7409 str(Rm, Address(Pm)); 7410 7411 // MACC(Rm, Rn, t0, t1, t2); 7412 // t0 = t1; t1 = t2; t2 = 0; 7413 umulh(Rhi_mn, Rm, Rn); 7414 7415 #ifndef PRODUCT 7416 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7417 { 7418 mul(Rlo_mn, Rm, Rn); 7419 add(Rlo_mn, t0, Rlo_mn); 7420 Label ok; 7421 cbz(Rlo_mn, ok); { 7422 stop("broken Montgomery multiply"); 7423 } bind(ok); 7424 } 7425 #endif 7426 // We have very carefully set things up so that 7427 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7428 // the lower half of Rm * Rn because we know the result already: 7429 // it must be -t0. t0 + (-t0) must generate a carry iff 7430 // t0 != 0. So, rather than do a mul and an adds we just set 7431 // the carry flag iff t0 is nonzero. 7432 // 7433 // mul(Rlo_mn, Rm, Rn); 7434 // adds(zr, t0, Rlo_mn); 7435 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7436 adcs(t0, t1, Rhi_mn); 7437 adc(t1, t2, zr); 7438 mov(t2, zr); 7439 } 7440 7441 void acc(Register Rhi, Register Rlo, 7442 Register t0, Register t1, Register t2) { 7443 adds(t0, t0, Rlo); 7444 adcs(t1, t1, Rhi); 7445 adc(t2, t2, zr); 7446 } 7447 7448 public: 7449 /** 7450 * Fast Montgomery multiplication. The derivation of the 7451 * algorithm is in A Cryptographic Library for the Motorola 7452 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
7453 * 7454 * Arguments: 7455 * 7456 * Inputs for multiplication: 7457 * c_rarg0 - int array elements a 7458 * c_rarg1 - int array elements b 7459 * c_rarg2 - int array elements n (the modulus) 7460 * c_rarg3 - int length 7461 * c_rarg4 - int inv 7462 * c_rarg5 - int array elements m (the result) 7463 * 7464 * Inputs for squaring: 7465 * c_rarg0 - int array elements a 7466 * c_rarg1 - int array elements n (the modulus) 7467 * c_rarg2 - int length 7468 * c_rarg3 - int inv 7469 * c_rarg4 - int array elements m (the result) 7470 * 7471 */ 7472 address generate_multiply() { 7473 Label argh, nothing; 7474 bind(argh); 7475 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7476 7477 align(CodeEntryAlignment); 7478 address entry = pc(); 7479 7480 cbzw(Rlen, nothing); 7481 7482 enter(); 7483 7484 // Make room. 7485 cmpw(Rlen, 512); 7486 br(Assembler::HI, argh); 7487 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7488 andr(sp, Ra, -2 * wordSize); 7489 7490 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7491 7492 { 7493 // Copy input args, reversing as we go. We use Ra as a 7494 // temporary variable. 7495 reverse(Ra, Pa_base, Rlen, t0, t1); 7496 if (!_squaring) 7497 reverse(Ra, Pb_base, Rlen, t0, t1); 7498 reverse(Ra, Pn_base, Rlen, t0, t1); 7499 } 7500 7501 // Push all call-saved registers and also Pm_base which we'll need 7502 // at the end. 7503 save_regs(); 7504 7505 #ifndef PRODUCT 7506 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 7507 { 7508 ldr(Rn, Address(Pn_base, 0)); 7509 mul(Rlo_mn, Rn, inv); 7510 subs(zr, Rlo_mn, -1); 7511 Label ok; 7512 br(EQ, ok); { 7513 stop("broken inverse in Montgomery multiply"); 7514 } bind(ok); 7515 } 7516 #endif 7517 7518 mov(Pm_base, Ra); 7519 7520 mov(t0, zr); 7521 mov(t1, zr); 7522 mov(t2, zr); 7523 7524 block_comment("for (int i = 0; i < len; i++) {"); 7525 mov(Ri, zr); { 7526 Label loop, end; 7527 cmpw(Ri, Rlen); 7528 br(Assembler::GE, end); 7529 7530 bind(loop); 7531 pre1(Ri); 7532 7533 block_comment(" for (j = i; j; j--) {"); { 7534 movw(Rj, Ri); 7535 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7536 } block_comment(" } // j"); 7537 7538 post1(); 7539 addw(Ri, Ri, 1); 7540 cmpw(Ri, Rlen); 7541 br(Assembler::LT, loop); 7542 bind(end); 7543 block_comment("} // i"); 7544 } 7545 7546 block_comment("for (int i = len; i < 2*len; i++) {"); 7547 mov(Ri, Rlen); { 7548 Label loop, end; 7549 cmpw(Ri, Rlen, Assembler::LSL, 1); 7550 br(Assembler::GE, end); 7551 7552 bind(loop); 7553 pre2(Ri, Rlen); 7554 7555 block_comment(" for (j = len*2-i-1; j; j--) {"); { 7556 lslw(Rj, Rlen, 1); 7557 subw(Rj, Rj, Ri); 7558 subw(Rj, Rj, 1); 7559 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7560 } block_comment(" } // j"); 7561 7562 post2(Ri, Rlen); 7563 addw(Ri, Ri, 1); 7564 cmpw(Ri, Rlen, Assembler::LSL, 1); 7565 br(Assembler::LT, loop); 7566 bind(end); 7567 } 7568 block_comment("} // i"); 7569 7570 normalize(Rlen); 7571 7572 mov(Ra, Pm_base); // Save Pm_base in Ra 7573 restore_regs(); // Restore caller's Pm_base 7574 7575 // Copy our result into caller's Pm_base 7576 reverse(Pm_base, Ra, Rlen, t0, t1); 7577 7578 leave(); 7579 bind(nothing); 7580 ret(lr); 7581 7582 return entry; 7583 } 7584 // In C, approximately: 7585 7586 // void 7587 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 7588 // julong Pn_base[], julong Pm_base[], 7589 // julong inv, int len) { 7590 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7591 // julong *Pa, *Pb, *Pn, *Pm; 7592 // julong Ra, Rb, Rn, Rm; 7593 7594 // 
int i; 7595 7596 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7597 7598 // for (i = 0; i < len; i++) { 7599 // int j; 7600 7601 // Pa = Pa_base; 7602 // Pb = Pb_base + i; 7603 // Pm = Pm_base; 7604 // Pn = Pn_base + i; 7605 7606 // Ra = *Pa; 7607 // Rb = *Pb; 7608 // Rm = *Pm; 7609 // Rn = *Pn; 7610 7611 // int iters = i; 7612 // for (j = 0; iters--; j++) { 7613 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7614 // MACC(Ra, Rb, t0, t1, t2); 7615 // Ra = *++Pa; 7616 // Rb = *--Pb; 7617 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7618 // MACC(Rm, Rn, t0, t1, t2); 7619 // Rm = *++Pm; 7620 // Rn = *--Pn; 7621 // } 7622 7623 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 7624 // MACC(Ra, Rb, t0, t1, t2); 7625 // *Pm = Rm = t0 * inv; 7626 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7627 // MACC(Rm, Rn, t0, t1, t2); 7628 7629 // assert(t0 == 0, "broken Montgomery multiply"); 7630 7631 // t0 = t1; t1 = t2; t2 = 0; 7632 // } 7633 7634 // for (i = len; i < 2*len; i++) { 7635 // int j; 7636 7637 // Pa = Pa_base + i-len; 7638 // Pb = Pb_base + len; 7639 // Pm = Pm_base + i-len; 7640 // Pn = Pn_base + len; 7641 7642 // Ra = *++Pa; 7643 // Rb = *--Pb; 7644 // Rm = *++Pm; 7645 // Rn = *--Pn; 7646 7647 // int iters = len*2-i-1; 7648 // for (j = i-len+1; iters--; j++) { 7649 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7650 // MACC(Ra, Rb, t0, t1, t2); 7651 // Ra = *++Pa; 7652 // Rb = *--Pb; 7653 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7654 // MACC(Rm, Rn, t0, t1, t2); 7655 // Rm = *++Pm; 7656 // Rn = *--Pn; 7657 // } 7658 7659 // Pm_base[i-len] = t0; 7660 // t0 = t1; t1 = t2; t2 = 0; 7661 // } 7662 7663 // while (t0) 7664 // t0 = sub(Pm_base, Pn_base, t0, len); 7665 // } 7666 7667 /** 7668 * Fast Montgomery squaring. This uses asymptotically 25% fewer 7669 * multiplies than Montgomery multiplication so it should be up to 7670 * 25% faster. However, its loop control is more complex and it 7671 * may actually run slower on some machines. 7672 * 7673 * Arguments: 7674 * 7675 * Inputs: 7676 * c_rarg0 - int array elements a 7677 * c_rarg1 - int array elements n (the modulus) 7678 * c_rarg2 - int length 7679 * c_rarg3 - int inv 7680 * c_rarg4 - int array elements m (the result) 7681 * 7682 */ 7683 address generate_square() { 7684 Label argh; 7685 bind(argh); 7686 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7687 7688 align(CodeEntryAlignment); 7689 address entry = pc(); 7690 7691 enter(); 7692 7693 // Make room. 7694 cmpw(Rlen, 512); 7695 br(Assembler::HI, argh); 7696 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7697 andr(sp, Ra, -2 * wordSize); 7698 7699 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7700 7701 { 7702 // Copy input args, reversing as we go. We use Ra as a 7703 // temporary variable. 7704 reverse(Ra, Pa_base, Rlen, t0, t1); 7705 reverse(Ra, Pn_base, Rlen, t0, t1); 7706 } 7707 7708 // Push all call-saved registers and also Pm_base which we'll need 7709 // at the end. 
7710 save_regs(); 7711 7712 mov(Pm_base, Ra); 7713 7714 mov(t0, zr); 7715 mov(t1, zr); 7716 mov(t2, zr); 7717 7718 block_comment("for (int i = 0; i < len; i++) {"); 7719 mov(Ri, zr); { 7720 Label loop, end; 7721 bind(loop); 7722 cmp(Ri, Rlen); 7723 br(Assembler::GE, end); 7724 7725 pre1(Ri); 7726 7727 block_comment("for (j = (i+1)/2; j; j--) {"); { 7728 add(Rj, Ri, 1); 7729 lsr(Rj, Rj, 1); 7730 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 7731 } block_comment(" } // j"); 7732 7733 last_squaring(Ri); 7734 7735 block_comment(" for (j = i/2; j; j--) {"); { 7736 lsr(Rj, Ri, 1); 7737 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 7738 } block_comment(" } // j"); 7739 7740 post1_squaring(); 7741 add(Ri, Ri, 1); 7742 cmp(Ri, Rlen); 7743 br(Assembler::LT, loop); 7744 7745 bind(end); 7746 block_comment("} // i"); 7747 } 7748 7749 block_comment("for (int i = len; i < 2*len; i++) {"); 7750 mov(Ri, Rlen); { 7751 Label loop, end; 7752 bind(loop); 7753 cmp(Ri, Rlen, Assembler::LSL, 1); 7754 br(Assembler::GE, end); 7755 7756 pre2(Ri, Rlen); 7757 7758 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 7759 lsl(Rj, Rlen, 1); 7760 sub(Rj, Rj, Ri); 7761 sub(Rj, Rj, 1); 7762 lsr(Rj, Rj, 1); 7763 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 7764 } block_comment(" } // j"); 7765 7766 last_squaring(Ri); 7767 7768 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 7769 lsl(Rj, Rlen, 1); 7770 sub(Rj, Rj, Ri); 7771 lsr(Rj, Rj, 1); 7772 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 7773 } block_comment(" } // j"); 7774 7775 post2(Ri, Rlen); 7776 add(Ri, Ri, 1); 7777 cmp(Ri, Rlen, Assembler::LSL, 1); 7778 7779 br(Assembler::LT, loop); 7780 bind(end); 7781 block_comment("} // i"); 7782 } 7783 7784 normalize(Rlen); 7785 7786 mov(Ra, Pm_base); // Save Pm_base in Ra 7787 restore_regs(); // Restore caller's Pm_base 7788 7789 // Copy our result into caller's Pm_base 7790 reverse(Pm_base, Ra, Rlen, t0, t1); 7791 7792 leave(); 7793 ret(lr); 7794 7795 return entry; 7796 } 7797 // In C, approximately: 7798 7799 // void 7800 // montgomery_square(julong Pa_base[], julong Pn_base[], 7801 // julong Pm_base[], julong inv, int len) { 7802 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7803 // julong *Pa, *Pb, *Pn, *Pm; 7804 // julong Ra, Rb, Rn, Rm; 7805 7806 // int i; 7807 7808 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7809 7810 // for (i = 0; i < len; i++) { 7811 // int j; 7812 7813 // Pa = Pa_base; 7814 // Pb = Pa_base + i; 7815 // Pm = Pm_base; 7816 // Pn = Pn_base + i; 7817 7818 // Ra = *Pa; 7819 // Rb = *Pb; 7820 // Rm = *Pm; 7821 // Rn = *Pn; 7822 7823 // int iters = (i+1)/2; 7824 // for (j = 0; iters--; j++) { 7825 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 7826 // MACC2(Ra, Rb, t0, t1, t2); 7827 // Ra = *++Pa; 7828 // Rb = *--Pb; 7829 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7830 // MACC(Rm, Rn, t0, t1, t2); 7831 // Rm = *++Pm; 7832 // Rn = *--Pn; 7833 // } 7834 // if ((i & 1) == 0) { 7835 // assert(Ra == Pa_base[j], "must be"); 7836 // MACC(Ra, Ra, t0, t1, t2); 7837 // } 7838 // iters = i/2; 7839 // assert(iters == i-j, "must be"); 7840 // for (; iters--; j++) { 7841 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7842 // MACC(Rm, Rn, t0, t1, t2); 7843 // Rm = *++Pm; 7844 // Rn = *--Pn; 7845 // } 7846 7847 // *Pm = Rm = t0 * inv; 7848 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7849 // MACC(Rm, Rn, t0, t1, t2); 7850 7851 // 
assert(t0 == 0, "broken Montgomery multiply"); 7852 7853 // t0 = t1; t1 = t2; t2 = 0; 7854 // } 7855 7856 // for (i = len; i < 2*len; i++) { 7857 // int start = i-len+1; 7858 // int end = start + (len - start)/2; 7859 // int j; 7860 7861 // Pa = Pa_base + i-len; 7862 // Pb = Pa_base + len; 7863 // Pm = Pm_base + i-len; 7864 // Pn = Pn_base + len; 7865 7866 // Ra = *++Pa; 7867 // Rb = *--Pb; 7868 // Rm = *++Pm; 7869 // Rn = *--Pn; 7870 7871 // int iters = (2*len-i-1)/2; 7872 // assert(iters == end-start, "must be"); 7873 // for (j = start; iters--; j++) { 7874 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 7875 // MACC2(Ra, Rb, t0, t1, t2); 7876 // Ra = *++Pa; 7877 // Rb = *--Pb; 7878 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7879 // MACC(Rm, Rn, t0, t1, t2); 7880 // Rm = *++Pm; 7881 // Rn = *--Pn; 7882 // } 7883 // if ((i & 1) == 0) { 7884 // assert(Ra == Pa_base[j], "must be"); 7885 // MACC(Ra, Ra, t0, t1, t2); 7886 // } 7887 // iters = (2*len-i)/2; 7888 // assert(iters == len-j, "must be"); 7889 // for (; iters--; j++) { 7890 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7891 // MACC(Rm, Rn, t0, t1, t2); 7892 // Rm = *++Pm; 7893 // Rn = *--Pn; 7894 // } 7895 // Pm_base[i-len] = t0; 7896 // t0 = t1; t1 = t2; t2 = 0; 7897 // } 7898 7899 // while (t0) 7900 // t0 = sub(Pm_base, Pn_base, t0, len); 7901 // } 7902 }; 7903 7904 7905 // Initialization 7906 void generate_initial() { 7907 // Generate initial stubs and initializes the entry points 7908 7909 // entry points that exist in all platforms Note: This is code 7910 // that could be shared among different platforms - however the 7911 // benefit seems to be smaller than the disadvantage of having a 7912 // much more complicated generator structure. See also comment in 7913 // stubRoutines.hpp. 7914 7915 StubRoutines::_forward_exception_entry = generate_forward_exception(); 7916 7917 StubRoutines::_call_stub_entry = 7918 generate_call_stub(StubRoutines::_call_stub_return_address); 7919 7920 // is referenced by megamorphic call 7921 StubRoutines::_catch_exception_entry = generate_catch_exception(); 7922 7923 // Build this early so it's available for the interpreter. 
7924 StubRoutines::_throw_StackOverflowError_entry = 7925 generate_throw_exception("StackOverflowError throw_exception", 7926 CAST_FROM_FN_PTR(address, 7927 SharedRuntime::throw_StackOverflowError)); 7928 StubRoutines::_throw_delayed_StackOverflowError_entry = 7929 generate_throw_exception("delayed StackOverflowError throw_exception", 7930 CAST_FROM_FN_PTR(address, 7931 SharedRuntime::throw_delayed_StackOverflowError)); 7932 if (UseCRC32Intrinsics) { 7933 // set table address before stub generation which use it 7934 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 7935 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 7936 } 7937 7938 if (UseCRC32CIntrinsics) { 7939 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 7940 } 7941 7942 // Disabled until JDK-8210858 is fixed 7943 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 7944 // StubRoutines::_dlog = generate_dlog(); 7945 // } 7946 7947 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 7948 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 7949 } 7950 7951 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 7952 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 7953 } 7954 } 7955 7956 void generate_phase1() { 7957 // Continuation stubs: 7958 StubRoutines::_cont_thaw = generate_cont_thaw(); 7959 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 7960 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 7961 7962 JFR_ONLY(StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();) 7963 JFR_ONLY(StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();) 7964 } 7965 7966 void generate_all() { 7967 // support for verify_oop (must happen after universe_init) 7968 if (VerifyOops) { 7969 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 7970 } 7971 StubRoutines::_throw_AbstractMethodError_entry = 7972 generate_throw_exception("AbstractMethodError throw_exception", 7973 CAST_FROM_FN_PTR(address, 7974 SharedRuntime:: 7975 throw_AbstractMethodError)); 7976 7977 StubRoutines::_throw_IncompatibleClassChangeError_entry = 7978 generate_throw_exception("IncompatibleClassChangeError throw_exception", 7979 CAST_FROM_FN_PTR(address, 7980 SharedRuntime:: 7981 throw_IncompatibleClassChangeError)); 7982 7983 StubRoutines::_throw_NullPointerException_at_call_entry = 7984 generate_throw_exception("NullPointerException at call throw_exception", 7985 CAST_FROM_FN_PTR(address, 7986 SharedRuntime:: 7987 throw_NullPointerException_at_call)); 7988 7989 if (UseSVE == 0) { 7990 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices"); 7991 } 7992 7993 // arraycopy stubs used by compilers 7994 generate_arraycopy_stubs(); 7995 7996 // countPositives stub for large arrays. 7997 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long); 7998 7999 // array equals stub for large arrays. 8000 if (!UseSimpleArrayEquals) { 8001 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 8002 } 8003 8004 generate_compare_long_strings(); 8005 8006 generate_string_indexof_stubs(); 8007 8008 // byte_array_inflate stub for large arrays. 
8009 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 8010 8011 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8012 if (bs_nm != NULL) { 8013 StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier(); 8014 } 8015 if (UseFastLocking) { 8016 StubRoutines::aarch64::_check_lock_stack = generate_check_lock_stack(); 8017 } 8018 #ifdef COMPILER2 8019 if (UseMultiplyToLenIntrinsic) { 8020 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 8021 } 8022 8023 if (UseSquareToLenIntrinsic) { 8024 StubRoutines::_squareToLen = generate_squareToLen(); 8025 } 8026 8027 if (UseMulAddIntrinsic) { 8028 StubRoutines::_mulAdd = generate_mulAdd(); 8029 } 8030 8031 if (UseSIMDForBigIntegerShiftIntrinsics) { 8032 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 8033 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 8034 } 8035 8036 if (UseMontgomeryMultiplyIntrinsic) { 8037 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 8038 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 8039 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 8040 } 8041 8042 if (UseMontgomerySquareIntrinsic) { 8043 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 8044 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 8045 // We use generate_multiply() rather than generate_square() 8046 // because it's faster for the sizes of modulus we care about. 8047 StubRoutines::_montgomerySquare = g.generate_multiply(); 8048 } 8049 #endif // COMPILER2 8050 8051 if (UseChaCha20Intrinsics) { 8052 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar(); 8053 } 8054 8055 if (UseBASE64Intrinsics) { 8056 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 8057 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 8058 } 8059 8060 // data cache line writeback 8061 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 8062 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 8063 8064 if (UseAESIntrinsics) { 8065 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 8066 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 8067 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 8068 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 8069 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 8070 } 8071 if (UseGHASHIntrinsics) { 8072 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 8073 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 8074 } 8075 if (UseAESIntrinsics && UseGHASHIntrinsics) { 8076 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 8077 } 8078 8079 if (UseMD5Intrinsics) { 8080 StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress"); 8081 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB"); 8082 } 8083 if (UseSHA1Intrinsics) { 8084 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 8085 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 8086 } 8087 if (UseSHA256Intrinsics) { 8088 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 8089 
StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 8090 } 8091 if (UseSHA512Intrinsics) { 8092 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); 8093 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); 8094 } 8095 if (UseSHA3Intrinsics) { 8096 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress"); 8097 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB"); 8098 } 8099 8100 // generate Adler32 intrinsics code 8101 if (UseAdler32Intrinsics) { 8102 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 8103 } 8104 8105 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 8106 8107 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 8108 8109 generate_atomic_entry_points(); 8110 8111 #endif // LINUX 8112 8113 StubRoutines::aarch64::set_completed(); 8114 } 8115 8116 public: 8117 StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) { 8118 if (phase == 0) { 8119 generate_initial(); 8120 } else if (phase == 1) { 8121 generate_phase1(); // stubs that must be available for the interpreter 8122 } else { 8123 generate_all(); 8124 } 8125 } 8126 }; // end class declaration 8127 8128 #define UCM_TABLE_MAX_ENTRIES 8 8129 void StubGenerator_generate(CodeBuffer* code, int phase) { 8130 if (UnsafeCopyMemory::_table == NULL) { 8131 UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES); 8132 } 8133 StubGenerator g(code, phase); 8134 } 8135 8136 8137 #if defined (LINUX) 8138 8139 // Define pointers to atomic stubs and initialize them to point to the 8140 // code in atomic_aarch64.S. 8141 8142 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 8143 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 8144 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 8145 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 8146 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 8147 8148 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 8149 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 8150 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 8151 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 8152 DEFAULT_ATOMIC_OP(xchg, 4, ) 8153 DEFAULT_ATOMIC_OP(xchg, 8, ) 8154 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 8155 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 8156 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 8157 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 8158 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 8159 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 8160 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 8161 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 8162 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 8163 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 8164 8165 #undef DEFAULT_ATOMIC_OP 8166 8167 #endif // LINUX
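// For reference, a hedged sketch of what one DEFAULT_ATOMIC_OP expansion above
// produces, here for DEFAULT_ATOMIC_OP(fetch_add, 4, ):
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// StubGenerator::generate_atomic_entry_points() then repoints these stub
// pointers at the LSE code generated above when UseLSE is enabled.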