1 /* 2 * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "asm/register.hpp" 30 #include "atomic_aarch64.hpp" 31 #include "compiler/oopMap.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/barrierSetAssembler.hpp" 34 #include "gc/shared/gc_globals.hpp" 35 #include "gc/shared/tlab_globals.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "memory/universe.hpp" 38 #include "nativeInst_aarch64.hpp" 39 #include "oops/instanceOop.hpp" 40 #include "oops/method.hpp" 41 #include "oops/objArrayKlass.hpp" 42 #include "oops/oop.inline.hpp" 43 #include "prims/methodHandles.hpp" 44 #include "runtime/atomic.hpp" 45 #include "runtime/continuation.hpp" 46 #include "runtime/continuationEntry.inline.hpp" 47 #include "runtime/frame.inline.hpp" 48 #include "runtime/handles.inline.hpp" 49 #include "runtime/javaThread.hpp" 50 #include "runtime/sharedRuntime.hpp" 51 #include "runtime/stubCodeGenerator.hpp" 52 #include "runtime/stubRoutines.hpp" 53 #include "utilities/align.hpp" 54 #include "utilities/globalDefinitions.hpp" 55 #include "utilities/powerOfTwo.hpp" 56 #ifdef COMPILER2 57 #include "opto/runtime.hpp" 58 #endif 59 #if INCLUDE_ZGC 60 #include "gc/z/zThreadLocalData.hpp" 61 #endif 62 63 // Declaration and definition of StubGenerator (no .hpp file). 
64 // For a more detailed description of the stub routine structure 65 // see the comment in stubRoutines.hpp 66 67 #undef __ 68 #define __ _masm-> 69 70 #ifdef PRODUCT 71 #define BLOCK_COMMENT(str) /* nothing */ 72 #else 73 #define BLOCK_COMMENT(str) __ block_comment(str) 74 #endif 75 76 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 77 78 // Stub Code definitions 79 80 class StubGenerator: public StubCodeGenerator { 81 private: 82 83 #ifdef PRODUCT 84 #define inc_counter_np(counter) ((void)0) 85 #else 86 void inc_counter_np_(int& counter) { 87 __ lea(rscratch2, ExternalAddress((address)&counter)); 88 __ ldrw(rscratch1, Address(rscratch2)); 89 __ addw(rscratch1, rscratch1, 1); 90 __ strw(rscratch1, Address(rscratch2)); 91 } 92 #define inc_counter_np(counter) \ 93 BLOCK_COMMENT("inc_counter " #counter); \ 94 inc_counter_np_(counter); 95 #endif 96 97 // Call stubs are used to call Java from C 98 // 99 // Arguments: 100 // c_rarg0: call wrapper address address 101 // c_rarg1: result address 102 // c_rarg2: result type BasicType 103 // c_rarg3: method Method* 104 // c_rarg4: (interpreter) entry point address 105 // c_rarg5: parameters intptr_t* 106 // c_rarg6: parameter size (in words) int 107 // c_rarg7: thread Thread* 108 // 109 // There is no return from the stub itself as any Java result 110 // is written to result 111 // 112 // we save r30 (lr) as the return PC at the base of the frame and 113 // link r29 (fp) below it as the frame pointer installing sp (r31) 114 // into fp. 115 // 116 // we save r0-r7, which accounts for all the c arguments. 117 // 118 // TODO: strictly do we need to save them all? they are treated as 119 // volatile by C so could we omit saving the ones we are going to 120 // place in global registers (thread? method?) or those we only use 121 // during setup of the Java call? 122 // 123 // we don't need to save r8 which C uses as an indirect result location 124 // return register. 125 // 126 // we don't need to save r9-r15 which both C and Java treat as 127 // volatile 128 // 129 // we don't need to save r16-18 because Java does not use them 130 // 131 // we save r19-r28 which Java uses as scratch registers and C 132 // expects to be callee-save 133 // 134 // we save the bottom 64 bits of each value stored in v8-v15; it is 135 // the responsibility of the caller to preserve larger values. 136 // 137 // so the stub frame looks like this when we enter Java code 138 // 139 // [ return_from_Java ] <--- sp 140 // [ argument word n ] 141 // ... 
142 // -27 [ argument word 1 ] 143 // -26 [ saved v15 ] <--- sp_after_call 144 // -25 [ saved v14 ] 145 // -24 [ saved v13 ] 146 // -23 [ saved v12 ] 147 // -22 [ saved v11 ] 148 // -21 [ saved v10 ] 149 // -20 [ saved v9 ] 150 // -19 [ saved v8 ] 151 // -18 [ saved r28 ] 152 // -17 [ saved r27 ] 153 // -16 [ saved r26 ] 154 // -15 [ saved r25 ] 155 // -14 [ saved r24 ] 156 // -13 [ saved r23 ] 157 // -12 [ saved r22 ] 158 // -11 [ saved r21 ] 159 // -10 [ saved r20 ] 160 // -9 [ saved r19 ] 161 // -8 [ call wrapper (r0) ] 162 // -7 [ result (r1) ] 163 // -6 [ result type (r2) ] 164 // -5 [ method (r3) ] 165 // -4 [ entry point (r4) ] 166 // -3 [ parameters (r5) ] 167 // -2 [ parameter size (r6) ] 168 // -1 [ thread (r7) ] 169 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 170 // 1 [ saved lr (r30) ] 171 172 // Call stub stack layout word offsets from fp 173 enum call_stub_layout { 174 sp_after_call_off = -26, 175 176 d15_off = -26, 177 d13_off = -24, 178 d11_off = -22, 179 d9_off = -20, 180 181 r28_off = -18, 182 r26_off = -16, 183 r24_off = -14, 184 r22_off = -12, 185 r20_off = -10, 186 call_wrapper_off = -8, 187 result_off = -7, 188 result_type_off = -6, 189 method_off = -5, 190 entry_point_off = -4, 191 parameter_size_off = -2, 192 thread_off = -1, 193 fp_f = 0, 194 retaddr_off = 1, 195 }; 196 197 address generate_call_stub(address& return_address) { 198 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 199 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 200 "adjust this code"); 201 202 StubCodeMark mark(this, "StubRoutines", "call_stub"); 203 address start = __ pc(); 204 205 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 206 207 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 208 const Address result (rfp, result_off * wordSize); 209 const Address result_type (rfp, result_type_off * wordSize); 210 const Address method (rfp, method_off * wordSize); 211 const Address entry_point (rfp, entry_point_off * wordSize); 212 const Address parameter_size(rfp, parameter_size_off * wordSize); 213 214 const Address thread (rfp, thread_off * wordSize); 215 216 const Address d15_save (rfp, d15_off * wordSize); 217 const Address d13_save (rfp, d13_off * wordSize); 218 const Address d11_save (rfp, d11_off * wordSize); 219 const Address d9_save (rfp, d9_off * wordSize); 220 221 const Address r28_save (rfp, r28_off * wordSize); 222 const Address r26_save (rfp, r26_off * wordSize); 223 const Address r24_save (rfp, r24_off * wordSize); 224 const Address r22_save (rfp, r22_off * wordSize); 225 const Address r20_save (rfp, r20_off * wordSize); 226 227 // stub code 228 229 address aarch64_entry = __ pc(); 230 231 // set up frame and move sp to end of save area 232 __ enter(); 233 __ sub(sp, rfp, -sp_after_call_off * wordSize); 234 235 // save register parameters and Java scratch/global registers 236 // n.b. 
we save thread even though it gets installed in 237 // rthread because we want to sanity check rthread later 238 __ str(c_rarg7, thread); 239 __ strw(c_rarg6, parameter_size); 240 __ stp(c_rarg4, c_rarg5, entry_point); 241 __ stp(c_rarg2, c_rarg3, result_type); 242 __ stp(c_rarg0, c_rarg1, call_wrapper); 243 244 __ stp(r20, r19, r20_save); 245 __ stp(r22, r21, r22_save); 246 __ stp(r24, r23, r24_save); 247 __ stp(r26, r25, r26_save); 248 __ stp(r28, r27, r28_save); 249 250 __ stpd(v9, v8, d9_save); 251 __ stpd(v11, v10, d11_save); 252 __ stpd(v13, v12, d13_save); 253 __ stpd(v15, v14, d15_save); 254 255 // install Java thread in global register now we have saved 256 // whatever value it held 257 __ mov(rthread, c_rarg7); 258 // And method 259 __ mov(rmethod, c_rarg3); 260 261 // set up the heapbase register 262 __ reinit_heapbase(); 263 264 #ifdef ASSERT 265 // make sure we have no pending exceptions 266 { 267 Label L; 268 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 269 __ cmp(rscratch1, (u1)NULL_WORD); 270 __ br(Assembler::EQ, L); 271 __ stop("StubRoutines::call_stub: entered with pending exception"); 272 __ BIND(L); 273 } 274 #endif 275 // pass parameters if any 276 __ mov(esp, sp); 277 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 278 __ andr(sp, rscratch1, -2 * wordSize); 279 280 BLOCK_COMMENT("pass parameters if any"); 281 Label parameters_done; 282 // parameter count is still in c_rarg6 283 // and parameter pointer identifying param 1 is in c_rarg5 284 __ cbzw(c_rarg6, parameters_done); 285 286 address loop = __ pc(); 287 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 288 __ subsw(c_rarg6, c_rarg6, 1); 289 __ push(rscratch1); 290 __ br(Assembler::GT, loop); 291 292 __ BIND(parameters_done); 293 294 // call Java entry -- passing methdoOop, and current sp 295 // rmethod: Method* 296 // r19_sender_sp: sender sp 297 BLOCK_COMMENT("call Java function"); 298 __ mov(r19_sender_sp, sp); 299 __ blr(c_rarg4); 300 301 // we do this here because the notify will already have been done 302 // if we get to the next instruction via an exception 303 // 304 // n.b. adding this instruction here affects the calculation of 305 // whether or not a routine returns to the call stub (used when 306 // doing stack walks) since the normal test is to check the return 307 // pc against the address saved below. so we may need to allow for 308 // this extra instruction in the check. 309 310 // save current address for use by exception handling code 311 312 return_address = __ pc(); 313 314 // store result depending on type (everything that is not 315 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 316 // n.b. 
this assumes Java returns an integral result in r0 317 // and a floating result in j_farg0 318 __ ldr(j_rarg2, result); 319 Label is_long, is_float, is_double, exit; 320 __ ldr(j_rarg1, result_type); 321 __ cmp(j_rarg1, (u1)T_OBJECT); 322 __ br(Assembler::EQ, is_long); 323 __ cmp(j_rarg1, (u1)T_LONG); 324 __ br(Assembler::EQ, is_long); 325 __ cmp(j_rarg1, (u1)T_FLOAT); 326 __ br(Assembler::EQ, is_float); 327 __ cmp(j_rarg1, (u1)T_DOUBLE); 328 __ br(Assembler::EQ, is_double); 329 330 // handle T_INT case 331 __ strw(r0, Address(j_rarg2)); 332 333 __ BIND(exit); 334 335 // pop parameters 336 __ sub(esp, rfp, -sp_after_call_off * wordSize); 337 338 #ifdef ASSERT 339 // verify that threads correspond 340 { 341 Label L, S; 342 __ ldr(rscratch1, thread); 343 __ cmp(rthread, rscratch1); 344 __ br(Assembler::NE, S); 345 __ get_thread(rscratch1); 346 __ cmp(rthread, rscratch1); 347 __ br(Assembler::EQ, L); 348 __ BIND(S); 349 __ stop("StubRoutines::call_stub: threads must correspond"); 350 __ BIND(L); 351 } 352 #endif 353 354 __ pop_cont_fastpath(rthread); 355 356 // restore callee-save registers 357 __ ldpd(v15, v14, d15_save); 358 __ ldpd(v13, v12, d13_save); 359 __ ldpd(v11, v10, d11_save); 360 __ ldpd(v9, v8, d9_save); 361 362 __ ldp(r28, r27, r28_save); 363 __ ldp(r26, r25, r26_save); 364 __ ldp(r24, r23, r24_save); 365 __ ldp(r22, r21, r22_save); 366 __ ldp(r20, r19, r20_save); 367 368 __ ldp(c_rarg0, c_rarg1, call_wrapper); 369 __ ldrw(c_rarg2, result_type); 370 __ ldr(c_rarg3, method); 371 __ ldp(c_rarg4, c_rarg5, entry_point); 372 __ ldp(c_rarg6, c_rarg7, parameter_size); 373 374 // leave frame and return to caller 375 __ leave(); 376 __ ret(lr); 377 378 // handle return types different from T_INT 379 380 __ BIND(is_long); 381 __ str(r0, Address(j_rarg2, 0)); 382 __ br(Assembler::AL, exit); 383 384 __ BIND(is_float); 385 __ strs(j_farg0, Address(j_rarg2, 0)); 386 __ br(Assembler::AL, exit); 387 388 __ BIND(is_double); 389 __ strd(j_farg0, Address(j_rarg2, 0)); 390 __ br(Assembler::AL, exit); 391 392 return start; 393 } 394 395 // Return point for a Java call if there's an exception thrown in 396 // Java code. The exception is caught and transformed into a 397 // pending exception stored in JavaThread that can be tested from 398 // within the VM. 399 // 400 // Note: Usually the parameters are removed by the callee. In case 401 // of an exception crossing an activation frame boundary, that is 402 // not the case if the callee is compiled code => need to setup the 403 // rsp. 
404 // 405 // r0: exception oop 406 407 address generate_catch_exception() { 408 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 409 address start = __ pc(); 410 411 // same as in generate_call_stub(): 412 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 413 const Address thread (rfp, thread_off * wordSize); 414 415 #ifdef ASSERT 416 // verify that threads correspond 417 { 418 Label L, S; 419 __ ldr(rscratch1, thread); 420 __ cmp(rthread, rscratch1); 421 __ br(Assembler::NE, S); 422 __ get_thread(rscratch1); 423 __ cmp(rthread, rscratch1); 424 __ br(Assembler::EQ, L); 425 __ bind(S); 426 __ stop("StubRoutines::catch_exception: threads must correspond"); 427 __ bind(L); 428 } 429 #endif 430 431 // set pending exception 432 __ verify_oop(r0); 433 434 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 435 __ mov(rscratch1, (address)__FILE__); 436 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 437 __ movw(rscratch1, (int)__LINE__); 438 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 439 440 // complete return to VM 441 assert(StubRoutines::_call_stub_return_address != nullptr, 442 "_call_stub_return_address must have been generated before"); 443 __ b(StubRoutines::_call_stub_return_address); 444 445 return start; 446 } 447 448 // Continuation point for runtime calls returning with a pending 449 // exception. The pending exception check happened in the runtime 450 // or native call stub. The pending exception in Thread is 451 // converted into a Java-level exception. 452 // 453 // Contract with Java-level exception handlers: 454 // r0: exception 455 // r3: throwing pc 456 // 457 // NOTE: At entry of this stub, exception-pc must be in LR !! 458 459 // NOTE: this is always used as a jump target within generated code 460 // so it just needs to be generated code with no x86 prolog 461 462 address generate_forward_exception() { 463 StubCodeMark mark(this, "StubRoutines", "forward exception"); 464 address start = __ pc(); 465 466 // Upon entry, LR points to the return address returning into 467 // Java (interpreted or compiled) code; i.e., the return address 468 // becomes the throwing pc. 469 // 470 // Arguments pushed before the runtime call are still on the stack 471 // but the exception handler will reset the stack pointer -> 472 // ignore them. A potential result in registers can be ignored as 473 // well. 474 475 #ifdef ASSERT 476 // make sure this code is only executed if there is a pending exception 477 { 478 Label L; 479 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 480 __ cbnz(rscratch1, L); 481 __ stop("StubRoutines::forward exception: no pending exception (1)"); 482 __ bind(L); 483 } 484 #endif 485 486 // compute exception handler into r19 487 488 // call the VM to find the handler address associated with the 489 // caller address. pass thread in r0 and caller pc (ret address) 490 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 491 // the stack. 492 __ mov(c_rarg1, lr); 493 // lr will be trashed by the VM call so we move it to R19 494 // (callee-saved) because we also need to pass it to the handler 495 // returned by this call. 
496 __ mov(r19, lr); 497 BLOCK_COMMENT("call exception_handler_for_return_address"); 498 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 499 SharedRuntime::exception_handler_for_return_address), 500 rthread, c_rarg1); 501 // Reinitialize the ptrue predicate register, in case the external runtime 502 // call clobbers ptrue reg, as we may return to SVE compiled code. 503 __ reinitialize_ptrue(); 504 505 // we should not really care that lr is no longer the callee 506 // address. we saved the value the handler needs in r19 so we can 507 // just copy it to r3. however, the C2 handler will push its own 508 // frame and then calls into the VM and the VM code asserts that 509 // the PC for the frame above the handler belongs to a compiled 510 // Java method. So, we restore lr here to satisfy that assert. 511 __ mov(lr, r19); 512 // setup r0 & r3 & clear pending exception 513 __ mov(r3, r19); 514 __ mov(r19, r0); 515 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 516 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 517 518 #ifdef ASSERT 519 // make sure exception is set 520 { 521 Label L; 522 __ cbnz(r0, L); 523 __ stop("StubRoutines::forward exception: no pending exception (2)"); 524 __ bind(L); 525 } 526 #endif 527 528 // continue at exception handler 529 // r0: exception 530 // r3: throwing pc 531 // r19: exception handler 532 __ verify_oop(r0); 533 __ br(r19); 534 535 return start; 536 } 537 538 // Non-destructive plausibility checks for oops 539 // 540 // Arguments: 541 // r0: oop to verify 542 // rscratch1: error message 543 // 544 // Stack after saving c_rarg3: 545 // [tos + 0]: saved c_rarg3 546 // [tos + 1]: saved c_rarg2 547 // [tos + 2]: saved lr 548 // [tos + 3]: saved rscratch2 549 // [tos + 4]: saved r0 550 // [tos + 5]: saved rscratch1 551 address generate_verify_oop() { 552 553 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 554 address start = __ pc(); 555 556 Label exit, error; 557 558 // save c_rarg2 and c_rarg3 559 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 560 561 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 562 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 563 __ ldr(c_rarg3, Address(c_rarg2)); 564 __ add(c_rarg3, c_rarg3, 1); 565 __ str(c_rarg3, Address(c_rarg2)); 566 567 // object is in r0 568 // make sure object is 'reasonable' 569 __ cbz(r0, exit); // if obj is null it is OK 570 571 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 572 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 573 574 // return if everything seems ok 575 __ bind(exit); 576 577 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 578 __ ret(lr); 579 580 // handle errors 581 __ bind(error); 582 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 583 584 __ push(RegSet::range(r0, r29), sp); 585 // debug(char* msg, int64_t pc, int64_t regs[]) 586 __ mov(c_rarg0, rscratch1); // pass address of error message 587 __ mov(c_rarg1, lr); // pass return address 588 __ mov(c_rarg2, sp); // pass address of regs on stack 589 #ifndef PRODUCT 590 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 591 #endif 592 BLOCK_COMMENT("call MacroAssembler::debug"); 593 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 594 __ blr(rscratch1); 595 __ hlt(0); 596 597 return start; 598 } 599 600 // Generate indices for iota vector. 
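  //
  // The constants emitted below are read as little-endian lane data: for
  // example the two B-lane words 0x0706050403020100 / 0x0F0E0D0C0B0A0908
  // hold bytes 0,1,2,...,15, i.e. lane i contains the value i. A minimal
  // sketch of how such a constant could be built (illustrative only, not
  // part of the generated stub):
  //
  //   uint64_t iota8() {                         // assumes 8-bit lanes
  //     uint64_t v = 0;
  //     for (int i = 7; i >= 0; i--) v = (v << 8) | (uint64_t)i;
  //     return v;                                // 0x0706050403020100
  //   }
  //
  // The trailing S-FP/D-FP words encode the same indices as IEEE-754
  // values (0.0f/1.0f/2.0f/3.0f and 0.0d/1.0d).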
601 address generate_iota_indices(const char *stub_name) { 602 __ align(CodeEntryAlignment); 603 StubCodeMark mark(this, "StubRoutines", stub_name); 604 address start = __ pc(); 605 // B 606 __ emit_data64(0x0706050403020100, relocInfo::none); 607 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 608 // H 609 __ emit_data64(0x0003000200010000, relocInfo::none); 610 __ emit_data64(0x0007000600050004, relocInfo::none); 611 // S 612 __ emit_data64(0x0000000100000000, relocInfo::none); 613 __ emit_data64(0x0000000300000002, relocInfo::none); 614 // D 615 __ emit_data64(0x0000000000000000, relocInfo::none); 616 __ emit_data64(0x0000000000000001, relocInfo::none); 617 // S - FP 618 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 619 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 620 // D - FP 621 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 622 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 623 return start; 624 } 625 626 // The inner part of zero_words(). This is the bulk operation, 627 // zeroing words in blocks, possibly using DC ZVA to do it. The 628 // caller is responsible for zeroing the last few words. 629 // 630 // Inputs: 631 // r10: the HeapWord-aligned base address of an array to zero. 632 // r11: the count in HeapWords, r11 > 0. 633 // 634 // Returns r10 and r11, adjusted for the caller to clear. 635 // r10: the base address of the tail of words left to clear. 636 // r11: the number of words in the tail. 637 // r11 < MacroAssembler::zero_words_block_size. 638 639 address generate_zero_blocks() { 640 Label done; 641 Label base_aligned; 642 643 Register base = r10, cnt = r11; 644 645 __ align(CodeEntryAlignment); 646 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 647 address start = __ pc(); 648 649 if (UseBlockZeroing) { 650 int zva_length = VM_Version::zva_length(); 651 652 // Ensure ZVA length can be divided by 16. This is required by 653 // the subsequent operations. 654 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 655 656 __ tbz(base, 3, base_aligned); 657 __ str(zr, Address(__ post(base, 8))); 658 __ sub(cnt, cnt, 1); 659 __ bind(base_aligned); 660 661 // Ensure count >= zva_length * 2 so that it still deserves a zva after 662 // alignment. 663 Label small; 664 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 665 __ subs(rscratch1, cnt, low_limit >> 3); 666 __ br(Assembler::LT, small); 667 __ zero_dcache_blocks(base, cnt); 668 __ bind(small); 669 } 670 671 { 672 // Number of stp instructions we'll unroll 673 const int unroll = 674 MacroAssembler::zero_words_block_size / 2; 675 // Clear the remaining blocks. 676 Label loop; 677 __ subs(cnt, cnt, unroll * 2); 678 __ br(Assembler::LT, done); 679 __ bind(loop); 680 for (int i = 0; i < unroll; i++) 681 __ stp(zr, zr, __ post(base, 16)); 682 __ subs(cnt, cnt, unroll * 2); 683 __ br(Assembler::GE, loop); 684 __ bind(done); 685 __ add(cnt, cnt, unroll * 2); 686 } 687 688 __ ret(lr); 689 690 return start; 691 } 692 693 694 typedef enum { 695 copy_forwards = 1, 696 copy_backwards = -1 697 } copy_direction; 698 699 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 700 // for arraycopy stubs. 
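  // The helper pre-binds the DecoratorSet, BasicType and scratch registers,
  // so call sites can simply say e.g.
  //
  //   bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  //
  // instead of spelling out the full BarrierSetAssembler::copy_load_at call
  // each time.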
701 class ArrayCopyBarrierSetHelper : StackObj { 702 BarrierSetAssembler* _bs_asm; 703 MacroAssembler* _masm; 704 DecoratorSet _decorators; 705 BasicType _type; 706 Register _gct1; 707 Register _gct2; 708 Register _gct3; 709 FloatRegister _gcvt1; 710 FloatRegister _gcvt2; 711 FloatRegister _gcvt3; 712 713 public: 714 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 715 DecoratorSet decorators, 716 BasicType type, 717 Register gct1, 718 Register gct2, 719 Register gct3, 720 FloatRegister gcvt1, 721 FloatRegister gcvt2, 722 FloatRegister gcvt3) 723 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 724 _masm(masm), 725 _decorators(decorators), 726 _type(type), 727 _gct1(gct1), 728 _gct2(gct2), 729 _gct3(gct3), 730 _gcvt1(gcvt1), 731 _gcvt2(gcvt2), 732 _gcvt3(gcvt3) { 733 } 734 735 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 736 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 737 dst1, dst2, src, 738 _gct1, _gct2, _gcvt1); 739 } 740 741 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 742 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 743 dst, src1, src2, 744 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 745 } 746 747 void copy_load_at_16(Register dst1, Register dst2, Address src) { 748 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 749 dst1, dst2, src, 750 _gct1); 751 } 752 753 void copy_store_at_16(Address dst, Register src1, Register src2) { 754 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 755 dst, src1, src2, 756 _gct1, _gct2, _gct3); 757 } 758 759 void copy_load_at_8(Register dst, Address src) { 760 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 761 dst, noreg, src, 762 _gct1); 763 } 764 765 void copy_store_at_8(Address dst, Register src) { 766 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 767 dst, src, noreg, 768 _gct1, _gct2, _gct3); 769 } 770 }; 771 772 // Bulk copy of blocks of 8 words. 773 // 774 // count is a count of words. 775 // 776 // Precondition: count >= 8 777 // 778 // Postconditions: 779 // 780 // The least significant bit of count contains the remaining count 781 // of words to copy. The rest of count is trash. 782 // 783 // s and d are adjusted to point to the remaining words to copy 784 // 785 void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count, 786 copy_direction direction) { 787 int unit = wordSize * direction; 788 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset, we adjust the destination by -1, which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8}, i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
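      //
      // A worked example of the forwards case (one possible reading of the
      // scheme above, with direction == copy_forwards so unit == wordSize):
      // let S and D be the original source and destination addresses. After
      // the adjustments below (s = S - 16, d = D - 8), one 64-byte block
      // moves as
      //
      //   word at D+ 0          <- S+ 0           (single word, offset 1 * unit)
      //   words at D+ 8, D+16   <- S+ 8, S+16     (pair,        offset 2 * unit)
      //   words at D+24, D+32   <- S+24, S+32     (pair,        offset 4 * unit)
      //   words at D+40, D+48   <- S+40, S+48     (pair,        offset 6 * unit)
      //   word at D+56          <- S+56           (single word, pre-indexed 8 * unit)
      //
      // which is the {0, 1, 3, 5, 7} word-offset store pattern described
      // above, expressed in bytes.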
942 943 if (direction == copy_forwards) { 944 __ sub(s, s, 16); 945 __ sub(d, d, 8); 946 } 947 948 // Fill 8 registers 949 // 950 // for forwards copy s was offset by -16 from the original input 951 // value of s so the register contents are at these offsets 952 // relative to the 64 bit block addressed by that original input 953 // and so on for each successive 64 byte block when s is updated 954 // 955 // t0 at offset 0, t1 at offset 8 956 // t2 at offset 16, t3 at offset 24 957 // t4 at offset 32, t5 at offset 40 958 // t6 at offset 48, t7 at offset 56 959 960 // for backwards copy s was not offset so the register contents 961 // are at these offsets into the preceding 64 byte block 962 // relative to that original input and so on for each successive 963 // preceding 64 byte block when s is updated. this explains the 964 // slightly counter-intuitive looking pattern of register usage 965 // in the stp instructions for backwards copy. 966 // 967 // t0 at offset -16, t1 at offset -8 968 // t2 at offset -32, t3 at offset -24 969 // t4 at offset -48, t5 at offset -40 970 // t6 at offset -64, t7 at offset -56 971 972 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 973 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 974 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 975 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 976 977 __ subs(count, count, 16); 978 __ br(Assembler::LO, drain); 979 980 int prefetch = PrefetchCopyIntervalInBytes; 981 bool use_stride = false; 982 if (direction == copy_backwards) { 983 use_stride = prefetch > 256; 984 prefetch = -prefetch; 985 if (use_stride) __ mov(stride, prefetch); 986 } 987 988 __ bind(again); 989 990 if (PrefetchCopyIntervalInBytes > 0) 991 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 992 993 if (direction == copy_forwards) { 994 // allowing for the offset of -8 the store instructions place 995 // registers into the target 64 bit block at the following 996 // offsets 997 // 998 // t0 at offset 0 999 // t1 at offset 8, t2 at offset 16 1000 // t3 at offset 24, t4 at offset 32 1001 // t5 at offset 40, t6 at offset 48 1002 // t7 at offset 56 1003 1004 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1005 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1006 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1007 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1008 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1009 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1010 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1011 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1012 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1013 } else { 1014 // d was not offset when we started so the registers are 1015 // written into the 64 bit block preceding d with the following 1016 // offsets 1017 // 1018 // t1 at offset -8 1019 // t3 at offset -24, t0 at offset -16 1020 // t5 at offset -48, t2 at offset -32 1021 // t7 at offset -56, t4 at offset -48 1022 // t6 at offset -64 1023 // 1024 // note that this matches the offsets previously noted for the 1025 // loads 1026 1027 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1028 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1029 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1030 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1031 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1032 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1033 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1034 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), 
t6); 1035 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1036 } 1037 1038 __ subs(count, count, 8); 1039 __ br(Assembler::HS, again); 1040 1041 // Drain 1042 // 1043 // this uses the same pattern of offsets and register arguments 1044 // as above 1045 __ bind(drain); 1046 if (direction == copy_forwards) { 1047 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1048 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1049 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1050 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1051 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1052 } else { 1053 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1054 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1055 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1056 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1057 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1058 } 1059 // now we need to copy any remaining part block which may 1060 // include a 4 word block subblock and/or a 2 word subblock. 1061 // bits 2 and 1 in the count are the tell-tale for whether we 1062 // have each such subblock 1063 { 1064 Label L1, L2; 1065 __ tbz(count, exact_log2(4), L1); 1066 // this is the same as above but copying only 4 longs hence 1067 // with only one intervening stp between the str instructions 1068 // but note that the offsets and registers still follow the 1069 // same pattern 1070 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1071 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 1072 if (direction == copy_forwards) { 1073 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1074 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1075 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3); 1076 } else { 1077 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1078 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1079 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2); 1080 } 1081 __ bind(L1); 1082 1083 __ tbz(count, 1, L2); 1084 // this is the same as above but copying only 2 longs hence 1085 // there is no intervening stp between the str instructions 1086 // but note that the offset and register patterns are still 1087 // the same 1088 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit))); 1089 if (direction == copy_forwards) { 1090 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1091 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1); 1092 } else { 1093 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1094 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0); 1095 } 1096 __ bind(L2); 1097 1098 // for forwards copy we need to re-adjust the offsets we 1099 // applied so that s and d are follow the last words written 1100 1101 if (direction == copy_forwards) { 1102 __ add(s, s, 16); 1103 __ add(d, d, 8); 1104 } 1105 1106 } 1107 1108 __ ret(lr); 1109 } 1110 } 1111 1112 // Small copy: less than 16 bytes. 1113 // 1114 // NB: Ignores all of the bits of count which represent more than 15 1115 // bytes, so a caller doesn't have to mask them. 1116 1117 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) { 1118 bool is_backwards = step < 0; 1119 size_t granularity = uabs(step); 1120 int direction = is_backwards ? 
-1 : 1; 1121 1122 Label Lword, Lint, Lshort, Lbyte; 1123 1124 assert(granularity 1125 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1126 1127 const Register t0 = r3; 1128 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1129 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg); 1130 1131 // ??? I don't know if this bit-test-and-branch is the right thing 1132 // to do. It does a lot of jumping, resulting in several 1133 // mispredicted branches. It might make more sense to do this 1134 // with something like Duff's device with a single computed branch. 1135 1136 __ tbz(count, 3 - exact_log2(granularity), Lword); 1137 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1138 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1139 __ bind(Lword); 1140 1141 if (granularity <= sizeof (jint)) { 1142 __ tbz(count, 2 - exact_log2(granularity), Lint); 1143 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1144 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1145 __ bind(Lint); 1146 } 1147 1148 if (granularity <= sizeof (jshort)) { 1149 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1150 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1151 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1152 __ bind(Lshort); 1153 } 1154 1155 if (granularity <= sizeof (jbyte)) { 1156 __ tbz(count, 0, Lbyte); 1157 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1158 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1159 __ bind(Lbyte); 1160 } 1161 } 1162 1163 Label copy_f, copy_b; 1164 Label copy_obj_f, copy_obj_b; 1165 Label copy_obj_uninit_f, copy_obj_uninit_b; 1166 1167 // All-singing all-dancing memory copy. 1168 // 1169 // Copy count units of memory from s to d. The size of a unit is 1170 // step, which can be positive or negative depending on the direction 1171 // of copy. If is_aligned is false, we align the source address. 1172 // 1173 1174 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1175 Register s, Register d, Register count, int step) { 1176 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1177 bool is_backwards = step < 0; 1178 unsigned int granularity = uabs(step); 1179 const Register t0 = r3, t1 = r4; 1180 1181 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1182 // load all the data before writing anything 1183 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1184 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1185 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1186 const Register send = r17, dend = r16; 1187 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1188 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1189 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1190 1191 if (PrefetchCopyIntervalInBytes > 0) 1192 __ prfm(Address(s, 0), PLDL1KEEP); 1193 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)): pointers to arrays of jint are at
      // least 4-byte aligned, and pointers to arrays of jlong are 8-byte
      // aligned. The largest performance drop has been seen for the
      // 65-80 byte range. For such cases, using a pair of ldp/stp instead
      // of a third pair of ldpq/stpq fixes the performance issue.
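      //
      // As a concrete (hedged) reading of the branch below: for a 72-byte
      // jbyte copy (granularity == 1, count == 72), bytes [0, 32) and
      // [32, 64) are covered by the two 32-byte SIMD loads above, and the
      // tail [56, 72) by a single 16-byte ldp/stp pair at send - 16 /
      // dend - 16. The overlap over [56, 64) is harmless because every
      // load has been issued before the first store.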
1249 if (granularity < sizeof (jint)) { 1250 Label copy96; 1251 __ cmp(count, u1(80/granularity)); 1252 __ br(Assembler::HI, copy96); 1253 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1254 1255 bs.copy_store_at_32(Address(d, 0), v0, v1); 1256 bs.copy_store_at_32(Address(d, 32), v2, v3); 1257 1258 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1259 __ b(finish); 1260 1261 __ bind(copy96); 1262 } 1263 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1264 1265 bs.copy_store_at_32(Address(d, 0), v0, v1); 1266 bs.copy_store_at_32(Address(d, 32), v2, v3); 1267 1268 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1269 } else { 1270 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1271 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1272 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1273 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1274 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1275 1276 bs.copy_store_at_16(Address(d, 0), t0, t1); 1277 bs.copy_store_at_16(Address(d, 16), t2, t3); 1278 bs.copy_store_at_16(Address(d, 32), t4, t5); 1279 bs.copy_store_at_16(Address(d, 48), t6, t7); 1280 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1281 } 1282 __ b(finish); 1283 1284 // 0..16 bytes 1285 __ bind(copy16); 1286 __ cmp(count, u1(8/granularity)); 1287 __ br(Assembler::LO, copy8); 1288 1289 // 8..16 bytes 1290 bs.copy_load_at_8(t0, Address(s, 0)); 1291 bs.copy_load_at_8(t1, Address(send, -8)); 1292 bs.copy_store_at_8(Address(d, 0), t0); 1293 bs.copy_store_at_8(Address(dend, -8), t1); 1294 __ b(finish); 1295 1296 if (granularity < 8) { 1297 // 4..7 bytes 1298 __ bind(copy8); 1299 __ tbz(count, 2 - exact_log2(granularity), copy4); 1300 __ ldrw(t0, Address(s, 0)); 1301 __ ldrw(t1, Address(send, -4)); 1302 __ strw(t0, Address(d, 0)); 1303 __ strw(t1, Address(dend, -4)); 1304 __ b(finish); 1305 if (granularity < 4) { 1306 // 0..3 bytes 1307 __ bind(copy4); 1308 __ cbz(count, finish); // get rid of 0 case 1309 if (granularity == 2) { 1310 __ ldrh(t0, Address(s, 0)); 1311 __ strh(t0, Address(d, 0)); 1312 } else { // granularity == 1 1313 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1314 // the first and last byte. 1315 // Handle the 3 byte case by loading and storing base + count/2 1316 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1317 // This does means in the 1 byte case we load/store the same 1318 // byte 3 times. 1319 __ lsr(count, count, 1); 1320 __ ldrb(t0, Address(s, 0)); 1321 __ ldrb(t1, Address(send, -1)); 1322 __ ldrb(t2, Address(s, count)); 1323 __ strb(t0, Address(d, 0)); 1324 __ strb(t1, Address(dend, -1)); 1325 __ strb(t2, Address(d, count)); 1326 } 1327 __ b(finish); 1328 } 1329 } 1330 1331 __ bind(copy_big); 1332 if (is_backwards) { 1333 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1334 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1335 } 1336 1337 // Now we've got the small case out of the way we can align the 1338 // source address on a 2-word boundary. 1339 1340 // Here we will materialize a count in r15, which is used by copy_memory_small 1341 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1342 // Up until here, we have used t9, which aliases r15, but from here on, that register 1343 // can not be used as a temp register, as it contains the count. 1344 1345 Label aligned; 1346 1347 if (is_aligned) { 1348 // We may have to adjust by 1 word to get s 2-word-aligned. 
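      // (Reading of the test below: tbz checks bit 3 of s; if it is clear,
      // s is already on a 16-byte boundary and we branch to 'aligned'.
      // Otherwise a single word is copied so that s and d are stepped by
      // one word in the copy direction and count drops by
      // wordSize/granularity.)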
1349 __ tbz(s, exact_log2(wordSize), aligned); 1350 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1351 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1352 __ sub(count, count, wordSize/granularity); 1353 } else { 1354 if (is_backwards) { 1355 __ andr(r15, s, 2 * wordSize - 1); 1356 } else { 1357 __ neg(r15, s); 1358 __ andr(r15, r15, 2 * wordSize - 1); 1359 } 1360 // r15 is the byte adjustment needed to align s. 1361 __ cbz(r15, aligned); 1362 int shift = exact_log2(granularity); 1363 if (shift) __ lsr(r15, r15, shift); 1364 __ sub(count, count, r15); 1365 1366 #if 0 1367 // ?? This code is only correct for a disjoint copy. It may or 1368 // may not make sense to use it in that case. 1369 1370 // Copy the first pair; s and d may not be aligned. 1371 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1372 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1373 1374 // Align s and d, adjust count 1375 if (is_backwards) { 1376 __ sub(s, s, r15); 1377 __ sub(d, d, r15); 1378 } else { 1379 __ add(s, s, r15); 1380 __ add(d, d, r15); 1381 } 1382 #else 1383 copy_memory_small(decorators, type, s, d, r15, step); 1384 #endif 1385 } 1386 1387 __ bind(aligned); 1388 1389 // s is now 2-word-aligned. 1390 1391 // We have a count of units and some trailing bytes. Adjust the 1392 // count and do a bulk copy of words. 1393 __ lsr(r15, count, exact_log2(wordSize/granularity)); 1394 if (direction == copy_forwards) { 1395 if (type != T_OBJECT) { 1396 __ bl(copy_f); 1397 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1398 __ bl(copy_obj_uninit_f); 1399 } else { 1400 __ bl(copy_obj_f); 1401 } 1402 } else { 1403 if (type != T_OBJECT) { 1404 __ bl(copy_b); 1405 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1406 __ bl(copy_obj_uninit_b); 1407 } else { 1408 __ bl(copy_obj_b); 1409 } 1410 } 1411 1412 // And the tail. 1413 copy_memory_small(decorators, type, s, d, count, step); 1414 1415 if (granularity >= 8) __ bind(copy8); 1416 if (granularity >= 4) __ bind(copy4); 1417 __ bind(finish); 1418 } 1419 1420 1421 void clobber_registers() { 1422 #ifdef ASSERT 1423 RegSet clobbered 1424 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1425 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1426 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1427 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1428 __ mov(*it, rscratch1); 1429 } 1430 #endif 1431 1432 } 1433 1434 // Scan over array at a for count oops, verifying each one. 1435 // Preserves a and count, clobbers rscratch1 and rscratch2. 
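  //
  // Roughly equivalent C sketch (illustrative only; decode/verify stand in
  // for the decode_heap_oop / verify_oop operations used below):
  //
  //   for (uint64_t i = 0; i < count; i++) {
  //     if (size == wordSize) verify(((uintptr_t*)a)[i]);        // full-width oop
  //     else                  verify(decode(((uint32_t*)a)[i])); // narrow oop
  //   }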
1436 void verify_oop_array (int size, Register a, Register count, Register temp) { 1437 Label loop, end; 1438 __ mov(rscratch1, a); 1439 __ mov(rscratch2, zr); 1440 __ bind(loop); 1441 __ cmp(rscratch2, count); 1442 __ br(Assembler::HS, end); 1443 if (size == wordSize) { 1444 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1445 __ verify_oop(temp); 1446 } else { 1447 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1448 __ decode_heap_oop(temp); // calls verify_oop 1449 } 1450 __ add(rscratch2, rscratch2, 1); 1451 __ b(loop); 1452 __ bind(end); 1453 } 1454 1455 // Arguments: 1456 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1457 // ignored 1458 // is_oop - true => oop array, so generate store check code 1459 // name - stub name string 1460 // 1461 // Inputs: 1462 // c_rarg0 - source array address 1463 // c_rarg1 - destination array address 1464 // c_rarg2 - element count, treated as ssize_t, can be zero 1465 // 1466 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1467 // the hardware handle it. The two dwords within qwords that span 1468 // cache line boundaries will still be loaded and stored atomically. 1469 // 1470 // Side Effects: 1471 // disjoint_int_copy_entry is set to the no-overlap entry point 1472 // used by generate_conjoint_int_oop_copy(). 1473 // 1474 address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry, 1475 const char *name, bool dest_uninitialized = false) { 1476 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1477 RegSet saved_reg = RegSet::of(s, d, count); 1478 __ align(CodeEntryAlignment); 1479 StubCodeMark mark(this, "StubRoutines", name); 1480 address start = __ pc(); 1481 __ enter(); 1482 1483 if (entry != nullptr) { 1484 *entry = __ pc(); 1485 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1486 BLOCK_COMMENT("Entry:"); 1487 } 1488 1489 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1490 if (dest_uninitialized) { 1491 decorators |= IS_DEST_UNINITIALIZED; 1492 } 1493 if (aligned) { 1494 decorators |= ARRAYCOPY_ALIGNED; 1495 } 1496 1497 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1498 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1499 1500 if (is_oop) { 1501 // save regs before copy_memory 1502 __ push(RegSet::of(d, count), sp); 1503 } 1504 { 1505 // UnsafeCopyMemory page error: continue after ucm 1506 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1507 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1508 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1509 } 1510 1511 if (is_oop) { 1512 __ pop(RegSet::of(d, count), sp); 1513 if (VerifyOops) 1514 verify_oop_array(size, d, count, r16); 1515 } 1516 1517 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1518 1519 __ leave(); 1520 __ mov(r0, zr); // return 0 1521 __ ret(lr); 1522 return start; 1523 } 1524 1525 // Arguments: 1526 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1527 // ignored 1528 // is_oop - true => oop array, so generate store check code 1529 // name - stub name string 1530 // 1531 // Inputs: 1532 // c_rarg0 - source array address 1533 // c_rarg1 - destination array address 1534 // c_rarg2 - element count, treated as ssize_t, can be zero 1535 // 1536 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1537 // the hardware handle it. 
The two dwords within qwords that span 1538 // cache line boundaries will still be loaded and stored atomically. 1539 // 1540 address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target, 1541 address *entry, const char *name, 1542 bool dest_uninitialized = false) { 1543 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1544 RegSet saved_regs = RegSet::of(s, d, count); 1545 StubCodeMark mark(this, "StubRoutines", name); 1546 address start = __ pc(); 1547 __ enter(); 1548 1549 if (entry != nullptr) { 1550 *entry = __ pc(); 1551 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1552 BLOCK_COMMENT("Entry:"); 1553 } 1554 1555 // use fwd copy when (d-s) above_equal (count*size) 1556 __ sub(rscratch1, d, s); 1557 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1558 __ br(Assembler::HS, nooverlap_target); 1559 1560 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1561 if (dest_uninitialized) { 1562 decorators |= IS_DEST_UNINITIALIZED; 1563 } 1564 if (aligned) { 1565 decorators |= ARRAYCOPY_ALIGNED; 1566 } 1567 1568 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1569 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1570 1571 if (is_oop) { 1572 // save regs before copy_memory 1573 __ push(RegSet::of(d, count), sp); 1574 } 1575 { 1576 // UnsafeCopyMemory page error: continue after ucm 1577 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1578 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1579 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1580 } 1581 if (is_oop) { 1582 __ pop(RegSet::of(d, count), sp); 1583 if (VerifyOops) 1584 verify_oop_array(size, d, count, r16); 1585 } 1586 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1587 __ leave(); 1588 __ mov(r0, zr); // return 0 1589 __ ret(lr); 1590 return start; 1591 } 1592 1593 // Arguments: 1594 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1595 // ignored 1596 // name - stub name string 1597 // 1598 // Inputs: 1599 // c_rarg0 - source array address 1600 // c_rarg1 - destination array address 1601 // c_rarg2 - element count, treated as ssize_t, can be zero 1602 // 1603 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1604 // we let the hardware handle it. The one to eight bytes within words, 1605 // dwords or qwords that span cache line boundaries will still be loaded 1606 // and stored atomically. 1607 // 1608 // Side Effects: 1609 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1610 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1611 // we let the hardware handle it. The one to eight bytes within words, 1612 // dwords or qwords that span cache line boundaries will still be loaded 1613 // and stored atomically. 1614 // 1615 // Side Effects: 1616 // disjoint_byte_copy_entry is set to the no-overlap entry point 1617 // used by generate_conjoint_byte_copy(). 
1618 // 1619 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1620 const bool not_oop = false; 1621 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1622 } 1623 1624 // Arguments: 1625 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1626 // ignored 1627 // name - stub name string 1628 // 1629 // Inputs: 1630 // c_rarg0 - source array address 1631 // c_rarg1 - destination array address 1632 // c_rarg2 - element count, treated as ssize_t, can be zero 1633 // 1634 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1635 // we let the hardware handle it. The one to eight bytes within words, 1636 // dwords or qwords that span cache line boundaries will still be loaded 1637 // and stored atomically. 1638 // 1639 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1640 address* entry, const char *name) { 1641 const bool not_oop = false; 1642 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1643 } 1644 1645 // Arguments: 1646 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1647 // ignored 1648 // name - stub name string 1649 // 1650 // Inputs: 1651 // c_rarg0 - source array address 1652 // c_rarg1 - destination array address 1653 // c_rarg2 - element count, treated as ssize_t, can be zero 1654 // 1655 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1656 // let the hardware handle it. The two or four words within dwords 1657 // or qwords that span cache line boundaries will still be loaded 1658 // and stored atomically. 1659 // 1660 // Side Effects: 1661 // disjoint_short_copy_entry is set to the no-overlap entry point 1662 // used by generate_conjoint_short_copy(). 1663 // 1664 address generate_disjoint_short_copy(bool aligned, 1665 address* entry, const char *name) { 1666 const bool not_oop = false; 1667 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1668 } 1669 1670 // Arguments: 1671 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1672 // ignored 1673 // name - stub name string 1674 // 1675 // Inputs: 1676 // c_rarg0 - source array address 1677 // c_rarg1 - destination array address 1678 // c_rarg2 - element count, treated as ssize_t, can be zero 1679 // 1680 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1681 // let the hardware handle it. The two or four words within dwords 1682 // or qwords that span cache line boundaries will still be loaded 1683 // and stored atomically. 1684 // 1685 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1686 address *entry, const char *name) { 1687 const bool not_oop = false; 1688 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1689 1690 } 1691 // Arguments: 1692 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1693 // ignored 1694 // name - stub name string 1695 // 1696 // Inputs: 1697 // c_rarg0 - source array address 1698 // c_rarg1 - destination array address 1699 // c_rarg2 - element count, treated as ssize_t, can be zero 1700 // 1701 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1702 // the hardware handle it. The two dwords within qwords that span 1703 // cache line boundaries will still be loaded and stored atomically. 
1704 // 1705 // Side Effects: 1706 // disjoint_int_copy_entry is set to the no-overlap entry point 1707 // used by generate_conjoint_int_oop_copy(). 1708 // 1709 address generate_disjoint_int_copy(bool aligned, address *entry, 1710 const char *name, bool dest_uninitialized = false) { 1711 const bool not_oop = false; 1712 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1713 } 1714 1715 // Arguments: 1716 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1717 // ignored 1718 // name - stub name string 1719 // 1720 // Inputs: 1721 // c_rarg0 - source array address 1722 // c_rarg1 - destination array address 1723 // c_rarg2 - element count, treated as ssize_t, can be zero 1724 // 1725 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1726 // the hardware handle it. The two dwords within qwords that span 1727 // cache line boundaries will still be loaded and stored atomically. 1728 // 1729 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1730 address *entry, const char *name, 1731 bool dest_uninitialized = false) { 1732 const bool not_oop = false; 1733 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1734 } 1735 1736 1737 // Arguments: 1738 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1739 // ignored 1740 // name - stub name string 1741 // 1742 // Inputs: 1743 // c_rarg0 - source array address 1744 // c_rarg1 - destination array address 1745 // c_rarg2 - element count, treated as size_t, can be zero 1746 // 1747 // Side Effects: 1748 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1749 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1750 // 1751 address generate_disjoint_long_copy(bool aligned, address *entry, 1752 const char *name, bool dest_uninitialized = false) { 1753 const bool not_oop = false; 1754 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1755 } 1756 1757 // Arguments: 1758 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1759 // ignored 1760 // name - stub name string 1761 // 1762 // Inputs: 1763 // c_rarg0 - source array address 1764 // c_rarg1 - destination array address 1765 // c_rarg2 - element count, treated as size_t, can be zero 1766 // 1767 address generate_conjoint_long_copy(bool aligned, 1768 address nooverlap_target, address *entry, 1769 const char *name, bool dest_uninitialized = false) { 1770 const bool not_oop = false; 1771 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1772 } 1773 1774 // Arguments: 1775 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1776 // ignored 1777 // name - stub name string 1778 // 1779 // Inputs: 1780 // c_rarg0 - source array address 1781 // c_rarg1 - destination array address 1782 // c_rarg2 - element count, treated as size_t, can be zero 1783 // 1784 // Side Effects: 1785 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1786 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1787 // 1788 address generate_disjoint_oop_copy(bool aligned, address *entry, 1789 const char *name, bool dest_uninitialized) { 1790 const bool is_oop = true; 1791 const int size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1792 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1793 } 1794 1795 // Arguments: 1796 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1797 // ignored 1798 // name - stub name string 1799 // 1800 // Inputs: 1801 // c_rarg0 - source array address 1802 // c_rarg1 - destination array address 1803 // c_rarg2 - element count, treated as size_t, can be zero 1804 // 1805 address generate_conjoint_oop_copy(bool aligned, 1806 address nooverlap_target, address *entry, 1807 const char *name, bool dest_uninitialized) { 1808 const bool is_oop = true; 1809 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1810 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1811 name, dest_uninitialized); 1812 } 1813 1814 1815 // Helper for generating a dynamic type check. 1816 // Smashes rscratch1, rscratch2. 1817 void generate_type_check(Register sub_klass, 1818 Register super_check_offset, 1819 Register super_klass, 1820 Label& L_success) { 1821 assert_different_registers(sub_klass, super_check_offset, super_klass); 1822 1823 BLOCK_COMMENT("type_check:"); 1824 1825 Label L_miss; 1826 1827 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1828 super_check_offset); 1829 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr); 1830 1831 // Fall through on failure! 1832 __ BIND(L_miss); 1833 } 1834 1835 // 1836 // Generate checkcasting array copy stub 1837 // 1838 // Input: 1839 // c_rarg0 - source array address 1840 // c_rarg1 - destination array address 1841 // c_rarg2 - element count, treated as ssize_t, can be zero 1842 // c_rarg3 - size_t ckoff (super_check_offset) 1843 // c_rarg4 - oop ckval (super_klass) 1844 // 1845 // Output: 1846 // r0 == 0 - success 1847 // r0 == -1^K - failure, where K is partial transfer count 1848 // 1849 address generate_checkcast_copy(const char *name, address *entry, 1850 bool dest_uninitialized = false) { 1851 1852 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1853 1854 // Input registers (after setup_arg_regs) 1855 const Register from = c_rarg0; // source array address 1856 const Register to = c_rarg1; // destination array address 1857 const Register count = c_rarg2; // elementscount 1858 const Register ckoff = c_rarg3; // super_check_offset 1859 const Register ckval = c_rarg4; // super_klass 1860 1861 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1862 RegSet wb_post_saved_regs = RegSet::of(count); 1863 1864 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1865 const Register copied_oop = r22; // actual oop copied 1866 const Register count_save = r21; // orig elementscount 1867 const Register start_to = r20; // destination array start address 1868 const Register r19_klass = r19; // oop._klass 1869 1870 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1871 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1872 1873 //--------------------------------------------------------------- 1874 // Assembler stub will be used for this call to arraycopy 1875 // if the two arrays are subtypes of Object[] but the 1876 // destination array type is not equal to or a supertype 1877 // of the source type. Each element must be separately 1878 // checked. 
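    // (Added commentary, illustrative only: this is the case reached when,
    //  for example, elements of an Object[] are copied into an array whose
    //  dynamic type is a narrower subtype such as String[]; every element
    //  must pass a subtype check against the destination element klass
    //  before it may be stored.)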
1879 1880 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1881 copied_oop, r19_klass, count_save); 1882 1883 __ align(CodeEntryAlignment); 1884 StubCodeMark mark(this, "StubRoutines", name); 1885 address start = __ pc(); 1886 1887 __ enter(); // required for proper stackwalking of RuntimeStub frame 1888 1889 #ifdef ASSERT 1890 // caller guarantees that the arrays really are different 1891 // otherwise, we would have to make conjoint checks 1892 { Label L; 1893 __ b(L); // conjoint check not yet implemented 1894 __ stop("checkcast_copy within a single array"); 1895 __ bind(L); 1896 } 1897 #endif //ASSERT 1898 1899 // Caller of this entry point must set up the argument registers. 1900 if (entry != nullptr) { 1901 *entry = __ pc(); 1902 BLOCK_COMMENT("Entry:"); 1903 } 1904 1905 // Empty array: Nothing to do. 1906 __ cbz(count, L_done); 1907 __ push(RegSet::of(r19, r20, r21, r22), sp); 1908 1909 #ifdef ASSERT 1910 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1911 // The ckoff and ckval must be mutually consistent, 1912 // even though caller generates both. 1913 { Label L; 1914 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1915 __ ldrw(start_to, Address(ckval, sco_offset)); 1916 __ cmpw(ckoff, start_to); 1917 __ br(Assembler::EQ, L); 1918 __ stop("super_check_offset inconsistent"); 1919 __ bind(L); 1920 } 1921 #endif //ASSERT 1922 1923 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1924 bool is_oop = true; 1925 int element_size = UseCompressedOops ? 4 : 8; 1926 if (dest_uninitialized) { 1927 decorators |= IS_DEST_UNINITIALIZED; 1928 } 1929 1930 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1931 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1932 1933 // save the original count 1934 __ mov(count_save, count); 1935 1936 // Copy from low to high addresses 1937 __ mov(start_to, to); // Save destination array start address 1938 __ b(L_load_element); 1939 1940 // ======== begin loop ======== 1941 // (Loop is rotated; its entry is L_load_element.) 1942 // Loop control: 1943 // for (; count != 0; count--) { 1944 // copied_oop = load_heap_oop(from++); 1945 // ... generate_type_check ...; 1946 // store_heap_oop(to++, copied_oop); 1947 // } 1948 __ align(OptoLoopAlignment); 1949 1950 __ BIND(L_store_element); 1951 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1952 __ post(to, element_size), copied_oop, noreg, 1953 gct1, gct2, gct3); 1954 __ sub(count, count, 1); 1955 __ cbz(count, L_do_card_marks); 1956 1957 // ======== loop entry is here ======== 1958 __ BIND(L_load_element); 1959 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1960 copied_oop, noreg, __ post(from, element_size), 1961 gct1); 1962 __ cbz(copied_oop, L_store_element); 1963 1964 __ load_klass(r19_klass, copied_oop);// query the object klass 1965 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1966 // ======== end loop ======== 1967 1968 // It was a real error; we must depend on the caller to finish the job. 1969 // Register count = remaining oops, count_orig = total oops. 1970 // Emit GC store barriers for the oops we have copied and report 1971 // their number to the caller. 
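    // Illustrative sketch (added commentary) of the failure encoding
    // computed below, where K is the number of elements copied before the
    // failing type check:
    //
    //   K  = count_save - count;   // elements successfully copied
    //   r0 = ~K;                   // i.e. -1 ^ K, never zero; r0 == 0 is
    //                              // produced only on the full-success path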
1972 1973 __ subs(count, count_save, count); // K = partially copied oop count 1974 __ eon(count, count, zr); // report (-1^K) to caller 1975 __ br(Assembler::EQ, L_done_pop); 1976 1977 __ BIND(L_do_card_marks); 1978 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1979 1980 __ bind(L_done_pop); 1981 __ pop(RegSet::of(r19, r20, r21, r22), sp); 1982 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1983 1984 __ bind(L_done); 1985 __ mov(r0, count); 1986 __ leave(); 1987 __ ret(lr); 1988 1989 return start; 1990 } 1991 1992 // Perform range checks on the proposed arraycopy. 1993 // Kills temp, but nothing else. 1994 // Also, clean the sign bits of src_pos and dst_pos. 1995 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1996 Register src_pos, // source position (c_rarg1) 1997 Register dst, // destination array oo (c_rarg2) 1998 Register dst_pos, // destination position (c_rarg3) 1999 Register length, 2000 Register temp, 2001 Label& L_failed) { 2002 BLOCK_COMMENT("arraycopy_range_checks:"); 2003 2004 assert_different_registers(rscratch1, temp); 2005 2006 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2007 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 2008 __ addw(temp, length, src_pos); 2009 __ cmpw(temp, rscratch1); 2010 __ br(Assembler::HI, L_failed); 2011 2012 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2013 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2014 __ addw(temp, length, dst_pos); 2015 __ cmpw(temp, rscratch1); 2016 __ br(Assembler::HI, L_failed); 2017 2018 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 2019 __ movw(src_pos, src_pos); 2020 __ movw(dst_pos, dst_pos); 2021 2022 BLOCK_COMMENT("arraycopy_range_checks done"); 2023 } 2024 2025 // These stubs get called from some dumb test routine. 2026 // I'll write them properly when they're called from 2027 // something that's actually doing something. 2028 static void fake_arraycopy_stub(address src, address dst, int count) { 2029 assert(count == 0, "huh?"); 2030 } 2031 2032 2033 // 2034 // Generate 'unsafe' array copy stub 2035 // Though just as safe as the other stubs, it takes an unscaled 2036 // size_t argument instead of an element count. 2037 // 2038 // Input: 2039 // c_rarg0 - source array address 2040 // c_rarg1 - destination array address 2041 // c_rarg2 - byte count, treated as ssize_t, can be zero 2042 // 2043 // Examines the alignment of the operands and dispatches 2044 // to a long, int, short, or byte copy loop. 
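  // Roughly equivalent C sketch of that dispatch (added commentary; the
  // label names here are illustrative, the generated code uses
  // L_long_aligned and friends):
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if ((bits & (BytesPerLong - 1)) == 0) goto long_copy;   // count >>= 3
  //   if ((bits & (BytesPerInt  - 1)) == 0) goto int_copy;    // count >>= 2
  //   if ((bits & 1) == 0)                  goto short_copy;  // count >>= 1
  //   goto byte_copy;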
2045 // 2046 address generate_unsafe_copy(const char *name, 2047 address byte_copy_entry, 2048 address short_copy_entry, 2049 address int_copy_entry, 2050 address long_copy_entry) { 2051 Label L_long_aligned, L_int_aligned, L_short_aligned; 2052 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2053 2054 __ align(CodeEntryAlignment); 2055 StubCodeMark mark(this, "StubRoutines", name); 2056 address start = __ pc(); 2057 __ enter(); // required for proper stackwalking of RuntimeStub frame 2058 2059 // bump this on entry, not on exit: 2060 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2061 2062 __ orr(rscratch1, s, d); 2063 __ orr(rscratch1, rscratch1, count); 2064 2065 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2066 __ cbz(rscratch1, L_long_aligned); 2067 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2068 __ cbz(rscratch1, L_int_aligned); 2069 __ tbz(rscratch1, 0, L_short_aligned); 2070 __ b(RuntimeAddress(byte_copy_entry)); 2071 2072 __ BIND(L_short_aligned); 2073 __ lsr(count, count, LogBytesPerShort); // size => short_count 2074 __ b(RuntimeAddress(short_copy_entry)); 2075 __ BIND(L_int_aligned); 2076 __ lsr(count, count, LogBytesPerInt); // size => int_count 2077 __ b(RuntimeAddress(int_copy_entry)); 2078 __ BIND(L_long_aligned); 2079 __ lsr(count, count, LogBytesPerLong); // size => long_count 2080 __ b(RuntimeAddress(long_copy_entry)); 2081 2082 return start; 2083 } 2084 2085 // 2086 // Generate generic array copy stubs 2087 // 2088 // Input: 2089 // c_rarg0 - src oop 2090 // c_rarg1 - src_pos (32-bits) 2091 // c_rarg2 - dst oop 2092 // c_rarg3 - dst_pos (32-bits) 2093 // c_rarg4 - element count (32-bits) 2094 // 2095 // Output: 2096 // r0 == 0 - success 2097 // r0 == -1^K - failure, where K is partial transfer count 2098 // 2099 address generate_generic_copy(const char *name, 2100 address byte_copy_entry, address short_copy_entry, 2101 address int_copy_entry, address oop_copy_entry, 2102 address long_copy_entry, address checkcast_copy_entry) { 2103 2104 Label L_failed, L_objArray; 2105 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2106 2107 // Input registers 2108 const Register src = c_rarg0; // source array oop 2109 const Register src_pos = c_rarg1; // source position 2110 const Register dst = c_rarg2; // destination array oop 2111 const Register dst_pos = c_rarg3; // destination position 2112 const Register length = c_rarg4; 2113 2114 2115 // Registers used as temps 2116 const Register dst_klass = c_rarg5; 2117 2118 __ align(CodeEntryAlignment); 2119 2120 StubCodeMark mark(this, "StubRoutines", name); 2121 2122 address start = __ pc(); 2123 2124 __ enter(); // required for proper stackwalking of RuntimeStub frame 2125 2126 // bump this on entry, not on exit: 2127 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2128 2129 //----------------------------------------------------------------------- 2130 // Assembler stub will be used for this call to arraycopy 2131 // if the following conditions are met: 2132 // 2133 // (1) src and dst must not be null. 2134 // (2) src_pos must not be negative. 2135 // (3) dst_pos must not be negative. 2136 // (4) length must not be negative. 2137 // (5) src klass and dst klass should be the same and not null. 2138 // (6) src and dst should be arrays. 2139 // (7) src_pos + length must not exceed length of src. 2140 // (8) dst_pos + length must not exceed length of dst. 
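  // (Added commentary: the null, sign and length tests for (1)-(4) are
  //  emitted explicitly below; (5)-(8) are established by the klass and
  //  layout-helper comparisons and by arraycopy_range_checks(). Any
  //  violation branches to L_failed, which returns -1 to the caller.)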
2141 // 2142 2143 // if (src == nullptr) return -1; 2144 __ cbz(src, L_failed); 2145 2146 // if (src_pos < 0) return -1; 2147 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2148 2149 // if (dst == nullptr) return -1; 2150 __ cbz(dst, L_failed); 2151 2152 // if (dst_pos < 0) return -1; 2153 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2154 2155 // registers used as temp 2156 const Register scratch_length = r16; // elements count to copy 2157 const Register scratch_src_klass = r17; // array klass 2158 const Register lh = r15; // layout helper 2159 2160 // if (length < 0) return -1; 2161 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2162 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2163 2164 __ load_klass(scratch_src_klass, src); 2165 #ifdef ASSERT 2166 // assert(src->klass() != nullptr); 2167 { 2168 BLOCK_COMMENT("assert klasses not null {"); 2169 Label L1, L2; 2170 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2171 __ bind(L1); 2172 __ stop("broken null klass"); 2173 __ bind(L2); 2174 __ load_klass(rscratch1, dst); 2175 __ cbz(rscratch1, L1); // this would be broken also 2176 BLOCK_COMMENT("} assert klasses not null done"); 2177 } 2178 #endif 2179 2180 // Load layout helper (32-bits) 2181 // 2182 // |array_tag| | header_size | element_type | |log2_element_size| 2183 // 32 30 24 16 8 2 0 2184 // 2185 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2186 // 2187 2188 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2189 2190 // Handle objArrays completely differently... 2191 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2192 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2193 __ movw(rscratch1, objArray_lh); 2194 __ eorw(rscratch2, lh, rscratch1); 2195 __ cbzw(rscratch2, L_objArray); 2196 2197 // if (src->klass() != dst->klass()) return -1; 2198 __ load_klass(rscratch2, dst); 2199 __ eor(rscratch2, rscratch2, scratch_src_klass); 2200 __ cbnz(rscratch2, L_failed); 2201 2202 // if (!src->is_Array()) return -1; 2203 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2204 2205 // At this point, it is known to be a typeArray (array_tag 0x3). 
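    // Illustrative decoding of the layout helper fields used below (added
    // commentary; Klass::layout_helper_header_size() and
    // Klass::layout_helper_log2_element_size() are the authoritative helpers):
    //
    //   int hsize      = (lh >> _lh_header_size_shift)       & _lh_header_size_mask;
    //   int log2_esize = (lh >> _lh_log2_element_size_shift) & _lh_log2_element_size_mask;
    //
    // hsize is the array header size in bytes (the offset of element 0) and
    // log2_esize selects the byte/short/int/long copy loop.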
2206 #ifdef ASSERT 2207 { 2208 BLOCK_COMMENT("assert primitive array {"); 2209 Label L; 2210 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2211 __ cmpw(lh, rscratch2); 2212 __ br(Assembler::GE, L); 2213 __ stop("must be a primitive array"); 2214 __ bind(L); 2215 BLOCK_COMMENT("} assert primitive array done"); 2216 } 2217 #endif 2218 2219 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2220 rscratch2, L_failed); 2221 2222 // TypeArrayKlass 2223 // 2224 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2225 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2226 // 2227 2228 const Register rscratch1_offset = rscratch1; // array offset 2229 const Register r15_elsize = lh; // element size 2230 2231 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2232 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2233 __ add(src, src, rscratch1_offset); // src array offset 2234 __ add(dst, dst, rscratch1_offset); // dst array offset 2235 BLOCK_COMMENT("choose copy loop based on element size"); 2236 2237 // next registers should be set before the jump to corresponding stub 2238 const Register from = c_rarg0; // source array address 2239 const Register to = c_rarg1; // destination array address 2240 const Register count = c_rarg2; // elements count 2241 2242 // 'from', 'to', 'count' registers should be set in such order 2243 // since they are the same as 'src', 'src_pos', 'dst'. 2244 2245 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2246 2247 // The possible values of elsize are 0-3, i.e. exact_log2(element 2248 // size in bytes). We do a simple bitwise binary search. 2249 __ BIND(L_copy_bytes); 2250 __ tbnz(r15_elsize, 1, L_copy_ints); 2251 __ tbnz(r15_elsize, 0, L_copy_shorts); 2252 __ lea(from, Address(src, src_pos));// src_addr 2253 __ lea(to, Address(dst, dst_pos));// dst_addr 2254 __ movw(count, scratch_length); // length 2255 __ b(RuntimeAddress(byte_copy_entry)); 2256 2257 __ BIND(L_copy_shorts); 2258 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2259 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2260 __ movw(count, scratch_length); // length 2261 __ b(RuntimeAddress(short_copy_entry)); 2262 2263 __ BIND(L_copy_ints); 2264 __ tbnz(r15_elsize, 0, L_copy_longs); 2265 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2266 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2267 __ movw(count, scratch_length); // length 2268 __ b(RuntimeAddress(int_copy_entry)); 2269 2270 __ BIND(L_copy_longs); 2271 #ifdef ASSERT 2272 { 2273 BLOCK_COMMENT("assert long copy {"); 2274 Label L; 2275 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2276 __ cmpw(r15_elsize, LogBytesPerLong); 2277 __ br(Assembler::EQ, L); 2278 __ stop("must be long copy, but elsize is wrong"); 2279 __ bind(L); 2280 BLOCK_COMMENT("} assert long copy done"); 2281 } 2282 #endif 2283 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2284 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2285 __ movw(count, scratch_length); // length 2286 __ b(RuntimeAddress(long_copy_entry)); 2287 2288 // ObjArrayKlass 2289 __ BIND(L_objArray); 2290 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2291 2292 Label L_plain_copy, L_checkcast_copy; 2293 // test array classes for subtyping 2294 __ load_klass(r15, dst); 2295 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2296 __ br(Assembler::NE, L_checkcast_copy); 2297 2298 // Identically typed arrays can be copied without element-wise checks. 2299 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2300 rscratch2, L_failed); 2301 2302 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2303 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2304 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2305 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2306 __ movw(count, scratch_length); // length 2307 __ BIND(L_plain_copy); 2308 __ b(RuntimeAddress(oop_copy_entry)); 2309 2310 __ BIND(L_checkcast_copy); 2311 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2312 { 2313 // Before looking at dst.length, make sure dst is also an objArray. 2314 __ ldrw(rscratch1, Address(r15, lh_offset)); 2315 __ movw(rscratch2, objArray_lh); 2316 __ eorw(rscratch1, rscratch1, rscratch2); 2317 __ cbnzw(rscratch1, L_failed); 2318 2319 // It is safe to examine both src.length and dst.length. 2320 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2321 r15, L_failed); 2322 2323 __ load_klass(dst_klass, dst); // reload 2324 2325 // Marshal the base address arguments now, freeing registers. 2326 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2327 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2328 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2329 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2330 __ movw(count, length); // length (reloaded) 2331 Register sco_temp = c_rarg3; // this register is free now 2332 assert_different_registers(from, to, count, sco_temp, 2333 dst_klass, scratch_src_klass); 2334 // assert_clean_int(count, sco_temp); 2335 2336 // Generate the type check. 2337 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2338 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2339 2340 // Smashes rscratch1, rscratch2 2341 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2342 2343 // Fetch destination element klass from the ObjArrayKlass header. 2344 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2345 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2346 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2347 2348 // the checkcast_copy loop needs two extra arguments: 2349 assert(c_rarg3 == sco_temp, "#3 already in place"); 2350 // Set up arguments for checkcast_copy_entry. 2351 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2352 __ b(RuntimeAddress(checkcast_copy_entry)); 2353 } 2354 2355 __ BIND(L_failed); 2356 __ mov(r0, -1); 2357 __ leave(); // required for proper stackwalking of RuntimeStub frame 2358 __ ret(lr); 2359 2360 return start; 2361 } 2362 2363 // 2364 // Generate stub for array fill. If "aligned" is true, the 2365 // "to" address is assumed to be heapword aligned. 
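  // (Added commentary: the generated code widens the fill value to 64 bits
  //  by replication, illustratively for T_BYTE
  //
  //    value |= value << 8;  value |= value << 16;  value |= value << 32;
  //
  //  and then stores whole 8-byte words, using element-sized stores only
  //  for short arrays and for any unaligned head or tail.)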
2366 // 2367 // Arguments for generated stub: 2368 // to: c_rarg0 2369 // value: c_rarg1 2370 // count: c_rarg2 treated as signed 2371 // 2372 address generate_fill(BasicType t, bool aligned, const char *name) { 2373 __ align(CodeEntryAlignment); 2374 StubCodeMark mark(this, "StubRoutines", name); 2375 address start = __ pc(); 2376 2377 BLOCK_COMMENT("Entry:"); 2378 2379 const Register to = c_rarg0; // source array address 2380 const Register value = c_rarg1; // value 2381 const Register count = c_rarg2; // elements count 2382 2383 const Register bz_base = r10; // base for block_zero routine 2384 const Register cnt_words = r11; // temp register 2385 2386 __ enter(); 2387 2388 Label L_fill_elements, L_exit1; 2389 2390 int shift = -1; 2391 switch (t) { 2392 case T_BYTE: 2393 shift = 0; 2394 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2395 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2396 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2397 __ br(Assembler::LO, L_fill_elements); 2398 break; 2399 case T_SHORT: 2400 shift = 1; 2401 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2402 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2403 __ br(Assembler::LO, L_fill_elements); 2404 break; 2405 case T_INT: 2406 shift = 2; 2407 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2408 __ br(Assembler::LO, L_fill_elements); 2409 break; 2410 default: ShouldNotReachHere(); 2411 } 2412 2413 // Align source address at 8 bytes address boundary. 2414 Label L_skip_align1, L_skip_align2, L_skip_align4; 2415 if (!aligned) { 2416 switch (t) { 2417 case T_BYTE: 2418 // One byte misalignment happens only for byte arrays. 2419 __ tbz(to, 0, L_skip_align1); 2420 __ strb(value, Address(__ post(to, 1))); 2421 __ subw(count, count, 1); 2422 __ bind(L_skip_align1); 2423 // Fallthrough 2424 case T_SHORT: 2425 // Two bytes misalignment happens only for byte and short (char) arrays. 2426 __ tbz(to, 1, L_skip_align2); 2427 __ strh(value, Address(__ post(to, 2))); 2428 __ subw(count, count, 2 >> shift); 2429 __ bind(L_skip_align2); 2430 // Fallthrough 2431 case T_INT: 2432 // Align to 8 bytes, we know we are 4 byte aligned to start. 2433 __ tbz(to, 2, L_skip_align4); 2434 __ strw(value, Address(__ post(to, 4))); 2435 __ subw(count, count, 4 >> shift); 2436 __ bind(L_skip_align4); 2437 break; 2438 default: ShouldNotReachHere(); 2439 } 2440 } 2441 2442 // 2443 // Fill large chunks 2444 // 2445 __ lsrw(cnt_words, count, 3 - shift); // number of words 2446 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2447 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2448 if (UseBlockZeroing) { 2449 Label non_block_zeroing, rest; 2450 // If the fill value is zero we can use the fast zero_words(). 2451 __ cbnz(value, non_block_zeroing); 2452 __ mov(bz_base, to); 2453 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2454 address tpc = __ zero_words(bz_base, cnt_words); 2455 if (tpc == nullptr) { 2456 fatal("CodeCache is full at generate_fill"); 2457 } 2458 __ b(rest); 2459 __ bind(non_block_zeroing); 2460 __ fill_words(to, cnt_words, value); 2461 __ bind(rest); 2462 } else { 2463 __ fill_words(to, cnt_words, value); 2464 } 2465 2466 // Remaining count is less than 8 bytes. Fill it by a single store. 2467 // Note that the total length is no less than 8 bytes. 
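    // (Added commentary: the single store below is positioned so that it
    //  ends exactly at the last element,
    //
    //    str(value, Address(to + count*element_size - 8));
    //
    //  any bytes it re-writes were already filled with the same value,
    //  which is why a total length of at least 8 bytes is required.)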
2468 if (t == T_BYTE || t == T_SHORT) { 2469 Label L_exit1; 2470 __ cbzw(count, L_exit1); 2471 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2472 __ str(value, Address(to, -8)); // overwrite some elements 2473 __ bind(L_exit1); 2474 __ leave(); 2475 __ ret(lr); 2476 } 2477 2478 // Handle copies less than 8 bytes. 2479 Label L_fill_2, L_fill_4, L_exit2; 2480 __ bind(L_fill_elements); 2481 switch (t) { 2482 case T_BYTE: 2483 __ tbz(count, 0, L_fill_2); 2484 __ strb(value, Address(__ post(to, 1))); 2485 __ bind(L_fill_2); 2486 __ tbz(count, 1, L_fill_4); 2487 __ strh(value, Address(__ post(to, 2))); 2488 __ bind(L_fill_4); 2489 __ tbz(count, 2, L_exit2); 2490 __ strw(value, Address(to)); 2491 break; 2492 case T_SHORT: 2493 __ tbz(count, 0, L_fill_4); 2494 __ strh(value, Address(__ post(to, 2))); 2495 __ bind(L_fill_4); 2496 __ tbz(count, 1, L_exit2); 2497 __ strw(value, Address(to)); 2498 break; 2499 case T_INT: 2500 __ cbzw(count, L_exit2); 2501 __ strw(value, Address(to)); 2502 break; 2503 default: ShouldNotReachHere(); 2504 } 2505 __ bind(L_exit2); 2506 __ leave(); 2507 __ ret(lr); 2508 return start; 2509 } 2510 2511 address generate_data_cache_writeback() { 2512 const Register line = c_rarg0; // address of line to write back 2513 2514 __ align(CodeEntryAlignment); 2515 2516 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2517 2518 address start = __ pc(); 2519 __ enter(); 2520 __ cache_wb(Address(line, 0)); 2521 __ leave(); 2522 __ ret(lr); 2523 2524 return start; 2525 } 2526 2527 address generate_data_cache_writeback_sync() { 2528 const Register is_pre = c_rarg0; // pre or post sync 2529 2530 __ align(CodeEntryAlignment); 2531 2532 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2533 2534 // pre wbsync is a no-op 2535 // post wbsync translates to an sfence 2536 2537 Label skip; 2538 address start = __ pc(); 2539 __ enter(); 2540 __ cbnz(is_pre, skip); 2541 __ cache_wbsync(false); 2542 __ bind(skip); 2543 __ leave(); 2544 __ ret(lr); 2545 2546 return start; 2547 } 2548 2549 void generate_arraycopy_stubs() { 2550 address entry; 2551 address entry_jbyte_arraycopy; 2552 address entry_jshort_arraycopy; 2553 address entry_jint_arraycopy; 2554 address entry_oop_arraycopy; 2555 address entry_jlong_arraycopy; 2556 address entry_checkcast_arraycopy; 2557 2558 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards); 2559 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards); 2560 2561 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards); 2562 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards); 2563 2564 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards); 2565 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards); 2566 2567 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2568 2569 //*** jbyte 2570 // Always need aligned and unaligned versions 2571 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2572 "jbyte_disjoint_arraycopy"); 2573 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2574 &entry_jbyte_arraycopy, 2575 "jbyte_arraycopy"); 2576 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2577 "arrayof_jbyte_disjoint_arraycopy"); 2578 
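    // (Added commentary: each disjoint stub publishes its post-setup entry
    //  point through 'entry'; the matching conjoint stub receives it as
    //  'nooverlap_target' and branches there whenever a plain forward copy
    //  is safe, i.e. when (d - s) >= count * element_size as an unsigned
    //  comparison.)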
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2579 "arrayof_jbyte_arraycopy"); 2580 2581 //*** jshort 2582 // Always need aligned and unaligned versions 2583 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2584 "jshort_disjoint_arraycopy"); 2585 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2586 &entry_jshort_arraycopy, 2587 "jshort_arraycopy"); 2588 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2589 "arrayof_jshort_disjoint_arraycopy"); 2590 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2591 "arrayof_jshort_arraycopy"); 2592 2593 //*** jint 2594 // Aligned versions 2595 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2596 "arrayof_jint_disjoint_arraycopy"); 2597 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2598 "arrayof_jint_arraycopy"); 2599 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2600 // entry_jint_arraycopy always points to the unaligned version 2601 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2602 "jint_disjoint_arraycopy"); 2603 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2604 &entry_jint_arraycopy, 2605 "jint_arraycopy"); 2606 2607 //*** jlong 2608 // It is always aligned 2609 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2610 "arrayof_jlong_disjoint_arraycopy"); 2611 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2612 "arrayof_jlong_arraycopy"); 2613 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2614 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2615 2616 //*** oops 2617 { 2618 // With compressed oops we need unaligned versions; notice that 2619 // we overwrite entry_oop_arraycopy. 
2620 bool aligned = !UseCompressedOops; 2621 2622 StubRoutines::_arrayof_oop_disjoint_arraycopy 2623 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2624 /*dest_uninitialized*/false); 2625 StubRoutines::_arrayof_oop_arraycopy 2626 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2627 /*dest_uninitialized*/false); 2628 // Aligned versions without pre-barriers 2629 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2630 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2631 /*dest_uninitialized*/true); 2632 StubRoutines::_arrayof_oop_arraycopy_uninit 2633 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2634 /*dest_uninitialized*/true); 2635 } 2636 2637 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2638 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2639 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2640 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2641 2642 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2643 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2644 /*dest_uninitialized*/true); 2645 2646 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2647 entry_jbyte_arraycopy, 2648 entry_jshort_arraycopy, 2649 entry_jint_arraycopy, 2650 entry_jlong_arraycopy); 2651 2652 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2653 entry_jbyte_arraycopy, 2654 entry_jshort_arraycopy, 2655 entry_jint_arraycopy, 2656 entry_oop_arraycopy, 2657 entry_jlong_arraycopy, 2658 entry_checkcast_arraycopy); 2659 2660 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2661 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2662 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2663 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2664 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2665 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2666 } 2667 2668 void generate_math_stubs() { Unimplemented(); } 2669 2670 // Arguments: 2671 // 2672 // Inputs: 2673 // c_rarg0 - source byte array address 2674 // c_rarg1 - destination byte array address 2675 // c_rarg2 - K (key) in little endian int array 2676 // 2677 address generate_aescrypt_encryptBlock() { 2678 __ align(CodeEntryAlignment); 2679 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2680 2681 const Register from = c_rarg0; // source array address 2682 const Register to = c_rarg1; // destination array address 2683 const Register key = c_rarg2; // key array address 2684 const Register keylen = rscratch1; 2685 2686 address start = __ pc(); 2687 __ enter(); 2688 2689 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2690 2691 __ aesenc_loadkeys(key, keylen); 2692 __ aesecb_encrypt(from, to, keylen); 2693 2694 __ mov(r0, 0); 2695 2696 __ leave(); 2697 __ ret(lr); 2698 2699 return start; 2700 } 2701 2702 // Arguments: 2703 // 2704 // Inputs: 2705 // c_rarg0 - source byte array address 2706 // c_rarg1 - destination byte array address 2707 // 
c_rarg2 - K (key) in little endian int array 2708 // 2709 address generate_aescrypt_decryptBlock() { 2710 assert(UseAES, "need AES cryptographic extension support"); 2711 __ align(CodeEntryAlignment); 2712 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2713 Label L_doLast; 2714 2715 const Register from = c_rarg0; // source array address 2716 const Register to = c_rarg1; // destination array address 2717 const Register key = c_rarg2; // key array address 2718 const Register keylen = rscratch1; 2719 2720 address start = __ pc(); 2721 __ enter(); // required for proper stackwalking of RuntimeStub frame 2722 2723 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2724 2725 __ aesecb_decrypt(from, to, key, keylen); 2726 2727 __ mov(r0, 0); 2728 2729 __ leave(); 2730 __ ret(lr); 2731 2732 return start; 2733 } 2734 2735 // Arguments: 2736 // 2737 // Inputs: 2738 // c_rarg0 - source byte array address 2739 // c_rarg1 - destination byte array address 2740 // c_rarg2 - K (key) in little endian int array 2741 // c_rarg3 - r vector byte array address 2742 // c_rarg4 - input length 2743 // 2744 // Output: 2745 // x0 - input length 2746 // 2747 address generate_cipherBlockChaining_encryptAESCrypt() { 2748 assert(UseAES, "need AES cryptographic extension support"); 2749 __ align(CodeEntryAlignment); 2750 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2751 2752 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2753 2754 const Register from = c_rarg0; // source array address 2755 const Register to = c_rarg1; // destination array address 2756 const Register key = c_rarg2; // key array address 2757 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2758 // and left with the results of the last encryption block 2759 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2760 const Register keylen = rscratch1; 2761 2762 address start = __ pc(); 2763 2764 __ enter(); 2765 2766 __ movw(rscratch2, len_reg); 2767 2768 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2769 2770 __ ld1(v0, __ T16B, rvec); 2771 2772 __ cmpw(keylen, 52); 2773 __ br(Assembler::CC, L_loadkeys_44); 2774 __ br(Assembler::EQ, L_loadkeys_52); 2775 2776 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2777 __ rev32(v17, __ T16B, v17); 2778 __ rev32(v18, __ T16B, v18); 2779 __ BIND(L_loadkeys_52); 2780 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2781 __ rev32(v19, __ T16B, v19); 2782 __ rev32(v20, __ T16B, v20); 2783 __ BIND(L_loadkeys_44); 2784 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2785 __ rev32(v21, __ T16B, v21); 2786 __ rev32(v22, __ T16B, v22); 2787 __ rev32(v23, __ T16B, v23); 2788 __ rev32(v24, __ T16B, v24); 2789 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2790 __ rev32(v25, __ T16B, v25); 2791 __ rev32(v26, __ T16B, v26); 2792 __ rev32(v27, __ T16B, v27); 2793 __ rev32(v28, __ T16B, v28); 2794 __ ld1(v29, v30, v31, __ T16B, key); 2795 __ rev32(v29, __ T16B, v29); 2796 __ rev32(v30, __ T16B, v30); 2797 __ rev32(v31, __ T16B, v31); 2798 2799 __ BIND(L_aes_loop); 2800 __ ld1(v1, __ T16B, __ post(from, 16)); 2801 __ eor(v0, __ T16B, v0, v1); 2802 2803 __ br(Assembler::CC, L_rounds_44); 2804 __ br(Assembler::EQ, L_rounds_52); 2805 2806 __ aese(v0, v17); __ aesmc(v0, v0); 2807 __ aese(v0, v18); __ aesmc(v0, v0); 2808 __ BIND(L_rounds_52); 2809 __ 
aese(v0, v19); __ aesmc(v0, v0); 2810 __ aese(v0, v20); __ aesmc(v0, v0); 2811 __ BIND(L_rounds_44); 2812 __ aese(v0, v21); __ aesmc(v0, v0); 2813 __ aese(v0, v22); __ aesmc(v0, v0); 2814 __ aese(v0, v23); __ aesmc(v0, v0); 2815 __ aese(v0, v24); __ aesmc(v0, v0); 2816 __ aese(v0, v25); __ aesmc(v0, v0); 2817 __ aese(v0, v26); __ aesmc(v0, v0); 2818 __ aese(v0, v27); __ aesmc(v0, v0); 2819 __ aese(v0, v28); __ aesmc(v0, v0); 2820 __ aese(v0, v29); __ aesmc(v0, v0); 2821 __ aese(v0, v30); 2822 __ eor(v0, __ T16B, v0, v31); 2823 2824 __ st1(v0, __ T16B, __ post(to, 16)); 2825 2826 __ subw(len_reg, len_reg, 16); 2827 __ cbnzw(len_reg, L_aes_loop); 2828 2829 __ st1(v0, __ T16B, rvec); 2830 2831 __ mov(r0, rscratch2); 2832 2833 __ leave(); 2834 __ ret(lr); 2835 2836 return start; 2837 } 2838 2839 // Arguments: 2840 // 2841 // Inputs: 2842 // c_rarg0 - source byte array address 2843 // c_rarg1 - destination byte array address 2844 // c_rarg2 - K (key) in little endian int array 2845 // c_rarg3 - r vector byte array address 2846 // c_rarg4 - input length 2847 // 2848 // Output: 2849 // r0 - input length 2850 // 2851 address generate_cipherBlockChaining_decryptAESCrypt() { 2852 assert(UseAES, "need AES cryptographic extension support"); 2853 __ align(CodeEntryAlignment); 2854 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2855 2856 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2857 2858 const Register from = c_rarg0; // source array address 2859 const Register to = c_rarg1; // destination array address 2860 const Register key = c_rarg2; // key array address 2861 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2862 // and left with the results of the last encryption block 2863 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2864 const Register keylen = rscratch1; 2865 2866 address start = __ pc(); 2867 2868 __ enter(); 2869 2870 __ movw(rscratch2, len_reg); 2871 2872 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2873 2874 __ ld1(v2, __ T16B, rvec); 2875 2876 __ ld1(v31, __ T16B, __ post(key, 16)); 2877 __ rev32(v31, __ T16B, v31); 2878 2879 __ cmpw(keylen, 52); 2880 __ br(Assembler::CC, L_loadkeys_44); 2881 __ br(Assembler::EQ, L_loadkeys_52); 2882 2883 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2884 __ rev32(v17, __ T16B, v17); 2885 __ rev32(v18, __ T16B, v18); 2886 __ BIND(L_loadkeys_52); 2887 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2888 __ rev32(v19, __ T16B, v19); 2889 __ rev32(v20, __ T16B, v20); 2890 __ BIND(L_loadkeys_44); 2891 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2892 __ rev32(v21, __ T16B, v21); 2893 __ rev32(v22, __ T16B, v22); 2894 __ rev32(v23, __ T16B, v23); 2895 __ rev32(v24, __ T16B, v24); 2896 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2897 __ rev32(v25, __ T16B, v25); 2898 __ rev32(v26, __ T16B, v26); 2899 __ rev32(v27, __ T16B, v27); 2900 __ rev32(v28, __ T16B, v28); 2901 __ ld1(v29, v30, __ T16B, key); 2902 __ rev32(v29, __ T16B, v29); 2903 __ rev32(v30, __ T16B, v30); 2904 2905 __ BIND(L_aes_loop); 2906 __ ld1(v0, __ T16B, __ post(from, 16)); 2907 __ orr(v1, __ T16B, v0, v0); 2908 2909 __ br(Assembler::CC, L_rounds_44); 2910 __ br(Assembler::EQ, L_rounds_52); 2911 2912 __ aesd(v0, v17); __ aesimc(v0, v0); 2913 __ aesd(v0, v18); __ aesimc(v0, v0); 2914 __ BIND(L_rounds_52); 2915 __ aesd(v0, v19); __ aesimc(v0, v0); 2916 __ aesd(v0, v20); __ 
aesimc(v0, v0); 2917 __ BIND(L_rounds_44); 2918 __ aesd(v0, v21); __ aesimc(v0, v0); 2919 __ aesd(v0, v22); __ aesimc(v0, v0); 2920 __ aesd(v0, v23); __ aesimc(v0, v0); 2921 __ aesd(v0, v24); __ aesimc(v0, v0); 2922 __ aesd(v0, v25); __ aesimc(v0, v0); 2923 __ aesd(v0, v26); __ aesimc(v0, v0); 2924 __ aesd(v0, v27); __ aesimc(v0, v0); 2925 __ aesd(v0, v28); __ aesimc(v0, v0); 2926 __ aesd(v0, v29); __ aesimc(v0, v0); 2927 __ aesd(v0, v30); 2928 __ eor(v0, __ T16B, v0, v31); 2929 __ eor(v0, __ T16B, v0, v2); 2930 2931 __ st1(v0, __ T16B, __ post(to, 16)); 2932 __ orr(v2, __ T16B, v1, v1); 2933 2934 __ subw(len_reg, len_reg, 16); 2935 __ cbnzw(len_reg, L_aes_loop); 2936 2937 __ st1(v2, __ T16B, rvec); 2938 2939 __ mov(r0, rscratch2); 2940 2941 __ leave(); 2942 __ ret(lr); 2943 2944 return start; 2945 } 2946 2947 // CTR AES crypt. 2948 // Arguments: 2949 // 2950 // Inputs: 2951 // c_rarg0 - source byte array address 2952 // c_rarg1 - destination byte array address 2953 // c_rarg2 - K (key) in little endian int array 2954 // c_rarg3 - counter vector byte array address 2955 // c_rarg4 - input length 2956 // c_rarg5 - saved encryptedCounter start 2957 // c_rarg6 - saved used length 2958 // 2959 // Output: 2960 // r0 - input length 2961 // 2962 address generate_counterMode_AESCrypt() { 2963 const Register in = c_rarg0; 2964 const Register out = c_rarg1; 2965 const Register key = c_rarg2; 2966 const Register counter = c_rarg3; 2967 const Register saved_len = c_rarg4, len = r10; 2968 const Register saved_encrypted_ctr = c_rarg5; 2969 const Register used_ptr = c_rarg6, used = r12; 2970 2971 const Register offset = r7; 2972 const Register keylen = r11; 2973 2974 const unsigned char block_size = 16; 2975 const int bulk_width = 4; 2976 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 2977 // performance with larger data sizes, but it also means that the 2978 // fast path isn't used until you have at least 8 blocks, and up 2979 // to 127 bytes of data will be executed on the slow path. For 2980 // that reason, and also so as not to blow away too much icache, 4 2981 // blocks seems like a sensible compromise. 2982 2983 // Algorithm: 2984 // 2985 // if (len == 0) { 2986 // goto DONE; 2987 // } 2988 // int result = len; 2989 // do { 2990 // if (used >= blockSize) { 2991 // if (len >= bulk_width * blockSize) { 2992 // CTR_large_block(); 2993 // if (len == 0) 2994 // goto DONE; 2995 // } 2996 // for (;;) { 2997 // 16ByteVector v0 = counter; 2998 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 2999 // used = 0; 3000 // if (len < blockSize) 3001 // break; /* goto NEXT */ 3002 // 16ByteVector v1 = load16Bytes(in, offset); 3003 // v1 = v1 ^ encryptedCounter; 3004 // store16Bytes(out, offset); 3005 // used = blockSize; 3006 // offset += blockSize; 3007 // len -= blockSize; 3008 // if (len == 0) 3009 // goto DONE; 3010 // } 3011 // } 3012 // NEXT: 3013 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3014 // len--; 3015 // } while (len != 0); 3016 // DONE: 3017 // return result; 3018 // 3019 // CTR_large_block() 3020 // Wide bulk encryption of whole blocks. 
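    // (Added commentary: the stub always consumes the entire input; r0 is
    //  simply the original 'len' argument, and the number of bytes already
    //  taken from the current encrypted counter is written back through
    //  'used_ptr' before returning.)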
3021 3022 __ align(CodeEntryAlignment); 3023 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 3024 const address start = __ pc(); 3025 __ enter(); 3026 3027 Label DONE, CTR_large_block, large_block_return; 3028 __ ldrw(used, Address(used_ptr)); 3029 __ cbzw(saved_len, DONE); 3030 3031 __ mov(len, saved_len); 3032 __ mov(offset, 0); 3033 3034 // Compute #rounds for AES based on the length of the key array 3035 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3036 3037 __ aesenc_loadkeys(key, keylen); 3038 3039 { 3040 Label L_CTR_loop, NEXT; 3041 3042 __ bind(L_CTR_loop); 3043 3044 __ cmp(used, block_size); 3045 __ br(__ LO, NEXT); 3046 3047 // Maybe we have a lot of data 3048 __ subsw(rscratch1, len, bulk_width * block_size); 3049 __ br(__ HS, CTR_large_block); 3050 __ BIND(large_block_return); 3051 __ cbzw(len, DONE); 3052 3053 // Setup the counter 3054 __ movi(v4, __ T4S, 0); 3055 __ movi(v5, __ T4S, 1); 3056 __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 } 3057 3058 __ ld1(v0, __ T16B, counter); // Load the counter into v0 3059 __ rev32(v16, __ T16B, v0); 3060 __ addv(v16, __ T4S, v16, v4); 3061 __ rev32(v16, __ T16B, v16); 3062 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3063 3064 { 3065 // We have fewer than bulk_width blocks of data left. Encrypt 3066 // them one by one until there is less than a full block 3067 // remaining, being careful to save both the encrypted counter 3068 // and the counter. 3069 3070 Label inner_loop; 3071 __ bind(inner_loop); 3072 // Counter to encrypt is in v0 3073 __ aesecb_encrypt(noreg, noreg, keylen); 3074 __ st1(v0, __ T16B, saved_encrypted_ctr); 3075 3076 // Do we have a remaining full block? 3077 3078 __ mov(used, 0); 3079 __ cmp(len, block_size); 3080 __ br(__ LO, NEXT); 3081 3082 // Yes, we have a full block 3083 __ ldrq(v1, Address(in, offset)); 3084 __ eor(v1, __ T16B, v1, v0); 3085 __ strq(v1, Address(out, offset)); 3086 __ mov(used, block_size); 3087 __ add(offset, offset, block_size); 3088 3089 __ subw(len, len, block_size); 3090 __ cbzw(len, DONE); 3091 3092 // Increment the counter, store it back 3093 __ orr(v0, __ T16B, v16, v16); 3094 __ rev32(v16, __ T16B, v16); 3095 __ addv(v16, __ T4S, v16, v4); 3096 __ rev32(v16, __ T16B, v16); 3097 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3098 3099 __ b(inner_loop); 3100 } 3101 3102 __ BIND(NEXT); 3103 3104 // Encrypt a single byte, and loop. 3105 // We expect this to be a rare event. 
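      // (Added comment)  i.e.  out[offset] = in[offset] ^ encryptedCounter[used];
      //                        offset++; used++; len--;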
3106 __ ldrb(rscratch1, Address(in, offset)); 3107 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3108 __ eor(rscratch1, rscratch1, rscratch2); 3109 __ strb(rscratch1, Address(out, offset)); 3110 __ add(offset, offset, 1); 3111 __ add(used, used, 1); 3112 __ subw(len, len,1); 3113 __ cbnzw(len, L_CTR_loop); 3114 } 3115 3116 __ bind(DONE); 3117 __ strw(used, Address(used_ptr)); 3118 __ mov(r0, saved_len); 3119 3120 __ leave(); // required for proper stackwalking of RuntimeStub frame 3121 __ ret(lr); 3122 3123 // Bulk encryption 3124 3125 __ BIND (CTR_large_block); 3126 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3127 3128 if (bulk_width == 8) { 3129 __ sub(sp, sp, 4 * 16); 3130 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3131 } 3132 __ sub(sp, sp, 4 * 16); 3133 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3134 RegSet saved_regs = (RegSet::of(in, out, offset) 3135 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3136 __ push(saved_regs, sp); 3137 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3138 __ add(in, in, offset); 3139 __ add(out, out, offset); 3140 3141 // Keys should already be loaded into the correct registers 3142 3143 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3144 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3145 3146 // AES/CTR loop 3147 { 3148 Label L_CTR_loop; 3149 __ BIND(L_CTR_loop); 3150 3151 // Setup the counters 3152 __ movi(v8, __ T4S, 0); 3153 __ movi(v9, __ T4S, 1); 3154 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3155 3156 for (int i = 0; i < bulk_width; i++) { 3157 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3158 __ rev32(v0_ofs, __ T16B, v16); 3159 __ addv(v16, __ T4S, v16, v8); 3160 } 3161 3162 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3163 3164 // Encrypt the counters 3165 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3166 3167 if (bulk_width == 8) { 3168 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3169 } 3170 3171 // XOR the encrypted counters with the inputs 3172 for (int i = 0; i < bulk_width; i++) { 3173 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3174 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3175 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3176 } 3177 3178 // Write the encrypted data 3179 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3180 if (bulk_width == 8) { 3181 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3182 } 3183 3184 __ subw(len, len, 16 * bulk_width); 3185 __ cbnzw(len, L_CTR_loop); 3186 } 3187 3188 // Save the counter back where it goes 3189 __ rev32(v16, __ T16B, v16); 3190 __ st1(v16, __ T16B, counter); 3191 3192 __ pop(saved_regs, sp); 3193 3194 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3195 if (bulk_width == 8) { 3196 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3197 } 3198 3199 __ andr(rscratch1, len, -16 * bulk_width); 3200 __ sub(len, len, rscratch1); 3201 __ add(offset, offset, rscratch1); 3202 __ mov(used, 16); 3203 __ strw(used, Address(used_ptr)); 3204 __ b(large_block_return); 3205 3206 return start; 3207 } 3208 3209 // Vector AES Galois Counter Mode implementation. 
Parameters: 3210 // 3211 // in = c_rarg0 3212 // len = c_rarg1 3213 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3214 // out = c_rarg3 3215 // key = c_rarg4 3216 // state = c_rarg5 - GHASH.state 3217 // subkeyHtbl = c_rarg6 - powers of H 3218 // counter = c_rarg7 - 16 bytes of CTR 3219 // return - number of processed bytes 3220 address generate_galoisCounterMode_AESCrypt() { 3221 address ghash_polynomial = __ pc(); 3222 __ emit_int64(0x87); // The low-order bits of the field 3223 // polynomial (i.e. p = z^7+z^2+z+1) 3224 // repeated in the low and high parts of a 3225 // 128-bit vector 3226 __ emit_int64(0x87); 3227 3228 __ align(CodeEntryAlignment); 3229 StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt"); 3230 address start = __ pc(); 3231 __ enter(); 3232 3233 const Register in = c_rarg0; 3234 const Register len = c_rarg1; 3235 const Register ct = c_rarg2; 3236 const Register out = c_rarg3; 3237 // and updated with the incremented counter in the end 3238 3239 const Register key = c_rarg4; 3240 const Register state = c_rarg5; 3241 3242 const Register subkeyHtbl = c_rarg6; 3243 3244 const Register counter = c_rarg7; 3245 3246 const Register keylen = r10; 3247 // Save state before entering routine 3248 __ sub(sp, sp, 4 * 16); 3249 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3250 __ sub(sp, sp, 4 * 16); 3251 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3252 3253 // __ andr(len, len, -512); 3254 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3255 __ str(len, __ pre(sp, -2 * wordSize)); 3256 3257 Label DONE; 3258 __ cbz(len, DONE); 3259 3260 // Compute #rounds for AES based on the length of the key array 3261 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3262 3263 __ aesenc_loadkeys(key, keylen); 3264 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3265 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3266 3267 // AES/CTR loop 3268 { 3269 Label L_CTR_loop; 3270 __ BIND(L_CTR_loop); 3271 3272 // Setup the counters 3273 __ movi(v8, __ T4S, 0); 3274 __ movi(v9, __ T4S, 1); 3275 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3276 3277 assert(v0->encoding() < v8->encoding(), ""); 3278 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3279 FloatRegister f = as_FloatRegister(i); 3280 __ rev32(f, __ T16B, v16); 3281 __ addv(v16, __ T4S, v16, v8); 3282 } 3283 3284 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3285 3286 // Encrypt the counters 3287 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3288 3289 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3290 3291 // XOR the encrypted counters with the inputs 3292 for (int i = 0; i < 8; i++) { 3293 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3294 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3295 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3296 } 3297 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3298 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3299 3300 __ subw(len, len, 16 * 8); 3301 __ cbnzw(len, L_CTR_loop); 3302 } 3303 3304 __ rev32(v16, __ T16B, v16); 3305 __ st1(v16, __ T16B, counter); 3306 3307 __ ldr(len, Address(sp)); 3308 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3309 3310 // GHASH/CTR loop 3311 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3312 len, /*unrolls*/4); 3313 3314 #ifdef ASSERT 3315 { Label L; 3316 __ 
cmp(len, (unsigned char)0); 3317 __ br(Assembler::EQ, L); 3318 __ stop("stubGenerator: abort"); 3319 __ bind(L); 3320 } 3321 #endif 3322 3323 __ bind(DONE); 3324 // Return the number of bytes processed 3325 __ ldr(r0, __ post(sp, 2 * wordSize)); 3326 3327 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3328 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3329 3330 __ leave(); // required for proper stackwalking of RuntimeStub frame 3331 __ ret(lr); 3332 return start; 3333 } 3334 3335 class Cached64Bytes { 3336 private: 3337 MacroAssembler *_masm; 3338 Register _regs[8]; 3339 3340 public: 3341 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3342 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3343 auto it = rs.begin(); 3344 for (auto &r: _regs) { 3345 r = *it; 3346 ++it; 3347 } 3348 } 3349 3350 void gen_loads(Register base) { 3351 for (int i = 0; i < 8; i += 2) { 3352 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3353 } 3354 } 3355 3356 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3357 void extract_u32(Register dest, int i) { 3358 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3359 } 3360 }; 3361 3362 // Utility routines for md5. 3363 // Clobbers r10 and r11. 3364 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3365 int k, int s, int t) { 3366 Register rscratch3 = r10; 3367 Register rscratch4 = r11; 3368 3369 __ eorw(rscratch3, r3, r4); 3370 __ movw(rscratch2, t); 3371 __ andw(rscratch3, rscratch3, r2); 3372 __ addw(rscratch4, r1, rscratch2); 3373 reg_cache.extract_u32(rscratch1, k); 3374 __ eorw(rscratch3, rscratch3, r4); 3375 __ addw(rscratch4, rscratch4, rscratch1); 3376 __ addw(rscratch3, rscratch3, rscratch4); 3377 __ rorw(rscratch2, rscratch3, 32 - s); 3378 __ addw(r1, rscratch2, r2); 3379 } 3380 3381 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3382 int k, int s, int t) { 3383 Register rscratch3 = r10; 3384 Register rscratch4 = r11; 3385 3386 __ andw(rscratch3, r2, r4); 3387 __ bicw(rscratch4, r3, r4); 3388 reg_cache.extract_u32(rscratch1, k); 3389 __ movw(rscratch2, t); 3390 __ orrw(rscratch3, rscratch3, rscratch4); 3391 __ addw(rscratch4, r1, rscratch2); 3392 __ addw(rscratch4, rscratch4, rscratch1); 3393 __ addw(rscratch3, rscratch3, rscratch4); 3394 __ rorw(rscratch2, rscratch3, 32 - s); 3395 __ addw(r1, rscratch2, r2); 3396 } 3397 3398 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3399 int k, int s, int t) { 3400 Register rscratch3 = r10; 3401 Register rscratch4 = r11; 3402 3403 __ eorw(rscratch3, r3, r4); 3404 __ movw(rscratch2, t); 3405 __ addw(rscratch4, r1, rscratch2); 3406 reg_cache.extract_u32(rscratch1, k); 3407 __ eorw(rscratch3, rscratch3, r2); 3408 __ addw(rscratch4, rscratch4, rscratch1); 3409 __ addw(rscratch3, rscratch3, rscratch4); 3410 __ rorw(rscratch2, rscratch3, 32 - s); 3411 __ addw(r1, rscratch2, r2); 3412 } 3413 3414 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3415 int k, int s, int t) { 3416 Register rscratch3 = r10; 3417 Register rscratch4 = r11; 3418 3419 __ movw(rscratch3, t); 3420 __ ornw(rscratch2, r2, r4); 3421 __ addw(rscratch4, r1, rscratch3); 3422 reg_cache.extract_u32(rscratch1, k); 3423 __ eorw(rscratch3, rscratch2, r3); 3424 __ addw(rscratch4, rscratch4, rscratch1); 3425 __ addw(rscratch3, rscratch3, rscratch4); 3426 __ rorw(rscratch2, rscratch3, 32 - s); 3427 __ 
addw(r1, rscratch2, r2); 3428 } 3429 3430 // Arguments: 3431 // 3432 // Inputs: 3433 // c_rarg0 - byte[] source+offset 3434 // c_rarg1 - int[] SHA.state 3435 // c_rarg2 - int offset 3436 // c_rarg3 - int limit 3437 // 3438 address generate_md5_implCompress(bool multi_block, const char *name) { 3439 __ align(CodeEntryAlignment); 3440 StubCodeMark mark(this, "StubRoutines", name); 3441 address start = __ pc(); 3442 3443 Register buf = c_rarg0; 3444 Register state = c_rarg1; 3445 Register ofs = c_rarg2; 3446 Register limit = c_rarg3; 3447 Register a = r4; 3448 Register b = r5; 3449 Register c = r6; 3450 Register d = r7; 3451 Register rscratch3 = r10; 3452 Register rscratch4 = r11; 3453 3454 Register state_regs[2] = { r12, r13 }; 3455 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3456 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3457 3458 __ push(saved_regs, sp); 3459 3460 __ ldp(state_regs[0], state_regs[1], Address(state)); 3461 __ ubfx(a, state_regs[0], 0, 32); 3462 __ ubfx(b, state_regs[0], 32, 32); 3463 __ ubfx(c, state_regs[1], 0, 32); 3464 __ ubfx(d, state_regs[1], 32, 32); 3465 3466 Label md5_loop; 3467 __ BIND(md5_loop); 3468 3469 reg_cache.gen_loads(buf); 3470 3471 // Round 1 3472 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3473 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3474 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3475 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3476 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3477 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3478 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3479 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3480 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3481 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3482 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3483 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3484 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3485 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3486 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3487 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3488 3489 // Round 2 3490 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3491 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3492 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3493 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3494 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3495 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3496 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3497 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3498 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3499 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3500 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3501 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3502 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3503 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3504 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3505 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3506 3507 // Round 3 3508 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3509 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3510 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3511 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3512 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3513 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3514 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3515 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3516 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 
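    // Reference (not generated code): md5_FF/GG/HH/II above implement the four
    // MD5 auxiliary functions and the common round step from RFC 1321:
    //
    //   F(b,c,d) = (b & c) | (~b & d)      // rounds  1..16 (computed here as
    //                                      //   ((c ^ d) & b) ^ d)
    //   G(b,c,d) = (b & d) | (c & ~d)      // rounds 17..32
    //   H(b,c,d) = b ^ c ^ d               // rounds 33..48
    //   I(b,c,d) = c ^ (b | ~d)            // rounds 49..64
    //
    //   a = b + rotl32(a + f(b,c,d) + x[k] + t, s);
    //
    // where x[k] is the k-th 32-bit word of the 64-byte block held in
    // reg_cache and (s, t) are the per-step rotation amounts and additive
    // constants.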
3517 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3518 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3519 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3520 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3521 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3522 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3523 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3524 3525 // Round 4 3526 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3527 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3528 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3529 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3530 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3531 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3532 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3533 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3534 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3535 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3536 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3537 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3538 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3539 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3540 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3541 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3542 3543 __ addw(a, state_regs[0], a); 3544 __ ubfx(rscratch2, state_regs[0], 32, 32); 3545 __ addw(b, rscratch2, b); 3546 __ addw(c, state_regs[1], c); 3547 __ ubfx(rscratch4, state_regs[1], 32, 32); 3548 __ addw(d, rscratch4, d); 3549 3550 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3551 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3552 3553 if (multi_block) { 3554 __ add(buf, buf, 64); 3555 __ add(ofs, ofs, 64); 3556 __ cmp(ofs, limit); 3557 __ br(Assembler::LE, md5_loop); 3558 __ mov(c_rarg0, ofs); // return ofs 3559 } 3560 3561 // write hash values back in the correct order 3562 __ stp(state_regs[0], state_regs[1], Address(state)); 3563 3564 __ pop(saved_regs, sp); 3565 3566 __ ret(lr); 3567 3568 return start; 3569 } 3570 3571 // Arguments: 3572 // 3573 // Inputs: 3574 // c_rarg0 - byte[] source+offset 3575 // c_rarg1 - int[] SHA.state 3576 // c_rarg2 - int offset 3577 // c_rarg3 - int limit 3578 // 3579 address generate_sha1_implCompress(bool multi_block, const char *name) { 3580 __ align(CodeEntryAlignment); 3581 StubCodeMark mark(this, "StubRoutines", name); 3582 address start = __ pc(); 3583 3584 Register buf = c_rarg0; 3585 Register state = c_rarg1; 3586 Register ofs = c_rarg2; 3587 Register limit = c_rarg3; 3588 3589 Label keys; 3590 Label sha1_loop; 3591 3592 // load the keys into v0..v3 3593 __ adr(rscratch1, keys); 3594 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3595 // load 5 words state into v6, v7 3596 __ ldrq(v6, Address(state, 0)); 3597 __ ldrs(v7, Address(state, 16)); 3598 3599 3600 __ BIND(sha1_loop); 3601 // load 64 bytes of data into v16..v19 3602 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3603 __ rev32(v16, __ T16B, v16); 3604 __ rev32(v17, __ T16B, v17); 3605 __ rev32(v18, __ T16B, v18); 3606 __ rev32(v19, __ T16B, v19); 3607 3608 // do the sha1 3609 __ addv(v4, __ T4S, v16, v0); 3610 __ orr(v20, __ T16B, v6, v6); 3611 3612 FloatRegister d0 = v16; 3613 FloatRegister d1 = v17; 3614 FloatRegister d2 = v18; 3615 FloatRegister d3 = v19; 3616 3617 for (int round = 0; round < 20; round++) { 3618 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3619 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3620 FloatRegister tmp3 = round ? ((round & 1) ? 
v22 : v21) : v7; 3621 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3622 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3623 3624 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3625 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3626 __ sha1h(tmp2, __ T4S, v20); 3627 if (round < 5) 3628 __ sha1c(v20, __ T4S, tmp3, tmp4); 3629 else if (round < 10 || round >= 15) 3630 __ sha1p(v20, __ T4S, tmp3, tmp4); 3631 else 3632 __ sha1m(v20, __ T4S, tmp3, tmp4); 3633 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3634 3635 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3636 } 3637 3638 __ addv(v7, __ T2S, v7, v21); 3639 __ addv(v6, __ T4S, v6, v20); 3640 3641 if (multi_block) { 3642 __ add(ofs, ofs, 64); 3643 __ cmp(ofs, limit); 3644 __ br(Assembler::LE, sha1_loop); 3645 __ mov(c_rarg0, ofs); // return ofs 3646 } 3647 3648 __ strq(v6, Address(state, 0)); 3649 __ strs(v7, Address(state, 16)); 3650 3651 __ ret(lr); 3652 3653 __ bind(keys); 3654 __ emit_int32(0x5a827999); 3655 __ emit_int32(0x6ed9eba1); 3656 __ emit_int32(0x8f1bbcdc); 3657 __ emit_int32(0xca62c1d6); 3658 3659 return start; 3660 } 3661 3662 3663 // Arguments: 3664 // 3665 // Inputs: 3666 // c_rarg0 - byte[] source+offset 3667 // c_rarg1 - int[] SHA.state 3668 // c_rarg2 - int offset 3669 // c_rarg3 - int limit 3670 // 3671 address generate_sha256_implCompress(bool multi_block, const char *name) { 3672 static const uint32_t round_consts[64] = { 3673 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3674 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3675 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3676 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3677 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3678 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3679 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3680 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3681 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3682 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3683 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3684 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3685 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3686 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3687 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3688 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3689 }; 3690 __ align(CodeEntryAlignment); 3691 StubCodeMark mark(this, "StubRoutines", name); 3692 address start = __ pc(); 3693 3694 Register buf = c_rarg0; 3695 Register state = c_rarg1; 3696 Register ofs = c_rarg2; 3697 Register limit = c_rarg3; 3698 3699 Label sha1_loop; 3700 3701 __ stpd(v8, v9, __ pre(sp, -32)); 3702 __ stpd(v10, v11, Address(sp, 16)); 3703 3704 // dga == v0 3705 // dgb == v1 3706 // dg0 == v2 3707 // dg1 == v3 3708 // dg2 == v4 3709 // t0 == v6 3710 // t1 == v7 3711 3712 // load 16 keys to v16..v31 3713 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3714 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3715 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3716 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3717 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3718 3719 // load 8 words (256 bits) state 3720 __ ldpq(v0, v1, state); 3721 3722 __ BIND(sha1_loop); 3723 // load 64 bytes of data into v8..v11 3724 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3725 __ rev32(v8, __ T16B, v8); 3726 __ rev32(v9, __ T16B, v9); 3727 __ rev32(v10, __ T16B, v10); 3728 __ rev32(v11, __ T16B, v11); 3729 3730 __ addv(v6, __ T4S, v8, v16); 3731 __ orr(v2, __ T16B, v0, v0); 3732 __ orr(v3, __ T16B, v1, v1); 3733 3734 FloatRegister d0 = v8; 3735 FloatRegister d1 = v9; 3736 FloatRegister d2 = v10; 3737 FloatRegister d3 = v11; 3738 3739 3740 for (int round = 0; round < 16; round++) { 3741 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3742 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3743 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3744 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3745 3746 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3747 __ orr(v4, __ T16B, v2, v2); 3748 if (round < 15) 3749 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3750 __ sha256h(v2, __ T4S, v3, tmp2); 3751 __ sha256h2(v3, __ T4S, v4, tmp2); 3752 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3753 3754 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3755 } 3756 3757 __ addv(v0, __ T4S, v0, v2); 3758 __ addv(v1, __ T4S, v1, v3); 3759 3760 if (multi_block) { 3761 __ add(ofs, ofs, 64); 3762 __ cmp(ofs, limit); 3763 __ br(Assembler::LE, sha1_loop); 3764 __ mov(c_rarg0, ofs); // return ofs 3765 } 3766 3767 __ ldpd(v10, v11, Address(sp, 16)); 3768 __ ldpd(v8, v9, __ post(sp, 32)); 3769 3770 __ stpq(v0, v1, state); 3771 3772 __ ret(lr); 3773 3774 return start; 3775 } 3776 3777 // Double rounds for sha512. 3778 void sha512_dround(int dr, 3779 FloatRegister vi0, FloatRegister vi1, 3780 FloatRegister vi2, FloatRegister vi3, 3781 FloatRegister vi4, FloatRegister vrc0, 3782 FloatRegister vrc1, FloatRegister vin0, 3783 FloatRegister vin1, FloatRegister vin2, 3784 FloatRegister vin3, FloatRegister vin4) { 3785 if (dr < 36) { 3786 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3787 } 3788 __ addv(v5, __ T2D, vrc0, vin0); 3789 __ ext(v6, __ T16B, vi2, vi3, 8); 3790 __ ext(v5, __ T16B, v5, v5, 8); 3791 __ ext(v7, __ T16B, vi1, vi2, 8); 3792 __ addv(vi3, __ T2D, vi3, v5); 3793 if (dr < 32) { 3794 __ ext(v5, __ T16B, vin3, vin4, 8); 3795 __ sha512su0(vin0, __ T2D, vin1); 3796 } 3797 __ sha512h(vi3, __ T2D, v6, v7); 3798 if (dr < 32) { 3799 __ sha512su1(vin0, __ T2D, vin2, v5); 3800 } 3801 __ addv(vi4, __ T2D, vi1, vi3); 3802 __ sha512h2(vi3, __ T2D, vi1, vi0); 3803 } 3804 3805 // Arguments: 3806 // 3807 // Inputs: 3808 // c_rarg0 - byte[] source+offset 3809 // c_rarg1 - int[] SHA.state 3810 // c_rarg2 - int offset 3811 // c_rarg3 - int limit 3812 // 3813 address generate_sha512_implCompress(bool multi_block, const char *name) { 3814 static const uint64_t round_consts[80] = { 3815 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3816 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3817 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3818 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3819 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3820 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3821 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3822 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3823 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3824 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3825 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3826 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3827 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3828 0x92722C851482353BL, 
0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3829 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3830 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3831 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3832 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3833 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3834 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3835 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3836 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3837 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3838 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3839 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3840 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3841 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3842 }; 3843 3844 __ align(CodeEntryAlignment); 3845 StubCodeMark mark(this, "StubRoutines", name); 3846 address start = __ pc(); 3847 3848 Register buf = c_rarg0; 3849 Register state = c_rarg1; 3850 Register ofs = c_rarg2; 3851 Register limit = c_rarg3; 3852 3853 __ stpd(v8, v9, __ pre(sp, -64)); 3854 __ stpd(v10, v11, Address(sp, 16)); 3855 __ stpd(v12, v13, Address(sp, 32)); 3856 __ stpd(v14, v15, Address(sp, 48)); 3857 3858 Label sha512_loop; 3859 3860 // load state 3861 __ ld1(v8, v9, v10, v11, __ T2D, state); 3862 3863 // load first 4 round constants 3864 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3865 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3866 3867 __ BIND(sha512_loop); 3868 // load 128B of data into v12..v19 3869 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3870 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3871 __ rev64(v12, __ T16B, v12); 3872 __ rev64(v13, __ T16B, v13); 3873 __ rev64(v14, __ T16B, v14); 3874 __ rev64(v15, __ T16B, v15); 3875 __ rev64(v16, __ T16B, v16); 3876 __ rev64(v17, __ T16B, v17); 3877 __ rev64(v18, __ T16B, v18); 3878 __ rev64(v19, __ T16B, v19); 3879 3880 __ mov(rscratch2, rscratch1); 3881 3882 __ mov(v0, __ T16B, v8); 3883 __ mov(v1, __ T16B, v9); 3884 __ mov(v2, __ T16B, v10); 3885 __ mov(v3, __ T16B, v11); 3886 3887 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 3888 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 3889 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 3890 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 3891 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 3892 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 3893 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 3894 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 3895 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 3896 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 3897 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 3898 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 3899 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 3900 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 3901 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 3902 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 3903 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, 
v13, v19, v16, v17); 3904 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 3905 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 3906 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 3907 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 3908 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 3909 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 3910 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 3911 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 3912 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 3913 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 3914 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 3915 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 3916 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 3917 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 3918 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 3919 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 3920 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 3921 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 3922 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 3923 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 3924 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 3925 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 3926 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 3927 3928 __ addv(v8, __ T2D, v8, v0); 3929 __ addv(v9, __ T2D, v9, v1); 3930 __ addv(v10, __ T2D, v10, v2); 3931 __ addv(v11, __ T2D, v11, v3); 3932 3933 if (multi_block) { 3934 __ add(ofs, ofs, 128); 3935 __ cmp(ofs, limit); 3936 __ br(Assembler::LE, sha512_loop); 3937 __ mov(c_rarg0, ofs); // return ofs 3938 } 3939 3940 __ st1(v8, v9, v10, v11, __ T2D, state); 3941 3942 __ ldpd(v14, v15, Address(sp, 48)); 3943 __ ldpd(v12, v13, Address(sp, 32)); 3944 __ ldpd(v10, v11, Address(sp, 16)); 3945 __ ldpd(v8, v9, __ post(sp, 64)); 3946 3947 __ ret(lr); 3948 3949 return start; 3950 } 3951 3952 // Arguments: 3953 // 3954 // Inputs: 3955 // c_rarg0 - byte[] source+offset 3956 // c_rarg1 - byte[] SHA.state 3957 // c_rarg2 - int block_size 3958 // c_rarg3 - int offset 3959 // c_rarg4 - int limit 3960 // 3961 address generate_sha3_implCompress(bool multi_block, const char *name) { 3962 static const uint64_t round_consts[24] = { 3963 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 3964 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 3965 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 3966 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 3967 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 3968 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 3969 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 3970 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 3971 }; 3972 3973 __ align(CodeEntryAlignment); 3974 StubCodeMark mark(this, "StubRoutines", name); 3975 address start = __ pc(); 3976 3977 Register buf = c_rarg0; 3978 Register state = c_rarg1; 3979 Register block_size = c_rarg2; 3980 Register ofs = c_rarg3; 3981 Register 
limit = c_rarg4; 3982 3983 Label sha3_loop, rounds24_loop; 3984 Label sha3_512_or_sha3_384, shake128; 3985 3986 __ stpd(v8, v9, __ pre(sp, -64)); 3987 __ stpd(v10, v11, Address(sp, 16)); 3988 __ stpd(v12, v13, Address(sp, 32)); 3989 __ stpd(v14, v15, Address(sp, 48)); 3990 3991 // load state 3992 __ add(rscratch1, state, 32); 3993 __ ld1(v0, v1, v2, v3, __ T1D, state); 3994 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 3995 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 3996 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 3997 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 3998 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 3999 __ ld1(v24, __ T1D, rscratch1); 4000 4001 __ BIND(sha3_loop); 4002 4003 // 24 keccak rounds 4004 __ movw(rscratch2, 24); 4005 4006 // load round_constants base 4007 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4008 4009 // load input 4010 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4011 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4012 __ eor(v0, __ T8B, v0, v25); 4013 __ eor(v1, __ T8B, v1, v26); 4014 __ eor(v2, __ T8B, v2, v27); 4015 __ eor(v3, __ T8B, v3, v28); 4016 __ eor(v4, __ T8B, v4, v29); 4017 __ eor(v5, __ T8B, v5, v30); 4018 __ eor(v6, __ T8B, v6, v31); 4019 4020 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4021 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4022 4023 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4024 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4025 __ eor(v7, __ T8B, v7, v25); 4026 __ eor(v8, __ T8B, v8, v26); 4027 __ eor(v9, __ T8B, v9, v27); 4028 __ eor(v10, __ T8B, v10, v28); 4029 __ eor(v11, __ T8B, v11, v29); 4030 __ eor(v12, __ T8B, v12, v30); 4031 __ eor(v13, __ T8B, v13, v31); 4032 4033 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4034 __ eor(v14, __ T8B, v14, v25); 4035 __ eor(v15, __ T8B, v15, v26); 4036 __ eor(v16, __ T8B, v16, v27); 4037 4038 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4039 __ andw(c_rarg5, block_size, 48); 4040 __ cbzw(c_rarg5, rounds24_loop); 4041 4042 __ tbnz(block_size, 5, shake128); 4043 // block_size == 144, bit5 == 0, SHA3-244 4044 __ ldrd(v28, __ post(buf, 8)); 4045 __ eor(v17, __ T8B, v17, v28); 4046 __ b(rounds24_loop); 4047 4048 __ BIND(shake128); 4049 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4050 __ eor(v17, __ T8B, v17, v28); 4051 __ eor(v18, __ T8B, v18, v29); 4052 __ eor(v19, __ T8B, v19, v30); 4053 __ eor(v20, __ T8B, v20, v31); 4054 __ b(rounds24_loop); // block_size == 168, SHAKE128 4055 4056 __ BIND(sha3_512_or_sha3_384); 4057 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4058 __ eor(v7, __ T8B, v7, v25); 4059 __ eor(v8, __ T8B, v8, v26); 4060 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4061 4062 // SHA3-384 4063 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4064 __ eor(v9, __ T8B, v9, v27); 4065 __ eor(v10, __ T8B, v10, v28); 4066 __ eor(v11, __ T8B, v11, v29); 4067 __ eor(v12, __ T8B, v12, v30); 4068 4069 __ BIND(rounds24_loop); 4070 __ subw(rscratch2, rscratch2, 1); 4071 4072 __ eor3(v29, __ T16B, v4, v9, v14); 4073 __ eor3(v26, __ T16B, v1, v6, v11); 4074 __ eor3(v28, __ T16B, v3, v8, v13); 4075 __ eor3(v25, __ T16B, v0, v5, v10); 4076 __ eor3(v27, __ T16B, v2, v7, v12); 4077 __ eor3(v29, __ T16B, v29, v19, v24); 4078 __ eor3(v26, __ T16B, v26, v16, v21); 4079 __ eor3(v28, __ T16B, v28, v18, v23); 4080 __ eor3(v25, __ T16B, v25, v15, v20); 4081 __ eor3(v27, __ T16B, v27, v17, v22); 4082 4083 __ rax1(v30, __ T2D, v29, v26); 
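    // Reference (not generated code): the eor3/rax1/xar/bcax sequence in this
    // loop is one Keccak-f[1600] round, with each of v0..v24 holding one
    // 64-bit lane of the 5x5 state.  Writing C[x] for the column parities
    // formed by the eor3 instructions above, the rax1 instructions compute
    // the theta masks
    //
    //   D[x] = C[(x + 4) % 5] ^ rotl64(C[(x + 1) % 5], 1);
    //
    // which the following eor/xar instructions fold into the state together
    // with the rho rotations, and the bcax groups then apply chi:
    //
    //   A[x][y] = B[x][y] ^ (~B[(x + 1) % 5][y] & B[(x + 2) % 5][y]);
    //
    // The ld1r of the round constant and the final eor into v0 are iota.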
4084 __ rax1(v26, __ T2D, v26, v28); 4085 __ rax1(v28, __ T2D, v28, v25); 4086 __ rax1(v25, __ T2D, v25, v27); 4087 __ rax1(v27, __ T2D, v27, v29); 4088 4089 __ eor(v0, __ T16B, v0, v30); 4090 __ xar(v29, __ T2D, v1, v25, (64 - 1)); 4091 __ xar(v1, __ T2D, v6, v25, (64 - 44)); 4092 __ xar(v6, __ T2D, v9, v28, (64 - 20)); 4093 __ xar(v9, __ T2D, v22, v26, (64 - 61)); 4094 __ xar(v22, __ T2D, v14, v28, (64 - 39)); 4095 __ xar(v14, __ T2D, v20, v30, (64 - 18)); 4096 __ xar(v31, __ T2D, v2, v26, (64 - 62)); 4097 __ xar(v2, __ T2D, v12, v26, (64 - 43)); 4098 __ xar(v12, __ T2D, v13, v27, (64 - 25)); 4099 __ xar(v13, __ T2D, v19, v28, (64 - 8)); 4100 __ xar(v19, __ T2D, v23, v27, (64 - 56)); 4101 __ xar(v23, __ T2D, v15, v30, (64 - 41)); 4102 __ xar(v15, __ T2D, v4, v28, (64 - 27)); 4103 __ xar(v28, __ T2D, v24, v28, (64 - 14)); 4104 __ xar(v24, __ T2D, v21, v25, (64 - 2)); 4105 __ xar(v8, __ T2D, v8, v27, (64 - 55)); 4106 __ xar(v4, __ T2D, v16, v25, (64 - 45)); 4107 __ xar(v16, __ T2D, v5, v30, (64 - 36)); 4108 __ xar(v5, __ T2D, v3, v27, (64 - 28)); 4109 __ xar(v27, __ T2D, v18, v27, (64 - 21)); 4110 __ xar(v3, __ T2D, v17, v26, (64 - 15)); 4111 __ xar(v25, __ T2D, v11, v25, (64 - 10)); 4112 __ xar(v26, __ T2D, v7, v26, (64 - 6)); 4113 __ xar(v30, __ T2D, v10, v30, (64 - 3)); 4114 4115 __ bcax(v20, __ T16B, v31, v22, v8); 4116 __ bcax(v21, __ T16B, v8, v23, v22); 4117 __ bcax(v22, __ T16B, v22, v24, v23); 4118 __ bcax(v23, __ T16B, v23, v31, v24); 4119 __ bcax(v24, __ T16B, v24, v8, v31); 4120 4121 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); 4122 4123 __ bcax(v17, __ T16B, v25, v19, v3); 4124 __ bcax(v18, __ T16B, v3, v15, v19); 4125 __ bcax(v19, __ T16B, v19, v16, v15); 4126 __ bcax(v15, __ T16B, v15, v25, v16); 4127 __ bcax(v16, __ T16B, v16, v3, v25); 4128 4129 __ bcax(v10, __ T16B, v29, v12, v26); 4130 __ bcax(v11, __ T16B, v26, v13, v12); 4131 __ bcax(v12, __ T16B, v12, v14, v13); 4132 __ bcax(v13, __ T16B, v13, v29, v14); 4133 __ bcax(v14, __ T16B, v14, v26, v29); 4134 4135 __ bcax(v7, __ T16B, v30, v9, v4); 4136 __ bcax(v8, __ T16B, v4, v5, v9); 4137 __ bcax(v9, __ T16B, v9, v6, v5); 4138 __ bcax(v5, __ T16B, v5, v30, v6); 4139 __ bcax(v6, __ T16B, v6, v4, v30); 4140 4141 __ bcax(v3, __ T16B, v27, v0, v28); 4142 __ bcax(v4, __ T16B, v28, v1, v0); 4143 __ bcax(v0, __ T16B, v0, v2, v1); 4144 __ bcax(v1, __ T16B, v1, v27, v2); 4145 __ bcax(v2, __ T16B, v2, v28, v27); 4146 4147 __ eor(v0, __ T16B, v0, v31); 4148 4149 __ cbnzw(rscratch2, rounds24_loop); 4150 4151 if (multi_block) { 4152 __ add(ofs, ofs, block_size); 4153 __ cmp(ofs, limit); 4154 __ br(Assembler::LE, sha3_loop); 4155 __ mov(c_rarg0, ofs); // return ofs 4156 } 4157 4158 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4159 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4160 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4161 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4162 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4163 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4164 __ st1(v24, __ T1D, state); 4165 4166 __ ldpd(v14, v15, Address(sp, 48)); 4167 __ ldpd(v12, v13, Address(sp, 32)); 4168 __ ldpd(v10, v11, Address(sp, 16)); 4169 __ ldpd(v8, v9, __ post(sp, 64)); 4170 4171 __ ret(lr); 4172 4173 return start; 4174 } 4175 4176 /** 4177 * Arguments: 4178 * 4179 * Inputs: 4180 * c_rarg0 - int crc 4181 * c_rarg1 - byte* buf 4182 * c_rarg2 - int length 4183 * 4184 * Output: 4185 * rax - int crc result 4186 */ 4187 address generate_updateBytesCRC32() { 4188 assert(UseCRC32Intrinsics, 
"what are we doing here?"); 4189 4190 __ align(CodeEntryAlignment); 4191 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4192 4193 address start = __ pc(); 4194 4195 const Register crc = c_rarg0; // crc 4196 const Register buf = c_rarg1; // source java byte array address 4197 const Register len = c_rarg2; // length 4198 const Register table0 = c_rarg3; // crc_table address 4199 const Register table1 = c_rarg4; 4200 const Register table2 = c_rarg5; 4201 const Register table3 = c_rarg6; 4202 const Register tmp3 = c_rarg7; 4203 4204 BLOCK_COMMENT("Entry:"); 4205 __ enter(); // required for proper stackwalking of RuntimeStub frame 4206 4207 __ kernel_crc32(crc, buf, len, 4208 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4209 4210 __ leave(); // required for proper stackwalking of RuntimeStub frame 4211 __ ret(lr); 4212 4213 return start; 4214 } 4215 4216 // ChaCha20 block function. This version parallelizes by loading 4217 // individual 32-bit state elements into vectors for four blocks 4218 // (e.g. all four blocks' worth of state[0] in one register, etc.) 4219 // 4220 // state (int[16]) = c_rarg0 4221 // keystream (byte[1024]) = c_rarg1 4222 // return - number of bytes of keystream (always 256) 4223 address generate_chacha20Block_blockpar() { 4224 Label L_twoRounds, L_cc20_const; 4225 // The constant data is broken into two 128-bit segments to be loaded 4226 // onto FloatRegisters. The first 128 bits are a counter add overlay 4227 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4228 // The second 128-bits is a table constant used for 8-bit left rotations. 4229 __ BIND(L_cc20_const); 4230 __ emit_int64(0x0000000100000000UL); 4231 __ emit_int64(0x0000000300000002UL); 4232 __ emit_int64(0x0605040702010003UL); 4233 __ emit_int64(0x0E0D0C0F0A09080BUL); 4234 4235 __ align(CodeEntryAlignment); 4236 StubCodeMark mark(this, "StubRoutines", "chacha20Block"); 4237 address start = __ pc(); 4238 __ enter(); 4239 4240 int i, j; 4241 const Register state = c_rarg0; 4242 const Register keystream = c_rarg1; 4243 const Register loopCtr = r10; 4244 const Register tmpAddr = r11; 4245 4246 const FloatRegister stateFirst = v0; 4247 const FloatRegister stateSecond = v1; 4248 const FloatRegister stateThird = v2; 4249 const FloatRegister stateFourth = v3; 4250 const FloatRegister origCtrState = v28; 4251 const FloatRegister scratch = v29; 4252 const FloatRegister lrot8Tbl = v30; 4253 4254 // Organize SIMD registers in an array that facilitates 4255 // putting repetitive opcodes into loop structures. It is 4256 // important that each grouping of 4 registers is monotonically 4257 // increasing to support the requirements of multi-register 4258 // instructions (e.g. ld4r, st4, etc.) 4259 const FloatRegister workSt[16] = { 4260 v4, v5, v6, v7, v16, v17, v18, v19, 4261 v20, v21, v22, v23, v24, v25, v26, v27 4262 }; 4263 4264 // Load from memory and interlace across 16 SIMD registers, 4265 // With each word from memory being broadcast to all lanes of 4266 // each successive SIMD register. 4267 // Addr(0) -> All lanes in workSt[i] 4268 // Addr(4) -> All lanes workSt[i + 1], etc. 4269 __ mov(tmpAddr, state); 4270 for (i = 0; i < 16; i += 4) { 4271 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4272 __ post(tmpAddr, 16)); 4273 } 4274 4275 // Pull in constant data. The first 16 bytes are the add overlay 4276 // which is applied to the vector holding the counter (state[12]). 
4277 // The second 16 bytes is the index register for the 8-bit left 4278 // rotation tbl instruction. 4279 __ adr(tmpAddr, L_cc20_const); 4280 __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr)); 4281 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); 4282 4283 // Set up the 10 iteration loop and perform all 8 quarter round ops 4284 __ mov(loopCtr, 10); 4285 __ BIND(L_twoRounds); 4286 4287 __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12], 4288 scratch, lrot8Tbl); 4289 __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13], 4290 scratch, lrot8Tbl); 4291 __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14], 4292 scratch, lrot8Tbl); 4293 __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15], 4294 scratch, lrot8Tbl); 4295 4296 __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15], 4297 scratch, lrot8Tbl); 4298 __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12], 4299 scratch, lrot8Tbl); 4300 __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13], 4301 scratch, lrot8Tbl); 4302 __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14], 4303 scratch, lrot8Tbl); 4304 4305 // Decrement and iterate 4306 __ sub(loopCtr, loopCtr, 1); 4307 __ cbnz(loopCtr, L_twoRounds); 4308 4309 __ mov(tmpAddr, state); 4310 4311 // Add the starting state back to the post-loop keystream 4312 // state. We read/interlace the state array from memory into 4313 // 4 registers similar to what we did in the beginning. Then 4314 // add the counter overlay onto workSt[12] at the end. 4315 for (i = 0; i < 16; i += 4) { 4316 __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S, 4317 __ post(tmpAddr, 16)); 4318 __ addv(workSt[i], __ T4S, workSt[i], stateFirst); 4319 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond); 4320 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird); 4321 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth); 4322 } 4323 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask 4324 4325 // Write to key stream, storing the same element out of workSt[0..15] 4326 // to consecutive 4-byte offsets in the key stream buffer, then repeating 4327 // for the next element position. 
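    // Reference (not generated code): each cc20_quarter_round call above is
    // the standard ChaCha20 quarter round on 32-bit words a, b, c, d,
    //
    //   a += b; d ^= a; d = rotl32(d, 16);
    //   c += d; b ^= c; b = rotl32(b, 12);
    //   a += b; d ^= a; d = rotl32(d, 8);    // 8-bit rotate done via tbl
    //   c += d; b ^= c; b = rotl32(b, 7);
    //
    // applied to four blocks at once, one block per 32-bit lane.  The st4
    // loop below therefore de-interleaves the lanes; it is equivalent to
    //
    //   uint32_t *ks = (uint32_t *)keystream;
    //   for (int lane = 0; lane < 4; lane++)       // one 64-byte block per lane
    //     for (int word = 0; word < 16; word++)
    //       *ks++ = lane_of(workSt[word], lane);   // lane_of() is illustrative
    //
    // leaving four consecutive ChaCha20 blocks in natural word order.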
4328 for (i = 0; i < 4; i++) { 4329 for (j = 0; j < 16; j += 4) { 4330 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4331 __ post(keystream, 16)); 4332 } 4333 } 4334 4335 __ mov(r0, 256); // Return length of output keystream 4336 __ leave(); 4337 __ ret(lr); 4338 4339 return start; 4340 } 4341 4342 /** 4343 * Arguments: 4344 * 4345 * Inputs: 4346 * c_rarg0 - int crc 4347 * c_rarg1 - byte* buf 4348 * c_rarg2 - int length 4349 * c_rarg3 - int* table 4350 * 4351 * Output: 4352 * r0 - int crc result 4353 */ 4354 address generate_updateBytesCRC32C() { 4355 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4356 4357 __ align(CodeEntryAlignment); 4358 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4359 4360 address start = __ pc(); 4361 4362 const Register crc = c_rarg0; // crc 4363 const Register buf = c_rarg1; // source java byte array address 4364 const Register len = c_rarg2; // length 4365 const Register table0 = c_rarg3; // crc_table address 4366 const Register table1 = c_rarg4; 4367 const Register table2 = c_rarg5; 4368 const Register table3 = c_rarg6; 4369 const Register tmp3 = c_rarg7; 4370 4371 BLOCK_COMMENT("Entry:"); 4372 __ enter(); // required for proper stackwalking of RuntimeStub frame 4373 4374 __ kernel_crc32c(crc, buf, len, 4375 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4376 4377 __ leave(); // required for proper stackwalking of RuntimeStub frame 4378 __ ret(lr); 4379 4380 return start; 4381 } 4382 4383 /*** 4384 * Arguments: 4385 * 4386 * Inputs: 4387 * c_rarg0 - int adler 4388 * c_rarg1 - byte* buff 4389 * c_rarg2 - int len 4390 * 4391 * Output: 4392 * c_rarg0 - int adler result 4393 */ 4394 address generate_updateBytesAdler32() { 4395 __ align(CodeEntryAlignment); 4396 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4397 address start = __ pc(); 4398 4399 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4400 4401 // Aliases 4402 Register adler = c_rarg0; 4403 Register s1 = c_rarg0; 4404 Register s2 = c_rarg3; 4405 Register buff = c_rarg1; 4406 Register len = c_rarg2; 4407 Register nmax = r4; 4408 Register base = r5; 4409 Register count = r6; 4410 Register temp0 = rscratch1; 4411 Register temp1 = rscratch2; 4412 FloatRegister vbytes = v0; 4413 FloatRegister vs1acc = v1; 4414 FloatRegister vs2acc = v2; 4415 FloatRegister vtable = v3; 4416 4417 // Max number of bytes we can process before having to take the mod 4418 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4419 uint64_t BASE = 0xfff1; 4420 uint64_t NMAX = 0x15B0; 4421 4422 __ mov(base, BASE); 4423 __ mov(nmax, NMAX); 4424 4425 // Load accumulation coefficients for the upper 16 bits 4426 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4427 __ ld1(vtable, __ T16B, Address(temp0)); 4428 4429 // s1 is initialized to the lower 16 bits of adler 4430 // s2 is initialized to the upper 16 bits of adler 4431 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4432 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4433 4434 // The pipelined loop needs at least 16 elements for 1 iteration 4435 // It does check this, but it is more effective to skip to the cleanup loop 4436 __ cmp(len, (u1)16); 4437 __ br(Assembler::HS, L_nmax); 4438 __ cbz(len, L_combine); 4439 4440 __ bind(L_simple_by1_loop); 4441 __ ldrb(temp0, Address(__ post(buff, 1))); 4442 __ add(s1, s1, temp0); 4443 __ add(s2, s2, s1); 4444 __ subs(len, len, 
1); 4445 __ br(Assembler::HI, L_simple_by1_loop); 4446 4447 // s1 = s1 % BASE 4448 __ subs(temp0, s1, base); 4449 __ csel(s1, temp0, s1, Assembler::HS); 4450 4451 // s2 = s2 % BASE 4452 __ lsr(temp0, s2, 16); 4453 __ lsl(temp1, temp0, 4); 4454 __ sub(temp1, temp1, temp0); 4455 __ add(s2, temp1, s2, ext::uxth); 4456 4457 __ subs(temp0, s2, base); 4458 __ csel(s2, temp0, s2, Assembler::HS); 4459 4460 __ b(L_combine); 4461 4462 __ bind(L_nmax); 4463 __ subs(len, len, nmax); 4464 __ sub(count, nmax, 16); 4465 __ br(Assembler::LO, L_by16); 4466 4467 __ bind(L_nmax_loop); 4468 4469 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4470 vbytes, vs1acc, vs2acc, vtable); 4471 4472 __ subs(count, count, 16); 4473 __ br(Assembler::HS, L_nmax_loop); 4474 4475 // s1 = s1 % BASE 4476 __ lsr(temp0, s1, 16); 4477 __ lsl(temp1, temp0, 4); 4478 __ sub(temp1, temp1, temp0); 4479 __ add(temp1, temp1, s1, ext::uxth); 4480 4481 __ lsr(temp0, temp1, 16); 4482 __ lsl(s1, temp0, 4); 4483 __ sub(s1, s1, temp0); 4484 __ add(s1, s1, temp1, ext:: uxth); 4485 4486 __ subs(temp0, s1, base); 4487 __ csel(s1, temp0, s1, Assembler::HS); 4488 4489 // s2 = s2 % BASE 4490 __ lsr(temp0, s2, 16); 4491 __ lsl(temp1, temp0, 4); 4492 __ sub(temp1, temp1, temp0); 4493 __ add(temp1, temp1, s2, ext::uxth); 4494 4495 __ lsr(temp0, temp1, 16); 4496 __ lsl(s2, temp0, 4); 4497 __ sub(s2, s2, temp0); 4498 __ add(s2, s2, temp1, ext:: uxth); 4499 4500 __ subs(temp0, s2, base); 4501 __ csel(s2, temp0, s2, Assembler::HS); 4502 4503 __ subs(len, len, nmax); 4504 __ sub(count, nmax, 16); 4505 __ br(Assembler::HS, L_nmax_loop); 4506 4507 __ bind(L_by16); 4508 __ adds(len, len, count); 4509 __ br(Assembler::LO, L_by1); 4510 4511 __ bind(L_by16_loop); 4512 4513 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4514 vbytes, vs1acc, vs2acc, vtable); 4515 4516 __ subs(len, len, 16); 4517 __ br(Assembler::HS, L_by16_loop); 4518 4519 __ bind(L_by1); 4520 __ adds(len, len, 15); 4521 __ br(Assembler::LO, L_do_mod); 4522 4523 __ bind(L_by1_loop); 4524 __ ldrb(temp0, Address(__ post(buff, 1))); 4525 __ add(s1, temp0, s1); 4526 __ add(s2, s2, s1); 4527 __ subs(len, len, 1); 4528 __ br(Assembler::HS, L_by1_loop); 4529 4530 __ bind(L_do_mod); 4531 // s1 = s1 % BASE 4532 __ lsr(temp0, s1, 16); 4533 __ lsl(temp1, temp0, 4); 4534 __ sub(temp1, temp1, temp0); 4535 __ add(temp1, temp1, s1, ext::uxth); 4536 4537 __ lsr(temp0, temp1, 16); 4538 __ lsl(s1, temp0, 4); 4539 __ sub(s1, s1, temp0); 4540 __ add(s1, s1, temp1, ext:: uxth); 4541 4542 __ subs(temp0, s1, base); 4543 __ csel(s1, temp0, s1, Assembler::HS); 4544 4545 // s2 = s2 % BASE 4546 __ lsr(temp0, s2, 16); 4547 __ lsl(temp1, temp0, 4); 4548 __ sub(temp1, temp1, temp0); 4549 __ add(temp1, temp1, s2, ext::uxth); 4550 4551 __ lsr(temp0, temp1, 16); 4552 __ lsl(s2, temp0, 4); 4553 __ sub(s2, s2, temp0); 4554 __ add(s2, s2, temp1, ext:: uxth); 4555 4556 __ subs(temp0, s2, base); 4557 __ csel(s2, temp0, s2, Assembler::HS); 4558 4559 // Combine lower bits and higher bits 4560 __ bind(L_combine); 4561 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 4562 4563 __ ret(lr); 4564 4565 return start; 4566 } 4567 4568 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 4569 Register temp0, Register temp1, FloatRegister vbytes, 4570 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 4571 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 
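  // (Reference note for the Adler-32 routine above, not generated code: the
  // "s1 = s1 % BASE" / "s2 = s2 % BASE" sequences avoid a division by using
  // BASE = 65521 = 2^16 - 15, so for x < 2^32
  //
  //   x = (x >> 16) * 15 + (x & 0xffff);   // preserves x mod 65521
  //
  // is applied twice -- the lsr/lsl/sub/add groups -- followed by a single
  // conditional subtract of BASE, the subs/csel pair.  NMAX = 5552 is the
  // largest block length for which the unreduced sums cannot overflow
  // 32 bits.)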
4572 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 4573 // In non-vectorized code, we update s1 and s2 as: 4574 // s1 <- s1 + b1 4575 // s2 <- s2 + s1 4576 // s1 <- s1 + b2 4577 // s2 <- s2 + b1 4578 // ... 4579 // s1 <- s1 + b16 4580 // s2 <- s2 + s1 4581 // Putting above assignments together, we have: 4582 // s1_new = s1 + b1 + b2 + ... + b16 4583 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 4584 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 4585 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 4586 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 4587 4588 // s2 = s2 + s1 * 16 4589 __ add(s2, s2, s1, Assembler::LSL, 4); 4590 4591 // vs1acc = b1 + b2 + b3 + ... + b16 4592 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1) 4593 __ umullv(vs2acc, __ T8B, vtable, vbytes); 4594 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 4595 __ uaddlv(vs1acc, __ T16B, vbytes); 4596 __ uaddlv(vs2acc, __ T8H, vs2acc); 4597 4598 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 4599 __ fmovd(temp0, vs1acc); 4600 __ fmovd(temp1, vs2acc); 4601 __ add(s1, s1, temp0); 4602 __ add(s2, s2, temp1); 4603 } 4604 4605 /** 4606 * Arguments: 4607 * 4608 * Input: 4609 * c_rarg0 - x address 4610 * c_rarg1 - x length 4611 * c_rarg2 - y address 4612 * c_rarg3 - y length 4613 * c_rarg4 - z address 4614 * c_rarg5 - z length 4615 */ 4616 address generate_multiplyToLen() { 4617 __ align(CodeEntryAlignment); 4618 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 4619 4620 address start = __ pc(); 4621 const Register x = r0; 4622 const Register xlen = r1; 4623 const Register y = r2; 4624 const Register ylen = r3; 4625 const Register z = r4; 4626 const Register zlen = r5; 4627 4628 const Register tmp1 = r10; 4629 const Register tmp2 = r11; 4630 const Register tmp3 = r12; 4631 const Register tmp4 = r13; 4632 const Register tmp5 = r14; 4633 const Register tmp6 = r15; 4634 const Register tmp7 = r16; 4635 4636 BLOCK_COMMENT("Entry:"); 4637 __ enter(); // required for proper stackwalking of RuntimeStub frame 4638 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4639 __ leave(); // required for proper stackwalking of RuntimeStub frame 4640 __ ret(lr); 4641 4642 return start; 4643 } 4644 4645 address generate_squareToLen() { 4646 // squareToLen algorithm for sizes 1..127 described in java code works 4647 // faster than multiply_to_len on some CPUs and slower on others, but 4648 // multiply_to_len shows a bit better overall results 4649 __ align(CodeEntryAlignment); 4650 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 4651 address start = __ pc(); 4652 4653 const Register x = r0; 4654 const Register xlen = r1; 4655 const Register z = r2; 4656 const Register zlen = r3; 4657 const Register y = r4; // == x 4658 const Register ylen = r5; // == xlen 4659 4660 const Register tmp1 = r10; 4661 const Register tmp2 = r11; 4662 const Register tmp3 = r12; 4663 const Register tmp4 = r13; 4664 const Register tmp5 = r14; 4665 const Register tmp6 = r15; 4666 const Register tmp7 = r16; 4667 4668 RegSet spilled_regs = RegSet::of(y, ylen); 4669 BLOCK_COMMENT("Entry:"); 4670 __ enter(); 4671 __ push(spilled_regs, sp); 4672 __ mov(y, x); 4673 __ mov(ylen, xlen); 4674 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4675 __ pop(spilled_regs, sp); 4676 __ leave(); 4677 __ ret(lr); 4678 return start; 4679 } 4680 4681 address generate_mulAdd() { 4682 __ align(CodeEntryAlignment); 4683 
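  // Reference (not generated code): this stub backs the BigInteger.implMulAdd
  // intrinsic, which multiplies a word array by one 32-bit value and
  // accumulates the product into out[], returning the final carry.  Roughly:
  //
  //   uint64_t kl = k, carry = 0;
  //   for (int j = len - 1; j >= 0; j--) {
  //     uint64_t p = (uint64_t)in[j] * kl + out[off] + carry;
  //     out[off--] = (uint32_t)p;
  //     carry = p >> 32;
  //   }
  //   return (uint32_t)carry;
  //
  // (Indexing shown as in the Java code, with off already adjusted to the
  // write position; the actual loop lives in MacroAssembler::mul_add.)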
StubCodeMark mark(this, "StubRoutines", "mulAdd"); 4684 4685 address start = __ pc(); 4686 4687 const Register out = r0; 4688 const Register in = r1; 4689 const Register offset = r2; 4690 const Register len = r3; 4691 const Register k = r4; 4692 4693 BLOCK_COMMENT("Entry:"); 4694 __ enter(); 4695 __ mul_add(out, in, offset, len, k); 4696 __ leave(); 4697 __ ret(lr); 4698 4699 return start; 4700 } 4701 4702 // Arguments: 4703 // 4704 // Input: 4705 // c_rarg0 - newArr address 4706 // c_rarg1 - oldArr address 4707 // c_rarg2 - newIdx 4708 // c_rarg3 - shiftCount 4709 // c_rarg4 - numIter 4710 // 4711 address generate_bigIntegerRightShift() { 4712 __ align(CodeEntryAlignment); 4713 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 4714 address start = __ pc(); 4715 4716 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4717 4718 Register newArr = c_rarg0; 4719 Register oldArr = c_rarg1; 4720 Register newIdx = c_rarg2; 4721 Register shiftCount = c_rarg3; 4722 Register numIter = c_rarg4; 4723 Register idx = numIter; 4724 4725 Register newArrCur = rscratch1; 4726 Register shiftRevCount = rscratch2; 4727 Register oldArrCur = r13; 4728 Register oldArrNext = r14; 4729 4730 FloatRegister oldElem0 = v0; 4731 FloatRegister oldElem1 = v1; 4732 FloatRegister newElem = v2; 4733 FloatRegister shiftVCount = v3; 4734 FloatRegister shiftVRevCount = v4; 4735 4736 __ cbz(idx, Exit); 4737 4738 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4739 4740 // left shift count 4741 __ movw(shiftRevCount, 32); 4742 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4743 4744 // numIter too small to allow a 4-words SIMD loop, rolling back 4745 __ cmp(numIter, (u1)4); 4746 __ br(Assembler::LT, ShiftThree); 4747 4748 __ dup(shiftVCount, __ T4S, shiftCount); 4749 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4750 __ negr(shiftVCount, __ T4S, shiftVCount); 4751 4752 __ BIND(ShiftSIMDLoop); 4753 4754 // Calculate the load addresses 4755 __ sub(idx, idx, 4); 4756 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4757 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4758 __ add(oldArrCur, oldArrNext, 4); 4759 4760 // Load 4 words and process 4761 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4762 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4763 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4764 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4765 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4766 __ st1(newElem, __ T4S, Address(newArrCur)); 4767 4768 __ cmp(idx, (u1)4); 4769 __ br(Assembler::LT, ShiftTwoLoop); 4770 __ b(ShiftSIMDLoop); 4771 4772 __ BIND(ShiftTwoLoop); 4773 __ cbz(idx, Exit); 4774 __ cmp(idx, (u1)1); 4775 __ br(Assembler::EQ, ShiftOne); 4776 4777 // Calculate the load addresses 4778 __ sub(idx, idx, 2); 4779 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4780 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4781 __ add(oldArrCur, oldArrNext, 4); 4782 4783 // Load 2 words and process 4784 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 4785 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4786 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4787 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4788 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4789 __ st1(newElem, __ T2S, Address(newArrCur)); 4790 __ b(ShiftTwoLoop); 4791 4792 __ BIND(ShiftThree); 4793 __ tbz(idx, 1, ShiftOne); 4794 __ tbz(idx, 0, ShiftTwo); 4795 __ ldrw(r10, Address(oldArr, 12)); 4796 __ ldrw(r11, Address(oldArr, 8)); 4797 __ lsrvw(r10, r10, shiftCount); 4798 __ lslvw(r11, 
r11, shiftRevCount); 4799 __ orrw(r12, r10, r11); 4800 __ strw(r12, Address(newArr, 8)); 4801 4802 __ BIND(ShiftTwo); 4803 __ ldrw(r10, Address(oldArr, 8)); 4804 __ ldrw(r11, Address(oldArr, 4)); 4805 __ lsrvw(r10, r10, shiftCount); 4806 __ lslvw(r11, r11, shiftRevCount); 4807 __ orrw(r12, r10, r11); 4808 __ strw(r12, Address(newArr, 4)); 4809 4810 __ BIND(ShiftOne); 4811 __ ldrw(r10, Address(oldArr, 4)); 4812 __ ldrw(r11, Address(oldArr)); 4813 __ lsrvw(r10, r10, shiftCount); 4814 __ lslvw(r11, r11, shiftRevCount); 4815 __ orrw(r12, r10, r11); 4816 __ strw(r12, Address(newArr)); 4817 4818 __ BIND(Exit); 4819 __ ret(lr); 4820 4821 return start; 4822 } 4823 4824 // Arguments: 4825 // 4826 // Input: 4827 // c_rarg0 - newArr address 4828 // c_rarg1 - oldArr address 4829 // c_rarg2 - newIdx 4830 // c_rarg3 - shiftCount 4831 // c_rarg4 - numIter 4832 // 4833 address generate_bigIntegerLeftShift() { 4834 __ align(CodeEntryAlignment); 4835 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 4836 address start = __ pc(); 4837 4838 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4839 4840 Register newArr = c_rarg0; 4841 Register oldArr = c_rarg1; 4842 Register newIdx = c_rarg2; 4843 Register shiftCount = c_rarg3; 4844 Register numIter = c_rarg4; 4845 4846 Register shiftRevCount = rscratch1; 4847 Register oldArrNext = rscratch2; 4848 4849 FloatRegister oldElem0 = v0; 4850 FloatRegister oldElem1 = v1; 4851 FloatRegister newElem = v2; 4852 FloatRegister shiftVCount = v3; 4853 FloatRegister shiftVRevCount = v4; 4854 4855 __ cbz(numIter, Exit); 4856 4857 __ add(oldArrNext, oldArr, 4); 4858 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4859 4860 // right shift count 4861 __ movw(shiftRevCount, 32); 4862 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4863 4864 // numIter too small to allow a 4-words SIMD loop, rolling back 4865 __ cmp(numIter, (u1)4); 4866 __ br(Assembler::LT, ShiftThree); 4867 4868 __ dup(shiftVCount, __ T4S, shiftCount); 4869 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4870 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 4871 4872 __ BIND(ShiftSIMDLoop); 4873 4874 // load 4 words and process 4875 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 4876 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 4877 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4878 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4879 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4880 __ st1(newElem, __ T4S, __ post(newArr, 16)); 4881 __ sub(numIter, numIter, 4); 4882 4883 __ cmp(numIter, (u1)4); 4884 __ br(Assembler::LT, ShiftTwoLoop); 4885 __ b(ShiftSIMDLoop); 4886 4887 __ BIND(ShiftTwoLoop); 4888 __ cbz(numIter, Exit); 4889 __ cmp(numIter, (u1)1); 4890 __ br(Assembler::EQ, ShiftOne); 4891 4892 // load 2 words and process 4893 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 4894 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 4895 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4896 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4897 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4898 __ st1(newElem, __ T2S, __ post(newArr, 8)); 4899 __ sub(numIter, numIter, 2); 4900 __ b(ShiftTwoLoop); 4901 4902 __ BIND(ShiftThree); 4903 __ ldrw(r10, __ post(oldArr, 4)); 4904 __ ldrw(r11, __ post(oldArrNext, 4)); 4905 __ lslvw(r10, r10, shiftCount); 4906 __ lsrvw(r11, r11, shiftRevCount); 4907 __ orrw(r12, r10, r11); 4908 __ strw(r12, __ post(newArr, 4)); 4909 __ tbz(numIter, 1, Exit); 4910 __ tbz(numIter, 0, ShiftOne); 4911 4912 __ BIND(ShiftTwo); 4913 __ 
ldrw(r10, __ post(oldArr, 4)); 4914 __ ldrw(r11, __ post(oldArrNext, 4)); 4915 __ lslvw(r10, r10, shiftCount); 4916 __ lsrvw(r11, r11, shiftRevCount); 4917 __ orrw(r12, r10, r11); 4918 __ strw(r12, __ post(newArr, 4)); 4919 4920 __ BIND(ShiftOne); 4921 __ ldrw(r10, Address(oldArr)); 4922 __ ldrw(r11, Address(oldArrNext)); 4923 __ lslvw(r10, r10, shiftCount); 4924 __ lsrvw(r11, r11, shiftRevCount); 4925 __ orrw(r12, r10, r11); 4926 __ strw(r12, Address(newArr)); 4927 4928 __ BIND(Exit); 4929 __ ret(lr); 4930 4931 return start; 4932 } 4933 4934 address generate_count_positives(address &count_positives_long) { 4935 const u1 large_loop_size = 64; 4936 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 4937 int dcache_line = VM_Version::dcache_line_size(); 4938 4939 Register ary1 = r1, len = r2, result = r0; 4940 4941 __ align(CodeEntryAlignment); 4942 4943 StubCodeMark mark(this, "StubRoutines", "count_positives"); 4944 4945 address entry = __ pc(); 4946 4947 __ enter(); 4948 // precondition: a copy of len is already in result 4949 // __ mov(result, len); 4950 4951 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 4952 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 4953 4954 __ cmp(len, (u1)15); 4955 __ br(Assembler::GT, LEN_OVER_15); 4956 // The only case when execution falls into this code is when pointer is near 4957 // the end of memory page and we have to avoid reading next page 4958 __ add(ary1, ary1, len); 4959 __ subs(len, len, 8); 4960 __ br(Assembler::GT, LEN_OVER_8); 4961 __ ldr(rscratch2, Address(ary1, -8)); 4962 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 4963 __ lsrv(rscratch2, rscratch2, rscratch1); 4964 __ tst(rscratch2, UPPER_BIT_MASK); 4965 __ csel(result, zr, result, Assembler::NE); 4966 __ leave(); 4967 __ ret(lr); 4968 __ bind(LEN_OVER_8); 4969 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 4970 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 4971 __ tst(rscratch2, UPPER_BIT_MASK); 4972 __ br(Assembler::NE, RET_NO_POP); 4973 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 4974 __ lsrv(rscratch1, rscratch1, rscratch2); 4975 __ tst(rscratch1, UPPER_BIT_MASK); 4976 __ bind(RET_NO_POP); 4977 __ csel(result, zr, result, Assembler::NE); 4978 __ leave(); 4979 __ ret(lr); 4980 4981 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 4982 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 4983 4984 count_positives_long = __ pc(); // 2nd entry point 4985 4986 __ enter(); 4987 4988 __ bind(LEN_OVER_15); 4989 __ push(spilled_regs, sp); 4990 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 4991 __ cbz(rscratch2, ALIGNED); 4992 __ ldp(tmp6, tmp1, Address(ary1)); 4993 __ mov(tmp5, 16); 4994 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 4995 __ add(ary1, ary1, rscratch1); 4996 __ orr(tmp6, tmp6, tmp1); 4997 __ tst(tmp6, UPPER_BIT_MASK); 4998 __ br(Assembler::NE, RET_ADJUST); 4999 __ sub(len, len, rscratch1); 5000 5001 __ bind(ALIGNED); 5002 __ cmp(len, large_loop_size); 5003 __ br(Assembler::LT, CHECK_16); 5004 // Perform 16-byte load as early return in pre-loop to handle situation 5005 // when initially aligned large array has negative values at starting bytes, 5006 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 5007 // slower. Cases with negative bytes further ahead won't be affected that 5008 // much. 
In fact, it'll be faster due to early loads, less instructions and 5009 // less branches in LARGE_LOOP. 5010 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 5011 __ sub(len, len, 16); 5012 __ orr(tmp6, tmp6, tmp1); 5013 __ tst(tmp6, UPPER_BIT_MASK); 5014 __ br(Assembler::NE, RET_ADJUST_16); 5015 __ cmp(len, large_loop_size); 5016 __ br(Assembler::LT, CHECK_16); 5017 5018 if (SoftwarePrefetchHintDistance >= 0 5019 && SoftwarePrefetchHintDistance >= dcache_line) { 5020 // initial prefetch 5021 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 5022 } 5023 __ bind(LARGE_LOOP); 5024 if (SoftwarePrefetchHintDistance >= 0) { 5025 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 5026 } 5027 // Issue load instructions first, since it can save few CPU/MEM cycles, also 5028 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 5029 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 5030 // instructions per cycle and have less branches, but this approach disables 5031 // early return, thus, all 64 bytes are loaded and checked every time. 5032 __ ldp(tmp2, tmp3, Address(ary1)); 5033 __ ldp(tmp4, tmp5, Address(ary1, 16)); 5034 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 5035 __ ldp(tmp6, tmp1, Address(ary1, 48)); 5036 __ add(ary1, ary1, large_loop_size); 5037 __ sub(len, len, large_loop_size); 5038 __ orr(tmp2, tmp2, tmp3); 5039 __ orr(tmp4, tmp4, tmp5); 5040 __ orr(rscratch1, rscratch1, rscratch2); 5041 __ orr(tmp6, tmp6, tmp1); 5042 __ orr(tmp2, tmp2, tmp4); 5043 __ orr(rscratch1, rscratch1, tmp6); 5044 __ orr(tmp2, tmp2, rscratch1); 5045 __ tst(tmp2, UPPER_BIT_MASK); 5046 __ br(Assembler::NE, RET_ADJUST_LONG); 5047 __ cmp(len, large_loop_size); 5048 __ br(Assembler::GE, LARGE_LOOP); 5049 5050 __ bind(CHECK_16); // small 16-byte load pre-loop 5051 __ cmp(len, (u1)16); 5052 __ br(Assembler::LT, POST_LOOP16); 5053 5054 __ bind(LOOP16); // small 16-byte load loop 5055 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 5056 __ sub(len, len, 16); 5057 __ orr(tmp2, tmp2, tmp3); 5058 __ tst(tmp2, UPPER_BIT_MASK); 5059 __ br(Assembler::NE, RET_ADJUST_16); 5060 __ cmp(len, (u1)16); 5061 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 5062 5063 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 5064 __ cmp(len, (u1)8); 5065 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 5066 __ ldr(tmp3, Address(__ post(ary1, 8))); 5067 __ tst(tmp3, UPPER_BIT_MASK); 5068 __ br(Assembler::NE, RET_ADJUST); 5069 __ sub(len, len, 8); 5070 5071 __ bind(POST_LOOP16_LOAD_TAIL); 5072 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 5073 __ ldr(tmp1, Address(ary1)); 5074 __ mov(tmp2, 64); 5075 __ sub(tmp4, tmp2, len, __ LSL, 3); 5076 __ lslv(tmp1, tmp1, tmp4); 5077 __ tst(tmp1, UPPER_BIT_MASK); 5078 __ br(Assembler::NE, RET_ADJUST); 5079 // Fallthrough 5080 5081 __ bind(RET_LEN); 5082 __ pop(spilled_regs, sp); 5083 __ leave(); 5084 __ ret(lr); 5085 5086 // difference result - len is the count of guaranteed to be 5087 // positive bytes 5088 5089 __ bind(RET_ADJUST_LONG); 5090 __ add(len, len, (u1)(large_loop_size - 16)); 5091 __ bind(RET_ADJUST_16); 5092 __ add(len, len, 16); 5093 __ bind(RET_ADJUST); 5094 __ pop(spilled_regs, sp); 5095 __ leave(); 5096 __ sub(result, result, len); 5097 __ ret(lr); 5098 5099 return entry; 5100 } 5101 5102 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 5103 bool usePrefetch, Label &NOT_EQUAL) { 5104 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5105 tmp2 = 
rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5106 tmp7 = r12, tmp8 = r13; 5107 Label LOOP; 5108 5109 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5110 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5111 __ bind(LOOP); 5112 if (usePrefetch) { 5113 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5114 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5115 } 5116 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5117 __ eor(tmp1, tmp1, tmp2); 5118 __ eor(tmp3, tmp3, tmp4); 5119 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5120 __ orr(tmp1, tmp1, tmp3); 5121 __ cbnz(tmp1, NOT_EQUAL); 5122 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5123 __ eor(tmp5, tmp5, tmp6); 5124 __ eor(tmp7, tmp7, tmp8); 5125 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5126 __ orr(tmp5, tmp5, tmp7); 5127 __ cbnz(tmp5, NOT_EQUAL); 5128 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5129 __ eor(tmp1, tmp1, tmp2); 5130 __ eor(tmp3, tmp3, tmp4); 5131 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5132 __ orr(tmp1, tmp1, tmp3); 5133 __ cbnz(tmp1, NOT_EQUAL); 5134 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5135 __ eor(tmp5, tmp5, tmp6); 5136 __ sub(cnt1, cnt1, 8 * wordSize); 5137 __ eor(tmp7, tmp7, tmp8); 5138 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5139 // tmp6 is not used. MacroAssembler::subs is used here (rather than 5140 // cmp) because subs allows an unlimited range of immediate operand. 5141 __ subs(tmp6, cnt1, loopThreshold); 5142 __ orr(tmp5, tmp5, tmp7); 5143 __ cbnz(tmp5, NOT_EQUAL); 5144 __ br(__ GE, LOOP); 5145 // post-loop 5146 __ eor(tmp1, tmp1, tmp2); 5147 __ eor(tmp3, tmp3, tmp4); 5148 __ orr(tmp1, tmp1, tmp3); 5149 __ sub(cnt1, cnt1, 2 * wordSize); 5150 __ cbnz(tmp1, NOT_EQUAL); 5151 } 5152 5153 void generate_large_array_equals_loop_simd(int loopThreshold, 5154 bool usePrefetch, Label &NOT_EQUAL) { 5155 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5156 tmp2 = rscratch2; 5157 Label LOOP; 5158 5159 __ bind(LOOP); 5160 if (usePrefetch) { 5161 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5162 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5163 } 5164 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 5165 __ sub(cnt1, cnt1, 8 * wordSize); 5166 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 5167 __ subs(tmp1, cnt1, loopThreshold); 5168 __ eor(v0, __ T16B, v0, v4); 5169 __ eor(v1, __ T16B, v1, v5); 5170 __ eor(v2, __ T16B, v2, v6); 5171 __ eor(v3, __ T16B, v3, v7); 5172 __ orr(v0, __ T16B, v0, v1); 5173 __ orr(v1, __ T16B, v2, v3); 5174 __ orr(v0, __ T16B, v0, v1); 5175 __ umov(tmp1, v0, __ D, 0); 5176 __ umov(tmp2, v0, __ D, 1); 5177 __ orr(tmp1, tmp1, tmp2); 5178 __ cbnz(tmp1, NOT_EQUAL); 5179 __ br(__ GE, LOOP); 5180 } 5181 5182 // a1 = r1 - array1 address 5183 // a2 = r2 - array2 address 5184 // result = r0 - return value. Already contains "false" 5185 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 5186 // r3-r5 are reserved temporary registers 5187 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 5188 address generate_large_array_equals() { 5189 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5190 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5191 tmp7 = r12, tmp8 = r13; 5192 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 5193 SMALL_LOOP, POST_LOOP; 5194 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 5195 // calculate if at least 32 prefetched bytes are used 5196 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 5197 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 5198 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 5199 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 5200 tmp5, tmp6, tmp7, tmp8); 5201 5202 __ align(CodeEntryAlignment); 5203 5204 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 5205 5206 address entry = __ pc(); 5207 __ enter(); 5208 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 5209 // also advance pointers to use post-increment instead of pre-increment 5210 __ add(a1, a1, wordSize); 5211 __ add(a2, a2, wordSize); 5212 if (AvoidUnalignedAccesses) { 5213 // both implementations (SIMD/nonSIMD) are using relatively large load 5214 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 5215 // on some CPUs in case of address is not at least 16-byte aligned. 5216 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 5217 // load if needed at least for 1st address and make if 16-byte aligned. 5218 Label ALIGNED16; 5219 __ tbz(a1, 3, ALIGNED16); 5220 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5221 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5222 __ sub(cnt1, cnt1, wordSize); 5223 __ eor(tmp1, tmp1, tmp2); 5224 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 5225 __ bind(ALIGNED16); 5226 } 5227 if (UseSIMDForArrayEquals) { 5228 if (SoftwarePrefetchHintDistance >= 0) { 5229 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5230 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5231 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 5232 /* prfm = */ true, NOT_EQUAL); 5233 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5234 __ br(__ LT, TAIL); 5235 } 5236 __ bind(NO_PREFETCH_LARGE_LOOP); 5237 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 5238 /* prfm = */ false, NOT_EQUAL); 5239 } else { 5240 __ push(spilled_regs, sp); 5241 if (SoftwarePrefetchHintDistance >= 0) { 5242 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5243 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5244 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 5245 /* prfm = */ true, NOT_EQUAL); 5246 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5247 __ br(__ LT, TAIL); 5248 } 5249 __ bind(NO_PREFETCH_LARGE_LOOP); 5250 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 5251 /* prfm = */ false, NOT_EQUAL); 5252 } 5253 __ bind(TAIL); 5254 __ cbz(cnt1, EQUAL); 5255 __ subs(cnt1, cnt1, wordSize); 5256 __ br(__ LE, POST_LOOP); 5257 __ bind(SMALL_LOOP); 5258 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5259 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5260 __ subs(cnt1, cnt1, wordSize); 5261 __ eor(tmp1, tmp1, tmp2); 5262 __ cbnz(tmp1, NOT_EQUAL); 5263 __ br(__ GT, SMALL_LOOP); 5264 __ bind(POST_LOOP); 5265 __ ldr(tmp1, Address(a1, cnt1)); 5266 __ ldr(tmp2, Address(a2, cnt1)); 5267 __ eor(tmp1, tmp1, tmp2); 5268 __ cbnz(tmp1, NOT_EQUAL); 5269 __ bind(EQUAL); 5270 __ mov(result, true); 5271 __ bind(NOT_EQUAL); 5272 if (!UseSIMDForArrayEquals) { 5273 __ pop(spilled_regs, sp); 5274 } 5275 __ bind(NOT_EQUAL_NO_POP); 5276 __ leave(); 5277 __ ret(lr); 5278 return entry; 5279 } 5280 5281 address generate_dsin_dcos(bool isCos) { 5282 __ align(CodeEntryAlignment); 5283 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 5284 address start = __ pc(); 5285 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5286 (address)StubRoutines::aarch64::_two_over_pi, 5287 (address)StubRoutines::aarch64::_pio2, 5288 (address)StubRoutines::aarch64::_dsin_coef, 5289 (address)StubRoutines::aarch64::_dcos_coef); 5290 return start; 5291 } 5292 5293 address generate_dlog() { 5294 __ align(CodeEntryAlignment); 5295 StubCodeMark mark(this, "StubRoutines", "dlog"); 5296 address entry = __ pc(); 5297 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 5298 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 5299 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 5300 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 5301 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 5302 return entry; 5303 } 5304 5305 5306 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5307 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5308 Label &DIFF2) { 5309 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5310 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5311 5312 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5313 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5314 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5315 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5316 5317 __ fmovd(tmpL, vtmp3); 5318 __ eor(rscratch2, tmp3, tmpL); 5319 __ cbnz(rscratch2, DIFF2); 5320 5321 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5322 __ umov(tmpL, vtmp3, __ D, 1); 5323 __ eor(rscratch2, tmpU, tmpL); 5324 __ cbnz(rscratch2, DIFF1); 5325 5326 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5327 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5328 __ fmovd(tmpL, vtmp); 5329 __ eor(rscratch2, tmp3, tmpL); 5330 __ cbnz(rscratch2, DIFF2); 5331 5332 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5333 __ umov(tmpL, vtmp, __ D, 1); 5334 __ eor(rscratch2, tmpU, tmpL); 5335 __ cbnz(rscratch2, DIFF1); 5336 } 5337 5338 // r0 = result 5339 // r1 = str1 5340 // r2 = cnt1 5341 // r3 = str2 5342 // r4 = cnt2 5343 // r10 = tmp1 5344 // r11 = tmp2 5345 address generate_compare_long_string_different_encoding(bool isLU) { 5346 __ align(CodeEntryAlignment); 5347 StubCodeMark mark(this, "StubRoutines", isLU 5348 ? "compare_long_string_different_encoding LU" 5349 : "compare_long_string_different_encoding UL"); 5350 address entry = __ pc(); 5351 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5352 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5353 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5354 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5355 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5356 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5357 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5358 5359 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5360 5361 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5362 // cnt2 == amount of characters left to compare 5363 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5364 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5365 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5366 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5367 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5368 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5369 __ eor(rscratch2, tmp1, tmp2); 5370 __ mov(rscratch1, tmp2); 5371 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5372 Register tmpU = isLU ? 
rscratch1 : tmp1, // where to keep U for comparison 5373 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5374 __ push(spilled_regs, sp); 5375 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5376 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 5377 5378 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5379 5380 if (SoftwarePrefetchHintDistance >= 0) { 5381 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5382 __ br(__ LT, NO_PREFETCH); 5383 __ bind(LARGE_LOOP_PREFETCH); 5384 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 5385 __ mov(tmp4, 2); 5386 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5387 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 5388 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5389 __ subs(tmp4, tmp4, 1); 5390 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 5391 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5392 __ mov(tmp4, 2); 5393 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 5394 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5395 __ subs(tmp4, tmp4, 1); 5396 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 5397 __ sub(cnt2, cnt2, 64); 5398 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5399 __ br(__ GE, LARGE_LOOP_PREFETCH); 5400 } 5401 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 5402 __ bind(NO_PREFETCH); 5403 __ subs(cnt2, cnt2, 16); 5404 __ br(__ LT, TAIL); 5405 __ align(OptoLoopAlignment); 5406 __ bind(SMALL_LOOP); // smaller loop 5407 __ subs(cnt2, cnt2, 16); 5408 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5409 __ br(__ GE, SMALL_LOOP); 5410 __ cmn(cnt2, (u1)16); 5411 __ br(__ EQ, LOAD_LAST); 5412 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 5413 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 5414 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 5415 __ ldr(tmp3, Address(cnt1, -8)); 5416 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 5417 __ b(LOAD_LAST); 5418 __ bind(DIFF2); 5419 __ mov(tmpU, tmp3); 5420 __ bind(DIFF1); 5421 __ pop(spilled_regs, sp); 5422 __ b(CALCULATE_DIFFERENCE); 5423 __ bind(LOAD_LAST); 5424 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 5425 // No need to load it again 5426 __ mov(tmpU, tmp3); 5427 __ pop(spilled_regs, sp); 5428 5429 // tmp2 points to the address of the last 4 Latin1 characters right now 5430 __ ldrs(vtmp, Address(tmp2)); 5431 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5432 __ fmovd(tmpL, vtmp); 5433 5434 __ eor(rscratch2, tmpU, tmpL); 5435 __ cbz(rscratch2, DONE); 5436 5437 // Find the first different characters in the longwords and 5438 // compute their difference. 
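    // For reference, a scalar sketch of the rev/clz technique used below
    // (illustrative only, not compiled; __builtin_bswap64/__builtin_clzll are
    // hypothetical stand-ins for the REV and CLZ instructions). Here w1 and w2
    // stand for the two mismatching 8-byte chunks of UTF-16 chars held in
    // tmp1 and rscratch1, whose XOR (rscratch2) is known to be non-zero:
    //
    //   uint64_t diff = __builtin_bswap64(w1 ^ w2); // rev: the first differing
    //                                               // char moves to the high bits
    //   int k = __builtin_clzll(diff) & ~15;        // clz, rounded down to a
    //                                               // 16-bit char boundary
    //   result = (uint16_t)(w1 >> k) - (uint16_t)(w2 >> k);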
5439 __ bind(CALCULATE_DIFFERENCE); 5440 __ rev(rscratch2, rscratch2); 5441 __ clz(rscratch2, rscratch2); 5442 __ andr(rscratch2, rscratch2, -16); 5443 __ lsrv(tmp1, tmp1, rscratch2); 5444 __ uxthw(tmp1, tmp1); 5445 __ lsrv(rscratch1, rscratch1, rscratch2); 5446 __ uxthw(rscratch1, rscratch1); 5447 __ subw(result, tmp1, rscratch1); 5448 __ bind(DONE); 5449 __ ret(lr); 5450 return entry; 5451 } 5452 5453 address generate_method_entry_barrier() { 5454 __ align(CodeEntryAlignment); 5455 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 5456 5457 Label deoptimize_label; 5458 5459 address start = __ pc(); 5460 5461 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 5462 5463 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 5464 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 5465 // We can get here despite the nmethod being good, if we have not 5466 // yet applied our cross modification fence (or data fence). 5467 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 5468 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 5469 __ ldrw(rscratch2, rscratch2); 5470 __ strw(rscratch2, thread_epoch_addr); 5471 __ isb(); 5472 __ membar(__ LoadLoad); 5473 } 5474 5475 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 5476 5477 __ enter(); 5478 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 5479 5480 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 5481 5482 __ push_call_clobbered_registers(); 5483 5484 __ mov(c_rarg0, rscratch2); 5485 __ call_VM_leaf 5486 (CAST_FROM_FN_PTR 5487 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 5488 5489 __ reset_last_Java_frame(true); 5490 5491 __ mov(rscratch1, r0); 5492 5493 __ pop_call_clobbered_registers(); 5494 5495 __ cbnz(rscratch1, deoptimize_label); 5496 5497 __ leave(); 5498 __ ret(lr); 5499 5500 __ BIND(deoptimize_label); 5501 5502 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 5503 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 5504 5505 __ mov(sp, rscratch1); 5506 __ br(rscratch2); 5507 5508 return start; 5509 } 5510 5511 // r0 = result 5512 // r1 = str1 5513 // r2 = cnt1 5514 // r3 = str2 5515 // r4 = cnt2 5516 // r10 = tmp1 5517 // r11 = tmp2 5518 address generate_compare_long_string_same_encoding(bool isLL) { 5519 __ align(CodeEntryAlignment); 5520 StubCodeMark mark(this, "StubRoutines", isLL 5521 ? "compare_long_string_same_encoding LL" 5522 : "compare_long_string_same_encoding UU"); 5523 address entry = __ pc(); 5524 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5525 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 5526 5527 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 5528 5529 // exit from large loop when less than 64 bytes left to read or we're about 5530 // to prefetch memory behind array border 5531 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 5532 5533 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 5534 __ eor(rscratch2, tmp1, tmp2); 5535 __ cbnz(rscratch2, CAL_DIFFERENCE); 5536 5537 __ sub(cnt2, cnt2, wordSize/(isLL ? 
1 : 2)); 5538 // update pointers, because of previous read 5539 __ add(str1, str1, wordSize); 5540 __ add(str2, str2, wordSize); 5541 if (SoftwarePrefetchHintDistance >= 0) { 5542 __ align(OptoLoopAlignment); 5543 __ bind(LARGE_LOOP_PREFETCH); 5544 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 5545 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 5546 5547 for (int i = 0; i < 4; i++) { 5548 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 5549 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 5550 __ cmp(tmp1, tmp2); 5551 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5552 __ br(Assembler::NE, DIFF); 5553 } 5554 __ sub(cnt2, cnt2, isLL ? 64 : 32); 5555 __ add(str1, str1, 64); 5556 __ add(str2, str2, 64); 5557 __ subs(rscratch2, cnt2, largeLoopExitCondition); 5558 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 5559 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 5560 } 5561 5562 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 5563 __ br(Assembler::LE, LESS16); 5564 __ align(OptoLoopAlignment); 5565 __ bind(LOOP_COMPARE16); 5566 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5567 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5568 __ cmp(tmp1, tmp2); 5569 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5570 __ br(Assembler::NE, DIFF); 5571 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5572 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5573 __ br(Assembler::LT, LESS16); 5574 5575 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5576 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5577 __ cmp(tmp1, tmp2); 5578 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5579 __ br(Assembler::NE, DIFF); 5580 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5581 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5582 __ br(Assembler::GE, LOOP_COMPARE16); 5583 __ cbz(cnt2, LENGTH_DIFF); 5584 5585 __ bind(LESS16); 5586 // each 8 compare 5587 __ subs(cnt2, cnt2, isLL ? 8 : 4); 5588 __ br(Assembler::LE, LESS8); 5589 __ ldr(tmp1, Address(__ post(str1, 8))); 5590 __ ldr(tmp2, Address(__ post(str2, 8))); 5591 __ eor(rscratch2, tmp1, tmp2); 5592 __ cbnz(rscratch2, CAL_DIFFERENCE); 5593 __ sub(cnt2, cnt2, isLL ? 8 : 4); 5594 5595 __ bind(LESS8); // directly load last 8 bytes 5596 if (!isLL) { 5597 __ add(cnt2, cnt2, cnt2); 5598 } 5599 __ ldr(tmp1, Address(str1, cnt2)); 5600 __ ldr(tmp2, Address(str2, cnt2)); 5601 __ eor(rscratch2, tmp1, tmp2); 5602 __ cbz(rscratch2, LENGTH_DIFF); 5603 __ b(CAL_DIFFERENCE); 5604 5605 __ bind(DIFF); 5606 __ cmp(tmp1, tmp2); 5607 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 5608 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 5609 // reuse rscratch2 register for the result of eor instruction 5610 __ eor(rscratch2, tmp1, tmp2); 5611 5612 __ bind(CAL_DIFFERENCE); 5613 __ rev(rscratch2, rscratch2); 5614 __ clz(rscratch2, rscratch2); 5615 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 5616 __ lsrv(tmp1, tmp1, rscratch2); 5617 __ lsrv(tmp2, tmp2, rscratch2); 5618 if (isLL) { 5619 __ uxtbw(tmp1, tmp1); 5620 __ uxtbw(tmp2, tmp2); 5621 } else { 5622 __ uxthw(tmp1, tmp1); 5623 __ uxthw(tmp2, tmp2); 5624 } 5625 __ subw(result, tmp1, tmp2); 5626 5627 __ bind(LENGTH_DIFF); 5628 __ ret(lr); 5629 return entry; 5630 } 5631 5632 enum string_compare_mode { 5633 LL, 5634 LU, 5635 UL, 5636 UU, 5637 }; 5638 5639 // The following registers are declared in aarch64.ad 5640 // r0 = result 5641 // r1 = str1 5642 // r2 = cnt1 5643 // r3 = str2 5644 // r4 = cnt2 5645 // r10 = tmp1 5646 // r11 = tmp2 5647 // z0 = ztmp1 5648 // z1 = ztmp2 5649 // p0 = pgtmp1 5650 // p1 = pgtmp2 5651 address generate_compare_long_string_sve(string_compare_mode mode) { 5652 __ align(CodeEntryAlignment); 5653 address entry = __ pc(); 5654 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5655 tmp1 = r10, tmp2 = r11; 5656 5657 Label LOOP, DONE, MISMATCH; 5658 Register vec_len = tmp1; 5659 Register idx = tmp2; 5660 // The minimum of the string lengths has been stored in cnt2. 5661 Register cnt = cnt2; 5662 FloatRegister ztmp1 = z0, ztmp2 = z1; 5663 PRegister pgtmp1 = p0, pgtmp2 = p1; 5664 5665 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 5666 switch (mode) { \ 5667 case LL: \ 5668 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 5669 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 5670 break; \ 5671 case LU: \ 5672 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 5673 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5674 break; \ 5675 case UL: \ 5676 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5677 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 5678 break; \ 5679 case UU: \ 5680 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 5681 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 5682 break; \ 5683 default: \ 5684 ShouldNotReachHere(); \ 5685 } 5686 5687 const char* stubname; 5688 switch (mode) { 5689 case LL: stubname = "compare_long_string_same_encoding LL"; break; 5690 case LU: stubname = "compare_long_string_different_encoding LU"; break; 5691 case UL: stubname = "compare_long_string_different_encoding UL"; break; 5692 case UU: stubname = "compare_long_string_same_encoding UU"; break; 5693 default: ShouldNotReachHere(); 5694 } 5695 5696 StubCodeMark mark(this, "StubRoutines", stubname); 5697 5698 __ mov(idx, 0); 5699 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5700 5701 if (mode == LL) { 5702 __ sve_cntb(vec_len); 5703 } else { 5704 __ sve_cnth(vec_len); 5705 } 5706 5707 __ sub(rscratch1, cnt, vec_len); 5708 5709 __ bind(LOOP); 5710 5711 // main loop 5712 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5713 __ add(idx, idx, vec_len); 5714 // Compare strings. 5715 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5716 __ br(__ NE, MISMATCH); 5717 __ cmp(idx, rscratch1); 5718 __ br(__ LT, LOOP); 5719 5720 // post loop, last iteration 5721 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 5722 5723 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 5724 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 5725 __ br(__ EQ, DONE); 5726 5727 __ bind(MISMATCH); 5728 5729 // Crop the vector to find its location. 5730 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 5731 // Extract the first different characters of each string. 
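    // In scalar terms, the whilelt/cmp/brkb/lasta sequence implements roughly
    // the following loop over the common-length prefix (a sketch, not
    // compiled; for the LU/UL modes the Latin-1 side is widened to 16 bits by
    // the loads above):
    //
    //   for (int i = 0; i < cnt; i++) {
    //     if (str1[i] != str2[i]) {
    //       result = str1[i] - str2[i];
    //       break;
    //     }
    //   }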
    __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
    __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);

    // Compute the difference of the first different characters.
    __ sub(result, rscratch1, rscratch2);

    __ bind(DONE);
    __ ret(lr);
#undef LOAD_PAIR
    return entry;
  }

  void generate_compare_long_strings() {
    if (UseSVE == 0) {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_same_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_same_encoding(false);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_different_encoding(true);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_different_encoding(false);
    } else {
      StubRoutines::aarch64::_compare_long_string_LL
          = generate_compare_long_string_sve(LL);
      StubRoutines::aarch64::_compare_long_string_UU
          = generate_compare_long_string_sve(UU);
      StubRoutines::aarch64::_compare_long_string_LU
          = generate_compare_long_string_sve(LU);
      StubRoutines::aarch64::_compare_long_string_UL
          = generate_compare_long_string_sve(UL);
    }
  }

  // R0 = result
  // R1 = str2
  // R2 = cnt1
  // R3 = str1
  // R4 = cnt2
  // Clobbers: rscratch1, rscratch2, v0, v1, rflags
  //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) since the pattern length is >= 8, we can safely keep at least the 1st
  //    register of the pattern loaded, skipping the initial load (this helps
  //    on systems with a single load pipeline)
  // 2) we can use a "fast" algorithm for finding a single character (the first
  //    pattern symbol) with fewer branches (1 branch per loaded register
  //    instead of one per symbol); this is where constants like 0x0101...01,
  //    0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after the 1st register of the source string has been loaded and
  //    analyzed, it can be reused to search for every occurrence of the 1st
  //    pattern character, saving a few loads compared with a
  //    simpler-but-slower implementation
  // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
  //    re-initializes and compresses register values, which makes the code
  //    larger and a bit less readable; however, most of the extra operations
  //    are issued during loads or branches, so the penalty is minimal
  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
    const char* stubName = str1_isL
        ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
        : "indexof_linear_uu";
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stubName);
    address entry = __ pc();

    int str1_chr_size = str1_isL ? 1 : 2;
    int str2_chr_size = str2_isL ? 1 : 2;
    int str1_chr_shift = str1_isL ? 0 : 1;
    int str2_chr_shift = str2_isL ?
0 : 1; 5799 bool isL = str1_isL && str2_isL; 5800 // parameters 5801 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 5802 // temporary registers 5803 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 5804 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 5805 // redefinitions 5806 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 5807 5808 __ push(spilled_regs, sp); 5809 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 5810 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 5811 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 5812 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 5813 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 5814 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 5815 // Read whole register from str1. It is safe, because length >=8 here 5816 __ ldr(ch1, Address(str1)); 5817 // Read whole register from str2. It is safe, because length >=8 here 5818 __ ldr(ch2, Address(str2)); 5819 __ sub(cnt2, cnt2, cnt1); 5820 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 5821 if (str1_isL != str2_isL) { 5822 __ eor(v0, __ T16B, v0, v0); 5823 } 5824 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 5825 __ mul(first, first, tmp1); 5826 // check if we have less than 1 register to check 5827 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 5828 if (str1_isL != str2_isL) { 5829 __ fmovd(v1, ch1); 5830 } 5831 __ br(__ LE, L_SMALL); 5832 __ eor(ch2, first, ch2); 5833 if (str1_isL != str2_isL) { 5834 __ zip1(v1, __ T16B, v1, v0); 5835 } 5836 __ sub(tmp2, ch2, tmp1); 5837 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5838 __ bics(tmp2, tmp2, ch2); 5839 if (str1_isL != str2_isL) { 5840 __ fmovd(ch1, v1); 5841 } 5842 __ br(__ NE, L_HAS_ZERO); 5843 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5844 __ add(result, result, wordSize/str2_chr_size); 5845 __ add(str2, str2, wordSize); 5846 __ br(__ LT, L_POST_LOOP); 5847 __ BIND(L_LOOP); 5848 __ ldr(ch2, Address(str2)); 5849 __ eor(ch2, first, ch2); 5850 __ sub(tmp2, ch2, tmp1); 5851 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5852 __ bics(tmp2, tmp2, ch2); 5853 __ br(__ NE, L_HAS_ZERO); 5854 __ BIND(L_LOOP_PROCEED); 5855 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5856 __ add(str2, str2, wordSize); 5857 __ add(result, result, wordSize/str2_chr_size); 5858 __ br(__ GE, L_LOOP); 5859 __ BIND(L_POST_LOOP); 5860 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 5861 __ br(__ LE, NOMATCH); 5862 __ ldr(ch2, Address(str2)); 5863 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5864 __ eor(ch2, first, ch2); 5865 __ sub(tmp2, ch2, tmp1); 5866 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5867 __ mov(tmp4, -1); // all bits set 5868 __ b(L_SMALL_PROCEED); 5869 __ align(OptoLoopAlignment); 5870 __ BIND(L_SMALL); 5871 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5872 __ eor(ch2, first, ch2); 5873 if (str1_isL != str2_isL) { 5874 __ zip1(v1, __ T16B, v1, v0); 5875 } 5876 __ sub(tmp2, ch2, tmp1); 5877 __ mov(tmp4, -1); // all bits set 5878 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5879 if (str1_isL != str2_isL) { 5880 __ fmovd(ch1, v1); // move converted 4 symbols 5881 } 5882 __ BIND(L_SMALL_PROCEED); 5883 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
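    // The candidate-match mask computed here uses the classic SWAR technique
    // for locating a given byte within a 64-bit chunk. A scalar sketch for the
    // Latin-1 case (illustrative only, not compiled; for UTF-16 the constants
    // are 0x0001...0001 / 0x7fff...7fff and the unit is 16 bits):
    //
    //   uint64_t x = chunk ^ (first * 0x0101010101010101ULL); // match -> 0x00 byte
    //   uint64_t m = (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
    //   // m is non-zero iff chunk contains the first pattern character, and
    //   // its lowest 0x80 flag marks the first occurrence; the rbit + clz
    //   // below turn that flag into an index (higher flags may be false
    //   // positives, which only cost an extra failing comparison).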
5884 __ bic(tmp2, tmp2, ch2); 5885 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 5886 __ rbit(tmp2, tmp2); 5887 __ br(__ EQ, NOMATCH); 5888 __ BIND(L_SMALL_HAS_ZERO_LOOP); 5889 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 5890 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 5891 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 5892 if (str2_isL) { // LL 5893 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5894 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5895 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5896 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5897 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5898 } else { 5899 __ mov(ch2, 0xE); // all bits in byte set except last one 5900 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5901 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5902 __ lslv(tmp2, tmp2, tmp4); 5903 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5904 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5905 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5906 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5907 } 5908 __ cmp(ch1, ch2); 5909 __ mov(tmp4, wordSize/str2_chr_size); 5910 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5911 __ BIND(L_SMALL_CMP_LOOP); 5912 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5913 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5914 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5915 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5916 __ add(tmp4, tmp4, 1); 5917 __ cmp(tmp4, cnt1); 5918 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 5919 __ cmp(first, ch2); 5920 __ br(__ EQ, L_SMALL_CMP_LOOP); 5921 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 5922 __ cbz(tmp2, NOMATCH); // no more matches. exit 5923 __ clz(tmp4, tmp2); 5924 __ add(result, result, 1); // advance index 5925 __ add(str2, str2, str2_chr_size); // advance pointer 5926 __ b(L_SMALL_HAS_ZERO_LOOP); 5927 __ align(OptoLoopAlignment); 5928 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 5929 __ cmp(first, ch2); 5930 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5931 __ b(DONE); 5932 __ align(OptoLoopAlignment); 5933 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 5934 if (str2_isL) { // LL 5935 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5936 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5937 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5938 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5939 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5940 } else { 5941 __ mov(ch2, 0xE); // all bits in byte set except last one 5942 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5943 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5944 __ lslv(tmp2, tmp2, tmp4); 5945 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5946 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5947 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5948 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5949 } 5950 __ cmp(ch1, ch2); 5951 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5952 __ b(DONE); 5953 __ align(OptoLoopAlignment); 5954 __ BIND(L_HAS_ZERO); 5955 __ rbit(tmp2, tmp2); 5956 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 5957 // Now, perform compression of counters(cnt2 and cnt1) into one register. 5958 // It's fine because both counters are 32bit and are not changed in this 5959 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 5960 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 5961 __ sub(result, result, 1); 5962 __ BIND(L_HAS_ZERO_LOOP); 5963 __ mov(cnt1, wordSize/str2_chr_size); 5964 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 5965 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 5966 if (str2_isL) { 5967 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 5968 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5969 __ lslv(tmp2, tmp2, tmp4); 5970 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5971 __ add(tmp4, tmp4, 1); 5972 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5973 __ lsl(tmp2, tmp2, 1); 5974 __ mov(tmp4, wordSize/str2_chr_size); 5975 } else { 5976 __ mov(ch2, 0xE); 5977 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5978 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5979 __ lslv(tmp2, tmp2, tmp4); 5980 __ add(tmp4, tmp4, 1); 5981 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5982 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 5983 __ lsl(tmp2, tmp2, 1); 5984 __ mov(tmp4, wordSize/str2_chr_size); 5985 __ sub(str2, str2, str2_chr_size); 5986 } 5987 __ cmp(ch1, ch2); 5988 __ mov(tmp4, wordSize/str2_chr_size); 5989 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5990 __ BIND(L_CMP_LOOP); 5991 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5992 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5993 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5994 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5995 __ add(tmp4, tmp4, 1); 5996 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 5997 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 5998 __ cmp(cnt1, ch2); 5999 __ br(__ EQ, L_CMP_LOOP); 6000 __ BIND(L_CMP_LOOP_NOMATCH); 6001 // here we're not matched 6002 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 6003 __ clz(tmp4, tmp2); 6004 __ add(str2, str2, str2_chr_size); // advance pointer 6005 __ b(L_HAS_ZERO_LOOP); 6006 __ align(OptoLoopAlignment); 6007 __ BIND(L_CMP_LOOP_LAST_CMP); 6008 __ cmp(cnt1, ch2); 6009 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6010 __ b(DONE); 6011 __ align(OptoLoopAlignment); 6012 __ BIND(L_CMP_LOOP_LAST_CMP2); 6013 if (str2_isL) { 6014 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6015 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6016 __ lslv(tmp2, tmp2, tmp4); 6017 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6018 __ add(tmp4, tmp4, 1); 6019 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6020 __ lsl(tmp2, tmp2, 1); 6021 } else { 6022 __ mov(ch2, 0xE); 6023 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6024 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
6025 __ lslv(tmp2, tmp2, tmp4); 6026 __ add(tmp4, tmp4, 1); 6027 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6028 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6029 __ lsl(tmp2, tmp2, 1); 6030 __ sub(str2, str2, str2_chr_size); 6031 } 6032 __ cmp(ch1, ch2); 6033 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6034 __ b(DONE); 6035 __ align(OptoLoopAlignment); 6036 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 6037 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 6038 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 6039 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 6040 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 6041 // result by analyzed characters value, so, we can just reset lower bits 6042 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 6043 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 6044 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 6045 // index of last analyzed substring inside current octet. So, str2 in at 6046 // respective start address. We need to advance it to next octet 6047 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 6048 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 6049 __ bfm(result, zr, 0, 2 - str2_chr_shift); 6050 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 6051 __ movw(cnt2, cnt2); 6052 __ b(L_LOOP_PROCEED); 6053 __ align(OptoLoopAlignment); 6054 __ BIND(NOMATCH); 6055 __ mov(result, -1); 6056 __ BIND(DONE); 6057 __ pop(spilled_regs, sp); 6058 __ ret(lr); 6059 return entry; 6060 } 6061 6062 void generate_string_indexof_stubs() { 6063 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 6064 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 6065 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 6066 } 6067 6068 void inflate_and_store_2_fp_registers(bool generatePrfm, 6069 FloatRegister src1, FloatRegister src2) { 6070 Register dst = r1; 6071 __ zip1(v1, __ T16B, src1, v0); 6072 __ zip2(v2, __ T16B, src1, v0); 6073 if (generatePrfm) { 6074 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 6075 } 6076 __ zip1(v3, __ T16B, src2, v0); 6077 __ zip2(v4, __ T16B, src2, v0); 6078 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 6079 } 6080 6081 // R0 = src 6082 // R1 = dst 6083 // R2 = len 6084 // R3 = len >> 3 6085 // V0 = 0 6086 // v1 = loaded 8 bytes 6087 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 6088 address generate_large_byte_array_inflate() { 6089 __ align(CodeEntryAlignment); 6090 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 6091 address entry = __ pc(); 6092 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 6093 Register src = r0, dst = r1, len = r2, octetCounter = r3; 6094 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 6095 6096 // do one more 8-byte read to have address 16-byte aligned in most cases 6097 // also use single store instruction 6098 __ ldrd(v2, __ post(src, 8)); 6099 __ sub(octetCounter, octetCounter, 2); 6100 __ zip1(v1, __ T16B, v1, v0); 6101 __ zip1(v2, __ T16B, v2, v0); 6102 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 6103 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6104 __ subs(rscratch1, octetCounter, large_loop_threshold); 6105 __ br(__ LE, LOOP_START); 6106 __ b(LOOP_PRFM_START); 6107 __ bind(LOOP_PRFM); 
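    // The inflate_and_store_2_fp_registers helper above widens Latin-1 bytes
    // to UTF-16 chars by interleaving each source byte with a zero byte from
    // v0 (zip1/zip2 against the zero vector; the register comments above state
    // that V0 = 0 on entry). Scalar equivalent for one 16-byte source register
    // (a sketch, not compiled):
    //
    //   for (int i = 0; i < 16; i++) {
    //     dst[i] = (jchar)(src[i] & 0xff);
    //   }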
6108 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6109 __ bind(LOOP_PRFM_START); 6110 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 6111 __ sub(octetCounter, octetCounter, 8); 6112 __ subs(rscratch1, octetCounter, large_loop_threshold); 6113 inflate_and_store_2_fp_registers(true, v3, v4); 6114 inflate_and_store_2_fp_registers(true, v5, v6); 6115 __ br(__ GT, LOOP_PRFM); 6116 __ cmp(octetCounter, (u1)8); 6117 __ br(__ LT, DONE); 6118 __ bind(LOOP); 6119 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6120 __ bind(LOOP_START); 6121 __ sub(octetCounter, octetCounter, 8); 6122 __ cmp(octetCounter, (u1)8); 6123 inflate_and_store_2_fp_registers(false, v3, v4); 6124 inflate_and_store_2_fp_registers(false, v5, v6); 6125 __ br(__ GE, LOOP); 6126 __ bind(DONE); 6127 __ ret(lr); 6128 return entry; 6129 } 6130 6131 /** 6132 * Arguments: 6133 * 6134 * Input: 6135 * c_rarg0 - current state address 6136 * c_rarg1 - H key address 6137 * c_rarg2 - data address 6138 * c_rarg3 - number of blocks 6139 * 6140 * Output: 6141 * Updated state at c_rarg0 6142 */ 6143 address generate_ghash_processBlocks() { 6144 // Bafflingly, GCM uses little-endian for the byte order, but 6145 // big-endian for the bit order. For example, the polynomial 1 is 6146 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 6147 // 6148 // So, we must either reverse the bytes in each word and do 6149 // everything big-endian or reverse the bits in each byte and do 6150 // it little-endian. On AArch64 it's more idiomatic to reverse 6151 // the bits in each byte (we have an instruction, RBIT, to do 6152 // that) and keep the data in little-endian bit order through the 6153 // calculation, bit-reversing the inputs and outputs. 6154 6155 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 6156 __ align(wordSize * 2); 6157 address p = __ pc(); 6158 __ emit_int64(0x87); // The low-order bits of the field 6159 // polynomial (i.e. 
p = z^7+z^2+z+1) 6160 // repeated in the low and high parts of a 6161 // 128-bit vector 6162 __ emit_int64(0x87); 6163 6164 __ align(CodeEntryAlignment); 6165 address start = __ pc(); 6166 6167 Register state = c_rarg0; 6168 Register subkeyH = c_rarg1; 6169 Register data = c_rarg2; 6170 Register blocks = c_rarg3; 6171 6172 FloatRegister vzr = v30; 6173 __ eor(vzr, __ T16B, vzr, vzr); // zero register 6174 6175 __ ldrq(v24, p); // The field polynomial 6176 6177 __ ldrq(v0, Address(state)); 6178 __ ldrq(v1, Address(subkeyH)); 6179 6180 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 6181 __ rbit(v0, __ T16B, v0); 6182 __ rev64(v1, __ T16B, v1); 6183 __ rbit(v1, __ T16B, v1); 6184 6185 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 6186 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 6187 6188 { 6189 Label L_ghash_loop; 6190 __ bind(L_ghash_loop); 6191 6192 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 6193 // reversing each byte 6194 __ rbit(v2, __ T16B, v2); 6195 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 6196 6197 // Multiply state in v2 by subkey in v1 6198 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 6199 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 6200 /*temps*/v6, v3, /*reuse/clobber b*/v2); 6201 // Reduce v7:v5 by the field polynomial 6202 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 6203 6204 __ sub(blocks, blocks, 1); 6205 __ cbnz(blocks, L_ghash_loop); 6206 } 6207 6208 // The bit-reversed result is at this point in v0 6209 __ rev64(v0, __ T16B, v0); 6210 __ rbit(v0, __ T16B, v0); 6211 6212 __ st1(v0, __ T16B, state); 6213 __ ret(lr); 6214 6215 return start; 6216 } 6217 6218 address generate_ghash_processBlocks_wide() { 6219 address small = generate_ghash_processBlocks(); 6220 6221 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); 6222 __ align(wordSize * 2); 6223 address p = __ pc(); 6224 __ emit_int64(0x87); // The low-order bits of the field 6225 // polynomial (i.e. p = z^7+z^2+z+1) 6226 // repeated in the low and high parts of a 6227 // 128-bit vector 6228 __ emit_int64(0x87); 6229 6230 __ align(CodeEntryAlignment); 6231 address start = __ pc(); 6232 6233 Register state = c_rarg0; 6234 Register subkeyH = c_rarg1; 6235 Register data = c_rarg2; 6236 Register blocks = c_rarg3; 6237 6238 const int unroll = 4; 6239 6240 __ cmp(blocks, (unsigned char)(unroll * 2)); 6241 __ br(__ LT, small); 6242 6243 if (unroll > 1) { 6244 // Save state before entering routine 6245 __ sub(sp, sp, 4 * 16); 6246 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 6247 __ sub(sp, sp, 4 * 16); 6248 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 6249 } 6250 6251 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 6252 6253 if (unroll > 1) { 6254 // And restore state 6255 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 6256 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 6257 } 6258 6259 __ cmp(blocks, (unsigned char)0); 6260 __ br(__ GT, small); 6261 6262 __ ret(lr); 6263 6264 return start; 6265 } 6266 6267 void generate_base64_encode_simdround(Register src, Register dst, 6268 FloatRegister codec, u8 size) { 6269 6270 FloatRegister in0 = v4, in1 = v5, in2 = v6; 6271 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 6272 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 6273 6274 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 6275 6276 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 6277 6278 __ ushr(ind0, arrangement, in0, 2); 6279 6280 __ ushr(ind1, arrangement, in1, 2); 6281 __ shl(in0, arrangement, in0, 6); 6282 __ orr(ind1, arrangement, ind1, in0); 6283 __ ushr(ind1, arrangement, ind1, 2); 6284 6285 __ ushr(ind2, arrangement, in2, 4); 6286 __ shl(in1, arrangement, in1, 4); 6287 __ orr(ind2, arrangement, in1, ind2); 6288 __ ushr(ind2, arrangement, ind2, 2); 6289 6290 __ shl(ind3, arrangement, in2, 2); 6291 __ ushr(ind3, arrangement, ind3, 2); 6292 6293 __ tbl(out0, arrangement, codec, 4, ind0); 6294 __ tbl(out1, arrangement, codec, 4, ind1); 6295 __ tbl(out2, arrangement, codec, 4, ind2); 6296 __ tbl(out3, arrangement, codec, 4, ind3); 6297 6298 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 6299 } 6300 6301 /** 6302 * Arguments: 6303 * 6304 * Input: 6305 * c_rarg0 - src_start 6306 * c_rarg1 - src_offset 6307 * c_rarg2 - src_length 6308 * c_rarg3 - dest_start 6309 * c_rarg4 - dest_offset 6310 * c_rarg5 - isURL 6311 * 6312 */ 6313 address generate_base64_encodeBlock() { 6314 6315 static const char toBase64[64] = { 6316 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6317 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6318 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6319 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6320 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 6321 }; 6322 6323 static const char toBase64URL[64] = { 6324 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6325 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6326 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6327 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6328 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 6329 }; 6330 6331 __ align(CodeEntryAlignment); 6332 StubCodeMark mark(this, "StubRoutines", "encodeBlock"); 6333 address start = __ pc(); 6334 6335 Register src = c_rarg0; // source array 6336 Register soff = c_rarg1; // source start offset 6337 Register send = c_rarg2; // source end offset 6338 Register dst = c_rarg3; // dest array 6339 Register doff = c_rarg4; // position for writing to dest array 6340 Register isURL = c_rarg5; // Base64 or URL character set 6341 6342 // c_rarg6 and c_rarg7 are free to use as temps 6343 Register codec = c_rarg6; 6344 Register length = c_rarg7; 6345 6346 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 6347 6348 __ add(src, src, soff); 6349 __ add(dst, dst, doff); 6350 __ sub(length, send, soff); 6351 6352 // load the codec base address 6353 __ lea(codec, ExternalAddress((address) toBase64)); 6354 __ cbz(isURL, ProcessData); 6355 __ lea(codec, ExternalAddress((address) toBase64URL)); 6356 6357 __ BIND(ProcessData); 6358 6359 // too short to formup a SIMD loop, roll back 6360 __ cmp(length, (u1)24); 6361 __ br(Assembler::LT, Process3B); 6362 6363 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 6364 6365 __ BIND(Process48B); 6366 __ cmp(length, (u1)48); 6367 __ br(Assembler::LT, Process24B); 6368 generate_base64_encode_simdround(src, dst, v0, 16); 6369 __ sub(length, length, 48); 6370 __ b(Process48B); 6371 6372 __ BIND(Process24B); 6373 __ cmp(length, (u1)24); 6374 __ br(Assembler::LT, SIMDExit); 6375 generate_base64_encode_simdround(src, dst, v0, 8); 6376 __ sub(length, length, 24); 6377 6378 __ BIND(SIMDExit); 6379 __ cbz(length, Exit); 6380 6381 __ 
BIND(Process3B); 6382 // 3 src bytes, 24 bits 6383 __ ldrb(r10, __ post(src, 1)); 6384 __ ldrb(r11, __ post(src, 1)); 6385 __ ldrb(r12, __ post(src, 1)); 6386 __ orrw(r11, r11, r10, Assembler::LSL, 8); 6387 __ orrw(r12, r12, r11, Assembler::LSL, 8); 6388 // codec index 6389 __ ubfmw(r15, r12, 18, 23); 6390 __ ubfmw(r14, r12, 12, 17); 6391 __ ubfmw(r13, r12, 6, 11); 6392 __ andw(r12, r12, 63); 6393 // get the code based on the codec 6394 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 6395 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 6396 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 6397 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 6398 __ strb(r15, __ post(dst, 1)); 6399 __ strb(r14, __ post(dst, 1)); 6400 __ strb(r13, __ post(dst, 1)); 6401 __ strb(r12, __ post(dst, 1)); 6402 __ sub(length, length, 3); 6403 __ cbnz(length, Process3B); 6404 6405 __ BIND(Exit); 6406 __ ret(lr); 6407 6408 return start; 6409 } 6410 6411 void generate_base64_decode_simdround(Register src, Register dst, 6412 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 6413 6414 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 6415 FloatRegister out0 = v20, out1 = v21, out2 = v22; 6416 6417 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 6418 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 6419 6420 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 6421 6422 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 6423 6424 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 6425 6426 // we need unsigned saturating subtract, to make sure all input values 6427 // in range [0, 63] will have 0U value in the higher half lookup 6428 __ uqsubv(decH0, __ T16B, in0, v27); 6429 __ uqsubv(decH1, __ T16B, in1, v27); 6430 __ uqsubv(decH2, __ T16B, in2, v27); 6431 __ uqsubv(decH3, __ T16B, in3, v27); 6432 6433 // lower half lookup 6434 __ tbl(decL0, arrangement, codecL, 4, in0); 6435 __ tbl(decL1, arrangement, codecL, 4, in1); 6436 __ tbl(decL2, arrangement, codecL, 4, in2); 6437 __ tbl(decL3, arrangement, codecL, 4, in3); 6438 6439 // higher half lookup 6440 __ tbx(decH0, arrangement, codecH, 4, decH0); 6441 __ tbx(decH1, arrangement, codecH, 4, decH1); 6442 __ tbx(decH2, arrangement, codecH, 4, decH2); 6443 __ tbx(decH3, arrangement, codecH, 4, decH3); 6444 6445 // combine lower and higher 6446 __ orr(decL0, arrangement, decL0, decH0); 6447 __ orr(decL1, arrangement, decL1, decH1); 6448 __ orr(decL2, arrangement, decL2, decH2); 6449 __ orr(decL3, arrangement, decL3, decH3); 6450 6451 // check illegal inputs, value larger than 63 (maximum of 6 bits) 6452 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 6453 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 6454 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 6455 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 6456 __ orr(in0, arrangement, decH0, decH1); 6457 __ orr(in1, arrangement, decH2, decH3); 6458 __ orr(in2, arrangement, in0, in1); 6459 __ umaxv(in3, arrangement, in2); 6460 __ umov(rscratch2, in3, __ B, 0); 6461 6462 // get the data to output 6463 __ shl(out0, arrangement, decL0, 2); 6464 __ ushr(out1, arrangement, decL1, 4); 6465 __ orr(out0, arrangement, out0, out1); 6466 __ shl(out1, arrangement, decL1, 4); 6467 __ ushr(out2, arrangement, decL2, 2); 6468 __ orr(out1, arrangement, out1, out2); 6469 __ shl(out2, arrangement, decL2, 6); 6470 __ orr(out2, arrangement, out2, decL3); 6471 6472 __ cbz(rscratch2, NoIllegalData); 6473 
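    // Scalar equivalent of the "get the data to output" packing above, which
    // recombines four 6-bit values d0..d3 into three output bytes (a sketch,
    // not compiled; all operations are byte-wide, so shifted-out bits are
    // simply dropped):
    //
    //   out0 = (uint8_t)((d0 << 2) | (d1 >> 4));
    //   out1 = (uint8_t)((d1 << 4) | (d2 >> 2));
    //   out2 = (uint8_t)((d2 << 6) |  d3);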
6474 // handle illegal input 6475 __ umov(r10, in2, __ D, 0); 6476 if (size == 16) { 6477 __ cbnz(r10, ErrorInLowerHalf); 6478 6479 // illegal input is in higher half, store the lower half now. 6480 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 6481 6482 __ umov(r10, in2, __ D, 1); 6483 __ umov(r11, out0, __ D, 1); 6484 __ umov(r12, out1, __ D, 1); 6485 __ umov(r13, out2, __ D, 1); 6486 __ b(StoreLegalData); 6487 6488 __ BIND(ErrorInLowerHalf); 6489 } 6490 __ umov(r11, out0, __ D, 0); 6491 __ umov(r12, out1, __ D, 0); 6492 __ umov(r13, out2, __ D, 0); 6493 6494 __ BIND(StoreLegalData); 6495 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 6496 __ strb(r11, __ post(dst, 1)); 6497 __ strb(r12, __ post(dst, 1)); 6498 __ strb(r13, __ post(dst, 1)); 6499 __ lsr(r10, r10, 8); 6500 __ lsr(r11, r11, 8); 6501 __ lsr(r12, r12, 8); 6502 __ lsr(r13, r13, 8); 6503 __ b(StoreLegalData); 6504 6505 __ BIND(NoIllegalData); 6506 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 6507 } 6508 6509 6510 /** 6511 * Arguments: 6512 * 6513 * Input: 6514 * c_rarg0 - src_start 6515 * c_rarg1 - src_offset 6516 * c_rarg2 - src_length 6517 * c_rarg3 - dest_start 6518 * c_rarg4 - dest_offset 6519 * c_rarg5 - isURL 6520 * c_rarg6 - isMIME 6521 * 6522 */ 6523 address generate_base64_decodeBlock() { 6524 6525 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 6526 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 6527 // titled "Base64 decoding". 6528 6529 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 6530 // except the trailing character '=' is also treated illegal value in this intrinsic. That 6531 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
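    // For reference, the non-SIMD table below is used roughly like this per
    // group of four input characters (a sketch, not compiled; the actual
    // scalar path is the Process4B loop further down):
    //
    //   uint8_t d0 = table[in[0]], d1 = table[in[1]],
    //           d2 = table[in[2]], d3 = table[in[3]];
    //   if (d0 == 255u || d1 == 255u || d2 == 255u || d3 == 255u) {
    //     // 255u marks an illegal character (including '=')
    //   }
    //   // otherwise pack d0..d3 into three bytes as in the SIMD round above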
6532 static const uint8_t fromBase64ForNoSIMD[256] = { 6533 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6534 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6535 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6536 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6537 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6538 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 6539 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6540 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6541 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6542 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6543 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6544 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6545 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6546 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6547 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6548 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6549 }; 6550 6551 static const uint8_t fromBase64URLForNoSIMD[256] = { 6552 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6553 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6554 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6555 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6556 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6557 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 6558 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6559 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6560 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6561 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6562 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6563 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6564 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6565 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6566 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6567 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6568 }; 6569 6570 // A legal value of base64 code is in range [0, 127]. We need two lookups 6571 // with tbl/tbx and combine them to get the decode data. The 1st table vector 6572 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 6573 // table vector lookup use tbx, out of range indices are unchanged in 6574 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 6575 // The value of index 64 is set to 0, so that we know that we already get the 6576 // decoded data with the 1st lookup. 6577 static const uint8_t fromBase64ForSIMD[128] = { 6578 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6579 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6580 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6581 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6582 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6583 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6584 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6585 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6586 }; 6587 6588 static const uint8_t fromBase64URLForSIMD[128] = { 6589 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6590 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6591 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6592 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6593 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6594 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6595 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6596 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6597 }; 6598 6599 __ align(CodeEntryAlignment); 6600 StubCodeMark mark(this, "StubRoutines", "decodeBlock"); 6601 address start = __ pc(); 6602 6603 Register src = c_rarg0; // source array 6604 Register soff = c_rarg1; // source start offset 6605 Register send = c_rarg2; // source end offset 6606 Register dst = c_rarg3; // dest array 6607 Register doff = c_rarg4; // position for writing to dest array 6608 Register isURL = c_rarg5; // Base64 or URL character set 6609 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 6610 6611 Register length = send; // reuse send as length of source data to process 6612 6613 Register simd_codec = c_rarg6; 6614 Register nosimd_codec = c_rarg7; 6615 6616 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 6617 6618 __ enter(); 6619 6620 __ add(src, src, soff); 6621 __ add(dst, dst, doff); 6622 6623 __ mov(doff, dst); 6624 6625 __ sub(length, send, soff); 6626 __ bfm(length, zr, 0, 1); 6627 6628 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 6629 __ cbz(isURL, ProcessData); 6630 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 6631 6632 __ BIND(ProcessData); 6633 __ mov(rscratch1, length); 6634 __ cmp(length, (u1)144); // 144 = 80 + 64 6635 __ br(Assembler::LT, Process4B); 6636 6637 // In the MIME case, the line length cannot be more than 76 6638 // bytes (see RFC 2045). This is too short a block for SIMD 6639 // to be worthwhile, so we use non-SIMD here. 
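    // Why 79: the Process4B loop below consumes 4 bytes per iteration and
    // keeps looping while the counter is still positive after "subsw 4", so
    // starting from 79 it runs 20 times (79, 75, ..., 3, -1) and decodes
    // exactly 80 input bytes. Since we only get here when length >= 144,
    // at least 144 - 80 = 64 bytes remain for the SIMD loops afterwards.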
6640 __ movw(rscratch1, 79); 6641 6642 __ BIND(Process4B); 6643 __ ldrw(r14, __ post(src, 4)); 6644 __ ubfxw(r10, r14, 0, 8); 6645 __ ubfxw(r11, r14, 8, 8); 6646 __ ubfxw(r12, r14, 16, 8); 6647 __ ubfxw(r13, r14, 24, 8); 6648 // get the de-code 6649 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 6650 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 6651 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 6652 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 6653 // error detection, 255u indicates an illegal input 6654 __ orrw(r14, r10, r11); 6655 __ orrw(r15, r12, r13); 6656 __ orrw(r14, r14, r15); 6657 __ tbnz(r14, 7, Exit); 6658 // recover the data 6659 __ lslw(r14, r10, 10); 6660 __ bfiw(r14, r11, 4, 6); 6661 __ bfmw(r14, r12, 2, 5); 6662 __ rev16w(r14, r14); 6663 __ bfiw(r13, r12, 6, 2); 6664 __ strh(r14, __ post(dst, 2)); 6665 __ strb(r13, __ post(dst, 1)); 6666 // non-simd loop 6667 __ subsw(rscratch1, rscratch1, 4); 6668 __ br(Assembler::GT, Process4B); 6669 6670 // if exiting from PreProcess80B, rscratch1 == -1; 6671 // otherwise, rscratch1 == 0. 6672 __ cbzw(rscratch1, Exit); 6673 __ sub(length, length, 80); 6674 6675 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 6676 __ cbz(isURL, SIMDEnter); 6677 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 6678 6679 __ BIND(SIMDEnter); 6680 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 6681 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 6682 __ mov(rscratch1, 63); 6683 __ dup(v27, __ T16B, rscratch1); 6684 6685 __ BIND(Process64B); 6686 __ cmp(length, (u1)64); 6687 __ br(Assembler::LT, Process32B); 6688 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 6689 __ sub(length, length, 64); 6690 __ b(Process64B); 6691 6692 __ BIND(Process32B); 6693 __ cmp(length, (u1)32); 6694 __ br(Assembler::LT, SIMDExit); 6695 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 6696 __ sub(length, length, 32); 6697 __ b(Process32B); 6698 6699 __ BIND(SIMDExit); 6700 __ cbz(length, Exit); 6701 __ movw(rscratch1, length); 6702 __ b(Process4B); 6703 6704 __ BIND(Exit); 6705 __ sub(c_rarg0, dst, doff); 6706 6707 __ leave(); 6708 __ ret(lr); 6709 6710 return start; 6711 } 6712 6713 // Support for spin waits. 6714 address generate_spin_wait() { 6715 __ align(CodeEntryAlignment); 6716 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 6717 address start = __ pc(); 6718 6719 __ spin_wait(); 6720 __ ret(lr); 6721 6722 return start; 6723 } 6724 6725 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 6726 6727 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 6728 // 6729 // If LSE is in use, generate LSE versions of all the stubs. The 6730 // non-LSE versions are in atomic_aarch64.S. 6731 6732 // class AtomicStubMark records the entry point of a stub and the 6733 // stub pointer which will point to it. The stub pointer is set to 6734 // the entry point when ~AtomicStubMark() is called, which must be 6735 // after ICache::invalidate_range. This ensures safe publication of 6736 // the generated code. 
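    // Typical use, as in generate_atomic_entry_points() below (sketch):
    //
    //   AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
    //   gen_ldadd_entry(Assembler::word, memory_order_conservative);
    //   ...
    //   ICache::invalidate_range(first_entry, __ pc() - first_entry);
    //   // all AtomicStubMark destructors run after this point and publish
    //   // the recorded entry points.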
6737 class AtomicStubMark { 6738 address _entry_point; 6739 aarch64_atomic_stub_t *_stub; 6740 MacroAssembler *_masm; 6741 public: 6742 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 6743 _masm = masm; 6744 __ align(32); 6745 _entry_point = __ pc(); 6746 _stub = stub; 6747 } 6748 ~AtomicStubMark() { 6749 *_stub = (aarch64_atomic_stub_t)_entry_point; 6750 } 6751 }; 6752 6753 // NB: For memory_order_conservative we need a trailing membar after 6754 // LSE atomic operations but not a leading membar. 6755 // 6756 // We don't need a leading membar because a clause in the Arm ARM 6757 // says: 6758 // 6759 // Barrier-ordered-before 6760 // 6761 // Barrier instructions order prior Memory effects before subsequent 6762 // Memory effects generated by the same Observer. A read or a write 6763 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 6764 // Observer if and only if RW1 appears in program order before RW 2 6765 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 6766 // instruction with both Acquire and Release semantics. 6767 // 6768 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 6769 // and Release semantics, therefore we don't need a leading 6770 // barrier. However, there is no corresponding Barrier-ordered-after 6771 // relationship, therefore we need a trailing membar to prevent a 6772 // later store or load from being reordered with the store in an 6773 // atomic instruction. 6774 // 6775 // This was checked by using the herd7 consistency model simulator 6776 // (http://diy.inria.fr/) with this test case: 6777 // 6778 // AArch64 LseCas 6779 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 6780 // P0 | P1; 6781 // LDR W4, [X2] | MOV W3, #0; 6782 // DMB LD | MOV W4, #1; 6783 // LDR W3, [X1] | CASAL W3, W4, [X1]; 6784 // | DMB ISH; 6785 // | STR W4, [X2]; 6786 // exists 6787 // (0:X3=0 /\ 0:X4=1) 6788 // 6789 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 6790 // with the store to x in P1. Without the DMB in P1 this may happen. 6791 // 6792 // At the time of writing we don't know of any AArch64 hardware that 6793 // reorders stores in this way, but the Reference Manual permits it. 6794 6795 void gen_cas_entry(Assembler::operand_size size, 6796 atomic_memory_order order) { 6797 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 6798 exchange_val = c_rarg2; 6799 bool acquire, release; 6800 switch (order) { 6801 case memory_order_relaxed: 6802 acquire = false; 6803 release = false; 6804 break; 6805 case memory_order_release: 6806 acquire = false; 6807 release = true; 6808 break; 6809 default: 6810 acquire = true; 6811 release = true; 6812 break; 6813 } 6814 __ mov(prev, compare_val); 6815 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 6816 if (order == memory_order_conservative) { 6817 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6818 } 6819 if (size == Assembler::xword) { 6820 __ mov(r0, prev); 6821 } else { 6822 __ movw(r0, prev); 6823 } 6824 __ ret(lr); 6825 } 6826 6827 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 6828 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6829 // If not relaxed, then default to conservative. Relaxed is the only 6830 // case we use enough to be worth specializing. 
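    // In C, approximately (a sketch only; fetch_add_sketch is an
    // illustrative name, and the conservative flavour generated below is
    // at least as strong as the seq_cst ordering used here):
    //
    //   uint64_t fetch_add_sketch(volatile uint64_t *ptr, uint64_t incr,
    //                             bool relaxed) {
    //     return __atomic_fetch_add(ptr, incr,
    //                               relaxed ? __ATOMIC_RELAXED : __ATOMIC_SEQ_CST);
    //   }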
6831 if (order == memory_order_relaxed) { 6832 __ ldadd(size, incr, prev, addr); 6833 } else { 6834 __ ldaddal(size, incr, prev, addr); 6835 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6836 } 6837 if (size == Assembler::xword) { 6838 __ mov(r0, prev); 6839 } else { 6840 __ movw(r0, prev); 6841 } 6842 __ ret(lr); 6843 } 6844 6845 void gen_swpal_entry(Assembler::operand_size size) { 6846 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6847 __ swpal(size, incr, prev, addr); 6848 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6849 if (size == Assembler::xword) { 6850 __ mov(r0, prev); 6851 } else { 6852 __ movw(r0, prev); 6853 } 6854 __ ret(lr); 6855 } 6856 6857 void generate_atomic_entry_points() { 6858 if (! UseLSE) { 6859 return; 6860 } 6861 6862 __ align(CodeEntryAlignment); 6863 StubCodeMark mark(this, "StubRoutines", "atomic entry points"); 6864 address first_entry = __ pc(); 6865 6866 // ADD, memory_order_conservative 6867 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 6868 gen_ldadd_entry(Assembler::word, memory_order_conservative); 6869 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 6870 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 6871 6872 // ADD, memory_order_relaxed 6873 AtomicStubMark mark_fetch_add_4_relaxed 6874 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 6875 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 6876 AtomicStubMark mark_fetch_add_8_relaxed 6877 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 6878 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 6879 6880 // XCHG, memory_order_conservative 6881 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 6882 gen_swpal_entry(Assembler::word); 6883 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 6884 gen_swpal_entry(Assembler::xword); 6885 6886 // CAS, memory_order_conservative 6887 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 6888 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 6889 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 6890 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 6891 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 6892 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 6893 6894 // CAS, memory_order_relaxed 6895 AtomicStubMark mark_cmpxchg_1_relaxed 6896 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 6897 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 6898 AtomicStubMark mark_cmpxchg_4_relaxed 6899 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 6900 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 6901 AtomicStubMark mark_cmpxchg_8_relaxed 6902 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 6903 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 6904 6905 AtomicStubMark mark_cmpxchg_4_release 6906 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 6907 gen_cas_entry(MacroAssembler::word, memory_order_release); 6908 AtomicStubMark mark_cmpxchg_8_release 6909 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 6910 gen_cas_entry(MacroAssembler::xword, memory_order_release); 6911 6912 AtomicStubMark mark_cmpxchg_4_seq_cst 6913 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 6914 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 6915 AtomicStubMark mark_cmpxchg_8_seq_cst 6916 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 6917 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 6918 
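    // Flush the icache over everything generated above before the
    // AtomicStubMark destructors run at the end of this scope; as noted
    // earlier, the stub pointers must only be published after
    // ICache::invalidate_range.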
6919 ICache::invalidate_range(first_entry, __ pc() - first_entry); 6920 } 6921 #endif // LINUX 6922 6923 address generate_cont_thaw(Continuation::thaw_kind kind) { 6924 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 6925 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 6926 6927 address start = __ pc(); 6928 6929 if (return_barrier) { 6930 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 6931 __ mov(sp, rscratch1); 6932 } 6933 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 6934 6935 if (return_barrier) { 6936 // preserve possible return value from a method returning to the return barrier 6937 __ fmovd(rscratch1, v0); 6938 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 6939 } 6940 6941 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 6942 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 6943 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 6944 6945 if (return_barrier) { 6946 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 6947 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 6948 __ fmovd(v0, rscratch1); 6949 } 6950 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 6951 6952 6953 Label thaw_success; 6954 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 6955 __ cbnz(rscratch2, thaw_success); 6956 __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry())); 6957 __ br(rscratch1); 6958 __ bind(thaw_success); 6959 6960 // make room for the thawed frames 6961 __ sub(rscratch1, sp, rscratch2); 6962 __ andr(rscratch1, rscratch1, -16); // align 6963 __ mov(sp, rscratch1); 6964 6965 if (return_barrier) { 6966 // save original return value -- again 6967 __ fmovd(rscratch1, v0); 6968 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 6969 } 6970 6971 // If we want, we can templatize thaw by kind, and have three different entries 6972 __ movw(c_rarg1, (uint32_t)kind); 6973 6974 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 6975 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 6976 6977 if (return_barrier) { 6978 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 6979 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 6980 __ fmovd(v0, rscratch1); 6981 } else { 6982 __ mov(r0, zr); // return 0 (success) from doYield 6983 } 6984 6985 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 6986 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 6987 __ mov(rfp, sp); 6988 6989 if (return_barrier_exception) { 6990 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 6991 __ verify_oop(r0); 6992 __ mov(r19, r0); // save return value contaning the exception oop in callee-saved R19 6993 6994 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 6995 6996 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
6997 // __ reinitialize_ptrue(); 6998 6999 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 7000 7001 __ mov(r1, r0); // the exception handler 7002 __ mov(r0, r19); // restore return value contaning the exception oop 7003 __ verify_oop(r0); 7004 7005 __ leave(); 7006 __ mov(r3, lr); 7007 __ br(r1); // the exception handler 7008 } else { 7009 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 7010 __ leave(); 7011 __ ret(lr); 7012 } 7013 7014 return start; 7015 } 7016 7017 address generate_cont_thaw() { 7018 if (!Continuations::enabled()) return nullptr; 7019 7020 StubCodeMark mark(this, "StubRoutines", "Cont thaw"); 7021 address start = __ pc(); 7022 generate_cont_thaw(Continuation::thaw_top); 7023 return start; 7024 } 7025 7026 address generate_cont_returnBarrier() { 7027 if (!Continuations::enabled()) return nullptr; 7028 7029 // TODO: will probably need multiple return barriers depending on return type 7030 StubCodeMark mark(this, "StubRoutines", "cont return barrier"); 7031 address start = __ pc(); 7032 7033 generate_cont_thaw(Continuation::thaw_return_barrier); 7034 7035 return start; 7036 } 7037 7038 address generate_cont_returnBarrier_exception() { 7039 if (!Continuations::enabled()) return nullptr; 7040 7041 StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler"); 7042 address start = __ pc(); 7043 7044 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 7045 7046 return start; 7047 } 7048 7049 #if INCLUDE_JFR 7050 7051 static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) { 7052 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 7053 __ mov(c_rarg0, thread); 7054 } 7055 7056 // The handle is dereferenced through a load barrier. 7057 static void jfr_epilogue(MacroAssembler* _masm) { 7058 __ reset_last_Java_frame(true); 7059 __ resolve_global_jobject(r0, rscratch1, rscratch2); 7060 } 7061 7062 // For c2: c_rarg0 is junk, call to runtime to write a checkpoint. 7063 // It returns a jobject handle to the event writer. 7064 // The handle is dereferenced and the return value is the event writer oop. 7065 static RuntimeStub* generate_jfr_write_checkpoint() { 7066 enum layout { 7067 rbp_off, 7068 rbpH_off, 7069 return_off, 7070 return_off2, 7071 framesize // inclusive of return address 7072 }; 7073 7074 int insts_size = 1024; 7075 int locs_size = 64; 7076 CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size); 7077 OopMapSet* oop_maps = new OopMapSet(); 7078 MacroAssembler* masm = new MacroAssembler(&code); 7079 MacroAssembler* _masm = masm; 7080 7081 address start = __ pc(); 7082 __ enter(); 7083 int frame_complete = __ pc() - start; 7084 address the_pc = __ pc(); 7085 jfr_prologue(the_pc, _masm, rthread); 7086 __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1); 7087 jfr_epilogue(_masm); 7088 __ leave(); 7089 __ ret(lr); 7090 7091 OopMap* map = new OopMap(framesize, 1); // rfp 7092 oop_maps->add_gc_map(the_pc - start, map); 7093 7094 RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size) 7095 RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete, 7096 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7097 oop_maps, false); 7098 return stub; 7099 } 7100 7101 #endif // INCLUDE_JFR 7102 7103 // Continuation point for throwing of implicit exceptions that are 7104 // not handled in the current activation. 
Fabricates an exception 7105 // oop and initiates normal exception dispatching in this 7106 // frame. Since we need to preserve callee-saved values (currently 7107 // only for C2, but done for C1 as well) we need a callee-saved oop 7108 // map and therefore have to make these stubs into RuntimeStubs 7109 // rather than BufferBlobs. If the compiler needs all registers to 7110 // be preserved between the fault point and the exception handler 7111 // then it must assume responsibility for that in 7112 // AbstractCompiler::continuation_for_implicit_null_exception or 7113 // continuation_for_implicit_division_by_zero_exception. All other 7114 // implicit exceptions (e.g., NullPointerException or 7115 // AbstractMethodError on entry) are either at call sites or 7116 // otherwise assume that stack unwinding will be initiated, so 7117 // caller saved registers were assumed volatile in the compiler. 7118 7119 #undef __ 7120 #define __ masm-> 7121 7122 address generate_throw_exception(const char* name, 7123 address runtime_entry, 7124 Register arg1 = noreg, 7125 Register arg2 = noreg) { 7126 // Information about frame layout at time of blocking runtime call. 7127 // Note that we only have to preserve callee-saved registers since 7128 // the compilers are responsible for supplying a continuation point 7129 // if they expect all registers to be preserved. 7130 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 7131 enum layout { 7132 rfp_off = 0, 7133 rfp_off2, 7134 return_off, 7135 return_off2, 7136 framesize // inclusive of return address 7137 }; 7138 7139 int insts_size = 512; 7140 int locs_size = 64; 7141 7142 CodeBuffer code(name, insts_size, locs_size); 7143 OopMapSet* oop_maps = new OopMapSet(); 7144 MacroAssembler* masm = new MacroAssembler(&code); 7145 7146 address start = __ pc(); 7147 7148 // This is an inlined and slightly modified version of call_VM 7149 // which has the ability to fetch the return PC out of 7150 // thread-local storage and also sets up last_Java_sp slightly 7151 // differently than the real call_VM 7152 7153 __ enter(); // Save FP and LR before call 7154 7155 assert(is_even(framesize/2), "sp not 16-byte aligned"); 7156 7157 // lr and fp are already in place 7158 __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog 7159 7160 int frame_complete = __ pc() - start; 7161 7162 // Set up last_Java_sp and last_Java_fp 7163 address the_pc = __ pc(); 7164 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 7165 7166 // Call runtime 7167 if (arg1 != noreg) { 7168 assert(arg2 != c_rarg1, "clobbered"); 7169 __ mov(c_rarg1, arg1); 7170 } 7171 if (arg2 != noreg) { 7172 __ mov(c_rarg2, arg2); 7173 } 7174 __ mov(c_rarg0, rthread); 7175 BLOCK_COMMENT("call runtime_entry"); 7176 __ mov(rscratch1, runtime_entry); 7177 __ blr(rscratch1); 7178 7179 // Generate oop map 7180 OopMap* map = new OopMap(framesize, 0); 7181 7182 oop_maps->add_gc_map(the_pc - start, map); 7183 7184 __ reset_last_Java_frame(true); 7185 7186 // Reinitialize the ptrue predicate register, in case the external runtime 7187 // call clobbers ptrue reg, as we may return to SVE compiled code. 
7188 __ reinitialize_ptrue(); 7189 7190 __ leave(); 7191 7192 // check for pending exceptions 7193 #ifdef ASSERT 7194 Label L; 7195 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 7196 __ cbnz(rscratch1, L); 7197 __ should_not_reach_here(); 7198 __ bind(L); 7199 #endif // ASSERT 7200 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 7201 7202 // codeBlob framesize is in words (not VMRegImpl::slot_size) 7203 RuntimeStub* stub = 7204 RuntimeStub::new_runtime_stub(name, 7205 &code, 7206 frame_complete, 7207 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 7208 oop_maps, false); 7209 return stub->entry_point(); 7210 } 7211 7212 class MontgomeryMultiplyGenerator : public MacroAssembler { 7213 7214 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7215 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7216 7217 RegSet _toSave; 7218 bool _squaring; 7219 7220 public: 7221 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7222 : MacroAssembler(as->code()), _squaring(squaring) { 7223 7224 // Register allocation 7225 7226 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7227 Pa_base = *regs; // Argument registers 7228 if (squaring) 7229 Pb_base = Pa_base; 7230 else 7231 Pb_base = *++regs; 7232 Pn_base = *++regs; 7233 Rlen= *++regs; 7234 inv = *++regs; 7235 Pm_base = *++regs; 7236 7237 // Working registers: 7238 Ra = *++regs; // The current digit of a, b, n, and m. 7239 Rb = *++regs; 7240 Rm = *++regs; 7241 Rn = *++regs; 7242 7243 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 7244 Pb = *++regs; 7245 Pm = *++regs; 7246 Pn = *++regs; 7247 7248 t0 = *++regs; // Three registers which form a 7249 t1 = *++regs; // triple-precision accumuator. 7250 t2 = *++regs; 7251 7252 Ri = *++regs; // Inner and outer loop indexes. 7253 Rj = *++regs; 7254 7255 Rhi_ab = *++regs; // Product registers: low and high parts 7256 Rlo_ab = *++regs; // of a*b and m*n. 7257 Rhi_mn = *++regs; 7258 Rlo_mn = *++regs; 7259 7260 // r19 and up are callee-saved. 
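      // With the allocation above (and assuming the non-squaring case) this
      // works out to roughly: Pa_base..Pm_base in r0..r5, the working
      // registers in r6..r17 and r19..r23 (r18 is skipped as the platform
      // register), so _toSave below covers r19 up to the last product
      // register, plus Pm_base.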
7261 _toSave = RegSet::range(r19, *regs) + Pm_base; 7262 } 7263 7264 private: 7265 void save_regs() { 7266 push(_toSave, sp); 7267 } 7268 7269 void restore_regs() { 7270 pop(_toSave, sp); 7271 } 7272 7273 template <typename T> 7274 void unroll_2(Register count, T block) { 7275 Label loop, end, odd; 7276 tbnz(count, 0, odd); 7277 cbz(count, end); 7278 align(16); 7279 bind(loop); 7280 (this->*block)(); 7281 bind(odd); 7282 (this->*block)(); 7283 subs(count, count, 2); 7284 br(Assembler::GT, loop); 7285 bind(end); 7286 } 7287 7288 template <typename T> 7289 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 7290 Label loop, end, odd; 7291 tbnz(count, 0, odd); 7292 cbz(count, end); 7293 align(16); 7294 bind(loop); 7295 (this->*block)(d, s, tmp); 7296 bind(odd); 7297 (this->*block)(d, s, tmp); 7298 subs(count, count, 2); 7299 br(Assembler::GT, loop); 7300 bind(end); 7301 } 7302 7303 void pre1(RegisterOrConstant i) { 7304 block_comment("pre1"); 7305 // Pa = Pa_base; 7306 // Pb = Pb_base + i; 7307 // Pm = Pm_base; 7308 // Pn = Pn_base + i; 7309 // Ra = *Pa; 7310 // Rb = *Pb; 7311 // Rm = *Pm; 7312 // Rn = *Pn; 7313 ldr(Ra, Address(Pa_base)); 7314 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7315 ldr(Rm, Address(Pm_base)); 7316 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7317 lea(Pa, Address(Pa_base)); 7318 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7319 lea(Pm, Address(Pm_base)); 7320 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7321 7322 // Zero the m*n result. 7323 mov(Rhi_mn, zr); 7324 mov(Rlo_mn, zr); 7325 } 7326 7327 // The core multiply-accumulate step of a Montgomery 7328 // multiplication. The idea is to schedule operations as a 7329 // pipeline so that instructions with long latencies (loads and 7330 // multiplies) have time to complete before their results are 7331 // used. This most benefits in-order implementations of the 7332 // architecture but out-of-order ones also benefit. 7333 void step() { 7334 block_comment("step"); 7335 // MACC(Ra, Rb, t0, t1, t2); 7336 // Ra = *++Pa; 7337 // Rb = *--Pb; 7338 umulh(Rhi_ab, Ra, Rb); 7339 mul(Rlo_ab, Ra, Rb); 7340 ldr(Ra, pre(Pa, wordSize)); 7341 ldr(Rb, pre(Pb, -wordSize)); 7342 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 7343 // previous iteration. 7344 // MACC(Rm, Rn, t0, t1, t2); 7345 // Rm = *++Pm; 7346 // Rn = *--Pn; 7347 umulh(Rhi_mn, Rm, Rn); 7348 mul(Rlo_mn, Rm, Rn); 7349 ldr(Rm, pre(Pm, wordSize)); 7350 ldr(Rn, pre(Pn, -wordSize)); 7351 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7352 } 7353 7354 void post1() { 7355 block_comment("post1"); 7356 7357 // MACC(Ra, Rb, t0, t1, t2); 7358 // Ra = *++Pa; 7359 // Rb = *--Pb; 7360 umulh(Rhi_ab, Ra, Rb); 7361 mul(Rlo_ab, Ra, Rb); 7362 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7363 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7364 7365 // *Pm = Rm = t0 * inv; 7366 mul(Rm, t0, inv); 7367 str(Rm, Address(Pm)); 7368 7369 // MACC(Rm, Rn, t0, t1, t2); 7370 // t0 = t1; t1 = t2; t2 = 0; 7371 umulh(Rhi_mn, Rm, Rn); 7372 7373 #ifndef PRODUCT 7374 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7375 { 7376 mul(Rlo_mn, Rm, Rn); 7377 add(Rlo_mn, t0, Rlo_mn); 7378 Label ok; 7379 cbz(Rlo_mn, ok); { 7380 stop("broken Montgomery multiply"); 7381 } bind(ok); 7382 } 7383 #endif 7384 // We have very carefully set things up so that 7385 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7386 // the lower half of Rm * Rn because we know the result already: 7387 // it must be -t0. 
t0 + (-t0) must generate a carry iff 7388 // t0 != 0. So, rather than do a mul and an adds we just set 7389 // the carry flag iff t0 is nonzero. 7390 // 7391 // mul(Rlo_mn, Rm, Rn); 7392 // adds(zr, t0, Rlo_mn); 7393 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7394 adcs(t0, t1, Rhi_mn); 7395 adc(t1, t2, zr); 7396 mov(t2, zr); 7397 } 7398 7399 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 7400 block_comment("pre2"); 7401 // Pa = Pa_base + i-len; 7402 // Pb = Pb_base + len; 7403 // Pm = Pm_base + i-len; 7404 // Pn = Pn_base + len; 7405 7406 if (i.is_register()) { 7407 sub(Rj, i.as_register(), len); 7408 } else { 7409 mov(Rj, i.as_constant()); 7410 sub(Rj, Rj, len); 7411 } 7412 // Rj == i-len 7413 7414 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 7415 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 7416 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7417 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 7418 7419 // Ra = *++Pa; 7420 // Rb = *--Pb; 7421 // Rm = *++Pm; 7422 // Rn = *--Pn; 7423 ldr(Ra, pre(Pa, wordSize)); 7424 ldr(Rb, pre(Pb, -wordSize)); 7425 ldr(Rm, pre(Pm, wordSize)); 7426 ldr(Rn, pre(Pn, -wordSize)); 7427 7428 mov(Rhi_mn, zr); 7429 mov(Rlo_mn, zr); 7430 } 7431 7432 void post2(RegisterOrConstant i, RegisterOrConstant len) { 7433 block_comment("post2"); 7434 if (i.is_constant()) { 7435 mov(Rj, i.as_constant()-len.as_constant()); 7436 } else { 7437 sub(Rj, i.as_register(), len); 7438 } 7439 7440 adds(t0, t0, Rlo_mn); // The pending m*n, low part 7441 7442 // As soon as we know the least significant digit of our result, 7443 // store it. 7444 // Pm_base[i-len] = t0; 7445 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7446 7447 // t0 = t1; t1 = t2; t2 = 0; 7448 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 7449 adc(t1, t2, zr); 7450 mov(t2, zr); 7451 } 7452 7453 // A carry in t0 after Montgomery multiplication means that we 7454 // should subtract multiples of n from our result in m. We'll 7455 // keep doing that until there is no carry. 7456 void normalize(RegisterOrConstant len) { 7457 block_comment("normalize"); 7458 // while (t0) 7459 // t0 = sub(Pm_base, Pn_base, t0, len); 7460 Label loop, post, again; 7461 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 7462 cbz(t0, post); { 7463 bind(again); { 7464 mov(i, zr); 7465 mov(cnt, len); 7466 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7467 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7468 subs(zr, zr, zr); // set carry flag, i.e. no borrow 7469 align(16); 7470 bind(loop); { 7471 sbcs(Rm, Rm, Rn); 7472 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7473 add(i, i, 1); 7474 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7475 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7476 sub(cnt, cnt, 1); 7477 } cbnz(cnt, loop); 7478 sbc(t0, t0, zr); 7479 } cbnz(t0, again); 7480 } bind(post); 7481 } 7482 7483 // Move memory at s to d, reversing words. 
7484 // Increments d to end of copied memory 7485 // Destroys tmp1, tmp2 7486 // Preserves len 7487 // Leaves s pointing to the address which was in d at start 7488 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 7489 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 7490 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 7491 7492 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 7493 mov(tmp1, len); 7494 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 7495 sub(s, d, len, ext::uxtw, LogBytesPerWord); 7496 } 7497 // where 7498 void reverse1(Register d, Register s, Register tmp) { 7499 ldr(tmp, pre(s, -wordSize)); 7500 ror(tmp, tmp, 32); 7501 str(tmp, post(d, wordSize)); 7502 } 7503 7504 void step_squaring() { 7505 // An extra ACC 7506 step(); 7507 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7508 } 7509 7510 void last_squaring(RegisterOrConstant i) { 7511 Label dont; 7512 // if ((i & 1) == 0) { 7513 tbnz(i.as_register(), 0, dont); { 7514 // MACC(Ra, Rb, t0, t1, t2); 7515 // Ra = *++Pa; 7516 // Rb = *--Pb; 7517 umulh(Rhi_ab, Ra, Rb); 7518 mul(Rlo_ab, Ra, Rb); 7519 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7520 } bind(dont); 7521 } 7522 7523 void extra_step_squaring() { 7524 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7525 7526 // MACC(Rm, Rn, t0, t1, t2); 7527 // Rm = *++Pm; 7528 // Rn = *--Pn; 7529 umulh(Rhi_mn, Rm, Rn); 7530 mul(Rlo_mn, Rm, Rn); 7531 ldr(Rm, pre(Pm, wordSize)); 7532 ldr(Rn, pre(Pn, -wordSize)); 7533 } 7534 7535 void post1_squaring() { 7536 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7537 7538 // *Pm = Rm = t0 * inv; 7539 mul(Rm, t0, inv); 7540 str(Rm, Address(Pm)); 7541 7542 // MACC(Rm, Rn, t0, t1, t2); 7543 // t0 = t1; t1 = t2; t2 = 0; 7544 umulh(Rhi_mn, Rm, Rn); 7545 7546 #ifndef PRODUCT 7547 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7548 { 7549 mul(Rlo_mn, Rm, Rn); 7550 add(Rlo_mn, t0, Rlo_mn); 7551 Label ok; 7552 cbz(Rlo_mn, ok); { 7553 stop("broken Montgomery multiply"); 7554 } bind(ok); 7555 } 7556 #endif 7557 // We have very carefully set things up so that 7558 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7559 // the lower half of Rm * Rn because we know the result already: 7560 // it must be -t0. t0 + (-t0) must generate a carry iff 7561 // t0 != 0. So, rather than do a mul and an adds we just set 7562 // the carry flag iff t0 is nonzero. 7563 // 7564 // mul(Rlo_mn, Rm, Rn); 7565 // adds(zr, t0, Rlo_mn); 7566 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7567 adcs(t0, t1, Rhi_mn); 7568 adc(t1, t2, zr); 7569 mov(t2, zr); 7570 } 7571 7572 void acc(Register Rhi, Register Rlo, 7573 Register t0, Register t1, Register t2) { 7574 adds(t0, t0, Rlo); 7575 adcs(t1, t1, Rhi); 7576 adc(t2, t2, zr); 7577 } 7578 7579 public: 7580 /** 7581 * Fast Montgomery multiplication. The derivation of the 7582 * algorithm is in A Cryptographic Library for the Motorola 7583 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
7584 * 7585 * Arguments: 7586 * 7587 * Inputs for multiplication: 7588 * c_rarg0 - int array elements a 7589 * c_rarg1 - int array elements b 7590 * c_rarg2 - int array elements n (the modulus) 7591 * c_rarg3 - int length 7592 * c_rarg4 - int inv 7593 * c_rarg5 - int array elements m (the result) 7594 * 7595 * Inputs for squaring: 7596 * c_rarg0 - int array elements a 7597 * c_rarg1 - int array elements n (the modulus) 7598 * c_rarg2 - int length 7599 * c_rarg3 - int inv 7600 * c_rarg4 - int array elements m (the result) 7601 * 7602 */ 7603 address generate_multiply() { 7604 Label argh, nothing; 7605 bind(argh); 7606 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7607 7608 align(CodeEntryAlignment); 7609 address entry = pc(); 7610 7611 cbzw(Rlen, nothing); 7612 7613 enter(); 7614 7615 // Make room. 7616 cmpw(Rlen, 512); 7617 br(Assembler::HI, argh); 7618 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7619 andr(sp, Ra, -2 * wordSize); 7620 7621 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7622 7623 { 7624 // Copy input args, reversing as we go. We use Ra as a 7625 // temporary variable. 7626 reverse(Ra, Pa_base, Rlen, t0, t1); 7627 if (!_squaring) 7628 reverse(Ra, Pb_base, Rlen, t0, t1); 7629 reverse(Ra, Pn_base, Rlen, t0, t1); 7630 } 7631 7632 // Push all call-saved registers and also Pm_base which we'll need 7633 // at the end. 7634 save_regs(); 7635 7636 #ifndef PRODUCT 7637 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 7638 { 7639 ldr(Rn, Address(Pn_base, 0)); 7640 mul(Rlo_mn, Rn, inv); 7641 subs(zr, Rlo_mn, -1); 7642 Label ok; 7643 br(EQ, ok); { 7644 stop("broken inverse in Montgomery multiply"); 7645 } bind(ok); 7646 } 7647 #endif 7648 7649 mov(Pm_base, Ra); 7650 7651 mov(t0, zr); 7652 mov(t1, zr); 7653 mov(t2, zr); 7654 7655 block_comment("for (int i = 0; i < len; i++) {"); 7656 mov(Ri, zr); { 7657 Label loop, end; 7658 cmpw(Ri, Rlen); 7659 br(Assembler::GE, end); 7660 7661 bind(loop); 7662 pre1(Ri); 7663 7664 block_comment(" for (j = i; j; j--) {"); { 7665 movw(Rj, Ri); 7666 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7667 } block_comment(" } // j"); 7668 7669 post1(); 7670 addw(Ri, Ri, 1); 7671 cmpw(Ri, Rlen); 7672 br(Assembler::LT, loop); 7673 bind(end); 7674 block_comment("} // i"); 7675 } 7676 7677 block_comment("for (int i = len; i < 2*len; i++) {"); 7678 mov(Ri, Rlen); { 7679 Label loop, end; 7680 cmpw(Ri, Rlen, Assembler::LSL, 1); 7681 br(Assembler::GE, end); 7682 7683 bind(loop); 7684 pre2(Ri, Rlen); 7685 7686 block_comment(" for (j = len*2-i-1; j; j--) {"); { 7687 lslw(Rj, Rlen, 1); 7688 subw(Rj, Rj, Ri); 7689 subw(Rj, Rj, 1); 7690 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7691 } block_comment(" } // j"); 7692 7693 post2(Ri, Rlen); 7694 addw(Ri, Ri, 1); 7695 cmpw(Ri, Rlen, Assembler::LSL, 1); 7696 br(Assembler::LT, loop); 7697 bind(end); 7698 } 7699 block_comment("} // i"); 7700 7701 normalize(Rlen); 7702 7703 mov(Ra, Pm_base); // Save Pm_base in Ra 7704 restore_regs(); // Restore caller's Pm_base 7705 7706 // Copy our result into caller's Pm_base 7707 reverse(Pm_base, Ra, Rlen, t0, t1); 7708 7709 leave(); 7710 bind(nothing); 7711 ret(lr); 7712 7713 return entry; 7714 } 7715 // In C, approximately: 7716 7717 // void 7718 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 7719 // julong Pn_base[], julong Pm_base[], 7720 // julong inv, int len) { 7721 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7722 // julong *Pa, *Pb, *Pn, *Pm; 7723 // julong Ra, Rb, Rn, Rm; 7724 7725 // 
int i; 7726 7727 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7728 7729 // for (i = 0; i < len; i++) { 7730 // int j; 7731 7732 // Pa = Pa_base; 7733 // Pb = Pb_base + i; 7734 // Pm = Pm_base; 7735 // Pn = Pn_base + i; 7736 7737 // Ra = *Pa; 7738 // Rb = *Pb; 7739 // Rm = *Pm; 7740 // Rn = *Pn; 7741 7742 // int iters = i; 7743 // for (j = 0; iters--; j++) { 7744 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7745 // MACC(Ra, Rb, t0, t1, t2); 7746 // Ra = *++Pa; 7747 // Rb = *--Pb; 7748 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7749 // MACC(Rm, Rn, t0, t1, t2); 7750 // Rm = *++Pm; 7751 // Rn = *--Pn; 7752 // } 7753 7754 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 7755 // MACC(Ra, Rb, t0, t1, t2); 7756 // *Pm = Rm = t0 * inv; 7757 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7758 // MACC(Rm, Rn, t0, t1, t2); 7759 7760 // assert(t0 == 0, "broken Montgomery multiply"); 7761 7762 // t0 = t1; t1 = t2; t2 = 0; 7763 // } 7764 7765 // for (i = len; i < 2*len; i++) { 7766 // int j; 7767 7768 // Pa = Pa_base + i-len; 7769 // Pb = Pb_base + len; 7770 // Pm = Pm_base + i-len; 7771 // Pn = Pn_base + len; 7772 7773 // Ra = *++Pa; 7774 // Rb = *--Pb; 7775 // Rm = *++Pm; 7776 // Rn = *--Pn; 7777 7778 // int iters = len*2-i-1; 7779 // for (j = i-len+1; iters--; j++) { 7780 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7781 // MACC(Ra, Rb, t0, t1, t2); 7782 // Ra = *++Pa; 7783 // Rb = *--Pb; 7784 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7785 // MACC(Rm, Rn, t0, t1, t2); 7786 // Rm = *++Pm; 7787 // Rn = *--Pn; 7788 // } 7789 7790 // Pm_base[i-len] = t0; 7791 // t0 = t1; t1 = t2; t2 = 0; 7792 // } 7793 7794 // while (t0) 7795 // t0 = sub(Pm_base, Pn_base, t0, len); 7796 // } 7797 7798 /** 7799 * Fast Montgomery squaring. This uses asymptotically 25% fewer 7800 * multiplies than Montgomery multiplication so it should be up to 7801 * 25% faster. However, its loop control is more complex and it 7802 * may actually run slower on some machines. 7803 * 7804 * Arguments: 7805 * 7806 * Inputs: 7807 * c_rarg0 - int array elements a 7808 * c_rarg1 - int array elements n (the modulus) 7809 * c_rarg2 - int length 7810 * c_rarg3 - int inv 7811 * c_rarg4 - int array elements m (the result) 7812 * 7813 */ 7814 address generate_square() { 7815 Label argh; 7816 bind(argh); 7817 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7818 7819 align(CodeEntryAlignment); 7820 address entry = pc(); 7821 7822 enter(); 7823 7824 // Make room. 7825 cmpw(Rlen, 512); 7826 br(Assembler::HI, argh); 7827 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7828 andr(sp, Ra, -2 * wordSize); 7829 7830 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7831 7832 { 7833 // Copy input args, reversing as we go. We use Ra as a 7834 // temporary variable. 7835 reverse(Ra, Pa_base, Rlen, t0, t1); 7836 reverse(Ra, Pn_base, Rlen, t0, t1); 7837 } 7838 7839 // Push all call-saved registers and also Pm_base which we'll need 7840 // at the end. 
7841 save_regs(); 7842 7843 mov(Pm_base, Ra); 7844 7845 mov(t0, zr); 7846 mov(t1, zr); 7847 mov(t2, zr); 7848 7849 block_comment("for (int i = 0; i < len; i++) {"); 7850 mov(Ri, zr); { 7851 Label loop, end; 7852 bind(loop); 7853 cmp(Ri, Rlen); 7854 br(Assembler::GE, end); 7855 7856 pre1(Ri); 7857 7858 block_comment("for (j = (i+1)/2; j; j--) {"); { 7859 add(Rj, Ri, 1); 7860 lsr(Rj, Rj, 1); 7861 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 7862 } block_comment(" } // j"); 7863 7864 last_squaring(Ri); 7865 7866 block_comment(" for (j = i/2; j; j--) {"); { 7867 lsr(Rj, Ri, 1); 7868 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 7869 } block_comment(" } // j"); 7870 7871 post1_squaring(); 7872 add(Ri, Ri, 1); 7873 cmp(Ri, Rlen); 7874 br(Assembler::LT, loop); 7875 7876 bind(end); 7877 block_comment("} // i"); 7878 } 7879 7880 block_comment("for (int i = len; i < 2*len; i++) {"); 7881 mov(Ri, Rlen); { 7882 Label loop, end; 7883 bind(loop); 7884 cmp(Ri, Rlen, Assembler::LSL, 1); 7885 br(Assembler::GE, end); 7886 7887 pre2(Ri, Rlen); 7888 7889 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 7890 lsl(Rj, Rlen, 1); 7891 sub(Rj, Rj, Ri); 7892 sub(Rj, Rj, 1); 7893 lsr(Rj, Rj, 1); 7894 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 7895 } block_comment(" } // j"); 7896 7897 last_squaring(Ri); 7898 7899 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 7900 lsl(Rj, Rlen, 1); 7901 sub(Rj, Rj, Ri); 7902 lsr(Rj, Rj, 1); 7903 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 7904 } block_comment(" } // j"); 7905 7906 post2(Ri, Rlen); 7907 add(Ri, Ri, 1); 7908 cmp(Ri, Rlen, Assembler::LSL, 1); 7909 7910 br(Assembler::LT, loop); 7911 bind(end); 7912 block_comment("} // i"); 7913 } 7914 7915 normalize(Rlen); 7916 7917 mov(Ra, Pm_base); // Save Pm_base in Ra 7918 restore_regs(); // Restore caller's Pm_base 7919 7920 // Copy our result into caller's Pm_base 7921 reverse(Pm_base, Ra, Rlen, t0, t1); 7922 7923 leave(); 7924 ret(lr); 7925 7926 return entry; 7927 } 7928 // In C, approximately: 7929 7930 // void 7931 // montgomery_square(julong Pa_base[], julong Pn_base[], 7932 // julong Pm_base[], julong inv, int len) { 7933 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7934 // julong *Pa, *Pb, *Pn, *Pm; 7935 // julong Ra, Rb, Rn, Rm; 7936 7937 // int i; 7938 7939 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7940 7941 // for (i = 0; i < len; i++) { 7942 // int j; 7943 7944 // Pa = Pa_base; 7945 // Pb = Pa_base + i; 7946 // Pm = Pm_base; 7947 // Pn = Pn_base + i; 7948 7949 // Ra = *Pa; 7950 // Rb = *Pb; 7951 // Rm = *Pm; 7952 // Rn = *Pn; 7953 7954 // int iters = (i+1)/2; 7955 // for (j = 0; iters--; j++) { 7956 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 7957 // MACC2(Ra, Rb, t0, t1, t2); 7958 // Ra = *++Pa; 7959 // Rb = *--Pb; 7960 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7961 // MACC(Rm, Rn, t0, t1, t2); 7962 // Rm = *++Pm; 7963 // Rn = *--Pn; 7964 // } 7965 // if ((i & 1) == 0) { 7966 // assert(Ra == Pa_base[j], "must be"); 7967 // MACC(Ra, Ra, t0, t1, t2); 7968 // } 7969 // iters = i/2; 7970 // assert(iters == i-j, "must be"); 7971 // for (; iters--; j++) { 7972 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7973 // MACC(Rm, Rn, t0, t1, t2); 7974 // Rm = *++Pm; 7975 // Rn = *--Pn; 7976 // } 7977 7978 // *Pm = Rm = t0 * inv; 7979 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7980 // MACC(Rm, Rn, t0, t1, t2); 7981 7982 // 
assert(t0 == 0, "broken Montgomery multiply"); 7983 7984 // t0 = t1; t1 = t2; t2 = 0; 7985 // } 7986 7987 // for (i = len; i < 2*len; i++) { 7988 // int start = i-len+1; 7989 // int end = start + (len - start)/2; 7990 // int j; 7991 7992 // Pa = Pa_base + i-len; 7993 // Pb = Pa_base + len; 7994 // Pm = Pm_base + i-len; 7995 // Pn = Pn_base + len; 7996 7997 // Ra = *++Pa; 7998 // Rb = *--Pb; 7999 // Rm = *++Pm; 8000 // Rn = *--Pn; 8001 8002 // int iters = (2*len-i-1)/2; 8003 // assert(iters == end-start, "must be"); 8004 // for (j = start; iters--; j++) { 8005 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8006 // MACC2(Ra, Rb, t0, t1, t2); 8007 // Ra = *++Pa; 8008 // Rb = *--Pb; 8009 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8010 // MACC(Rm, Rn, t0, t1, t2); 8011 // Rm = *++Pm; 8012 // Rn = *--Pn; 8013 // } 8014 // if ((i & 1) == 0) { 8015 // assert(Ra == Pa_base[j], "must be"); 8016 // MACC(Ra, Ra, t0, t1, t2); 8017 // } 8018 // iters = (2*len-i)/2; 8019 // assert(iters == len-j, "must be"); 8020 // for (; iters--; j++) { 8021 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8022 // MACC(Rm, Rn, t0, t1, t2); 8023 // Rm = *++Pm; 8024 // Rn = *--Pn; 8025 // } 8026 // Pm_base[i-len] = t0; 8027 // t0 = t1; t1 = t2; t2 = 0; 8028 // } 8029 8030 // while (t0) 8031 // t0 = sub(Pm_base, Pn_base, t0, len); 8032 // } 8033 }; 8034 8035 8036 // Initialization 8037 void generate_initial_stubs() { 8038 // Generate initial stubs and initializes the entry points 8039 8040 // entry points that exist in all platforms Note: This is code 8041 // that could be shared among different platforms - however the 8042 // benefit seems to be smaller than the disadvantage of having a 8043 // much more complicated generator structure. See also comment in 8044 // stubRoutines.hpp. 8045 8046 StubRoutines::_forward_exception_entry = generate_forward_exception(); 8047 8048 StubRoutines::_call_stub_entry = 8049 generate_call_stub(StubRoutines::_call_stub_return_address); 8050 8051 // is referenced by megamorphic call 8052 StubRoutines::_catch_exception_entry = generate_catch_exception(); 8053 8054 // Build this early so it's available for the interpreter. 8055 StubRoutines::_throw_StackOverflowError_entry = 8056 generate_throw_exception("StackOverflowError throw_exception", 8057 CAST_FROM_FN_PTR(address, 8058 SharedRuntime::throw_StackOverflowError)); 8059 StubRoutines::_throw_delayed_StackOverflowError_entry = 8060 generate_throw_exception("delayed StackOverflowError throw_exception", 8061 CAST_FROM_FN_PTR(address, 8062 SharedRuntime::throw_delayed_StackOverflowError)); 8063 8064 // Initialize table for copy memory (arraycopy) check. 
8065 if (UnsafeCopyMemory::_table == nullptr) { 8066 UnsafeCopyMemory::create_table(8); 8067 } 8068 8069 if (UseCRC32Intrinsics) { 8070 // set table address before stub generation which use it 8071 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 8072 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 8073 } 8074 8075 if (UseCRC32CIntrinsics) { 8076 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 8077 } 8078 8079 // Disabled until JDK-8210858 is fixed 8080 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 8081 // StubRoutines::_dlog = generate_dlog(); 8082 // } 8083 8084 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 8085 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 8086 } 8087 8088 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 8089 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 8090 } 8091 } 8092 8093 void generate_continuation_stubs() { 8094 // Continuation stubs: 8095 StubRoutines::_cont_thaw = generate_cont_thaw(); 8096 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 8097 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 8098 8099 JFR_ONLY(StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();) 8100 JFR_ONLY(StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();) 8101 } 8102 8103 void generate_final_stubs() { 8104 // support for verify_oop (must happen after universe_init) 8105 if (VerifyOops) { 8106 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 8107 } 8108 StubRoutines::_throw_AbstractMethodError_entry = 8109 generate_throw_exception("AbstractMethodError throw_exception", 8110 CAST_FROM_FN_PTR(address, 8111 SharedRuntime:: 8112 throw_AbstractMethodError)); 8113 8114 StubRoutines::_throw_IncompatibleClassChangeError_entry = 8115 generate_throw_exception("IncompatibleClassChangeError throw_exception", 8116 CAST_FROM_FN_PTR(address, 8117 SharedRuntime:: 8118 throw_IncompatibleClassChangeError)); 8119 8120 StubRoutines::_throw_NullPointerException_at_call_entry = 8121 generate_throw_exception("NullPointerException at call throw_exception", 8122 CAST_FROM_FN_PTR(address, 8123 SharedRuntime:: 8124 throw_NullPointerException_at_call)); 8125 8126 // arraycopy stubs used by compilers 8127 generate_arraycopy_stubs(); 8128 8129 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8130 if (bs_nm != nullptr) { 8131 StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier(); 8132 } 8133 8134 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 8135 8136 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 8137 8138 generate_atomic_entry_points(); 8139 8140 #endif // LINUX 8141 8142 StubRoutines::aarch64::set_completed(); // Inidicate that arraycopy and zero_blocks stubs are generated 8143 } 8144 8145 void generate_compiler_stubs() { 8146 #if COMPILER2_OR_JVMCI 8147 8148 if (UseSVE == 0) { 8149 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices"); 8150 } 8151 8152 // array equals stub for large arrays. 8153 if (!UseSimpleArrayEquals) { 8154 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 8155 } 8156 8157 // byte_array_inflate stub for large arrays. 8158 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 8159 8160 // countPositives stub for large arrays. 
8161 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long); 8162 8163 generate_compare_long_strings(); 8164 8165 generate_string_indexof_stubs(); 8166 8167 #ifdef COMPILER2 8168 if (UseMultiplyToLenIntrinsic) { 8169 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 8170 } 8171 8172 if (UseSquareToLenIntrinsic) { 8173 StubRoutines::_squareToLen = generate_squareToLen(); 8174 } 8175 8176 if (UseMulAddIntrinsic) { 8177 StubRoutines::_mulAdd = generate_mulAdd(); 8178 } 8179 8180 if (UseSIMDForBigIntegerShiftIntrinsics) { 8181 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 8182 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 8183 } 8184 8185 if (UseMontgomeryMultiplyIntrinsic) { 8186 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 8187 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 8188 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 8189 } 8190 8191 if (UseMontgomerySquareIntrinsic) { 8192 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 8193 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 8194 // We use generate_multiply() rather than generate_square() 8195 // because it's faster for the sizes of modulus we care about. 8196 StubRoutines::_montgomerySquare = g.generate_multiply(); 8197 } 8198 #endif // COMPILER2 8199 8200 if (UseChaCha20Intrinsics) { 8201 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar(); 8202 } 8203 8204 if (UseBASE64Intrinsics) { 8205 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 8206 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 8207 } 8208 8209 // data cache line writeback 8210 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 8211 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 8212 8213 if (UseAESIntrinsics) { 8214 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 8215 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 8216 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 8217 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 8218 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 8219 } 8220 if (UseGHASHIntrinsics) { 8221 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 8222 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 8223 } 8224 if (UseAESIntrinsics && UseGHASHIntrinsics) { 8225 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 8226 } 8227 8228 if (UseMD5Intrinsics) { 8229 StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress"); 8230 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB"); 8231 } 8232 if (UseSHA1Intrinsics) { 8233 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 8234 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 8235 } 8236 if (UseSHA256Intrinsics) { 8237 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 8238 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 8239 } 8240 if (UseSHA512Intrinsics) { 8241 StubRoutines::_sha512_implCompress = 
generate_sha512_implCompress(false, "sha512_implCompress"); 8242 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); 8243 } 8244 if (UseSHA3Intrinsics) { 8245 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress"); 8246 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB"); 8247 } 8248 8249 // generate Adler32 intrinsics code 8250 if (UseAdler32Intrinsics) { 8251 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 8252 } 8253 #endif // COMPILER2_OR_JVMCI 8254 } 8255 8256 public: 8257 StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) { 8258 switch(kind) { 8259 case Initial_stubs: 8260 generate_initial_stubs(); 8261 break; 8262 case Continuation_stubs: 8263 generate_continuation_stubs(); 8264 break; 8265 case Compiler_stubs: 8266 generate_compiler_stubs(); 8267 break; 8268 case Final_stubs: 8269 generate_final_stubs(); 8270 break; 8271 default: 8272 fatal("unexpected stubs kind: %d", kind); 8273 break; 8274 }; 8275 } 8276 }; // end class declaration 8277 8278 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) { 8279 StubGenerator g(code, kind); 8280 } 8281 8282 8283 #if defined (LINUX) 8284 8285 // Define pointers to atomic stubs and initialize them to point to the 8286 // code in atomic_aarch64.S. 8287 8288 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 8289 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 8290 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 8291 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 8292 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 8293 8294 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 8295 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 8296 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 8297 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 8298 DEFAULT_ATOMIC_OP(xchg, 4, ) 8299 DEFAULT_ATOMIC_OP(xchg, 8, ) 8300 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 8301 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 8302 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 8303 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 8304 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 8305 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 8306 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 8307 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 8308 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 8309 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 8310 8311 #undef DEFAULT_ATOMIC_OP 8312 8313 #endif // LINUX
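// For reference, each DEFAULT_ATOMIC_OP line above expands to roughly
// (shown here for DEFAULT_ATOMIC_OP(fetch_add, 4, )):
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// i.e. every stub pointer starts out pointing at the generic assembly
// implementation in atomic_aarch64.S and is only switched to an LSE stub
// by generate_atomic_entry_points() when UseLSE is set.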