1 /* 2 * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "asm/register.hpp" 30 #include "atomic_aarch64.hpp" 31 #include "compiler/oopMap.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/barrierSetAssembler.hpp" 34 #include "gc/shared/gc_globals.hpp" 35 #include "gc/shared/tlab_globals.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "memory/universe.hpp" 38 #include "nativeInst_aarch64.hpp" 39 #include "oops/instanceOop.hpp" 40 #include "oops/method.hpp" 41 #include "oops/objArrayKlass.hpp" 42 #include "oops/oop.inline.hpp" 43 #include "prims/methodHandles.hpp" 44 #include "runtime/atomic.hpp" 45 #include "runtime/frame.inline.hpp" 46 #include "runtime/handles.inline.hpp" 47 #include "runtime/sharedRuntime.hpp" 48 #include "runtime/stubCodeGenerator.hpp" 49 #include "runtime/stubRoutines.hpp" 50 #include "runtime/thread.inline.hpp" 51 #include "utilities/align.hpp" 52 #include "utilities/powerOfTwo.hpp" 53 #ifdef COMPILER2 54 #include "opto/runtime.hpp" 55 #endif 56 #if INCLUDE_ZGC 57 #include "gc/z/zThreadLocalData.hpp" 58 #endif 59 60 // Declaration and definition of StubGenerator (no .hpp file). 61 // For a more detailed description of the stub routine structure 62 // see the comment in stubRoutines.hpp 63 64 #undef __ 65 #define __ _masm-> 66 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 
4 : 8)) 67 68 #ifdef PRODUCT 69 #define BLOCK_COMMENT(str) /* nothing */ 70 #else 71 #define BLOCK_COMMENT(str) __ block_comment(str) 72 #endif 73 74 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 75 76 // Stub Code definitions 77 78 class StubGenerator: public StubCodeGenerator { 79 private: 80 81 #ifdef PRODUCT 82 #define inc_counter_np(counter) ((void)0) 83 #else 84 void inc_counter_np_(int& counter) { 85 __ lea(rscratch2, ExternalAddress((address)&counter)); 86 __ ldrw(rscratch1, Address(rscratch2)); 87 __ addw(rscratch1, rscratch1, 1); 88 __ strw(rscratch1, Address(rscratch2)); 89 } 90 #define inc_counter_np(counter) \ 91 BLOCK_COMMENT("inc_counter " #counter); \ 92 inc_counter_np_(counter); 93 #endif 94 95 // Call stubs are used to call Java from C 96 // 97 // Arguments: 98 // c_rarg0: call wrapper address address 99 // c_rarg1: result address 100 // c_rarg2: result type BasicType 101 // c_rarg3: method Method* 102 // c_rarg4: (interpreter) entry point address 103 // c_rarg5: parameters intptr_t* 104 // c_rarg6: parameter size (in words) int 105 // c_rarg7: thread Thread* 106 // 107 // There is no return from the stub itself as any Java result 108 // is written to result 109 // 110 // we save r30 (lr) as the return PC at the base of the frame and 111 // link r29 (fp) below it as the frame pointer installing sp (r31) 112 // into fp. 113 // 114 // we save r0-r7, which accounts for all the c arguments. 115 // 116 // TODO: strictly do we need to save them all? they are treated as 117 // volatile by C so could we omit saving the ones we are going to 118 // place in global registers (thread? method?) or those we only use 119 // during setup of the Java call? 120 // 121 // we don't need to save r8 which C uses as an indirect result location 122 // return register. 123 // 124 // we don't need to save r9-r15 which both C and Java treat as 125 // volatile 126 // 127 // we don't need to save r16-18 because Java does not use them 128 // 129 // we save r19-r28 which Java uses as scratch registers and C 130 // expects to be callee-save 131 // 132 // we save the bottom 64 bits of each value stored in v8-v15; it is 133 // the responsibility of the caller to preserve larger values. 134 // 135 // so the stub frame looks like this when we enter Java code 136 // 137 // [ return_from_Java ] <--- sp 138 // [ argument word n ] 139 // ... 
140 // -27 [ argument word 1 ] 141 // -26 [ saved v15 ] <--- sp_after_call 142 // -25 [ saved v14 ] 143 // -24 [ saved v13 ] 144 // -23 [ saved v12 ] 145 // -22 [ saved v11 ] 146 // -21 [ saved v10 ] 147 // -20 [ saved v9 ] 148 // -19 [ saved v8 ] 149 // -18 [ saved r28 ] 150 // -17 [ saved r27 ] 151 // -16 [ saved r26 ] 152 // -15 [ saved r25 ] 153 // -14 [ saved r24 ] 154 // -13 [ saved r23 ] 155 // -12 [ saved r22 ] 156 // -11 [ saved r21 ] 157 // -10 [ saved r20 ] 158 // -9 [ saved r19 ] 159 // -8 [ call wrapper (r0) ] 160 // -7 [ result (r1) ] 161 // -6 [ result type (r2) ] 162 // -5 [ method (r3) ] 163 // -4 [ entry point (r4) ] 164 // -3 [ parameters (r5) ] 165 // -2 [ parameter size (r6) ] 166 // -1 [ thread (r7) ] 167 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 168 // 1 [ saved lr (r30) ] 169 170 // Call stub stack layout word offsets from fp 171 enum call_stub_layout { 172 sp_after_call_off = -26, 173 174 d15_off = -26, 175 d13_off = -24, 176 d11_off = -22, 177 d9_off = -20, 178 179 r28_off = -18, 180 r26_off = -16, 181 r24_off = -14, 182 r22_off = -12, 183 r20_off = -10, 184 call_wrapper_off = -8, 185 result_off = -7, 186 result_type_off = -6, 187 method_off = -5, 188 entry_point_off = -4, 189 parameter_size_off = -2, 190 thread_off = -1, 191 fp_f = 0, 192 retaddr_off = 1, 193 }; 194 195 address generate_call_stub(address& return_address) { 196 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 197 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 198 "adjust this code"); 199 200 StubCodeMark mark(this, "StubRoutines", "call_stub"); 201 address start = __ pc(); 202 203 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 204 205 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 206 const Address result (rfp, result_off * wordSize); 207 const Address result_type (rfp, result_type_off * wordSize); 208 const Address method (rfp, method_off * wordSize); 209 const Address entry_point (rfp, entry_point_off * wordSize); 210 const Address parameter_size(rfp, parameter_size_off * wordSize); 211 212 const Address thread (rfp, thread_off * wordSize); 213 214 const Address d15_save (rfp, d15_off * wordSize); 215 const Address d13_save (rfp, d13_off * wordSize); 216 const Address d11_save (rfp, d11_off * wordSize); 217 const Address d9_save (rfp, d9_off * wordSize); 218 219 const Address r28_save (rfp, r28_off * wordSize); 220 const Address r26_save (rfp, r26_off * wordSize); 221 const Address r24_save (rfp, r24_off * wordSize); 222 const Address r22_save (rfp, r22_off * wordSize); 223 const Address r20_save (rfp, r20_off * wordSize); 224 225 // stub code 226 227 address aarch64_entry = __ pc(); 228 229 // set up frame and move sp to end of save area 230 __ enter(); 231 __ sub(sp, rfp, -sp_after_call_off * wordSize); 232 233 // save register parameters and Java scratch/global registers 234 // n.b. 
we save thread even though it gets installed in 235 // rthread because we want to sanity check rthread later 236 __ str(c_rarg7, thread); 237 __ strw(c_rarg6, parameter_size); 238 __ stp(c_rarg4, c_rarg5, entry_point); 239 __ stp(c_rarg2, c_rarg3, result_type); 240 __ stp(c_rarg0, c_rarg1, call_wrapper); 241 242 __ stp(r20, r19, r20_save); 243 __ stp(r22, r21, r22_save); 244 __ stp(r24, r23, r24_save); 245 __ stp(r26, r25, r26_save); 246 __ stp(r28, r27, r28_save); 247 248 __ stpd(v9, v8, d9_save); 249 __ stpd(v11, v10, d11_save); 250 __ stpd(v13, v12, d13_save); 251 __ stpd(v15, v14, d15_save); 252 253 // install Java thread in global register now we have saved 254 // whatever value it held 255 __ mov(rthread, c_rarg7); 256 // And method 257 __ mov(rmethod, c_rarg3); 258 259 // set up the heapbase register 260 __ reinit_heapbase(); 261 262 #ifdef ASSERT 263 // make sure we have no pending exceptions 264 { 265 Label L; 266 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 267 __ cmp(rscratch1, (u1)NULL_WORD); 268 __ br(Assembler::EQ, L); 269 __ stop("StubRoutines::call_stub: entered with pending exception"); 270 __ BIND(L); 271 } 272 #endif 273 // pass parameters if any 274 __ mov(esp, sp); 275 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 276 __ andr(sp, rscratch1, -2 * wordSize); 277 278 BLOCK_COMMENT("pass parameters if any"); 279 Label parameters_done; 280 // parameter count is still in c_rarg6 281 // and parameter pointer identifying param 1 is in c_rarg5 282 __ cbzw(c_rarg6, parameters_done); 283 284 address loop = __ pc(); 285 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 286 __ subsw(c_rarg6, c_rarg6, 1); 287 __ push(rscratch1); 288 __ br(Assembler::GT, loop); 289 290 __ BIND(parameters_done); 291 292 // call Java entry -- passing methdoOop, and current sp 293 // rmethod: Method* 294 // r13: sender sp 295 BLOCK_COMMENT("call Java function"); 296 __ mov(r13, sp); 297 __ blr(c_rarg4); 298 299 // we do this here because the notify will already have been done 300 // if we get to the next instruction via an exception 301 // 302 // n.b. adding this instruction here affects the calculation of 303 // whether or not a routine returns to the call stub (used when 304 // doing stack walks) since the normal test is to check the return 305 // pc against the address saved below. so we may need to allow for 306 // this extra instruction in the check. 307 308 // save current address for use by exception handling code 309 310 return_address = __ pc(); 311 312 // store result depending on type (everything that is not 313 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 314 // n.b. 
this assumes Java returns an integral result in r0 315 // and a floating result in j_farg0 316 __ ldr(j_rarg2, result); 317 Label is_long, is_float, is_double, exit; 318 __ ldr(j_rarg1, result_type); 319 __ cmp(j_rarg1, (u1)T_OBJECT); 320 __ br(Assembler::EQ, is_long); 321 __ cmp(j_rarg1, (u1)T_LONG); 322 __ br(Assembler::EQ, is_long); 323 __ cmp(j_rarg1, (u1)T_FLOAT); 324 __ br(Assembler::EQ, is_float); 325 __ cmp(j_rarg1, (u1)T_DOUBLE); 326 __ br(Assembler::EQ, is_double); 327 328 // handle T_INT case 329 __ strw(r0, Address(j_rarg2)); 330 331 __ BIND(exit); 332 333 // pop parameters 334 __ sub(esp, rfp, -sp_after_call_off * wordSize); 335 336 #ifdef ASSERT 337 // verify that threads correspond 338 { 339 Label L, S; 340 __ ldr(rscratch1, thread); 341 __ cmp(rthread, rscratch1); 342 __ br(Assembler::NE, S); 343 __ get_thread(rscratch1); 344 __ cmp(rthread, rscratch1); 345 __ br(Assembler::EQ, L); 346 __ BIND(S); 347 __ stop("StubRoutines::call_stub: threads must correspond"); 348 __ BIND(L); 349 } 350 #endif 351 352 // restore callee-save registers 353 __ ldpd(v15, v14, d15_save); 354 __ ldpd(v13, v12, d13_save); 355 __ ldpd(v11, v10, d11_save); 356 __ ldpd(v9, v8, d9_save); 357 358 __ ldp(r28, r27, r28_save); 359 __ ldp(r26, r25, r26_save); 360 __ ldp(r24, r23, r24_save); 361 __ ldp(r22, r21, r22_save); 362 __ ldp(r20, r19, r20_save); 363 364 __ ldp(c_rarg0, c_rarg1, call_wrapper); 365 __ ldrw(c_rarg2, result_type); 366 __ ldr(c_rarg3, method); 367 __ ldp(c_rarg4, c_rarg5, entry_point); 368 __ ldp(c_rarg6, c_rarg7, parameter_size); 369 370 // leave frame and return to caller 371 __ leave(); 372 __ ret(lr); 373 374 // handle return types different from T_INT 375 376 __ BIND(is_long); 377 __ str(r0, Address(j_rarg2, 0)); 378 __ br(Assembler::AL, exit); 379 380 __ BIND(is_float); 381 __ strs(j_farg0, Address(j_rarg2, 0)); 382 __ br(Assembler::AL, exit); 383 384 __ BIND(is_double); 385 __ strd(j_farg0, Address(j_rarg2, 0)); 386 __ br(Assembler::AL, exit); 387 388 return start; 389 } 390 391 // Return point for a Java call if there's an exception thrown in 392 // Java code. The exception is caught and transformed into a 393 // pending exception stored in JavaThread that can be tested from 394 // within the VM. 395 // 396 // Note: Usually the parameters are removed by the callee. In case 397 // of an exception crossing an activation frame boundary, that is 398 // not the case if the callee is compiled code => need to setup the 399 // rsp. 
400 // 401 // r0: exception oop 402 403 address generate_catch_exception() { 404 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 405 address start = __ pc(); 406 407 // same as in generate_call_stub(): 408 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 409 const Address thread (rfp, thread_off * wordSize); 410 411 #ifdef ASSERT 412 // verify that threads correspond 413 { 414 Label L, S; 415 __ ldr(rscratch1, thread); 416 __ cmp(rthread, rscratch1); 417 __ br(Assembler::NE, S); 418 __ get_thread(rscratch1); 419 __ cmp(rthread, rscratch1); 420 __ br(Assembler::EQ, L); 421 __ bind(S); 422 __ stop("StubRoutines::catch_exception: threads must correspond"); 423 __ bind(L); 424 } 425 #endif 426 427 // set pending exception 428 __ verify_oop(r0); 429 430 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 431 __ mov(rscratch1, (address)__FILE__); 432 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 433 __ movw(rscratch1, (int)__LINE__); 434 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 435 436 // complete return to VM 437 assert(StubRoutines::_call_stub_return_address != NULL, 438 "_call_stub_return_address must have been generated before"); 439 __ b(StubRoutines::_call_stub_return_address); 440 441 return start; 442 } 443 444 // Continuation point for runtime calls returning with a pending 445 // exception. The pending exception check happened in the runtime 446 // or native call stub. The pending exception in Thread is 447 // converted into a Java-level exception. 448 // 449 // Contract with Java-level exception handlers: 450 // r0: exception 451 // r3: throwing pc 452 // 453 // NOTE: At entry of this stub, exception-pc must be in LR !! 454 455 // NOTE: this is always used as a jump target within generated code 456 // so it just needs to be generated code with no x86 prolog 457 458 address generate_forward_exception() { 459 StubCodeMark mark(this, "StubRoutines", "forward exception"); 460 address start = __ pc(); 461 462 // Upon entry, LR points to the return address returning into 463 // Java (interpreted or compiled) code; i.e., the return address 464 // becomes the throwing pc. 465 // 466 // Arguments pushed before the runtime call are still on the stack 467 // but the exception handler will reset the stack pointer -> 468 // ignore them. A potential result in registers can be ignored as 469 // well. 470 471 #ifdef ASSERT 472 // make sure this code is only executed if there is a pending exception 473 { 474 Label L; 475 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 476 __ cbnz(rscratch1, L); 477 __ stop("StubRoutines::forward exception: no pending exception (1)"); 478 __ bind(L); 479 } 480 #endif 481 482 // compute exception handler into r19 483 484 // call the VM to find the handler address associated with the 485 // caller address. pass thread in r0 and caller pc (ret address) 486 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 487 // the stack. 488 __ mov(c_rarg1, lr); 489 // lr will be trashed by the VM call so we move it to R19 490 // (callee-saved) because we also need to pass it to the handler 491 // returned by this call. 
492 __ mov(r19, lr); 493 BLOCK_COMMENT("call exception_handler_for_return_address"); 494 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 495 SharedRuntime::exception_handler_for_return_address), 496 rthread, c_rarg1); 497 // Reinitialize the ptrue predicate register, in case the external runtime 498 // call clobbers ptrue reg, as we may return to SVE compiled code. 499 __ reinitialize_ptrue(); 500 501 // we should not really care that lr is no longer the callee 502 // address. we saved the value the handler needs in r19 so we can 503 // just copy it to r3. however, the C2 handler will push its own 504 // frame and then calls into the VM and the VM code asserts that 505 // the PC for the frame above the handler belongs to a compiled 506 // Java method. So, we restore lr here to satisfy that assert. 507 __ mov(lr, r19); 508 // setup r0 & r3 & clear pending exception 509 __ mov(r3, r19); 510 __ mov(r19, r0); 511 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 512 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 513 514 #ifdef ASSERT 515 // make sure exception is set 516 { 517 Label L; 518 __ cbnz(r0, L); 519 __ stop("StubRoutines::forward exception: no pending exception (2)"); 520 __ bind(L); 521 } 522 #endif 523 524 // continue at exception handler 525 // r0: exception 526 // r3: throwing pc 527 // r19: exception handler 528 __ verify_oop(r0); 529 __ br(r19); 530 531 return start; 532 } 533 534 // Non-destructive plausibility checks for oops 535 // 536 // Arguments: 537 // r0: oop to verify 538 // rscratch1: error message 539 // 540 // Stack after saving c_rarg3: 541 // [tos + 0]: saved c_rarg3 542 // [tos + 1]: saved c_rarg2 543 // [tos + 2]: saved lr 544 // [tos + 3]: saved rscratch2 545 // [tos + 4]: saved r0 546 // [tos + 5]: saved rscratch1 547 address generate_verify_oop() { 548 549 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 550 address start = __ pc(); 551 552 Label exit, error; 553 554 // save c_rarg2 and c_rarg3 555 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 556 557 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 558 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 559 __ ldr(c_rarg3, Address(c_rarg2)); 560 __ add(c_rarg3, c_rarg3, 1); 561 __ str(c_rarg3, Address(c_rarg2)); 562 563 // object is in r0 564 // make sure object is 'reasonable' 565 __ cbz(r0, exit); // if obj is NULL it is OK 566 567 #if INCLUDE_ZGC 568 if (UseZGC) { 569 // Check if mask is good. 570 // verifies that ZAddressBadMask & r0 == 0 571 __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset())); 572 __ andr(c_rarg2, r0, c_rarg3); 573 __ cbnz(c_rarg2, error); 574 } 575 #endif 576 577 // Check if the oop is in the right area of memory 578 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask()); 579 __ andr(c_rarg2, r0, c_rarg3); 580 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits()); 581 582 // Compare c_rarg2 and c_rarg3. We don't use a compare 583 // instruction here because the flags register is live. 584 __ eor(c_rarg2, c_rarg2, c_rarg3); 585 __ cbnz(c_rarg2, error); 586 587 // make sure klass is 'reasonable', which is not zero. 
588 __ load_klass(r0, r0); // get klass 589 __ cbz(r0, error); // if klass is NULL it is broken 590 591 // return if everything seems ok 592 __ bind(exit); 593 594 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 595 __ ret(lr); 596 597 // handle errors 598 __ bind(error); 599 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 600 601 __ push(RegSet::range(r0, r29), sp); 602 // debug(char* msg, int64_t pc, int64_t regs[]) 603 __ mov(c_rarg0, rscratch1); // pass address of error message 604 __ mov(c_rarg1, lr); // pass return address 605 __ mov(c_rarg2, sp); // pass address of regs on stack 606 #ifndef PRODUCT 607 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 608 #endif 609 BLOCK_COMMENT("call MacroAssembler::debug"); 610 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 611 __ blr(rscratch1); 612 __ hlt(0); 613 614 return start; 615 } 616 617 void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); } 618 619 // Generate indices for iota vector. 620 address generate_iota_indices(const char *stub_name) { 621 __ align(CodeEntryAlignment); 622 StubCodeMark mark(this, "StubRoutines", stub_name); 623 address start = __ pc(); 624 __ emit_data64(0x0706050403020100, relocInfo::none); 625 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 626 return start; 627 } 628 629 // The inner part of zero_words(). This is the bulk operation, 630 // zeroing words in blocks, possibly using DC ZVA to do it. The 631 // caller is responsible for zeroing the last few words. 632 // 633 // Inputs: 634 // r10: the HeapWord-aligned base address of an array to zero. 635 // r11: the count in HeapWords, r11 > 0. 636 // 637 // Returns r10 and r11, adjusted for the caller to clear. 638 // r10: the base address of the tail of words left to clear. 639 // r11: the number of words in the tail. 640 // r11 < MacroAssembler::zero_words_block_size. 641 642 address generate_zero_blocks() { 643 Label done; 644 Label base_aligned; 645 646 Register base = r10, cnt = r11; 647 648 __ align(CodeEntryAlignment); 649 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 650 address start = __ pc(); 651 652 if (UseBlockZeroing) { 653 int zva_length = VM_Version::zva_length(); 654 655 // Ensure ZVA length can be divided by 16. This is required by 656 // the subsequent operations. 657 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 658 659 __ tbz(base, 3, base_aligned); 660 __ str(zr, Address(__ post(base, 8))); 661 __ sub(cnt, cnt, 1); 662 __ bind(base_aligned); 663 664 // Ensure count >= zva_length * 2 so that it still deserves a zva after 665 // alignment. 666 Label small; 667 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 668 __ subs(rscratch1, cnt, low_limit >> 3); 669 __ br(Assembler::LT, small); 670 __ zero_dcache_blocks(base, cnt); 671 __ bind(small); 672 } 673 674 { 675 // Number of stp instructions we'll unroll 676 const int unroll = 677 MacroAssembler::zero_words_block_size / 2; 678 // Clear the remaining blocks. 679 Label loop; 680 __ subs(cnt, cnt, unroll * 2); 681 __ br(Assembler::LT, done); 682 __ bind(loop); 683 for (int i = 0; i < unroll; i++) 684 __ stp(zr, zr, __ post(base, 16)); 685 __ subs(cnt, cnt, unroll * 2); 686 __ br(Assembler::GE, loop); 687 __ bind(done); 688 __ add(cnt, cnt, unroll * 2); 689 } 690 691 __ ret(lr); 692 693 return start; 694 } 695 696 697 typedef enum { 698 copy_forwards = 1, 699 copy_backwards = -1 700 } copy_direction; 701 702 // Bulk copy of blocks of 8 words. 
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ?
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 787 788 if (UseSIMDForMemoryOps) { 789 __ stpq(v0, v1, Address(d, 4 * unit)); 790 __ ldpq(v0, v1, Address(s, 4 * unit)); 791 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 792 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 793 } else { 794 __ stp(t0, t1, Address(d, 2 * unit)); 795 __ ldp(t0, t1, Address(s, 2 * unit)); 796 __ stp(t2, t3, Address(d, 4 * unit)); 797 __ ldp(t2, t3, Address(s, 4 * unit)); 798 __ stp(t4, t5, Address(d, 6 * unit)); 799 __ ldp(t4, t5, Address(s, 6 * unit)); 800 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 801 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 802 } 803 804 __ subs(count, count, 8); 805 __ br(Assembler::HS, again); 806 807 // Drain 808 __ bind(drain); 809 if (UseSIMDForMemoryOps) { 810 __ stpq(v0, v1, Address(d, 4 * unit)); 811 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 812 } else { 813 __ stp(t0, t1, Address(d, 2 * unit)); 814 __ stp(t2, t3, Address(d, 4 * unit)); 815 __ stp(t4, t5, Address(d, 6 * unit)); 816 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 817 } 818 819 { 820 Label L1, L2; 821 __ tbz(count, exact_log2(4), L1); 822 if (UseSIMDForMemoryOps) { 823 __ ldpq(v0, v1, Address(__ pre(s, 4 * unit))); 824 __ stpq(v0, v1, Address(__ pre(d, 4 * unit))); 825 } else { 826 __ ldp(t0, t1, Address(s, 2 * unit)); 827 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 828 __ stp(t0, t1, Address(d, 2 * unit)); 829 __ stp(t2, t3, Address(__ pre(d, 4 * unit))); 830 } 831 __ bind(L1); 832 833 if (direction == copy_forwards) { 834 __ add(s, s, bias); 835 __ add(d, d, bias); 836 } 837 838 __ tbz(count, 1, L2); 839 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 840 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); 841 __ bind(L2); 842 } 843 844 __ ret(lr); 845 846 if (AvoidUnalignedAccesses) { 847 Label drain, again; 848 // Register order for storing. Order is different for backward copy. 849 850 __ bind(unaligned_copy_long); 851 852 // source address is even aligned, target odd aligned 853 // 854 // when forward copying word pairs we read long pairs at offsets 855 // {0, 2, 4, 6} (in long words). when backwards copying we read 856 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 857 // address by -2 in the forwards case so we can compute the 858 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 859 // or -1. 860 // 861 // when forward copying we need to store 1 word, 3 pairs and 862 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 863 // zero offset We adjust the destination by -1 which means we 864 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 865 // 866 // When backwards copyng we need to store 1 word, 3 pairs and 867 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 868 // offsets {1, 3, 5, 7, 8} * unit. 
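      //
      // Worked example (illustration only, not part of the emitted stub):
      // for a forwards copy unit == 1 word == 8 bytes, and after the
      // d -= 8 bias applied below the str at (d, 1 * unit) writes the
      // original d + 0, the stps at 2, 4 and 6 * unit write word pairs
      // starting at d + 8, d + 24 and d + 40, and the final str at
      // pre(d, 8 * unit) writes d + 56, which is exactly the word offset
      // pattern {0, 1, 3, 5, 7} quoted above.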

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
974 __ str(t0, Address(d, 1 * unit)); 975 __ stp(t1, t2, Address(d, 2 * unit)); 976 __ stp(t3, t4, Address(d, 4 * unit)); 977 __ stp(t5, t6, Address(d, 6 * unit)); 978 __ str(t7, Address(__ pre(d, 8 * unit))); 979 } else { 980 __ str(t1, Address(d, 1 * unit)); 981 __ stp(t3, t0, Address(d, 3 * unit)); 982 __ stp(t5, t2, Address(d, 5 * unit)); 983 __ stp(t7, t4, Address(d, 7 * unit)); 984 __ str(t6, Address(__ pre(d, 8 * unit))); 985 } 986 // now we need to copy any remaining part block which may 987 // include a 4 word block subblock and/or a 2 word subblock. 988 // bits 2 and 1 in the count are the tell-tale for whether we 989 // have each such subblock 990 { 991 Label L1, L2; 992 __ tbz(count, exact_log2(4), L1); 993 // this is the same as above but copying only 4 longs hence 994 // with only one intervening stp between the str instructions 995 // but note that the offsets and registers still follow the 996 // same pattern 997 __ ldp(t0, t1, Address(s, 2 * unit)); 998 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 999 if (direction == copy_forwards) { 1000 __ str(t0, Address(d, 1 * unit)); 1001 __ stp(t1, t2, Address(d, 2 * unit)); 1002 __ str(t3, Address(__ pre(d, 4 * unit))); 1003 } else { 1004 __ str(t1, Address(d, 1 * unit)); 1005 __ stp(t3, t0, Address(d, 3 * unit)); 1006 __ str(t2, Address(__ pre(d, 4 * unit))); 1007 } 1008 __ bind(L1); 1009 1010 __ tbz(count, 1, L2); 1011 // this is the same as above but copying only 2 longs hence 1012 // there is no intervening stp between the str instructions 1013 // but note that the offset and register patterns are still 1014 // the same 1015 __ ldp(t0, t1, Address(__ pre(s, 2 * unit))); 1016 if (direction == copy_forwards) { 1017 __ str(t0, Address(d, 1 * unit)); 1018 __ str(t1, Address(__ pre(d, 2 * unit))); 1019 } else { 1020 __ str(t1, Address(d, 1 * unit)); 1021 __ str(t0, Address(__ pre(d, 2 * unit))); 1022 } 1023 __ bind(L2); 1024 1025 // for forwards copy we need to re-adjust the offsets we 1026 // applied so that s and d are follow the last words written 1027 1028 if (direction == copy_forwards) { 1029 __ add(s, s, 16); 1030 __ add(d, d, 8); 1031 } 1032 1033 } 1034 1035 __ ret(lr); 1036 } 1037 } 1038 1039 // Small copy: less than 16 bytes. 1040 // 1041 // NB: Ignores all of the bits of count which represent more than 15 1042 // bytes, so a caller doesn't have to mask them. 1043 1044 void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) { 1045 bool is_backwards = step < 0; 1046 size_t granularity = uabs(step); 1047 int direction = is_backwards ? -1 : 1; 1048 int unit = wordSize * direction; 1049 1050 Label Lword, Lint, Lshort, Lbyte; 1051 1052 assert(granularity 1053 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1054 1055 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6; 1056 1057 // ??? I don't know if this bit-test-and-branch is the right thing 1058 // to do. It does a lot of jumping, resulting in several 1059 // mispredicted branches. It might make more sense to do this 1060 // with something like Duff's device with a single computed branch. 
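    //
    // Worked example of the bit tests below (illustration only): for a
    // byte copy (granularity == 1) with a residual count of 13 == 0b1101,
    // bit 3 moves one 8-byte word, bit 2 a 4-byte int, bit 1 nothing and
    // bit 0 the final byte, 8 + 4 + 1 == 13 bytes in all.  For larger
    // granularities the same residues are tested at correspondingly lower
    // bit positions, starting at 3 - exact_log2(granularity).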
1061 1062 __ tbz(count, 3 - exact_log2(granularity), Lword); 1063 __ ldr(tmp, Address(__ adjust(s, unit, is_backwards))); 1064 __ str(tmp, Address(__ adjust(d, unit, is_backwards))); 1065 __ bind(Lword); 1066 1067 if (granularity <= sizeof (jint)) { 1068 __ tbz(count, 2 - exact_log2(granularity), Lint); 1069 __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1070 __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1071 __ bind(Lint); 1072 } 1073 1074 if (granularity <= sizeof (jshort)) { 1075 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1076 __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1077 __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1078 __ bind(Lshort); 1079 } 1080 1081 if (granularity <= sizeof (jbyte)) { 1082 __ tbz(count, 0, Lbyte); 1083 __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1084 __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1085 __ bind(Lbyte); 1086 } 1087 } 1088 1089 Label copy_f, copy_b; 1090 1091 // All-singing all-dancing memory copy. 1092 // 1093 // Copy count units of memory from s to d. The size of a unit is 1094 // step, which can be positive or negative depending on the direction 1095 // of copy. If is_aligned is false, we align the source address. 1096 // 1097 1098 void copy_memory(bool is_aligned, Register s, Register d, 1099 Register count, Register tmp, int step) { 1100 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1101 bool is_backwards = step < 0; 1102 unsigned int granularity = uabs(step); 1103 const Register t0 = r3, t1 = r4; 1104 1105 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1106 // load all the data before writing anything 1107 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1108 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8; 1109 const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; 1110 const Register send = r17, dend = r16; 1111 1112 if (PrefetchCopyIntervalInBytes > 0) 1113 __ prfm(Address(s, 0), PLDL1KEEP); 1114 __ cmp(count, u1((UseSIMDForMemoryOps ? 
             96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is less
      // than 4 (sizeof(jint)): pointers to arrays of jint are at least
      // 4 byte aligned, and pointers to arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
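      // As a concrete case (illustration only): a 72-byte jbyte copy falls
      // into the ldp/stp path below, so the two ldpq/stpq pairs cover
      // bytes 0..63 and the extra ldp/stp at (send, -16)/(dend, -16)
      // covers bytes 56..71; the 8-byte overlap within the destination is
      // harmless because every load is issued before the first store.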
1169 if (granularity < sizeof (jint)) { 1170 Label copy96; 1171 __ cmp(count, u1(80/granularity)); 1172 __ br(Assembler::HI, copy96); 1173 __ ldp(t0, t1, Address(send, -16)); 1174 1175 __ stpq(v0, v1, Address(d, 0)); 1176 __ stpq(v2, v3, Address(d, 32)); 1177 __ stp(t0, t1, Address(dend, -16)); 1178 __ b(finish); 1179 1180 __ bind(copy96); 1181 } 1182 __ ldpq(v4, v5, Address(send, -32)); 1183 1184 __ stpq(v0, v1, Address(d, 0)); 1185 __ stpq(v2, v3, Address(d, 32)); 1186 __ stpq(v4, v5, Address(dend, -32)); 1187 } else { 1188 __ ldp(t0, t1, Address(s, 0)); 1189 __ ldp(t2, t3, Address(s, 16)); 1190 __ ldp(t4, t5, Address(s, 32)); 1191 __ ldp(t6, t7, Address(s, 48)); 1192 __ ldp(t8, t9, Address(send, -16)); 1193 1194 __ stp(t0, t1, Address(d, 0)); 1195 __ stp(t2, t3, Address(d, 16)); 1196 __ stp(t4, t5, Address(d, 32)); 1197 __ stp(t6, t7, Address(d, 48)); 1198 __ stp(t8, t9, Address(dend, -16)); 1199 } 1200 __ b(finish); 1201 1202 // 0..16 bytes 1203 __ bind(copy16); 1204 __ cmp(count, u1(8/granularity)); 1205 __ br(Assembler::LO, copy8); 1206 1207 // 8..16 bytes 1208 __ ldr(t0, Address(s, 0)); 1209 __ ldr(t1, Address(send, -8)); 1210 __ str(t0, Address(d, 0)); 1211 __ str(t1, Address(dend, -8)); 1212 __ b(finish); 1213 1214 if (granularity < 8) { 1215 // 4..7 bytes 1216 __ bind(copy8); 1217 __ tbz(count, 2 - exact_log2(granularity), copy4); 1218 __ ldrw(t0, Address(s, 0)); 1219 __ ldrw(t1, Address(send, -4)); 1220 __ strw(t0, Address(d, 0)); 1221 __ strw(t1, Address(dend, -4)); 1222 __ b(finish); 1223 if (granularity < 4) { 1224 // 0..3 bytes 1225 __ bind(copy4); 1226 __ cbz(count, finish); // get rid of 0 case 1227 if (granularity == 2) { 1228 __ ldrh(t0, Address(s, 0)); 1229 __ strh(t0, Address(d, 0)); 1230 } else { // granularity == 1 1231 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1232 // the first and last byte. 1233 // Handle the 3 byte case by loading and storing base + count/2 1234 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1235 // This does means in the 1 byte case we load/store the same 1236 // byte 3 times. 1237 __ lsr(count, count, 1); 1238 __ ldrb(t0, Address(s, 0)); 1239 __ ldrb(t1, Address(send, -1)); 1240 __ ldrb(t2, Address(s, count)); 1241 __ strb(t0, Address(d, 0)); 1242 __ strb(t1, Address(dend, -1)); 1243 __ strb(t2, Address(d, count)); 1244 } 1245 __ b(finish); 1246 } 1247 } 1248 1249 __ bind(copy_big); 1250 if (is_backwards) { 1251 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1252 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1253 } 1254 1255 // Now we've got the small case out of the way we can align the 1256 // source address on a 2-word boundary. 1257 1258 Label aligned; 1259 1260 if (is_aligned) { 1261 // We may have to adjust by 1 word to get s 2-word-aligned. 1262 __ tbz(s, exact_log2(wordSize), aligned); 1263 __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards))); 1264 __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards))); 1265 __ sub(count, count, wordSize/granularity); 1266 } else { 1267 if (is_backwards) { 1268 __ andr(rscratch2, s, 2 * wordSize - 1); 1269 } else { 1270 __ neg(rscratch2, s); 1271 __ andr(rscratch2, rscratch2, 2 * wordSize - 1); 1272 } 1273 // rscratch2 is the byte adjustment needed to align s. 1274 __ cbz(rscratch2, aligned); 1275 int shift = exact_log2(granularity); 1276 if (shift) __ lsr(rscratch2, rscratch2, shift); 1277 __ sub(count, count, rscratch2); 1278 1279 #if 0 1280 // ?? This code is only correct for a disjoint copy. 
It may or 1281 // may not make sense to use it in that case. 1282 1283 // Copy the first pair; s and d may not be aligned. 1284 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1285 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1286 1287 // Align s and d, adjust count 1288 if (is_backwards) { 1289 __ sub(s, s, rscratch2); 1290 __ sub(d, d, rscratch2); 1291 } else { 1292 __ add(s, s, rscratch2); 1293 __ add(d, d, rscratch2); 1294 } 1295 #else 1296 copy_memory_small(s, d, rscratch2, rscratch1, step); 1297 #endif 1298 } 1299 1300 __ bind(aligned); 1301 1302 // s is now 2-word-aligned. 1303 1304 // We have a count of units and some trailing bytes. Adjust the 1305 // count and do a bulk copy of words. 1306 __ lsr(rscratch2, count, exact_log2(wordSize/granularity)); 1307 if (direction == copy_forwards) 1308 __ bl(copy_f); 1309 else 1310 __ bl(copy_b); 1311 1312 // And the tail. 1313 copy_memory_small(s, d, count, tmp, step); 1314 1315 if (granularity >= 8) __ bind(copy8); 1316 if (granularity >= 4) __ bind(copy4); 1317 __ bind(finish); 1318 } 1319 1320 1321 void clobber_registers() { 1322 #ifdef ASSERT 1323 RegSet clobbered 1324 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1325 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1326 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1327 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1328 __ mov(*it, rscratch1); 1329 } 1330 #endif 1331 1332 } 1333 1334 // Scan over array at a for count oops, verifying each one. 1335 // Preserves a and count, clobbers rscratch1 and rscratch2. 1336 void verify_oop_array (int size, Register a, Register count, Register temp) { 1337 Label loop, end; 1338 __ mov(rscratch1, a); 1339 __ mov(rscratch2, zr); 1340 __ bind(loop); 1341 __ cmp(rscratch2, count); 1342 __ br(Assembler::HS, end); 1343 if (size == wordSize) { 1344 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1345 __ verify_oop(temp); 1346 } else { 1347 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1348 __ decode_heap_oop(temp); // calls verify_oop 1349 } 1350 __ add(rscratch2, rscratch2, 1); 1351 __ b(loop); 1352 __ bind(end); 1353 } 1354 1355 // Arguments: 1356 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1357 // ignored 1358 // is_oop - true => oop array, so generate store check code 1359 // name - stub name string 1360 // 1361 // Inputs: 1362 // c_rarg0 - source array address 1363 // c_rarg1 - destination array address 1364 // c_rarg2 - element count, treated as ssize_t, can be zero 1365 // 1366 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1367 // the hardware handle it. The two dwords within qwords that span 1368 // cache line boundaries will still be loaded and stored atomically. 1369 // 1370 // Side Effects: 1371 // disjoint_int_copy_entry is set to the no-overlap entry point 1372 // used by generate_conjoint_int_oop_copy(). 
1373 // 1374 address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry, 1375 const char *name, bool dest_uninitialized = false) { 1376 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1377 RegSet saved_reg = RegSet::of(s, d, count); 1378 __ align(CodeEntryAlignment); 1379 StubCodeMark mark(this, "StubRoutines", name); 1380 address start = __ pc(); 1381 __ enter(); 1382 1383 if (entry != NULL) { 1384 *entry = __ pc(); 1385 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1386 BLOCK_COMMENT("Entry:"); 1387 } 1388 1389 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1390 if (dest_uninitialized) { 1391 decorators |= IS_DEST_UNINITIALIZED; 1392 } 1393 if (aligned) { 1394 decorators |= ARRAYCOPY_ALIGNED; 1395 } 1396 1397 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1398 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1399 1400 if (is_oop) { 1401 // save regs before copy_memory 1402 __ push(RegSet::of(d, count), sp); 1403 } 1404 { 1405 // UnsafeCopyMemory page error: continue after ucm 1406 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1407 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1408 copy_memory(aligned, s, d, count, rscratch1, size); 1409 } 1410 1411 if (is_oop) { 1412 __ pop(RegSet::of(d, count), sp); 1413 if (VerifyOops) 1414 verify_oop_array(size, d, count, r16); 1415 } 1416 1417 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1418 1419 __ leave(); 1420 __ mov(r0, zr); // return 0 1421 __ ret(lr); 1422 return start; 1423 } 1424 1425 // Arguments: 1426 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1427 // ignored 1428 // is_oop - true => oop array, so generate store check code 1429 // name - stub name string 1430 // 1431 // Inputs: 1432 // c_rarg0 - source array address 1433 // c_rarg1 - destination array address 1434 // c_rarg2 - element count, treated as ssize_t, can be zero 1435 // 1436 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1437 // the hardware handle it. The two dwords within qwords that span 1438 // cache line boundaries will still be loaded and stored atomically. 
1439 // 1440 address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target, 1441 address *entry, const char *name, 1442 bool dest_uninitialized = false) { 1443 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1444 RegSet saved_regs = RegSet::of(s, d, count); 1445 StubCodeMark mark(this, "StubRoutines", name); 1446 address start = __ pc(); 1447 __ enter(); 1448 1449 if (entry != NULL) { 1450 *entry = __ pc(); 1451 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1452 BLOCK_COMMENT("Entry:"); 1453 } 1454 1455 // use fwd copy when (d-s) above_equal (count*size) 1456 __ sub(rscratch1, d, s); 1457 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1458 __ br(Assembler::HS, nooverlap_target); 1459 1460 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1461 if (dest_uninitialized) { 1462 decorators |= IS_DEST_UNINITIALIZED; 1463 } 1464 if (aligned) { 1465 decorators |= ARRAYCOPY_ALIGNED; 1466 } 1467 1468 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1469 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1470 1471 if (is_oop) { 1472 // save regs before copy_memory 1473 __ push(RegSet::of(d, count), sp); 1474 } 1475 { 1476 // UnsafeCopyMemory page error: continue after ucm 1477 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1478 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1479 copy_memory(aligned, s, d, count, rscratch1, -size); 1480 } 1481 if (is_oop) { 1482 __ pop(RegSet::of(d, count), sp); 1483 if (VerifyOops) 1484 verify_oop_array(size, d, count, r16); 1485 } 1486 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1487 __ leave(); 1488 __ mov(r0, zr); // return 0 1489 __ ret(lr); 1490 return start; 1491 } 1492 1493 // Arguments: 1494 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1495 // ignored 1496 // name - stub name string 1497 // 1498 // Inputs: 1499 // c_rarg0 - source array address 1500 // c_rarg1 - destination array address 1501 // c_rarg2 - element count, treated as ssize_t, can be zero 1502 // 1503 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1504 // we let the hardware handle it. The one to eight bytes within words, 1505 // dwords or qwords that span cache line boundaries will still be loaded 1506 // and stored atomically. 1507 // 1508 // Side Effects: 1509 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1510 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1511 // we let the hardware handle it. The one to eight bytes within words, 1512 // dwords or qwords that span cache line boundaries will still be loaded 1513 // and stored atomically. 1514 // 1515 // Side Effects: 1516 // disjoint_byte_copy_entry is set to the no-overlap entry point 1517 // used by generate_conjoint_byte_copy(). 
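  //
  // For reference, an assumed usage sketch (the actual registration is done
  // elsewhere in this generator; the names shown here are illustrative):
  //
  //   address entry;
  //   StubRoutines::_jbyte_disjoint_arraycopy =
  //     generate_disjoint_byte_copy(false, &entry, "jbyte_disjoint_arraycopy");
  //   StubRoutines::_jbyte_arraycopy =
  //     generate_conjoint_byte_copy(false, entry, NULL, "jbyte_arraycopy");
  //
  // i.e. each disjoint generator publishes its no-overlap entry point,
  // which the matching conjoint generator branches to when the regions
  // turn out not to overlap.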
1518 // 1519 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1520 const bool not_oop = false; 1521 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1522 } 1523 1524 // Arguments: 1525 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1526 // ignored 1527 // name - stub name string 1528 // 1529 // Inputs: 1530 // c_rarg0 - source array address 1531 // c_rarg1 - destination array address 1532 // c_rarg2 - element count, treated as ssize_t, can be zero 1533 // 1534 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1535 // we let the hardware handle it. The one to eight bytes within words, 1536 // dwords or qwords that span cache line boundaries will still be loaded 1537 // and stored atomically. 1538 // 1539 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1540 address* entry, const char *name) { 1541 const bool not_oop = false; 1542 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1543 } 1544 1545 // Arguments: 1546 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1547 // ignored 1548 // name - stub name string 1549 // 1550 // Inputs: 1551 // c_rarg0 - source array address 1552 // c_rarg1 - destination array address 1553 // c_rarg2 - element count, treated as ssize_t, can be zero 1554 // 1555 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1556 // let the hardware handle it. The two or four words within dwords 1557 // or qwords that span cache line boundaries will still be loaded 1558 // and stored atomically. 1559 // 1560 // Side Effects: 1561 // disjoint_short_copy_entry is set to the no-overlap entry point 1562 // used by generate_conjoint_short_copy(). 1563 // 1564 address generate_disjoint_short_copy(bool aligned, 1565 address* entry, const char *name) { 1566 const bool not_oop = false; 1567 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1568 } 1569 1570 // Arguments: 1571 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1572 // ignored 1573 // name - stub name string 1574 // 1575 // Inputs: 1576 // c_rarg0 - source array address 1577 // c_rarg1 - destination array address 1578 // c_rarg2 - element count, treated as ssize_t, can be zero 1579 // 1580 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1581 // let the hardware handle it. The two or four words within dwords 1582 // or qwords that span cache line boundaries will still be loaded 1583 // and stored atomically. 1584 // 1585 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1586 address *entry, const char *name) { 1587 const bool not_oop = false; 1588 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1589 1590 } 1591 // Arguments: 1592 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1593 // ignored 1594 // name - stub name string 1595 // 1596 // Inputs: 1597 // c_rarg0 - source array address 1598 // c_rarg1 - destination array address 1599 // c_rarg2 - element count, treated as ssize_t, can be zero 1600 // 1601 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1602 // the hardware handle it. The two dwords within qwords that span 1603 // cache line boundaries will still be loaded and stored atomically. 
1604 // 1605 // Side Effects: 1606 // disjoint_int_copy_entry is set to the no-overlap entry point 1607 // used by generate_conjoint_int_oop_copy(). 1608 // 1609 address generate_disjoint_int_copy(bool aligned, address *entry, 1610 const char *name, bool dest_uninitialized = false) { 1611 const bool not_oop = false; 1612 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1613 } 1614 1615 // Arguments: 1616 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1617 // ignored 1618 // name - stub name string 1619 // 1620 // Inputs: 1621 // c_rarg0 - source array address 1622 // c_rarg1 - destination array address 1623 // c_rarg2 - element count, treated as ssize_t, can be zero 1624 // 1625 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1626 // the hardware handle it. The two dwords within qwords that span 1627 // cache line boundaries will still be loaded and stored atomically. 1628 // 1629 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1630 address *entry, const char *name, 1631 bool dest_uninitialized = false) { 1632 const bool not_oop = false; 1633 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1634 } 1635 1636 1637 // Arguments: 1638 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1639 // ignored 1640 // name - stub name string 1641 // 1642 // Inputs: 1643 // c_rarg0 - source array address 1644 // c_rarg1 - destination array address 1645 // c_rarg2 - element count, treated as size_t, can be zero 1646 // 1647 // Side Effects: 1648 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1649 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1650 // 1651 address generate_disjoint_long_copy(bool aligned, address *entry, 1652 const char *name, bool dest_uninitialized = false) { 1653 const bool not_oop = false; 1654 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1655 } 1656 1657 // Arguments: 1658 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1659 // ignored 1660 // name - stub name string 1661 // 1662 // Inputs: 1663 // c_rarg0 - source array address 1664 // c_rarg1 - destination array address 1665 // c_rarg2 - element count, treated as size_t, can be zero 1666 // 1667 address generate_conjoint_long_copy(bool aligned, 1668 address nooverlap_target, address *entry, 1669 const char *name, bool dest_uninitialized = false) { 1670 const bool not_oop = false; 1671 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1672 } 1673 1674 // Arguments: 1675 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1676 // ignored 1677 // name - stub name string 1678 // 1679 // Inputs: 1680 // c_rarg0 - source array address 1681 // c_rarg1 - destination array address 1682 // c_rarg2 - element count, treated as size_t, can be zero 1683 // 1684 // Side Effects: 1685 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1686 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1687 // 1688 address generate_disjoint_oop_copy(bool aligned, address *entry, 1689 const char *name, bool dest_uninitialized) { 1690 const bool is_oop = true; 1691 const int size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1692 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1693 } 1694 1695 // Arguments: 1696 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1697 // ignored 1698 // name - stub name string 1699 // 1700 // Inputs: 1701 // c_rarg0 - source array address 1702 // c_rarg1 - destination array address 1703 // c_rarg2 - element count, treated as size_t, can be zero 1704 // 1705 address generate_conjoint_oop_copy(bool aligned, 1706 address nooverlap_target, address *entry, 1707 const char *name, bool dest_uninitialized) { 1708 const bool is_oop = true; 1709 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1710 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1711 name, dest_uninitialized); 1712 } 1713 1714 1715 // Helper for generating a dynamic type check. 1716 // Smashes rscratch1, rscratch2. 1717 void generate_type_check(Register sub_klass, 1718 Register super_check_offset, 1719 Register super_klass, 1720 Label& L_success) { 1721 assert_different_registers(sub_klass, super_check_offset, super_klass); 1722 1723 BLOCK_COMMENT("type_check:"); 1724 1725 Label L_miss; 1726 1727 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1728 super_check_offset); 1729 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1730 1731 // Fall through on failure! 1732 __ BIND(L_miss); 1733 } 1734 1735 // 1736 // Generate checkcasting array copy stub 1737 // 1738 // Input: 1739 // c_rarg0 - source array address 1740 // c_rarg1 - destination array address 1741 // c_rarg2 - element count, treated as ssize_t, can be zero 1742 // c_rarg3 - size_t ckoff (super_check_offset) 1743 // c_rarg4 - oop ckval (super_klass) 1744 // 1745 // Output: 1746 // r0 == 0 - success 1747 // r0 == -1^K - failure, where K is partial transfer count 1748 // 1749 address generate_checkcast_copy(const char *name, address *entry, 1750 bool dest_uninitialized = false) { 1751 1752 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1753 1754 // Input registers (after setup_arg_regs) 1755 const Register from = c_rarg0; // source array address 1756 const Register to = c_rarg1; // destination array address 1757 const Register count = c_rarg2; // elementscount 1758 const Register ckoff = c_rarg3; // super_check_offset 1759 const Register ckval = c_rarg4; // super_klass 1760 1761 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1762 RegSet wb_post_saved_regs = RegSet::of(count); 1763 1764 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1765 const Register copied_oop = r22; // actual oop copied 1766 const Register count_save = r21; // orig elementscount 1767 const Register start_to = r20; // destination array start address 1768 const Register r19_klass = r19; // oop._klass 1769 1770 //--------------------------------------------------------------- 1771 // Assembler stub will be used for this call to arraycopy 1772 // if the two arrays are subtypes of Object[] but the 1773 // destination array type is not equal to or a supertype 1774 // of the source type. Each element must be separately 1775 // checked. 
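    //
    // Rough C-like sketch of the per-element check driven by ckoff/ckval
    // (helper names here are illustrative, not the real accessors):
    //
    //   bool element_fits(Klass* elem_klass, Klass* ckval, int ckoff) {
    //     // fast path: the word at offset 'ckoff' inside elem_klass caches a supertype
    //     if (*(Klass**)((address)elem_klass + ckoff) == ckval) return true;
    //     // slow path: scan the secondary supers of elem_klass for ckval
    //     return scan_secondary_supers(elem_klass, ckval);
    //   }
    //
    // Elements that pass are stored; the first element that fails aborts the
    // copy and the stub reports the partial transfer count (see Output above).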
1776 1777 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1778 copied_oop, r19_klass, count_save); 1779 1780 __ align(CodeEntryAlignment); 1781 StubCodeMark mark(this, "StubRoutines", name); 1782 address start = __ pc(); 1783 1784 __ enter(); // required for proper stackwalking of RuntimeStub frame 1785 1786 #ifdef ASSERT 1787 // caller guarantees that the arrays really are different 1788 // otherwise, we would have to make conjoint checks 1789 { Label L; 1790 array_overlap_test(L, TIMES_OOP); 1791 __ stop("checkcast_copy within a single array"); 1792 __ bind(L); 1793 } 1794 #endif //ASSERT 1795 1796 // Caller of this entry point must set up the argument registers. 1797 if (entry != NULL) { 1798 *entry = __ pc(); 1799 BLOCK_COMMENT("Entry:"); 1800 } 1801 1802 // Empty array: Nothing to do. 1803 __ cbz(count, L_done); 1804 __ push(RegSet::of(r19, r20, r21, r22), sp); 1805 1806 #ifdef ASSERT 1807 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1808 // The ckoff and ckval must be mutually consistent, 1809 // even though caller generates both. 1810 { Label L; 1811 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1812 __ ldrw(start_to, Address(ckval, sco_offset)); 1813 __ cmpw(ckoff, start_to); 1814 __ br(Assembler::EQ, L); 1815 __ stop("super_check_offset inconsistent"); 1816 __ bind(L); 1817 } 1818 #endif //ASSERT 1819 1820 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1821 bool is_oop = true; 1822 if (dest_uninitialized) { 1823 decorators |= IS_DEST_UNINITIALIZED; 1824 } 1825 1826 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1827 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1828 1829 // save the original count 1830 __ mov(count_save, count); 1831 1832 // Copy from low to high addresses 1833 __ mov(start_to, to); // Save destination array start address 1834 __ b(L_load_element); 1835 1836 // ======== begin loop ======== 1837 // (Loop is rotated; its entry is L_load_element.) 1838 // Loop control: 1839 // for (; count != 0; count--) { 1840 // copied_oop = load_heap_oop(from++); 1841 // ... generate_type_check ...; 1842 // store_heap_oop(to++, copied_oop); 1843 // } 1844 __ align(OptoLoopAlignment); 1845 1846 __ BIND(L_store_element); 1847 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1848 __ sub(count, count, 1); 1849 __ cbz(count, L_do_card_marks); 1850 1851 // ======== loop entry is here ======== 1852 __ BIND(L_load_element); 1853 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1854 __ cbz(copied_oop, L_store_element); 1855 1856 __ load_klass(r19_klass, copied_oop);// query the object klass 1857 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1858 // ======== end loop ======== 1859 1860 // It was a real error; we must depend on the caller to finish the job. 1861 // Register count = remaining oops, count_orig = total oops. 1862 // Emit GC store barriers for the oops we have copied and report 1863 // their number to the caller. 
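    // Return-value encoding produced below, as a sketch: full success returns 0,
    // while a failure after K oops were copied returns -1 ^ K, i.e. ~K == -(K + 1).
    // A caller that sees a non-zero result can recover K along these lines
    // (illustrative C, not an existing helper):
    //
    //   if (r0 != 0) {
    //     size_t K = (size_t)~r0;   // r0 == -1 -> K == 0, r0 == -2 -> K == 1, ...
    //   }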
1864 1865 __ subs(count, count_save, count); // K = partially copied oop count 1866 __ eon(count, count, zr); // report (-1^K) to caller 1867 __ br(Assembler::EQ, L_done_pop); 1868 1869 __ BIND(L_do_card_marks); 1870 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1871 1872 __ bind(L_done_pop); 1873 __ pop(RegSet::of(r19, r20, r21, r22), sp); 1874 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1875 1876 __ bind(L_done); 1877 __ mov(r0, count); 1878 __ leave(); 1879 __ ret(lr); 1880 1881 return start; 1882 } 1883 1884 // Perform range checks on the proposed arraycopy. 1885 // Kills temp, but nothing else. 1886 // Also, clean the sign bits of src_pos and dst_pos. 1887 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1888 Register src_pos, // source position (c_rarg1) 1889 Register dst, // destination array oo (c_rarg2) 1890 Register dst_pos, // destination position (c_rarg3) 1891 Register length, 1892 Register temp, 1893 Label& L_failed) { 1894 BLOCK_COMMENT("arraycopy_range_checks:"); 1895 1896 assert_different_registers(rscratch1, temp); 1897 1898 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1899 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1900 __ addw(temp, length, src_pos); 1901 __ cmpw(temp, rscratch1); 1902 __ br(Assembler::HI, L_failed); 1903 1904 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1905 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1906 __ addw(temp, length, dst_pos); 1907 __ cmpw(temp, rscratch1); 1908 __ br(Assembler::HI, L_failed); 1909 1910 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1911 __ movw(src_pos, src_pos); 1912 __ movw(dst_pos, dst_pos); 1913 1914 BLOCK_COMMENT("arraycopy_range_checks done"); 1915 } 1916 1917 // These stubs get called from some dumb test routine. 1918 // I'll write them properly when they're called from 1919 // something that's actually doing something. 1920 static void fake_arraycopy_stub(address src, address dst, int count) { 1921 assert(count == 0, "huh?"); 1922 } 1923 1924 1925 // 1926 // Generate 'unsafe' array copy stub 1927 // Though just as safe as the other stubs, it takes an unscaled 1928 // size_t argument instead of an element count. 1929 // 1930 // Input: 1931 // c_rarg0 - source array address 1932 // c_rarg1 - destination array address 1933 // c_rarg2 - byte count, treated as ssize_t, can be zero 1934 // 1935 // Examines the alignment of the operands and dispatches 1936 // to a long, int, short, or byte copy loop. 
1937 // 1938 address generate_unsafe_copy(const char *name, 1939 address byte_copy_entry, 1940 address short_copy_entry, 1941 address int_copy_entry, 1942 address long_copy_entry) { 1943 Label L_long_aligned, L_int_aligned, L_short_aligned; 1944 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1945 1946 __ align(CodeEntryAlignment); 1947 StubCodeMark mark(this, "StubRoutines", name); 1948 address start = __ pc(); 1949 __ enter(); // required for proper stackwalking of RuntimeStub frame 1950 1951 // bump this on entry, not on exit: 1952 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1953 1954 __ orr(rscratch1, s, d); 1955 __ orr(rscratch1, rscratch1, count); 1956 1957 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1958 __ cbz(rscratch1, L_long_aligned); 1959 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1960 __ cbz(rscratch1, L_int_aligned); 1961 __ tbz(rscratch1, 0, L_short_aligned); 1962 __ b(RuntimeAddress(byte_copy_entry)); 1963 1964 __ BIND(L_short_aligned); 1965 __ lsr(count, count, LogBytesPerShort); // size => short_count 1966 __ b(RuntimeAddress(short_copy_entry)); 1967 __ BIND(L_int_aligned); 1968 __ lsr(count, count, LogBytesPerInt); // size => int_count 1969 __ b(RuntimeAddress(int_copy_entry)); 1970 __ BIND(L_long_aligned); 1971 __ lsr(count, count, LogBytesPerLong); // size => long_count 1972 __ b(RuntimeAddress(long_copy_entry)); 1973 1974 return start; 1975 } 1976 1977 // 1978 // Generate generic array copy stubs 1979 // 1980 // Input: 1981 // c_rarg0 - src oop 1982 // c_rarg1 - src_pos (32-bits) 1983 // c_rarg2 - dst oop 1984 // c_rarg3 - dst_pos (32-bits) 1985 // c_rarg4 - element count (32-bits) 1986 // 1987 // Output: 1988 // r0 == 0 - success 1989 // r0 == -1^K - failure, where K is partial transfer count 1990 // 1991 address generate_generic_copy(const char *name, 1992 address byte_copy_entry, address short_copy_entry, 1993 address int_copy_entry, address oop_copy_entry, 1994 address long_copy_entry, address checkcast_copy_entry) { 1995 1996 Label L_failed, L_objArray; 1997 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1998 1999 // Input registers 2000 const Register src = c_rarg0; // source array oop 2001 const Register src_pos = c_rarg1; // source position 2002 const Register dst = c_rarg2; // destination array oop 2003 const Register dst_pos = c_rarg3; // destination position 2004 const Register length = c_rarg4; 2005 2006 2007 // Registers used as temps 2008 const Register dst_klass = c_rarg5; 2009 2010 __ align(CodeEntryAlignment); 2011 2012 StubCodeMark mark(this, "StubRoutines", name); 2013 2014 address start = __ pc(); 2015 2016 __ enter(); // required for proper stackwalking of RuntimeStub frame 2017 2018 // bump this on entry, not on exit: 2019 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2020 2021 //----------------------------------------------------------------------- 2022 // Assembler stub will be used for this call to arraycopy 2023 // if the following conditions are met: 2024 // 2025 // (1) src and dst must not be null. 2026 // (2) src_pos must not be negative. 2027 // (3) dst_pos must not be negative. 2028 // (4) length must not be negative. 2029 // (5) src klass and dst klass should be the same and not NULL. 2030 // (6) src and dst should be arrays. 2031 // (7) src_pos + length must not exceed length of src. 2032 // (8) dst_pos + length must not exceed length of dst. 
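    // Taken together, the stub returns -1 (so the caller falls back to the
    // generic runtime path) unless, roughly:
    //
    //   src != NULL && dst != NULL
    //     && src_pos >= 0 && dst_pos >= 0 && length >= 0
    //     && src->is_array() && dst->is_array()
    //     && (src->klass() == dst->klass() || a checkcast copy can be used)
    //     && (julong)src_pos + length <= src->length()
    //     && (julong)dst_pos + length <= dst->length()
    //
    // Checks (1)-(4) are emitted immediately below; the klass and bounds checks
    // follow once the layout helper has been inspected.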
2033 // 2034 2035 // if (src == NULL) return -1; 2036 __ cbz(src, L_failed); 2037 2038 // if (src_pos < 0) return -1; 2039 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2040 2041 // if (dst == NULL) return -1; 2042 __ cbz(dst, L_failed); 2043 2044 // if (dst_pos < 0) return -1; 2045 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2046 2047 // registers used as temp 2048 const Register scratch_length = r16; // elements count to copy 2049 const Register scratch_src_klass = r17; // array klass 2050 const Register lh = r15; // layout helper 2051 2052 // if (length < 0) return -1; 2053 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2054 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2055 2056 __ load_klass(scratch_src_klass, src); 2057 #ifdef ASSERT 2058 // assert(src->klass() != NULL); 2059 { 2060 BLOCK_COMMENT("assert klasses not null {"); 2061 Label L1, L2; 2062 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2063 __ bind(L1); 2064 __ stop("broken null klass"); 2065 __ bind(L2); 2066 __ load_klass(rscratch1, dst); 2067 __ cbz(rscratch1, L1); // this would be broken also 2068 BLOCK_COMMENT("} assert klasses not null done"); 2069 } 2070 #endif 2071 2072 // Load layout helper (32-bits) 2073 // 2074 // |array_tag| | header_size | element_type | |log2_element_size| 2075 // 32 30 24 16 8 2 0 2076 // 2077 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2078 // 2079 2080 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2081 2082 // Handle objArrays completely differently... 2083 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2084 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2085 __ movw(rscratch1, objArray_lh); 2086 __ eorw(rscratch2, lh, rscratch1); 2087 __ cbzw(rscratch2, L_objArray); 2088 2089 // if (src->klass() != dst->klass()) return -1; 2090 __ load_klass(rscratch2, dst); 2091 __ eor(rscratch2, rscratch2, scratch_src_klass); 2092 __ cbnz(rscratch2, L_failed); 2093 2094 // if (!src->is_Array()) return -1; 2095 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2096 2097 // At this point, it is known to be a typeArray (array_tag 0x3). 
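    //
    // Rough sketch of how the layout helper loaded above is decoded below
    // (the Klass::_lh_* constants are the real bit positions used by this code;
    // in C++ the Klass::layout_helper_*() accessors extract the same fields):
    //
    //   unsigned tag    = (unsigned)lh >> Klass::_lh_array_tag_shift;   // 0x3 typeArray, 0x2 objArray
    //   int header_size = (lh >> Klass::_lh_header_size_shift)
    //                     & Klass::_lh_header_size_mask;                // bytes to the first element
    //   int log2_elsize = lh & Klass::_lh_log2_element_size_mask;       // 0..3 for primitive arrays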
2098 #ifdef ASSERT 2099 { 2100 BLOCK_COMMENT("assert primitive array {"); 2101 Label L; 2102 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2103 __ cmpw(lh, rscratch2); 2104 __ br(Assembler::GE, L); 2105 __ stop("must be a primitive array"); 2106 __ bind(L); 2107 BLOCK_COMMENT("} assert primitive array done"); 2108 } 2109 #endif 2110 2111 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2112 rscratch2, L_failed); 2113 2114 // TypeArrayKlass 2115 // 2116 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2117 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2118 // 2119 2120 const Register rscratch1_offset = rscratch1; // array offset 2121 const Register r15_elsize = lh; // element size 2122 2123 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2124 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2125 __ add(src, src, rscratch1_offset); // src array offset 2126 __ add(dst, dst, rscratch1_offset); // dst array offset 2127 BLOCK_COMMENT("choose copy loop based on element size"); 2128 2129 // next registers should be set before the jump to corresponding stub 2130 const Register from = c_rarg0; // source array address 2131 const Register to = c_rarg1; // destination array address 2132 const Register count = c_rarg2; // elements count 2133 2134 // 'from', 'to', 'count' registers should be set in such order 2135 // since they are the same as 'src', 'src_pos', 'dst'. 2136 2137 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2138 2139 // The possible values of elsize are 0-3, i.e. exact_log2(element 2140 // size in bytes). We do a simple bitwise binary search. 2141 __ BIND(L_copy_bytes); 2142 __ tbnz(r15_elsize, 1, L_copy_ints); 2143 __ tbnz(r15_elsize, 0, L_copy_shorts); 2144 __ lea(from, Address(src, src_pos));// src_addr 2145 __ lea(to, Address(dst, dst_pos));// dst_addr 2146 __ movw(count, scratch_length); // length 2147 __ b(RuntimeAddress(byte_copy_entry)); 2148 2149 __ BIND(L_copy_shorts); 2150 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2151 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2152 __ movw(count, scratch_length); // length 2153 __ b(RuntimeAddress(short_copy_entry)); 2154 2155 __ BIND(L_copy_ints); 2156 __ tbnz(r15_elsize, 0, L_copy_longs); 2157 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2158 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2159 __ movw(count, scratch_length); // length 2160 __ b(RuntimeAddress(int_copy_entry)); 2161 2162 __ BIND(L_copy_longs); 2163 #ifdef ASSERT 2164 { 2165 BLOCK_COMMENT("assert long copy {"); 2166 Label L; 2167 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2168 __ cmpw(r15_elsize, LogBytesPerLong); 2169 __ br(Assembler::EQ, L); 2170 __ stop("must be long copy, but elsize is wrong"); 2171 __ bind(L); 2172 BLOCK_COMMENT("} assert long copy done"); 2173 } 2174 #endif 2175 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2176 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2177 __ movw(count, scratch_length); // length 2178 __ b(RuntimeAddress(long_copy_entry)); 2179 2180 // ObjArrayKlass 2181 __ BIND(L_objArray); 2182 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2183 2184 Label L_plain_copy, L_checkcast_copy; 2185 // test array classes for subtyping 2186 __ load_klass(r15, dst); 2187 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2188 __ br(Assembler::NE, L_checkcast_copy); 2189 2190 // Identically typed arrays can be copied without element-wise checks. 2191 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2192 rscratch2, L_failed); 2193 2194 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2195 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2196 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2197 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2198 __ movw(count, scratch_length); // length 2199 __ BIND(L_plain_copy); 2200 __ b(RuntimeAddress(oop_copy_entry)); 2201 2202 __ BIND(L_checkcast_copy); 2203 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2204 { 2205 // Before looking at dst.length, make sure dst is also an objArray. 2206 __ ldrw(rscratch1, Address(r15, lh_offset)); 2207 __ movw(rscratch2, objArray_lh); 2208 __ eorw(rscratch1, rscratch1, rscratch2); 2209 __ cbnzw(rscratch1, L_failed); 2210 2211 // It is safe to examine both src.length and dst.length. 2212 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2213 r15, L_failed); 2214 2215 __ load_klass(dst_klass, dst); // reload 2216 2217 // Marshal the base address arguments now, freeing registers. 2218 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2219 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2220 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2221 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2222 __ movw(count, length); // length (reloaded) 2223 Register sco_temp = c_rarg3; // this register is free now 2224 assert_different_registers(from, to, count, sco_temp, 2225 dst_klass, scratch_src_klass); 2226 // assert_clean_int(count, sco_temp); 2227 2228 // Generate the type check. 2229 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2230 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2231 2232 // Smashes rscratch1, rscratch2 2233 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2234 2235 // Fetch destination element klass from the ObjArrayKlass header. 2236 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2237 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2238 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2239 2240 // the checkcast_copy loop needs two extra arguments: 2241 assert(c_rarg3 == sco_temp, "#3 already in place"); 2242 // Set up arguments for checkcast_copy_entry. 2243 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2244 __ b(RuntimeAddress(checkcast_copy_entry)); 2245 } 2246 2247 __ BIND(L_failed); 2248 __ mov(r0, -1); 2249 __ leave(); // required for proper stackwalking of RuntimeStub frame 2250 __ ret(lr); 2251 2252 return start; 2253 } 2254 2255 // 2256 // Generate stub for array fill. If "aligned" is true, the 2257 // "to" address is assumed to be heapword aligned. 
2258 // 2259 // Arguments for generated stub: 2260 // to: c_rarg0 2261 // value: c_rarg1 2262 // count: c_rarg2 treated as signed 2263 // 2264 address generate_fill(BasicType t, bool aligned, const char *name) { 2265 __ align(CodeEntryAlignment); 2266 StubCodeMark mark(this, "StubRoutines", name); 2267 address start = __ pc(); 2268 2269 BLOCK_COMMENT("Entry:"); 2270 2271 const Register to = c_rarg0; // source array address 2272 const Register value = c_rarg1; // value 2273 const Register count = c_rarg2; // elements count 2274 2275 const Register bz_base = r10; // base for block_zero routine 2276 const Register cnt_words = r11; // temp register 2277 2278 __ enter(); 2279 2280 Label L_fill_elements, L_exit1; 2281 2282 int shift = -1; 2283 switch (t) { 2284 case T_BYTE: 2285 shift = 0; 2286 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2287 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2288 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2289 __ br(Assembler::LO, L_fill_elements); 2290 break; 2291 case T_SHORT: 2292 shift = 1; 2293 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2294 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2295 __ br(Assembler::LO, L_fill_elements); 2296 break; 2297 case T_INT: 2298 shift = 2; 2299 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2300 __ br(Assembler::LO, L_fill_elements); 2301 break; 2302 default: ShouldNotReachHere(); 2303 } 2304 2305 // Align source address at 8 bytes address boundary. 2306 Label L_skip_align1, L_skip_align2, L_skip_align4; 2307 if (!aligned) { 2308 switch (t) { 2309 case T_BYTE: 2310 // One byte misalignment happens only for byte arrays. 2311 __ tbz(to, 0, L_skip_align1); 2312 __ strb(value, Address(__ post(to, 1))); 2313 __ subw(count, count, 1); 2314 __ bind(L_skip_align1); 2315 // Fallthrough 2316 case T_SHORT: 2317 // Two bytes misalignment happens only for byte and short (char) arrays. 2318 __ tbz(to, 1, L_skip_align2); 2319 __ strh(value, Address(__ post(to, 2))); 2320 __ subw(count, count, 2 >> shift); 2321 __ bind(L_skip_align2); 2322 // Fallthrough 2323 case T_INT: 2324 // Align to 8 bytes, we know we are 4 byte aligned to start. 2325 __ tbz(to, 2, L_skip_align4); 2326 __ strw(value, Address(__ post(to, 4))); 2327 __ subw(count, count, 4 >> shift); 2328 __ bind(L_skip_align4); 2329 break; 2330 default: ShouldNotReachHere(); 2331 } 2332 } 2333 2334 // 2335 // Fill large chunks 2336 // 2337 __ lsrw(cnt_words, count, 3 - shift); // number of words 2338 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2339 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2340 if (UseBlockZeroing) { 2341 Label non_block_zeroing, rest; 2342 // If the fill value is zero we can use the fast zero_words(). 2343 __ cbnz(value, non_block_zeroing); 2344 __ mov(bz_base, to); 2345 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2346 __ zero_words(bz_base, cnt_words); 2347 __ b(rest); 2348 __ bind(non_block_zeroing); 2349 __ fill_words(to, cnt_words, value); 2350 __ bind(rest); 2351 } else { 2352 __ fill_words(to, cnt_words, value); 2353 } 2354 2355 // Remaining count is less than 8 bytes. Fill it by a single store. 2356 // Note that the total length is no less than 8 bytes. 
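    //
    // Sketch of the trailing store emitted below: at least one full 8-byte
    // word has already been written, so an 8-byte store that ends exactly at
    // the last element may safely overlap bytes that were filled earlier:
    //
    //   char* end = to + (count << shift);   // count is what remains (< 8 bytes worth)
    //   *(uint64_t*)(end - 8) = value64;     // rewrites some already-filled bytes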
2357 if (t == T_BYTE || t == T_SHORT) { 2358 Label L_exit1; 2359 __ cbzw(count, L_exit1); 2360 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2361 __ str(value, Address(to, -8)); // overwrite some elements 2362 __ bind(L_exit1); 2363 __ leave(); 2364 __ ret(lr); 2365 } 2366 2367 // Handle copies less than 8 bytes. 2368 Label L_fill_2, L_fill_4, L_exit2; 2369 __ bind(L_fill_elements); 2370 switch (t) { 2371 case T_BYTE: 2372 __ tbz(count, 0, L_fill_2); 2373 __ strb(value, Address(__ post(to, 1))); 2374 __ bind(L_fill_2); 2375 __ tbz(count, 1, L_fill_4); 2376 __ strh(value, Address(__ post(to, 2))); 2377 __ bind(L_fill_4); 2378 __ tbz(count, 2, L_exit2); 2379 __ strw(value, Address(to)); 2380 break; 2381 case T_SHORT: 2382 __ tbz(count, 0, L_fill_4); 2383 __ strh(value, Address(__ post(to, 2))); 2384 __ bind(L_fill_4); 2385 __ tbz(count, 1, L_exit2); 2386 __ strw(value, Address(to)); 2387 break; 2388 case T_INT: 2389 __ cbzw(count, L_exit2); 2390 __ strw(value, Address(to)); 2391 break; 2392 default: ShouldNotReachHere(); 2393 } 2394 __ bind(L_exit2); 2395 __ leave(); 2396 __ ret(lr); 2397 return start; 2398 } 2399 2400 address generate_data_cache_writeback() { 2401 const Register line = c_rarg0; // address of line to write back 2402 2403 __ align(CodeEntryAlignment); 2404 2405 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2406 2407 address start = __ pc(); 2408 __ enter(); 2409 __ cache_wb(Address(line, 0)); 2410 __ leave(); 2411 __ ret(lr); 2412 2413 return start; 2414 } 2415 2416 address generate_data_cache_writeback_sync() { 2417 const Register is_pre = c_rarg0; // pre or post sync 2418 2419 __ align(CodeEntryAlignment); 2420 2421 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2422 2423 // pre wbsync is a no-op 2424 // post wbsync translates to an sfence 2425 2426 Label skip; 2427 address start = __ pc(); 2428 __ enter(); 2429 __ cbnz(is_pre, skip); 2430 __ cache_wbsync(false); 2431 __ bind(skip); 2432 __ leave(); 2433 __ ret(lr); 2434 2435 return start; 2436 } 2437 2438 void generate_arraycopy_stubs() { 2439 address entry; 2440 address entry_jbyte_arraycopy; 2441 address entry_jshort_arraycopy; 2442 address entry_jint_arraycopy; 2443 address entry_oop_arraycopy; 2444 address entry_jlong_arraycopy; 2445 address entry_checkcast_arraycopy; 2446 2447 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2448 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2449 2450 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2451 2452 //*** jbyte 2453 // Always need aligned and unaligned versions 2454 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2455 "jbyte_disjoint_arraycopy"); 2456 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2457 &entry_jbyte_arraycopy, 2458 "jbyte_arraycopy"); 2459 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2460 "arrayof_jbyte_disjoint_arraycopy"); 2461 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2462 "arrayof_jbyte_arraycopy"); 2463 2464 //*** jshort 2465 // Always need aligned and unaligned versions 2466 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2467 "jshort_disjoint_arraycopy"); 2468 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2469 &entry_jshort_arraycopy, 2470 "jshort_arraycopy"); 2471 StubRoutines::_arrayof_jshort_disjoint_arraycopy = 
generate_disjoint_short_copy(true, &entry, 2472 "arrayof_jshort_disjoint_arraycopy"); 2473 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2474 "arrayof_jshort_arraycopy"); 2475 2476 //*** jint 2477 // Aligned versions 2478 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2479 "arrayof_jint_disjoint_arraycopy"); 2480 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2481 "arrayof_jint_arraycopy"); 2482 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2483 // entry_jint_arraycopy always points to the unaligned version 2484 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2485 "jint_disjoint_arraycopy"); 2486 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2487 &entry_jint_arraycopy, 2488 "jint_arraycopy"); 2489 2490 //*** jlong 2491 // It is always aligned 2492 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2493 "arrayof_jlong_disjoint_arraycopy"); 2494 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2495 "arrayof_jlong_arraycopy"); 2496 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2497 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2498 2499 //*** oops 2500 { 2501 // With compressed oops we need unaligned versions; notice that 2502 // we overwrite entry_oop_arraycopy. 2503 bool aligned = !UseCompressedOops; 2504 2505 StubRoutines::_arrayof_oop_disjoint_arraycopy 2506 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2507 /*dest_uninitialized*/false); 2508 StubRoutines::_arrayof_oop_arraycopy 2509 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2510 /*dest_uninitialized*/false); 2511 // Aligned versions without pre-barriers 2512 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2513 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2514 /*dest_uninitialized*/true); 2515 StubRoutines::_arrayof_oop_arraycopy_uninit 2516 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2517 /*dest_uninitialized*/true); 2518 } 2519 2520 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2521 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2522 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2523 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2524 2525 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2526 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2527 /*dest_uninitialized*/true); 2528 2529 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2530 entry_jbyte_arraycopy, 2531 entry_jshort_arraycopy, 2532 entry_jint_arraycopy, 2533 entry_jlong_arraycopy); 2534 2535 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2536 entry_jbyte_arraycopy, 2537 entry_jshort_arraycopy, 2538 entry_jint_arraycopy, 2539 entry_oop_arraycopy, 2540 entry_jlong_arraycopy, 2541 entry_checkcast_arraycopy); 2542 2543 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 
2544 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2545 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2546 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2547 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2548 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2549 } 2550 2551 void generate_math_stubs() { Unimplemented(); } 2552 2553 // Arguments: 2554 // 2555 // Inputs: 2556 // c_rarg0 - source byte array address 2557 // c_rarg1 - destination byte array address 2558 // c_rarg2 - K (key) in little endian int array 2559 // 2560 address generate_aescrypt_encryptBlock() { 2561 __ align(CodeEntryAlignment); 2562 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2563 2564 const Register from = c_rarg0; // source array address 2565 const Register to = c_rarg1; // destination array address 2566 const Register key = c_rarg2; // key array address 2567 const Register keylen = rscratch1; 2568 2569 address start = __ pc(); 2570 __ enter(); 2571 2572 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2573 2574 __ aesenc_loadkeys(key, keylen); 2575 __ aesecb_encrypt(from, to, keylen); 2576 2577 __ mov(r0, 0); 2578 2579 __ leave(); 2580 __ ret(lr); 2581 2582 return start; 2583 } 2584 2585 // Arguments: 2586 // 2587 // Inputs: 2588 // c_rarg0 - source byte array address 2589 // c_rarg1 - destination byte array address 2590 // c_rarg2 - K (key) in little endian int array 2591 // 2592 address generate_aescrypt_decryptBlock() { 2593 assert(UseAES, "need AES cryptographic extension support"); 2594 __ align(CodeEntryAlignment); 2595 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2596 Label L_doLast; 2597 2598 const Register from = c_rarg0; // source array address 2599 const Register to = c_rarg1; // destination array address 2600 const Register key = c_rarg2; // key array address 2601 const Register keylen = rscratch1; 2602 2603 address start = __ pc(); 2604 __ enter(); // required for proper stackwalking of RuntimeStub frame 2605 2606 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2607 2608 __ aesecb_decrypt(from, to, key, keylen); 2609 2610 __ mov(r0, 0); 2611 2612 __ leave(); 2613 __ ret(lr); 2614 2615 return start; 2616 } 2617 2618 // Arguments: 2619 // 2620 // Inputs: 2621 // c_rarg0 - source byte array address 2622 // c_rarg1 - destination byte array address 2623 // c_rarg2 - K (key) in little endian int array 2624 // c_rarg3 - r vector byte array address 2625 // c_rarg4 - input length 2626 // 2627 // Output: 2628 // x0 - input length 2629 // 2630 address generate_cipherBlockChaining_encryptAESCrypt() { 2631 assert(UseAES, "need AES cryptographic extension support"); 2632 __ align(CodeEntryAlignment); 2633 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2634 2635 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2636 2637 const Register from = c_rarg0; // source array address 2638 const Register to = c_rarg1; // destination array address 2639 const Register key = c_rarg2; // key array address 2640 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2641 // and left with the results of the last encryption block 2642 const Register len_reg = c_rarg4; // src len (must be multiple of 
blocksize 16) 2643 const Register keylen = rscratch1; 2644 2645 address start = __ pc(); 2646 2647 __ enter(); 2648 2649 __ movw(rscratch2, len_reg); 2650 2651 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2652 2653 __ ld1(v0, __ T16B, rvec); 2654 2655 __ cmpw(keylen, 52); 2656 __ br(Assembler::CC, L_loadkeys_44); 2657 __ br(Assembler::EQ, L_loadkeys_52); 2658 2659 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2660 __ rev32(v17, __ T16B, v17); 2661 __ rev32(v18, __ T16B, v18); 2662 __ BIND(L_loadkeys_52); 2663 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2664 __ rev32(v19, __ T16B, v19); 2665 __ rev32(v20, __ T16B, v20); 2666 __ BIND(L_loadkeys_44); 2667 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2668 __ rev32(v21, __ T16B, v21); 2669 __ rev32(v22, __ T16B, v22); 2670 __ rev32(v23, __ T16B, v23); 2671 __ rev32(v24, __ T16B, v24); 2672 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2673 __ rev32(v25, __ T16B, v25); 2674 __ rev32(v26, __ T16B, v26); 2675 __ rev32(v27, __ T16B, v27); 2676 __ rev32(v28, __ T16B, v28); 2677 __ ld1(v29, v30, v31, __ T16B, key); 2678 __ rev32(v29, __ T16B, v29); 2679 __ rev32(v30, __ T16B, v30); 2680 __ rev32(v31, __ T16B, v31); 2681 2682 __ BIND(L_aes_loop); 2683 __ ld1(v1, __ T16B, __ post(from, 16)); 2684 __ eor(v0, __ T16B, v0, v1); 2685 2686 __ br(Assembler::CC, L_rounds_44); 2687 __ br(Assembler::EQ, L_rounds_52); 2688 2689 __ aese(v0, v17); __ aesmc(v0, v0); 2690 __ aese(v0, v18); __ aesmc(v0, v0); 2691 __ BIND(L_rounds_52); 2692 __ aese(v0, v19); __ aesmc(v0, v0); 2693 __ aese(v0, v20); __ aesmc(v0, v0); 2694 __ BIND(L_rounds_44); 2695 __ aese(v0, v21); __ aesmc(v0, v0); 2696 __ aese(v0, v22); __ aesmc(v0, v0); 2697 __ aese(v0, v23); __ aesmc(v0, v0); 2698 __ aese(v0, v24); __ aesmc(v0, v0); 2699 __ aese(v0, v25); __ aesmc(v0, v0); 2700 __ aese(v0, v26); __ aesmc(v0, v0); 2701 __ aese(v0, v27); __ aesmc(v0, v0); 2702 __ aese(v0, v28); __ aesmc(v0, v0); 2703 __ aese(v0, v29); __ aesmc(v0, v0); 2704 __ aese(v0, v30); 2705 __ eor(v0, __ T16B, v0, v31); 2706 2707 __ st1(v0, __ T16B, __ post(to, 16)); 2708 2709 __ subw(len_reg, len_reg, 16); 2710 __ cbnzw(len_reg, L_aes_loop); 2711 2712 __ st1(v0, __ T16B, rvec); 2713 2714 __ mov(r0, rscratch2); 2715 2716 __ leave(); 2717 __ ret(lr); 2718 2719 return start; 2720 } 2721 2722 // Arguments: 2723 // 2724 // Inputs: 2725 // c_rarg0 - source byte array address 2726 // c_rarg1 - destination byte array address 2727 // c_rarg2 - K (key) in little endian int array 2728 // c_rarg3 - r vector byte array address 2729 // c_rarg4 - input length 2730 // 2731 // Output: 2732 // r0 - input length 2733 // 2734 address generate_cipherBlockChaining_decryptAESCrypt() { 2735 assert(UseAES, "need AES cryptographic extension support"); 2736 __ align(CodeEntryAlignment); 2737 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2738 2739 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2740 2741 const Register from = c_rarg0; // source array address 2742 const Register to = c_rarg1; // destination array address 2743 const Register key = c_rarg2; // key array address 2744 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2745 // and left with the results of the last encryption block 2746 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2747 const Register keylen = rscratch1; 2748 2749 address start = __ pc(); 2750 2751 __ enter(); 
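    // CBC decryption computed per 16-byte block below, roughly:
    //
    //   P[i] = AES_decrypt(C[i], key) ^ C[i-1]    // C[-1] is the IV passed in 'rvec'
    //
    // v2 carries C[i-1] across iterations and v1 preserves the current
    // ciphertext block while v0 is decrypted in place; on exit the last
    // ciphertext block is stored back to 'rvec' for a subsequent call.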
2752 2753 __ movw(rscratch2, len_reg); 2754 2755 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2756 2757 __ ld1(v2, __ T16B, rvec); 2758 2759 __ ld1(v31, __ T16B, __ post(key, 16)); 2760 __ rev32(v31, __ T16B, v31); 2761 2762 __ cmpw(keylen, 52); 2763 __ br(Assembler::CC, L_loadkeys_44); 2764 __ br(Assembler::EQ, L_loadkeys_52); 2765 2766 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2767 __ rev32(v17, __ T16B, v17); 2768 __ rev32(v18, __ T16B, v18); 2769 __ BIND(L_loadkeys_52); 2770 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2771 __ rev32(v19, __ T16B, v19); 2772 __ rev32(v20, __ T16B, v20); 2773 __ BIND(L_loadkeys_44); 2774 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2775 __ rev32(v21, __ T16B, v21); 2776 __ rev32(v22, __ T16B, v22); 2777 __ rev32(v23, __ T16B, v23); 2778 __ rev32(v24, __ T16B, v24); 2779 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2780 __ rev32(v25, __ T16B, v25); 2781 __ rev32(v26, __ T16B, v26); 2782 __ rev32(v27, __ T16B, v27); 2783 __ rev32(v28, __ T16B, v28); 2784 __ ld1(v29, v30, __ T16B, key); 2785 __ rev32(v29, __ T16B, v29); 2786 __ rev32(v30, __ T16B, v30); 2787 2788 __ BIND(L_aes_loop); 2789 __ ld1(v0, __ T16B, __ post(from, 16)); 2790 __ orr(v1, __ T16B, v0, v0); 2791 2792 __ br(Assembler::CC, L_rounds_44); 2793 __ br(Assembler::EQ, L_rounds_52); 2794 2795 __ aesd(v0, v17); __ aesimc(v0, v0); 2796 __ aesd(v0, v18); __ aesimc(v0, v0); 2797 __ BIND(L_rounds_52); 2798 __ aesd(v0, v19); __ aesimc(v0, v0); 2799 __ aesd(v0, v20); __ aesimc(v0, v0); 2800 __ BIND(L_rounds_44); 2801 __ aesd(v0, v21); __ aesimc(v0, v0); 2802 __ aesd(v0, v22); __ aesimc(v0, v0); 2803 __ aesd(v0, v23); __ aesimc(v0, v0); 2804 __ aesd(v0, v24); __ aesimc(v0, v0); 2805 __ aesd(v0, v25); __ aesimc(v0, v0); 2806 __ aesd(v0, v26); __ aesimc(v0, v0); 2807 __ aesd(v0, v27); __ aesimc(v0, v0); 2808 __ aesd(v0, v28); __ aesimc(v0, v0); 2809 __ aesd(v0, v29); __ aesimc(v0, v0); 2810 __ aesd(v0, v30); 2811 __ eor(v0, __ T16B, v0, v31); 2812 __ eor(v0, __ T16B, v0, v2); 2813 2814 __ st1(v0, __ T16B, __ post(to, 16)); 2815 __ orr(v2, __ T16B, v1, v1); 2816 2817 __ subw(len_reg, len_reg, 16); 2818 __ cbnzw(len_reg, L_aes_loop); 2819 2820 __ st1(v2, __ T16B, rvec); 2821 2822 __ mov(r0, rscratch2); 2823 2824 __ leave(); 2825 __ ret(lr); 2826 2827 return start; 2828 } 2829 2830 // CTR AES crypt. 2831 // Arguments: 2832 // 2833 // Inputs: 2834 // c_rarg0 - source byte array address 2835 // c_rarg1 - destination byte array address 2836 // c_rarg2 - K (key) in little endian int array 2837 // c_rarg3 - counter vector byte array address 2838 // c_rarg4 - input length 2839 // c_rarg5 - saved encryptedCounter start 2840 // c_rarg6 - saved used length 2841 // 2842 // Output: 2843 // r0 - input length 2844 // 2845 address generate_counterMode_AESCrypt() { 2846 const Register in = c_rarg0; 2847 const Register out = c_rarg1; 2848 const Register key = c_rarg2; 2849 const Register counter = c_rarg3; 2850 const Register saved_len = c_rarg4, len = r10; 2851 const Register saved_encrypted_ctr = c_rarg5; 2852 const Register used_ptr = c_rarg6, used = r12; 2853 2854 const Register offset = r7; 2855 const Register keylen = r11; 2856 2857 const unsigned char block_size = 16; 2858 const int bulk_width = 4; 2859 // NB: bulk_width can be 4 or 8. 
8 gives slightly faster 2860 // performance with larger data sizes, but it also means that the 2861 // fast path isn't used until you have at least 8 blocks, and up 2862 // to 127 bytes of data will be executed on the slow path. For 2863 // that reason, and also so as not to blow away too much icache, 4 2864 // blocks seems like a sensible compromise. 2865 2866 // Algorithm: 2867 // 2868 // if (len == 0) { 2869 // goto DONE; 2870 // } 2871 // int result = len; 2872 // do { 2873 // if (used >= blockSize) { 2874 // if (len >= bulk_width * blockSize) { 2875 // CTR_large_block(); 2876 // if (len == 0) 2877 // goto DONE; 2878 // } 2879 // for (;;) { 2880 // 16ByteVector v0 = counter; 2881 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 2882 // used = 0; 2883 // if (len < blockSize) 2884 // break; /* goto NEXT */ 2885 // 16ByteVector v1 = load16Bytes(in, offset); 2886 // v1 = v1 ^ encryptedCounter; 2887 // store16Bytes(out, offset); 2888 // used = blockSize; 2889 // offset += blockSize; 2890 // len -= blockSize; 2891 // if (len == 0) 2892 // goto DONE; 2893 // } 2894 // } 2895 // NEXT: 2896 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 2897 // len--; 2898 // } while (len != 0); 2899 // DONE: 2900 // return result; 2901 // 2902 // CTR_large_block() 2903 // Wide bulk encryption of whole blocks. 2904 2905 __ align(CodeEntryAlignment); 2906 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 2907 const address start = __ pc(); 2908 __ enter(); 2909 2910 Label DONE, CTR_large_block, large_block_return; 2911 __ ldrw(used, Address(used_ptr)); 2912 __ cbzw(saved_len, DONE); 2913 2914 __ mov(len, saved_len); 2915 __ mov(offset, 0); 2916 2917 // Compute #rounds for AES based on the length of the key array 2918 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2919 2920 __ aesenc_loadkeys(key, keylen); 2921 2922 { 2923 Label L_CTR_loop, NEXT; 2924 2925 __ bind(L_CTR_loop); 2926 2927 __ cmp(used, block_size); 2928 __ br(__ LO, NEXT); 2929 2930 // Maybe we have a lot of data 2931 __ subsw(rscratch1, len, bulk_width * block_size); 2932 __ br(__ HS, CTR_large_block); 2933 __ BIND(large_block_return); 2934 __ cbzw(len, DONE); 2935 2936 // Setup the counter 2937 __ movi(v4, __ T4S, 0); 2938 __ movi(v5, __ T4S, 1); 2939 __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 } 2940 2941 __ ld1(v0, __ T16B, counter); // Load the counter into v0 2942 __ rev32(v16, __ T16B, v0); 2943 __ addv(v16, __ T4S, v16, v4); 2944 __ rev32(v16, __ T16B, v16); 2945 __ st1(v16, __ T16B, counter); // Save the incremented counter back 2946 2947 { 2948 // We have fewer than bulk_width blocks of data left. Encrypt 2949 // them one by one until there is less than a full block 2950 // remaining, being careful to save both the encrypted counter 2951 // and the counter. 2952 2953 Label inner_loop; 2954 __ bind(inner_loop); 2955 // Counter to encrypt is in v0 2956 __ aesecb_encrypt(noreg, noreg, keylen); 2957 __ st1(v0, __ T16B, saved_encrypted_ctr); 2958 2959 // Do we have a remaining full block? 
2960 2961 __ mov(used, 0); 2962 __ cmp(len, block_size); 2963 __ br(__ LO, NEXT); 2964 2965 // Yes, we have a full block 2966 __ ldrq(v1, Address(in, offset)); 2967 __ eor(v1, __ T16B, v1, v0); 2968 __ strq(v1, Address(out, offset)); 2969 __ mov(used, block_size); 2970 __ add(offset, offset, block_size); 2971 2972 __ subw(len, len, block_size); 2973 __ cbzw(len, DONE); 2974 2975 // Increment the counter, store it back 2976 __ orr(v0, __ T16B, v16, v16); 2977 __ rev32(v16, __ T16B, v16); 2978 __ addv(v16, __ T4S, v16, v4); 2979 __ rev32(v16, __ T16B, v16); 2980 __ st1(v16, __ T16B, counter); // Save the incremented counter back 2981 2982 __ b(inner_loop); 2983 } 2984 2985 __ BIND(NEXT); 2986 2987 // Encrypt a single byte, and loop. 2988 // We expect this to be a rare event. 2989 __ ldrb(rscratch1, Address(in, offset)); 2990 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 2991 __ eor(rscratch1, rscratch1, rscratch2); 2992 __ strb(rscratch1, Address(out, offset)); 2993 __ add(offset, offset, 1); 2994 __ add(used, used, 1); 2995 __ subw(len, len,1); 2996 __ cbnzw(len, L_CTR_loop); 2997 } 2998 2999 __ bind(DONE); 3000 __ strw(used, Address(used_ptr)); 3001 __ mov(r0, saved_len); 3002 3003 __ leave(); // required for proper stackwalking of RuntimeStub frame 3004 __ ret(lr); 3005 3006 // Bulk encryption 3007 3008 __ BIND (CTR_large_block); 3009 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3010 3011 if (bulk_width == 8) { 3012 __ sub(sp, sp, 4 * 16); 3013 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3014 } 3015 __ sub(sp, sp, 4 * 16); 3016 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3017 RegSet saved_regs = (RegSet::of(in, out, offset) 3018 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3019 __ push(saved_regs, sp); 3020 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3021 __ add(in, in, offset); 3022 __ add(out, out, offset); 3023 3024 // Keys should already be loaded into the correct registers 3025 3026 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3027 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3028 3029 // AES/CTR loop 3030 { 3031 Label L_CTR_loop; 3032 __ BIND(L_CTR_loop); 3033 3034 // Setup the counters 3035 __ movi(v8, __ T4S, 0); 3036 __ movi(v9, __ T4S, 1); 3037 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3038 3039 for (FloatRegister f = v0; f < v0 + bulk_width; f++) { 3040 __ rev32(f, __ T16B, v16); 3041 __ addv(v16, __ T4S, v16, v8); 3042 } 3043 3044 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3045 3046 // Encrypt the counters 3047 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3048 3049 if (bulk_width == 8) { 3050 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3051 } 3052 3053 // XOR the encrypted counters with the inputs 3054 for (int i = 0; i < bulk_width; i++) { 3055 __ eor(v0 + i, __ T16B, v0 + i, v8 + i); 3056 } 3057 3058 // Write the encrypted data 3059 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3060 if (bulk_width == 8) { 3061 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3062 } 3063 3064 __ subw(len, len, 16 * bulk_width); 3065 __ cbnzw(len, L_CTR_loop); 3066 } 3067 3068 // Save the counter back where it goes 3069 __ rev32(v16, __ T16B, v16); 3070 __ st1(v16, __ T16B, counter); 3071 3072 __ pop(saved_regs, sp); 3073 3074 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3075 if (bulk_width == 8) { 3076 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3077 } 3078 3079 __ andr(rscratch1, len, -16 
* bulk_width); 3080 __ sub(len, len, rscratch1); 3081 __ add(offset, offset, rscratch1); 3082 __ mov(used, 16); 3083 __ strw(used, Address(used_ptr)); 3084 __ b(large_block_return); 3085 3086 return start; 3087 } 3088 3089 // Vector AES Galois Counter Mode implementation. Parameters: 3090 // 3091 // in = c_rarg0 3092 // len = c_rarg1 3093 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3094 // out = c_rarg3 3095 // key = c_rarg4 3096 // state = c_rarg5 - GHASH.state 3097 // subkeyHtbl = c_rarg6 - powers of H 3098 // counter = c_rarg7 - 16 bytes of CTR 3099 // return - number of processed bytes 3100 address generate_galoisCounterMode_AESCrypt() { 3101 address ghash_polynomial = __ pc(); 3102 __ emit_int64(0x87); // The low-order bits of the field 3103 // polynomial (i.e. p = z^7+z^2+z+1) 3104 // repeated in the low and high parts of a 3105 // 128-bit vector 3106 __ emit_int64(0x87); 3107 3108 __ align(CodeEntryAlignment); 3109 StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt"); 3110 address start = __ pc(); 3111 __ enter(); 3112 3113 const Register in = c_rarg0; 3114 const Register len = c_rarg1; 3115 const Register ct = c_rarg2; 3116 const Register out = c_rarg3; 3117 // and updated with the incremented counter in the end 3118 3119 const Register key = c_rarg4; 3120 const Register state = c_rarg5; 3121 3122 const Register subkeyHtbl = c_rarg6; 3123 3124 const Register counter = c_rarg7; 3125 3126 const Register keylen = r10; 3127 // Save state before entering routine 3128 __ sub(sp, sp, 4 * 16); 3129 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3130 __ sub(sp, sp, 4 * 16); 3131 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3132 3133 // __ andr(len, len, -512); 3134 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3135 __ str(len, __ pre(sp, -2 * wordSize)); 3136 3137 Label DONE; 3138 __ cbz(len, DONE); 3139 3140 // Compute #rounds for AES based on the length of the key array 3141 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3142 3143 __ aesenc_loadkeys(key, keylen); 3144 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3145 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3146 3147 // AES/CTR loop 3148 { 3149 Label L_CTR_loop; 3150 __ BIND(L_CTR_loop); 3151 3152 // Setup the counters 3153 __ movi(v8, __ T4S, 0); 3154 __ movi(v9, __ T4S, 1); 3155 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3156 for (FloatRegister f = v0; f < v8; f++) { 3157 __ rev32(f, __ T16B, v16); 3158 __ addv(v16, __ T4S, v16, v8); 3159 } 3160 3161 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3162 3163 // Encrypt the counters 3164 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3165 3166 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3167 3168 // XOR the encrypted counters with the inputs 3169 for (int i = 0; i < 8; i++) { 3170 __ eor(v0 + i, __ T16B, v0 + i, v8 + i); 3171 } 3172 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3173 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3174 3175 __ subw(len, len, 16 * 8); 3176 __ cbnzw(len, L_CTR_loop); 3177 } 3178 3179 __ rev32(v16, __ T16B, v16); 3180 __ st1(v16, __ T16B, counter); 3181 3182 __ ldr(len, Address(sp)); 3183 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3184 3185 // GHASH/CTR loop 3186 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3187 len, /*unrolls*/4); 3188 3189 #ifdef ASSERT 
3190 { Label L; 3191 __ cmp(len, (unsigned char)0); 3192 __ br(Assembler::EQ, L); 3193 __ stop("stubGenerator: abort"); 3194 __ bind(L); 3195 } 3196 #endif 3197 3198 __ bind(DONE); 3199 // Return the number of bytes processed 3200 __ ldr(r0, __ post(sp, 2 * wordSize)); 3201 3202 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3203 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3204 3205 __ leave(); // required for proper stackwalking of RuntimeStub frame 3206 __ ret(lr); 3207 return start; 3208 } 3209 3210 // Arguments: 3211 // 3212 // Inputs: 3213 // c_rarg0 - byte[] source+offset 3214 // c_rarg1 - int[] SHA.state 3215 // c_rarg2 - int offset 3216 // c_rarg3 - int limit 3217 // 3218 address generate_md5_implCompress(bool multi_block, const char *name) { 3219 __ align(CodeEntryAlignment); 3220 StubCodeMark mark(this, "StubRoutines", name); 3221 address start = __ pc(); 3222 3223 Register buf = c_rarg0; 3224 Register state = c_rarg1; 3225 Register ofs = c_rarg2; 3226 Register limit = c_rarg3; 3227 Register a = r4; 3228 Register b = r5; 3229 Register c = r6; 3230 Register d = r7; 3231 Register rscratch3 = r10; 3232 Register rscratch4 = r11; 3233 3234 Label keys; 3235 Label md5_loop; 3236 3237 __ BIND(md5_loop); 3238 3239 // Save hash values for addition after rounds 3240 __ ldrw(a, Address(state, 0)); 3241 __ ldrw(b, Address(state, 4)); 3242 __ ldrw(c, Address(state, 8)); 3243 __ ldrw(d, Address(state, 12)); 3244 3245 #define FF(r1, r2, r3, r4, k, s, t) \ 3246 __ eorw(rscratch3, r3, r4); \ 3247 __ movw(rscratch2, t); \ 3248 __ andw(rscratch3, rscratch3, r2); \ 3249 __ addw(rscratch4, r1, rscratch2); \ 3250 __ ldrw(rscratch1, Address(buf, k*4)); \ 3251 __ eorw(rscratch3, rscratch3, r4); \ 3252 __ addw(rscratch3, rscratch3, rscratch1); \ 3253 __ addw(rscratch3, rscratch3, rscratch4); \ 3254 __ rorw(rscratch2, rscratch3, 32 - s); \ 3255 __ addw(r1, rscratch2, r2); 3256 3257 #define GG(r1, r2, r3, r4, k, s, t) \ 3258 __ eorw(rscratch2, r2, r3); \ 3259 __ ldrw(rscratch1, Address(buf, k*4)); \ 3260 __ andw(rscratch3, rscratch2, r4); \ 3261 __ movw(rscratch2, t); \ 3262 __ eorw(rscratch3, rscratch3, r3); \ 3263 __ addw(rscratch4, r1, rscratch2); \ 3264 __ addw(rscratch3, rscratch3, rscratch1); \ 3265 __ addw(rscratch3, rscratch3, rscratch4); \ 3266 __ rorw(rscratch2, rscratch3, 32 - s); \ 3267 __ addw(r1, rscratch2, r2); 3268 3269 #define HH(r1, r2, r3, r4, k, s, t) \ 3270 __ eorw(rscratch3, r3, r4); \ 3271 __ movw(rscratch2, t); \ 3272 __ addw(rscratch4, r1, rscratch2); \ 3273 __ ldrw(rscratch1, Address(buf, k*4)); \ 3274 __ eorw(rscratch3, rscratch3, r2); \ 3275 __ addw(rscratch3, rscratch3, rscratch1); \ 3276 __ addw(rscratch3, rscratch3, rscratch4); \ 3277 __ rorw(rscratch2, rscratch3, 32 - s); \ 3278 __ addw(r1, rscratch2, r2); 3279 3280 #define II(r1, r2, r3, r4, k, s, t) \ 3281 __ movw(rscratch3, t); \ 3282 __ ornw(rscratch2, r2, r4); \ 3283 __ addw(rscratch4, r1, rscratch3); \ 3284 __ ldrw(rscratch1, Address(buf, k*4)); \ 3285 __ eorw(rscratch3, rscratch2, r3); \ 3286 __ addw(rscratch3, rscratch3, rscratch1); \ 3287 __ addw(rscratch3, rscratch3, rscratch4); \ 3288 __ rorw(rscratch2, rscratch3, 32 - s); \ 3289 __ addw(r1, rscratch2, r2); 3290 3291 // Round 1 3292 FF(a, b, c, d, 0, 7, 0xd76aa478) 3293 FF(d, a, b, c, 1, 12, 0xe8c7b756) 3294 FF(c, d, a, b, 2, 17, 0x242070db) 3295 FF(b, c, d, a, 3, 22, 0xc1bdceee) 3296 FF(a, b, c, d, 4, 7, 0xf57c0faf) 3297 FF(d, a, b, c, 5, 12, 0x4787c62a) 3298 FF(c, d, a, b, 6, 17, 0xa8304613) 3299 FF(b, c, d, a, 7, 22, 0xfd469501) 3300 FF(a, 
b, c, d, 8, 7, 0x698098d8) 3301 FF(d, a, b, c, 9, 12, 0x8b44f7af) 3302 FF(c, d, a, b, 10, 17, 0xffff5bb1) 3303 FF(b, c, d, a, 11, 22, 0x895cd7be) 3304 FF(a, b, c, d, 12, 7, 0x6b901122) 3305 FF(d, a, b, c, 13, 12, 0xfd987193) 3306 FF(c, d, a, b, 14, 17, 0xa679438e) 3307 FF(b, c, d, a, 15, 22, 0x49b40821) 3308 3309 // Round 2 3310 GG(a, b, c, d, 1, 5, 0xf61e2562) 3311 GG(d, a, b, c, 6, 9, 0xc040b340) 3312 GG(c, d, a, b, 11, 14, 0x265e5a51) 3313 GG(b, c, d, a, 0, 20, 0xe9b6c7aa) 3314 GG(a, b, c, d, 5, 5, 0xd62f105d) 3315 GG(d, a, b, c, 10, 9, 0x02441453) 3316 GG(c, d, a, b, 15, 14, 0xd8a1e681) 3317 GG(b, c, d, a, 4, 20, 0xe7d3fbc8) 3318 GG(a, b, c, d, 9, 5, 0x21e1cde6) 3319 GG(d, a, b, c, 14, 9, 0xc33707d6) 3320 GG(c, d, a, b, 3, 14, 0xf4d50d87) 3321 GG(b, c, d, a, 8, 20, 0x455a14ed) 3322 GG(a, b, c, d, 13, 5, 0xa9e3e905) 3323 GG(d, a, b, c, 2, 9, 0xfcefa3f8) 3324 GG(c, d, a, b, 7, 14, 0x676f02d9) 3325 GG(b, c, d, a, 12, 20, 0x8d2a4c8a) 3326 3327 // Round 3 3328 HH(a, b, c, d, 5, 4, 0xfffa3942) 3329 HH(d, a, b, c, 8, 11, 0x8771f681) 3330 HH(c, d, a, b, 11, 16, 0x6d9d6122) 3331 HH(b, c, d, a, 14, 23, 0xfde5380c) 3332 HH(a, b, c, d, 1, 4, 0xa4beea44) 3333 HH(d, a, b, c, 4, 11, 0x4bdecfa9) 3334 HH(c, d, a, b, 7, 16, 0xf6bb4b60) 3335 HH(b, c, d, a, 10, 23, 0xbebfbc70) 3336 HH(a, b, c, d, 13, 4, 0x289b7ec6) 3337 HH(d, a, b, c, 0, 11, 0xeaa127fa) 3338 HH(c, d, a, b, 3, 16, 0xd4ef3085) 3339 HH(b, c, d, a, 6, 23, 0x04881d05) 3340 HH(a, b, c, d, 9, 4, 0xd9d4d039) 3341 HH(d, a, b, c, 12, 11, 0xe6db99e5) 3342 HH(c, d, a, b, 15, 16, 0x1fa27cf8) 3343 HH(b, c, d, a, 2, 23, 0xc4ac5665) 3344 3345 // Round 4 3346 II(a, b, c, d, 0, 6, 0xf4292244) 3347 II(d, a, b, c, 7, 10, 0x432aff97) 3348 II(c, d, a, b, 14, 15, 0xab9423a7) 3349 II(b, c, d, a, 5, 21, 0xfc93a039) 3350 II(a, b, c, d, 12, 6, 0x655b59c3) 3351 II(d, a, b, c, 3, 10, 0x8f0ccc92) 3352 II(c, d, a, b, 10, 15, 0xffeff47d) 3353 II(b, c, d, a, 1, 21, 0x85845dd1) 3354 II(a, b, c, d, 8, 6, 0x6fa87e4f) 3355 II(d, a, b, c, 15, 10, 0xfe2ce6e0) 3356 II(c, d, a, b, 6, 15, 0xa3014314) 3357 II(b, c, d, a, 13, 21, 0x4e0811a1) 3358 II(a, b, c, d, 4, 6, 0xf7537e82) 3359 II(d, a, b, c, 11, 10, 0xbd3af235) 3360 II(c, d, a, b, 2, 15, 0x2ad7d2bb) 3361 II(b, c, d, a, 9, 21, 0xeb86d391) 3362 3363 #undef FF 3364 #undef GG 3365 #undef HH 3366 #undef II 3367 3368 // write hash values back in the correct order 3369 __ ldrw(rscratch1, Address(state, 0)); 3370 __ addw(rscratch1, rscratch1, a); 3371 __ strw(rscratch1, Address(state, 0)); 3372 3373 __ ldrw(rscratch2, Address(state, 4)); 3374 __ addw(rscratch2, rscratch2, b); 3375 __ strw(rscratch2, Address(state, 4)); 3376 3377 __ ldrw(rscratch3, Address(state, 8)); 3378 __ addw(rscratch3, rscratch3, c); 3379 __ strw(rscratch3, Address(state, 8)); 3380 3381 __ ldrw(rscratch4, Address(state, 12)); 3382 __ addw(rscratch4, rscratch4, d); 3383 __ strw(rscratch4, Address(state, 12)); 3384 3385 if (multi_block) { 3386 __ add(buf, buf, 64); 3387 __ add(ofs, ofs, 64); 3388 __ cmp(ofs, limit); 3389 __ br(Assembler::LE, md5_loop); 3390 __ mov(c_rarg0, ofs); // return ofs 3391 } 3392 3393 __ ret(lr); 3394 3395 return start; 3396 } 3397 3398 // Arguments: 3399 // 3400 // Inputs: 3401 // c_rarg0 - byte[] source+offset 3402 // c_rarg1 - int[] SHA.state 3403 // c_rarg2 - int offset 3404 // c_rarg3 - int limit 3405 // 3406 address generate_sha1_implCompress(bool multi_block, const char *name) { 3407 __ align(CodeEntryAlignment); 3408 StubCodeMark mark(this, "StubRoutines", name); 3409 address start = __ pc(); 3410 3411 Register buf = c_rarg0; 3412 
Register state = c_rarg1; 3413 Register ofs = c_rarg2; 3414 Register limit = c_rarg3; 3415 3416 Label keys; 3417 Label sha1_loop; 3418 3419 // load the keys into v0..v3 3420 __ adr(rscratch1, keys); 3421 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3422 // load 5 words state into v6, v7 3423 __ ldrq(v6, Address(state, 0)); 3424 __ ldrs(v7, Address(state, 16)); 3425 3426 3427 __ BIND(sha1_loop); 3428 // load 64 bytes of data into v16..v19 3429 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3430 __ rev32(v16, __ T16B, v16); 3431 __ rev32(v17, __ T16B, v17); 3432 __ rev32(v18, __ T16B, v18); 3433 __ rev32(v19, __ T16B, v19); 3434 3435 // do the sha1 3436 __ addv(v4, __ T4S, v16, v0); 3437 __ orr(v20, __ T16B, v6, v6); 3438 3439 FloatRegister d0 = v16; 3440 FloatRegister d1 = v17; 3441 FloatRegister d2 = v18; 3442 FloatRegister d3 = v19; 3443 3444 for (int round = 0; round < 20; round++) { 3445 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3446 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3447 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3448 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3449 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3450 3451 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3452 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3453 __ sha1h(tmp2, __ T4S, v20); 3454 if (round < 5) 3455 __ sha1c(v20, __ T4S, tmp3, tmp4); 3456 else if (round < 10 || round >= 15) 3457 __ sha1p(v20, __ T4S, tmp3, tmp4); 3458 else 3459 __ sha1m(v20, __ T4S, tmp3, tmp4); 3460 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3461 3462 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3463 } 3464 3465 __ addv(v7, __ T2S, v7, v21); 3466 __ addv(v6, __ T4S, v6, v20); 3467 3468 if (multi_block) { 3469 __ add(ofs, ofs, 64); 3470 __ cmp(ofs, limit); 3471 __ br(Assembler::LE, sha1_loop); 3472 __ mov(c_rarg0, ofs); // return ofs 3473 } 3474 3475 __ strq(v6, Address(state, 0)); 3476 __ strs(v7, Address(state, 16)); 3477 3478 __ ret(lr); 3479 3480 __ bind(keys); 3481 __ emit_int32(0x5a827999); 3482 __ emit_int32(0x6ed9eba1); 3483 __ emit_int32(0x8f1bbcdc); 3484 __ emit_int32(0xca62c1d6); 3485 3486 return start; 3487 } 3488 3489 3490 // Arguments: 3491 // 3492 // Inputs: 3493 // c_rarg0 - byte[] source+offset 3494 // c_rarg1 - int[] SHA.state 3495 // c_rarg2 - int offset 3496 // c_rarg3 - int limit 3497 // 3498 address generate_sha256_implCompress(bool multi_block, const char *name) { 3499 static const uint32_t round_consts[64] = { 3500 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3501 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3502 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3503 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3504 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3505 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3506 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3507 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3508 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3509 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3510 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3511 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3512 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3513 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3514 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3515 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3516 }; 3517 __ align(CodeEntryAlignment); 3518 StubCodeMark mark(this, "StubRoutines", name); 3519 address start = __ pc(); 3520 3521 Register buf = c_rarg0; 3522 
Register state = c_rarg1; 3523 Register ofs = c_rarg2; 3524 Register limit = c_rarg3; 3525 3526 Label sha1_loop; 3527 3528 __ stpd(v8, v9, __ pre(sp, -32)); 3529 __ stpd(v10, v11, Address(sp, 16)); 3530 3531 // dga == v0 3532 // dgb == v1 3533 // dg0 == v2 3534 // dg1 == v3 3535 // dg2 == v4 3536 // t0 == v6 3537 // t1 == v7 3538 3539 // load 16 keys to v16..v31 3540 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3541 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3542 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3543 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3544 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3545 3546 // load 8 words (256 bits) state 3547 __ ldpq(v0, v1, state); 3548 3549 __ BIND(sha1_loop); 3550 // load 64 bytes of data into v8..v11 3551 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3552 __ rev32(v8, __ T16B, v8); 3553 __ rev32(v9, __ T16B, v9); 3554 __ rev32(v10, __ T16B, v10); 3555 __ rev32(v11, __ T16B, v11); 3556 3557 __ addv(v6, __ T4S, v8, v16); 3558 __ orr(v2, __ T16B, v0, v0); 3559 __ orr(v3, __ T16B, v1, v1); 3560 3561 FloatRegister d0 = v8; 3562 FloatRegister d1 = v9; 3563 FloatRegister d2 = v10; 3564 FloatRegister d3 = v11; 3565 3566 3567 for (int round = 0; round < 16; round++) { 3568 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3569 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3570 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3571 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3572 3573 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3574 __ orr(v4, __ T16B, v2, v2); 3575 if (round < 15) 3576 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3577 __ sha256h(v2, __ T4S, v3, tmp2); 3578 __ sha256h2(v3, __ T4S, v4, tmp2); 3579 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3580 3581 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3582 } 3583 3584 __ addv(v0, __ T4S, v0, v2); 3585 __ addv(v1, __ T4S, v1, v3); 3586 3587 if (multi_block) { 3588 __ add(ofs, ofs, 64); 3589 __ cmp(ofs, limit); 3590 __ br(Assembler::LE, sha1_loop); 3591 __ mov(c_rarg0, ofs); // return ofs 3592 } 3593 3594 __ ldpd(v10, v11, Address(sp, 16)); 3595 __ ldpd(v8, v9, __ post(sp, 32)); 3596 3597 __ stpq(v0, v1, state); 3598 3599 __ ret(lr); 3600 3601 return start; 3602 } 3603 3604 // Arguments: 3605 // 3606 // Inputs: 3607 // c_rarg0 - byte[] source+offset 3608 // c_rarg1 - int[] SHA.state 3609 // c_rarg2 - int offset 3610 // c_rarg3 - int limit 3611 // 3612 address generate_sha512_implCompress(bool multi_block, const char *name) { 3613 static const uint64_t round_consts[80] = { 3614 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3615 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3616 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3617 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3618 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3619 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3620 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3621 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3622 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3623 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3624 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3625 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3626 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3627 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 
0xA81A664BBC423001L, 3628 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3629 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3630 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3631 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3632 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3633 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3634 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3635 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3636 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3637 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3638 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3639 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3640 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3641 }; 3642 3643 // Double rounds for sha512. 3644 #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \ 3645 if (dr < 36) \ 3646 __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16)); \ 3647 __ addv(v5, __ T2D, v##rc0, v##in0); \ 3648 __ ext(v6, __ T16B, v##i2, v##i3, 8); \ 3649 __ ext(v5, __ T16B, v5, v5, 8); \ 3650 __ ext(v7, __ T16B, v##i1, v##i2, 8); \ 3651 __ addv(v##i3, __ T2D, v##i3, v5); \ 3652 if (dr < 32) { \ 3653 __ ext(v5, __ T16B, v##in3, v##in4, 8); \ 3654 __ sha512su0(v##in0, __ T2D, v##in1); \ 3655 } \ 3656 __ sha512h(v##i3, __ T2D, v6, v7); \ 3657 if (dr < 32) \ 3658 __ sha512su1(v##in0, __ T2D, v##in2, v5); \ 3659 __ addv(v##i4, __ T2D, v##i1, v##i3); \ 3660 __ sha512h2(v##i3, __ T2D, v##i1, v##i0); \ 3661 3662 __ align(CodeEntryAlignment); 3663 StubCodeMark mark(this, "StubRoutines", name); 3664 address start = __ pc(); 3665 3666 Register buf = c_rarg0; 3667 Register state = c_rarg1; 3668 Register ofs = c_rarg2; 3669 Register limit = c_rarg3; 3670 3671 __ stpd(v8, v9, __ pre(sp, -64)); 3672 __ stpd(v10, v11, Address(sp, 16)); 3673 __ stpd(v12, v13, Address(sp, 32)); 3674 __ stpd(v14, v15, Address(sp, 48)); 3675 3676 Label sha512_loop; 3677 3678 // load state 3679 __ ld1(v8, v9, v10, v11, __ T2D, state); 3680 3681 // load first 4 round constants 3682 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3683 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3684 3685 __ BIND(sha512_loop); 3686 // load 128B of data into v12..v19 3687 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3688 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3689 __ rev64(v12, __ T16B, v12); 3690 __ rev64(v13, __ T16B, v13); 3691 __ rev64(v14, __ T16B, v14); 3692 __ rev64(v15, __ T16B, v15); 3693 __ rev64(v16, __ T16B, v16); 3694 __ rev64(v17, __ T16B, v17); 3695 __ rev64(v18, __ T16B, v18); 3696 __ rev64(v19, __ T16B, v19); 3697 3698 __ mov(rscratch2, rscratch1); 3699 3700 __ mov(v0, __ T16B, v8); 3701 __ mov(v1, __ T16B, v9); 3702 __ mov(v2, __ T16B, v10); 3703 __ mov(v3, __ T16B, v11); 3704 3705 sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17); 3706 sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18); 3707 sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19); 3708 sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12); 3709 sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13); 3710 sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14); 3711 sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15); 3712 sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16); 3713 sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 
17); 3714 sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18); 3715 sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19); 3716 sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12); 3717 sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13); 3718 sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14); 3719 sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15); 3720 sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16); 3721 sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17); 3722 sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18); 3723 sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19); 3724 sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12); 3725 sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13); 3726 sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14); 3727 sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15); 3728 sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16); 3729 sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17); 3730 sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18); 3731 sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19); 3732 sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12); 3733 sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13); 3734 sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14); 3735 sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15); 3736 sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16); 3737 sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12, 0, 0, 0, 0); 3738 sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13, 0, 0, 0, 0); 3739 sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14, 0, 0, 0, 0); 3740 sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15, 0, 0, 0, 0); 3741 sha512_dround(36, 3, 0, 4, 2, 1, 24, 0, 16, 0, 0, 0, 0); 3742 sha512_dround(37, 2, 3, 1, 4, 0, 25, 0, 17, 0, 0, 0, 0); 3743 sha512_dround(38, 4, 2, 0, 1, 3, 26, 0, 18, 0, 0, 0, 0); 3744 sha512_dround(39, 1, 4, 3, 0, 2, 27, 0, 19, 0, 0, 0, 0); 3745 3746 __ addv(v8, __ T2D, v8, v0); 3747 __ addv(v9, __ T2D, v9, v1); 3748 __ addv(v10, __ T2D, v10, v2); 3749 __ addv(v11, __ T2D, v11, v3); 3750 3751 if (multi_block) { 3752 __ add(ofs, ofs, 128); 3753 __ cmp(ofs, limit); 3754 __ br(Assembler::LE, sha512_loop); 3755 __ mov(c_rarg0, ofs); // return ofs 3756 } 3757 3758 __ st1(v8, v9, v10, v11, __ T2D, state); 3759 3760 __ ldpd(v14, v15, Address(sp, 48)); 3761 __ ldpd(v12, v13, Address(sp, 32)); 3762 __ ldpd(v10, v11, Address(sp, 16)); 3763 __ ldpd(v8, v9, __ post(sp, 64)); 3764 3765 __ ret(lr); 3766 3767 return start; 3768 } 3769 3770 // Arguments: 3771 // 3772 // Inputs: 3773 // c_rarg0 - byte[] source+offset 3774 // c_rarg1 - byte[] SHA.state 3775 // c_rarg2 - int digest_length 3776 // c_rarg3 - int offset 3777 // c_rarg4 - int limit 3778 // 3779 address generate_sha3_implCompress(bool multi_block, const char *name) { 3780 static const uint64_t round_consts[24] = { 3781 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 3782 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 3783 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 3784 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 3785 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 3786 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 3787 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 3788 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 3789 }; 3790 3791 __ 
align(CodeEntryAlignment); 3792 StubCodeMark mark(this, "StubRoutines", name); 3793 address start = __ pc(); 3794 3795 Register buf = c_rarg0; 3796 Register state = c_rarg1; 3797 Register digest_length = c_rarg2; 3798 Register ofs = c_rarg3; 3799 Register limit = c_rarg4; 3800 3801 Label sha3_loop, rounds24_loop; 3802 Label sha3_512, sha3_384_or_224, sha3_256; 3803 3804 __ stpd(v8, v9, __ pre(sp, -64)); 3805 __ stpd(v10, v11, Address(sp, 16)); 3806 __ stpd(v12, v13, Address(sp, 32)); 3807 __ stpd(v14, v15, Address(sp, 48)); 3808 3809 // load state 3810 __ add(rscratch1, state, 32); 3811 __ ld1(v0, v1, v2, v3, __ T1D, state); 3812 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 3813 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 3814 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 3815 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 3816 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 3817 __ ld1(v24, __ T1D, rscratch1); 3818 3819 __ BIND(sha3_loop); 3820 3821 // 24 keccak rounds 3822 __ movw(rscratch2, 24); 3823 3824 // load round_constants base 3825 __ lea(rscratch1, ExternalAddress((address) round_consts)); 3826 3827 // load input 3828 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 3829 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 3830 __ eor(v0, __ T8B, v0, v25); 3831 __ eor(v1, __ T8B, v1, v26); 3832 __ eor(v2, __ T8B, v2, v27); 3833 __ eor(v3, __ T8B, v3, v28); 3834 __ eor(v4, __ T8B, v4, v29); 3835 __ eor(v5, __ T8B, v5, v30); 3836 __ eor(v6, __ T8B, v6, v31); 3837 3838 // digest_length == 64, SHA3-512 3839 __ tbnz(digest_length, 6, sha3_512); 3840 3841 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 3842 __ ld1(v29, v30, __ T8B, __ post(buf, 16)); 3843 __ eor(v7, __ T8B, v7, v25); 3844 __ eor(v8, __ T8B, v8, v26); 3845 __ eor(v9, __ T8B, v9, v27); 3846 __ eor(v10, __ T8B, v10, v28); 3847 __ eor(v11, __ T8B, v11, v29); 3848 __ eor(v12, __ T8B, v12, v30); 3849 3850 // digest_length == 28, SHA3-224; digest_length == 48, SHA3-384 3851 __ tbnz(digest_length, 4, sha3_384_or_224); 3852 3853 // SHA3-256 3854 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 3855 __ eor(v13, __ T8B, v13, v25); 3856 __ eor(v14, __ T8B, v14, v26); 3857 __ eor(v15, __ T8B, v15, v27); 3858 __ eor(v16, __ T8B, v16, v28); 3859 __ b(rounds24_loop); 3860 3861 __ BIND(sha3_384_or_224); 3862 __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? 
SHA-384 3863 3864 // SHA3-224 3865 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 3866 __ ld1(v29, __ T8B, __ post(buf, 8)); 3867 __ eor(v13, __ T8B, v13, v25); 3868 __ eor(v14, __ T8B, v14, v26); 3869 __ eor(v15, __ T8B, v15, v27); 3870 __ eor(v16, __ T8B, v16, v28); 3871 __ eor(v17, __ T8B, v17, v29); 3872 __ b(rounds24_loop); 3873 3874 __ BIND(sha3_512); 3875 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 3876 __ eor(v7, __ T8B, v7, v25); 3877 __ eor(v8, __ T8B, v8, v26); 3878 3879 __ BIND(rounds24_loop); 3880 __ subw(rscratch2, rscratch2, 1); 3881 3882 __ eor3(v29, __ T16B, v4, v9, v14); 3883 __ eor3(v26, __ T16B, v1, v6, v11); 3884 __ eor3(v28, __ T16B, v3, v8, v13); 3885 __ eor3(v25, __ T16B, v0, v5, v10); 3886 __ eor3(v27, __ T16B, v2, v7, v12); 3887 __ eor3(v29, __ T16B, v29, v19, v24); 3888 __ eor3(v26, __ T16B, v26, v16, v21); 3889 __ eor3(v28, __ T16B, v28, v18, v23); 3890 __ eor3(v25, __ T16B, v25, v15, v20); 3891 __ eor3(v27, __ T16B, v27, v17, v22); 3892 3893 __ rax1(v30, __ T2D, v29, v26); 3894 __ rax1(v26, __ T2D, v26, v28); 3895 __ rax1(v28, __ T2D, v28, v25); 3896 __ rax1(v25, __ T2D, v25, v27); 3897 __ rax1(v27, __ T2D, v27, v29); 3898 3899 __ eor(v0, __ T16B, v0, v30); 3900 __ xar(v29, __ T2D, v1, v25, (64 - 1)); 3901 __ xar(v1, __ T2D, v6, v25, (64 - 44)); 3902 __ xar(v6, __ T2D, v9, v28, (64 - 20)); 3903 __ xar(v9, __ T2D, v22, v26, (64 - 61)); 3904 __ xar(v22, __ T2D, v14, v28, (64 - 39)); 3905 __ xar(v14, __ T2D, v20, v30, (64 - 18)); 3906 __ xar(v31, __ T2D, v2, v26, (64 - 62)); 3907 __ xar(v2, __ T2D, v12, v26, (64 - 43)); 3908 __ xar(v12, __ T2D, v13, v27, (64 - 25)); 3909 __ xar(v13, __ T2D, v19, v28, (64 - 8)); 3910 __ xar(v19, __ T2D, v23, v27, (64 - 56)); 3911 __ xar(v23, __ T2D, v15, v30, (64 - 41)); 3912 __ xar(v15, __ T2D, v4, v28, (64 - 27)); 3913 __ xar(v28, __ T2D, v24, v28, (64 - 14)); 3914 __ xar(v24, __ T2D, v21, v25, (64 - 2)); 3915 __ xar(v8, __ T2D, v8, v27, (64 - 55)); 3916 __ xar(v4, __ T2D, v16, v25, (64 - 45)); 3917 __ xar(v16, __ T2D, v5, v30, (64 - 36)); 3918 __ xar(v5, __ T2D, v3, v27, (64 - 28)); 3919 __ xar(v27, __ T2D, v18, v27, (64 - 21)); 3920 __ xar(v3, __ T2D, v17, v26, (64 - 15)); 3921 __ xar(v25, __ T2D, v11, v25, (64 - 10)); 3922 __ xar(v26, __ T2D, v7, v26, (64 - 6)); 3923 __ xar(v30, __ T2D, v10, v30, (64 - 3)); 3924 3925 __ bcax(v20, __ T16B, v31, v22, v8); 3926 __ bcax(v21, __ T16B, v8, v23, v22); 3927 __ bcax(v22, __ T16B, v22, v24, v23); 3928 __ bcax(v23, __ T16B, v23, v31, v24); 3929 __ bcax(v24, __ T16B, v24, v8, v31); 3930 3931 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); 3932 3933 __ bcax(v17, __ T16B, v25, v19, v3); 3934 __ bcax(v18, __ T16B, v3, v15, v19); 3935 __ bcax(v19, __ T16B, v19, v16, v15); 3936 __ bcax(v15, __ T16B, v15, v25, v16); 3937 __ bcax(v16, __ T16B, v16, v3, v25); 3938 3939 __ bcax(v10, __ T16B, v29, v12, v26); 3940 __ bcax(v11, __ T16B, v26, v13, v12); 3941 __ bcax(v12, __ T16B, v12, v14, v13); 3942 __ bcax(v13, __ T16B, v13, v29, v14); 3943 __ bcax(v14, __ T16B, v14, v26, v29); 3944 3945 __ bcax(v7, __ T16B, v30, v9, v4); 3946 __ bcax(v8, __ T16B, v4, v5, v9); 3947 __ bcax(v9, __ T16B, v9, v6, v5); 3948 __ bcax(v5, __ T16B, v5, v30, v6); 3949 __ bcax(v6, __ T16B, v6, v4, v30); 3950 3951 __ bcax(v3, __ T16B, v27, v0, v28); 3952 __ bcax(v4, __ T16B, v28, v1, v0); 3953 __ bcax(v0, __ T16B, v0, v2, v1); 3954 __ bcax(v1, __ T16B, v1, v27, v2); 3955 __ bcax(v2, __ T16B, v2, v28, v27); 3956 3957 __ eor(v0, __ T16B, v0, v31); 3958 3959 __ cbnzw(rscratch2, rounds24_loop); 3960 3961 if (multi_block) { 
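    // Note on the block size used just below: SHA3's sponge capacity is twice the
    // digest length, so the byte rate absorbed per Keccak permutation is
    // 200 - 2 * digest_length (200 bytes == the 1600-bit Keccak state). As an
    // illustrative sketch only (hypothetical helper, not part of this stub):
    //
    //   // static int sha3_rate_bytes(int digest_length) {
    //   //   // SHA3-224 -> 144, SHA3-256 -> 136, SHA3-384 -> 104, SHA3-512 -> 72
    //   //   return 200 - 2 * digest_length;
    //   // }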
3962 // block_size = 200 - 2 * digest_length, ofs += block_size 3963 __ add(ofs, ofs, 200); 3964 __ sub(ofs, ofs, digest_length, Assembler::LSL, 1); 3965 3966 __ cmp(ofs, limit); 3967 __ br(Assembler::LE, sha3_loop); 3968 __ mov(c_rarg0, ofs); // return ofs 3969 } 3970 3971 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 3972 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 3973 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 3974 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 3975 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 3976 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 3977 __ st1(v24, __ T1D, state); 3978 3979 __ ldpd(v14, v15, Address(sp, 48)); 3980 __ ldpd(v12, v13, Address(sp, 32)); 3981 __ ldpd(v10, v11, Address(sp, 16)); 3982 __ ldpd(v8, v9, __ post(sp, 64)); 3983 3984 __ ret(lr); 3985 3986 return start; 3987 } 3988 3989 /** 3990 * Arguments: 3991 * 3992 * Inputs: 3993 * c_rarg0 - int crc 3994 * c_rarg1 - byte* buf 3995 * c_rarg2 - int length 3996 * 3997 * Output: 3998 * rax - int crc result 3999 */ 4000 address generate_updateBytesCRC32() { 4001 assert(UseCRC32Intrinsics, "what are we doing here?"); 4002 4003 __ align(CodeEntryAlignment); 4004 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4005 4006 address start = __ pc(); 4007 4008 const Register crc = c_rarg0; // crc 4009 const Register buf = c_rarg1; // source java byte array address 4010 const Register len = c_rarg2; // length 4011 const Register table0 = c_rarg3; // crc_table address 4012 const Register table1 = c_rarg4; 4013 const Register table2 = c_rarg5; 4014 const Register table3 = c_rarg6; 4015 const Register tmp3 = c_rarg7; 4016 4017 BLOCK_COMMENT("Entry:"); 4018 __ enter(); // required for proper stackwalking of RuntimeStub frame 4019 4020 __ kernel_crc32(crc, buf, len, 4021 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4022 4023 __ leave(); // required for proper stackwalking of RuntimeStub frame 4024 __ ret(lr); 4025 4026 return start; 4027 } 4028 4029 /** 4030 * Arguments: 4031 * 4032 * Inputs: 4033 * c_rarg0 - int crc 4034 * c_rarg1 - byte* buf 4035 * c_rarg2 - int length 4036 * c_rarg3 - int* table 4037 * 4038 * Output: 4039 * r0 - int crc result 4040 */ 4041 address generate_updateBytesCRC32C() { 4042 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4043 4044 __ align(CodeEntryAlignment); 4045 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4046 4047 address start = __ pc(); 4048 4049 const Register crc = c_rarg0; // crc 4050 const Register buf = c_rarg1; // source java byte array address 4051 const Register len = c_rarg2; // length 4052 const Register table0 = c_rarg3; // crc_table address 4053 const Register table1 = c_rarg4; 4054 const Register table2 = c_rarg5; 4055 const Register table3 = c_rarg6; 4056 const Register tmp3 = c_rarg7; 4057 4058 BLOCK_COMMENT("Entry:"); 4059 __ enter(); // required for proper stackwalking of RuntimeStub frame 4060 4061 __ kernel_crc32c(crc, buf, len, 4062 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4063 4064 __ leave(); // required for proper stackwalking of RuntimeStub frame 4065 __ ret(lr); 4066 4067 return start; 4068 } 4069 4070 /*** 4071 * Arguments: 4072 * 4073 * Inputs: 4074 * c_rarg0 - int adler 4075 * c_rarg1 - byte* buff 4076 * c_rarg2 - int len 4077 * 4078 * Output: 4079 * c_rarg0 - int adler result 4080 */ 4081 address generate_updateBytesAdler32() { 4082 __ align(CodeEntryAlignment); 4083 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 
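    // For orientation: the stub below computes plain Adler-32, i.e. two running
    // sums s1 and s2 modulo BASE = 65521, returned packed as (s2 << 16) | s1.
    // A minimal scalar sketch of the update being vectorized (illustrative only,
    // not compiled here; zlib's adler32() is the normative reference):
    //
    //   // uint32_t adler32_update(uint32_t adler, const uint8_t* p, size_t n) {
    //   //   uint32_t s1 = adler & 0xffff, s2 = adler >> 16;
    //   //   for (size_t i = 0; i < n; i++) {
    //   //     s1 = (s1 + p[i]) % 65521;  // the stub defers the mod via NMAX batching
    //   //     s2 = (s2 + s1) % 65521;
    //   //   }
    //   //   return (s2 << 16) | s1;
    //   // }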
4084 address start = __ pc(); 4085 4086 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4087 4088 // Aliases 4089 Register adler = c_rarg0; 4090 Register s1 = c_rarg0; 4091 Register s2 = c_rarg3; 4092 Register buff = c_rarg1; 4093 Register len = c_rarg2; 4094 Register nmax = r4; 4095 Register base = r5; 4096 Register count = r6; 4097 Register temp0 = rscratch1; 4098 Register temp1 = rscratch2; 4099 FloatRegister vbytes = v0; 4100 FloatRegister vs1acc = v1; 4101 FloatRegister vs2acc = v2; 4102 FloatRegister vtable = v3; 4103 4104 // Max number of bytes we can process before having to take the mod 4105 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4106 uint64_t BASE = 0xfff1; 4107 uint64_t NMAX = 0x15B0; 4108 4109 __ mov(base, BASE); 4110 __ mov(nmax, NMAX); 4111 4112 // Load accumulation coefficients for the upper 16 bits 4113 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4114 __ ld1(vtable, __ T16B, Address(temp0)); 4115 4116 // s1 is initialized to the lower 16 bits of adler 4117 // s2 is initialized to the upper 16 bits of adler 4118 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4119 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4120 4121 // The pipelined loop needs at least 16 elements for 1 iteration 4122 // It does check this, but it is more effective to skip to the cleanup loop 4123 __ cmp(len, (u1)16); 4124 __ br(Assembler::HS, L_nmax); 4125 __ cbz(len, L_combine); 4126 4127 __ bind(L_simple_by1_loop); 4128 __ ldrb(temp0, Address(__ post(buff, 1))); 4129 __ add(s1, s1, temp0); 4130 __ add(s2, s2, s1); 4131 __ subs(len, len, 1); 4132 __ br(Assembler::HI, L_simple_by1_loop); 4133 4134 // s1 = s1 % BASE 4135 __ subs(temp0, s1, base); 4136 __ csel(s1, temp0, s1, Assembler::HS); 4137 4138 // s2 = s2 % BASE 4139 __ lsr(temp0, s2, 16); 4140 __ lsl(temp1, temp0, 4); 4141 __ sub(temp1, temp1, temp0); 4142 __ add(s2, temp1, s2, ext::uxth); 4143 4144 __ subs(temp0, s2, base); 4145 __ csel(s2, temp0, s2, Assembler::HS); 4146 4147 __ b(L_combine); 4148 4149 __ bind(L_nmax); 4150 __ subs(len, len, nmax); 4151 __ sub(count, nmax, 16); 4152 __ br(Assembler::LO, L_by16); 4153 4154 __ bind(L_nmax_loop); 4155 4156 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4157 vbytes, vs1acc, vs2acc, vtable); 4158 4159 __ subs(count, count, 16); 4160 __ br(Assembler::HS, L_nmax_loop); 4161 4162 // s1 = s1 % BASE 4163 __ lsr(temp0, s1, 16); 4164 __ lsl(temp1, temp0, 4); 4165 __ sub(temp1, temp1, temp0); 4166 __ add(temp1, temp1, s1, ext::uxth); 4167 4168 __ lsr(temp0, temp1, 16); 4169 __ lsl(s1, temp0, 4); 4170 __ sub(s1, s1, temp0); 4171 __ add(s1, s1, temp1, ext:: uxth); 4172 4173 __ subs(temp0, s1, base); 4174 __ csel(s1, temp0, s1, Assembler::HS); 4175 4176 // s2 = s2 % BASE 4177 __ lsr(temp0, s2, 16); 4178 __ lsl(temp1, temp0, 4); 4179 __ sub(temp1, temp1, temp0); 4180 __ add(temp1, temp1, s2, ext::uxth); 4181 4182 __ lsr(temp0, temp1, 16); 4183 __ lsl(s2, temp0, 4); 4184 __ sub(s2, s2, temp0); 4185 __ add(s2, s2, temp1, ext:: uxth); 4186 4187 __ subs(temp0, s2, base); 4188 __ csel(s2, temp0, s2, Assembler::HS); 4189 4190 __ subs(len, len, nmax); 4191 __ sub(count, nmax, 16); 4192 __ br(Assembler::HS, L_nmax_loop); 4193 4194 __ bind(L_by16); 4195 __ adds(len, len, count); 4196 __ br(Assembler::LO, L_by1); 4197 4198 __ bind(L_by16_loop); 4199 4200 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4201 vbytes, vs1acc, vs2acc, vtable); 
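    // The "% BASE" sequences above (and in L_do_mod below) avoid a division by
    // exploiting 2^16 mod 65521 == 15: each lsr/lsl/sub/add group computes
    // (x & 0xffff) + 15 * (x >> 16); it is applied twice where the sums may have
    // grown large, and the subs/csel pair performs the final conditional subtract.
    // Rough scalar equivalent (illustrative only):
    //
    //   // x = (x & 0xffff) + 15 * (x >> 16);  // first fold
    //   // x = (x & 0xffff) + 15 * (x >> 16);  // second fold
    //   // if (x >= 65521) x -= 65521;         // x is now the original value mod BASE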
4202
4203     __ subs(len, len, 16);
4204     __ br(Assembler::HS, L_by16_loop);
4205
4206     __ bind(L_by1);
4207     __ adds(len, len, 15);
4208     __ br(Assembler::LO, L_do_mod);
4209
4210     __ bind(L_by1_loop);
4211     __ ldrb(temp0, Address(__ post(buff, 1)));
4212     __ add(s1, temp0, s1);
4213     __ add(s2, s2, s1);
4214     __ subs(len, len, 1);
4215     __ br(Assembler::HS, L_by1_loop);
4216
4217     __ bind(L_do_mod);
4218     // s1 = s1 % BASE
4219     __ lsr(temp0, s1, 16);
4220     __ lsl(temp1, temp0, 4);
4221     __ sub(temp1, temp1, temp0);
4222     __ add(temp1, temp1, s1, ext::uxth);
4223
4224     __ lsr(temp0, temp1, 16);
4225     __ lsl(s1, temp0, 4);
4226     __ sub(s1, s1, temp0);
4227     __ add(s1, s1, temp1, ext::uxth);
4228
4229     __ subs(temp0, s1, base);
4230     __ csel(s1, temp0, s1, Assembler::HS);
4231
4232     // s2 = s2 % BASE
4233     __ lsr(temp0, s2, 16);
4234     __ lsl(temp1, temp0, 4);
4235     __ sub(temp1, temp1, temp0);
4236     __ add(temp1, temp1, s2, ext::uxth);
4237
4238     __ lsr(temp0, temp1, 16);
4239     __ lsl(s2, temp0, 4);
4240     __ sub(s2, s2, temp0);
4241     __ add(s2, s2, temp1, ext::uxth);
4242
4243     __ subs(temp0, s2, base);
4244     __ csel(s2, temp0, s2, Assembler::HS);
4245
4246     // Combine lower bits and higher bits
4247     __ bind(L_combine);
4248     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4249
4250     __ ret(lr);
4251
4252     return start;
4253   }
4254
4255   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4256           Register temp0, Register temp1, FloatRegister vbytes,
4257           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4258     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4259     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4260     // In non-vectorized code, we update s1 and s2 as:
4261     //   s1 <- s1 + b1
4262     //   s2 <- s2 + s1
4263     //   s1 <- s1 + b2
4264     //   s2 <- s2 + s1
4265     //   ...
4266     //   s1 <- s1 + b16
4267     //   s2 <- s2 + s1
4268     // Putting above assignments together, we have:
4269     //   s1_new = s1 + b1 + b2 + ... + b16
4270     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4271     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4272     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
4273     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4274
4275     // s2 = s2 + s1 * 16
4276     __ add(s2, s2, s1, Assembler::LSL, 4);
4277
4278     // vs1acc = b1 + b2 + b3 + ... + b16
4279     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ...
+ (b16 * 1) 4280 __ umullv(vs2acc, __ T8B, vtable, vbytes); 4281 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 4282 __ uaddlv(vs1acc, __ T16B, vbytes); 4283 __ uaddlv(vs2acc, __ T8H, vs2acc); 4284 4285 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 4286 __ fmovd(temp0, vs1acc); 4287 __ fmovd(temp1, vs2acc); 4288 __ add(s1, s1, temp0); 4289 __ add(s2, s2, temp1); 4290 } 4291 4292 /** 4293 * Arguments: 4294 * 4295 * Input: 4296 * c_rarg0 - x address 4297 * c_rarg1 - x length 4298 * c_rarg2 - y address 4299 * c_rarg3 - y length 4300 * c_rarg4 - z address 4301 * c_rarg5 - z length 4302 */ 4303 address generate_multiplyToLen() { 4304 __ align(CodeEntryAlignment); 4305 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 4306 4307 address start = __ pc(); 4308 const Register x = r0; 4309 const Register xlen = r1; 4310 const Register y = r2; 4311 const Register ylen = r3; 4312 const Register z = r4; 4313 const Register zlen = r5; 4314 4315 const Register tmp1 = r10; 4316 const Register tmp2 = r11; 4317 const Register tmp3 = r12; 4318 const Register tmp4 = r13; 4319 const Register tmp5 = r14; 4320 const Register tmp6 = r15; 4321 const Register tmp7 = r16; 4322 4323 BLOCK_COMMENT("Entry:"); 4324 __ enter(); // required for proper stackwalking of RuntimeStub frame 4325 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4326 __ leave(); // required for proper stackwalking of RuntimeStub frame 4327 __ ret(lr); 4328 4329 return start; 4330 } 4331 4332 address generate_squareToLen() { 4333 // squareToLen algorithm for sizes 1..127 described in java code works 4334 // faster than multiply_to_len on some CPUs and slower on others, but 4335 // multiply_to_len shows a bit better overall results 4336 __ align(CodeEntryAlignment); 4337 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 4338 address start = __ pc(); 4339 4340 const Register x = r0; 4341 const Register xlen = r1; 4342 const Register z = r2; 4343 const Register zlen = r3; 4344 const Register y = r4; // == x 4345 const Register ylen = r5; // == xlen 4346 4347 const Register tmp1 = r10; 4348 const Register tmp2 = r11; 4349 const Register tmp3 = r12; 4350 const Register tmp4 = r13; 4351 const Register tmp5 = r14; 4352 const Register tmp6 = r15; 4353 const Register tmp7 = r16; 4354 4355 RegSet spilled_regs = RegSet::of(y, ylen); 4356 BLOCK_COMMENT("Entry:"); 4357 __ enter(); 4358 __ push(spilled_regs, sp); 4359 __ mov(y, x); 4360 __ mov(ylen, xlen); 4361 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4362 __ pop(spilled_regs, sp); 4363 __ leave(); 4364 __ ret(lr); 4365 return start; 4366 } 4367 4368 address generate_mulAdd() { 4369 __ align(CodeEntryAlignment); 4370 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 4371 4372 address start = __ pc(); 4373 4374 const Register out = r0; 4375 const Register in = r1; 4376 const Register offset = r2; 4377 const Register len = r3; 4378 const Register k = r4; 4379 4380 BLOCK_COMMENT("Entry:"); 4381 __ enter(); 4382 __ mul_add(out, in, offset, len, k); 4383 __ leave(); 4384 __ ret(lr); 4385 4386 return start; 4387 } 4388 4389 // Arguments: 4390 // 4391 // Input: 4392 // c_rarg0 - newArr address 4393 // c_rarg1 - oldArr address 4394 // c_rarg2 - newIdx 4395 // c_rarg3 - shiftCount 4396 // c_rarg4 - numIter 4397 // 4398 address generate_bigIntegerRightShift() { 4399 __ align(CodeEntryAlignment); 4400 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 4401 address start = __ pc(); 4402 4403 Label ShiftSIMDLoop, 
ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4404 4405 Register newArr = c_rarg0; 4406 Register oldArr = c_rarg1; 4407 Register newIdx = c_rarg2; 4408 Register shiftCount = c_rarg3; 4409 Register numIter = c_rarg4; 4410 Register idx = numIter; 4411 4412 Register newArrCur = rscratch1; 4413 Register shiftRevCount = rscratch2; 4414 Register oldArrCur = r13; 4415 Register oldArrNext = r14; 4416 4417 FloatRegister oldElem0 = v0; 4418 FloatRegister oldElem1 = v1; 4419 FloatRegister newElem = v2; 4420 FloatRegister shiftVCount = v3; 4421 FloatRegister shiftVRevCount = v4; 4422 4423 __ cbz(idx, Exit); 4424 4425 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4426 4427 // left shift count 4428 __ movw(shiftRevCount, 32); 4429 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4430 4431 // numIter too small to allow a 4-words SIMD loop, rolling back 4432 __ cmp(numIter, (u1)4); 4433 __ br(Assembler::LT, ShiftThree); 4434 4435 __ dup(shiftVCount, __ T4S, shiftCount); 4436 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4437 __ negr(shiftVCount, __ T4S, shiftVCount); 4438 4439 __ BIND(ShiftSIMDLoop); 4440 4441 // Calculate the load addresses 4442 __ sub(idx, idx, 4); 4443 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4444 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4445 __ add(oldArrCur, oldArrNext, 4); 4446 4447 // Load 4 words and process 4448 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4449 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4450 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4451 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4452 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4453 __ st1(newElem, __ T4S, Address(newArrCur)); 4454 4455 __ cmp(idx, (u1)4); 4456 __ br(Assembler::LT, ShiftTwoLoop); 4457 __ b(ShiftSIMDLoop); 4458 4459 __ BIND(ShiftTwoLoop); 4460 __ cbz(idx, Exit); 4461 __ cmp(idx, (u1)1); 4462 __ br(Assembler::EQ, ShiftOne); 4463 4464 // Calculate the load addresses 4465 __ sub(idx, idx, 2); 4466 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4467 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4468 __ add(oldArrCur, oldArrNext, 4); 4469 4470 // Load 2 words and process 4471 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 4472 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4473 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4474 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4475 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4476 __ st1(newElem, __ T2S, Address(newArrCur)); 4477 __ b(ShiftTwoLoop); 4478 4479 __ BIND(ShiftThree); 4480 __ tbz(idx, 1, ShiftOne); 4481 __ tbz(idx, 0, ShiftTwo); 4482 __ ldrw(r10, Address(oldArr, 12)); 4483 __ ldrw(r11, Address(oldArr, 8)); 4484 __ lsrvw(r10, r10, shiftCount); 4485 __ lslvw(r11, r11, shiftRevCount); 4486 __ orrw(r12, r10, r11); 4487 __ strw(r12, Address(newArr, 8)); 4488 4489 __ BIND(ShiftTwo); 4490 __ ldrw(r10, Address(oldArr, 8)); 4491 __ ldrw(r11, Address(oldArr, 4)); 4492 __ lsrvw(r10, r10, shiftCount); 4493 __ lslvw(r11, r11, shiftRevCount); 4494 __ orrw(r12, r10, r11); 4495 __ strw(r12, Address(newArr, 4)); 4496 4497 __ BIND(ShiftOne); 4498 __ ldrw(r10, Address(oldArr, 4)); 4499 __ ldrw(r11, Address(oldArr)); 4500 __ lsrvw(r10, r10, shiftCount); 4501 __ lslvw(r11, r11, shiftRevCount); 4502 __ orrw(r12, r10, r11); 4503 __ strw(r12, Address(newArr)); 4504 4505 __ BIND(Exit); 4506 __ ret(lr); 4507 4508 return start; 4509 } 4510 4511 // Arguments: 4512 // 4513 // Input: 4514 // c_rarg0 - newArr address 4515 // c_rarg1 - oldArr address 4516 // c_rarg2 - newIdx 4517 // c_rarg3 - 
shiftCount 4518 // c_rarg4 - numIter 4519 // 4520 address generate_bigIntegerLeftShift() { 4521 __ align(CodeEntryAlignment); 4522 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 4523 address start = __ pc(); 4524 4525 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4526 4527 Register newArr = c_rarg0; 4528 Register oldArr = c_rarg1; 4529 Register newIdx = c_rarg2; 4530 Register shiftCount = c_rarg3; 4531 Register numIter = c_rarg4; 4532 4533 Register shiftRevCount = rscratch1; 4534 Register oldArrNext = rscratch2; 4535 4536 FloatRegister oldElem0 = v0; 4537 FloatRegister oldElem1 = v1; 4538 FloatRegister newElem = v2; 4539 FloatRegister shiftVCount = v3; 4540 FloatRegister shiftVRevCount = v4; 4541 4542 __ cbz(numIter, Exit); 4543 4544 __ add(oldArrNext, oldArr, 4); 4545 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4546 4547 // right shift count 4548 __ movw(shiftRevCount, 32); 4549 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4550 4551 // numIter too small to allow a 4-words SIMD loop, rolling back 4552 __ cmp(numIter, (u1)4); 4553 __ br(Assembler::LT, ShiftThree); 4554 4555 __ dup(shiftVCount, __ T4S, shiftCount); 4556 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4557 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 4558 4559 __ BIND(ShiftSIMDLoop); 4560 4561 // load 4 words and process 4562 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 4563 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 4564 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4565 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4566 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4567 __ st1(newElem, __ T4S, __ post(newArr, 16)); 4568 __ sub(numIter, numIter, 4); 4569 4570 __ cmp(numIter, (u1)4); 4571 __ br(Assembler::LT, ShiftTwoLoop); 4572 __ b(ShiftSIMDLoop); 4573 4574 __ BIND(ShiftTwoLoop); 4575 __ cbz(numIter, Exit); 4576 __ cmp(numIter, (u1)1); 4577 __ br(Assembler::EQ, ShiftOne); 4578 4579 // load 2 words and process 4580 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 4581 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 4582 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4583 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4584 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4585 __ st1(newElem, __ T2S, __ post(newArr, 8)); 4586 __ sub(numIter, numIter, 2); 4587 __ b(ShiftTwoLoop); 4588 4589 __ BIND(ShiftThree); 4590 __ ldrw(r10, __ post(oldArr, 4)); 4591 __ ldrw(r11, __ post(oldArrNext, 4)); 4592 __ lslvw(r10, r10, shiftCount); 4593 __ lsrvw(r11, r11, shiftRevCount); 4594 __ orrw(r12, r10, r11); 4595 __ strw(r12, __ post(newArr, 4)); 4596 __ tbz(numIter, 1, Exit); 4597 __ tbz(numIter, 0, ShiftOne); 4598 4599 __ BIND(ShiftTwo); 4600 __ ldrw(r10, __ post(oldArr, 4)); 4601 __ ldrw(r11, __ post(oldArrNext, 4)); 4602 __ lslvw(r10, r10, shiftCount); 4603 __ lsrvw(r11, r11, shiftRevCount); 4604 __ orrw(r12, r10, r11); 4605 __ strw(r12, __ post(newArr, 4)); 4606 4607 __ BIND(ShiftOne); 4608 __ ldrw(r10, Address(oldArr)); 4609 __ ldrw(r11, Address(oldArrNext)); 4610 __ lslvw(r10, r10, shiftCount); 4611 __ lsrvw(r11, r11, shiftRevCount); 4612 __ orrw(r12, r10, r11); 4613 __ strw(r12, Address(newArr)); 4614 4615 __ BIND(Exit); 4616 __ ret(lr); 4617 4618 return start; 4619 } 4620 4621 address generate_count_positives(address &count_positives_long) { 4622 const u1 large_loop_size = 64; 4623 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 4624 int dcache_line = VM_Version::dcache_line_size(); 4625 4626 Register ary1 = r1, len = r2, result = r0; 4627 
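    // Scalar shape of what the count_positives stub computes (illustrative sketch;
    // the generated code is allowed to return a smaller, conservative count when a
    // negative byte is found inside a 16- or 64-byte chunk, rather than its exact
    // index):
    //
    //   // int count_positives(jbyte* ary, int len) {
    //   //   int i = 0;
    //   //   while (i < len && ary[i] >= 0) i++;   // stop at first sign-bit byte
    //   //   return i;                             // == len if no byte is negative
    //   // }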
4628 __ align(CodeEntryAlignment); 4629 4630 StubCodeMark mark(this, "StubRoutines", "count_positives"); 4631 4632 address entry = __ pc(); 4633 4634 __ enter(); 4635 // precondition: a copy of len is already in result 4636 // __ mov(result, len); 4637 4638 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 4639 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 4640 4641 __ cmp(len, (u1)15); 4642 __ br(Assembler::GT, LEN_OVER_15); 4643 // The only case when execution falls into this code is when pointer is near 4644 // the end of memory page and we have to avoid reading next page 4645 __ add(ary1, ary1, len); 4646 __ subs(len, len, 8); 4647 __ br(Assembler::GT, LEN_OVER_8); 4648 __ ldr(rscratch2, Address(ary1, -8)); 4649 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 4650 __ lsrv(rscratch2, rscratch2, rscratch1); 4651 __ tst(rscratch2, UPPER_BIT_MASK); 4652 __ csel(result, zr, result, Assembler::NE); 4653 __ leave(); 4654 __ ret(lr); 4655 __ bind(LEN_OVER_8); 4656 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 4657 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 4658 __ tst(rscratch2, UPPER_BIT_MASK); 4659 __ br(Assembler::NE, RET_NO_POP); 4660 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 4661 __ lsrv(rscratch1, rscratch1, rscratch2); 4662 __ tst(rscratch1, UPPER_BIT_MASK); 4663 __ bind(RET_NO_POP); 4664 __ csel(result, zr, result, Assembler::NE); 4665 __ leave(); 4666 __ ret(lr); 4667 4668 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 4669 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 4670 4671 count_positives_long = __ pc(); // 2nd entry point 4672 4673 __ enter(); 4674 4675 __ bind(LEN_OVER_15); 4676 __ push(spilled_regs, sp); 4677 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 4678 __ cbz(rscratch2, ALIGNED); 4679 __ ldp(tmp6, tmp1, Address(ary1)); 4680 __ mov(tmp5, 16); 4681 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 4682 __ add(ary1, ary1, rscratch1); 4683 __ orr(tmp6, tmp6, tmp1); 4684 __ tst(tmp6, UPPER_BIT_MASK); 4685 __ br(Assembler::NE, RET_ADJUST); 4686 __ sub(len, len, rscratch1); 4687 4688 __ bind(ALIGNED); 4689 __ cmp(len, large_loop_size); 4690 __ br(Assembler::LT, CHECK_16); 4691 // Perform 16-byte load as early return in pre-loop to handle situation 4692 // when initially aligned large array has negative values at starting bytes, 4693 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 4694 // slower. Cases with negative bytes further ahead won't be affected that 4695 // much. In fact, it'll be faster due to early loads, less instructions and 4696 // less branches in LARGE_LOOP. 
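    // The tst(..., UPPER_BIT_MASK) checks used throughout this stub rely on a SWAR
    // observation: with UPPER_BIT_MASK == 0x8080808080808080, a 64-bit word ANDed
    // with the mask is nonzero iff at least one of its eight bytes has the sign
    // bit set, i.e. iff some byte is negative. Illustrative one-liner (not
    // compiled here):
    //
    //   // bool has_negative_byte(uint64_t w) { return (w & 0x8080808080808080ULL) != 0; }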
4697 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 4698 __ sub(len, len, 16); 4699 __ orr(tmp6, tmp6, tmp1); 4700 __ tst(tmp6, UPPER_BIT_MASK); 4701 __ br(Assembler::NE, RET_ADJUST_16); 4702 __ cmp(len, large_loop_size); 4703 __ br(Assembler::LT, CHECK_16); 4704 4705 if (SoftwarePrefetchHintDistance >= 0 4706 && SoftwarePrefetchHintDistance >= dcache_line) { 4707 // initial prefetch 4708 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 4709 } 4710 __ bind(LARGE_LOOP); 4711 if (SoftwarePrefetchHintDistance >= 0) { 4712 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 4713 } 4714 // Issue load instructions first, since it can save few CPU/MEM cycles, also 4715 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 4716 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 4717 // instructions per cycle and have less branches, but this approach disables 4718 // early return, thus, all 64 bytes are loaded and checked every time. 4719 __ ldp(tmp2, tmp3, Address(ary1)); 4720 __ ldp(tmp4, tmp5, Address(ary1, 16)); 4721 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 4722 __ ldp(tmp6, tmp1, Address(ary1, 48)); 4723 __ add(ary1, ary1, large_loop_size); 4724 __ sub(len, len, large_loop_size); 4725 __ orr(tmp2, tmp2, tmp3); 4726 __ orr(tmp4, tmp4, tmp5); 4727 __ orr(rscratch1, rscratch1, rscratch2); 4728 __ orr(tmp6, tmp6, tmp1); 4729 __ orr(tmp2, tmp2, tmp4); 4730 __ orr(rscratch1, rscratch1, tmp6); 4731 __ orr(tmp2, tmp2, rscratch1); 4732 __ tst(tmp2, UPPER_BIT_MASK); 4733 __ br(Assembler::NE, RET_ADJUST_LONG); 4734 __ cmp(len, large_loop_size); 4735 __ br(Assembler::GE, LARGE_LOOP); 4736 4737 __ bind(CHECK_16); // small 16-byte load pre-loop 4738 __ cmp(len, (u1)16); 4739 __ br(Assembler::LT, POST_LOOP16); 4740 4741 __ bind(LOOP16); // small 16-byte load loop 4742 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 4743 __ sub(len, len, 16); 4744 __ orr(tmp2, tmp2, tmp3); 4745 __ tst(tmp2, UPPER_BIT_MASK); 4746 __ br(Assembler::NE, RET_ADJUST_16); 4747 __ cmp(len, (u1)16); 4748 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 4749 4750 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 4751 __ cmp(len, (u1)8); 4752 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 4753 __ ldr(tmp3, Address(__ post(ary1, 8))); 4754 __ tst(tmp3, UPPER_BIT_MASK); 4755 __ br(Assembler::NE, RET_ADJUST); 4756 __ sub(len, len, 8); 4757 4758 __ bind(POST_LOOP16_LOAD_TAIL); 4759 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 4760 __ ldr(tmp1, Address(ary1)); 4761 __ mov(tmp2, 64); 4762 __ sub(tmp4, tmp2, len, __ LSL, 3); 4763 __ lslv(tmp1, tmp1, tmp4); 4764 __ tst(tmp1, UPPER_BIT_MASK); 4765 __ br(Assembler::NE, RET_ADJUST); 4766 // Fallthrough 4767 4768 __ bind(RET_LEN); 4769 __ pop(spilled_regs, sp); 4770 __ leave(); 4771 __ ret(lr); 4772 4773 // difference result - len is the count of guaranteed to be 4774 // positive bytes 4775 4776 __ bind(RET_ADJUST_LONG); 4777 __ add(len, len, (u1)(large_loop_size - 16)); 4778 __ bind(RET_ADJUST_16); 4779 __ add(len, len, 16); 4780 __ bind(RET_ADJUST); 4781 __ pop(spilled_regs, sp); 4782 __ leave(); 4783 __ sub(result, result, len); 4784 __ ret(lr); 4785 4786 return entry; 4787 } 4788 4789 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 4790 bool usePrefetch, Label &NOT_EQUAL) { 4791 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 4792 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 4793 tmp7 = r12, tmp8 = r13; 4794 Label LOOP; 4795 
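    // The unrolled loop below checks 64 bytes per iteration; for each pair of
    // 64-bit words it xors the operands and or-combines the results so that one
    // cbnz per two-word group detects any mismatch. Per two words this is
    // equivalent to (illustrative scalar form only):
    //
    //   // if (((a[i] ^ b[i]) | (a[i + 1] ^ b[i + 1])) != 0) goto NOT_EQUAL;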
4796 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 4797 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 4798 __ bind(LOOP); 4799 if (usePrefetch) { 4800 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 4801 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 4802 } 4803 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 4804 __ eor(tmp1, tmp1, tmp2); 4805 __ eor(tmp3, tmp3, tmp4); 4806 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 4807 __ orr(tmp1, tmp1, tmp3); 4808 __ cbnz(tmp1, NOT_EQUAL); 4809 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 4810 __ eor(tmp5, tmp5, tmp6); 4811 __ eor(tmp7, tmp7, tmp8); 4812 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 4813 __ orr(tmp5, tmp5, tmp7); 4814 __ cbnz(tmp5, NOT_EQUAL); 4815 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 4816 __ eor(tmp1, tmp1, tmp2); 4817 __ eor(tmp3, tmp3, tmp4); 4818 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 4819 __ orr(tmp1, tmp1, tmp3); 4820 __ cbnz(tmp1, NOT_EQUAL); 4821 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 4822 __ eor(tmp5, tmp5, tmp6); 4823 __ sub(cnt1, cnt1, 8 * wordSize); 4824 __ eor(tmp7, tmp7, tmp8); 4825 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 4826 // tmp6 is not used. MacroAssembler::subs is used here (rather than 4827 // cmp) because subs allows an unlimited range of immediate operand. 4828 __ subs(tmp6, cnt1, loopThreshold); 4829 __ orr(tmp5, tmp5, tmp7); 4830 __ cbnz(tmp5, NOT_EQUAL); 4831 __ br(__ GE, LOOP); 4832 // post-loop 4833 __ eor(tmp1, tmp1, tmp2); 4834 __ eor(tmp3, tmp3, tmp4); 4835 __ orr(tmp1, tmp1, tmp3); 4836 __ sub(cnt1, cnt1, 2 * wordSize); 4837 __ cbnz(tmp1, NOT_EQUAL); 4838 } 4839 4840 void generate_large_array_equals_loop_simd(int loopThreshold, 4841 bool usePrefetch, Label &NOT_EQUAL) { 4842 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 4843 tmp2 = rscratch2; 4844 Label LOOP; 4845 4846 __ bind(LOOP); 4847 if (usePrefetch) { 4848 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 4849 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 4850 } 4851 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 4852 __ sub(cnt1, cnt1, 8 * wordSize); 4853 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 4854 __ subs(tmp1, cnt1, loopThreshold); 4855 __ eor(v0, __ T16B, v0, v4); 4856 __ eor(v1, __ T16B, v1, v5); 4857 __ eor(v2, __ T16B, v2, v6); 4858 __ eor(v3, __ T16B, v3, v7); 4859 __ orr(v0, __ T16B, v0, v1); 4860 __ orr(v1, __ T16B, v2, v3); 4861 __ orr(v0, __ T16B, v0, v1); 4862 __ umov(tmp1, v0, __ D, 0); 4863 __ umov(tmp2, v0, __ D, 1); 4864 __ orr(tmp1, tmp1, tmp2); 4865 __ cbnz(tmp1, NOT_EQUAL); 4866 __ br(__ GE, LOOP); 4867 } 4868 4869 // a1 = r1 - array1 address 4870 // a2 = r2 - array2 address 4871 // result = r0 - return value. Already contains "false" 4872 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 4873 // r3-r5 are reserved temporary registers 4874 address generate_large_array_equals() { 4875 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 4876 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 4877 tmp7 = r12, tmp8 = r13; 4878 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 4879 SMALL_LOOP, POST_LOOP; 4880 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 4881 // calculate if at least 32 prefetched bytes are used 4882 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 4883 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 4884 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 4885 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 4886 tmp5, tmp6, tmp7, tmp8); 4887 4888 __ align(CodeEntryAlignment); 4889 4890 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 4891 4892 address entry = __ pc(); 4893 __ enter(); 4894 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 4895 // also advance pointers to use post-increment instead of pre-increment 4896 __ add(a1, a1, wordSize); 4897 __ add(a2, a2, wordSize); 4898 if (AvoidUnalignedAccesses) { 4899 // both implementations (SIMD/nonSIMD) are using relatively large load 4900 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 4901 // on some CPUs in case of address is not at least 16-byte aligned. 4902 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 4903 // load if needed at least for 1st address and make if 16-byte aligned. 4904 Label ALIGNED16; 4905 __ tbz(a1, 3, ALIGNED16); 4906 __ ldr(tmp1, Address(__ post(a1, wordSize))); 4907 __ ldr(tmp2, Address(__ post(a2, wordSize))); 4908 __ sub(cnt1, cnt1, wordSize); 4909 __ eor(tmp1, tmp1, tmp2); 4910 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 4911 __ bind(ALIGNED16); 4912 } 4913 if (UseSIMDForArrayEquals) { 4914 if (SoftwarePrefetchHintDistance >= 0) { 4915 __ subs(tmp1, cnt1, prefetchLoopThreshold); 4916 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 4917 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 4918 /* prfm = */ true, NOT_EQUAL); 4919 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 4920 __ br(__ LT, TAIL); 4921 } 4922 __ bind(NO_PREFETCH_LARGE_LOOP); 4923 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 4924 /* prfm = */ false, NOT_EQUAL); 4925 } else { 4926 __ push(spilled_regs, sp); 4927 if (SoftwarePrefetchHintDistance >= 0) { 4928 __ subs(tmp1, cnt1, prefetchLoopThreshold); 4929 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 4930 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 4931 /* prfm = */ true, NOT_EQUAL); 4932 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 4933 __ br(__ LT, TAIL); 4934 } 4935 __ bind(NO_PREFETCH_LARGE_LOOP); 4936 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 4937 /* prfm = */ false, NOT_EQUAL); 4938 } 4939 __ bind(TAIL); 4940 __ cbz(cnt1, EQUAL); 4941 __ subs(cnt1, cnt1, wordSize); 4942 __ br(__ LE, POST_LOOP); 4943 __ bind(SMALL_LOOP); 4944 __ ldr(tmp1, Address(__ post(a1, wordSize))); 4945 __ ldr(tmp2, Address(__ post(a2, wordSize))); 4946 __ subs(cnt1, cnt1, wordSize); 4947 __ eor(tmp1, tmp1, tmp2); 4948 __ cbnz(tmp1, NOT_EQUAL); 4949 __ br(__ GT, SMALL_LOOP); 4950 __ bind(POST_LOOP); 4951 __ ldr(tmp1, Address(a1, cnt1)); 4952 __ ldr(tmp2, Address(a2, cnt1)); 4953 __ eor(tmp1, tmp1, tmp2); 4954 __ cbnz(tmp1, NOT_EQUAL); 4955 __ bind(EQUAL); 4956 __ mov(result, true); 4957 __ bind(NOT_EQUAL); 4958 if (!UseSIMDForArrayEquals) { 4959 __ pop(spilled_regs, sp); 4960 } 4961 __ bind(NOT_EQUAL_NO_POP); 4962 __ leave(); 4963 __ ret(lr); 4964 return entry; 4965 } 4966 4967 address generate_dsin_dcos(bool isCos) { 4968 __ align(CodeEntryAlignment); 4969 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 4970 address start = __ pc(); 4971 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 4972 (address)StubRoutines::aarch64::_two_over_pi, 4973 (address)StubRoutines::aarch64::_pio2, 4974 (address)StubRoutines::aarch64::_dsin_coef, 4975 (address)StubRoutines::aarch64::_dcos_coef); 4976 return start; 4977 } 4978 4979 address generate_dlog() { 4980 __ align(CodeEntryAlignment); 4981 StubCodeMark mark(this, "StubRoutines", "dlog"); 4982 address entry = __ pc(); 4983 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 4984 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 4985 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 4986 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 4987 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 4988 return entry; 4989 } 4990 4991 4992 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 4993 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4994 Label &DIFF2) { 4995 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 4996 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4997 4998 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4999 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5000 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5001 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5002 5003 __ fmovd(tmpL, vtmp3); 5004 __ eor(rscratch2, tmp3, tmpL); 5005 __ cbnz(rscratch2, DIFF2); 5006 5007 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5008 __ umov(tmpL, vtmp3, __ D, 1); 5009 __ eor(rscratch2, tmpU, tmpL); 5010 __ cbnz(rscratch2, DIFF1); 5011 5012 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5013 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5014 __ fmovd(tmpL, vtmp); 5015 __ eor(rscratch2, tmp3, tmpL); 5016 __ cbnz(rscratch2, DIFF2); 5017 5018 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5019 __ umov(tmpL, vtmp, __ D, 1); 5020 __ eor(rscratch2, tmpU, tmpL); 5021 __ cbnz(rscratch2, DIFF1); 5022 } 5023 5024 // r0 = result 5025 // r1 = str1 5026 // r2 = cnt1 5027 // r3 = str2 5028 // r4 = cnt2 5029 // r10 = tmp1 5030 // r11 = tmp2 5031 address generate_compare_long_string_different_encoding(bool isLU) { 5032 __ align(CodeEntryAlignment); 5033 StubCodeMark mark(this, "StubRoutines", isLU 5034 ? "compare_long_string_different_encoding LU" 5035 : "compare_long_string_different_encoding UL"); 5036 address entry = __ pc(); 5037 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5038 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5039 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5040 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5041 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5042 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5043 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5044 5045 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5046 5047 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5048 // cnt2 == amount of characters left to compare 5049 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5050 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5051 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5052 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5053 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5054 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5055 __ eor(rscratch2, tmp1, tmp2); 5056 __ mov(rscratch1, tmp2); 5057 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5058 Register tmpU = isLU ? 
rscratch1 : tmp1, // where to keep U for comparison 5059 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5060 __ push(spilled_regs, sp); 5061 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5062 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 5063 5064 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5065 5066 if (SoftwarePrefetchHintDistance >= 0) { 5067 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5068 __ br(__ LT, NO_PREFETCH); 5069 __ bind(LARGE_LOOP_PREFETCH); 5070 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 5071 __ mov(tmp4, 2); 5072 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5073 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 5074 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5075 __ subs(tmp4, tmp4, 1); 5076 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 5077 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5078 __ mov(tmp4, 2); 5079 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 5080 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5081 __ subs(tmp4, tmp4, 1); 5082 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 5083 __ sub(cnt2, cnt2, 64); 5084 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5085 __ br(__ GE, LARGE_LOOP_PREFETCH); 5086 } 5087 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 5088 __ bind(NO_PREFETCH); 5089 __ subs(cnt2, cnt2, 16); 5090 __ br(__ LT, TAIL); 5091 __ align(OptoLoopAlignment); 5092 __ bind(SMALL_LOOP); // smaller loop 5093 __ subs(cnt2, cnt2, 16); 5094 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5095 __ br(__ GE, SMALL_LOOP); 5096 __ cmn(cnt2, (u1)16); 5097 __ br(__ EQ, LOAD_LAST); 5098 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 5099 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 5100 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 5101 __ ldr(tmp3, Address(cnt1, -8)); 5102 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 5103 __ b(LOAD_LAST); 5104 __ bind(DIFF2); 5105 __ mov(tmpU, tmp3); 5106 __ bind(DIFF1); 5107 __ pop(spilled_regs, sp); 5108 __ b(CALCULATE_DIFFERENCE); 5109 __ bind(LOAD_LAST); 5110 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 5111 // No need to load it again 5112 __ mov(tmpU, tmp3); 5113 __ pop(spilled_regs, sp); 5114 5115 // tmp2 points to the address of the last 4 Latin1 characters right now 5116 __ ldrs(vtmp, Address(tmp2)); 5117 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5118 __ fmovd(tmpL, vtmp); 5119 5120 __ eor(rscratch2, tmpU, tmpL); 5121 __ cbz(rscratch2, DONE); 5122 5123 // Find the first different characters in the longwords and 5124 // compute their difference. 
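    // Roughly, the code below implements this scalar C model (an illustrative
    // sketch only, not generated code; x and y stand for the two mismatching
    // 64-bit chunks held in tmp1 and rscratch1, both already widened to UTF-16):
    //
    //   uint64_t diff  = x ^ y;                                    // non-zero here
    //   int      shift = __builtin_clzll(__builtin_bswap64(diff)) & ~15;
    //   return (int)(uint16_t)(x >> shift) - (int)(uint16_t)(y >> shift);
    //
    // rev+clz locate the lowest differing byte, and masking with -16 rounds that
    // bit position down to a 16-bit character boundary before the shifts.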
5125 __ bind(CALCULATE_DIFFERENCE); 5126 __ rev(rscratch2, rscratch2); 5127 __ clz(rscratch2, rscratch2); 5128 __ andr(rscratch2, rscratch2, -16); 5129 __ lsrv(tmp1, tmp1, rscratch2); 5130 __ uxthw(tmp1, tmp1); 5131 __ lsrv(rscratch1, rscratch1, rscratch2); 5132 __ uxthw(rscratch1, rscratch1); 5133 __ subw(result, tmp1, rscratch1); 5134 __ bind(DONE); 5135 __ ret(lr); 5136 return entry; 5137 } 5138 5139 address generate_method_entry_barrier() { 5140 __ align(CodeEntryAlignment); 5141 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 5142 5143 Label deoptimize_label; 5144 5145 address start = __ pc(); 5146 5147 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 5148 5149 __ enter(); 5150 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 5151 5152 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 5153 5154 __ push_call_clobbered_registers(); 5155 5156 __ mov(c_rarg0, rscratch2); 5157 __ call_VM_leaf 5158 (CAST_FROM_FN_PTR 5159 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 5160 5161 __ reset_last_Java_frame(true); 5162 5163 __ mov(rscratch1, r0); 5164 5165 __ pop_call_clobbered_registers(); 5166 5167 __ cbnz(rscratch1, deoptimize_label); 5168 5169 __ leave(); 5170 __ ret(lr); 5171 5172 __ BIND(deoptimize_label); 5173 5174 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 5175 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 5176 5177 __ mov(sp, rscratch1); 5178 __ br(rscratch2); 5179 5180 return start; 5181 } 5182 5183 // r0 = result 5184 // r1 = str1 5185 // r2 = cnt1 5186 // r3 = str2 5187 // r4 = cnt2 5188 // r10 = tmp1 5189 // r11 = tmp2 5190 address generate_compare_long_string_same_encoding(bool isLL) { 5191 __ align(CodeEntryAlignment); 5192 StubCodeMark mark(this, "StubRoutines", isLL 5193 ? "compare_long_string_same_encoding LL" 5194 : "compare_long_string_same_encoding UU"); 5195 address entry = __ pc(); 5196 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5197 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 5198 5199 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 5200 5201 // exit from large loop when less than 64 bytes left to read or we're about 5202 // to prefetch memory behind array border 5203 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 5204 5205 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 5206 __ eor(rscratch2, tmp1, tmp2); 5207 __ cbnz(rscratch2, CAL_DIFFERENCE); 5208 5209 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 5210 // update pointers, because of previous read 5211 __ add(str1, str1, wordSize); 5212 __ add(str2, str2, wordSize); 5213 if (SoftwarePrefetchHintDistance >= 0) { 5214 __ align(OptoLoopAlignment); 5215 __ bind(LARGE_LOOP_PREFETCH); 5216 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 5217 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 5218 5219 for (int i = 0; i < 4; i++) { 5220 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 5221 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 5222 __ cmp(tmp1, tmp2); 5223 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5224 __ br(Assembler::NE, DIFF); 5225 } 5226 __ sub(cnt2, cnt2, isLL ? 64 : 32); 5227 __ add(str1, str1, 64); 5228 __ add(str2, str2, 64); 5229 __ subs(rscratch2, cnt2, largeLoopExitCondition); 5230 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 5231 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 5232 } 5233 5234 __ subs(rscratch1, cnt2, isLL ? 
16 : 8); 5235 __ br(Assembler::LE, LESS16); 5236 __ align(OptoLoopAlignment); 5237 __ bind(LOOP_COMPARE16); 5238 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5239 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5240 __ cmp(tmp1, tmp2); 5241 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5242 __ br(Assembler::NE, DIFF); 5243 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5244 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5245 __ br(Assembler::LT, LESS16); 5246 5247 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5248 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5249 __ cmp(tmp1, tmp2); 5250 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5251 __ br(Assembler::NE, DIFF); 5252 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5253 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5254 __ br(Assembler::GE, LOOP_COMPARE16); 5255 __ cbz(cnt2, LENGTH_DIFF); 5256 5257 __ bind(LESS16); 5258 // each 8 compare 5259 __ subs(cnt2, cnt2, isLL ? 8 : 4); 5260 __ br(Assembler::LE, LESS8); 5261 __ ldr(tmp1, Address(__ post(str1, 8))); 5262 __ ldr(tmp2, Address(__ post(str2, 8))); 5263 __ eor(rscratch2, tmp1, tmp2); 5264 __ cbnz(rscratch2, CAL_DIFFERENCE); 5265 __ sub(cnt2, cnt2, isLL ? 8 : 4); 5266 5267 __ bind(LESS8); // directly load last 8 bytes 5268 if (!isLL) { 5269 __ add(cnt2, cnt2, cnt2); 5270 } 5271 __ ldr(tmp1, Address(str1, cnt2)); 5272 __ ldr(tmp2, Address(str2, cnt2)); 5273 __ eor(rscratch2, tmp1, tmp2); 5274 __ cbz(rscratch2, LENGTH_DIFF); 5275 __ b(CAL_DIFFERENCE); 5276 5277 __ bind(DIFF); 5278 __ cmp(tmp1, tmp2); 5279 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 5280 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 5281 // reuse rscratch2 register for the result of eor instruction 5282 __ eor(rscratch2, tmp1, tmp2); 5283 5284 __ bind(CAL_DIFFERENCE); 5285 __ rev(rscratch2, rscratch2); 5286 __ clz(rscratch2, rscratch2); 5287 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16);
5288     __ lsrv(tmp1, tmp1, rscratch2);
5289     __ lsrv(tmp2, tmp2, rscratch2);
5290     if (isLL) {
5291       __ uxtbw(tmp1, tmp1);
5292       __ uxtbw(tmp2, tmp2);
5293     } else {
5294       __ uxthw(tmp1, tmp1);
5295       __ uxthw(tmp2, tmp2);
5296     }
5297     __ subw(result, tmp1, tmp2);
5298
5299     __ bind(LENGTH_DIFF);
5300     __ ret(lr);
5301     return entry;
5302   }
5303
5304   void generate_compare_long_strings() {
5305     StubRoutines::aarch64::_compare_long_string_LL
5306         = generate_compare_long_string_same_encoding(true);
5307     StubRoutines::aarch64::_compare_long_string_UU
5308         = generate_compare_long_string_same_encoding(false);
5309     StubRoutines::aarch64::_compare_long_string_LU
5310         = generate_compare_long_string_different_encoding(true);
5311     StubRoutines::aarch64::_compare_long_string_UL
5312         = generate_compare_long_string_different_encoding(false);
5313   }
5314
5315   // R0 = result
5316   // R1 = str2
5317   // R2 = cnt1
5318   // R3 = str1
5319   // R4 = cnt2
5320   // This generic linear code uses a few additional ideas that make it faster:
5321   // 1) since the pattern length is >= 8, we can safely keep at least its 1st
5322   //    register live and skip reloading it (helps on systems with one load pipeline)
5323   // 2) we use the "fast" single-character search algorithm to look for the first
5324   //    symbol with fewer branches (one branch per loaded register instead of one
5325   //    branch per symbol); this is where constants like
5326   //    0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
5327   // 3) after loading and analyzing the 1st register of the source string, it can
5328   //    be used to search for every occurrence of the 1st character, saving a few
5329   //    loads compared with a simpler-but-slower implementation
5330   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
5331   //    re-initializes and compresses register values, which makes the code larger
5332   //    and a bit less readable; however, most of the extra operations are issued
5333   //    during loads or branches, so the penalty is minimal
5334   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5335     const char* stubName = str1_isL
5336         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5337         : "indexof_linear_uu";
5338     __ align(CodeEntryAlignment);
5339     StubCodeMark mark(this, "StubRoutines", stubName);
5340     address entry = __ pc();
5341
5342     int str1_chr_size = str1_isL ? 1 : 2;
5343     int str2_chr_size = str2_isL ? 1 : 2;
5344     int str1_chr_shift = str1_isL ? 0 : 1;
5345     int str2_chr_shift = str2_isL ? 0 : 1;
5346     bool isL = str1_isL && str2_isL;
5347     // parameters
5348     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5349     // temporary registers
5350     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5351     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5352     // redefinitions
5353     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5354
5355     __ push(spilled_regs, sp);
5356     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5357         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5358         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5359         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5360         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5361         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5362     // Read whole register from str1. It is safe, because length >=8 here
5363     __ ldr(ch1, Address(str1));
5364     // Read whole register from str2.
It is safe, because length >=8 here 5365 __ ldr(ch2, Address(str2)); 5366 __ sub(cnt2, cnt2, cnt1); 5367 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 5368 if (str1_isL != str2_isL) { 5369 __ eor(v0, __ T16B, v0, v0); 5370 } 5371 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 5372 __ mul(first, first, tmp1); 5373 // check if we have less than 1 register to check 5374 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 5375 if (str1_isL != str2_isL) { 5376 __ fmovd(v1, ch1); 5377 } 5378 __ br(__ LE, L_SMALL); 5379 __ eor(ch2, first, ch2); 5380 if (str1_isL != str2_isL) { 5381 __ zip1(v1, __ T16B, v1, v0); 5382 } 5383 __ sub(tmp2, ch2, tmp1); 5384 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5385 __ bics(tmp2, tmp2, ch2); 5386 if (str1_isL != str2_isL) { 5387 __ fmovd(ch1, v1); 5388 } 5389 __ br(__ NE, L_HAS_ZERO); 5390 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5391 __ add(result, result, wordSize/str2_chr_size); 5392 __ add(str2, str2, wordSize); 5393 __ br(__ LT, L_POST_LOOP); 5394 __ BIND(L_LOOP); 5395 __ ldr(ch2, Address(str2)); 5396 __ eor(ch2, first, ch2); 5397 __ sub(tmp2, ch2, tmp1); 5398 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5399 __ bics(tmp2, tmp2, ch2); 5400 __ br(__ NE, L_HAS_ZERO); 5401 __ BIND(L_LOOP_PROCEED); 5402 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5403 __ add(str2, str2, wordSize); 5404 __ add(result, result, wordSize/str2_chr_size); 5405 __ br(__ GE, L_LOOP); 5406 __ BIND(L_POST_LOOP); 5407 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 5408 __ br(__ LE, NOMATCH); 5409 __ ldr(ch2, Address(str2)); 5410 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5411 __ eor(ch2, first, ch2); 5412 __ sub(tmp2, ch2, tmp1); 5413 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5414 __ mov(tmp4, -1); // all bits set 5415 __ b(L_SMALL_PROCEED); 5416 __ align(OptoLoopAlignment); 5417 __ BIND(L_SMALL); 5418 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5419 __ eor(ch2, first, ch2); 5420 if (str1_isL != str2_isL) { 5421 __ zip1(v1, __ T16B, v1, v0); 5422 } 5423 __ sub(tmp2, ch2, tmp1); 5424 __ mov(tmp4, -1); // all bits set 5425 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5426 if (str1_isL != str2_isL) { 5427 __ fmovd(ch1, v1); // move converted 4 symbols 5428 } 5429 __ BIND(L_SMALL_PROCEED); 5430 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 5431 __ bic(tmp2, tmp2, ch2); 5432 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 5433 __ rbit(tmp2, tmp2); 5434 __ br(__ EQ, NOMATCH); 5435 __ BIND(L_SMALL_HAS_ZERO_LOOP); 5436 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 5437 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 5438 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 5439 if (str2_isL) { // LL 5440 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5441 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5442 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5443 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5444 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5445 } else { 5446 __ mov(ch2, 0xE); // all bits in byte set except last one 5447 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5448 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
5449 __ lslv(tmp2, tmp2, tmp4); 5450 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5451 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5452 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5453 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5454 } 5455 __ cmp(ch1, ch2); 5456 __ mov(tmp4, wordSize/str2_chr_size); 5457 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5458 __ BIND(L_SMALL_CMP_LOOP); 5459 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5460 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5461 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5462 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5463 __ add(tmp4, tmp4, 1); 5464 __ cmp(tmp4, cnt1); 5465 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 5466 __ cmp(first, ch2); 5467 __ br(__ EQ, L_SMALL_CMP_LOOP); 5468 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 5469 __ cbz(tmp2, NOMATCH); // no more matches. exit 5470 __ clz(tmp4, tmp2); 5471 __ add(result, result, 1); // advance index 5472 __ add(str2, str2, str2_chr_size); // advance pointer 5473 __ b(L_SMALL_HAS_ZERO_LOOP); 5474 __ align(OptoLoopAlignment); 5475 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 5476 __ cmp(first, ch2); 5477 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5478 __ b(DONE); 5479 __ align(OptoLoopAlignment); 5480 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 5481 if (str2_isL) { // LL 5482 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5483 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5484 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5485 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5486 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5487 } else { 5488 __ mov(ch2, 0xE); // all bits in byte set except last one 5489 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5490 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5491 __ lslv(tmp2, tmp2, tmp4); 5492 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5493 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5494 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5495 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5496 } 5497 __ cmp(ch1, ch2); 5498 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5499 __ b(DONE); 5500 __ align(OptoLoopAlignment); 5501 __ BIND(L_HAS_ZERO); 5502 __ rbit(tmp2, tmp2); 5503 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 5504 // Now, perform compression of counters(cnt2 and cnt1) into one register. 5505 // It's fine because both counters are 32bit and are not changed in this 5506 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 5507 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 5508 __ sub(result, result, 1); 5509 __ BIND(L_HAS_ZERO_LOOP); 5510 __ mov(cnt1, wordSize/str2_chr_size); 5511 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 5512 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 5513 if (str2_isL) { 5514 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 5515 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
5516 __ lslv(tmp2, tmp2, tmp4); 5517 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5518 __ add(tmp4, tmp4, 1); 5519 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5520 __ lsl(tmp2, tmp2, 1); 5521 __ mov(tmp4, wordSize/str2_chr_size); 5522 } else { 5523 __ mov(ch2, 0xE); 5524 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5525 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5526 __ lslv(tmp2, tmp2, tmp4); 5527 __ add(tmp4, tmp4, 1); 5528 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5529 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 5530 __ lsl(tmp2, tmp2, 1); 5531 __ mov(tmp4, wordSize/str2_chr_size); 5532 __ sub(str2, str2, str2_chr_size); 5533 } 5534 __ cmp(ch1, ch2); 5535 __ mov(tmp4, wordSize/str2_chr_size); 5536 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5537 __ BIND(L_CMP_LOOP); 5538 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5539 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5540 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5541 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5542 __ add(tmp4, tmp4, 1); 5543 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 5544 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 5545 __ cmp(cnt1, ch2); 5546 __ br(__ EQ, L_CMP_LOOP); 5547 __ BIND(L_CMP_LOOP_NOMATCH); 5548 // here we're not matched 5549 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 5550 __ clz(tmp4, tmp2); 5551 __ add(str2, str2, str2_chr_size); // advance pointer 5552 __ b(L_HAS_ZERO_LOOP); 5553 __ align(OptoLoopAlignment); 5554 __ BIND(L_CMP_LOOP_LAST_CMP); 5555 __ cmp(cnt1, ch2); 5556 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5557 __ b(DONE); 5558 __ align(OptoLoopAlignment); 5559 __ BIND(L_CMP_LOOP_LAST_CMP2); 5560 if (str2_isL) { 5561 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 5562 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5563 __ lslv(tmp2, tmp2, tmp4); 5564 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5565 __ add(tmp4, tmp4, 1); 5566 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5567 __ lsl(tmp2, tmp2, 1); 5568 } else { 5569 __ mov(ch2, 0xE); 5570 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5571 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5572 __ lslv(tmp2, tmp2, tmp4); 5573 __ add(tmp4, tmp4, 1); 5574 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5575 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 5576 __ lsl(tmp2, tmp2, 1); 5577 __ sub(str2, str2, str2_chr_size); 5578 } 5579 __ cmp(ch1, ch2); 5580 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5581 __ b(DONE); 5582 __ align(OptoLoopAlignment); 5583 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 5584 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 5585 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 5586 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 5587 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 5588 // result by analyzed characters value, so, we can just reset lower bits 5589 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 5590 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 5591 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 5592 // index of last analyzed substring inside current octet. 
So, str2 in at 5593 // respective start address. We need to advance it to next octet 5594 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 5595 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 5596 __ bfm(result, zr, 0, 2 - str2_chr_shift); 5597 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 5598 __ movw(cnt2, cnt2); 5599 __ b(L_LOOP_PROCEED); 5600 __ align(OptoLoopAlignment); 5601 __ BIND(NOMATCH); 5602 __ mov(result, -1); 5603 __ BIND(DONE); 5604 __ pop(spilled_regs, sp); 5605 __ ret(lr); 5606 return entry; 5607 } 5608 5609 void generate_string_indexof_stubs() { 5610 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 5611 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 5612 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 5613 } 5614 5615 void inflate_and_store_2_fp_registers(bool generatePrfm, 5616 FloatRegister src1, FloatRegister src2) { 5617 Register dst = r1; 5618 __ zip1(v1, __ T16B, src1, v0); 5619 __ zip2(v2, __ T16B, src1, v0); 5620 if (generatePrfm) { 5621 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 5622 } 5623 __ zip1(v3, __ T16B, src2, v0); 5624 __ zip2(v4, __ T16B, src2, v0); 5625 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 5626 } 5627 5628 // R0 = src 5629 // R1 = dst 5630 // R2 = len 5631 // R3 = len >> 3 5632 // V0 = 0 5633 // v1 = loaded 8 bytes 5634 address generate_large_byte_array_inflate() { 5635 __ align(CodeEntryAlignment); 5636 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 5637 address entry = __ pc(); 5638 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 5639 Register src = r0, dst = r1, len = r2, octetCounter = r3; 5640 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 5641 5642 // do one more 8-byte read to have address 16-byte aligned in most cases 5643 // also use single store instruction 5644 __ ldrd(v2, __ post(src, 8)); 5645 __ sub(octetCounter, octetCounter, 2); 5646 __ zip1(v1, __ T16B, v1, v0); 5647 __ zip1(v2, __ T16B, v2, v0); 5648 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 5649 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 5650 __ subs(rscratch1, octetCounter, large_loop_threshold); 5651 __ br(__ LE, LOOP_START); 5652 __ b(LOOP_PRFM_START); 5653 __ bind(LOOP_PRFM); 5654 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 5655 __ bind(LOOP_PRFM_START); 5656 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 5657 __ sub(octetCounter, octetCounter, 8); 5658 __ subs(rscratch1, octetCounter, large_loop_threshold); 5659 inflate_and_store_2_fp_registers(true, v3, v4); 5660 inflate_and_store_2_fp_registers(true, v5, v6); 5661 __ br(__ GT, LOOP_PRFM); 5662 __ cmp(octetCounter, (u1)8); 5663 __ br(__ LT, DONE); 5664 __ bind(LOOP); 5665 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 5666 __ bind(LOOP_START); 5667 __ sub(octetCounter, octetCounter, 8); 5668 __ cmp(octetCounter, (u1)8); 5669 inflate_and_store_2_fp_registers(false, v3, v4); 5670 inflate_and_store_2_fp_registers(false, v5, v6); 5671 __ br(__ GE, LOOP); 5672 __ bind(DONE); 5673 __ ret(lr); 5674 return entry; 5675 } 5676 5677 /** 5678 * Arguments: 5679 * 5680 * Input: 5681 * c_rarg0 - current state address 5682 * c_rarg1 - H key address 5683 * c_rarg2 - data address 5684 * c_rarg3 - number of blocks 5685 * 5686 * Output: 5687 * Updated state at c_rarg0 5688 */ 5689 address 
generate_ghash_processBlocks() { 5690 // Bafflingly, GCM uses little-endian for the byte order, but 5691 // big-endian for the bit order. For example, the polynomial 1 is 5692 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 5693 // 5694 // So, we must either reverse the bytes in each word and do 5695 // everything big-endian or reverse the bits in each byte and do 5696 // it little-endian. On AArch64 it's more idiomatic to reverse 5697 // the bits in each byte (we have an instruction, RBIT, to do 5698 // that) and keep the data in little-endian bit order through the 5699 // calculation, bit-reversing the inputs and outputs. 5700 5701 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 5702 __ align(wordSize * 2); 5703 address p = __ pc(); 5704 __ emit_int64(0x87); // The low-order bits of the field 5705 // polynomial (i.e. p = z^7+z^2+z+1) 5706 // repeated in the low and high parts of a 5707 // 128-bit vector 5708 __ emit_int64(0x87); 5709 5710 __ align(CodeEntryAlignment); 5711 address start = __ pc(); 5712 5713 Register state = c_rarg0; 5714 Register subkeyH = c_rarg1; 5715 Register data = c_rarg2; 5716 Register blocks = c_rarg3; 5717 5718 FloatRegister vzr = v30; 5719 __ eor(vzr, __ T16B, vzr, vzr); // zero register 5720 5721 __ ldrq(v24, p); // The field polynomial 5722 5723 __ ldrq(v0, Address(state)); 5724 __ ldrq(v1, Address(subkeyH)); 5725 5726 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 5727 __ rbit(v0, __ T16B, v0); 5728 __ rev64(v1, __ T16B, v1); 5729 __ rbit(v1, __ T16B, v1); 5730 5731 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 5732 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 5733 5734 { 5735 Label L_ghash_loop; 5736 __ bind(L_ghash_loop); 5737 5738 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 5739 // reversing each byte 5740 __ rbit(v2, __ T16B, v2); 5741 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 5742 5743 // Multiply state in v2 by subkey in v1 5744 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 5745 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 5746 /*temps*/v6, v3, /*reuse/clobber b*/v2); 5747 // Reduce v7:v5 by the field polynomial 5748 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 5749 5750 __ sub(blocks, blocks, 1); 5751 __ cbnz(blocks, L_ghash_loop); 5752 } 5753 5754 // The bit-reversed result is at this point in v0 5755 __ rev64(v0, __ T16B, v0); 5756 __ rbit(v0, __ T16B, v0); 5757 5758 __ st1(v0, __ T16B, state); 5759 __ ret(lr); 5760 5761 return start; 5762 } 5763 5764 address generate_ghash_processBlocks_wide() { 5765 address small = generate_ghash_processBlocks(); 5766 5767 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); 5768 __ align(wordSize * 2); 5769 address p = __ pc(); 5770 __ emit_int64(0x87); // The low-order bits of the field 5771 // polynomial (i.e. 
p = z^7+z^2+z+1) 5772 // repeated in the low and high parts of a 5773 // 128-bit vector 5774 __ emit_int64(0x87); 5775 5776 __ align(CodeEntryAlignment); 5777 address start = __ pc(); 5778 5779 Register state = c_rarg0; 5780 Register subkeyH = c_rarg1; 5781 Register data = c_rarg2; 5782 Register blocks = c_rarg3; 5783 5784 const int unroll = 4; 5785 5786 __ cmp(blocks, (unsigned char)(unroll * 2)); 5787 __ br(__ LT, small); 5788 5789 if (unroll > 1) { 5790 // Save state before entering routine 5791 __ sub(sp, sp, 4 * 16); 5792 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 5793 __ sub(sp, sp, 4 * 16); 5794 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 5795 } 5796 5797 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 5798 5799 if (unroll > 1) { 5800 // And restore state 5801 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 5802 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 5803 } 5804 5805 __ cmp(blocks, (unsigned char)0); 5806 __ br(__ GT, small); 5807 5808 __ ret(lr); 5809 5810 return start; 5811 } 5812 5813 void generate_base64_encode_simdround(Register src, Register dst, 5814 FloatRegister codec, u8 size) { 5815 5816 FloatRegister in0 = v4, in1 = v5, in2 = v6; 5817 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 5818 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 5819 5820 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 5821 5822 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 5823 5824 __ ushr(ind0, arrangement, in0, 2); 5825 5826 __ ushr(ind1, arrangement, in1, 2); 5827 __ shl(in0, arrangement, in0, 6); 5828 __ orr(ind1, arrangement, ind1, in0); 5829 __ ushr(ind1, arrangement, ind1, 2); 5830 5831 __ ushr(ind2, arrangement, in2, 4); 5832 __ shl(in1, arrangement, in1, 4); 5833 __ orr(ind2, arrangement, in1, ind2); 5834 __ ushr(ind2, arrangement, ind2, 2); 5835 5836 __ shl(ind3, arrangement, in2, 2); 5837 __ ushr(ind3, arrangement, ind3, 2); 5838 5839 __ tbl(out0, arrangement, codec, 4, ind0); 5840 __ tbl(out1, arrangement, codec, 4, ind1); 5841 __ tbl(out2, arrangement, codec, 4, ind2); 5842 __ tbl(out3, arrangement, codec, 4, ind3); 5843 5844 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 5845 } 5846 5847 /** 5848 * Arguments: 5849 * 5850 * Input: 5851 * c_rarg0 - src_start 5852 * c_rarg1 - src_offset 5853 * c_rarg2 - src_length 5854 * c_rarg3 - dest_start 5855 * c_rarg4 - dest_offset 5856 * c_rarg5 - isURL 5857 * 5858 */ 5859 address generate_base64_encodeBlock() { 5860 5861 static const char toBase64[64] = { 5862 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 5863 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 5864 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 5865 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 5866 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 5867 }; 5868 5869 static const char toBase64URL[64] = { 5870 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 5871 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 5872 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 5873 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 5874 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 5875 }; 5876 5877 __ align(CodeEntryAlignment); 5878 StubCodeMark mark(this, "StubRoutines", "encodeBlock"); 5879 address start = __ pc(); 5880 5881 Register src = c_rarg0; // source array 5882 Register soff = c_rarg1; 
// source start offset 5883 Register send = c_rarg2; // source end offset 5884 Register dst = c_rarg3; // dest array 5885 Register doff = c_rarg4; // position for writing to dest array 5886 Register isURL = c_rarg5; // Base64 or URL character set 5887 5888 // c_rarg6 and c_rarg7 are free to use as temps 5889 Register codec = c_rarg6; 5890 Register length = c_rarg7; 5891 5892 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 5893 5894 __ add(src, src, soff); 5895 __ add(dst, dst, doff); 5896 __ sub(length, send, soff); 5897 5898 // load the codec base address 5899 __ lea(codec, ExternalAddress((address) toBase64)); 5900 __ cbz(isURL, ProcessData); 5901 __ lea(codec, ExternalAddress((address) toBase64URL)); 5902 5903 __ BIND(ProcessData); 5904 5905 // too short to formup a SIMD loop, roll back 5906 __ cmp(length, (u1)24); 5907 __ br(Assembler::LT, Process3B); 5908 5909 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 5910 5911 __ BIND(Process48B); 5912 __ cmp(length, (u1)48); 5913 __ br(Assembler::LT, Process24B); 5914 generate_base64_encode_simdround(src, dst, v0, 16); 5915 __ sub(length, length, 48); 5916 __ b(Process48B); 5917 5918 __ BIND(Process24B); 5919 __ cmp(length, (u1)24); 5920 __ br(Assembler::LT, SIMDExit); 5921 generate_base64_encode_simdround(src, dst, v0, 8); 5922 __ sub(length, length, 24); 5923 5924 __ BIND(SIMDExit); 5925 __ cbz(length, Exit); 5926 5927 __ BIND(Process3B); 5928 // 3 src bytes, 24 bits 5929 __ ldrb(r10, __ post(src, 1)); 5930 __ ldrb(r11, __ post(src, 1)); 5931 __ ldrb(r12, __ post(src, 1)); 5932 __ orrw(r11, r11, r10, Assembler::LSL, 8); 5933 __ orrw(r12, r12, r11, Assembler::LSL, 8); 5934 // codec index 5935 __ ubfmw(r15, r12, 18, 23); 5936 __ ubfmw(r14, r12, 12, 17); 5937 __ ubfmw(r13, r12, 6, 11); 5938 __ andw(r12, r12, 63); 5939 // get the code based on the codec 5940 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 5941 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 5942 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 5943 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 5944 __ strb(r15, __ post(dst, 1)); 5945 __ strb(r14, __ post(dst, 1)); 5946 __ strb(r13, __ post(dst, 1)); 5947 __ strb(r12, __ post(dst, 1)); 5948 __ sub(length, length, 3); 5949 __ cbnz(length, Process3B); 5950 5951 __ BIND(Exit); 5952 __ ret(lr); 5953 5954 return start; 5955 } 5956 5957 void generate_base64_decode_simdround(Register src, Register dst, 5958 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 5959 5960 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 5961 FloatRegister out0 = v20, out1 = v21, out2 = v22; 5962 5963 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 5964 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 5965 5966 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 5967 5968 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 5969 5970 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 5971 5972 // we need unsigned saturating subtract, to make sure all input values 5973 // in range [0, 63] will have 0U value in the higher half lookup 5974 __ uqsubv(decH0, __ T16B, in0, v27); 5975 __ uqsubv(decH1, __ T16B, in1, v27); 5976 __ uqsubv(decH2, __ T16B, in2, v27); 5977 __ uqsubv(decH3, __ T16B, in3, v27); 5978 5979 // lower half lookup 5980 __ tbl(decL0, arrangement, codecL, 4, in0); 5981 __ tbl(decL1, arrangement, codecL, 4, in1); 5982 __ tbl(decL2, arrangement, codecL, 4, in2); 5983 __ tbl(decL3, arrangement, codecL, 4, in3); 5984 5985 // higher half lookup 5986 __ tbx(decH0, arrangement, codecH, 4, decH0); 5987 __ tbx(decH1, arrangement, codecH, 4, decH1); 5988 __ tbx(decH2, arrangement, codecH, 4, decH2); 5989 __ tbx(decH3, arrangement, codecH, 4, decH3); 5990 5991 // combine lower and higher 5992 __ orr(decL0, arrangement, decL0, decH0); 5993 __ orr(decL1, arrangement, decL1, decH1); 5994 __ orr(decL2, arrangement, decL2, decH2); 5995 __ orr(decL3, arrangement, decL3, decH3); 5996 5997 // check illegal inputs, value larger than 63 (maximum of 6 bits) 5998 __ cmhi(decH0, arrangement, decL0, v27); 5999 __ cmhi(decH1, arrangement, decL1, v27); 6000 __ cmhi(decH2, arrangement, decL2, v27); 6001 __ cmhi(decH3, arrangement, decL3, v27); 6002 __ orr(in0, arrangement, decH0, decH1); 6003 __ orr(in1, arrangement, decH2, decH3); 6004 __ orr(in2, arrangement, in0, in1); 6005 __ umaxv(in3, arrangement, in2); 6006 __ umov(rscratch2, in3, __ B, 0); 6007 6008 // get the data to output 6009 __ shl(out0, arrangement, decL0, 2); 6010 __ ushr(out1, arrangement, decL1, 4); 6011 __ orr(out0, arrangement, out0, out1); 6012 __ shl(out1, arrangement, decL1, 4); 6013 __ ushr(out2, arrangement, decL2, 2); 6014 __ orr(out1, arrangement, out1, out2); 6015 __ shl(out2, arrangement, decL2, 6); 6016 __ orr(out2, arrangement, out2, decL3); 6017 6018 __ cbz(rscratch2, NoIllegalData); 6019 6020 // handle illegal input 6021 __ umov(r10, in2, __ D, 0); 6022 if (size == 16) { 6023 __ cbnz(r10, ErrorInLowerHalf); 6024 6025 // illegal input is in higher half, store the lower half now. 6026 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 6027 6028 __ umov(r10, in2, __ D, 1); 6029 __ umov(r11, out0, __ D, 1); 6030 __ umov(r12, out1, __ D, 1); 6031 __ umov(r13, out2, __ D, 1); 6032 __ b(StoreLegalData); 6033 6034 __ BIND(ErrorInLowerHalf); 6035 } 6036 __ umov(r11, out0, __ D, 0); 6037 __ umov(r12, out1, __ D, 0); 6038 __ umov(r13, out2, __ D, 0); 6039 6040 __ BIND(StoreLegalData); 6041 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 6042 __ strb(r11, __ post(dst, 1)); 6043 __ strb(r12, __ post(dst, 1)); 6044 __ strb(r13, __ post(dst, 1)); 6045 __ lsr(r10, r10, 8); 6046 __ lsr(r11, r11, 8); 6047 __ lsr(r12, r12, 8); 6048 __ lsr(r13, r13, 8); 6049 __ b(StoreLegalData); 6050 6051 __ BIND(NoIllegalData); 6052 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 6053 } 6054 6055 6056 /** 6057 * Arguments: 6058 * 6059 * Input: 6060 * c_rarg0 - src_start 6061 * c_rarg1 - src_offset 6062 * c_rarg2 - src_length 6063 * c_rarg3 - dest_start 6064 * c_rarg4 - dest_offset 6065 * c_rarg5 - isURL 6066 * c_rarg6 - isMIME 6067 * 6068 */ 6069 address generate_base64_decodeBlock() { 6070 6071 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 6072 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 6073 // titled "Base64 decoding". 
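    // For reference, the scalar (non-SIMD) path below decodes one 4-character
    // group roughly as in this C sketch (illustration only, not the generated
    // code; 'table' stands for fromBase64ForNoSIMD / fromBase64URLForNoSIMD and
    // dst points to uint8_t output):
    //
    //   uint8_t a = table[src[0]], b = table[src[1]], c = table[src[2]], d = table[src[3]];
    //   if ((a | b | c | d) & 0x80) goto error;   // 255u marks an illegal character
    //   dst[0] = (a << 2) | (b >> 4);
    //   dst[1] = (b << 4) | (c >> 2);
    //   dst[2] = (c << 6) | d;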
6074 6075 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 6076 // except the trailing character '=' is also treated illegal value in this intrinsic. That 6077 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 6078 static const uint8_t fromBase64ForNoSIMD[256] = { 6079 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6080 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6081 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6082 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6083 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6084 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 6085 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6086 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6087 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6088 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6089 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6090 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6091 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6092 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6093 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6094 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6095 }; 6096 6097 static const uint8_t fromBase64URLForNoSIMD[256] = { 6098 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6099 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6100 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6101 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6102 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6103 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 6104 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6105 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6106 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6107 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6108 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6109 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6110 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6111 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6112 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6113 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6114 }; 6115 6116 // A legal value of base64 
code is in range [0, 127]. We need two lookups 6117 // with tbl/tbx and combine them to get the decode data. The 1st table vector 6118 // lookup use tbl, out of range indices are set to 0 in destination. The 2nd 6119 // table vector lookup use tbx, out of range indices are unchanged in 6120 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 6121 // The value of index 64 is set to 0, so that we know that we already get the 6122 // decoded data with the 1st lookup. 6123 static const uint8_t fromBase64ForSIMD[128] = { 6124 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6125 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6126 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6127 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6128 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6129 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6130 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6131 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6132 }; 6133 6134 static const uint8_t fromBase64URLForSIMD[128] = { 6135 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6136 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6137 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6138 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6139 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6140 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6141 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6142 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6143 }; 6144 6145 __ align(CodeEntryAlignment); 6146 StubCodeMark mark(this, "StubRoutines", "decodeBlock"); 6147 address start = __ pc(); 6148 6149 Register src = c_rarg0; // source array 6150 Register soff = c_rarg1; // source start offset 6151 Register send = c_rarg2; // source end offset 6152 Register dst = c_rarg3; // dest array 6153 Register doff = c_rarg4; // position for writing to dest array 6154 Register isURL = c_rarg5; // Base64 or URL character set 6155 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 6156 6157 Register length = send; // reuse send as length of source data to process 6158 6159 Register simd_codec = c_rarg6; 6160 Register nosimd_codec = c_rarg7; 6161 6162 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 6163 6164 __ enter(); 6165 6166 __ add(src, src, soff); 6167 __ add(dst, dst, doff); 6168 6169 __ mov(doff, dst); 6170 6171 __ sub(length, send, soff); 6172 __ bfm(length, zr, 0, 1); 6173 6174 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 6175 __ cbz(isURL, ProcessData); 6176 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 6177 6178 __ BIND(ProcessData); 6179 __ mov(rscratch1, length); 6180 __ cmp(length, (u1)144); // 144 = 80 + 64 6181 __ br(Assembler::LT, Process4B); 6182 6183 // In the MIME case, the line length cannot be more than 76 6184 // bytes (see RFC 2045). 
This is too short a block for SIMD 6185 // to be worthwhile, so we use non-SIMD here. 6186 __ movw(rscratch1, 79); 6187 6188 __ BIND(Process4B); 6189 __ ldrw(r14, __ post(src, 4)); 6190 __ ubfxw(r10, r14, 0, 8); 6191 __ ubfxw(r11, r14, 8, 8); 6192 __ ubfxw(r12, r14, 16, 8); 6193 __ ubfxw(r13, r14, 24, 8); 6194 // get the de-code 6195 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 6196 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 6197 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 6198 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 6199 // error detection, 255u indicates an illegal input 6200 __ orrw(r14, r10, r11); 6201 __ orrw(r15, r12, r13); 6202 __ orrw(r14, r14, r15); 6203 __ tbnz(r14, 7, Exit); 6204 // recover the data 6205 __ lslw(r14, r10, 10); 6206 __ bfiw(r14, r11, 4, 6); 6207 __ bfmw(r14, r12, 2, 5); 6208 __ rev16w(r14, r14); 6209 __ bfiw(r13, r12, 6, 2); 6210 __ strh(r14, __ post(dst, 2)); 6211 __ strb(r13, __ post(dst, 1)); 6212 // non-simd loop 6213 __ subsw(rscratch1, rscratch1, 4); 6214 __ br(Assembler::GT, Process4B); 6215 6216 // if exiting from PreProcess80B, rscratch1 == -1; 6217 // otherwise, rscratch1 == 0. 6218 __ cbzw(rscratch1, Exit); 6219 __ sub(length, length, 80); 6220 6221 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 6222 __ cbz(isURL, SIMDEnter); 6223 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 6224 6225 __ BIND(SIMDEnter); 6226 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 6227 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 6228 __ mov(rscratch1, 63); 6229 __ dup(v27, __ T16B, rscratch1); 6230 6231 __ BIND(Process64B); 6232 __ cmp(length, (u1)64); 6233 __ br(Assembler::LT, Process32B); 6234 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 6235 __ sub(length, length, 64); 6236 __ b(Process64B); 6237 6238 __ BIND(Process32B); 6239 __ cmp(length, (u1)32); 6240 __ br(Assembler::LT, SIMDExit); 6241 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 6242 __ sub(length, length, 32); 6243 __ b(Process32B); 6244 6245 __ BIND(SIMDExit); 6246 __ cbz(length, Exit); 6247 __ movw(rscratch1, length); 6248 __ b(Process4B); 6249 6250 __ BIND(Exit); 6251 __ sub(c_rarg0, dst, doff); 6252 6253 __ leave(); 6254 __ ret(lr); 6255 6256 return start; 6257 } 6258 6259 // Support for spin waits. 6260 address generate_spin_wait() { 6261 __ align(CodeEntryAlignment); 6262 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 6263 address start = __ pc(); 6264 6265 __ spin_wait(); 6266 __ ret(lr); 6267 6268 return start; 6269 } 6270 6271 #ifdef LINUX 6272 6273 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 6274 // 6275 // If LSE is in use, generate LSE versions of all the stubs. The 6276 // non-LSE versions are in atomic_aarch64.S. 6277 6278 // class AtomicStubMark records the entry point of a stub and the 6279 // stub pointer which will point to it. The stub pointer is set to 6280 // the entry point when ~AtomicStubMark() is called, which must be 6281 // after ICache::invalidate_range. This ensures safe publication of 6282 // the generated code. 
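  // Typical use, sketched from generate_atomic_entry_points() below:
  //
  //   AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
  //   gen_ldadd_entry(Assembler::word, memory_order_conservative);
  //   ...
  //   ICache::invalidate_range(first_entry, __ pc() - first_entry);
  //   // the AtomicStubMark destructors run here and publish the entry points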
6283 class AtomicStubMark { 6284 address _entry_point; 6285 aarch64_atomic_stub_t *_stub; 6286 MacroAssembler *_masm; 6287 public: 6288 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 6289 _masm = masm; 6290 __ align(32); 6291 _entry_point = __ pc(); 6292 _stub = stub; 6293 } 6294 ~AtomicStubMark() { 6295 *_stub = (aarch64_atomic_stub_t)_entry_point; 6296 } 6297 }; 6298 6299 // NB: For memory_order_conservative we need a trailing membar after 6300 // LSE atomic operations but not a leading membar. 6301 // 6302 // We don't need a leading membar because a clause in the Arm ARM 6303 // says: 6304 // 6305 // Barrier-ordered-before 6306 // 6307 // Barrier instructions order prior Memory effects before subsequent 6308 // Memory effects generated by the same Observer. A read or a write 6309 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 6310 // Observer if and only if RW1 appears in program order before RW 2 6311 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 6312 // instruction with both Acquire and Release semantics. 6313 // 6314 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 6315 // and Release semantics, therefore we don't need a leading 6316 // barrier. However, there is no corresponding Barrier-ordered-after 6317 // relationship, therefore we need a trailing membar to prevent a 6318 // later store or load from being reordered with the store in an 6319 // atomic instruction. 6320 // 6321 // This was checked by using the herd7 consistency model simulator 6322 // (http://diy.inria.fr/) with this test case: 6323 // 6324 // AArch64 LseCas 6325 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 6326 // P0 | P1; 6327 // LDR W4, [X2] | MOV W3, #0; 6328 // DMB LD | MOV W4, #1; 6329 // LDR W3, [X1] | CASAL W3, W4, [X1]; 6330 // | DMB ISH; 6331 // | STR W4, [X2]; 6332 // exists 6333 // (0:X3=0 /\ 0:X4=1) 6334 // 6335 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 6336 // with the store to x in P1. Without the DMB in P1 this may happen. 6337 // 6338 // At the time of writing we don't know of any AArch64 hardware that 6339 // reorders stores in this way, but the Reference Manual permits it. 6340 6341 void gen_cas_entry(Assembler::operand_size size, 6342 atomic_memory_order order) { 6343 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 6344 exchange_val = c_rarg2; 6345 bool acquire, release; 6346 switch (order) { 6347 case memory_order_relaxed: 6348 acquire = false; 6349 release = false; 6350 break; 6351 case memory_order_release: 6352 acquire = false; 6353 release = true; 6354 break; 6355 default: 6356 acquire = true; 6357 release = true; 6358 break; 6359 } 6360 __ mov(prev, compare_val); 6361 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 6362 if (order == memory_order_conservative) { 6363 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6364 } 6365 if (size == Assembler::xword) { 6366 __ mov(r0, prev); 6367 } else { 6368 __ movw(r0, prev); 6369 } 6370 __ ret(lr); 6371 } 6372 6373 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 6374 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6375 // If not relaxed, then default to conservative. Relaxed is the only 6376 // case we use enough to be worth specializing. 
6377 if (order == memory_order_relaxed) { 6378 __ ldadd(size, incr, prev, addr); 6379 } else { 6380 __ ldaddal(size, incr, prev, addr); 6381 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6382 } 6383 if (size == Assembler::xword) { 6384 __ mov(r0, prev); 6385 } else { 6386 __ movw(r0, prev); 6387 } 6388 __ ret(lr); 6389 } 6390 6391 void gen_swpal_entry(Assembler::operand_size size) { 6392 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6393 __ swpal(size, incr, prev, addr); 6394 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6395 if (size == Assembler::xword) { 6396 __ mov(r0, prev); 6397 } else { 6398 __ movw(r0, prev); 6399 } 6400 __ ret(lr); 6401 } 6402 6403 void generate_atomic_entry_points() { 6404 if (! UseLSE) { 6405 return; 6406 } 6407 6408 __ align(CodeEntryAlignment); 6409 StubCodeMark mark(this, "StubRoutines", "atomic entry points"); 6410 address first_entry = __ pc(); 6411 6412 // ADD, memory_order_conservative 6413 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 6414 gen_ldadd_entry(Assembler::word, memory_order_conservative); 6415 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 6416 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 6417 6418 // ADD, memory_order_relaxed 6419 AtomicStubMark mark_fetch_add_4_relaxed 6420 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 6421 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 6422 AtomicStubMark mark_fetch_add_8_relaxed 6423 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 6424 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 6425 6426 // XCHG, memory_order_conservative 6427 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 6428 gen_swpal_entry(Assembler::word); 6429 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 6430 gen_swpal_entry(Assembler::xword); 6431 6432 // CAS, memory_order_conservative 6433 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 6434 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 6435 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 6436 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 6437 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 6438 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 6439 6440 // CAS, memory_order_relaxed 6441 AtomicStubMark mark_cmpxchg_1_relaxed 6442 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 6443 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 6444 AtomicStubMark mark_cmpxchg_4_relaxed 6445 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 6446 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 6447 AtomicStubMark mark_cmpxchg_8_relaxed 6448 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 6449 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 6450 6451 AtomicStubMark mark_cmpxchg_4_release 6452 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 6453 gen_cas_entry(MacroAssembler::word, memory_order_release); 6454 AtomicStubMark mark_cmpxchg_8_release 6455 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 6456 gen_cas_entry(MacroAssembler::xword, memory_order_release); 6457 6458 AtomicStubMark mark_cmpxchg_4_seq_cst 6459 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 6460 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 6461 AtomicStubMark mark_cmpxchg_8_seq_cst 6462 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 6463 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 6464 
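    // Flush the instruction cache before the AtomicStubMark destructors run at
    // the end of this scope and publish the entry points (see the class comment
    // above).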
6465 ICache::invalidate_range(first_entry, __ pc() - first_entry); 6466 } 6467 #endif // LINUX 6468 6469 // Continuation point for throwing of implicit exceptions that are 6470 // not handled in the current activation. Fabricates an exception 6471 // oop and initiates normal exception dispatching in this 6472 // frame. Since we need to preserve callee-saved values (currently 6473 // only for C2, but done for C1 as well) we need a callee-saved oop 6474 // map and therefore have to make these stubs into RuntimeStubs 6475 // rather than BufferBlobs. If the compiler needs all registers to 6476 // be preserved between the fault point and the exception handler 6477 // then it must assume responsibility for that in 6478 // AbstractCompiler::continuation_for_implicit_null_exception or 6479 // continuation_for_implicit_division_by_zero_exception. All other 6480 // implicit exceptions (e.g., NullPointerException or 6481 // AbstractMethodError on entry) are either at call sites or 6482 // otherwise assume that stack unwinding will be initiated, so 6483 // caller saved registers were assumed volatile in the compiler. 6484 6485 #undef __ 6486 #define __ masm-> 6487 6488 address generate_throw_exception(const char* name, 6489 address runtime_entry, 6490 Register arg1 = noreg, 6491 Register arg2 = noreg) { 6492 // Information about frame layout at time of blocking runtime call. 6493 // Note that we only have to preserve callee-saved registers since 6494 // the compilers are responsible for supplying a continuation point 6495 // if they expect all registers to be preserved. 6496 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 6497 enum layout { 6498 rfp_off = 0, 6499 rfp_off2, 6500 return_off, 6501 return_off2, 6502 framesize // inclusive of return address 6503 }; 6504 6505 int insts_size = 512; 6506 int locs_size = 64; 6507 6508 CodeBuffer code(name, insts_size, locs_size); 6509 OopMapSet* oop_maps = new OopMapSet(); 6510 MacroAssembler* masm = new MacroAssembler(&code); 6511 6512 address start = __ pc(); 6513 6514 // This is an inlined and slightly modified version of call_VM 6515 // which has the ability to fetch the return PC out of 6516 // thread-local storage and also sets up last_Java_sp slightly 6517 // differently than the real call_VM 6518 6519 __ enter(); // Save FP and LR before call 6520 6521 assert(is_even(framesize/2), "sp not 16-byte aligned"); 6522 6523 // lr and fp are already in place 6524 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 6525 6526 int frame_complete = __ pc() - start; 6527 6528 // Set up last_Java_sp and last_Java_fp 6529 address the_pc = __ pc(); 6530 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 6531 6532 // Call runtime 6533 if (arg1 != noreg) { 6534 assert(arg2 != c_rarg1, "clobbered"); 6535 __ mov(c_rarg1, arg1); 6536 } 6537 if (arg2 != noreg) { 6538 __ mov(c_rarg2, arg2); 6539 } 6540 __ mov(c_rarg0, rthread); 6541 BLOCK_COMMENT("call runtime_entry"); 6542 __ mov(rscratch1, runtime_entry); 6543 __ blr(rscratch1); 6544 6545 // Generate oop map 6546 OopMap* map = new OopMap(framesize, 0); 6547 6548 oop_maps->add_gc_map(the_pc - start, map); 6549 6550 __ reset_last_Java_frame(true); 6551 6552 // Reinitialize the ptrue predicate register, in case the external runtime 6553 // call clobbers ptrue reg, as we may return to SVE compiled code. 
6554 __ reinitialize_ptrue();
6555 
6556 __ leave();
6557 
6558 // check for pending exceptions
6559 #ifdef ASSERT
6560 Label L;
6561 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6562 __ cbnz(rscratch1, L);
6563 __ should_not_reach_here();
6564 __ bind(L);
6565 #endif // ASSERT
6566 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6567 
6568 
6569 // codeBlob framesize is in words (not VMRegImpl::slot_size)
6570 RuntimeStub* stub =
6571 RuntimeStub::new_runtime_stub(name,
6572 &code,
6573 frame_complete,
6574 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6575 oop_maps, false);
6576 return stub->entry_point();
6577 }
6578 
6579 class MontgomeryMultiplyGenerator : public MacroAssembler {
6580 
6581 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6582 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6583 
6584 RegSet _toSave;
6585 bool _squaring;
6586 
6587 public:
6588 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6589 : MacroAssembler(as->code()), _squaring(squaring) {
6590 
6591 // Register allocation
6592 
6593 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6594 Pa_base = *regs; // Argument registers
6595 if (squaring)
6596 Pb_base = Pa_base;
6597 else
6598 Pb_base = *++regs;
6599 Pn_base = *++regs;
6600 Rlen = *++regs;
6601 inv = *++regs;
6602 Pm_base = *++regs;
6603 
6604 // Working registers:
6605 Ra = *++regs; // The current digit of a, b, n, and m.
6606 Rb = *++regs;
6607 Rm = *++regs;
6608 Rn = *++regs;
6609 
6610 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
6611 Pb = *++regs;
6612 Pm = *++regs;
6613 Pn = *++regs;
6614 
6615 t0 = *++regs; // Three registers which form a
6616 t1 = *++regs; // triple-precision accumulator.
6617 t2 = *++regs;
6618 
6619 Ri = *++regs; // Inner and outer loop indexes.
6620 Rj = *++regs;
6621 
6622 Rhi_ab = *++regs; // Product registers: low and high parts
6623 Rlo_ab = *++regs; // of a*b and m*n.
6624 Rhi_mn = *++regs;
6625 Rlo_mn = *++regs;
6626 
6627 // r19 and up are callee-saved.
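// Pm_base is added to the save set too: it is an argument register
// holding the caller's result pointer, and generate_multiply() and
// generate_square() below repoint it at the on-stack scratch area, so
// the caller's value must survive until restore_regs() in order to
// copy the result back out at the end.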
6628 _toSave = RegSet::range(r19, *regs) + Pm_base; 6629 } 6630 6631 private: 6632 void save_regs() { 6633 push(_toSave, sp); 6634 } 6635 6636 void restore_regs() { 6637 pop(_toSave, sp); 6638 } 6639 6640 template <typename T> 6641 void unroll_2(Register count, T block) { 6642 Label loop, end, odd; 6643 tbnz(count, 0, odd); 6644 cbz(count, end); 6645 align(16); 6646 bind(loop); 6647 (this->*block)(); 6648 bind(odd); 6649 (this->*block)(); 6650 subs(count, count, 2); 6651 br(Assembler::GT, loop); 6652 bind(end); 6653 } 6654 6655 template <typename T> 6656 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 6657 Label loop, end, odd; 6658 tbnz(count, 0, odd); 6659 cbz(count, end); 6660 align(16); 6661 bind(loop); 6662 (this->*block)(d, s, tmp); 6663 bind(odd); 6664 (this->*block)(d, s, tmp); 6665 subs(count, count, 2); 6666 br(Assembler::GT, loop); 6667 bind(end); 6668 } 6669 6670 void pre1(RegisterOrConstant i) { 6671 block_comment("pre1"); 6672 // Pa = Pa_base; 6673 // Pb = Pb_base + i; 6674 // Pm = Pm_base; 6675 // Pn = Pn_base + i; 6676 // Ra = *Pa; 6677 // Rb = *Pb; 6678 // Rm = *Pm; 6679 // Rn = *Pn; 6680 ldr(Ra, Address(Pa_base)); 6681 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 6682 ldr(Rm, Address(Pm_base)); 6683 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 6684 lea(Pa, Address(Pa_base)); 6685 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 6686 lea(Pm, Address(Pm_base)); 6687 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 6688 6689 // Zero the m*n result. 6690 mov(Rhi_mn, zr); 6691 mov(Rlo_mn, zr); 6692 } 6693 6694 // The core multiply-accumulate step of a Montgomery 6695 // multiplication. The idea is to schedule operations as a 6696 // pipeline so that instructions with long latencies (loads and 6697 // multiplies) have time to complete before their results are 6698 // used. This most benefits in-order implementations of the 6699 // architecture but out-of-order ones also benefit. 6700 void step() { 6701 block_comment("step"); 6702 // MACC(Ra, Rb, t0, t1, t2); 6703 // Ra = *++Pa; 6704 // Rb = *--Pb; 6705 umulh(Rhi_ab, Ra, Rb); 6706 mul(Rlo_ab, Ra, Rb); 6707 ldr(Ra, pre(Pa, wordSize)); 6708 ldr(Rb, pre(Pb, -wordSize)); 6709 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 6710 // previous iteration. 6711 // MACC(Rm, Rn, t0, t1, t2); 6712 // Rm = *++Pm; 6713 // Rn = *--Pn; 6714 umulh(Rhi_mn, Rm, Rn); 6715 mul(Rlo_mn, Rm, Rn); 6716 ldr(Rm, pre(Pm, wordSize)); 6717 ldr(Rn, pre(Pn, -wordSize)); 6718 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 6719 } 6720 6721 void post1() { 6722 block_comment("post1"); 6723 6724 // MACC(Ra, Rb, t0, t1, t2); 6725 // Ra = *++Pa; 6726 // Rb = *--Pb; 6727 umulh(Rhi_ab, Ra, Rb); 6728 mul(Rlo_ab, Ra, Rb); 6729 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 6730 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 6731 6732 // *Pm = Rm = t0 * inv; 6733 mul(Rm, t0, inv); 6734 str(Rm, Address(Pm)); 6735 6736 // MACC(Rm, Rn, t0, t1, t2); 6737 // t0 = t1; t1 = t2; t2 = 0; 6738 umulh(Rhi_mn, Rm, Rn); 6739 6740 #ifndef PRODUCT 6741 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 6742 { 6743 mul(Rlo_mn, Rm, Rn); 6744 add(Rlo_mn, t0, Rlo_mn); 6745 Label ok; 6746 cbz(Rlo_mn, ok); { 6747 stop("broken Montgomery multiply"); 6748 } bind(ok); 6749 } 6750 #endif 6751 // We have very carefully set things up so that 6752 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 6753 // the lower half of Rm * Rn because we know the result already: 6754 // it must be -t0. 
t0 + (-t0) must generate a carry iff 6755 // t0 != 0. So, rather than do a mul and an adds we just set 6756 // the carry flag iff t0 is nonzero. 6757 // 6758 // mul(Rlo_mn, Rm, Rn); 6759 // adds(zr, t0, Rlo_mn); 6760 subs(zr, t0, 1); // Set carry iff t0 is nonzero 6761 adcs(t0, t1, Rhi_mn); 6762 adc(t1, t2, zr); 6763 mov(t2, zr); 6764 } 6765 6766 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 6767 block_comment("pre2"); 6768 // Pa = Pa_base + i-len; 6769 // Pb = Pb_base + len; 6770 // Pm = Pm_base + i-len; 6771 // Pn = Pn_base + len; 6772 6773 if (i.is_register()) { 6774 sub(Rj, i.as_register(), len); 6775 } else { 6776 mov(Rj, i.as_constant()); 6777 sub(Rj, Rj, len); 6778 } 6779 // Rj == i-len 6780 6781 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 6782 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 6783 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 6784 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 6785 6786 // Ra = *++Pa; 6787 // Rb = *--Pb; 6788 // Rm = *++Pm; 6789 // Rn = *--Pn; 6790 ldr(Ra, pre(Pa, wordSize)); 6791 ldr(Rb, pre(Pb, -wordSize)); 6792 ldr(Rm, pre(Pm, wordSize)); 6793 ldr(Rn, pre(Pn, -wordSize)); 6794 6795 mov(Rhi_mn, zr); 6796 mov(Rlo_mn, zr); 6797 } 6798 6799 void post2(RegisterOrConstant i, RegisterOrConstant len) { 6800 block_comment("post2"); 6801 if (i.is_constant()) { 6802 mov(Rj, i.as_constant()-len.as_constant()); 6803 } else { 6804 sub(Rj, i.as_register(), len); 6805 } 6806 6807 adds(t0, t0, Rlo_mn); // The pending m*n, low part 6808 6809 // As soon as we know the least significant digit of our result, 6810 // store it. 6811 // Pm_base[i-len] = t0; 6812 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 6813 6814 // t0 = t1; t1 = t2; t2 = 0; 6815 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 6816 adc(t1, t2, zr); 6817 mov(t2, zr); 6818 } 6819 6820 // A carry in t0 after Montgomery multiplication means that we 6821 // should subtract multiples of n from our result in m. We'll 6822 // keep doing that until there is no carry. 6823 void normalize(RegisterOrConstant len) { 6824 block_comment("normalize"); 6825 // while (t0) 6826 // t0 = sub(Pm_base, Pn_base, t0, len); 6827 Label loop, post, again; 6828 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 6829 cbz(t0, post); { 6830 bind(again); { 6831 mov(i, zr); 6832 mov(cnt, len); 6833 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 6834 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 6835 subs(zr, zr, zr); // set carry flag, i.e. no borrow 6836 align(16); 6837 bind(loop); { 6838 sbcs(Rm, Rm, Rn); 6839 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 6840 add(i, i, 1); 6841 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 6842 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 6843 sub(cnt, cnt, 1); 6844 } cbnz(cnt, loop); 6845 sbc(t0, t0, zr); 6846 } cbnz(t0, again); 6847 } bind(post); 6848 } 6849 6850 // Move memory at s to d, reversing words. 
6851 // Increments d to end of copied memory 6852 // Destroys tmp1, tmp2 6853 // Preserves len 6854 // Leaves s pointing to the address which was in d at start 6855 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 6856 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 6857 6858 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 6859 mov(tmp1, len); 6860 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 6861 sub(s, d, len, ext::uxtw, LogBytesPerWord); 6862 } 6863 // where 6864 void reverse1(Register d, Register s, Register tmp) { 6865 ldr(tmp, pre(s, -wordSize)); 6866 ror(tmp, tmp, 32); 6867 str(tmp, post(d, wordSize)); 6868 } 6869 6870 void step_squaring() { 6871 // An extra ACC 6872 step(); 6873 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 6874 } 6875 6876 void last_squaring(RegisterOrConstant i) { 6877 Label dont; 6878 // if ((i & 1) == 0) { 6879 tbnz(i.as_register(), 0, dont); { 6880 // MACC(Ra, Rb, t0, t1, t2); 6881 // Ra = *++Pa; 6882 // Rb = *--Pb; 6883 umulh(Rhi_ab, Ra, Rb); 6884 mul(Rlo_ab, Ra, Rb); 6885 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 6886 } bind(dont); 6887 } 6888 6889 void extra_step_squaring() { 6890 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 6891 6892 // MACC(Rm, Rn, t0, t1, t2); 6893 // Rm = *++Pm; 6894 // Rn = *--Pn; 6895 umulh(Rhi_mn, Rm, Rn); 6896 mul(Rlo_mn, Rm, Rn); 6897 ldr(Rm, pre(Pm, wordSize)); 6898 ldr(Rn, pre(Pn, -wordSize)); 6899 } 6900 6901 void post1_squaring() { 6902 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 6903 6904 // *Pm = Rm = t0 * inv; 6905 mul(Rm, t0, inv); 6906 str(Rm, Address(Pm)); 6907 6908 // MACC(Rm, Rn, t0, t1, t2); 6909 // t0 = t1; t1 = t2; t2 = 0; 6910 umulh(Rhi_mn, Rm, Rn); 6911 6912 #ifndef PRODUCT 6913 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 6914 { 6915 mul(Rlo_mn, Rm, Rn); 6916 add(Rlo_mn, t0, Rlo_mn); 6917 Label ok; 6918 cbz(Rlo_mn, ok); { 6919 stop("broken Montgomery multiply"); 6920 } bind(ok); 6921 } 6922 #endif 6923 // We have very carefully set things up so that 6924 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 6925 // the lower half of Rm * Rn because we know the result already: 6926 // it must be -t0. t0 + (-t0) must generate a carry iff 6927 // t0 != 0. So, rather than do a mul and an adds we just set 6928 // the carry flag iff t0 is nonzero. 6929 // 6930 // mul(Rlo_mn, Rm, Rn); 6931 // adds(zr, t0, Rlo_mn); 6932 subs(zr, t0, 1); // Set carry iff t0 is nonzero 6933 adcs(t0, t1, Rhi_mn); 6934 adc(t1, t2, zr); 6935 mov(t2, zr); 6936 } 6937 6938 void acc(Register Rhi, Register Rlo, 6939 Register t0, Register t1, Register t2) { 6940 adds(t0, t0, Rlo); 6941 adcs(t1, t1, Rhi); 6942 adc(t2, t2, zr); 6943 } 6944 6945 public: 6946 /** 6947 * Fast Montgomery multiplication. The derivation of the 6948 * algorithm is in A Cryptographic Library for the Motorola 6949 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
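 *
 * The key invariant: in outer iteration i the generated code picks
 *     m[i] = t0 * inv  (mod 2^64),  with  inv == -1/n[0] (mod 2^64),
 * so that t0 + m[i]*n[0] == 0 (mod 2^64).  The column sum held in the
 * triple-precision accumulator (t0, t1, t2) is then exactly divisible
 * by 2^64 and is shifted down one word (t0 = t1; t1 = t2; t2 = 0),
 * which is the Montgomery reduction step (cf. the assertions
 * "m[i] * n[0] + t0 == 0" and "inv * n[0] == -1UL" in this class).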
6950 * 6951 * Arguments: 6952 * 6953 * Inputs for multiplication: 6954 * c_rarg0 - int array elements a 6955 * c_rarg1 - int array elements b 6956 * c_rarg2 - int array elements n (the modulus) 6957 * c_rarg3 - int length 6958 * c_rarg4 - int inv 6959 * c_rarg5 - int array elements m (the result) 6960 * 6961 * Inputs for squaring: 6962 * c_rarg0 - int array elements a 6963 * c_rarg1 - int array elements n (the modulus) 6964 * c_rarg2 - int length 6965 * c_rarg3 - int inv 6966 * c_rarg4 - int array elements m (the result) 6967 * 6968 */ 6969 address generate_multiply() { 6970 Label argh, nothing; 6971 bind(argh); 6972 stop("MontgomeryMultiply total_allocation must be <= 8192"); 6973 6974 align(CodeEntryAlignment); 6975 address entry = pc(); 6976 6977 cbzw(Rlen, nothing); 6978 6979 enter(); 6980 6981 // Make room. 6982 cmpw(Rlen, 512); 6983 br(Assembler::HI, argh); 6984 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 6985 andr(sp, Ra, -2 * wordSize); 6986 6987 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 6988 6989 { 6990 // Copy input args, reversing as we go. We use Ra as a 6991 // temporary variable. 6992 reverse(Ra, Pa_base, Rlen, t0, t1); 6993 if (!_squaring) 6994 reverse(Ra, Pb_base, Rlen, t0, t1); 6995 reverse(Ra, Pn_base, Rlen, t0, t1); 6996 } 6997 6998 // Push all call-saved registers and also Pm_base which we'll need 6999 // at the end. 7000 save_regs(); 7001 7002 #ifndef PRODUCT 7003 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 7004 { 7005 ldr(Rn, Address(Pn_base, 0)); 7006 mul(Rlo_mn, Rn, inv); 7007 subs(zr, Rlo_mn, -1); 7008 Label ok; 7009 br(EQ, ok); { 7010 stop("broken inverse in Montgomery multiply"); 7011 } bind(ok); 7012 } 7013 #endif 7014 7015 mov(Pm_base, Ra); 7016 7017 mov(t0, zr); 7018 mov(t1, zr); 7019 mov(t2, zr); 7020 7021 block_comment("for (int i = 0; i < len; i++) {"); 7022 mov(Ri, zr); { 7023 Label loop, end; 7024 cmpw(Ri, Rlen); 7025 br(Assembler::GE, end); 7026 7027 bind(loop); 7028 pre1(Ri); 7029 7030 block_comment(" for (j = i; j; j--) {"); { 7031 movw(Rj, Ri); 7032 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7033 } block_comment(" } // j"); 7034 7035 post1(); 7036 addw(Ri, Ri, 1); 7037 cmpw(Ri, Rlen); 7038 br(Assembler::LT, loop); 7039 bind(end); 7040 block_comment("} // i"); 7041 } 7042 7043 block_comment("for (int i = len; i < 2*len; i++) {"); 7044 mov(Ri, Rlen); { 7045 Label loop, end; 7046 cmpw(Ri, Rlen, Assembler::LSL, 1); 7047 br(Assembler::GE, end); 7048 7049 bind(loop); 7050 pre2(Ri, Rlen); 7051 7052 block_comment(" for (j = len*2-i-1; j; j--) {"); { 7053 lslw(Rj, Rlen, 1); 7054 subw(Rj, Rj, Ri); 7055 subw(Rj, Rj, 1); 7056 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7057 } block_comment(" } // j"); 7058 7059 post2(Ri, Rlen); 7060 addw(Ri, Ri, 1); 7061 cmpw(Ri, Rlen, Assembler::LSL, 1); 7062 br(Assembler::LT, loop); 7063 bind(end); 7064 } 7065 block_comment("} // i"); 7066 7067 normalize(Rlen); 7068 7069 mov(Ra, Pm_base); // Save Pm_base in Ra 7070 restore_regs(); // Restore caller's Pm_base 7071 7072 // Copy our result into caller's Pm_base 7073 reverse(Pm_base, Ra, Rlen, t0, t1); 7074 7075 leave(); 7076 bind(nothing); 7077 ret(lr); 7078 7079 return entry; 7080 } 7081 // In C, approximately: 7082 7083 // void 7084 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 7085 // julong Pn_base[], julong Pm_base[], 7086 // julong inv, int len) { 7087 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7088 // julong *Pa, *Pb, *Pn, *Pm; 7089 // julong Ra, Rb, Rn, Rm; 7090 7091 // 
int i; 7092 7093 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7094 7095 // for (i = 0; i < len; i++) { 7096 // int j; 7097 7098 // Pa = Pa_base; 7099 // Pb = Pb_base + i; 7100 // Pm = Pm_base; 7101 // Pn = Pn_base + i; 7102 7103 // Ra = *Pa; 7104 // Rb = *Pb; 7105 // Rm = *Pm; 7106 // Rn = *Pn; 7107 7108 // int iters = i; 7109 // for (j = 0; iters--; j++) { 7110 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7111 // MACC(Ra, Rb, t0, t1, t2); 7112 // Ra = *++Pa; 7113 // Rb = *--Pb; 7114 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7115 // MACC(Rm, Rn, t0, t1, t2); 7116 // Rm = *++Pm; 7117 // Rn = *--Pn; 7118 // } 7119 7120 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 7121 // MACC(Ra, Rb, t0, t1, t2); 7122 // *Pm = Rm = t0 * inv; 7123 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7124 // MACC(Rm, Rn, t0, t1, t2); 7125 7126 // assert(t0 == 0, "broken Montgomery multiply"); 7127 7128 // t0 = t1; t1 = t2; t2 = 0; 7129 // } 7130 7131 // for (i = len; i < 2*len; i++) { 7132 // int j; 7133 7134 // Pa = Pa_base + i-len; 7135 // Pb = Pb_base + len; 7136 // Pm = Pm_base + i-len; 7137 // Pn = Pn_base + len; 7138 7139 // Ra = *++Pa; 7140 // Rb = *--Pb; 7141 // Rm = *++Pm; 7142 // Rn = *--Pn; 7143 7144 // int iters = len*2-i-1; 7145 // for (j = i-len+1; iters--; j++) { 7146 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7147 // MACC(Ra, Rb, t0, t1, t2); 7148 // Ra = *++Pa; 7149 // Rb = *--Pb; 7150 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7151 // MACC(Rm, Rn, t0, t1, t2); 7152 // Rm = *++Pm; 7153 // Rn = *--Pn; 7154 // } 7155 7156 // Pm_base[i-len] = t0; 7157 // t0 = t1; t1 = t2; t2 = 0; 7158 // } 7159 7160 // while (t0) 7161 // t0 = sub(Pm_base, Pn_base, t0, len); 7162 // } 7163 7164 /** 7165 * Fast Montgomery squaring. This uses asymptotically 25% fewer 7166 * multiplies than Montgomery multiplication so it should be up to 7167 * 25% faster. However, its loop control is more complex and it 7168 * may actually run slower on some machines. 7169 * 7170 * Arguments: 7171 * 7172 * Inputs: 7173 * c_rarg0 - int array elements a 7174 * c_rarg1 - int array elements n (the modulus) 7175 * c_rarg2 - int length 7176 * c_rarg3 - int inv 7177 * c_rarg4 - int array elements m (the result) 7178 * 7179 */ 7180 address generate_square() { 7181 Label argh; 7182 bind(argh); 7183 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7184 7185 align(CodeEntryAlignment); 7186 address entry = pc(); 7187 7188 enter(); 7189 7190 // Make room. 7191 cmpw(Rlen, 512); 7192 br(Assembler::HI, argh); 7193 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7194 andr(sp, Ra, -2 * wordSize); 7195 7196 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7197 7198 { 7199 // Copy input args, reversing as we go. We use Ra as a 7200 // temporary variable. 7201 reverse(Ra, Pa_base, Rlen, t0, t1); 7202 reverse(Ra, Pn_base, Rlen, t0, t1); 7203 } 7204 7205 // Push all call-saved registers and also Pm_base which we'll need 7206 // at the end. 
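// At this point the reverse() calls above have left Pa_base and Pn_base
// pointing at reversed copies of the inputs in the on-stack scratch
// area, and Ra at the free space that will receive the result; Pm_base
// is redirected there just after save_regs().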
7207 save_regs(); 7208 7209 mov(Pm_base, Ra); 7210 7211 mov(t0, zr); 7212 mov(t1, zr); 7213 mov(t2, zr); 7214 7215 block_comment("for (int i = 0; i < len; i++) {"); 7216 mov(Ri, zr); { 7217 Label loop, end; 7218 bind(loop); 7219 cmp(Ri, Rlen); 7220 br(Assembler::GE, end); 7221 7222 pre1(Ri); 7223 7224 block_comment("for (j = (i+1)/2; j; j--) {"); { 7225 add(Rj, Ri, 1); 7226 lsr(Rj, Rj, 1); 7227 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 7228 } block_comment(" } // j"); 7229 7230 last_squaring(Ri); 7231 7232 block_comment(" for (j = i/2; j; j--) {"); { 7233 lsr(Rj, Ri, 1); 7234 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 7235 } block_comment(" } // j"); 7236 7237 post1_squaring(); 7238 add(Ri, Ri, 1); 7239 cmp(Ri, Rlen); 7240 br(Assembler::LT, loop); 7241 7242 bind(end); 7243 block_comment("} // i"); 7244 } 7245 7246 block_comment("for (int i = len; i < 2*len; i++) {"); 7247 mov(Ri, Rlen); { 7248 Label loop, end; 7249 bind(loop); 7250 cmp(Ri, Rlen, Assembler::LSL, 1); 7251 br(Assembler::GE, end); 7252 7253 pre2(Ri, Rlen); 7254 7255 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 7256 lsl(Rj, Rlen, 1); 7257 sub(Rj, Rj, Ri); 7258 sub(Rj, Rj, 1); 7259 lsr(Rj, Rj, 1); 7260 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 7261 } block_comment(" } // j"); 7262 7263 last_squaring(Ri); 7264 7265 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 7266 lsl(Rj, Rlen, 1); 7267 sub(Rj, Rj, Ri); 7268 lsr(Rj, Rj, 1); 7269 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 7270 } block_comment(" } // j"); 7271 7272 post2(Ri, Rlen); 7273 add(Ri, Ri, 1); 7274 cmp(Ri, Rlen, Assembler::LSL, 1); 7275 7276 br(Assembler::LT, loop); 7277 bind(end); 7278 block_comment("} // i"); 7279 } 7280 7281 normalize(Rlen); 7282 7283 mov(Ra, Pm_base); // Save Pm_base in Ra 7284 restore_regs(); // Restore caller's Pm_base 7285 7286 // Copy our result into caller's Pm_base 7287 reverse(Pm_base, Ra, Rlen, t0, t1); 7288 7289 leave(); 7290 ret(lr); 7291 7292 return entry; 7293 } 7294 // In C, approximately: 7295 7296 // void 7297 // montgomery_square(julong Pa_base[], julong Pn_base[], 7298 // julong Pm_base[], julong inv, int len) { 7299 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7300 // julong *Pa, *Pb, *Pn, *Pm; 7301 // julong Ra, Rb, Rn, Rm; 7302 7303 // int i; 7304 7305 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7306 7307 // for (i = 0; i < len; i++) { 7308 // int j; 7309 7310 // Pa = Pa_base; 7311 // Pb = Pa_base + i; 7312 // Pm = Pm_base; 7313 // Pn = Pn_base + i; 7314 7315 // Ra = *Pa; 7316 // Rb = *Pb; 7317 // Rm = *Pm; 7318 // Rn = *Pn; 7319 7320 // int iters = (i+1)/2; 7321 // for (j = 0; iters--; j++) { 7322 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 7323 // MACC2(Ra, Rb, t0, t1, t2); 7324 // Ra = *++Pa; 7325 // Rb = *--Pb; 7326 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7327 // MACC(Rm, Rn, t0, t1, t2); 7328 // Rm = *++Pm; 7329 // Rn = *--Pn; 7330 // } 7331 // if ((i & 1) == 0) { 7332 // assert(Ra == Pa_base[j], "must be"); 7333 // MACC(Ra, Ra, t0, t1, t2); 7334 // } 7335 // iters = i/2; 7336 // assert(iters == i-j, "must be"); 7337 // for (; iters--; j++) { 7338 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7339 // MACC(Rm, Rn, t0, t1, t2); 7340 // Rm = *++Pm; 7341 // Rn = *--Pn; 7342 // } 7343 7344 // *Pm = Rm = t0 * inv; 7345 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7346 // MACC(Rm, Rn, t0, t1, t2); 7347 7348 // 
assert(t0 == 0, "broken Montgomery multiply");
7349 
7350 // t0 = t1; t1 = t2; t2 = 0;
7351 // }
7352 
7353 // for (i = len; i < 2*len; i++) {
7354 // int start = i-len+1;
7355 // int end = start + (len - start)/2;
7356 // int j;
7357 
7358 // Pa = Pa_base + i-len;
7359 // Pb = Pa_base + len;
7360 // Pm = Pm_base + i-len;
7361 // Pn = Pn_base + len;
7362 
7363 // Ra = *++Pa;
7364 // Rb = *--Pb;
7365 // Rm = *++Pm;
7366 // Rn = *--Pn;
7367 
7368 // int iters = (2*len-i-1)/2;
7369 // assert(iters == end-start, "must be");
7370 // for (j = start; iters--; j++) {
7371 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7372 // MACC2(Ra, Rb, t0, t1, t2);
7373 // Ra = *++Pa;
7374 // Rb = *--Pb;
7375 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7376 // MACC(Rm, Rn, t0, t1, t2);
7377 // Rm = *++Pm;
7378 // Rn = *--Pn;
7379 // }
7380 // if ((i & 1) == 0) {
7381 // assert(Ra == Pa_base[j], "must be");
7382 // MACC(Ra, Ra, t0, t1, t2);
7383 // }
7384 // iters = (2*len-i)/2;
7385 // assert(iters == len-j, "must be");
7386 // for (; iters--; j++) {
7387 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7388 // MACC(Rm, Rn, t0, t1, t2);
7389 // Rm = *++Pm;
7390 // Rn = *--Pn;
7391 // }
7392 // Pm_base[i-len] = t0;
7393 // t0 = t1; t1 = t2; t2 = 0;
7394 // }
7395 
7396 // while (t0)
7397 // t0 = sub(Pm_base, Pn_base, t0, len);
7398 // }
7399 };
7400 
7401 
7402 // Initialization
7403 void generate_initial() {
7404 // Generate initial stubs and initialize the entry points
7405 
7406 // entry points that exist in all platforms. Note: This is code
7407 // that could be shared among different platforms - however the
7408 // benefit seems to be smaller than the disadvantage of having a
7409 // much more complicated generator structure. See also comment in
7410 // stubRoutines.hpp.
7411 
7412 StubRoutines::_forward_exception_entry = generate_forward_exception();
7413 
7414 StubRoutines::_call_stub_entry =
7415 generate_call_stub(StubRoutines::_call_stub_return_address);
7416 
7417 // is referenced by megamorphic call
7418 StubRoutines::_catch_exception_entry = generate_catch_exception();
7419 
7420 // Build this early so it's available for the interpreter.
7421 StubRoutines::_throw_StackOverflowError_entry =
7422 generate_throw_exception("StackOverflowError throw_exception",
7423 CAST_FROM_FN_PTR(address,
7424 SharedRuntime::throw_StackOverflowError));
7425 StubRoutines::_throw_delayed_StackOverflowError_entry =
7426 generate_throw_exception("delayed StackOverflowError throw_exception",
7427 CAST_FROM_FN_PTR(address,
7428 SharedRuntime::throw_delayed_StackOverflowError));
7429 if (UseCRC32Intrinsics) {
7430 // set table address before stub generation which uses it
7431 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7432 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7433 }
7434 
7435 if (UseCRC32CIntrinsics) {
7436 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7437 }
7438 
7439 // Disabled until JDK-8210858 is fixed
7440 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7441 // StubRoutines::_dlog = generate_dlog();
7442 // }
7443 
7444 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7445 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7446 }
7447 
7448 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7449 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7450 }
7451 }
7452 
7453 void generate_all() {
7454 // support for verify_oop (must happen after universe_init)
7455 if (VerifyOops) {
7456 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
7457 }
7458 StubRoutines::_throw_AbstractMethodError_entry =
7459 generate_throw_exception("AbstractMethodError throw_exception",
7460 CAST_FROM_FN_PTR(address,
7461 SharedRuntime::
7462 throw_AbstractMethodError));
7463 
7464 StubRoutines::_throw_IncompatibleClassChangeError_entry =
7465 generate_throw_exception("IncompatibleClassChangeError throw_exception",
7466 CAST_FROM_FN_PTR(address,
7467 SharedRuntime::
7468 throw_IncompatibleClassChangeError));
7469 
7470 StubRoutines::_throw_NullPointerException_at_call_entry =
7471 generate_throw_exception("NullPointerException at call throw_exception",
7472 CAST_FROM_FN_PTR(address,
7473 SharedRuntime::
7474 throw_NullPointerException_at_call));
7475 
7476 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
7477 
7478 // arraycopy stubs used by compilers
7479 generate_arraycopy_stubs();
7480 
7481 // countPositives stub for large arrays.
7482 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
7483 
7484 // array equals stub for large arrays.
7485 if (!UseSimpleArrayEquals) {
7486 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
7487 }
7488 
7489 generate_compare_long_strings();
7490 
7491 generate_string_indexof_stubs();
7492 
7493 // byte_array_inflate stub for large arrays.
7494 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 7495 7496 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 7497 if (bs_nm != NULL) { 7498 StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier(); 7499 } 7500 #ifdef COMPILER2 7501 if (UseMultiplyToLenIntrinsic) { 7502 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 7503 } 7504 7505 if (UseSquareToLenIntrinsic) { 7506 StubRoutines::_squareToLen = generate_squareToLen(); 7507 } 7508 7509 if (UseMulAddIntrinsic) { 7510 StubRoutines::_mulAdd = generate_mulAdd(); 7511 } 7512 7513 if (UseSIMDForBigIntegerShiftIntrinsics) { 7514 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 7515 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 7516 } 7517 7518 if (UseMontgomeryMultiplyIntrinsic) { 7519 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 7520 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 7521 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 7522 } 7523 7524 if (UseMontgomerySquareIntrinsic) { 7525 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 7526 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 7527 // We use generate_multiply() rather than generate_square() 7528 // because it's faster for the sizes of modulus we care about. 7529 StubRoutines::_montgomerySquare = g.generate_multiply(); 7530 } 7531 #endif // COMPILER2 7532 7533 if (UseBASE64Intrinsics) { 7534 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 7535 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 7536 } 7537 7538 // data cache line writeback 7539 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 7540 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 7541 7542 if (UseAESIntrinsics) { 7543 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 7544 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 7545 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 7546 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 7547 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 7548 } 7549 if (UseGHASHIntrinsics) { 7550 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 7551 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 7552 } 7553 if (UseAESIntrinsics && UseGHASHIntrinsics) { 7554 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 7555 } 7556 7557 if (UseMD5Intrinsics) { 7558 StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress"); 7559 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB"); 7560 } 7561 if (UseSHA1Intrinsics) { 7562 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 7563 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 7564 } 7565 if (UseSHA256Intrinsics) { 7566 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 7567 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 7568 } 7569 if (UseSHA512Intrinsics) { 7570 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); 
7571 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); 7572 } 7573 if (UseSHA3Intrinsics) { 7574 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress"); 7575 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB"); 7576 } 7577 7578 // generate Adler32 intrinsics code 7579 if (UseAdler32Intrinsics) { 7580 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 7581 } 7582 7583 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 7584 7585 #ifdef LINUX 7586 7587 generate_atomic_entry_points(); 7588 7589 #endif // LINUX 7590 7591 StubRoutines::aarch64::set_completed(); 7592 } 7593 7594 public: 7595 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 7596 if (all) { 7597 generate_all(); 7598 } else { 7599 generate_initial(); 7600 } 7601 } 7602 }; // end class declaration 7603 7604 #define UCM_TABLE_MAX_ENTRIES 8 7605 void StubGenerator_generate(CodeBuffer* code, bool all) { 7606 if (UnsafeCopyMemory::_table == NULL) { 7607 UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES); 7608 } 7609 StubGenerator g(code, all); 7610 } 7611 7612 7613 #ifdef LINUX 7614 7615 // Define pointers to atomic stubs and initialize them to point to the 7616 // code in atomic_aarch64.S. 7617 7618 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 7619 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 7620 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 7621 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 7622 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 7623 7624 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 7625 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 7626 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 7627 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 7628 DEFAULT_ATOMIC_OP(xchg, 4, ) 7629 DEFAULT_ATOMIC_OP(xchg, 8, ) 7630 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 7631 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 7632 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 7633 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 7634 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 7635 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 7636 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 7637 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 7638 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 7639 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 7640 7641 #undef DEFAULT_ATOMIC_OP 7642 7643 #endif // LINUX --- EOF ---