1 /* 2 * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "compiler/oopMap.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "gc/shared/gc_globals.hpp" 34 #include "gc/shared/tlab_globals.hpp" 35 #include "interpreter/interpreter.hpp" 36 #include "memory/universe.hpp" 37 #include "nativeInst_aarch64.hpp" 38 #include "oops/instanceOop.hpp" 39 #include "oops/method.hpp" 40 #include "oops/objArrayKlass.hpp" 41 #include "oops/oop.inline.hpp" 42 #include "prims/methodHandles.hpp" 43 #include "runtime/atomic.hpp" 44 #include "runtime/frame.inline.hpp" 45 #include "runtime/handles.inline.hpp" 46 #include "runtime/sharedRuntime.hpp" 47 #include "runtime/stubCodeGenerator.hpp" 48 #include "runtime/stubRoutines.hpp" 49 #include "runtime/thread.inline.hpp" 50 #include "utilities/align.hpp" 51 #include "utilities/powerOfTwo.hpp" 52 #ifdef COMPILER2 53 #include "opto/runtime.hpp" 54 #endif 55 #if INCLUDE_ZGC 56 #include "gc/z/zThreadLocalData.hpp" 57 #endif 58 59 // Declaration and definition of StubGenerator (no .hpp file). 60 // For a more detailed description of the stub routine structure 61 // see the comment in stubRoutines.hpp 62 63 #undef __ 64 #define __ _masm-> 65 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 
4 : 8)) 66 67 #ifdef PRODUCT 68 #define BLOCK_COMMENT(str) /* nothing */ 69 #else 70 #define BLOCK_COMMENT(str) __ block_comment(str) 71 #endif 72 73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 74 75 // Stub Code definitions 76 77 class StubGenerator: public StubCodeGenerator { 78 private: 79 80 #ifdef PRODUCT 81 #define inc_counter_np(counter) ((void)0) 82 #else 83 void inc_counter_np_(int& counter) { 84 __ lea(rscratch2, ExternalAddress((address)&counter)); 85 __ ldrw(rscratch1, Address(rscratch2)); 86 __ addw(rscratch1, rscratch1, 1); 87 __ strw(rscratch1, Address(rscratch2)); 88 } 89 #define inc_counter_np(counter) \ 90 BLOCK_COMMENT("inc_counter " #counter); \ 91 inc_counter_np_(counter); 92 #endif 93 94 // Call stubs are used to call Java from C 95 // 96 // Arguments: 97 // c_rarg0: call wrapper address address 98 // c_rarg1: result address 99 // c_rarg2: result type BasicType 100 // c_rarg3: method Method* 101 // c_rarg4: (interpreter) entry point address 102 // c_rarg5: parameters intptr_t* 103 // c_rarg6: parameter size (in words) int 104 // c_rarg7: thread Thread* 105 // 106 // There is no return from the stub itself as any Java result 107 // is written to result 108 // 109 // we save r30 (lr) as the return PC at the base of the frame and 110 // link r29 (fp) below it as the frame pointer installing sp (r31) 111 // into fp. 112 // 113 // we save r0-r7, which accounts for all the c arguments. 114 // 115 // TODO: strictly do we need to save them all? they are treated as 116 // volatile by C so could we omit saving the ones we are going to 117 // place in global registers (thread? method?) or those we only use 118 // during setup of the Java call? 119 // 120 // we don't need to save r8 which C uses as an indirect result location 121 // return register. 122 // 123 // we don't need to save r9-r15 which both C and Java treat as 124 // volatile 125 // 126 // we don't need to save r16-18 because Java does not use them 127 // 128 // we save r19-r28 which Java uses as scratch registers and C 129 // expects to be callee-save 130 // 131 // we save the bottom 64 bits of each value stored in v8-v15; it is 132 // the responsibility of the caller to preserve larger values. 133 // 134 // so the stub frame looks like this when we enter Java code 135 // 136 // [ return_from_Java ] <--- sp 137 // [ argument word n ] 138 // ... 
139 // -27 [ argument word 1 ] 140 // -26 [ saved v15 ] <--- sp_after_call 141 // -25 [ saved v14 ] 142 // -24 [ saved v13 ] 143 // -23 [ saved v12 ] 144 // -22 [ saved v11 ] 145 // -21 [ saved v10 ] 146 // -20 [ saved v9 ] 147 // -19 [ saved v8 ] 148 // -18 [ saved r28 ] 149 // -17 [ saved r27 ] 150 // -16 [ saved r26 ] 151 // -15 [ saved r25 ] 152 // -14 [ saved r24 ] 153 // -13 [ saved r23 ] 154 // -12 [ saved r22 ] 155 // -11 [ saved r21 ] 156 // -10 [ saved r20 ] 157 // -9 [ saved r19 ] 158 // -8 [ call wrapper (r0) ] 159 // -7 [ result (r1) ] 160 // -6 [ result type (r2) ] 161 // -5 [ method (r3) ] 162 // -4 [ entry point (r4) ] 163 // -3 [ parameters (r5) ] 164 // -2 [ parameter size (r6) ] 165 // -1 [ thread (r7) ] 166 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 167 // 1 [ saved lr (r30) ] 168 169 // Call stub stack layout word offsets from fp 170 enum call_stub_layout { 171 sp_after_call_off = -26, 172 173 d15_off = -26, 174 d13_off = -24, 175 d11_off = -22, 176 d9_off = -20, 177 178 r28_off = -18, 179 r26_off = -16, 180 r24_off = -14, 181 r22_off = -12, 182 r20_off = -10, 183 call_wrapper_off = -8, 184 result_off = -7, 185 result_type_off = -6, 186 method_off = -5, 187 entry_point_off = -4, 188 parameter_size_off = -2, 189 thread_off = -1, 190 fp_f = 0, 191 retaddr_off = 1, 192 }; 193 194 address generate_call_stub(address& return_address) { 195 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 196 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 197 "adjust this code"); 198 199 StubCodeMark mark(this, "StubRoutines", "call_stub"); 200 address start = __ pc(); 201 202 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 203 204 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 205 const Address result (rfp, result_off * wordSize); 206 const Address result_type (rfp, result_type_off * wordSize); 207 const Address method (rfp, method_off * wordSize); 208 const Address entry_point (rfp, entry_point_off * wordSize); 209 const Address parameter_size(rfp, parameter_size_off * wordSize); 210 211 const Address thread (rfp, thread_off * wordSize); 212 213 const Address d15_save (rfp, d15_off * wordSize); 214 const Address d13_save (rfp, d13_off * wordSize); 215 const Address d11_save (rfp, d11_off * wordSize); 216 const Address d9_save (rfp, d9_off * wordSize); 217 218 const Address r28_save (rfp, r28_off * wordSize); 219 const Address r26_save (rfp, r26_off * wordSize); 220 const Address r24_save (rfp, r24_off * wordSize); 221 const Address r22_save (rfp, r22_off * wordSize); 222 const Address r20_save (rfp, r20_off * wordSize); 223 224 // stub code 225 226 address aarch64_entry = __ pc(); 227 228 // set up frame and move sp to end of save area 229 __ enter(); 230 __ sub(sp, rfp, -sp_after_call_off * wordSize); 231 232 // save register parameters and Java scratch/global registers 233 // n.b. 
we save thread even though it gets installed in 234 // rthread because we want to sanity check rthread later 235 __ str(c_rarg7, thread); 236 __ strw(c_rarg6, parameter_size); 237 __ stp(c_rarg4, c_rarg5, entry_point); 238 __ stp(c_rarg2, c_rarg3, result_type); 239 __ stp(c_rarg0, c_rarg1, call_wrapper); 240 241 __ stp(r20, r19, r20_save); 242 __ stp(r22, r21, r22_save); 243 __ stp(r24, r23, r24_save); 244 __ stp(r26, r25, r26_save); 245 __ stp(r28, r27, r28_save); 246 247 __ stpd(v9, v8, d9_save); 248 __ stpd(v11, v10, d11_save); 249 __ stpd(v13, v12, d13_save); 250 __ stpd(v15, v14, d15_save); 251 252 // install Java thread in global register now we have saved 253 // whatever value it held 254 __ mov(rthread, c_rarg7); 255 // And method 256 __ mov(rmethod, c_rarg3); 257 258 // set up the heapbase register 259 __ reinit_heapbase(); 260 261 #ifdef ASSERT 262 // make sure we have no pending exceptions 263 { 264 Label L; 265 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 266 __ cmp(rscratch1, (u1)NULL_WORD); 267 __ br(Assembler::EQ, L); 268 __ stop("StubRoutines::call_stub: entered with pending exception"); 269 __ BIND(L); 270 } 271 #endif 272 // pass parameters if any 273 __ mov(esp, sp); 274 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 275 __ andr(sp, rscratch1, -2 * wordSize); 276 277 BLOCK_COMMENT("pass parameters if any"); 278 Label parameters_done; 279 // parameter count is still in c_rarg6 280 // and parameter pointer identifying param 1 is in c_rarg5 281 __ cbzw(c_rarg6, parameters_done); 282 283 address loop = __ pc(); 284 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 285 __ subsw(c_rarg6, c_rarg6, 1); 286 __ push(rscratch1); 287 __ br(Assembler::GT, loop); 288 289 __ BIND(parameters_done); 290 291 // call Java entry -- passing methdoOop, and current sp 292 // rmethod: Method* 293 // r13: sender sp 294 BLOCK_COMMENT("call Java function"); 295 __ mov(r13, sp); 296 __ blr(c_rarg4); 297 298 // we do this here because the notify will already have been done 299 // if we get to the next instruction via an exception 300 // 301 // n.b. adding this instruction here affects the calculation of 302 // whether or not a routine returns to the call stub (used when 303 // doing stack walks) since the normal test is to check the return 304 // pc against the address saved below. so we may need to allow for 305 // this extra instruction in the check. 306 307 // save current address for use by exception handling code 308 309 return_address = __ pc(); 310 311 // store result depending on type (everything that is not 312 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 313 // n.b. 
this assumes Java returns an integral result in r0 314 // and a floating result in j_farg0 315 __ ldr(j_rarg2, result); 316 Label is_long, is_float, is_double, exit; 317 __ ldr(j_rarg1, result_type); 318 __ cmp(j_rarg1, (u1)T_OBJECT); 319 __ br(Assembler::EQ, is_long); 320 __ cmp(j_rarg1, (u1)T_LONG); 321 __ br(Assembler::EQ, is_long); 322 __ cmp(j_rarg1, (u1)T_FLOAT); 323 __ br(Assembler::EQ, is_float); 324 __ cmp(j_rarg1, (u1)T_DOUBLE); 325 __ br(Assembler::EQ, is_double); 326 327 // handle T_INT case 328 __ strw(r0, Address(j_rarg2)); 329 330 __ BIND(exit); 331 332 // pop parameters 333 __ sub(esp, rfp, -sp_after_call_off * wordSize); 334 335 #ifdef ASSERT 336 // verify that threads correspond 337 { 338 Label L, S; 339 __ ldr(rscratch1, thread); 340 __ cmp(rthread, rscratch1); 341 __ br(Assembler::NE, S); 342 __ get_thread(rscratch1); 343 __ cmp(rthread, rscratch1); 344 __ br(Assembler::EQ, L); 345 __ BIND(S); 346 __ stop("StubRoutines::call_stub: threads must correspond"); 347 __ BIND(L); 348 } 349 #endif 350 351 // restore callee-save registers 352 __ ldpd(v15, v14, d15_save); 353 __ ldpd(v13, v12, d13_save); 354 __ ldpd(v11, v10, d11_save); 355 __ ldpd(v9, v8, d9_save); 356 357 __ ldp(r28, r27, r28_save); 358 __ ldp(r26, r25, r26_save); 359 __ ldp(r24, r23, r24_save); 360 __ ldp(r22, r21, r22_save); 361 __ ldp(r20, r19, r20_save); 362 363 __ ldp(c_rarg0, c_rarg1, call_wrapper); 364 __ ldrw(c_rarg2, result_type); 365 __ ldr(c_rarg3, method); 366 __ ldp(c_rarg4, c_rarg5, entry_point); 367 __ ldp(c_rarg6, c_rarg7, parameter_size); 368 369 // leave frame and return to caller 370 __ leave(); 371 __ ret(lr); 372 373 // handle return types different from T_INT 374 375 __ BIND(is_long); 376 __ str(r0, Address(j_rarg2, 0)); 377 __ br(Assembler::AL, exit); 378 379 __ BIND(is_float); 380 __ strs(j_farg0, Address(j_rarg2, 0)); 381 __ br(Assembler::AL, exit); 382 383 __ BIND(is_double); 384 __ strd(j_farg0, Address(j_rarg2, 0)); 385 __ br(Assembler::AL, exit); 386 387 return start; 388 } 389 390 // Return point for a Java call if there's an exception thrown in 391 // Java code. The exception is caught and transformed into a 392 // pending exception stored in JavaThread that can be tested from 393 // within the VM. 394 // 395 // Note: Usually the parameters are removed by the callee. In case 396 // of an exception crossing an activation frame boundary, that is 397 // not the case if the callee is compiled code => need to setup the 398 // rsp. 
399 // 400 // r0: exception oop 401 402 address generate_catch_exception() { 403 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 404 address start = __ pc(); 405 406 // same as in generate_call_stub(): 407 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 408 const Address thread (rfp, thread_off * wordSize); 409 410 #ifdef ASSERT 411 // verify that threads correspond 412 { 413 Label L, S; 414 __ ldr(rscratch1, thread); 415 __ cmp(rthread, rscratch1); 416 __ br(Assembler::NE, S); 417 __ get_thread(rscratch1); 418 __ cmp(rthread, rscratch1); 419 __ br(Assembler::EQ, L); 420 __ bind(S); 421 __ stop("StubRoutines::catch_exception: threads must correspond"); 422 __ bind(L); 423 } 424 #endif 425 426 // set pending exception 427 __ verify_oop(r0); 428 429 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 430 __ mov(rscratch1, (address)__FILE__); 431 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 432 __ movw(rscratch1, (int)__LINE__); 433 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 434 435 // complete return to VM 436 assert(StubRoutines::_call_stub_return_address != NULL, 437 "_call_stub_return_address must have been generated before"); 438 __ b(StubRoutines::_call_stub_return_address); 439 440 return start; 441 } 442 443 // Continuation point for runtime calls returning with a pending 444 // exception. The pending exception check happened in the runtime 445 // or native call stub. The pending exception in Thread is 446 // converted into a Java-level exception. 447 // 448 // Contract with Java-level exception handlers: 449 // r0: exception 450 // r3: throwing pc 451 // 452 // NOTE: At entry of this stub, exception-pc must be in LR !! 453 454 // NOTE: this is always used as a jump target within generated code 455 // so it just needs to be generated code wiht no x86 prolog 456 457 address generate_forward_exception() { 458 StubCodeMark mark(this, "StubRoutines", "forward exception"); 459 address start = __ pc(); 460 461 // Upon entry, LR points to the return address returning into 462 // Java (interpreted or compiled) code; i.e., the return address 463 // becomes the throwing pc. 464 // 465 // Arguments pushed before the runtime call are still on the stack 466 // but the exception handler will reset the stack pointer -> 467 // ignore them. A potential result in registers can be ignored as 468 // well. 469 470 #ifdef ASSERT 471 // make sure this code is only executed if there is a pending exception 472 { 473 Label L; 474 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 475 __ cbnz(rscratch1, L); 476 __ stop("StubRoutines::forward exception: no pending exception (1)"); 477 __ bind(L); 478 } 479 #endif 480 481 // compute exception handler into r19 482 483 // call the VM to find the handler address associated with the 484 // caller address. pass thread in r0 and caller pc (ret address) 485 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 486 // the stack. 487 __ mov(c_rarg1, lr); 488 // lr will be trashed by the VM call so we move it to R19 489 // (callee-saved) because we also need to pass it to the handler 490 // returned by this call. 
491 __ mov(r19, lr); 492 BLOCK_COMMENT("call exception_handler_for_return_address"); 493 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 494 SharedRuntime::exception_handler_for_return_address), 495 rthread, c_rarg1); 496 // Reinitialize the ptrue predicate register, in case the external runtime 497 // call clobbers ptrue reg, as we may return to SVE compiled code. 498 __ reinitialize_ptrue(); 499 500 // we should not really care that lr is no longer the callee 501 // address. we saved the value the handler needs in r19 so we can 502 // just copy it to r3. however, the C2 handler will push its own 503 // frame and then calls into the VM and the VM code asserts that 504 // the PC for the frame above the handler belongs to a compiled 505 // Java method. So, we restore lr here to satisfy that assert. 506 __ mov(lr, r19); 507 // setup r0 & r3 & clear pending exception 508 __ mov(r3, r19); 509 __ mov(r19, r0); 510 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 511 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 512 513 #ifdef ASSERT 514 // make sure exception is set 515 { 516 Label L; 517 __ cbnz(r0, L); 518 __ stop("StubRoutines::forward exception: no pending exception (2)"); 519 __ bind(L); 520 } 521 #endif 522 523 // continue at exception handler 524 // r0: exception 525 // r3: throwing pc 526 // r19: exception handler 527 __ verify_oop(r0); 528 __ br(r19); 529 530 return start; 531 } 532 533 // Non-destructive plausibility checks for oops 534 // 535 // Arguments: 536 // r0: oop to verify 537 // rscratch1: error message 538 // 539 // Stack after saving c_rarg3: 540 // [tos + 0]: saved c_rarg3 541 // [tos + 1]: saved c_rarg2 542 // [tos + 2]: saved lr 543 // [tos + 3]: saved rscratch2 544 // [tos + 4]: saved r0 545 // [tos + 5]: saved rscratch1 546 address generate_verify_oop() { 547 548 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 549 address start = __ pc(); 550 551 Label exit, error; 552 553 // save c_rarg2 and c_rarg3 554 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 555 556 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 557 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 558 __ ldr(c_rarg3, Address(c_rarg2)); 559 __ add(c_rarg3, c_rarg3, 1); 560 __ str(c_rarg3, Address(c_rarg2)); 561 562 // object is in r0 563 // make sure object is 'reasonable' 564 __ cbz(r0, exit); // if obj is NULL it is OK 565 566 #if INCLUDE_ZGC 567 if (UseZGC) { 568 // Check if mask is good. 569 // verifies that ZAddressBadMask & r0 == 0 570 __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset())); 571 __ andr(c_rarg2, r0, c_rarg3); 572 __ cbnz(c_rarg2, error); 573 } 574 #endif 575 576 // Check if the oop is in the right area of memory 577 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask()); 578 __ andr(c_rarg2, r0, c_rarg3); 579 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits()); 580 581 // Compare c_rarg2 and c_rarg3. We don't use a compare 582 // instruction here because the flags register is live. 583 __ eor(c_rarg2, c_rarg2, c_rarg3); 584 __ cbnz(c_rarg2, error); 585 586 // make sure klass is 'reasonable', which is not zero. 
587 __ load_klass(r0, r0); // get klass 588 __ cbz(r0, error); // if klass is NULL it is broken 589 590 // return if everything seems ok 591 __ bind(exit); 592 593 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 594 __ ret(lr); 595 596 // handle errors 597 __ bind(error); 598 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 599 600 __ push(RegSet::range(r0, r29), sp); 601 // debug(char* msg, int64_t pc, int64_t regs[]) 602 __ mov(c_rarg0, rscratch1); // pass address of error message 603 __ mov(c_rarg1, lr); // pass return address 604 __ mov(c_rarg2, sp); // pass address of regs on stack 605 #ifndef PRODUCT 606 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 607 #endif 608 BLOCK_COMMENT("call MacroAssembler::debug"); 609 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 610 __ blr(rscratch1); 611 __ hlt(0); 612 613 return start; 614 } 615 616 void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); } 617 618 // Generate indices for iota vector. 619 address generate_iota_indices(const char *stub_name) { 620 __ align(CodeEntryAlignment); 621 StubCodeMark mark(this, "StubRoutines", stub_name); 622 address start = __ pc(); 623 __ emit_data64(0x0706050403020100, relocInfo::none); 624 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 625 return start; 626 } 627 628 // The inner part of zero_words(). This is the bulk operation, 629 // zeroing words in blocks, possibly using DC ZVA to do it. The 630 // caller is responsible for zeroing the last few words. 631 // 632 // Inputs: 633 // r10: the HeapWord-aligned base address of an array to zero. 634 // r11: the count in HeapWords, r11 > 0. 635 // 636 // Returns r10 and r11, adjusted for the caller to clear. 637 // r10: the base address of the tail of words left to clear. 638 // r11: the number of words in the tail. 639 // r11 < MacroAssembler::zero_words_block_size. 640 641 address generate_zero_blocks() { 642 Label done; 643 Label base_aligned; 644 645 Register base = r10, cnt = r11; 646 647 __ align(CodeEntryAlignment); 648 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 649 address start = __ pc(); 650 651 if (UseBlockZeroing) { 652 int zva_length = VM_Version::zva_length(); 653 654 // Ensure ZVA length can be divided by 16. This is required by 655 // the subsequent operations. 656 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 657 658 __ tbz(base, 3, base_aligned); 659 __ str(zr, Address(__ post(base, 8))); 660 __ sub(cnt, cnt, 1); 661 __ bind(base_aligned); 662 663 // Ensure count >= zva_length * 2 so that it still deserves a zva after 664 // alignment. 665 Label small; 666 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 667 __ subs(rscratch1, cnt, low_limit >> 3); 668 __ br(Assembler::LT, small); 669 __ zero_dcache_blocks(base, cnt); 670 __ bind(small); 671 } 672 673 { 674 // Number of stp instructions we'll unroll 675 const int unroll = 676 MacroAssembler::zero_words_block_size / 2; 677 // Clear the remaining blocks. 678 Label loop; 679 __ subs(cnt, cnt, unroll * 2); 680 __ br(Assembler::LT, done); 681 __ bind(loop); 682 for (int i = 0; i < unroll; i++) 683 __ stp(zr, zr, __ post(base, 16)); 684 __ subs(cnt, cnt, unroll * 2); 685 __ br(Assembler::GE, loop); 686 __ bind(done); 687 __ add(cnt, cnt, unroll * 2); 688 } 689 690 __ ret(lr); 691 692 return start; 693 } 694 695 696 typedef enum { 697 copy_forwards = 1, 698 copy_backwards = -1 699 } copy_direction; 700 701 // Bulk copy of blocks of 8 words. 
702 // 703 // count is a count of words. 704 // 705 // Precondition: count >= 8 706 // 707 // Postconditions: 708 // 709 // The least significant bit of count contains the remaining count 710 // of words to copy. The rest of count is trash. 711 // 712 // s and d are adjusted to point to the remaining words to copy 713 // 714 void generate_copy_longs(Label &start, Register s, Register d, Register count, 715 copy_direction direction) { 716 int unit = wordSize * direction; 717 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize; 718 719 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 720 t4 = r7, t5 = r10, t6 = r11, t7 = r12; 721 const Register stride = r13; 722 723 assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7); 724 assert_different_registers(s, d, count, rscratch1); 725 726 Label again, drain; 727 const char *stub_name; 728 if (direction == copy_forwards) 729 stub_name = "forward_copy_longs"; 730 else 731 stub_name = "backward_copy_longs"; 732 733 __ align(CodeEntryAlignment); 734 735 StubCodeMark mark(this, "StubRoutines", stub_name); 736 737 __ bind(start); 738 739 Label unaligned_copy_long; 740 if (AvoidUnalignedAccesses) { 741 __ tbnz(d, 3, unaligned_copy_long); 742 } 743 744 if (direction == copy_forwards) { 745 __ sub(s, s, bias); 746 __ sub(d, d, bias); 747 } 748 749 #ifdef ASSERT 750 // Make sure we are never given < 8 words 751 { 752 Label L; 753 __ cmp(count, (u1)8); 754 __ br(Assembler::GE, L); 755 __ stop("genrate_copy_longs called with < 8 words"); 756 __ bind(L); 757 } 758 #endif 759 760 // Fill 8 registers 761 if (UseSIMDForMemoryOps) { 762 __ ldpq(v0, v1, Address(s, 4 * unit)); 763 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 764 } else { 765 __ ldp(t0, t1, Address(s, 2 * unit)); 766 __ ldp(t2, t3, Address(s, 4 * unit)); 767 __ ldp(t4, t5, Address(s, 6 * unit)); 768 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 769 } 770 771 __ subs(count, count, 16); 772 __ br(Assembler::LO, drain); 773 774 int prefetch = PrefetchCopyIntervalInBytes; 775 bool use_stride = false; 776 if (direction == copy_backwards) { 777 use_stride = prefetch > 256; 778 prefetch = -prefetch; 779 if (use_stride) __ mov(stride, prefetch); 780 } 781 782 __ bind(again); 783 784 if (PrefetchCopyIntervalInBytes > 0) 785 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0, t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8, t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do. It does a lot of jumping, resulting in several
    // mispredicted branches. It might make more sense to do this
    // with something like Duff's device with a single computed branch.
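    //
    // For reference, an illustrative C-level sketch of the tail copy
    // emitted below (not the generated code itself; forward direction
    // shown, count expressed in units of 'granularity', so bit
    // (3 - log2(granularity)) of count selects an 8-byte chunk):
    //
    //   if (count & (8 / granularity)) { copy 8 bytes; }  // always tested
    //   if (count & (4 / granularity)) { copy 4 bytes; }  // granularity <= 4
    //   if (count & (2 / granularity)) { copy 2 bytes; }  // granularity <= 2
    //   if (count & 1)                 { copy 1 byte;  }  // granularity == 1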
1060 1061 __ tbz(count, 3 - exact_log2(granularity), Lword); 1062 __ ldr(tmp, Address(__ adjust(s, unit, is_backwards))); 1063 __ str(tmp, Address(__ adjust(d, unit, is_backwards))); 1064 __ bind(Lword); 1065 1066 if (granularity <= sizeof (jint)) { 1067 __ tbz(count, 2 - exact_log2(granularity), Lint); 1068 __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1069 __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1070 __ bind(Lint); 1071 } 1072 1073 if (granularity <= sizeof (jshort)) { 1074 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1075 __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1076 __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1077 __ bind(Lshort); 1078 } 1079 1080 if (granularity <= sizeof (jbyte)) { 1081 __ tbz(count, 0, Lbyte); 1082 __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1083 __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1084 __ bind(Lbyte); 1085 } 1086 } 1087 1088 Label copy_f, copy_b; 1089 1090 // All-singing all-dancing memory copy. 1091 // 1092 // Copy count units of memory from s to d. The size of a unit is 1093 // step, which can be positive or negative depending on the direction 1094 // of copy. If is_aligned is false, we align the source address. 1095 // 1096 1097 void copy_memory(bool is_aligned, Register s, Register d, 1098 Register count, Register tmp, int step) { 1099 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1100 bool is_backwards = step < 0; 1101 unsigned int granularity = uabs(step); 1102 const Register t0 = r3, t1 = r4; 1103 1104 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1105 // load all the data before writing anything 1106 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1107 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8; 1108 const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; 1109 const Register send = r17, dend = r16; 1110 1111 if (PrefetchCopyIntervalInBytes > 0) 1112 __ prfm(Address(s, 0), PLDL1KEEP); 1113 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at
      // least 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The biggest performance drop has been seen for the range 65..80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
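      //
      // Illustrative sketch (not the generated code; load_16/store_16/
      // store_32 are pseudo-ops here) of the special case emitted just
      // below for granularity < sizeof(jint): while the length is still
      // in the 65..80 byte range the tail is finished with a single
      // 16-byte ldp/stp anchored at the end of the region,
      //
      //   t0:t1 = load_16(send - 16);
      //   store_32(d, v0:v1); store_32(d + 32, v2:v3);
      //   store_16(dend - 16, t0:t1);
      //
      // otherwise (81..96 bytes, SIMD only) a third ldpq/stpq pair copies
      // the last 32 bytes instead.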
1168 if (granularity < sizeof (jint)) { 1169 Label copy96; 1170 __ cmp(count, u1(80/granularity)); 1171 __ br(Assembler::HI, copy96); 1172 __ ldp(t0, t1, Address(send, -16)); 1173 1174 __ stpq(v0, v1, Address(d, 0)); 1175 __ stpq(v2, v3, Address(d, 32)); 1176 __ stp(t0, t1, Address(dend, -16)); 1177 __ b(finish); 1178 1179 __ bind(copy96); 1180 } 1181 __ ldpq(v4, v5, Address(send, -32)); 1182 1183 __ stpq(v0, v1, Address(d, 0)); 1184 __ stpq(v2, v3, Address(d, 32)); 1185 __ stpq(v4, v5, Address(dend, -32)); 1186 } else { 1187 __ ldp(t0, t1, Address(s, 0)); 1188 __ ldp(t2, t3, Address(s, 16)); 1189 __ ldp(t4, t5, Address(s, 32)); 1190 __ ldp(t6, t7, Address(s, 48)); 1191 __ ldp(t8, t9, Address(send, -16)); 1192 1193 __ stp(t0, t1, Address(d, 0)); 1194 __ stp(t2, t3, Address(d, 16)); 1195 __ stp(t4, t5, Address(d, 32)); 1196 __ stp(t6, t7, Address(d, 48)); 1197 __ stp(t8, t9, Address(dend, -16)); 1198 } 1199 __ b(finish); 1200 1201 // 0..16 bytes 1202 __ bind(copy16); 1203 __ cmp(count, u1(8/granularity)); 1204 __ br(Assembler::LO, copy8); 1205 1206 // 8..16 bytes 1207 __ ldr(t0, Address(s, 0)); 1208 __ ldr(t1, Address(send, -8)); 1209 __ str(t0, Address(d, 0)); 1210 __ str(t1, Address(dend, -8)); 1211 __ b(finish); 1212 1213 if (granularity < 8) { 1214 // 4..7 bytes 1215 __ bind(copy8); 1216 __ tbz(count, 2 - exact_log2(granularity), copy4); 1217 __ ldrw(t0, Address(s, 0)); 1218 __ ldrw(t1, Address(send, -4)); 1219 __ strw(t0, Address(d, 0)); 1220 __ strw(t1, Address(dend, -4)); 1221 __ b(finish); 1222 if (granularity < 4) { 1223 // 0..3 bytes 1224 __ bind(copy4); 1225 __ cbz(count, finish); // get rid of 0 case 1226 if (granularity == 2) { 1227 __ ldrh(t0, Address(s, 0)); 1228 __ strh(t0, Address(d, 0)); 1229 } else { // granularity == 1 1230 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1231 // the first and last byte. 1232 // Handle the 3 byte case by loading and storing base + count/2 1233 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1234 // This does means in the 1 byte case we load/store the same 1235 // byte 3 times. 1236 __ lsr(count, count, 1); 1237 __ ldrb(t0, Address(s, 0)); 1238 __ ldrb(t1, Address(send, -1)); 1239 __ ldrb(t2, Address(s, count)); 1240 __ strb(t0, Address(d, 0)); 1241 __ strb(t1, Address(dend, -1)); 1242 __ strb(t2, Address(d, count)); 1243 } 1244 __ b(finish); 1245 } 1246 } 1247 1248 __ bind(copy_big); 1249 if (is_backwards) { 1250 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1251 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1252 } 1253 1254 // Now we've got the small case out of the way we can align the 1255 // source address on a 2-word boundary. 1256 1257 Label aligned; 1258 1259 if (is_aligned) { 1260 // We may have to adjust by 1 word to get s 2-word-aligned. 1261 __ tbz(s, exact_log2(wordSize), aligned); 1262 __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards))); 1263 __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards))); 1264 __ sub(count, count, wordSize/granularity); 1265 } else { 1266 if (is_backwards) { 1267 __ andr(rscratch2, s, 2 * wordSize - 1); 1268 } else { 1269 __ neg(rscratch2, s); 1270 __ andr(rscratch2, rscratch2, 2 * wordSize - 1); 1271 } 1272 // rscratch2 is the byte adjustment needed to align s. 1273 __ cbz(rscratch2, aligned); 1274 int shift = exact_log2(granularity); 1275 if (shift) __ lsr(rscratch2, rscratch2, shift); 1276 __ sub(count, count, rscratch2); 1277 1278 #if 0 1279 // ?? This code is only correct for a disjoint copy. 
It may or 1280 // may not make sense to use it in that case. 1281 1282 // Copy the first pair; s and d may not be aligned. 1283 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1284 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1285 1286 // Align s and d, adjust count 1287 if (is_backwards) { 1288 __ sub(s, s, rscratch2); 1289 __ sub(d, d, rscratch2); 1290 } else { 1291 __ add(s, s, rscratch2); 1292 __ add(d, d, rscratch2); 1293 } 1294 #else 1295 copy_memory_small(s, d, rscratch2, rscratch1, step); 1296 #endif 1297 } 1298 1299 __ bind(aligned); 1300 1301 // s is now 2-word-aligned. 1302 1303 // We have a count of units and some trailing bytes. Adjust the 1304 // count and do a bulk copy of words. 1305 __ lsr(rscratch2, count, exact_log2(wordSize/granularity)); 1306 if (direction == copy_forwards) 1307 __ bl(copy_f); 1308 else 1309 __ bl(copy_b); 1310 1311 // And the tail. 1312 copy_memory_small(s, d, count, tmp, step); 1313 1314 if (granularity >= 8) __ bind(copy8); 1315 if (granularity >= 4) __ bind(copy4); 1316 __ bind(finish); 1317 } 1318 1319 1320 void clobber_registers() { 1321 #ifdef ASSERT 1322 RegSet clobbered 1323 = MacroAssembler::call_clobbered_registers() - rscratch1; 1324 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1325 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1326 for (RegSetIterator<> it = clobbered.begin(); *it != noreg; ++it) { 1327 __ mov(*it, rscratch1); 1328 } 1329 #endif 1330 1331 } 1332 1333 // Scan over array at a for count oops, verifying each one. 1334 // Preserves a and count, clobbers rscratch1 and rscratch2. 1335 void verify_oop_array (int size, Register a, Register count, Register temp) { 1336 Label loop, end; 1337 __ mov(rscratch1, a); 1338 __ mov(rscratch2, zr); 1339 __ bind(loop); 1340 __ cmp(rscratch2, count); 1341 __ br(Assembler::HS, end); 1342 if (size == wordSize) { 1343 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1344 __ verify_oop(temp); 1345 } else { 1346 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1347 __ decode_heap_oop(temp); // calls verify_oop 1348 } 1349 __ add(rscratch2, rscratch2, 1); 1350 __ b(loop); 1351 __ bind(end); 1352 } 1353 1354 // Arguments: 1355 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1356 // ignored 1357 // is_oop - true => oop array, so generate store check code 1358 // name - stub name string 1359 // 1360 // Inputs: 1361 // c_rarg0 - source array address 1362 // c_rarg1 - destination array address 1363 // c_rarg2 - element count, treated as ssize_t, can be zero 1364 // 1365 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1366 // the hardware handle it. The two dwords within qwords that span 1367 // cache line boundaries will still be loaded and stored atomically. 1368 // 1369 // Side Effects: 1370 // disjoint_int_copy_entry is set to the no-overlap entry point 1371 // used by generate_conjoint_int_oop_copy(). 
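  //
  // The generated stub has roughly this shape (illustrative only; the
  // barrier work depends on the GC selected at runtime):
  //
  //   enter();
  //   bs->arraycopy_prologue(...);          // GC pre-work, may save registers
  //   copy_memory(aligned, s, d, count, rscratch1, size);
  //   if (is_oop && VerifyOops) verify_oop_array(...);
  //   bs->arraycopy_epilogue(...);          // GC post-work
  //   leave();
  //   return 0;                             // in r0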
1372 // 1373 address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry, 1374 const char *name, bool dest_uninitialized = false) { 1375 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1376 RegSet saved_reg = RegSet::of(s, d, count); 1377 __ align(CodeEntryAlignment); 1378 StubCodeMark mark(this, "StubRoutines", name); 1379 address start = __ pc(); 1380 __ enter(); 1381 1382 if (entry != NULL) { 1383 *entry = __ pc(); 1384 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1385 BLOCK_COMMENT("Entry:"); 1386 } 1387 1388 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1389 if (dest_uninitialized) { 1390 decorators |= IS_DEST_UNINITIALIZED; 1391 } 1392 if (aligned) { 1393 decorators |= ARRAYCOPY_ALIGNED; 1394 } 1395 1396 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1397 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1398 1399 if (is_oop) { 1400 // save regs before copy_memory 1401 __ push(RegSet::of(d, count), sp); 1402 } 1403 { 1404 // UnsafeCopyMemory page error: continue after ucm 1405 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1406 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1407 copy_memory(aligned, s, d, count, rscratch1, size); 1408 } 1409 1410 if (is_oop) { 1411 __ pop(RegSet::of(d, count), sp); 1412 if (VerifyOops) 1413 verify_oop_array(size, d, count, r16); 1414 } 1415 1416 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1417 1418 __ leave(); 1419 __ mov(r0, zr); // return 0 1420 __ ret(lr); 1421 return start; 1422 } 1423 1424 // Arguments: 1425 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1426 // ignored 1427 // is_oop - true => oop array, so generate store check code 1428 // name - stub name string 1429 // 1430 // Inputs: 1431 // c_rarg0 - source array address 1432 // c_rarg1 - destination array address 1433 // c_rarg2 - element count, treated as ssize_t, can be zero 1434 // 1435 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1436 // the hardware handle it. The two dwords within qwords that span 1437 // cache line boundaries will still be loaded and stored atomically. 
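  //
  // The stub branches to the disjoint (no-overlap) entry point whenever a
  // forward copy is safe. An illustrative C equivalent of that dispatch
  // (unsigned wrap-around makes d < s take the forward path as well):
  //
  //   if ((uintptr_t)(d - s) >= ((uintptr_t)count << exact_log2(size)))
  //     goto nooverlap_target;   // regions don't overlap, or d precedes s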
  //
  address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it. The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
1517 // 1518 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1519 const bool not_oop = false; 1520 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1521 } 1522 1523 // Arguments: 1524 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1525 // ignored 1526 // name - stub name string 1527 // 1528 // Inputs: 1529 // c_rarg0 - source array address 1530 // c_rarg1 - destination array address 1531 // c_rarg2 - element count, treated as ssize_t, can be zero 1532 // 1533 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1534 // we let the hardware handle it. The one to eight bytes within words, 1535 // dwords or qwords that span cache line boundaries will still be loaded 1536 // and stored atomically. 1537 // 1538 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1539 address* entry, const char *name) { 1540 const bool not_oop = false; 1541 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1542 } 1543 1544 // Arguments: 1545 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1546 // ignored 1547 // name - stub name string 1548 // 1549 // Inputs: 1550 // c_rarg0 - source array address 1551 // c_rarg1 - destination array address 1552 // c_rarg2 - element count, treated as ssize_t, can be zero 1553 // 1554 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1555 // let the hardware handle it. The two or four words within dwords 1556 // or qwords that span cache line boundaries will still be loaded 1557 // and stored atomically. 1558 // 1559 // Side Effects: 1560 // disjoint_short_copy_entry is set to the no-overlap entry point 1561 // used by generate_conjoint_short_copy(). 1562 // 1563 address generate_disjoint_short_copy(bool aligned, 1564 address* entry, const char *name) { 1565 const bool not_oop = false; 1566 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1567 } 1568 1569 // Arguments: 1570 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1571 // ignored 1572 // name - stub name string 1573 // 1574 // Inputs: 1575 // c_rarg0 - source array address 1576 // c_rarg1 - destination array address 1577 // c_rarg2 - element count, treated as ssize_t, can be zero 1578 // 1579 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1580 // let the hardware handle it. The two or four words within dwords 1581 // or qwords that span cache line boundaries will still be loaded 1582 // and stored atomically. 1583 // 1584 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1585 address *entry, const char *name) { 1586 const bool not_oop = false; 1587 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1588 1589 } 1590 // Arguments: 1591 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1592 // ignored 1593 // name - stub name string 1594 // 1595 // Inputs: 1596 // c_rarg0 - source array address 1597 // c_rarg1 - destination array address 1598 // c_rarg2 - element count, treated as ssize_t, can be zero 1599 // 1600 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1601 // the hardware handle it. The two dwords within qwords that span 1602 // cache line boundaries will still be loaded and stored atomically. 
1603 // 1604 // Side Effects: 1605 // disjoint_int_copy_entry is set to the no-overlap entry point 1606 // used by generate_conjoint_int_oop_copy(). 1607 // 1608 address generate_disjoint_int_copy(bool aligned, address *entry, 1609 const char *name, bool dest_uninitialized = false) { 1610 const bool not_oop = false; 1611 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1612 } 1613 1614 // Arguments: 1615 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1616 // ignored 1617 // name - stub name string 1618 // 1619 // Inputs: 1620 // c_rarg0 - source array address 1621 // c_rarg1 - destination array address 1622 // c_rarg2 - element count, treated as ssize_t, can be zero 1623 // 1624 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1625 // the hardware handle it. The two dwords within qwords that span 1626 // cache line boundaries will still be loaded and stored atomically. 1627 // 1628 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1629 address *entry, const char *name, 1630 bool dest_uninitialized = false) { 1631 const bool not_oop = false; 1632 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1633 } 1634 1635 1636 // Arguments: 1637 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1638 // ignored 1639 // name - stub name string 1640 // 1641 // Inputs: 1642 // c_rarg0 - source array address 1643 // c_rarg1 - destination array address 1644 // c_rarg2 - element count, treated as size_t, can be zero 1645 // 1646 // Side Effects: 1647 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1648 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1649 // 1650 address generate_disjoint_long_copy(bool aligned, address *entry, 1651 const char *name, bool dest_uninitialized = false) { 1652 const bool not_oop = false; 1653 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1654 } 1655 1656 // Arguments: 1657 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1658 // ignored 1659 // name - stub name string 1660 // 1661 // Inputs: 1662 // c_rarg0 - source array address 1663 // c_rarg1 - destination array address 1664 // c_rarg2 - element count, treated as size_t, can be zero 1665 // 1666 address generate_conjoint_long_copy(bool aligned, 1667 address nooverlap_target, address *entry, 1668 const char *name, bool dest_uninitialized = false) { 1669 const bool not_oop = false; 1670 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1671 } 1672 1673 // Arguments: 1674 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1675 // ignored 1676 // name - stub name string 1677 // 1678 // Inputs: 1679 // c_rarg0 - source array address 1680 // c_rarg1 - destination array address 1681 // c_rarg2 - element count, treated as size_t, can be zero 1682 // 1683 // Side Effects: 1684 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1685 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1686 // 1687 address generate_disjoint_oop_copy(bool aligned, address *entry, 1688 const char *name, bool dest_uninitialized) { 1689 const bool is_oop = true; 1690 const int size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1691 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1692 } 1693 1694 // Arguments: 1695 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1696 // ignored 1697 // name - stub name string 1698 // 1699 // Inputs: 1700 // c_rarg0 - source array address 1701 // c_rarg1 - destination array address 1702 // c_rarg2 - element count, treated as size_t, can be zero 1703 // 1704 address generate_conjoint_oop_copy(bool aligned, 1705 address nooverlap_target, address *entry, 1706 const char *name, bool dest_uninitialized) { 1707 const bool is_oop = true; 1708 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1709 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1710 name, dest_uninitialized); 1711 } 1712 1713 1714 // Helper for generating a dynamic type check. 1715 // Smashes rscratch1, rscratch2. 1716 void generate_type_check(Register sub_klass, 1717 Register super_check_offset, 1718 Register super_klass, 1719 Label& L_success) { 1720 assert_different_registers(sub_klass, super_check_offset, super_klass); 1721 1722 BLOCK_COMMENT("type_check:"); 1723 1724 Label L_miss; 1725 1726 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1727 super_check_offset); 1728 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1729 1730 // Fall through on failure! 1731 __ BIND(L_miss); 1732 } 1733 1734 // 1735 // Generate checkcasting array copy stub 1736 // 1737 // Input: 1738 // c_rarg0 - source array address 1739 // c_rarg1 - destination array address 1740 // c_rarg2 - element count, treated as ssize_t, can be zero 1741 // c_rarg3 - size_t ckoff (super_check_offset) 1742 // c_rarg4 - oop ckval (super_klass) 1743 // 1744 // Output: 1745 // r0 == 0 - success 1746 // r0 == -1^K - failure, where K is partial transfer count 1747 // 1748 address generate_checkcast_copy(const char *name, address *entry, 1749 bool dest_uninitialized = false) { 1750 1751 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1752 1753 // Input registers (after setup_arg_regs) 1754 const Register from = c_rarg0; // source array address 1755 const Register to = c_rarg1; // destination array address 1756 const Register count = c_rarg2; // elementscount 1757 const Register ckoff = c_rarg3; // super_check_offset 1758 const Register ckval = c_rarg4; // super_klass 1759 1760 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1761 RegSet wb_post_saved_regs = RegSet::of(count); 1762 1763 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1764 const Register copied_oop = r22; // actual oop copied 1765 const Register count_save = r21; // orig elementscount 1766 const Register start_to = r20; // destination array start address 1767 const Register r19_klass = r19; // oop._klass 1768 1769 //--------------------------------------------------------------- 1770 // Assembler stub will be used for this call to arraycopy 1771 // if the two arrays are subtypes of Object[] but the 1772 // destination array type is not equal to or a supertype 1773 // of the source type. Each element must be separately 1774 // checked. 
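    // Illustrative example (not from the source): copying from an Object[]
    // that happens to hold Strings into a String[] destination cannot be
    // proven safe statically, so each element is loaded, type-checked
    // against the destination element klass, and only then stored; the
    // first element that fails the check aborts the copy.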
1775 1776 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1777 copied_oop, r19_klass, count_save); 1778 1779 __ align(CodeEntryAlignment); 1780 StubCodeMark mark(this, "StubRoutines", name); 1781 address start = __ pc(); 1782 1783 __ enter(); // required for proper stackwalking of RuntimeStub frame 1784 1785 #ifdef ASSERT 1786 // caller guarantees that the arrays really are different 1787 // otherwise, we would have to make conjoint checks 1788 { Label L; 1789 array_overlap_test(L, TIMES_OOP); 1790 __ stop("checkcast_copy within a single array"); 1791 __ bind(L); 1792 } 1793 #endif //ASSERT 1794 1795 // Caller of this entry point must set up the argument registers. 1796 if (entry != NULL) { 1797 *entry = __ pc(); 1798 BLOCK_COMMENT("Entry:"); 1799 } 1800 1801 // Empty array: Nothing to do. 1802 __ cbz(count, L_done); 1803 __ push(RegSet::of(r19, r20, r21, r22), sp); 1804 1805 #ifdef ASSERT 1806 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1807 // The ckoff and ckval must be mutually consistent, 1808 // even though caller generates both. 1809 { Label L; 1810 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1811 __ ldrw(start_to, Address(ckval, sco_offset)); 1812 __ cmpw(ckoff, start_to); 1813 __ br(Assembler::EQ, L); 1814 __ stop("super_check_offset inconsistent"); 1815 __ bind(L); 1816 } 1817 #endif //ASSERT 1818 1819 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1820 bool is_oop = true; 1821 if (dest_uninitialized) { 1822 decorators |= IS_DEST_UNINITIALIZED; 1823 } 1824 1825 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1826 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1827 1828 // save the original count 1829 __ mov(count_save, count); 1830 1831 // Copy from low to high addresses 1832 __ mov(start_to, to); // Save destination array start address 1833 __ b(L_load_element); 1834 1835 // ======== begin loop ======== 1836 // (Loop is rotated; its entry is L_load_element.) 1837 // Loop control: 1838 // for (; count != 0; count--) { 1839 // copied_oop = load_heap_oop(from++); 1840 // ... generate_type_check ...; 1841 // store_heap_oop(to++, copied_oop); 1842 // } 1843 __ align(OptoLoopAlignment); 1844 1845 __ BIND(L_store_element); 1846 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1847 __ sub(count, count, 1); 1848 __ cbz(count, L_do_card_marks); 1849 1850 // ======== loop entry is here ======== 1851 __ BIND(L_load_element); 1852 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1853 __ cbz(copied_oop, L_store_element); 1854 1855 __ load_klass(r19_klass, copied_oop);// query the object klass 1856 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1857 // ======== end loop ======== 1858 1859 // It was a real error; we must depend on the caller to finish the job. 1860 // Register count = remaining oops, count_orig = total oops. 1861 // Emit GC store barriers for the oops we have copied and report 1862 // their number to the caller. 
1863 1864 __ subs(count, count_save, count); // K = partially copied oop count 1865 __ eon(count, count, zr); // report (-1^K) to caller 1866 __ br(Assembler::EQ, L_done_pop); 1867 1868 __ BIND(L_do_card_marks); 1869 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1870 1871 __ bind(L_done_pop); 1872 __ pop(RegSet::of(r19, r20, r21, r22), sp); 1873 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1874 1875 __ bind(L_done); 1876 __ mov(r0, count); 1877 __ leave(); 1878 __ ret(lr); 1879 1880 return start; 1881 } 1882 1883 // Perform range checks on the proposed arraycopy. 1884 // Kills temp, but nothing else. 1885 // Also, clean the sign bits of src_pos and dst_pos. 1886 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1887 Register src_pos, // source position (c_rarg1) 1888 Register dst, // destination array oo (c_rarg2) 1889 Register dst_pos, // destination position (c_rarg3) 1890 Register length, 1891 Register temp, 1892 Label& L_failed) { 1893 BLOCK_COMMENT("arraycopy_range_checks:"); 1894 1895 assert_different_registers(rscratch1, temp); 1896 1897 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1898 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1899 __ addw(temp, length, src_pos); 1900 __ cmpw(temp, rscratch1); 1901 __ br(Assembler::HI, L_failed); 1902 1903 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1904 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1905 __ addw(temp, length, dst_pos); 1906 __ cmpw(temp, rscratch1); 1907 __ br(Assembler::HI, L_failed); 1908 1909 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1910 __ movw(src_pos, src_pos); 1911 __ movw(dst_pos, dst_pos); 1912 1913 BLOCK_COMMENT("arraycopy_range_checks done"); 1914 } 1915 1916 // These stubs get called from some dumb test routine. 1917 // I'll write them properly when they're called from 1918 // something that's actually doing something. 1919 static void fake_arraycopy_stub(address src, address dst, int count) { 1920 assert(count == 0, "huh?"); 1921 } 1922 1923 1924 // 1925 // Generate 'unsafe' array copy stub 1926 // Though just as safe as the other stubs, it takes an unscaled 1927 // size_t argument instead of an element count. 1928 // 1929 // Input: 1930 // c_rarg0 - source array address 1931 // c_rarg1 - destination array address 1932 // c_rarg2 - byte count, treated as ssize_t, can be zero 1933 // 1934 // Examines the alignment of the operands and dispatches 1935 // to a long, int, short, or byte copy loop. 
1936 // 1937 address generate_unsafe_copy(const char *name, 1938 address byte_copy_entry, 1939 address short_copy_entry, 1940 address int_copy_entry, 1941 address long_copy_entry) { 1942 Label L_long_aligned, L_int_aligned, L_short_aligned; 1943 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1944 1945 __ align(CodeEntryAlignment); 1946 StubCodeMark mark(this, "StubRoutines", name); 1947 address start = __ pc(); 1948 __ enter(); // required for proper stackwalking of RuntimeStub frame 1949 1950 // bump this on entry, not on exit: 1951 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1952 1953 __ orr(rscratch1, s, d); 1954 __ orr(rscratch1, rscratch1, count); 1955 1956 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1957 __ cbz(rscratch1, L_long_aligned); 1958 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1959 __ cbz(rscratch1, L_int_aligned); 1960 __ tbz(rscratch1, 0, L_short_aligned); 1961 __ b(RuntimeAddress(byte_copy_entry)); 1962 1963 __ BIND(L_short_aligned); 1964 __ lsr(count, count, LogBytesPerShort); // size => short_count 1965 __ b(RuntimeAddress(short_copy_entry)); 1966 __ BIND(L_int_aligned); 1967 __ lsr(count, count, LogBytesPerInt); // size => int_count 1968 __ b(RuntimeAddress(int_copy_entry)); 1969 __ BIND(L_long_aligned); 1970 __ lsr(count, count, LogBytesPerLong); // size => long_count 1971 __ b(RuntimeAddress(long_copy_entry)); 1972 1973 return start; 1974 } 1975 1976 // 1977 // Generate generic array copy stubs 1978 // 1979 // Input: 1980 // c_rarg0 - src oop 1981 // c_rarg1 - src_pos (32-bits) 1982 // c_rarg2 - dst oop 1983 // c_rarg3 - dst_pos (32-bits) 1984 // c_rarg4 - element count (32-bits) 1985 // 1986 // Output: 1987 // r0 == 0 - success 1988 // r0 == -1^K - failure, where K is partial transfer count 1989 // 1990 address generate_generic_copy(const char *name, 1991 address byte_copy_entry, address short_copy_entry, 1992 address int_copy_entry, address oop_copy_entry, 1993 address long_copy_entry, address checkcast_copy_entry) { 1994 1995 Label L_failed, L_objArray; 1996 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1997 1998 // Input registers 1999 const Register src = c_rarg0; // source array oop 2000 const Register src_pos = c_rarg1; // source position 2001 const Register dst = c_rarg2; // destination array oop 2002 const Register dst_pos = c_rarg3; // destination position 2003 const Register length = c_rarg4; 2004 2005 2006 // Registers used as temps 2007 const Register dst_klass = c_rarg5; 2008 2009 __ align(CodeEntryAlignment); 2010 2011 StubCodeMark mark(this, "StubRoutines", name); 2012 2013 address start = __ pc(); 2014 2015 __ enter(); // required for proper stackwalking of RuntimeStub frame 2016 2017 // bump this on entry, not on exit: 2018 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2019 2020 //----------------------------------------------------------------------- 2021 // Assembler stub will be used for this call to arraycopy 2022 // if the following conditions are met: 2023 // 2024 // (1) src and dst must not be null. 2025 // (2) src_pos must not be negative. 2026 // (3) dst_pos must not be negative. 2027 // (4) length must not be negative. 2028 // (5) src klass and dst klass should be the same and not NULL. 2029 // (6) src and dst should be arrays. 2030 // (7) src_pos + length must not exceed length of src. 2031 // (8) dst_pos + length must not exceed length of dst. 
2032 // 2033 2034 // if (src == NULL) return -1; 2035 __ cbz(src, L_failed); 2036 2037 // if (src_pos < 0) return -1; 2038 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2039 2040 // if (dst == NULL) return -1; 2041 __ cbz(dst, L_failed); 2042 2043 // if (dst_pos < 0) return -1; 2044 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2045 2046 // registers used as temp 2047 const Register scratch_length = r16; // elements count to copy 2048 const Register scratch_src_klass = r17; // array klass 2049 const Register lh = r15; // layout helper 2050 2051 // if (length < 0) return -1; 2052 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2053 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2054 2055 __ load_klass(scratch_src_klass, src); 2056 #ifdef ASSERT 2057 // assert(src->klass() != NULL); 2058 { 2059 BLOCK_COMMENT("assert klasses not null {"); 2060 Label L1, L2; 2061 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2062 __ bind(L1); 2063 __ stop("broken null klass"); 2064 __ bind(L2); 2065 __ load_klass(rscratch1, dst); 2066 __ cbz(rscratch1, L1); // this would be broken also 2067 BLOCK_COMMENT("} assert klasses not null done"); 2068 } 2069 #endif 2070 2071 // Load layout helper (32-bits) 2072 // 2073 // |array_tag| | header_size | element_type | |log2_element_size| 2074 // 32 30 24 16 8 2 0 2075 // 2076 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2077 // 2078 2079 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2080 2081 // Handle objArrays completely differently... 2082 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2083 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2084 __ movw(rscratch1, objArray_lh); 2085 __ eorw(rscratch2, lh, rscratch1); 2086 __ cbzw(rscratch2, L_objArray); 2087 2088 // if (src->klass() != dst->klass()) return -1; 2089 __ load_klass(rscratch2, dst); 2090 __ eor(rscratch2, rscratch2, scratch_src_klass); 2091 __ cbnz(rscratch2, L_failed); 2092 2093 // if (!src->is_Array()) return -1; 2094 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2095 2096 // At this point, it is known to be a typeArray (array_tag 0x3). 
2097 #ifdef ASSERT 2098 { 2099 BLOCK_COMMENT("assert primitive array {"); 2100 Label L; 2101 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2102 __ cmpw(lh, rscratch2); 2103 __ br(Assembler::GE, L); 2104 __ stop("must be a primitive array"); 2105 __ bind(L); 2106 BLOCK_COMMENT("} assert primitive array done"); 2107 } 2108 #endif 2109 2110 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2111 rscratch2, L_failed); 2112 2113 // TypeArrayKlass 2114 // 2115 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2116 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2117 // 2118 2119 const Register rscratch1_offset = rscratch1; // array offset 2120 const Register r15_elsize = lh; // element size 2121 2122 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2123 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2124 __ add(src, src, rscratch1_offset); // src array offset 2125 __ add(dst, dst, rscratch1_offset); // dst array offset 2126 BLOCK_COMMENT("choose copy loop based on element size"); 2127 2128 // next registers should be set before the jump to corresponding stub 2129 const Register from = c_rarg0; // source array address 2130 const Register to = c_rarg1; // destination array address 2131 const Register count = c_rarg2; // elements count 2132 2133 // 'from', 'to', 'count' registers should be set in such order 2134 // since they are the same as 'src', 'src_pos', 'dst'. 2135 2136 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2137 2138 // The possible values of elsize are 0-3, i.e. exact_log2(element 2139 // size in bytes). We do a simple bitwise binary search. 2140 __ BIND(L_copy_bytes); 2141 __ tbnz(r15_elsize, 1, L_copy_ints); 2142 __ tbnz(r15_elsize, 0, L_copy_shorts); 2143 __ lea(from, Address(src, src_pos));// src_addr 2144 __ lea(to, Address(dst, dst_pos));// dst_addr 2145 __ movw(count, scratch_length); // length 2146 __ b(RuntimeAddress(byte_copy_entry)); 2147 2148 __ BIND(L_copy_shorts); 2149 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2150 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2151 __ movw(count, scratch_length); // length 2152 __ b(RuntimeAddress(short_copy_entry)); 2153 2154 __ BIND(L_copy_ints); 2155 __ tbnz(r15_elsize, 0, L_copy_longs); 2156 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2157 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2158 __ movw(count, scratch_length); // length 2159 __ b(RuntimeAddress(int_copy_entry)); 2160 2161 __ BIND(L_copy_longs); 2162 #ifdef ASSERT 2163 { 2164 BLOCK_COMMENT("assert long copy {"); 2165 Label L; 2166 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2167 __ cmpw(r15_elsize, LogBytesPerLong); 2168 __ br(Assembler::EQ, L); 2169 __ stop("must be long copy, but elsize is wrong"); 2170 __ bind(L); 2171 BLOCK_COMMENT("} assert long copy done"); 2172 } 2173 #endif 2174 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2175 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2176 __ movw(count, scratch_length); // length 2177 __ b(RuntimeAddress(long_copy_entry)); 2178 2179 // ObjArrayKlass 2180 __ BIND(L_objArray); 2181 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2182 2183 Label L_plain_copy, L_checkcast_copy; 2184 // test array classes for subtyping 2185 __ load_klass(r15, dst); 2186 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2187 __ br(Assembler::NE, L_checkcast_copy); 2188 2189 // Identically typed arrays can be copied without element-wise checks. 2190 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2191 rscratch2, L_failed); 2192 2193 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2194 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2195 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2196 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2197 __ movw(count, scratch_length); // length 2198 __ BIND(L_plain_copy); 2199 __ b(RuntimeAddress(oop_copy_entry)); 2200 2201 __ BIND(L_checkcast_copy); 2202 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2203 { 2204 // Before looking at dst.length, make sure dst is also an objArray. 2205 __ ldrw(rscratch1, Address(r15, lh_offset)); 2206 __ movw(rscratch2, objArray_lh); 2207 __ eorw(rscratch1, rscratch1, rscratch2); 2208 __ cbnzw(rscratch1, L_failed); 2209 2210 // It is safe to examine both src.length and dst.length. 2211 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2212 r15, L_failed); 2213 2214 __ load_klass(dst_klass, dst); // reload 2215 2216 // Marshal the base address arguments now, freeing registers. 2217 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2218 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2219 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2220 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2221 __ movw(count, length); // length (reloaded) 2222 Register sco_temp = c_rarg3; // this register is free now 2223 assert_different_registers(from, to, count, sco_temp, 2224 dst_klass, scratch_src_klass); 2225 // assert_clean_int(count, sco_temp); 2226 2227 // Generate the type check. 2228 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2229 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2230 2231 // Smashes rscratch1, rscratch2 2232 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2233 2234 // Fetch destination element klass from the ObjArrayKlass header. 2235 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2236 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2237 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2238 2239 // the checkcast_copy loop needs two extra arguments: 2240 assert(c_rarg3 == sco_temp, "#3 already in place"); 2241 // Set up arguments for checkcast_copy_entry. 2242 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2243 __ b(RuntimeAddress(checkcast_copy_entry)); 2244 } 2245 2246 __ BIND(L_failed); 2247 __ mov(r0, -1); 2248 __ leave(); // required for proper stackwalking of RuntimeStub frame 2249 __ ret(lr); 2250 2251 return start; 2252 } 2253 2254 // 2255 // Generate stub for array fill. If "aligned" is true, the 2256 // "to" address is assumed to be heapword aligned. 
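  // Illustrative sketch (assumption about the widening done below): the
  // element value is replicated with bit-field inserts so that one wide
  // store writes several elements at once, e.g. for a T_BYTE value 0xab:
  //   bfi(value, value,  8,  8);  // 0x000000ab -> 0x0000abab
  //   bfi(value, value, 16, 16);  // 0x0000abab -> 0xabababab
  //   bfi(value, value, 32, 32);  // -> 0xabababababababab (word-fill loop)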
2257 // 2258 // Arguments for generated stub: 2259 // to: c_rarg0 2260 // value: c_rarg1 2261 // count: c_rarg2 treated as signed 2262 // 2263 address generate_fill(BasicType t, bool aligned, const char *name) { 2264 __ align(CodeEntryAlignment); 2265 StubCodeMark mark(this, "StubRoutines", name); 2266 address start = __ pc(); 2267 2268 BLOCK_COMMENT("Entry:"); 2269 2270 const Register to = c_rarg0; // source array address 2271 const Register value = c_rarg1; // value 2272 const Register count = c_rarg2; // elements count 2273 2274 const Register bz_base = r10; // base for block_zero routine 2275 const Register cnt_words = r11; // temp register 2276 2277 __ enter(); 2278 2279 Label L_fill_elements, L_exit1; 2280 2281 int shift = -1; 2282 switch (t) { 2283 case T_BYTE: 2284 shift = 0; 2285 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2286 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2287 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2288 __ br(Assembler::LO, L_fill_elements); 2289 break; 2290 case T_SHORT: 2291 shift = 1; 2292 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2293 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2294 __ br(Assembler::LO, L_fill_elements); 2295 break; 2296 case T_INT: 2297 shift = 2; 2298 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2299 __ br(Assembler::LO, L_fill_elements); 2300 break; 2301 default: ShouldNotReachHere(); 2302 } 2303 2304 // Align source address at 8 bytes address boundary. 2305 Label L_skip_align1, L_skip_align2, L_skip_align4; 2306 if (!aligned) { 2307 switch (t) { 2308 case T_BYTE: 2309 // One byte misalignment happens only for byte arrays. 2310 __ tbz(to, 0, L_skip_align1); 2311 __ strb(value, Address(__ post(to, 1))); 2312 __ subw(count, count, 1); 2313 __ bind(L_skip_align1); 2314 // Fallthrough 2315 case T_SHORT: 2316 // Two bytes misalignment happens only for byte and short (char) arrays. 2317 __ tbz(to, 1, L_skip_align2); 2318 __ strh(value, Address(__ post(to, 2))); 2319 __ subw(count, count, 2 >> shift); 2320 __ bind(L_skip_align2); 2321 // Fallthrough 2322 case T_INT: 2323 // Align to 8 bytes, we know we are 4 byte aligned to start. 2324 __ tbz(to, 2, L_skip_align4); 2325 __ strw(value, Address(__ post(to, 4))); 2326 __ subw(count, count, 4 >> shift); 2327 __ bind(L_skip_align4); 2328 break; 2329 default: ShouldNotReachHere(); 2330 } 2331 } 2332 2333 // 2334 // Fill large chunks 2335 // 2336 __ lsrw(cnt_words, count, 3 - shift); // number of words 2337 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2338 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2339 if (UseBlockZeroing) { 2340 Label non_block_zeroing, rest; 2341 // If the fill value is zero we can use the fast zero_words(). 2342 __ cbnz(value, non_block_zeroing); 2343 __ mov(bz_base, to); 2344 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2345 __ zero_words(bz_base, cnt_words); 2346 __ b(rest); 2347 __ bind(non_block_zeroing); 2348 __ fill_words(to, cnt_words, value); 2349 __ bind(rest); 2350 } else { 2351 __ fill_words(to, cnt_words, value); 2352 } 2353 2354 // Remaining count is less than 8 bytes. Fill it by a single store. 2355 // Note that the total length is no less than 8 bytes. 
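    // Illustrative note on the store below: 'to' is advanced to the end of
    // the fill and a single 8-byte store is issued at [end - 8]. That
    // covers the 1..7 remaining bytes; the bytes it re-writes already hold
    // the fill value (written by the alignment prologue or the word loop),
    // which is safe because the total length is at least 8 bytes.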
2356 if (t == T_BYTE || t == T_SHORT) { 2357 Label L_exit1; 2358 __ cbzw(count, L_exit1); 2359 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2360 __ str(value, Address(to, -8)); // overwrite some elements 2361 __ bind(L_exit1); 2362 __ leave(); 2363 __ ret(lr); 2364 } 2365 2366 // Handle copies less than 8 bytes. 2367 Label L_fill_2, L_fill_4, L_exit2; 2368 __ bind(L_fill_elements); 2369 switch (t) { 2370 case T_BYTE: 2371 __ tbz(count, 0, L_fill_2); 2372 __ strb(value, Address(__ post(to, 1))); 2373 __ bind(L_fill_2); 2374 __ tbz(count, 1, L_fill_4); 2375 __ strh(value, Address(__ post(to, 2))); 2376 __ bind(L_fill_4); 2377 __ tbz(count, 2, L_exit2); 2378 __ strw(value, Address(to)); 2379 break; 2380 case T_SHORT: 2381 __ tbz(count, 0, L_fill_4); 2382 __ strh(value, Address(__ post(to, 2))); 2383 __ bind(L_fill_4); 2384 __ tbz(count, 1, L_exit2); 2385 __ strw(value, Address(to)); 2386 break; 2387 case T_INT: 2388 __ cbzw(count, L_exit2); 2389 __ strw(value, Address(to)); 2390 break; 2391 default: ShouldNotReachHere(); 2392 } 2393 __ bind(L_exit2); 2394 __ leave(); 2395 __ ret(lr); 2396 return start; 2397 } 2398 2399 address generate_data_cache_writeback() { 2400 const Register line = c_rarg0; // address of line to write back 2401 2402 __ align(CodeEntryAlignment); 2403 2404 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2405 2406 address start = __ pc(); 2407 __ enter(); 2408 __ cache_wb(Address(line, 0)); 2409 __ leave(); 2410 __ ret(lr); 2411 2412 return start; 2413 } 2414 2415 address generate_data_cache_writeback_sync() { 2416 const Register is_pre = c_rarg0; // pre or post sync 2417 2418 __ align(CodeEntryAlignment); 2419 2420 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2421 2422 // pre wbsync is a no-op 2423 // post wbsync translates to an sfence 2424 2425 Label skip; 2426 address start = __ pc(); 2427 __ enter(); 2428 __ cbnz(is_pre, skip); 2429 __ cache_wbsync(false); 2430 __ bind(skip); 2431 __ leave(); 2432 __ ret(lr); 2433 2434 return start; 2435 } 2436 2437 void generate_arraycopy_stubs() { 2438 address entry; 2439 address entry_jbyte_arraycopy; 2440 address entry_jshort_arraycopy; 2441 address entry_jint_arraycopy; 2442 address entry_oop_arraycopy; 2443 address entry_jlong_arraycopy; 2444 address entry_checkcast_arraycopy; 2445 2446 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2447 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2448 2449 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2450 2451 //*** jbyte 2452 // Always need aligned and unaligned versions 2453 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2454 "jbyte_disjoint_arraycopy"); 2455 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2456 &entry_jbyte_arraycopy, 2457 "jbyte_arraycopy"); 2458 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2459 "arrayof_jbyte_disjoint_arraycopy"); 2460 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2461 "arrayof_jbyte_arraycopy"); 2462 2463 //*** jshort 2464 // Always need aligned and unaligned versions 2465 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2466 "jshort_disjoint_arraycopy"); 2467 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2468 &entry_jshort_arraycopy, 2469 "jshort_arraycopy"); 2470 StubRoutines::_arrayof_jshort_disjoint_arraycopy = 
generate_disjoint_short_copy(true, &entry, 2471 "arrayof_jshort_disjoint_arraycopy"); 2472 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2473 "arrayof_jshort_arraycopy"); 2474 2475 //*** jint 2476 // Aligned versions 2477 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2478 "arrayof_jint_disjoint_arraycopy"); 2479 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2480 "arrayof_jint_arraycopy"); 2481 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2482 // entry_jint_arraycopy always points to the unaligned version 2483 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2484 "jint_disjoint_arraycopy"); 2485 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2486 &entry_jint_arraycopy, 2487 "jint_arraycopy"); 2488 2489 //*** jlong 2490 // It is always aligned 2491 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2492 "arrayof_jlong_disjoint_arraycopy"); 2493 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2494 "arrayof_jlong_arraycopy"); 2495 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2496 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2497 2498 //*** oops 2499 { 2500 // With compressed oops we need unaligned versions; notice that 2501 // we overwrite entry_oop_arraycopy. 2502 bool aligned = !UseCompressedOops; 2503 2504 StubRoutines::_arrayof_oop_disjoint_arraycopy 2505 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2506 /*dest_uninitialized*/false); 2507 StubRoutines::_arrayof_oop_arraycopy 2508 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2509 /*dest_uninitialized*/false); 2510 // Aligned versions without pre-barriers 2511 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2512 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2513 /*dest_uninitialized*/true); 2514 StubRoutines::_arrayof_oop_arraycopy_uninit 2515 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2516 /*dest_uninitialized*/true); 2517 } 2518 2519 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2520 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2521 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2522 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2523 2524 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2525 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2526 /*dest_uninitialized*/true); 2527 2528 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2529 entry_jbyte_arraycopy, 2530 entry_jshort_arraycopy, 2531 entry_jint_arraycopy, 2532 entry_jlong_arraycopy); 2533 2534 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2535 entry_jbyte_arraycopy, 2536 entry_jshort_arraycopy, 2537 entry_jint_arraycopy, 2538 entry_oop_arraycopy, 2539 entry_jlong_arraycopy, 2540 entry_checkcast_arraycopy); 2541 2542 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 
2543 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2544 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2545 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2546 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2547 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2548 } 2549 2550 void generate_math_stubs() { Unimplemented(); } 2551 2552 // Arguments: 2553 // 2554 // Inputs: 2555 // c_rarg0 - source byte array address 2556 // c_rarg1 - destination byte array address 2557 // c_rarg2 - K (key) in little endian int array 2558 // 2559 address generate_aescrypt_encryptBlock() { 2560 __ align(CodeEntryAlignment); 2561 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2562 2563 Label L_doLast; 2564 2565 const Register from = c_rarg0; // source array address 2566 const Register to = c_rarg1; // destination array address 2567 const Register key = c_rarg2; // key array address 2568 const Register keylen = rscratch1; 2569 2570 address start = __ pc(); 2571 __ enter(); 2572 2573 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2574 2575 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2576 2577 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2578 __ rev32(v1, __ T16B, v1); 2579 __ rev32(v2, __ T16B, v2); 2580 __ rev32(v3, __ T16B, v3); 2581 __ rev32(v4, __ T16B, v4); 2582 __ aese(v0, v1); 2583 __ aesmc(v0, v0); 2584 __ aese(v0, v2); 2585 __ aesmc(v0, v0); 2586 __ aese(v0, v3); 2587 __ aesmc(v0, v0); 2588 __ aese(v0, v4); 2589 __ aesmc(v0, v0); 2590 2591 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2592 __ rev32(v1, __ T16B, v1); 2593 __ rev32(v2, __ T16B, v2); 2594 __ rev32(v3, __ T16B, v3); 2595 __ rev32(v4, __ T16B, v4); 2596 __ aese(v0, v1); 2597 __ aesmc(v0, v0); 2598 __ aese(v0, v2); 2599 __ aesmc(v0, v0); 2600 __ aese(v0, v3); 2601 __ aesmc(v0, v0); 2602 __ aese(v0, v4); 2603 __ aesmc(v0, v0); 2604 2605 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2606 __ rev32(v1, __ T16B, v1); 2607 __ rev32(v2, __ T16B, v2); 2608 2609 __ cmpw(keylen, 44); 2610 __ br(Assembler::EQ, L_doLast); 2611 2612 __ aese(v0, v1); 2613 __ aesmc(v0, v0); 2614 __ aese(v0, v2); 2615 __ aesmc(v0, v0); 2616 2617 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2618 __ rev32(v1, __ T16B, v1); 2619 __ rev32(v2, __ T16B, v2); 2620 2621 __ cmpw(keylen, 52); 2622 __ br(Assembler::EQ, L_doLast); 2623 2624 __ aese(v0, v1); 2625 __ aesmc(v0, v0); 2626 __ aese(v0, v2); 2627 __ aesmc(v0, v0); 2628 2629 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2630 __ rev32(v1, __ T16B, v1); 2631 __ rev32(v2, __ T16B, v2); 2632 2633 __ BIND(L_doLast); 2634 2635 __ aese(v0, v1); 2636 __ aesmc(v0, v0); 2637 __ aese(v0, v2); 2638 2639 __ ld1(v1, __ T16B, key); 2640 __ rev32(v1, __ T16B, v1); 2641 __ eor(v0, __ T16B, v0, v1); 2642 2643 __ st1(v0, __ T16B, to); 2644 2645 __ mov(r0, 0); 2646 2647 __ leave(); 2648 __ ret(lr); 2649 2650 return start; 2651 } 2652 2653 // Arguments: 2654 // 2655 // Inputs: 2656 // c_rarg0 - source byte array address 2657 // c_rarg1 - destination byte array address 2658 // c_rarg2 - K (key) in little endian int array 2659 // 2660 address generate_aescrypt_decryptBlock() { 2661 assert(UseAES, "need AES cryptographic extension support"); 2662 __ align(CodeEntryAlignment); 2663 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2664 Label L_doLast; 2665 2666 const Register 
from = c_rarg0; // source array address 2667 const Register to = c_rarg1; // destination array address 2668 const Register key = c_rarg2; // key array address 2669 const Register keylen = rscratch1; 2670 2671 address start = __ pc(); 2672 __ enter(); // required for proper stackwalking of RuntimeStub frame 2673 2674 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2675 2676 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2677 2678 __ ld1(v5, __ T16B, __ post(key, 16)); 2679 __ rev32(v5, __ T16B, v5); 2680 2681 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2682 __ rev32(v1, __ T16B, v1); 2683 __ rev32(v2, __ T16B, v2); 2684 __ rev32(v3, __ T16B, v3); 2685 __ rev32(v4, __ T16B, v4); 2686 __ aesd(v0, v1); 2687 __ aesimc(v0, v0); 2688 __ aesd(v0, v2); 2689 __ aesimc(v0, v0); 2690 __ aesd(v0, v3); 2691 __ aesimc(v0, v0); 2692 __ aesd(v0, v4); 2693 __ aesimc(v0, v0); 2694 2695 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2696 __ rev32(v1, __ T16B, v1); 2697 __ rev32(v2, __ T16B, v2); 2698 __ rev32(v3, __ T16B, v3); 2699 __ rev32(v4, __ T16B, v4); 2700 __ aesd(v0, v1); 2701 __ aesimc(v0, v0); 2702 __ aesd(v0, v2); 2703 __ aesimc(v0, v0); 2704 __ aesd(v0, v3); 2705 __ aesimc(v0, v0); 2706 __ aesd(v0, v4); 2707 __ aesimc(v0, v0); 2708 2709 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2710 __ rev32(v1, __ T16B, v1); 2711 __ rev32(v2, __ T16B, v2); 2712 2713 __ cmpw(keylen, 44); 2714 __ br(Assembler::EQ, L_doLast); 2715 2716 __ aesd(v0, v1); 2717 __ aesimc(v0, v0); 2718 __ aesd(v0, v2); 2719 __ aesimc(v0, v0); 2720 2721 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2722 __ rev32(v1, __ T16B, v1); 2723 __ rev32(v2, __ T16B, v2); 2724 2725 __ cmpw(keylen, 52); 2726 __ br(Assembler::EQ, L_doLast); 2727 2728 __ aesd(v0, v1); 2729 __ aesimc(v0, v0); 2730 __ aesd(v0, v2); 2731 __ aesimc(v0, v0); 2732 2733 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2734 __ rev32(v1, __ T16B, v1); 2735 __ rev32(v2, __ T16B, v2); 2736 2737 __ BIND(L_doLast); 2738 2739 __ aesd(v0, v1); 2740 __ aesimc(v0, v0); 2741 __ aesd(v0, v2); 2742 2743 __ eor(v0, __ T16B, v0, v5); 2744 2745 __ st1(v0, __ T16B, to); 2746 2747 __ mov(r0, 0); 2748 2749 __ leave(); 2750 __ ret(lr); 2751 2752 return start; 2753 } 2754 2755 // Arguments: 2756 // 2757 // Inputs: 2758 // c_rarg0 - source byte array address 2759 // c_rarg1 - destination byte array address 2760 // c_rarg2 - K (key) in little endian int array 2761 // c_rarg3 - r vector byte array address 2762 // c_rarg4 - input length 2763 // 2764 // Output: 2765 // x0 - input length 2766 // 2767 address generate_cipherBlockChaining_encryptAESCrypt() { 2768 assert(UseAES, "need AES cryptographic extension support"); 2769 __ align(CodeEntryAlignment); 2770 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2771 2772 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2773 2774 const Register from = c_rarg0; // source array address 2775 const Register to = c_rarg1; // destination array address 2776 const Register key = c_rarg2; // key array address 2777 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2778 // and left with the results of the last encryption block 2779 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2780 const Register keylen = rscratch1; 2781 2782 address start = __ pc(); 2783 2784 __ enter(); 2785 2786 __ movw(rscratch2, len_reg); 2787 2788 __ ldrw(keylen, Address(key, 
arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2789 2790 __ ld1(v0, __ T16B, rvec); 2791 2792 __ cmpw(keylen, 52); 2793 __ br(Assembler::CC, L_loadkeys_44); 2794 __ br(Assembler::EQ, L_loadkeys_52); 2795 2796 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2797 __ rev32(v17, __ T16B, v17); 2798 __ rev32(v18, __ T16B, v18); 2799 __ BIND(L_loadkeys_52); 2800 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2801 __ rev32(v19, __ T16B, v19); 2802 __ rev32(v20, __ T16B, v20); 2803 __ BIND(L_loadkeys_44); 2804 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2805 __ rev32(v21, __ T16B, v21); 2806 __ rev32(v22, __ T16B, v22); 2807 __ rev32(v23, __ T16B, v23); 2808 __ rev32(v24, __ T16B, v24); 2809 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2810 __ rev32(v25, __ T16B, v25); 2811 __ rev32(v26, __ T16B, v26); 2812 __ rev32(v27, __ T16B, v27); 2813 __ rev32(v28, __ T16B, v28); 2814 __ ld1(v29, v30, v31, __ T16B, key); 2815 __ rev32(v29, __ T16B, v29); 2816 __ rev32(v30, __ T16B, v30); 2817 __ rev32(v31, __ T16B, v31); 2818 2819 __ BIND(L_aes_loop); 2820 __ ld1(v1, __ T16B, __ post(from, 16)); 2821 __ eor(v0, __ T16B, v0, v1); 2822 2823 __ br(Assembler::CC, L_rounds_44); 2824 __ br(Assembler::EQ, L_rounds_52); 2825 2826 __ aese(v0, v17); __ aesmc(v0, v0); 2827 __ aese(v0, v18); __ aesmc(v0, v0); 2828 __ BIND(L_rounds_52); 2829 __ aese(v0, v19); __ aesmc(v0, v0); 2830 __ aese(v0, v20); __ aesmc(v0, v0); 2831 __ BIND(L_rounds_44); 2832 __ aese(v0, v21); __ aesmc(v0, v0); 2833 __ aese(v0, v22); __ aesmc(v0, v0); 2834 __ aese(v0, v23); __ aesmc(v0, v0); 2835 __ aese(v0, v24); __ aesmc(v0, v0); 2836 __ aese(v0, v25); __ aesmc(v0, v0); 2837 __ aese(v0, v26); __ aesmc(v0, v0); 2838 __ aese(v0, v27); __ aesmc(v0, v0); 2839 __ aese(v0, v28); __ aesmc(v0, v0); 2840 __ aese(v0, v29); __ aesmc(v0, v0); 2841 __ aese(v0, v30); 2842 __ eor(v0, __ T16B, v0, v31); 2843 2844 __ st1(v0, __ T16B, __ post(to, 16)); 2845 2846 __ subw(len_reg, len_reg, 16); 2847 __ cbnzw(len_reg, L_aes_loop); 2848 2849 __ st1(v0, __ T16B, rvec); 2850 2851 __ mov(r0, rscratch2); 2852 2853 __ leave(); 2854 __ ret(lr); 2855 2856 return start; 2857 } 2858 2859 // Arguments: 2860 // 2861 // Inputs: 2862 // c_rarg0 - source byte array address 2863 // c_rarg1 - destination byte array address 2864 // c_rarg2 - K (key) in little endian int array 2865 // c_rarg3 - r vector byte array address 2866 // c_rarg4 - input length 2867 // 2868 // Output: 2869 // r0 - input length 2870 // 2871 address generate_cipherBlockChaining_decryptAESCrypt() { 2872 assert(UseAES, "need AES cryptographic extension support"); 2873 __ align(CodeEntryAlignment); 2874 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2875 2876 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2877 2878 const Register from = c_rarg0; // source array address 2879 const Register to = c_rarg1; // destination array address 2880 const Register key = c_rarg2; // key array address 2881 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2882 // and left with the results of the last encryption block 2883 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2884 const Register keylen = rscratch1; 2885 2886 address start = __ pc(); 2887 2888 __ enter(); 2889 2890 __ movw(rscratch2, len_reg); 2891 2892 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2893 2894 __ ld1(v2, __ 
T16B, rvec); 2895 2896 __ ld1(v31, __ T16B, __ post(key, 16)); 2897 __ rev32(v31, __ T16B, v31); 2898 2899 __ cmpw(keylen, 52); 2900 __ br(Assembler::CC, L_loadkeys_44); 2901 __ br(Assembler::EQ, L_loadkeys_52); 2902 2903 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2904 __ rev32(v17, __ T16B, v17); 2905 __ rev32(v18, __ T16B, v18); 2906 __ BIND(L_loadkeys_52); 2907 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2908 __ rev32(v19, __ T16B, v19); 2909 __ rev32(v20, __ T16B, v20); 2910 __ BIND(L_loadkeys_44); 2911 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2912 __ rev32(v21, __ T16B, v21); 2913 __ rev32(v22, __ T16B, v22); 2914 __ rev32(v23, __ T16B, v23); 2915 __ rev32(v24, __ T16B, v24); 2916 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2917 __ rev32(v25, __ T16B, v25); 2918 __ rev32(v26, __ T16B, v26); 2919 __ rev32(v27, __ T16B, v27); 2920 __ rev32(v28, __ T16B, v28); 2921 __ ld1(v29, v30, __ T16B, key); 2922 __ rev32(v29, __ T16B, v29); 2923 __ rev32(v30, __ T16B, v30); 2924 2925 __ BIND(L_aes_loop); 2926 __ ld1(v0, __ T16B, __ post(from, 16)); 2927 __ orr(v1, __ T16B, v0, v0); 2928 2929 __ br(Assembler::CC, L_rounds_44); 2930 __ br(Assembler::EQ, L_rounds_52); 2931 2932 __ aesd(v0, v17); __ aesimc(v0, v0); 2933 __ aesd(v0, v18); __ aesimc(v0, v0); 2934 __ BIND(L_rounds_52); 2935 __ aesd(v0, v19); __ aesimc(v0, v0); 2936 __ aesd(v0, v20); __ aesimc(v0, v0); 2937 __ BIND(L_rounds_44); 2938 __ aesd(v0, v21); __ aesimc(v0, v0); 2939 __ aesd(v0, v22); __ aesimc(v0, v0); 2940 __ aesd(v0, v23); __ aesimc(v0, v0); 2941 __ aesd(v0, v24); __ aesimc(v0, v0); 2942 __ aesd(v0, v25); __ aesimc(v0, v0); 2943 __ aesd(v0, v26); __ aesimc(v0, v0); 2944 __ aesd(v0, v27); __ aesimc(v0, v0); 2945 __ aesd(v0, v28); __ aesimc(v0, v0); 2946 __ aesd(v0, v29); __ aesimc(v0, v0); 2947 __ aesd(v0, v30); 2948 __ eor(v0, __ T16B, v0, v31); 2949 __ eor(v0, __ T16B, v0, v2); 2950 2951 __ st1(v0, __ T16B, __ post(to, 16)); 2952 __ orr(v2, __ T16B, v1, v1); 2953 2954 __ subw(len_reg, len_reg, 16); 2955 __ cbnzw(len_reg, L_aes_loop); 2956 2957 __ st1(v2, __ T16B, rvec); 2958 2959 __ mov(r0, rscratch2); 2960 2961 __ leave(); 2962 __ ret(lr); 2963 2964 return start; 2965 } 2966 2967 // CTR AES crypt. 2968 // Arguments: 2969 // 2970 // Inputs: 2971 // c_rarg0 - source byte array address 2972 // c_rarg1 - destination byte array address 2973 // c_rarg2 - K (key) in little endian int array 2974 // c_rarg3 - counter vector byte array address 2975 // c_rarg4 - input length 2976 // c_rarg5 - saved encryptedCounter start 2977 // c_rarg6 - saved used length 2978 // 2979 // Output: 2980 // r0 - input length 2981 // 2982 address generate_counterMode_AESCrypt() { 2983 const Register in = c_rarg0; 2984 const Register out = c_rarg1; 2985 const Register key = c_rarg2; 2986 const Register counter = c_rarg3; 2987 const Register saved_len = c_rarg4, len = r10; 2988 const Register saved_encrypted_ctr = c_rarg5; 2989 const Register used_ptr = c_rarg6, used = r12; 2990 2991 const Register offset = r7; 2992 const Register keylen = r11; 2993 2994 const unsigned char block_size = 16; 2995 const int bulk_width = 4; 2996 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 2997 // performance with larger data sizes, but it also means that the 2998 // fast path isn't used until you have at least 8 blocks, and up 2999 // to 127 bytes of data will be executed on the slow path. For 3000 // that reason, and also so as not to blow away too much icache, 4 3001 // blocks seems like a sensible compromise. 
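    // Note (illustrative, background on 'keylen' here and in the block
    // cipher stubs above): the expanded key array holds 4 * (rounds + 1)
    // ints, i.e. 44 ints for AES-128 (10 rounds), 52 for AES-192
    // (12 rounds) and 60 for AES-256 (14 rounds); the cmpw(keylen, 44)
    // and cmpw(keylen, 52) tests select the number of rounds to run.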
3002 3003 // Algorithm: 3004 // 3005 // if (len == 0) { 3006 // goto DONE; 3007 // } 3008 // int result = len; 3009 // do { 3010 // if (used >= blockSize) { 3011 // if (len >= bulk_width * blockSize) { 3012 // CTR_large_block(); 3013 // if (len == 0) 3014 // goto DONE; 3015 // } 3016 // for (;;) { 3017 // 16ByteVector v0 = counter; 3018 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3019 // used = 0; 3020 // if (len < blockSize) 3021 // break; /* goto NEXT */ 3022 // 16ByteVector v1 = load16Bytes(in, offset); 3023 // v1 = v1 ^ encryptedCounter; 3024 // store16Bytes(out, offset); 3025 // used = blockSize; 3026 // offset += blockSize; 3027 // len -= blockSize; 3028 // if (len == 0) 3029 // goto DONE; 3030 // } 3031 // } 3032 // NEXT: 3033 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3034 // len--; 3035 // } while (len != 0); 3036 // DONE: 3037 // return result; 3038 // 3039 // CTR_large_block() 3040 // Wide bulk encryption of whole blocks. 3041 3042 __ align(CodeEntryAlignment); 3043 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 3044 const address start = __ pc(); 3045 __ enter(); 3046 3047 Label DONE, CTR_large_block, large_block_return; 3048 __ ldrw(used, Address(used_ptr)); 3049 __ cbzw(saved_len, DONE); 3050 3051 __ mov(len, saved_len); 3052 __ mov(offset, 0); 3053 3054 // Compute #rounds for AES based on the length of the key array 3055 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3056 3057 __ aesenc_loadkeys(key, keylen); 3058 3059 { 3060 Label L_CTR_loop, NEXT; 3061 3062 __ bind(L_CTR_loop); 3063 3064 __ cmp(used, block_size); 3065 __ br(__ LO, NEXT); 3066 3067 // Maybe we have a lot of data 3068 __ subsw(rscratch1, len, bulk_width * block_size); 3069 __ br(__ HS, CTR_large_block); 3070 __ BIND(large_block_return); 3071 __ cbzw(len, DONE); 3072 3073 // Setup the counter 3074 __ movi(v4, __ T4S, 0); 3075 __ movi(v5, __ T4S, 1); 3076 __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 } 3077 3078 __ ld1(v0, __ T16B, counter); // Load the counter into v0 3079 __ rev32(v16, __ T16B, v0); 3080 __ addv(v16, __ T4S, v16, v4); 3081 __ rev32(v16, __ T16B, v16); 3082 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3083 3084 { 3085 // We have fewer than bulk_width blocks of data left. Encrypt 3086 // them one by one until there is less than a full block 3087 // remaining, being careful to save both the encrypted counter 3088 // and the counter. 3089 3090 Label inner_loop; 3091 __ bind(inner_loop); 3092 // Counter to encrypt is in v0 3093 __ aesecb_encrypt(noreg, noreg, keylen); 3094 __ st1(v0, __ T16B, saved_encrypted_ctr); 3095 3096 // Do we have a remaining full block? 3097 3098 __ mov(used, 0); 3099 __ cmp(len, block_size); 3100 __ br(__ LO, NEXT); 3101 3102 // Yes, we have a full block 3103 __ ldrq(v1, Address(in, offset)); 3104 __ eor(v1, __ T16B, v1, v0); 3105 __ strq(v1, Address(out, offset)); 3106 __ mov(used, block_size); 3107 __ add(offset, offset, block_size); 3108 3109 __ subw(len, len, block_size); 3110 __ cbzw(len, DONE); 3111 3112 // Increment the counter, store it back 3113 __ orr(v0, __ T16B, v16, v16); 3114 __ rev32(v16, __ T16B, v16); 3115 __ addv(v16, __ T4S, v16, v4); 3116 __ rev32(v16, __ T16B, v16); 3117 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3118 3119 __ b(inner_loop); 3120 } 3121 3122 __ BIND(NEXT); 3123 3124 // Encrypt a single byte, and loop. 3125 // We expect this to be a rare event. 
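    // Illustrative sketch of the byte-at-a-time path below:
    //   out[offset] = in[offset] ^ encryptedCounter[used];
    //   offset++; used++; len--;
    // i.e. leftover keystream bytes from the last encrypted counter block
    // are consumed one at a time until a full block is needed again.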
3126 __ ldrb(rscratch1, Address(in, offset)); 3127 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3128 __ eor(rscratch1, rscratch1, rscratch2); 3129 __ strb(rscratch1, Address(out, offset)); 3130 __ add(offset, offset, 1); 3131 __ add(used, used, 1); 3132 __ subw(len, len,1); 3133 __ cbnzw(len, L_CTR_loop); 3134 } 3135 3136 __ bind(DONE); 3137 __ strw(used, Address(used_ptr)); 3138 __ mov(r0, saved_len); 3139 3140 __ leave(); // required for proper stackwalking of RuntimeStub frame 3141 __ ret(lr); 3142 3143 // Bulk encryption 3144 3145 __ BIND (CTR_large_block); 3146 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3147 3148 if (bulk_width == 8) { 3149 __ sub(sp, sp, 4 * 16); 3150 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3151 } 3152 __ sub(sp, sp, 4 * 16); 3153 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3154 RegSet saved_regs = (RegSet::of(in, out, offset) 3155 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3156 __ push(saved_regs, sp); 3157 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3158 __ add(in, in, offset); 3159 __ add(out, out, offset); 3160 3161 // Keys should already be loaded into the correct registers 3162 3163 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3164 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3165 3166 // AES/CTR loop 3167 { 3168 Label L_CTR_loop; 3169 __ BIND(L_CTR_loop); 3170 3171 // Setup the counters 3172 __ movi(v8, __ T4S, 0); 3173 __ movi(v9, __ T4S, 1); 3174 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3175 3176 for (FloatRegister f = v0; f < v0 + bulk_width; f++) { 3177 __ rev32(f, __ T16B, v16); 3178 __ addv(v16, __ T4S, v16, v8); 3179 } 3180 3181 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3182 3183 // Encrypt the counters 3184 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3185 3186 if (bulk_width == 8) { 3187 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3188 } 3189 3190 // XOR the encrypted counters with the inputs 3191 for (int i = 0; i < bulk_width; i++) { 3192 __ eor(v0 + i, __ T16B, v0 + i, v8 + i); 3193 } 3194 3195 // Write the encrypted data 3196 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3197 if (bulk_width == 8) { 3198 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3199 } 3200 3201 __ subw(len, len, 16 * bulk_width); 3202 __ cbnzw(len, L_CTR_loop); 3203 } 3204 3205 // Save the counter back where it goes 3206 __ rev32(v16, __ T16B, v16); 3207 __ st1(v16, __ T16B, counter); 3208 3209 __ pop(saved_regs, sp); 3210 3211 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3212 if (bulk_width == 8) { 3213 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3214 } 3215 3216 __ andr(rscratch1, len, -16 * bulk_width); 3217 __ sub(len, len, rscratch1); 3218 __ add(offset, offset, rscratch1); 3219 __ mov(used, 16); 3220 __ strw(used, Address(used_ptr)); 3221 __ b(large_block_return); 3222 3223 return start; 3224 } 3225 3226 // Arguments: 3227 // 3228 // Inputs: 3229 // c_rarg0 - byte[] source+offset 3230 // c_rarg1 - int[] SHA.state 3231 // c_rarg2 - int offset 3232 // c_rarg3 - int limit 3233 // 3234 address generate_md5_implCompress(bool multi_block, const char *name) { 3235 __ align(CodeEntryAlignment); 3236 StubCodeMark mark(this, "StubRoutines", name); 3237 address start = __ pc(); 3238 3239 Register buf = c_rarg0; 3240 Register state = c_rarg1; 3241 Register ofs = c_rarg2; 3242 Register limit = c_rarg3; 3243 Register a = r4; 3244 Register b = r5; 3245 Register c = r6; 
3246 Register d = r7; 3247 Register rscratch3 = r10; 3248 Register rscratch4 = r11; 3249 3250 Label keys; 3251 Label md5_loop; 3252 3253 __ BIND(md5_loop); 3254 3255 // Save hash values for addition after rounds 3256 __ ldrw(a, Address(state, 0)); 3257 __ ldrw(b, Address(state, 4)); 3258 __ ldrw(c, Address(state, 8)); 3259 __ ldrw(d, Address(state, 12)); 3260 3261 #define FF(r1, r2, r3, r4, k, s, t) \ 3262 __ eorw(rscratch3, r3, r4); \ 3263 __ movw(rscratch2, t); \ 3264 __ andw(rscratch3, rscratch3, r2); \ 3265 __ addw(rscratch4, r1, rscratch2); \ 3266 __ ldrw(rscratch1, Address(buf, k*4)); \ 3267 __ eorw(rscratch3, rscratch3, r4); \ 3268 __ addw(rscratch3, rscratch3, rscratch1); \ 3269 __ addw(rscratch3, rscratch3, rscratch4); \ 3270 __ rorw(rscratch2, rscratch3, 32 - s); \ 3271 __ addw(r1, rscratch2, r2); 3272 3273 #define GG(r1, r2, r3, r4, k, s, t) \ 3274 __ eorw(rscratch2, r2, r3); \ 3275 __ ldrw(rscratch1, Address(buf, k*4)); \ 3276 __ andw(rscratch3, rscratch2, r4); \ 3277 __ movw(rscratch2, t); \ 3278 __ eorw(rscratch3, rscratch3, r3); \ 3279 __ addw(rscratch4, r1, rscratch2); \ 3280 __ addw(rscratch3, rscratch3, rscratch1); \ 3281 __ addw(rscratch3, rscratch3, rscratch4); \ 3282 __ rorw(rscratch2, rscratch3, 32 - s); \ 3283 __ addw(r1, rscratch2, r2); 3284 3285 #define HH(r1, r2, r3, r4, k, s, t) \ 3286 __ eorw(rscratch3, r3, r4); \ 3287 __ movw(rscratch2, t); \ 3288 __ addw(rscratch4, r1, rscratch2); \ 3289 __ ldrw(rscratch1, Address(buf, k*4)); \ 3290 __ eorw(rscratch3, rscratch3, r2); \ 3291 __ addw(rscratch3, rscratch3, rscratch1); \ 3292 __ addw(rscratch3, rscratch3, rscratch4); \ 3293 __ rorw(rscratch2, rscratch3, 32 - s); \ 3294 __ addw(r1, rscratch2, r2); 3295 3296 #define II(r1, r2, r3, r4, k, s, t) \ 3297 __ movw(rscratch3, t); \ 3298 __ ornw(rscratch2, r2, r4); \ 3299 __ addw(rscratch4, r1, rscratch3); \ 3300 __ ldrw(rscratch1, Address(buf, k*4)); \ 3301 __ eorw(rscratch3, rscratch2, r3); \ 3302 __ addw(rscratch3, rscratch3, rscratch1); \ 3303 __ addw(rscratch3, rscratch3, rscratch4); \ 3304 __ rorw(rscratch2, rscratch3, 32 - s); \ 3305 __ addw(r1, rscratch2, r2); 3306 3307 // Round 1 3308 FF(a, b, c, d, 0, 7, 0xd76aa478) 3309 FF(d, a, b, c, 1, 12, 0xe8c7b756) 3310 FF(c, d, a, b, 2, 17, 0x242070db) 3311 FF(b, c, d, a, 3, 22, 0xc1bdceee) 3312 FF(a, b, c, d, 4, 7, 0xf57c0faf) 3313 FF(d, a, b, c, 5, 12, 0x4787c62a) 3314 FF(c, d, a, b, 6, 17, 0xa8304613) 3315 FF(b, c, d, a, 7, 22, 0xfd469501) 3316 FF(a, b, c, d, 8, 7, 0x698098d8) 3317 FF(d, a, b, c, 9, 12, 0x8b44f7af) 3318 FF(c, d, a, b, 10, 17, 0xffff5bb1) 3319 FF(b, c, d, a, 11, 22, 0x895cd7be) 3320 FF(a, b, c, d, 12, 7, 0x6b901122) 3321 FF(d, a, b, c, 13, 12, 0xfd987193) 3322 FF(c, d, a, b, 14, 17, 0xa679438e) 3323 FF(b, c, d, a, 15, 22, 0x49b40821) 3324 3325 // Round 2 3326 GG(a, b, c, d, 1, 5, 0xf61e2562) 3327 GG(d, a, b, c, 6, 9, 0xc040b340) 3328 GG(c, d, a, b, 11, 14, 0x265e5a51) 3329 GG(b, c, d, a, 0, 20, 0xe9b6c7aa) 3330 GG(a, b, c, d, 5, 5, 0xd62f105d) 3331 GG(d, a, b, c, 10, 9, 0x02441453) 3332 GG(c, d, a, b, 15, 14, 0xd8a1e681) 3333 GG(b, c, d, a, 4, 20, 0xe7d3fbc8) 3334 GG(a, b, c, d, 9, 5, 0x21e1cde6) 3335 GG(d, a, b, c, 14, 9, 0xc33707d6) 3336 GG(c, d, a, b, 3, 14, 0xf4d50d87) 3337 GG(b, c, d, a, 8, 20, 0x455a14ed) 3338 GG(a, b, c, d, 13, 5, 0xa9e3e905) 3339 GG(d, a, b, c, 2, 9, 0xfcefa3f8) 3340 GG(c, d, a, b, 7, 14, 0x676f02d9) 3341 GG(b, c, d, a, 12, 20, 0x8d2a4c8a) 3342 3343 // Round 3 3344 HH(a, b, c, d, 5, 4, 0xfffa3942) 3345 HH(d, a, b, c, 8, 11, 0x8771f681) 3346 HH(c, d, a, b, 11, 16, 0x6d9d6122) 3347 
HH(b, c, d, a, 14, 23, 0xfde5380c) 3348 HH(a, b, c, d, 1, 4, 0xa4beea44) 3349 HH(d, a, b, c, 4, 11, 0x4bdecfa9) 3350 HH(c, d, a, b, 7, 16, 0xf6bb4b60) 3351 HH(b, c, d, a, 10, 23, 0xbebfbc70) 3352 HH(a, b, c, d, 13, 4, 0x289b7ec6) 3353 HH(d, a, b, c, 0, 11, 0xeaa127fa) 3354 HH(c, d, a, b, 3, 16, 0xd4ef3085) 3355 HH(b, c, d, a, 6, 23, 0x04881d05) 3356 HH(a, b, c, d, 9, 4, 0xd9d4d039) 3357 HH(d, a, b, c, 12, 11, 0xe6db99e5) 3358 HH(c, d, a, b, 15, 16, 0x1fa27cf8) 3359 HH(b, c, d, a, 2, 23, 0xc4ac5665) 3360 3361 // Round 4 3362 II(a, b, c, d, 0, 6, 0xf4292244) 3363 II(d, a, b, c, 7, 10, 0x432aff97) 3364 II(c, d, a, b, 14, 15, 0xab9423a7) 3365 II(b, c, d, a, 5, 21, 0xfc93a039) 3366 II(a, b, c, d, 12, 6, 0x655b59c3) 3367 II(d, a, b, c, 3, 10, 0x8f0ccc92) 3368 II(c, d, a, b, 10, 15, 0xffeff47d) 3369 II(b, c, d, a, 1, 21, 0x85845dd1) 3370 II(a, b, c, d, 8, 6, 0x6fa87e4f) 3371 II(d, a, b, c, 15, 10, 0xfe2ce6e0) 3372 II(c, d, a, b, 6, 15, 0xa3014314) 3373 II(b, c, d, a, 13, 21, 0x4e0811a1) 3374 II(a, b, c, d, 4, 6, 0xf7537e82) 3375 II(d, a, b, c, 11, 10, 0xbd3af235) 3376 II(c, d, a, b, 2, 15, 0x2ad7d2bb) 3377 II(b, c, d, a, 9, 21, 0xeb86d391) 3378 3379 #undef FF 3380 #undef GG 3381 #undef HH 3382 #undef II 3383 3384 // write hash values back in the correct order 3385 __ ldrw(rscratch1, Address(state, 0)); 3386 __ addw(rscratch1, rscratch1, a); 3387 __ strw(rscratch1, Address(state, 0)); 3388 3389 __ ldrw(rscratch2, Address(state, 4)); 3390 __ addw(rscratch2, rscratch2, b); 3391 __ strw(rscratch2, Address(state, 4)); 3392 3393 __ ldrw(rscratch3, Address(state, 8)); 3394 __ addw(rscratch3, rscratch3, c); 3395 __ strw(rscratch3, Address(state, 8)); 3396 3397 __ ldrw(rscratch4, Address(state, 12)); 3398 __ addw(rscratch4, rscratch4, d); 3399 __ strw(rscratch4, Address(state, 12)); 3400 3401 if (multi_block) { 3402 __ add(buf, buf, 64); 3403 __ add(ofs, ofs, 64); 3404 __ cmp(ofs, limit); 3405 __ br(Assembler::LE, md5_loop); 3406 __ mov(c_rarg0, ofs); // return ofs 3407 } 3408 3409 __ ret(lr); 3410 3411 return start; 3412 } 3413 3414 // Arguments: 3415 // 3416 // Inputs: 3417 // c_rarg0 - byte[] source+offset 3418 // c_rarg1 - int[] SHA.state 3419 // c_rarg2 - int offset 3420 // c_rarg3 - int limit 3421 // 3422 address generate_sha1_implCompress(bool multi_block, const char *name) { 3423 __ align(CodeEntryAlignment); 3424 StubCodeMark mark(this, "StubRoutines", name); 3425 address start = __ pc(); 3426 3427 Register buf = c_rarg0; 3428 Register state = c_rarg1; 3429 Register ofs = c_rarg2; 3430 Register limit = c_rarg3; 3431 3432 Label keys; 3433 Label sha1_loop; 3434 3435 // load the keys into v0..v3 3436 __ adr(rscratch1, keys); 3437 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3438 // load 5 words state into v6, v7 3439 __ ldrq(v6, Address(state, 0)); 3440 __ ldrs(v7, Address(state, 16)); 3441 3442 3443 __ BIND(sha1_loop); 3444 // load 64 bytes of data into v16..v19 3445 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3446 __ rev32(v16, __ T16B, v16); 3447 __ rev32(v17, __ T16B, v17); 3448 __ rev32(v18, __ T16B, v18); 3449 __ rev32(v19, __ T16B, v19); 3450 3451 // do the sha1 3452 __ addv(v4, __ T4S, v16, v0); 3453 __ orr(v20, __ T16B, v6, v6); 3454 3455 FloatRegister d0 = v16; 3456 FloatRegister d1 = v17; 3457 FloatRegister d2 = v18; 3458 FloatRegister d3 = v19; 3459 3460 for (int round = 0; round < 20; round++) { 3461 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3462 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3463 FloatRegister tmp3 = round ? ((round & 1) ? 
v22 : v21) : v7; 3464 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3465 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3466 3467 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3468 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3469 __ sha1h(tmp2, __ T4S, v20); 3470 if (round < 5) 3471 __ sha1c(v20, __ T4S, tmp3, tmp4); 3472 else if (round < 10 || round >= 15) 3473 __ sha1p(v20, __ T4S, tmp3, tmp4); 3474 else 3475 __ sha1m(v20, __ T4S, tmp3, tmp4); 3476 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3477 3478 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3479 } 3480 3481 __ addv(v7, __ T2S, v7, v21); 3482 __ addv(v6, __ T4S, v6, v20); 3483 3484 if (multi_block) { 3485 __ add(ofs, ofs, 64); 3486 __ cmp(ofs, limit); 3487 __ br(Assembler::LE, sha1_loop); 3488 __ mov(c_rarg0, ofs); // return ofs 3489 } 3490 3491 __ strq(v6, Address(state, 0)); 3492 __ strs(v7, Address(state, 16)); 3493 3494 __ ret(lr); 3495 3496 __ bind(keys); 3497 __ emit_int32(0x5a827999); 3498 __ emit_int32(0x6ed9eba1); 3499 __ emit_int32(0x8f1bbcdc); 3500 __ emit_int32(0xca62c1d6); 3501 3502 return start; 3503 } 3504 3505 3506 // Arguments: 3507 // 3508 // Inputs: 3509 // c_rarg0 - byte[] source+offset 3510 // c_rarg1 - int[] SHA.state 3511 // c_rarg2 - int offset 3512 // c_rarg3 - int limit 3513 // 3514 address generate_sha256_implCompress(bool multi_block, const char *name) { 3515 static const uint32_t round_consts[64] = { 3516 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3517 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3518 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3519 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3520 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3521 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3522 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3523 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3524 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3525 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3526 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3527 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3528 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3529 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3530 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3531 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3532 }; 3533 __ align(CodeEntryAlignment); 3534 StubCodeMark mark(this, "StubRoutines", name); 3535 address start = __ pc(); 3536 3537 Register buf = c_rarg0; 3538 Register state = c_rarg1; 3539 Register ofs = c_rarg2; 3540 Register limit = c_rarg3; 3541 3542 Label sha1_loop; 3543 3544 __ stpd(v8, v9, __ pre(sp, -32)); 3545 __ stpd(v10, v11, Address(sp, 16)); 3546 3547 // dga == v0 3548 // dgb == v1 3549 // dg0 == v2 3550 // dg1 == v3 3551 // dg2 == v4 3552 // t0 == v6 3553 // t1 == v7 3554 3555 // load 16 keys to v16..v31 3556 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3557 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3558 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3559 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3560 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3561 3562 // load 8 words (256 bits) state 3563 __ ldpq(v0, v1, state); 3564 3565 __ BIND(sha1_loop); 3566 // load 64 bytes of data into v8..v11 3567 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3568 __ rev32(v8, __ T16B, v8); 3569 __ rev32(v9, __ T16B, v9); 3570 __ rev32(v10, __ T16B, v10); 3571 __ rev32(v11, __ T16B, v11); 3572 3573 __ addv(v6, __ T4S, v8, v16); 3574 __ orr(v2, __ T16B, v0, v0); 3575 __ orr(v3, __ T16B, v1, v1); 3576 3577 FloatRegister d0 = v8; 3578 FloatRegister d1 = v9; 3579 FloatRegister d2 = v10; 3580 FloatRegister d3 = v11; 3581 3582 3583 for (int round = 0; round < 16; round++) { 3584 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3585 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3586 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3587 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3588 3589 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3590 __ orr(v4, __ T16B, v2, v2); 3591 if (round < 15) 3592 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3593 __ sha256h(v2, __ T4S, v3, tmp2); 3594 __ sha256h2(v3, __ T4S, v4, tmp2); 3595 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3596 3597 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3598 } 3599 3600 __ addv(v0, __ T4S, v0, v2); 3601 __ addv(v1, __ T4S, v1, v3); 3602 3603 if (multi_block) { 3604 __ add(ofs, ofs, 64); 3605 __ cmp(ofs, limit); 3606 __ br(Assembler::LE, sha1_loop); 3607 __ mov(c_rarg0, ofs); // return ofs 3608 } 3609 3610 __ ldpd(v10, v11, Address(sp, 16)); 3611 __ ldpd(v8, v9, __ post(sp, 32)); 3612 3613 __ stpq(v0, v1, state); 3614 3615 __ ret(lr); 3616 3617 return start; 3618 } 3619 3620 // Arguments: 3621 // 3622 // Inputs: 3623 // c_rarg0 - byte[] source+offset 3624 // c_rarg1 - int[] SHA.state 3625 // c_rarg2 - int offset 3626 // c_rarg3 - int limit 3627 // 3628 address generate_sha512_implCompress(bool multi_block, const char *name) { 3629 static const uint64_t round_consts[80] = { 3630 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3631 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3632 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3633 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3634 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3635 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3636 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3637 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3638 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3639 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3640 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3641 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3642 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3643 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3644 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3645 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3646 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3647 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3648 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3649 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3650 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3651 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3652 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3653 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3654 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3655 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3656 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 
3657 }; 3658 3659 // Double rounds for sha512. 3660 #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \ 3661 if (dr < 36) \ 3662 __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16)); \ 3663 __ addv(v5, __ T2D, v##rc0, v##in0); \ 3664 __ ext(v6, __ T16B, v##i2, v##i3, 8); \ 3665 __ ext(v5, __ T16B, v5, v5, 8); \ 3666 __ ext(v7, __ T16B, v##i1, v##i2, 8); \ 3667 __ addv(v##i3, __ T2D, v##i3, v5); \ 3668 if (dr < 32) { \ 3669 __ ext(v5, __ T16B, v##in3, v##in4, 8); \ 3670 __ sha512su0(v##in0, __ T2D, v##in1); \ 3671 } \ 3672 __ sha512h(v##i3, __ T2D, v6, v7); \ 3673 if (dr < 32) \ 3674 __ sha512su1(v##in0, __ T2D, v##in2, v5); \ 3675 __ addv(v##i4, __ T2D, v##i1, v##i3); \ 3676 __ sha512h2(v##i3, __ T2D, v##i1, v##i0); \ 3677 3678 __ align(CodeEntryAlignment); 3679 StubCodeMark mark(this, "StubRoutines", name); 3680 address start = __ pc(); 3681 3682 Register buf = c_rarg0; 3683 Register state = c_rarg1; 3684 Register ofs = c_rarg2; 3685 Register limit = c_rarg3; 3686 3687 __ stpd(v8, v9, __ pre(sp, -64)); 3688 __ stpd(v10, v11, Address(sp, 16)); 3689 __ stpd(v12, v13, Address(sp, 32)); 3690 __ stpd(v14, v15, Address(sp, 48)); 3691 3692 Label sha512_loop; 3693 3694 // load state 3695 __ ld1(v8, v9, v10, v11, __ T2D, state); 3696 3697 // load first 4 round constants 3698 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3699 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3700 3701 __ BIND(sha512_loop); 3702 // load 128B of data into v12..v19 3703 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3704 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3705 __ rev64(v12, __ T16B, v12); 3706 __ rev64(v13, __ T16B, v13); 3707 __ rev64(v14, __ T16B, v14); 3708 __ rev64(v15, __ T16B, v15); 3709 __ rev64(v16, __ T16B, v16); 3710 __ rev64(v17, __ T16B, v17); 3711 __ rev64(v18, __ T16B, v18); 3712 __ rev64(v19, __ T16B, v19); 3713 3714 __ mov(rscratch2, rscratch1); 3715 3716 __ mov(v0, __ T16B, v8); 3717 __ mov(v1, __ T16B, v9); 3718 __ mov(v2, __ T16B, v10); 3719 __ mov(v3, __ T16B, v11); 3720 3721 sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17); 3722 sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18); 3723 sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19); 3724 sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12); 3725 sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13); 3726 sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14); 3727 sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15); 3728 sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16); 3729 sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17); 3730 sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18); 3731 sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19); 3732 sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12); 3733 sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13); 3734 sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14); 3735 sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15); 3736 sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16); 3737 sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17); 3738 sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18); 3739 sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19); 3740 sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12); 3741 sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13); 3742 sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14); 3743 
sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15); 3744 sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16); 3745 sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17); 3746 sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18); 3747 sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19); 3748 sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12); 3749 sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13); 3750 sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14); 3751 sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15); 3752 sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16); 3753 sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12, 0, 0, 0, 0); 3754 sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13, 0, 0, 0, 0); 3755 sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14, 0, 0, 0, 0); 3756 sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15, 0, 0, 0, 0); 3757 sha512_dround(36, 3, 0, 4, 2, 1, 24, 0, 16, 0, 0, 0, 0); 3758 sha512_dround(37, 2, 3, 1, 4, 0, 25, 0, 17, 0, 0, 0, 0); 3759 sha512_dround(38, 4, 2, 0, 1, 3, 26, 0, 18, 0, 0, 0, 0); 3760 sha512_dround(39, 1, 4, 3, 0, 2, 27, 0, 19, 0, 0, 0, 0); 3761 3762 __ addv(v8, __ T2D, v8, v0); 3763 __ addv(v9, __ T2D, v9, v1); 3764 __ addv(v10, __ T2D, v10, v2); 3765 __ addv(v11, __ T2D, v11, v3); 3766 3767 if (multi_block) { 3768 __ add(ofs, ofs, 128); 3769 __ cmp(ofs, limit); 3770 __ br(Assembler::LE, sha512_loop); 3771 __ mov(c_rarg0, ofs); // return ofs 3772 } 3773 3774 __ st1(v8, v9, v10, v11, __ T2D, state); 3775 3776 __ ldpd(v14, v15, Address(sp, 48)); 3777 __ ldpd(v12, v13, Address(sp, 32)); 3778 __ ldpd(v10, v11, Address(sp, 16)); 3779 __ ldpd(v8, v9, __ post(sp, 64)); 3780 3781 __ ret(lr); 3782 3783 return start; 3784 } 3785 3786 // Arguments: 3787 // 3788 // Inputs: 3789 // c_rarg0 - byte[] source+offset 3790 // c_rarg1 - byte[] SHA.state 3791 // c_rarg2 - int digest_length 3792 // c_rarg3 - int offset 3793 // c_rarg4 - int limit 3794 // 3795 address generate_sha3_implCompress(bool multi_block, const char *name) { 3796 static const uint64_t round_consts[24] = { 3797 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 3798 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 3799 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 3800 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 3801 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 3802 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 3803 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 3804 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 3805 }; 3806 3807 __ align(CodeEntryAlignment); 3808 StubCodeMark mark(this, "StubRoutines", name); 3809 address start = __ pc(); 3810 3811 Register buf = c_rarg0; 3812 Register state = c_rarg1; 3813 Register digest_length = c_rarg2; 3814 Register ofs = c_rarg3; 3815 Register limit = c_rarg4; 3816 3817 Label sha3_loop, rounds24_loop; 3818 Label sha3_512, sha3_384_or_224, sha3_256; 3819 3820 __ stpd(v8, v9, __ pre(sp, -64)); 3821 __ stpd(v10, v11, Address(sp, 16)); 3822 __ stpd(v12, v13, Address(sp, 32)); 3823 __ stpd(v14, v15, Address(sp, 48)); 3824 3825 // load state 3826 __ add(rscratch1, state, 32); 3827 __ ld1(v0, v1, v2, v3, __ T1D, state); 3828 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 3829 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 3830 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 3831 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 
3832 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 3833 __ ld1(v24, __ T1D, rscratch1); 3834 3835 __ BIND(sha3_loop); 3836 3837 // 24 keccak rounds 3838 __ movw(rscratch2, 24); 3839 3840 // load round_constants base 3841 __ lea(rscratch1, ExternalAddress((address) round_consts)); 3842 3843 // load input 3844 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 3845 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 3846 __ eor(v0, __ T8B, v0, v25); 3847 __ eor(v1, __ T8B, v1, v26); 3848 __ eor(v2, __ T8B, v2, v27); 3849 __ eor(v3, __ T8B, v3, v28); 3850 __ eor(v4, __ T8B, v4, v29); 3851 __ eor(v5, __ T8B, v5, v30); 3852 __ eor(v6, __ T8B, v6, v31); 3853 3854 // digest_length == 64, SHA3-512 3855 __ tbnz(digest_length, 6, sha3_512); 3856 3857 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 3858 __ ld1(v29, v30, __ T8B, __ post(buf, 16)); 3859 __ eor(v7, __ T8B, v7, v25); 3860 __ eor(v8, __ T8B, v8, v26); 3861 __ eor(v9, __ T8B, v9, v27); 3862 __ eor(v10, __ T8B, v10, v28); 3863 __ eor(v11, __ T8B, v11, v29); 3864 __ eor(v12, __ T8B, v12, v30); 3865 3866 // digest_length == 28, SHA3-224; digest_length == 48, SHA3-384 3867 __ tbnz(digest_length, 4, sha3_384_or_224); 3868 3869 // SHA3-256 3870 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 3871 __ eor(v13, __ T8B, v13, v25); 3872 __ eor(v14, __ T8B, v14, v26); 3873 __ eor(v15, __ T8B, v15, v27); 3874 __ eor(v16, __ T8B, v16, v28); 3875 __ b(rounds24_loop); 3876 3877 __ BIND(sha3_384_or_224); 3878 __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA-384 3879 3880 // SHA3-224 3881 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 3882 __ ld1(v29, __ T8B, __ post(buf, 8)); 3883 __ eor(v13, __ T8B, v13, v25); 3884 __ eor(v14, __ T8B, v14, v26); 3885 __ eor(v15, __ T8B, v15, v27); 3886 __ eor(v16, __ T8B, v16, v28); 3887 __ eor(v17, __ T8B, v17, v29); 3888 __ b(rounds24_loop); 3889 3890 __ BIND(sha3_512); 3891 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 3892 __ eor(v7, __ T8B, v7, v25); 3893 __ eor(v8, __ T8B, v8, v26); 3894 3895 __ BIND(rounds24_loop); 3896 __ subw(rscratch2, rscratch2, 1); 3897 3898 __ eor3(v29, __ T16B, v4, v9, v14); 3899 __ eor3(v26, __ T16B, v1, v6, v11); 3900 __ eor3(v28, __ T16B, v3, v8, v13); 3901 __ eor3(v25, __ T16B, v0, v5, v10); 3902 __ eor3(v27, __ T16B, v2, v7, v12); 3903 __ eor3(v29, __ T16B, v29, v19, v24); 3904 __ eor3(v26, __ T16B, v26, v16, v21); 3905 __ eor3(v28, __ T16B, v28, v18, v23); 3906 __ eor3(v25, __ T16B, v25, v15, v20); 3907 __ eor3(v27, __ T16B, v27, v17, v22); 3908 3909 __ rax1(v30, __ T2D, v29, v26); 3910 __ rax1(v26, __ T2D, v26, v28); 3911 __ rax1(v28, __ T2D, v28, v25); 3912 __ rax1(v25, __ T2D, v25, v27); 3913 __ rax1(v27, __ T2D, v27, v29); 3914 3915 __ eor(v0, __ T16B, v0, v30); 3916 __ xar(v29, __ T2D, v1, v25, (64 - 1)); 3917 __ xar(v1, __ T2D, v6, v25, (64 - 44)); 3918 __ xar(v6, __ T2D, v9, v28, (64 - 20)); 3919 __ xar(v9, __ T2D, v22, v26, (64 - 61)); 3920 __ xar(v22, __ T2D, v14, v28, (64 - 39)); 3921 __ xar(v14, __ T2D, v20, v30, (64 - 18)); 3922 __ xar(v31, __ T2D, v2, v26, (64 - 62)); 3923 __ xar(v2, __ T2D, v12, v26, (64 - 43)); 3924 __ xar(v12, __ T2D, v13, v27, (64 - 25)); 3925 __ xar(v13, __ T2D, v19, v28, (64 - 8)); 3926 __ xar(v19, __ T2D, v23, v27, (64 - 56)); 3927 __ xar(v23, __ T2D, v15, v30, (64 - 41)); 3928 __ xar(v15, __ T2D, v4, v28, (64 - 27)); 3929 __ xar(v28, __ T2D, v24, v28, (64 - 14)); 3930 __ xar(v24, __ T2D, v21, v25, (64 - 2)); 3931 __ xar(v8, __ T2D, v8, v27, (64 - 55)); 3932 __ xar(v4, __ T2D, v16, v25, (64 - 45)); 3933 
__ xar(v16, __ T2D, v5, v30, (64 - 36)); 3934 __ xar(v5, __ T2D, v3, v27, (64 - 28)); 3935 __ xar(v27, __ T2D, v18, v27, (64 - 21)); 3936 __ xar(v3, __ T2D, v17, v26, (64 - 15)); 3937 __ xar(v25, __ T2D, v11, v25, (64 - 10)); 3938 __ xar(v26, __ T2D, v7, v26, (64 - 6)); 3939 __ xar(v30, __ T2D, v10, v30, (64 - 3)); 3940 3941 __ bcax(v20, __ T16B, v31, v22, v8); 3942 __ bcax(v21, __ T16B, v8, v23, v22); 3943 __ bcax(v22, __ T16B, v22, v24, v23); 3944 __ bcax(v23, __ T16B, v23, v31, v24); 3945 __ bcax(v24, __ T16B, v24, v8, v31); 3946 3947 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); 3948 3949 __ bcax(v17, __ T16B, v25, v19, v3); 3950 __ bcax(v18, __ T16B, v3, v15, v19); 3951 __ bcax(v19, __ T16B, v19, v16, v15); 3952 __ bcax(v15, __ T16B, v15, v25, v16); 3953 __ bcax(v16, __ T16B, v16, v3, v25); 3954 3955 __ bcax(v10, __ T16B, v29, v12, v26); 3956 __ bcax(v11, __ T16B, v26, v13, v12); 3957 __ bcax(v12, __ T16B, v12, v14, v13); 3958 __ bcax(v13, __ T16B, v13, v29, v14); 3959 __ bcax(v14, __ T16B, v14, v26, v29); 3960 3961 __ bcax(v7, __ T16B, v30, v9, v4); 3962 __ bcax(v8, __ T16B, v4, v5, v9); 3963 __ bcax(v9, __ T16B, v9, v6, v5); 3964 __ bcax(v5, __ T16B, v5, v30, v6); 3965 __ bcax(v6, __ T16B, v6, v4, v30); 3966 3967 __ bcax(v3, __ T16B, v27, v0, v28); 3968 __ bcax(v4, __ T16B, v28, v1, v0); 3969 __ bcax(v0, __ T16B, v0, v2, v1); 3970 __ bcax(v1, __ T16B, v1, v27, v2); 3971 __ bcax(v2, __ T16B, v2, v28, v27); 3972 3973 __ eor(v0, __ T16B, v0, v31); 3974 3975 __ cbnzw(rscratch2, rounds24_loop); 3976 3977 if (multi_block) { 3978 // block_size = 200 - 2 * digest_length, ofs += block_size 3979 __ add(ofs, ofs, 200); 3980 __ sub(ofs, ofs, digest_length, Assembler::LSL, 1); 3981 3982 __ cmp(ofs, limit); 3983 __ br(Assembler::LE, sha3_loop); 3984 __ mov(c_rarg0, ofs); // return ofs 3985 } 3986 3987 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 3988 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 3989 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 3990 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 3991 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 3992 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 3993 __ st1(v24, __ T1D, state); 3994 3995 __ ldpd(v14, v15, Address(sp, 48)); 3996 __ ldpd(v12, v13, Address(sp, 32)); 3997 __ ldpd(v10, v11, Address(sp, 16)); 3998 __ ldpd(v8, v9, __ post(sp, 64)); 3999 4000 __ ret(lr); 4001 4002 return start; 4003 } 4004 4005 // Safefetch stubs. 4006 void generate_safefetch(const char* name, int size, address* entry, 4007 address* fault_pc, address* continuation_pc) { 4008 // safefetch signatures: 4009 // int SafeFetch32(int* adr, int errValue); 4010 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 4011 // 4012 // arguments: 4013 // c_rarg0 = adr 4014 // c_rarg1 = errValue 4015 // 4016 // result: 4017 // PPC_RET = *adr or errValue 4018 4019 StubCodeMark mark(this, "StubRoutines", name); 4020 4021 // Entry point, pc or function descriptor. 4022 *entry = __ pc(); 4023 4024 // Load *adr into c_rarg1, may fault. 
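    // If the load faults, the VM's signal handler looks up the faulting pc
    // (recorded in *fault_pc below) and resumes execution at *continuation_pc,
    // so the caller simply observes errValue instead of a crash.  A minimal
    // caller sketch (hypothetical helper name, for illustration only):
    //
    //   int probe_int(int* p) {
    //     return SafeFetch32(p, -1);   // -1 if p is unmapped, *p otherwise
    //   }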
4025 *fault_pc = __ pc(); 4026 switch (size) { 4027 case 4: 4028 // int32_t 4029 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 4030 break; 4031 case 8: 4032 // int64_t 4033 __ ldr(c_rarg1, Address(c_rarg0, 0)); 4034 break; 4035 default: 4036 ShouldNotReachHere(); 4037 } 4038 4039 // return errValue or *adr 4040 *continuation_pc = __ pc(); 4041 __ mov(r0, c_rarg1); 4042 __ ret(lr); 4043 } 4044 4045 /** 4046 * Arguments: 4047 * 4048 * Inputs: 4049 * c_rarg0 - int crc 4050 * c_rarg1 - byte* buf 4051 * c_rarg2 - int length 4052 * 4053 * Ouput: 4054 * rax - int crc result 4055 */ 4056 address generate_updateBytesCRC32() { 4057 assert(UseCRC32Intrinsics, "what are we doing here?"); 4058 4059 __ align(CodeEntryAlignment); 4060 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4061 4062 address start = __ pc(); 4063 4064 const Register crc = c_rarg0; // crc 4065 const Register buf = c_rarg1; // source java byte array address 4066 const Register len = c_rarg2; // length 4067 const Register table0 = c_rarg3; // crc_table address 4068 const Register table1 = c_rarg4; 4069 const Register table2 = c_rarg5; 4070 const Register table3 = c_rarg6; 4071 const Register tmp3 = c_rarg7; 4072 4073 BLOCK_COMMENT("Entry:"); 4074 __ enter(); // required for proper stackwalking of RuntimeStub frame 4075 4076 __ kernel_crc32(crc, buf, len, 4077 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4078 4079 __ leave(); // required for proper stackwalking of RuntimeStub frame 4080 __ ret(lr); 4081 4082 return start; 4083 } 4084 4085 /** 4086 * Arguments: 4087 * 4088 * Inputs: 4089 * c_rarg0 - int crc 4090 * c_rarg1 - byte* buf 4091 * c_rarg2 - int length 4092 * c_rarg3 - int* table 4093 * 4094 * Ouput: 4095 * r0 - int crc result 4096 */ 4097 address generate_updateBytesCRC32C() { 4098 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4099 4100 __ align(CodeEntryAlignment); 4101 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4102 4103 address start = __ pc(); 4104 4105 const Register crc = c_rarg0; // crc 4106 const Register buf = c_rarg1; // source java byte array address 4107 const Register len = c_rarg2; // length 4108 const Register table0 = c_rarg3; // crc_table address 4109 const Register table1 = c_rarg4; 4110 const Register table2 = c_rarg5; 4111 const Register table3 = c_rarg6; 4112 const Register tmp3 = c_rarg7; 4113 4114 BLOCK_COMMENT("Entry:"); 4115 __ enter(); // required for proper stackwalking of RuntimeStub frame 4116 4117 __ kernel_crc32c(crc, buf, len, 4118 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4119 4120 __ leave(); // required for proper stackwalking of RuntimeStub frame 4121 __ ret(lr); 4122 4123 return start; 4124 } 4125 4126 /*** 4127 * Arguments: 4128 * 4129 * Inputs: 4130 * c_rarg0 - int adler 4131 * c_rarg1 - byte* buff 4132 * c_rarg2 - int len 4133 * 4134 * Output: 4135 * c_rarg0 - int adler result 4136 */ 4137 address generate_updateBytesAdler32() { 4138 __ align(CodeEntryAlignment); 4139 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4140 address start = __ pc(); 4141 4142 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4143 4144 // Aliases 4145 Register adler = c_rarg0; 4146 Register s1 = c_rarg0; 4147 Register s2 = c_rarg3; 4148 Register buff = c_rarg1; 4149 Register len = c_rarg2; 4150 Register nmax = r4; 4151 Register base = r5; 4152 Register count = r6; 4153 Register temp0 = rscratch1; 4154 Register temp1 = rscratch2; 4155 FloatRegister vbytes = v0; 4156 
FloatRegister vs1acc = v1; 4157 FloatRegister vs2acc = v2; 4158 FloatRegister vtable = v3; 4159 4160 // Max number of bytes we can process before having to take the mod 4161 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4162 uint64_t BASE = 0xfff1; 4163 uint64_t NMAX = 0x15B0; 4164 4165 __ mov(base, BASE); 4166 __ mov(nmax, NMAX); 4167 4168 // Load accumulation coefficients for the upper 16 bits 4169 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4170 __ ld1(vtable, __ T16B, Address(temp0)); 4171 4172 // s1 is initialized to the lower 16 bits of adler 4173 // s2 is initialized to the upper 16 bits of adler 4174 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4175 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4176 4177 // The pipelined loop needs at least 16 elements for 1 iteration 4178 // It does check this, but it is more effective to skip to the cleanup loop 4179 __ cmp(len, (u1)16); 4180 __ br(Assembler::HS, L_nmax); 4181 __ cbz(len, L_combine); 4182 4183 __ bind(L_simple_by1_loop); 4184 __ ldrb(temp0, Address(__ post(buff, 1))); 4185 __ add(s1, s1, temp0); 4186 __ add(s2, s2, s1); 4187 __ subs(len, len, 1); 4188 __ br(Assembler::HI, L_simple_by1_loop); 4189 4190 // s1 = s1 % BASE 4191 __ subs(temp0, s1, base); 4192 __ csel(s1, temp0, s1, Assembler::HS); 4193 4194 // s2 = s2 % BASE 4195 __ lsr(temp0, s2, 16); 4196 __ lsl(temp1, temp0, 4); 4197 __ sub(temp1, temp1, temp0); 4198 __ add(s2, temp1, s2, ext::uxth); 4199 4200 __ subs(temp0, s2, base); 4201 __ csel(s2, temp0, s2, Assembler::HS); 4202 4203 __ b(L_combine); 4204 4205 __ bind(L_nmax); 4206 __ subs(len, len, nmax); 4207 __ sub(count, nmax, 16); 4208 __ br(Assembler::LO, L_by16); 4209 4210 __ bind(L_nmax_loop); 4211 4212 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4213 vbytes, vs1acc, vs2acc, vtable); 4214 4215 __ subs(count, count, 16); 4216 __ br(Assembler::HS, L_nmax_loop); 4217 4218 // s1 = s1 % BASE 4219 __ lsr(temp0, s1, 16); 4220 __ lsl(temp1, temp0, 4); 4221 __ sub(temp1, temp1, temp0); 4222 __ add(temp1, temp1, s1, ext::uxth); 4223 4224 __ lsr(temp0, temp1, 16); 4225 __ lsl(s1, temp0, 4); 4226 __ sub(s1, s1, temp0); 4227 __ add(s1, s1, temp1, ext:: uxth); 4228 4229 __ subs(temp0, s1, base); 4230 __ csel(s1, temp0, s1, Assembler::HS); 4231 4232 // s2 = s2 % BASE 4233 __ lsr(temp0, s2, 16); 4234 __ lsl(temp1, temp0, 4); 4235 __ sub(temp1, temp1, temp0); 4236 __ add(temp1, temp1, s2, ext::uxth); 4237 4238 __ lsr(temp0, temp1, 16); 4239 __ lsl(s2, temp0, 4); 4240 __ sub(s2, s2, temp0); 4241 __ add(s2, s2, temp1, ext:: uxth); 4242 4243 __ subs(temp0, s2, base); 4244 __ csel(s2, temp0, s2, Assembler::HS); 4245 4246 __ subs(len, len, nmax); 4247 __ sub(count, nmax, 16); 4248 __ br(Assembler::HS, L_nmax_loop); 4249 4250 __ bind(L_by16); 4251 __ adds(len, len, count); 4252 __ br(Assembler::LO, L_by1); 4253 4254 __ bind(L_by16_loop); 4255 4256 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4257 vbytes, vs1acc, vs2acc, vtable); 4258 4259 __ subs(len, len, 16); 4260 __ br(Assembler::HS, L_by16_loop); 4261 4262 __ bind(L_by1); 4263 __ adds(len, len, 15); 4264 __ br(Assembler::LO, L_do_mod); 4265 4266 __ bind(L_by1_loop); 4267 __ ldrb(temp0, Address(__ post(buff, 1))); 4268 __ add(s1, temp0, s1); 4269 __ add(s2, s2, s1); 4270 __ subs(len, len, 1); 4271 __ br(Assembler::HS, L_by1_loop); 4272 4273 __ bind(L_do_mod); 4274 // s1 = s1 % BASE 4275 __ lsr(temp0, s1, 16); 4276 __ lsl(temp1, temp0, 4); 4277 __ sub(temp1, 
temp1, temp0); 4278 __ add(temp1, temp1, s1, ext::uxth); 4279 4280 __ lsr(temp0, temp1, 16); 4281 __ lsl(s1, temp0, 4); 4282 __ sub(s1, s1, temp0); 4283 __ add(s1, s1, temp1, ext:: uxth); 4284 4285 __ subs(temp0, s1, base); 4286 __ csel(s1, temp0, s1, Assembler::HS); 4287 4288 // s2 = s2 % BASE 4289 __ lsr(temp0, s2, 16); 4290 __ lsl(temp1, temp0, 4); 4291 __ sub(temp1, temp1, temp0); 4292 __ add(temp1, temp1, s2, ext::uxth); 4293 4294 __ lsr(temp0, temp1, 16); 4295 __ lsl(s2, temp0, 4); 4296 __ sub(s2, s2, temp0); 4297 __ add(s2, s2, temp1, ext:: uxth); 4298 4299 __ subs(temp0, s2, base); 4300 __ csel(s2, temp0, s2, Assembler::HS); 4301 4302 // Combine lower bits and higher bits 4303 __ bind(L_combine); 4304 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 4305 4306 __ ret(lr); 4307 4308 return start; 4309 } 4310 4311 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 4312 Register temp0, Register temp1, FloatRegister vbytes, 4313 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 4314 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 4315 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 4316 // In non-vectorized code, we update s1 and s2 as: 4317 // s1 <- s1 + b1 4318 // s2 <- s2 + s1 4319 // s1 <- s1 + b2 4320 // s2 <- s2 + b1 4321 // ... 4322 // s1 <- s1 + b16 4323 // s2 <- s2 + s1 4324 // Putting above assignments together, we have: 4325 // s1_new = s1 + b1 + b2 + ... + b16 4326 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 4327 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 4328 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 4329 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 4330 4331 // s2 = s2 + s1 * 16 4332 __ add(s2, s2, s1, Assembler::LSL, 4); 4333 4334 // vs1acc = b1 + b2 + b3 + ... + b16 4335 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 4336 __ umullv(vs2acc, __ T8B, vtable, vbytes); 4337 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 4338 __ uaddlv(vs1acc, __ T16B, vbytes); 4339 __ uaddlv(vs2acc, __ T8H, vs2acc); 4340 4341 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 4342 __ fmovd(temp0, vs1acc); 4343 __ fmovd(temp1, vs2acc); 4344 __ add(s1, s1, temp0); 4345 __ add(s2, s2, temp1); 4346 } 4347 4348 /** 4349 * Arguments: 4350 * 4351 * Input: 4352 * c_rarg0 - x address 4353 * c_rarg1 - x length 4354 * c_rarg2 - y address 4355 * c_rarg3 - y lenth 4356 * c_rarg4 - z address 4357 * c_rarg5 - z length 4358 */ 4359 address generate_multiplyToLen() { 4360 __ align(CodeEntryAlignment); 4361 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 4362 4363 address start = __ pc(); 4364 const Register x = r0; 4365 const Register xlen = r1; 4366 const Register y = r2; 4367 const Register ylen = r3; 4368 const Register z = r4; 4369 const Register zlen = r5; 4370 4371 const Register tmp1 = r10; 4372 const Register tmp2 = r11; 4373 const Register tmp3 = r12; 4374 const Register tmp4 = r13; 4375 const Register tmp5 = r14; 4376 const Register tmp6 = r15; 4377 const Register tmp7 = r16; 4378 4379 BLOCK_COMMENT("Entry:"); 4380 __ enter(); // required for proper stackwalking of RuntimeStub frame 4381 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4382 __ leave(); // required for proper stackwalking of RuntimeStub frame 4383 __ ret(lr); 4384 4385 return start; 4386 } 4387 4388 address generate_squareToLen() { 4389 // squareToLen algorithm for sizes 1..127 described in java code works 4390 // faster than multiply_to_len on some CPUs and slower on others, but 4391 // multiply_to_len shows a bit better overall results 4392 __ align(CodeEntryAlignment); 4393 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 4394 address start = __ pc(); 4395 4396 const Register x = r0; 4397 const Register xlen = r1; 4398 const Register z = r2; 4399 const Register zlen = r3; 4400 const Register y = r4; // == x 4401 const Register ylen = r5; // == xlen 4402 4403 const Register tmp1 = r10; 4404 const Register tmp2 = r11; 4405 const Register tmp3 = r12; 4406 const Register tmp4 = r13; 4407 const Register tmp5 = r14; 4408 const Register tmp6 = r15; 4409 const Register tmp7 = r16; 4410 4411 RegSet spilled_regs = RegSet::of(y, ylen); 4412 BLOCK_COMMENT("Entry:"); 4413 __ enter(); 4414 __ push(spilled_regs, sp); 4415 __ mov(y, x); 4416 __ mov(ylen, xlen); 4417 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4418 __ pop(spilled_regs, sp); 4419 __ leave(); 4420 __ ret(lr); 4421 return start; 4422 } 4423 4424 address generate_mulAdd() { 4425 __ align(CodeEntryAlignment); 4426 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 4427 4428 address start = __ pc(); 4429 4430 const Register out = r0; 4431 const Register in = r1; 4432 const Register offset = r2; 4433 const Register len = r3; 4434 const Register k = r4; 4435 4436 BLOCK_COMMENT("Entry:"); 4437 __ enter(); 4438 __ mul_add(out, in, offset, len, k); 4439 __ leave(); 4440 __ ret(lr); 4441 4442 return start; 4443 } 4444 4445 // Arguments: 4446 // 4447 // Input: 4448 // c_rarg0 - newArr address 4449 // c_rarg1 - oldArr address 4450 // c_rarg2 - newIdx 4451 // c_rarg3 - shiftCount 4452 // c_rarg4 - numIter 4453 // 4454 address generate_bigIntegerRightShift() { 4455 __ align(CodeEntryAlignment); 4456 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 4457 address start = __ pc(); 4458 4459 Label ShiftSIMDLoop, 
ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4460 4461 Register newArr = c_rarg0; 4462 Register oldArr = c_rarg1; 4463 Register newIdx = c_rarg2; 4464 Register shiftCount = c_rarg3; 4465 Register numIter = c_rarg4; 4466 Register idx = numIter; 4467 4468 Register newArrCur = rscratch1; 4469 Register shiftRevCount = rscratch2; 4470 Register oldArrCur = r13; 4471 Register oldArrNext = r14; 4472 4473 FloatRegister oldElem0 = v0; 4474 FloatRegister oldElem1 = v1; 4475 FloatRegister newElem = v2; 4476 FloatRegister shiftVCount = v3; 4477 FloatRegister shiftVRevCount = v4; 4478 4479 __ cbz(idx, Exit); 4480 4481 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4482 4483 // left shift count 4484 __ movw(shiftRevCount, 32); 4485 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4486 4487 // numIter too small to allow a 4-words SIMD loop, rolling back 4488 __ cmp(numIter, (u1)4); 4489 __ br(Assembler::LT, ShiftThree); 4490 4491 __ dup(shiftVCount, __ T4S, shiftCount); 4492 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4493 __ negr(shiftVCount, __ T4S, shiftVCount); 4494 4495 __ BIND(ShiftSIMDLoop); 4496 4497 // Calculate the load addresses 4498 __ sub(idx, idx, 4); 4499 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4500 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4501 __ add(oldArrCur, oldArrNext, 4); 4502 4503 // Load 4 words and process 4504 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4505 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4506 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4507 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4508 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4509 __ st1(newElem, __ T4S, Address(newArrCur)); 4510 4511 __ cmp(idx, (u1)4); 4512 __ br(Assembler::LT, ShiftTwoLoop); 4513 __ b(ShiftSIMDLoop); 4514 4515 __ BIND(ShiftTwoLoop); 4516 __ cbz(idx, Exit); 4517 __ cmp(idx, (u1)1); 4518 __ br(Assembler::EQ, ShiftOne); 4519 4520 // Calculate the load addresses 4521 __ sub(idx, idx, 2); 4522 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4523 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4524 __ add(oldArrCur, oldArrNext, 4); 4525 4526 // Load 2 words and process 4527 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 4528 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4529 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4530 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4531 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4532 __ st1(newElem, __ T2S, Address(newArrCur)); 4533 __ b(ShiftTwoLoop); 4534 4535 __ BIND(ShiftThree); 4536 __ tbz(idx, 1, ShiftOne); 4537 __ tbz(idx, 0, ShiftTwo); 4538 __ ldrw(r10, Address(oldArr, 12)); 4539 __ ldrw(r11, Address(oldArr, 8)); 4540 __ lsrvw(r10, r10, shiftCount); 4541 __ lslvw(r11, r11, shiftRevCount); 4542 __ orrw(r12, r10, r11); 4543 __ strw(r12, Address(newArr, 8)); 4544 4545 __ BIND(ShiftTwo); 4546 __ ldrw(r10, Address(oldArr, 8)); 4547 __ ldrw(r11, Address(oldArr, 4)); 4548 __ lsrvw(r10, r10, shiftCount); 4549 __ lslvw(r11, r11, shiftRevCount); 4550 __ orrw(r12, r10, r11); 4551 __ strw(r12, Address(newArr, 4)); 4552 4553 __ BIND(ShiftOne); 4554 __ ldrw(r10, Address(oldArr, 4)); 4555 __ ldrw(r11, Address(oldArr)); 4556 __ lsrvw(r10, r10, shiftCount); 4557 __ lslvw(r11, r11, shiftRevCount); 4558 __ orrw(r12, r10, r11); 4559 __ strw(r12, Address(newArr)); 4560 4561 __ BIND(Exit); 4562 __ ret(lr); 4563 4564 return start; 4565 } 4566 4567 // Arguments: 4568 // 4569 // Input: 4570 // c_rarg0 - newArr address 4571 // c_rarg1 - oldArr address 4572 // c_rarg2 - newIdx 4573 // c_rarg3 - 
shiftCount 4574 // c_rarg4 - numIter 4575 // 4576 address generate_bigIntegerLeftShift() { 4577 __ align(CodeEntryAlignment); 4578 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 4579 address start = __ pc(); 4580 4581 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4582 4583 Register newArr = c_rarg0; 4584 Register oldArr = c_rarg1; 4585 Register newIdx = c_rarg2; 4586 Register shiftCount = c_rarg3; 4587 Register numIter = c_rarg4; 4588 4589 Register shiftRevCount = rscratch1; 4590 Register oldArrNext = rscratch2; 4591 4592 FloatRegister oldElem0 = v0; 4593 FloatRegister oldElem1 = v1; 4594 FloatRegister newElem = v2; 4595 FloatRegister shiftVCount = v3; 4596 FloatRegister shiftVRevCount = v4; 4597 4598 __ cbz(numIter, Exit); 4599 4600 __ add(oldArrNext, oldArr, 4); 4601 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4602 4603 // right shift count 4604 __ movw(shiftRevCount, 32); 4605 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4606 4607 // numIter too small to allow a 4-words SIMD loop, rolling back 4608 __ cmp(numIter, (u1)4); 4609 __ br(Assembler::LT, ShiftThree); 4610 4611 __ dup(shiftVCount, __ T4S, shiftCount); 4612 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4613 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 4614 4615 __ BIND(ShiftSIMDLoop); 4616 4617 // load 4 words and process 4618 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 4619 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 4620 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4621 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4622 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4623 __ st1(newElem, __ T4S, __ post(newArr, 16)); 4624 __ sub(numIter, numIter, 4); 4625 4626 __ cmp(numIter, (u1)4); 4627 __ br(Assembler::LT, ShiftTwoLoop); 4628 __ b(ShiftSIMDLoop); 4629 4630 __ BIND(ShiftTwoLoop); 4631 __ cbz(numIter, Exit); 4632 __ cmp(numIter, (u1)1); 4633 __ br(Assembler::EQ, ShiftOne); 4634 4635 // load 2 words and process 4636 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 4637 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 4638 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4639 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4640 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4641 __ st1(newElem, __ T2S, __ post(newArr, 8)); 4642 __ sub(numIter, numIter, 2); 4643 __ b(ShiftTwoLoop); 4644 4645 __ BIND(ShiftThree); 4646 __ ldrw(r10, __ post(oldArr, 4)); 4647 __ ldrw(r11, __ post(oldArrNext, 4)); 4648 __ lslvw(r10, r10, shiftCount); 4649 __ lsrvw(r11, r11, shiftRevCount); 4650 __ orrw(r12, r10, r11); 4651 __ strw(r12, __ post(newArr, 4)); 4652 __ tbz(numIter, 1, Exit); 4653 __ tbz(numIter, 0, ShiftOne); 4654 4655 __ BIND(ShiftTwo); 4656 __ ldrw(r10, __ post(oldArr, 4)); 4657 __ ldrw(r11, __ post(oldArrNext, 4)); 4658 __ lslvw(r10, r10, shiftCount); 4659 __ lsrvw(r11, r11, shiftRevCount); 4660 __ orrw(r12, r10, r11); 4661 __ strw(r12, __ post(newArr, 4)); 4662 4663 __ BIND(ShiftOne); 4664 __ ldrw(r10, Address(oldArr)); 4665 __ ldrw(r11, Address(oldArrNext)); 4666 __ lslvw(r10, r10, shiftCount); 4667 __ lsrvw(r11, r11, shiftRevCount); 4668 __ orrw(r12, r10, r11); 4669 __ strw(r12, Address(newArr)); 4670 4671 __ BIND(Exit); 4672 __ ret(lr); 4673 4674 return start; 4675 } 4676 4677 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 4678 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 4679 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 4680 // Karatsuba multiplication performs a 
128*128 -> 256-bit 4681 // multiplication in three 128-bit multiplications and a few 4682 // additions. 4683 // 4684 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 4685 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 4686 // 4687 // Inputs: 4688 // 4689 // A0 in a.d[0] (subkey) 4690 // A1 in a.d[1] 4691 // (A1+A0) in a1_xor_a0.d[0] 4692 // 4693 // B0 in b.d[0] (state) 4694 // B1 in b.d[1] 4695 4696 __ ext(tmp1, __ T16B, b, b, 0x08); 4697 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 4698 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 4699 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 4700 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 4701 4702 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 4703 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 4704 __ eor(tmp2, __ T16B, tmp2, tmp4); 4705 __ eor(tmp2, __ T16B, tmp2, tmp3); 4706 4707 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 4708 __ ins(result_hi, __ D, tmp2, 0, 1); 4709 __ ins(result_lo, __ D, tmp2, 1, 0); 4710 } 4711 4712 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 4713 FloatRegister p, FloatRegister z, FloatRegister t1) { 4714 const FloatRegister t0 = result; 4715 4716 // The GCM field polynomial f is z^128 + p(z), where p = 4717 // z^7+z^2+z+1. 4718 // 4719 // z^128 === -p(z) (mod (z^128 + p(z))) 4720 // 4721 // so, given that the product we're reducing is 4722 // a == lo + hi * z^128 4723 // substituting, 4724 // === lo - hi * p(z) (mod (z^128 + p(z))) 4725 // 4726 // we reduce by multiplying hi by p(z) and subtracting the result 4727 // from (i.e. XORing it with) lo. Because p has no nonzero high 4728 // bits we can do this with two 64-bit multiplications, lo*p and 4729 // hi*p. 4730 4731 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 4732 __ ext(t1, __ T16B, t0, z, 8); 4733 __ eor(hi, __ T16B, hi, t1); 4734 __ ext(t1, __ T16B, z, t0, 8); 4735 __ eor(lo, __ T16B, lo, t1); 4736 __ pmull(t0, __ T1Q, hi, p, __ T1D); 4737 __ eor(result, __ T16B, lo, t0); 4738 } 4739 4740 address generate_has_negatives(address &has_negatives_long) { 4741 const u1 large_loop_size = 64; 4742 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 4743 int dcache_line = VM_Version::dcache_line_size(); 4744 4745 Register ary1 = r1, len = r2, result = r0; 4746 4747 __ align(CodeEntryAlignment); 4748 4749 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 4750 4751 address entry = __ pc(); 4752 4753 __ enter(); 4754 4755 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, 4756 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 4757 4758 __ cmp(len, (u1)15); 4759 __ br(Assembler::GT, LEN_OVER_15); 4760 // The only case when execution falls into this code is when pointer is near 4761 // the end of memory page and we have to avoid reading next page 4762 __ add(ary1, ary1, len); 4763 __ subs(len, len, 8); 4764 __ br(Assembler::GT, LEN_OVER_8); 4765 __ ldr(rscratch2, Address(ary1, -8)); 4766 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
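    // At this point len holds (n - 8), where n <= 8 is the number of valid bytes,
    // so the register computed above is (8 - n) * 8.  The shift below drops the
    // low-order bytes that precede the first valid byte (little-endian load),
    // leaving only the n valid bytes for the sign-bit test.  A scalar sketch of
    // this tail check (reference only; 'chunk' is the 8-byte little-endian load
    // ending at the last valid byte):
    //
    //   bool tail_has_negatives(uint64_t chunk, size_t n) {    // n <= 8
    //     chunk >>= (8 - n) * 8;                               // discard out-of-range bytes
    //     return (chunk & 0x8080808080808080ULL) != 0;         // any sign bit set?
    //   }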
4767    __ lsrv(rscratch2, rscratch2, rscratch1);
4768    __ tst(rscratch2, UPPER_BIT_MASK);
4769    __ cset(result, Assembler::NE);
4770    __ leave();
4771    __ ret(lr);
4772    __ bind(LEN_OVER_8);
4773    __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4774    __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
4775    __ tst(rscratch2, UPPER_BIT_MASK);
4776    __ br(Assembler::NE, RET_TRUE_NO_POP);
4777    __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4778    __ lsrv(rscratch1, rscratch1, rscratch2);
4779    __ tst(rscratch1, UPPER_BIT_MASK);
4780    __ cset(result, Assembler::NE);
4781    __ leave();
4782    __ ret(lr);
4783
4784    Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4785    const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4786
4787    has_negatives_long = __ pc(); // 2nd entry point
4788
4789    __ enter();
4790
4791    __ bind(LEN_OVER_15);
4792    __ push(spilled_regs, sp);
4793    __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4794    __ cbz(rscratch2, ALIGNED);
4795    __ ldp(tmp6, tmp1, Address(ary1));
4796    __ mov(tmp5, 16);
4797    __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
4798    __ add(ary1, ary1, rscratch1);
4799    __ sub(len, len, rscratch1);
4800    __ orr(tmp6, tmp6, tmp1);
4801    __ tst(tmp6, UPPER_BIT_MASK);
4802    __ br(Assembler::NE, RET_TRUE);
4803
4804    __ bind(ALIGNED);
4805    __ cmp(len, large_loop_size);
4806    __ br(Assembler::LT, CHECK_16);
4807    // Perform a 16-byte load with an early-return check in the pre-loop to handle
4808    // the case where an initially aligned large array has negative values in its
4809    // first bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the worst
4810    // case, which is slower. Cases with negative bytes further ahead are not
4811    // affected much; in fact they get faster thanks to the early loads, fewer
4812    // instructions and fewer branches in LARGE_LOOP.
4813    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4814    __ sub(len, len, 16);
4815    __ orr(tmp6, tmp6, tmp1);
4816    __ tst(tmp6, UPPER_BIT_MASK);
4817    __ br(Assembler::NE, RET_TRUE);
4818    __ cmp(len, large_loop_size);
4819    __ br(Assembler::LT, CHECK_16);
4820
4821    if (SoftwarePrefetchHintDistance >= 0
4822        && SoftwarePrefetchHintDistance >= dcache_line) {
4823      // initial prefetch
4824      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4825    }
4826    __ bind(LARGE_LOOP);
4827    if (SoftwarePrefetchHintDistance >= 0) {
4828      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4829    }
4830    // Issue the load instructions first, since that can save a few CPU/MEM cycles.
4831    // Also, instead of four "orr(...); andr(...); cbnz(...);" triples (one per ldp),
4832    // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
4833    // 3 instructions per iteration and has fewer branches; the trade-off is that this
4834    // disables early return, so all 64 bytes are loaded and checked every time.
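    // Reference sketch (scalar view, for illustration only) of what one
    // LARGE_LOOP iteration checks on its 64-byte block:
    //
    //   bool block_has_negatives(const uint64_t w[8]) {        // 64 bytes as 8 words
    //     uint64_t acc = 0;
    //     for (int i = 0; i < 8; i++) acc |= w[i];             // fold the block together
    //     return (acc & 0x8080808080808080ULL) != 0;           // any byte with top bit set?
    //   }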
4835 __ ldp(tmp2, tmp3, Address(ary1)); 4836 __ ldp(tmp4, tmp5, Address(ary1, 16)); 4837 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 4838 __ ldp(tmp6, tmp1, Address(ary1, 48)); 4839 __ add(ary1, ary1, large_loop_size); 4840 __ sub(len, len, large_loop_size); 4841 __ orr(tmp2, tmp2, tmp3); 4842 __ orr(tmp4, tmp4, tmp5); 4843 __ orr(rscratch1, rscratch1, rscratch2); 4844 __ orr(tmp6, tmp6, tmp1); 4845 __ orr(tmp2, tmp2, tmp4); 4846 __ orr(rscratch1, rscratch1, tmp6); 4847 __ orr(tmp2, tmp2, rscratch1); 4848 __ tst(tmp2, UPPER_BIT_MASK); 4849 __ br(Assembler::NE, RET_TRUE); 4850 __ cmp(len, large_loop_size); 4851 __ br(Assembler::GE, LARGE_LOOP); 4852 4853 __ bind(CHECK_16); // small 16-byte load pre-loop 4854 __ cmp(len, (u1)16); 4855 __ br(Assembler::LT, POST_LOOP16); 4856 4857 __ bind(LOOP16); // small 16-byte load loop 4858 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 4859 __ sub(len, len, 16); 4860 __ orr(tmp2, tmp2, tmp3); 4861 __ tst(tmp2, UPPER_BIT_MASK); 4862 __ br(Assembler::NE, RET_TRUE); 4863 __ cmp(len, (u1)16); 4864 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 4865 4866 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 4867 __ cmp(len, (u1)8); 4868 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 4869 __ ldr(tmp3, Address(__ post(ary1, 8))); 4870 __ sub(len, len, 8); 4871 __ tst(tmp3, UPPER_BIT_MASK); 4872 __ br(Assembler::NE, RET_TRUE); 4873 4874 __ bind(POST_LOOP16_LOAD_TAIL); 4875 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 4876 __ ldr(tmp1, Address(ary1)); 4877 __ mov(tmp2, 64); 4878 __ sub(tmp4, tmp2, len, __ LSL, 3); 4879 __ lslv(tmp1, tmp1, tmp4); 4880 __ tst(tmp1, UPPER_BIT_MASK); 4881 __ br(Assembler::NE, RET_TRUE); 4882 // Fallthrough 4883 4884 __ bind(RET_FALSE); 4885 __ pop(spilled_regs, sp); 4886 __ leave(); 4887 __ mov(result, zr); 4888 __ ret(lr); 4889 4890 __ bind(RET_TRUE); 4891 __ pop(spilled_regs, sp); 4892 __ bind(RET_TRUE_NO_POP); 4893 __ leave(); 4894 __ mov(result, 1); 4895 __ ret(lr); 4896 4897 return entry; 4898 } 4899 4900 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 4901 bool usePrefetch, Label &NOT_EQUAL) { 4902 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 4903 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 4904 tmp7 = r12, tmp8 = r13; 4905 Label LOOP; 4906 4907 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 4908 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 4909 __ bind(LOOP); 4910 if (usePrefetch) { 4911 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 4912 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 4913 } 4914 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 4915 __ eor(tmp1, tmp1, tmp2); 4916 __ eor(tmp3, tmp3, tmp4); 4917 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 4918 __ orr(tmp1, tmp1, tmp3); 4919 __ cbnz(tmp1, NOT_EQUAL); 4920 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 4921 __ eor(tmp5, tmp5, tmp6); 4922 __ eor(tmp7, tmp7, tmp8); 4923 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 4924 __ orr(tmp5, tmp5, tmp7); 4925 __ cbnz(tmp5, NOT_EQUAL); 4926 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 4927 __ eor(tmp1, tmp1, tmp2); 4928 __ eor(tmp3, tmp3, tmp4); 4929 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 4930 __ orr(tmp1, tmp1, tmp3); 4931 __ cbnz(tmp1, NOT_EQUAL); 4932 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 4933 __ eor(tmp5, tmp5, tmp6); 4934 __ sub(cnt1, cnt1, 8 * wordSize); 4935 __ eor(tmp7, tmp7, tmp8); 4936 __ 
ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4937    // tmp6 is not used. MacroAssembler::subs is used here (rather than
4938    // cmp) because the subs macro allows an unlimited range of immediate operands.
4939    __ subs(tmp6, cnt1, loopThreshold);
4940    __ orr(tmp5, tmp5, tmp7);
4941    __ cbnz(tmp5, NOT_EQUAL);
4942    __ br(__ GE, LOOP);
4943    // post-loop
4944    __ eor(tmp1, tmp1, tmp2);
4945    __ eor(tmp3, tmp3, tmp4);
4946    __ orr(tmp1, tmp1, tmp3);
4947    __ sub(cnt1, cnt1, 2 * wordSize);
4948    __ cbnz(tmp1, NOT_EQUAL);
4949  }
4950
4951  void generate_large_array_equals_loop_simd(int loopThreshold,
4952        bool usePrefetch, Label &NOT_EQUAL) {
4953    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4954        tmp2 = rscratch2;
4955    Label LOOP;
4956
4957    __ bind(LOOP);
4958    if (usePrefetch) {
4959      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4960      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4961    }
4962    __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4963    __ sub(cnt1, cnt1, 8 * wordSize);
4964    __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4965    __ subs(tmp1, cnt1, loopThreshold);
4966    __ eor(v0, __ T16B, v0, v4);
4967    __ eor(v1, __ T16B, v1, v5);
4968    __ eor(v2, __ T16B, v2, v6);
4969    __ eor(v3, __ T16B, v3, v7);
4970    __ orr(v0, __ T16B, v0, v1);
4971    __ orr(v1, __ T16B, v2, v3);
4972    __ orr(v0, __ T16B, v0, v1);
4973    __ umov(tmp1, v0, __ D, 0);
4974    __ umov(tmp2, v0, __ D, 1);
4975    __ orr(tmp1, tmp1, tmp2);
4976    __ cbnz(tmp1, NOT_EQUAL);
4977    __ br(__ GE, LOOP);
4978  }
4979
4980  // a1 = r1 - array1 address
4981  // a2 = r2 - array2 address
4982  // result = r0 - return value. Already contains "false"
4983  // cnt1 = r10 - number of elements left to check, reduced by wordSize
4984  // r3-r5 are reserved temporary registers
4985  address generate_large_array_equals() {
4986    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4987        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4988        tmp7 = r12, tmp8 = r13;
4989    Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4990        SMALL_LOOP, POST_LOOP;
4991    const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4992    // threshold chosen so that at least 32 of the prefetched bytes are actually used
4993    int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4994    int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4995    RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4996    assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4997        tmp5, tmp6, tmp7, tmp8);
4998
4999    __ align(CodeEntryAlignment);
5000
5001    StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5002
5003    address entry = __ pc();
5004    __ enter();
5005    __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5006    // also advance pointers to use post-increment instead of pre-increment
5007    __ add(a1, a1, wordSize);
5008    __ add(a2, a2, wordSize);
5009    if (AvoidUnalignedAccesses) {
5010      // Both implementations (SIMD/non-SIMD) use relatively large load
5011      // instructions (ld1/ldp), which carry a heavy penalty (up to 2x execution
5012      // time) on some CPUs when the address is not at least 16-byte aligned.
5013      // Arrays are currently 8-byte aligned, so, if needed, peel one extra
5014      // 8-byte load to make at least the first array's address 16-byte aligned.
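      // Reference sketch of the 8-byte peel performed below (scalar view,
      // illustration only; the peel keys off a1's alignment, a2 is simply
      // advanced by the same amount):
      //
      //   if (((uintptr_t)a1 & 8) != 0) {            // a1 not yet 16-byte aligned
      //     if (*a1++ != *a2++) return false;        // compare one 8-byte word
      //     cnt -= 8;                                // 8 fewer bytes to compare
      //   }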
5015 Label ALIGNED16; 5016 __ tbz(a1, 3, ALIGNED16); 5017 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5018 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5019 __ sub(cnt1, cnt1, wordSize); 5020 __ eor(tmp1, tmp1, tmp2); 5021 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 5022 __ bind(ALIGNED16); 5023 } 5024 if (UseSIMDForArrayEquals) { 5025 if (SoftwarePrefetchHintDistance >= 0) { 5026 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5027 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5028 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 5029 /* prfm = */ true, NOT_EQUAL); 5030 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5031 __ br(__ LT, TAIL); 5032 } 5033 __ bind(NO_PREFETCH_LARGE_LOOP); 5034 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 5035 /* prfm = */ false, NOT_EQUAL); 5036 } else { 5037 __ push(spilled_regs, sp); 5038 if (SoftwarePrefetchHintDistance >= 0) { 5039 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5040 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5041 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 5042 /* prfm = */ true, NOT_EQUAL); 5043 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5044 __ br(__ LT, TAIL); 5045 } 5046 __ bind(NO_PREFETCH_LARGE_LOOP); 5047 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 5048 /* prfm = */ false, NOT_EQUAL); 5049 } 5050 __ bind(TAIL); 5051 __ cbz(cnt1, EQUAL); 5052 __ subs(cnt1, cnt1, wordSize); 5053 __ br(__ LE, POST_LOOP); 5054 __ bind(SMALL_LOOP); 5055 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5056 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5057 __ subs(cnt1, cnt1, wordSize); 5058 __ eor(tmp1, tmp1, tmp2); 5059 __ cbnz(tmp1, NOT_EQUAL); 5060 __ br(__ GT, SMALL_LOOP); 5061 __ bind(POST_LOOP); 5062 __ ldr(tmp1, Address(a1, cnt1)); 5063 __ ldr(tmp2, Address(a2, cnt1)); 5064 __ eor(tmp1, tmp1, tmp2); 5065 __ cbnz(tmp1, NOT_EQUAL); 5066 __ bind(EQUAL); 5067 __ mov(result, true); 5068 __ bind(NOT_EQUAL); 5069 if (!UseSIMDForArrayEquals) { 5070 __ pop(spilled_regs, sp); 5071 } 5072 __ bind(NOT_EQUAL_NO_POP); 5073 __ leave(); 5074 __ ret(lr); 5075 return entry; 5076 } 5077 5078 address generate_dsin_dcos(bool isCos) { 5079 __ align(CodeEntryAlignment); 5080 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 5081 address start = __ pc(); 5082 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5083 (address)StubRoutines::aarch64::_two_over_pi, 5084 (address)StubRoutines::aarch64::_pio2, 5085 (address)StubRoutines::aarch64::_dsin_coef, 5086 (address)StubRoutines::aarch64::_dcos_coef); 5087 return start; 5088 } 5089 5090 address generate_dlog() { 5091 __ align(CodeEntryAlignment); 5092 StubCodeMark mark(this, "StubRoutines", "dlog"); 5093 address entry = __ pc(); 5094 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 5095 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 5096 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 5097 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 5098 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 5099 return entry; 5100 } 5101 5102 5103 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5104 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5105 Label &DIFF2) { 5106 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5107 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5108 5109 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5110 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5111 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5112 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5113 5114 __ fmovd(tmpL, vtmp3); 5115 __ eor(rscratch2, tmp3, tmpL); 5116 __ cbnz(rscratch2, DIFF2); 5117 5118 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5119 __ umov(tmpL, vtmp3, __ D, 1); 5120 __ eor(rscratch2, tmpU, tmpL); 5121 __ cbnz(rscratch2, DIFF1); 5122 5123 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5124 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5125 __ fmovd(tmpL, vtmp); 5126 __ eor(rscratch2, tmp3, tmpL); 5127 __ cbnz(rscratch2, DIFF2); 5128 5129 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5130 __ umov(tmpL, vtmp, __ D, 1); 5131 __ eor(rscratch2, tmpU, tmpL); 5132 __ cbnz(rscratch2, DIFF1); 5133 } 5134 5135 // r0 = result 5136 // r1 = str1 5137 // r2 = cnt1 5138 // r3 = str2 5139 // r4 = cnt2 5140 // r10 = tmp1 5141 // r11 = tmp2 5142 address generate_compare_long_string_different_encoding(bool isLU) { 5143 __ align(CodeEntryAlignment); 5144 StubCodeMark mark(this, "StubRoutines", isLU 5145 ? "compare_long_string_different_encoding LU" 5146 : "compare_long_string_different_encoding UL"); 5147 address entry = __ pc(); 5148 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5149 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5150 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5151 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5152 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5153 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5154 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5155 5156 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5157 5158 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5159 // cnt2 == amount of characters left to compare 5160 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5161 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5162 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5163 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5164 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5165 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5166 __ eor(rscratch2, tmp1, tmp2); 5167 __ mov(rscratch1, tmp2); 5168 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5169 Register tmpU = isLU ? 
rscratch1 : tmp1, // where to keep U for comparison 5170 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5171 __ push(spilled_regs, sp); 5172 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5173 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 5174 5175 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5176 5177 if (SoftwarePrefetchHintDistance >= 0) { 5178 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5179 __ br(__ LT, NO_PREFETCH); 5180 __ bind(LARGE_LOOP_PREFETCH); 5181 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 5182 __ mov(tmp4, 2); 5183 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5184 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 5185 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5186 __ subs(tmp4, tmp4, 1); 5187 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 5188 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5189 __ mov(tmp4, 2); 5190 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 5191 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5192 __ subs(tmp4, tmp4, 1); 5193 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 5194 __ sub(cnt2, cnt2, 64); 5195 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5196 __ br(__ GE, LARGE_LOOP_PREFETCH); 5197 } 5198 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 5199 __ bind(NO_PREFETCH); 5200 __ subs(cnt2, cnt2, 16); 5201 __ br(__ LT, TAIL); 5202 __ align(OptoLoopAlignment); 5203 __ bind(SMALL_LOOP); // smaller loop 5204 __ subs(cnt2, cnt2, 16); 5205 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5206 __ br(__ GE, SMALL_LOOP); 5207 __ cmn(cnt2, (u1)16); 5208 __ br(__ EQ, LOAD_LAST); 5209 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 5210 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 5211 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 5212 __ ldr(tmp3, Address(cnt1, -8)); 5213 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 5214 __ b(LOAD_LAST); 5215 __ bind(DIFF2); 5216 __ mov(tmpU, tmp3); 5217 __ bind(DIFF1); 5218 __ pop(spilled_regs, sp); 5219 __ b(CALCULATE_DIFFERENCE); 5220 __ bind(LOAD_LAST); 5221 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 5222 // No need to load it again 5223 __ mov(tmpU, tmp3); 5224 __ pop(spilled_regs, sp); 5225 5226 // tmp2 points to the address of the last 4 Latin1 characters right now 5227 __ ldrs(vtmp, Address(tmp2)); 5228 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5229 __ fmovd(tmpL, vtmp); 5230 5231 __ eor(rscratch2, tmpU, tmpL); 5232 __ cbz(rscratch2, DONE); 5233 5234 // Find the first different characters in the longwords and 5235 // compute their difference. 
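          // An illustrative scalar sketch (not generated code) of the rev/clz/and(-16)
          // trick used at CALCULATE_DIFFERENCE below to locate and extract the first
          // differing character, assuming two 64-bit words a and b that each hold four
          // little-endian UTF-16 characters and diff = a ^ b is known to be nonzero:
          //
          //   uint64_t r     = __builtin_bswap64(diff);  // rev: byte-reverse the diff
          //   int      shift = __builtin_clzll(r);       // clz: bits before 1st nonzero byte
          //   shift &= ~15;                              // andr(..., -16): char boundary
          //   uint16_t ca = (uint16_t)(a >> shift);      // lsrv + uxthw
          //   uint16_t cb = (uint16_t)(b >> shift);
          //   int      result = ca - cb;                 // subw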
5236 __ bind(CALCULATE_DIFFERENCE); 5237 __ rev(rscratch2, rscratch2); 5238 __ clz(rscratch2, rscratch2); 5239 __ andr(rscratch2, rscratch2, -16); 5240 __ lsrv(tmp1, tmp1, rscratch2); 5241 __ uxthw(tmp1, tmp1); 5242 __ lsrv(rscratch1, rscratch1, rscratch2); 5243 __ uxthw(rscratch1, rscratch1); 5244 __ subw(result, tmp1, rscratch1); 5245 __ bind(DONE); 5246 __ ret(lr); 5247 return entry; 5248 } 5249 5250 address generate_method_entry_barrier() { 5251 __ align(CodeEntryAlignment); 5252 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 5253 5254 Label deoptimize_label; 5255 5256 address start = __ pc(); 5257 5258 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 5259 5260 __ enter(); 5261 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 5262 5263 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 5264 5265 __ push_call_clobbered_registers(); 5266 5267 __ mov(c_rarg0, rscratch2); 5268 __ call_VM_leaf 5269 (CAST_FROM_FN_PTR 5270 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 5271 5272 __ reset_last_Java_frame(true); 5273 5274 __ mov(rscratch1, r0); 5275 5276 __ pop_call_clobbered_registers(); 5277 5278 __ cbnz(rscratch1, deoptimize_label); 5279 5280 __ leave(); 5281 __ ret(lr); 5282 5283 __ BIND(deoptimize_label); 5284 5285 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 5286 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 5287 5288 __ mov(sp, rscratch1); 5289 __ br(rscratch2); 5290 5291 return start; 5292 } 5293 5294 // r0 = result 5295 // r1 = str1 5296 // r2 = cnt1 5297 // r3 = str2 5298 // r4 = cnt2 5299 // r10 = tmp1 5300 // r11 = tmp2 5301 address generate_compare_long_string_same_encoding(bool isLL) { 5302 __ align(CodeEntryAlignment); 5303 StubCodeMark mark(this, "StubRoutines", isLL 5304 ? "compare_long_string_same_encoding LL" 5305 : "compare_long_string_same_encoding UU"); 5306 address entry = __ pc(); 5307 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5308 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 5309 5310 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 5311 5312 // exit from large loop when less than 64 bytes left to read or we're about 5313 // to prefetch memory behind array border 5314 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 5315 5316 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 5317 __ eor(rscratch2, tmp1, tmp2); 5318 __ cbnz(rscratch2, CAL_DIFFERENCE); 5319 5320 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 5321 // update pointers, because of previous read 5322 __ add(str1, str1, wordSize); 5323 __ add(str2, str2, wordSize); 5324 if (SoftwarePrefetchHintDistance >= 0) { 5325 __ align(OptoLoopAlignment); 5326 __ bind(LARGE_LOOP_PREFETCH); 5327 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 5328 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 5329 5330 for (int i = 0; i < 4; i++) { 5331 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 5332 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 5333 __ cmp(tmp1, tmp2); 5334 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5335 __ br(Assembler::NE, DIFF); 5336 } 5337 __ sub(cnt2, cnt2, isLL ? 64 : 32); 5338 __ add(str1, str1, 64); 5339 __ add(str2, str2, 64); 5340 __ subs(rscratch2, cnt2, largeLoopExitCondition); 5341 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 5342 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 5343 } 5344 5345 __ subs(rscratch1, cnt2, isLL ? 
16 : 8); 5346 __ br(Assembler::LE, LESS16); 5347 __ align(OptoLoopAlignment); 5348 __ bind(LOOP_COMPARE16); 5349 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5350 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5351 __ cmp(tmp1, tmp2); 5352 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5353 __ br(Assembler::NE, DIFF); 5354 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5355 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5356 __ br(Assembler::LT, LESS16); 5357 5358 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5359 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5360 __ cmp(tmp1, tmp2); 5361 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5362 __ br(Assembler::NE, DIFF); 5363 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5364 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5365 __ br(Assembler::GE, LOOP_COMPARE16); 5366 __ cbz(cnt2, LENGTH_DIFF); 5367 5368 __ bind(LESS16); 5369 // each 8 compare 5370 __ subs(cnt2, cnt2, isLL ? 8 : 4); 5371 __ br(Assembler::LE, LESS8); 5372 __ ldr(tmp1, Address(__ post(str1, 8))); 5373 __ ldr(tmp2, Address(__ post(str2, 8))); 5374 __ eor(rscratch2, tmp1, tmp2); 5375 __ cbnz(rscratch2, CAL_DIFFERENCE); 5376 __ sub(cnt2, cnt2, isLL ? 8 : 4); 5377 5378 __ bind(LESS8); // directly load last 8 bytes 5379 if (!isLL) { 5380 __ add(cnt2, cnt2, cnt2); 5381 } 5382 __ ldr(tmp1, Address(str1, cnt2)); 5383 __ ldr(tmp2, Address(str2, cnt2)); 5384 __ eor(rscratch2, tmp1, tmp2); 5385 __ cbz(rscratch2, LENGTH_DIFF); 5386 __ b(CAL_DIFFERENCE); 5387 5388 __ bind(DIFF); 5389 __ cmp(tmp1, tmp2); 5390 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 5391 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 5392 // reuse rscratch2 register for the result of eor instruction 5393 __ eor(rscratch2, tmp1, tmp2); 5394 5395 __ bind(CAL_DIFFERENCE); 5396 __ rev(rscratch2, rscratch2); 5397 __ clz(rscratch2, rscratch2); 5398 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 5399 __ lsrv(tmp1, tmp1, rscratch2); 5400 __ lsrv(tmp2, tmp2, rscratch2); 5401 if (isLL) { 5402 __ uxtbw(tmp1, tmp1); 5403 __ uxtbw(tmp2, tmp2); 5404 } else { 5405 __ uxthw(tmp1, tmp1); 5406 __ uxthw(tmp2, tmp2); 5407 } 5408 __ subw(result, tmp1, tmp2); 5409 5410 __ bind(LENGTH_DIFF); 5411 __ ret(lr); 5412 return entry; 5413 } 5414 5415 void generate_compare_long_strings() { 5416 StubRoutines::aarch64::_compare_long_string_LL 5417 = generate_compare_long_string_same_encoding(true); 5418 StubRoutines::aarch64::_compare_long_string_UU 5419 = generate_compare_long_string_same_encoding(false); 5420 StubRoutines::aarch64::_compare_long_string_LU 5421 = generate_compare_long_string_different_encoding(true); 5422 StubRoutines::aarch64::_compare_long_string_UL 5423 = generate_compare_long_string_different_encoding(false); 5424 } 5425 5426 // R0 = result 5427 // R1 = str2 5428 // R2 = cnt1 5429 // R3 = str1 5430 // R4 = cnt2 5431 // This generic linear code uses a few additional ideas that make it faster: 5432 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8) 5433 // in order to skip the initial load (helps on systems with 1 ld pipeline) 5434 // 2) we can use a "fast" algorithm for finding a single character (the first 5435 // symbol) with fewer branches (1 branch per loaded register instead 5436 // of a branch for each symbol), so this is where constants like 5437 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from 5438 // 3) after loading and analyzing the 1st register of the source string, it can be 5439 // used to search for every occurrence of the 1st character, saving a few loads in 5440 // comparison with a "simpler-but-slower" implementation 5441 // 4) in order to avoid lots of push/pop operations, the code below heavily 5442 // re-uses/re-initializes/compresses register values, which makes the code 5443 // larger and a bit less readable; however, most of the extra operations are 5444 // issued during loads or branches, so the penalty is minimal 5445 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 5446 const char* stubName = str1_isL 5447 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 5448 : "indexof_linear_uu"; 5449 __ align(CodeEntryAlignment); 5450 StubCodeMark mark(this, "StubRoutines", stubName); 5451 address entry = __ pc(); 5452 5453 int str1_chr_size = str1_isL ? 1 : 2; 5454 int str2_chr_size = str2_isL ? 1 : 2; 5455 int str1_chr_shift = str1_isL ? 0 : 1; 5456 int str2_chr_shift = str2_isL ? 0 : 1; 5457 bool isL = str1_isL && str2_isL; 5458 // parameters 5459 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 5460 // temporary registers 5461 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 5462 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 5463 // redefinitions 5464 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 5465 5466 __ push(spilled_regs, sp); 5467 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 5468 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 5469 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 5470 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 5471 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 5472 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 5473 // Read whole register from str1. It is safe, because length >=8 here 5474 __ ldr(ch1, Address(str1)); 5475 // Read whole register from str2.
It is safe, because length >=8 here 5476 __ ldr(ch2, Address(str2)); 5477 __ sub(cnt2, cnt2, cnt1); 5478 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 5479 if (str1_isL != str2_isL) { 5480 __ eor(v0, __ T16B, v0, v0); 5481 } 5482 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 5483 __ mul(first, first, tmp1); 5484 // check if we have less than 1 register to check 5485 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 5486 if (str1_isL != str2_isL) { 5487 __ fmovd(v1, ch1); 5488 } 5489 __ br(__ LE, L_SMALL); 5490 __ eor(ch2, first, ch2); 5491 if (str1_isL != str2_isL) { 5492 __ zip1(v1, __ T16B, v1, v0); 5493 } 5494 __ sub(tmp2, ch2, tmp1); 5495 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5496 __ bics(tmp2, tmp2, ch2); 5497 if (str1_isL != str2_isL) { 5498 __ fmovd(ch1, v1); 5499 } 5500 __ br(__ NE, L_HAS_ZERO); 5501 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5502 __ add(result, result, wordSize/str2_chr_size); 5503 __ add(str2, str2, wordSize); 5504 __ br(__ LT, L_POST_LOOP); 5505 __ BIND(L_LOOP); 5506 __ ldr(ch2, Address(str2)); 5507 __ eor(ch2, first, ch2); 5508 __ sub(tmp2, ch2, tmp1); 5509 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5510 __ bics(tmp2, tmp2, ch2); 5511 __ br(__ NE, L_HAS_ZERO); 5512 __ BIND(L_LOOP_PROCEED); 5513 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5514 __ add(str2, str2, wordSize); 5515 __ add(result, result, wordSize/str2_chr_size); 5516 __ br(__ GE, L_LOOP); 5517 __ BIND(L_POST_LOOP); 5518 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 5519 __ br(__ LE, NOMATCH); 5520 __ ldr(ch2, Address(str2)); 5521 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5522 __ eor(ch2, first, ch2); 5523 __ sub(tmp2, ch2, tmp1); 5524 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5525 __ mov(tmp4, -1); // all bits set 5526 __ b(L_SMALL_PROCEED); 5527 __ align(OptoLoopAlignment); 5528 __ BIND(L_SMALL); 5529 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5530 __ eor(ch2, first, ch2); 5531 if (str1_isL != str2_isL) { 5532 __ zip1(v1, __ T16B, v1, v0); 5533 } 5534 __ sub(tmp2, ch2, tmp1); 5535 __ mov(tmp4, -1); // all bits set 5536 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5537 if (str1_isL != str2_isL) { 5538 __ fmovd(ch1, v1); // move converted 4 symbols 5539 } 5540 __ BIND(L_SMALL_PROCEED); 5541 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 5542 __ bic(tmp2, tmp2, ch2); 5543 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 5544 __ rbit(tmp2, tmp2); 5545 __ br(__ EQ, NOMATCH); 5546 __ BIND(L_SMALL_HAS_ZERO_LOOP); 5547 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 5548 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 5549 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 5550 if (str2_isL) { // LL 5551 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5552 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5553 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5554 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5555 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5556 } else { 5557 __ mov(ch2, 0xE); // all bits in byte set except last one 5558 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5559 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
5560 __ lslv(tmp2, tmp2, tmp4); 5561 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5562 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5563 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5564 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5565 } 5566 __ cmp(ch1, ch2); 5567 __ mov(tmp4, wordSize/str2_chr_size); 5568 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5569 __ BIND(L_SMALL_CMP_LOOP); 5570 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5571 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5572 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5573 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5574 __ add(tmp4, tmp4, 1); 5575 __ cmp(tmp4, cnt1); 5576 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 5577 __ cmp(first, ch2); 5578 __ br(__ EQ, L_SMALL_CMP_LOOP); 5579 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 5580 __ cbz(tmp2, NOMATCH); // no more matches. exit 5581 __ clz(tmp4, tmp2); 5582 __ add(result, result, 1); // advance index 5583 __ add(str2, str2, str2_chr_size); // advance pointer 5584 __ b(L_SMALL_HAS_ZERO_LOOP); 5585 __ align(OptoLoopAlignment); 5586 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 5587 __ cmp(first, ch2); 5588 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5589 __ b(DONE); 5590 __ align(OptoLoopAlignment); 5591 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 5592 if (str2_isL) { // LL 5593 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5594 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5595 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5596 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5597 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5598 } else { 5599 __ mov(ch2, 0xE); // all bits in byte set except last one 5600 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5601 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5602 __ lslv(tmp2, tmp2, tmp4); 5603 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5604 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5605 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5606 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5607 } 5608 __ cmp(ch1, ch2); 5609 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5610 __ b(DONE); 5611 __ align(OptoLoopAlignment); 5612 __ BIND(L_HAS_ZERO); 5613 __ rbit(tmp2, tmp2); 5614 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 5615 // Now, perform compression of counters(cnt2 and cnt1) into one register. 5616 // It's fine because both counters are 32bit and are not changed in this 5617 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 5618 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 5619 __ sub(result, result, 1); 5620 __ BIND(L_HAS_ZERO_LOOP); 5621 __ mov(cnt1, wordSize/str2_chr_size); 5622 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 5623 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 5624 if (str2_isL) { 5625 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 5626 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
5627 __ lslv(tmp2, tmp2, tmp4); 5628 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5629 __ add(tmp4, tmp4, 1); 5630 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5631 __ lsl(tmp2, tmp2, 1); 5632 __ mov(tmp4, wordSize/str2_chr_size); 5633 } else { 5634 __ mov(ch2, 0xE); 5635 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5636 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5637 __ lslv(tmp2, tmp2, tmp4); 5638 __ add(tmp4, tmp4, 1); 5639 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5640 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 5641 __ lsl(tmp2, tmp2, 1); 5642 __ mov(tmp4, wordSize/str2_chr_size); 5643 __ sub(str2, str2, str2_chr_size); 5644 } 5645 __ cmp(ch1, ch2); 5646 __ mov(tmp4, wordSize/str2_chr_size); 5647 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5648 __ BIND(L_CMP_LOOP); 5649 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5650 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5651 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5652 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5653 __ add(tmp4, tmp4, 1); 5654 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 5655 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 5656 __ cmp(cnt1, ch2); 5657 __ br(__ EQ, L_CMP_LOOP); 5658 __ BIND(L_CMP_LOOP_NOMATCH); 5659 // here we're not matched 5660 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 5661 __ clz(tmp4, tmp2); 5662 __ add(str2, str2, str2_chr_size); // advance pointer 5663 __ b(L_HAS_ZERO_LOOP); 5664 __ align(OptoLoopAlignment); 5665 __ BIND(L_CMP_LOOP_LAST_CMP); 5666 __ cmp(cnt1, ch2); 5667 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5668 __ b(DONE); 5669 __ align(OptoLoopAlignment); 5670 __ BIND(L_CMP_LOOP_LAST_CMP2); 5671 if (str2_isL) { 5672 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 5673 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5674 __ lslv(tmp2, tmp2, tmp4); 5675 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5676 __ add(tmp4, tmp4, 1); 5677 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5678 __ lsl(tmp2, tmp2, 1); 5679 } else { 5680 __ mov(ch2, 0xE); 5681 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5682 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5683 __ lslv(tmp2, tmp2, tmp4); 5684 __ add(tmp4, tmp4, 1); 5685 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5686 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 5687 __ lsl(tmp2, tmp2, 1); 5688 __ sub(str2, str2, str2_chr_size); 5689 } 5690 __ cmp(ch1, ch2); 5691 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5692 __ b(DONE); 5693 __ align(OptoLoopAlignment); 5694 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 5695 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 5696 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 5697 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 5698 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 5699 // result by analyzed characters value, so, we can just reset lower bits 5700 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 5701 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 5702 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 5703 // index of last analyzed substring inside current octet. 
So, str2 in at 5704 // respective start address. We need to advance it to next octet 5705 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 5706 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 5707 __ bfm(result, zr, 0, 2 - str2_chr_shift); 5708 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 5709 __ movw(cnt2, cnt2); 5710 __ b(L_LOOP_PROCEED); 5711 __ align(OptoLoopAlignment); 5712 __ BIND(NOMATCH); 5713 __ mov(result, -1); 5714 __ BIND(DONE); 5715 __ pop(spilled_regs, sp); 5716 __ ret(lr); 5717 return entry; 5718 } 5719 5720 void generate_string_indexof_stubs() { 5721 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 5722 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 5723 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 5724 } 5725 5726 void inflate_and_store_2_fp_registers(bool generatePrfm, 5727 FloatRegister src1, FloatRegister src2) { 5728 Register dst = r1; 5729 __ zip1(v1, __ T16B, src1, v0); 5730 __ zip2(v2, __ T16B, src1, v0); 5731 if (generatePrfm) { 5732 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 5733 } 5734 __ zip1(v3, __ T16B, src2, v0); 5735 __ zip2(v4, __ T16B, src2, v0); 5736 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 5737 } 5738 5739 // R0 = src 5740 // R1 = dst 5741 // R2 = len 5742 // R3 = len >> 3 5743 // V0 = 0 5744 // v1 = loaded 8 bytes 5745 address generate_large_byte_array_inflate() { 5746 __ align(CodeEntryAlignment); 5747 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 5748 address entry = __ pc(); 5749 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 5750 Register src = r0, dst = r1, len = r2, octetCounter = r3; 5751 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 5752 5753 // do one more 8-byte read to have address 16-byte aligned in most cases 5754 // also use single store instruction 5755 __ ldrd(v2, __ post(src, 8)); 5756 __ sub(octetCounter, octetCounter, 2); 5757 __ zip1(v1, __ T16B, v1, v0); 5758 __ zip1(v2, __ T16B, v2, v0); 5759 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 5760 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 5761 __ subs(rscratch1, octetCounter, large_loop_threshold); 5762 __ br(__ LE, LOOP_START); 5763 __ b(LOOP_PRFM_START); 5764 __ bind(LOOP_PRFM); 5765 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 5766 __ bind(LOOP_PRFM_START); 5767 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 5768 __ sub(octetCounter, octetCounter, 8); 5769 __ subs(rscratch1, octetCounter, large_loop_threshold); 5770 inflate_and_store_2_fp_registers(true, v3, v4); 5771 inflate_and_store_2_fp_registers(true, v5, v6); 5772 __ br(__ GT, LOOP_PRFM); 5773 __ cmp(octetCounter, (u1)8); 5774 __ br(__ LT, DONE); 5775 __ bind(LOOP); 5776 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 5777 __ bind(LOOP_START); 5778 __ sub(octetCounter, octetCounter, 8); 5779 __ cmp(octetCounter, (u1)8); 5780 inflate_and_store_2_fp_registers(false, v3, v4); 5781 inflate_and_store_2_fp_registers(false, v5, v6); 5782 __ br(__ GE, LOOP); 5783 __ bind(DONE); 5784 __ ret(lr); 5785 return entry; 5786 } 5787 5788 /** 5789 * Arguments: 5790 * 5791 * Input: 5792 * c_rarg0 - current state address 5793 * c_rarg1 - H key address 5794 * c_rarg2 - data address 5795 * c_rarg3 - number of blocks 5796 * 5797 * Output: 5798 * Updated state at c_rarg0 5799 */ 5800 address 
generate_ghash_processBlocks() { 5801 // Bafflingly, GCM uses little-endian for the byte order, but 5802 // big-endian for the bit order. For example, the polynomial 1 is 5803 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 5804 // 5805 // So, we must either reverse the bytes in each word and do 5806 // everything big-endian or reverse the bits in each byte and do 5807 // it little-endian. On AArch64 it's more idiomatic to reverse 5808 // the bits in each byte (we have an instruction, RBIT, to do 5809 // that) and keep the data in little-endian bit order throught the 5810 // calculation, bit-reversing the inputs and outputs. 5811 5812 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 5813 __ align(wordSize * 2); 5814 address p = __ pc(); 5815 __ emit_int64(0x87); // The low-order bits of the field 5816 // polynomial (i.e. p = z^7+z^2+z+1) 5817 // repeated in the low and high parts of a 5818 // 128-bit vector 5819 __ emit_int64(0x87); 5820 5821 __ align(CodeEntryAlignment); 5822 address start = __ pc(); 5823 5824 Register state = c_rarg0; 5825 Register subkeyH = c_rarg1; 5826 Register data = c_rarg2; 5827 Register blocks = c_rarg3; 5828 5829 FloatRegister vzr = v30; 5830 __ eor(vzr, __ T16B, vzr, vzr); // zero register 5831 5832 __ ldrq(v0, Address(state)); 5833 __ ldrq(v1, Address(subkeyH)); 5834 5835 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 5836 __ rbit(v0, __ T16B, v0); 5837 __ rev64(v1, __ T16B, v1); 5838 __ rbit(v1, __ T16B, v1); 5839 5840 __ ldrq(v26, p); 5841 5842 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 5843 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 5844 5845 { 5846 Label L_ghash_loop; 5847 __ bind(L_ghash_loop); 5848 5849 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 5850 // reversing each byte 5851 __ rbit(v2, __ T16B, v2); 5852 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 5853 5854 // Multiply state in v2 by subkey in v1 5855 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 5856 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 5857 /*temps*/v6, v20, v18, v21); 5858 // Reduce v7:v5 by the field polynomial 5859 ghash_reduce(v0, v5, v7, v26, vzr, v20); 5860 5861 __ sub(blocks, blocks, 1); 5862 __ cbnz(blocks, L_ghash_loop); 5863 } 5864 5865 // The bit-reversed result is at this point in v0 5866 __ rev64(v1, __ T16B, v0); 5867 __ rbit(v1, __ T16B, v1); 5868 5869 __ st1(v1, __ T16B, state); 5870 __ ret(lr); 5871 5872 return start; 5873 } 5874 5875 void generate_base64_encode_simdround(Register src, Register dst, 5876 FloatRegister codec, u8 size) { 5877 5878 FloatRegister in0 = v4, in1 = v5, in2 = v6; 5879 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 5880 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 5881 5882 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 5883 5884 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 5885 5886 __ ushr(ind0, arrangement, in0, 2); 5887 5888 __ ushr(ind1, arrangement, in1, 2); 5889 __ shl(in0, arrangement, in0, 6); 5890 __ orr(ind1, arrangement, ind1, in0); 5891 __ ushr(ind1, arrangement, ind1, 2); 5892 5893 __ ushr(ind2, arrangement, in2, 4); 5894 __ shl(in1, arrangement, in1, 4); 5895 __ orr(ind2, arrangement, in1, ind2); 5896 __ ushr(ind2, arrangement, ind2, 2); 5897 5898 __ shl(ind3, arrangement, in2, 2); 5899 __ ushr(ind3, arrangement, ind3, 2); 5900 5901 __ tbl(out0, arrangement, codec, 4, ind0); 5902 __ tbl(out1, arrangement, codec, 4, ind1); 5903 __ tbl(out2, arrangement, codec, 4, ind2); 5904 __ tbl(out3, arrangement, codec, 4, ind3); 5905 5906 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 5907 } 5908 5909 /** 5910 * Arguments: 5911 * 5912 * Input: 5913 * c_rarg0 - src_start 5914 * c_rarg1 - src_offset 5915 * c_rarg2 - src_length 5916 * c_rarg3 - dest_start 5917 * c_rarg4 - dest_offset 5918 * c_rarg5 - isURL 5919 * 5920 */ 5921 address generate_base64_encodeBlock() { 5922 5923 static const char toBase64[64] = { 5924 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 5925 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 5926 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 5927 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 5928 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 5929 }; 5930 5931 static const char toBase64URL[64] = { 5932 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 5933 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 5934 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 5935 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 5936 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 5937 }; 5938 5939 __ align(CodeEntryAlignment); 5940 StubCodeMark mark(this, "StubRoutines", "encodeBlock"); 5941 address start = __ pc(); 5942 5943 Register src = c_rarg0; // source array 5944 Register soff = c_rarg1; // source start offset 5945 Register send = c_rarg2; // source end offset 5946 Register dst = c_rarg3; // dest array 5947 Register doff = c_rarg4; // position for writing to dest array 5948 Register isURL = c_rarg5; // Base64 or URL character set 5949 5950 // c_rarg6 and c_rarg7 are free to use as temps 5951 Register codec = c_rarg6; 5952 Register length = c_rarg7; 5953 5954 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 5955 5956 __ add(src, src, soff); 5957 __ add(dst, dst, doff); 5958 __ sub(length, send, soff); 5959 5960 // load the codec base address 5961 __ lea(codec, ExternalAddress((address) toBase64)); 5962 __ cbz(isURL, ProcessData); 5963 __ lea(codec, ExternalAddress((address) toBase64URL)); 5964 5965 __ BIND(ProcessData); 5966 5967 // too short to form a SIMD loop, fall back to the 3-byte scalar loop 5968 __ cmp(length, (u1)24); 5969 __ br(Assembler::LT, Process3B); 5970 5971 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 5972 5973 __ BIND(Process48B); 5974 __ cmp(length, (u1)48); 5975 __ br(Assembler::LT, Process24B); 5976 generate_base64_encode_simdround(src, dst, v0, 16); 5977 __ sub(length, length, 48); 5978 __ b(Process48B); 5979 5980 __ BIND(Process24B); 5981 __ cmp(length, (u1)24); 5982 __ br(Assembler::LT, SIMDExit); 5983 generate_base64_encode_simdround(src, dst, v0, 8); 5984 __ sub(length, length, 24); 5985 5986 __ BIND(SIMDExit); 5987 __ cbz(length, Exit); 5988 5989 __
BIND(Process3B); 5990 // 3 src bytes, 24 bits 5991 __ ldrb(r10, __ post(src, 1)); 5992 __ ldrb(r11, __ post(src, 1)); 5993 __ ldrb(r12, __ post(src, 1)); 5994 __ orrw(r11, r11, r10, Assembler::LSL, 8); 5995 __ orrw(r12, r12, r11, Assembler::LSL, 8); 5996 // codec index 5997 __ ubfmw(r15, r12, 18, 23); 5998 __ ubfmw(r14, r12, 12, 17); 5999 __ ubfmw(r13, r12, 6, 11); 6000 __ andw(r12, r12, 63); 6001 // get the code based on the codec 6002 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 6003 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 6004 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 6005 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 6006 __ strb(r15, __ post(dst, 1)); 6007 __ strb(r14, __ post(dst, 1)); 6008 __ strb(r13, __ post(dst, 1)); 6009 __ strb(r12, __ post(dst, 1)); 6010 __ sub(length, length, 3); 6011 __ cbnz(length, Process3B); 6012 6013 __ BIND(Exit); 6014 __ ret(lr); 6015 6016 return start; 6017 } 6018 6019 void generate_base64_decode_simdround(Register src, Register dst, 6020 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 6021 6022 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 6023 FloatRegister out0 = v20, out1 = v21, out2 = v22; 6024 6025 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 6026 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 6027 6028 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 6029 6030 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 6031 6032 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 6033 6034 // we need unsigned saturating substract, to make sure all input values 6035 // in range [0, 63] will have 0U value in the higher half lookup 6036 __ uqsubv(decH0, __ T16B, in0, v27); 6037 __ uqsubv(decH1, __ T16B, in1, v27); 6038 __ uqsubv(decH2, __ T16B, in2, v27); 6039 __ uqsubv(decH3, __ T16B, in3, v27); 6040 6041 // lower half lookup 6042 __ tbl(decL0, arrangement, codecL, 4, in0); 6043 __ tbl(decL1, arrangement, codecL, 4, in1); 6044 __ tbl(decL2, arrangement, codecL, 4, in2); 6045 __ tbl(decL3, arrangement, codecL, 4, in3); 6046 6047 // higher half lookup 6048 __ tbx(decH0, arrangement, codecH, 4, decH0); 6049 __ tbx(decH1, arrangement, codecH, 4, decH1); 6050 __ tbx(decH2, arrangement, codecH, 4, decH2); 6051 __ tbx(decH3, arrangement, codecH, 4, decH3); 6052 6053 // combine lower and higher 6054 __ orr(decL0, arrangement, decL0, decH0); 6055 __ orr(decL1, arrangement, decL1, decH1); 6056 __ orr(decL2, arrangement, decL2, decH2); 6057 __ orr(decL3, arrangement, decL3, decH3); 6058 6059 // check illegal inputs, value larger than 63 (maximum of 6 bits) 6060 __ cmhi(decH0, arrangement, decL0, v27); 6061 __ cmhi(decH1, arrangement, decL1, v27); 6062 __ cmhi(decH2, arrangement, decL2, v27); 6063 __ cmhi(decH3, arrangement, decL3, v27); 6064 __ orr(in0, arrangement, decH0, decH1); 6065 __ orr(in1, arrangement, decH2, decH3); 6066 __ orr(in2, arrangement, in0, in1); 6067 __ umaxv(in3, arrangement, in2); 6068 __ umov(rscratch2, in3, __ B, 0); 6069 6070 // get the data to output 6071 __ shl(out0, arrangement, decL0, 2); 6072 __ ushr(out1, arrangement, decL1, 4); 6073 __ orr(out0, arrangement, out0, out1); 6074 __ shl(out1, arrangement, decL1, 4); 6075 __ ushr(out2, arrangement, decL2, 2); 6076 __ orr(out1, arrangement, out1, out2); 6077 __ shl(out2, arrangement, decL2, 6); 6078 __ orr(out2, arrangement, out2, decL3); 6079 6080 __ cbz(rscratch2, NoIllegalData); 6081 6082 // handle illegal input 6083 __ umov(r10, in2, __ 
D, 0); 6084 if (size == 16) { 6085 __ cbnz(r10, ErrorInLowerHalf); 6086 6087 // illegal input is in higher half, store the lower half now. 6088 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 6089 6090 __ umov(r10, in2, __ D, 1); 6091 __ umov(r11, out0, __ D, 1); 6092 __ umov(r12, out1, __ D, 1); 6093 __ umov(r13, out2, __ D, 1); 6094 __ b(StoreLegalData); 6095 6096 __ BIND(ErrorInLowerHalf); 6097 } 6098 __ umov(r11, out0, __ D, 0); 6099 __ umov(r12, out1, __ D, 0); 6100 __ umov(r13, out2, __ D, 0); 6101 6102 __ BIND(StoreLegalData); 6103 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 6104 __ strb(r11, __ post(dst, 1)); 6105 __ strb(r12, __ post(dst, 1)); 6106 __ strb(r13, __ post(dst, 1)); 6107 __ lsr(r10, r10, 8); 6108 __ lsr(r11, r11, 8); 6109 __ lsr(r12, r12, 8); 6110 __ lsr(r13, r13, 8); 6111 __ b(StoreLegalData); 6112 6113 __ BIND(NoIllegalData); 6114 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 6115 } 6116 6117 6118 /** 6119 * Arguments: 6120 * 6121 * Input: 6122 * c_rarg0 - src_start 6123 * c_rarg1 - src_offset 6124 * c_rarg2 - src_length 6125 * c_rarg3 - dest_start 6126 * c_rarg4 - dest_offset 6127 * c_rarg5 - isURL 6128 * c_rarg6 - isMIME 6129 * 6130 */ 6131 address generate_base64_decodeBlock() { 6132 6133 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 6134 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 6135 // titled "Base64 decoding". 6136 6137 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 6138 // except the trailing character '=' is also treated illegal value in this instrinsic. That 6139 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 6140 static const uint8_t fromBase64ForNoSIMD[256] = { 6141 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6142 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6143 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6144 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6145 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6146 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 6147 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6148 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6149 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6150 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6151 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6152 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6153 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6154 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6155 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6156 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6157 }; 6158 6159 static const uint8_t fromBase64URLForNoSIMD[256] = { 6160 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6161 255u, 
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6162 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6163 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6164 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6165 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 6166 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6167 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6168 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6169 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6170 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6171 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6172 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6173 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6174 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6175 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6176 }; 6177 6178 // A legal Base64 code value is in the range [0, 127]. We need two table 6179 // lookups with tbl/tbx and combine the results to get the decoded data. The 1st 6180 // lookup uses tbl, so out-of-range indices are set to 0 in the destination. The 2nd 6181 // lookup uses tbx, so out-of-range indices are left unchanged in the 6182 // destination. Input [64..126] is mapped to index [65, 127] in the second lookup. 6183 // The value at index 64 is set to 0, so that we know that we already got the 6184 // decoded data with the 1st lookup.
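    // An illustrative scalar sketch (not generated code, names are for exposition
    // only) of how the tbl/tbx pair combines the two 64-byte halves of the
    // 128-byte table below for a single input byte c:
    //
    //   uint8_t decode_one(uint8_t c, const uint8_t* tab /* 128 entries */) {
    //     uint8_t lo     = (c < 64) ? tab[c] : 0;             // tbl: out of range -> 0
    //     uint8_t hi_idx = (c > 63) ? (uint8_t)(c - 63) : 0;  // uqsub(c, 63)
    //     uint8_t hi     = (hi_idx < 64) ? tab[64 + hi_idx]
    //                                    : hi_idx;            // tbx: out of range -> unchanged
    //     return lo | hi;  // a result > 63 (e.g. 255u) marks illegal input
    //   }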
6185 static const uint8_t fromBase64ForSIMD[128] = { 6186 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6187 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6188 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6189 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6190 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6191 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6192 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6193 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6194 }; 6195 6196 static const uint8_t fromBase64URLForSIMD[128] = { 6197 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6198 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6199 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6200 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6201 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6202 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6203 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6204 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6205 }; 6206 6207 __ align(CodeEntryAlignment); 6208 StubCodeMark mark(this, "StubRoutines", "decodeBlock"); 6209 address start = __ pc(); 6210 6211 Register src = c_rarg0; // source array 6212 Register soff = c_rarg1; // source start offset 6213 Register send = c_rarg2; // source end offset 6214 Register dst = c_rarg3; // dest array 6215 Register doff = c_rarg4; // position for writing to dest array 6216 Register isURL = c_rarg5; // Base64 or URL character set 6217 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 6218 6219 Register length = send; // reuse send as length of source data to process 6220 6221 Register simd_codec = c_rarg6; 6222 Register nosimd_codec = c_rarg7; 6223 6224 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 6225 6226 __ enter(); 6227 6228 __ add(src, src, soff); 6229 __ add(dst, dst, doff); 6230 6231 __ mov(doff, dst); 6232 6233 __ sub(length, send, soff); 6234 __ bfm(length, zr, 0, 1); 6235 6236 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 6237 __ cbz(isURL, ProcessData); 6238 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 6239 6240 __ BIND(ProcessData); 6241 __ mov(rscratch1, length); 6242 __ cmp(length, (u1)144); // 144 = 80 + 64 6243 __ br(Assembler::LT, Process4B); 6244 6245 // In the MIME case, the line length cannot be more than 76 6246 // bytes (see RFC 2045). This is too short a block for SIMD 6247 // to be worthwhile, so we use non-SIMD here. 
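    // A note on the constant loaded just below: rscratch1 is the trip counter of
    // the Process4B loop, so starting it at 79 makes that loop consume exactly
    // 80 bytes (79, 75, ..., 3, -1) and exit with rscratch1 == -1, while the
    // plain scalar path starts it at length and exits with exactly 0; this is
    // the -1/0 distinction that the comment after the loop and the cbzw test
    // rely on.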
6248 __ movw(rscratch1, 79); 6249 6250 __ BIND(Process4B); 6251 __ ldrw(r14, __ post(src, 4)); 6252 __ ubfxw(r10, r14, 0, 8); 6253 __ ubfxw(r11, r14, 8, 8); 6254 __ ubfxw(r12, r14, 16, 8); 6255 __ ubfxw(r13, r14, 24, 8); 6256 // get the de-code 6257 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 6258 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 6259 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 6260 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 6261 // error detection, 255u indicates an illegal input 6262 __ orrw(r14, r10, r11); 6263 __ orrw(r15, r12, r13); 6264 __ orrw(r14, r14, r15); 6265 __ tbnz(r14, 7, Exit); 6266 // recover the data 6267 __ lslw(r14, r10, 10); 6268 __ bfiw(r14, r11, 4, 6); 6269 __ bfmw(r14, r12, 2, 5); 6270 __ rev16w(r14, r14); 6271 __ bfiw(r13, r12, 6, 2); 6272 __ strh(r14, __ post(dst, 2)); 6273 __ strb(r13, __ post(dst, 1)); 6274 // non-simd loop 6275 __ subsw(rscratch1, rscratch1, 4); 6276 __ br(Assembler::GT, Process4B); 6277 6278 // if exiting from PreProcess80B, rscratch1 == -1; 6279 // otherwise, rscratch1 == 0. 6280 __ cbzw(rscratch1, Exit); 6281 __ sub(length, length, 80); 6282 6283 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 6284 __ cbz(isURL, SIMDEnter); 6285 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 6286 6287 __ BIND(SIMDEnter); 6288 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 6289 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 6290 __ mov(rscratch1, 63); 6291 __ dup(v27, __ T16B, rscratch1); 6292 6293 __ BIND(Process64B); 6294 __ cmp(length, (u1)64); 6295 __ br(Assembler::LT, Process32B); 6296 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 6297 __ sub(length, length, 64); 6298 __ b(Process64B); 6299 6300 __ BIND(Process32B); 6301 __ cmp(length, (u1)32); 6302 __ br(Assembler::LT, SIMDExit); 6303 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 6304 __ sub(length, length, 32); 6305 __ b(Process32B); 6306 6307 __ BIND(SIMDExit); 6308 __ cbz(length, Exit); 6309 __ movw(rscratch1, length); 6310 __ b(Process4B); 6311 6312 __ BIND(Exit); 6313 __ sub(c_rarg0, dst, doff); 6314 6315 __ leave(); 6316 __ ret(lr); 6317 6318 return start; 6319 } 6320 6321 address generate_ghash_processBlocks_wide() { 6322 address small = generate_ghash_processBlocks(); 6323 6324 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); 6325 __ align(wordSize * 2); 6326 address p = __ pc(); 6327 __ emit_int64(0x87); // The low-order bits of the field 6328 // polynomial (i.e. 
p = z^7+z^2+z+1) 6329 // repeated in the low and high parts of a 6330 // 128-bit vector 6331 __ emit_int64(0x87); 6332 6333 __ align(CodeEntryAlignment); 6334 address start = __ pc(); 6335 6336 Register state = c_rarg0; 6337 Register subkeyH = c_rarg1; 6338 Register data = c_rarg2; 6339 Register blocks = c_rarg3; 6340 6341 const int unroll = 4; 6342 6343 __ cmp(blocks, (unsigned char)(unroll * 2)); 6344 __ br(__ LT, small); 6345 6346 if (unroll > 1) { 6347 // Save state before entering routine 6348 __ sub(sp, sp, 4 * 16); 6349 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 6350 __ sub(sp, sp, 4 * 16); 6351 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 6352 } 6353 6354 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 6355 6356 if (unroll > 1) { 6357 // And restore state 6358 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 6359 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 6360 } 6361 6362 __ cmp(blocks, zr); 6363 __ br(__ GT, small); 6364 6365 __ ret(lr); 6366 6367 return start; 6368 } 6369 6370 // Support for spin waits. 6371 address generate_spin_wait() { 6372 __ align(CodeEntryAlignment); 6373 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 6374 address start = __ pc(); 6375 6376 __ spin_wait(); 6377 __ ret(lr); 6378 6379 return start; 6380 } 6381 6382 #ifdef LINUX 6383 6384 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 6385 // 6386 // If LSE is in use, generate LSE versions of all the stubs. The 6387 // non-LSE versions are in atomic_aarch64.S. 6388 6389 // class AtomicStubMark records the entry point of a stub and the 6390 // stub pointer which will point to it. The stub pointer is set to 6391 // the entry point when ~AtomicStubMark() is called, which must be 6392 // after ICache::invalidate_range. This ensures safe publication of 6393 // the generated code. 6394 class AtomicStubMark { 6395 address _entry_point; 6396 aarch64_atomic_stub_t *_stub; 6397 MacroAssembler *_masm; 6398 public: 6399 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 6400 _masm = masm; 6401 __ align(32); 6402 _entry_point = __ pc(); 6403 _stub = stub; 6404 } 6405 ~AtomicStubMark() { 6406 *_stub = (aarch64_atomic_stub_t)_entry_point; 6407 } 6408 }; 6409 6410 // NB: For memory_order_conservative we need a trailing membar after 6411 // LSE atomic operations but not a leading membar. 6412 // 6413 // We don't need a leading membar because a clause in the Arm ARM 6414 // says: 6415 // 6416 // Barrier-ordered-before 6417 // 6418 // Barrier instructions order prior Memory effects before subsequent 6419 // Memory effects generated by the same Observer. A read or a write 6420 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 6421 // Observer if and only if RW1 appears in program order before RW 2 6422 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 6423 // instruction with both Acquire and Release semantics. 6424 // 6425 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 6426 // and Release semantics, therefore we don't need a leading 6427 // barrier. However, there is no corresponding Barrier-ordered-after 6428 // relationship, therefore we need a trailing membar to prevent a 6429 // later store or load from being reordered with the store in an 6430 // atomic instruction. 
6431 // 6432 // This was checked by using the herd7 consistency model simulator 6433 // (http://diy.inria.fr/) with this test case: 6434 // 6435 // AArch64 LseCas 6436 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 6437 // P0 | P1; 6438 // LDR W4, [X2] | MOV W3, #0; 6439 // DMB LD | MOV W4, #1; 6440 // LDR W3, [X1] | CASAL W3, W4, [X1]; 6441 // | DMB ISH; 6442 // | STR W4, [X2]; 6443 // exists 6444 // (0:X3=0 /\ 0:X4=1) 6445 // 6446 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 6447 // with the store to x in P1. Without the DMB in P1 this may happen. 6448 // 6449 // At the time of writing we don't know of any AArch64 hardware that 6450 // reorders stores in this way, but the Reference Manual permits it. 6451 6452 void gen_cas_entry(Assembler::operand_size size, 6453 atomic_memory_order order) { 6454 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 6455 exchange_val = c_rarg2; 6456 bool acquire, release; 6457 switch (order) { 6458 case memory_order_relaxed: 6459 acquire = false; 6460 release = false; 6461 break; 6462 case memory_order_release: 6463 acquire = false; 6464 release = true; 6465 break; 6466 default: 6467 acquire = true; 6468 release = true; 6469 break; 6470 } 6471 __ mov(prev, compare_val); 6472 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 6473 if (order == memory_order_conservative) { 6474 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6475 } 6476 if (size == Assembler::xword) { 6477 __ mov(r0, prev); 6478 } else { 6479 __ movw(r0, prev); 6480 } 6481 __ ret(lr); 6482 } 6483 6484 void gen_ldaddal_entry(Assembler::operand_size size) { 6485 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6486 __ ldaddal(size, incr, prev, addr); 6487 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6488 if (size == Assembler::xword) { 6489 __ mov(r0, prev); 6490 } else { 6491 __ movw(r0, prev); 6492 } 6493 __ ret(lr); 6494 } 6495 6496 void gen_swpal_entry(Assembler::operand_size size) { 6497 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6498 __ swpal(size, incr, prev, addr); 6499 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6500 if (size == Assembler::xword) { 6501 __ mov(r0, prev); 6502 } else { 6503 __ movw(r0, prev); 6504 } 6505 __ ret(lr); 6506 } 6507 6508 void generate_atomic_entry_points() { 6509 if (! 
UseLSE) { 6510 return; 6511 } 6512 6513 __ align(CodeEntryAlignment); 6514 StubCodeMark mark(this, "StubRoutines", "atomic entry points"); 6515 address first_entry = __ pc(); 6516 6517 // All memory_order_conservative 6518 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 6519 gen_ldaddal_entry(Assembler::word); 6520 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 6521 gen_ldaddal_entry(Assembler::xword); 6522 6523 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 6524 gen_swpal_entry(Assembler::word); 6525 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 6526 gen_swpal_entry(Assembler::xword); 6527 6528 // CAS, memory_order_conservative 6529 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 6530 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 6531 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 6532 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 6533 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 6534 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 6535 6536 // CAS, memory_order_relaxed 6537 AtomicStubMark mark_cmpxchg_1_relaxed 6538 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 6539 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 6540 AtomicStubMark mark_cmpxchg_4_relaxed 6541 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 6542 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 6543 AtomicStubMark mark_cmpxchg_8_relaxed 6544 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 6545 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 6546 6547 AtomicStubMark mark_cmpxchg_4_release 6548 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 6549 gen_cas_entry(MacroAssembler::word, memory_order_release); 6550 AtomicStubMark mark_cmpxchg_8_release 6551 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 6552 gen_cas_entry(MacroAssembler::xword, memory_order_release); 6553 6554 AtomicStubMark mark_cmpxchg_4_seq_cst 6555 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 6556 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 6557 AtomicStubMark mark_cmpxchg_8_seq_cst 6558 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 6559 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 6560 6561 ICache::invalidate_range(first_entry, __ pc() - first_entry); 6562 } 6563 #endif // LINUX 6564 6565 // Continuation point for throwing of implicit exceptions that are 6566 // not handled in the current activation. Fabricates an exception 6567 // oop and initiates normal exception dispatching in this 6568 // frame. Since we need to preserve callee-saved values (currently 6569 // only for C2, but done for C1 as well) we need a callee-saved oop 6570 // map and therefore have to make these stubs into RuntimeStubs 6571 // rather than BufferBlobs. If the compiler needs all registers to 6572 // be preserved between the fault point and the exception handler 6573 // then it must assume responsibility for that in 6574 // AbstractCompiler::continuation_for_implicit_null_exception or 6575 // continuation_for_implicit_division_by_zero_exception. All other 6576 // implicit exceptions (e.g., NullPointerException or 6577 // AbstractMethodError on entry) are either at call sites or 6578 // otherwise assume that stack unwinding will be initiated, so 6579 // caller saved registers were assumed volatile in the compiler. 
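  // For illustration only (a hypothetical call site, not code from this
  // section): a stub produced by generate_throw_exception() below is typically
  // published through a StubRoutines entry point along the lines of
  //
  //   StubRoutines::_throw_StackOverflowError_entry =
  //     generate_throw_exception("StackOverflowError throw_exception",
  //                              CAST_FROM_FN_PTR(address,
  //                                  SharedRuntime::throw_StackOverflowError));
  //
  // so that compiled code can branch to it when an implicit exception has to
  // be raised away from a normal call site.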
6580 6581 #undef __ 6582 #define __ masm-> 6583 6584 address generate_throw_exception(const char* name, 6585 address runtime_entry, 6586 Register arg1 = noreg, 6587 Register arg2 = noreg) { 6588 // Information about frame layout at time of blocking runtime call. 6589 // Note that we only have to preserve callee-saved registers since 6590 // the compilers are responsible for supplying a continuation point 6591 // if they expect all registers to be preserved. 6592 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 6593 enum layout { 6594 rfp_off = 0, 6595 rfp_off2, 6596 return_off, 6597 return_off2, 6598 framesize // inclusive of return address 6599 }; 6600 6601 int insts_size = 512; 6602 int locs_size = 64; 6603 6604 CodeBuffer code(name, insts_size, locs_size); 6605 OopMapSet* oop_maps = new OopMapSet(); 6606 MacroAssembler* masm = new MacroAssembler(&code); 6607 6608 address start = __ pc(); 6609 6610 // This is an inlined and slightly modified version of call_VM 6611 // which has the ability to fetch the return PC out of 6612 // thread-local storage and also sets up last_Java_sp slightly 6613 // differently than the real call_VM 6614 6615 __ enter(); // Save FP and LR before call 6616 6617 assert(is_even(framesize/2), "sp not 16-byte aligned"); 6618 6619 // lr and fp are already in place 6620 __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog 6621 6622 int frame_complete = __ pc() - start; 6623 6624 // Set up last_Java_sp and last_Java_fp 6625 address the_pc = __ pc(); 6626 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 6627 6628 // Call runtime 6629 if (arg1 != noreg) { 6630 assert(arg2 != c_rarg1, "clobbered"); 6631 __ mov(c_rarg1, arg1); 6632 } 6633 if (arg2 != noreg) { 6634 __ mov(c_rarg2, arg2); 6635 } 6636 __ mov(c_rarg0, rthread); 6637 BLOCK_COMMENT("call runtime_entry"); 6638 __ mov(rscratch1, runtime_entry); 6639 __ blr(rscratch1); 6640 6641 // Generate oop map 6642 OopMap* map = new OopMap(framesize, 0); 6643 6644 oop_maps->add_gc_map(the_pc - start, map); 6645 6646 __ reset_last_Java_frame(true); 6647 6648 // Reinitialize the ptrue predicate register, in case the external runtime 6649 // call clobbers ptrue reg, as we may return to SVE compiled code. 
6650 __ reinitialize_ptrue(); 6651 6652 __ leave(); 6653 6654 // check for pending exceptions 6655 #ifdef ASSERT 6656 Label L; 6657 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 6658 __ cbnz(rscratch1, L); 6659 __ should_not_reach_here(); 6660 __ bind(L); 6661 #endif // ASSERT 6662 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 6663 6664 6665 // codeBlob framesize is in words (not VMRegImpl::slot_size) 6666 RuntimeStub* stub = 6667 RuntimeStub::new_runtime_stub(name, 6668 &code, 6669 frame_complete, 6670 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 6671 oop_maps, false); 6672 return stub->entry_point(); 6673 } 6674 6675 class MontgomeryMultiplyGenerator : public MacroAssembler { 6676 6677 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 6678 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 6679 6680 RegSet _toSave; 6681 bool _squaring; 6682 6683 public: 6684 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 6685 : MacroAssembler(as->code()), _squaring(squaring) { 6686 6687 // Register allocation 6688 6689 RegSetIterator<> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 6690 Pa_base = *regs; // Argument registers 6691 if (squaring) 6692 Pb_base = Pa_base; 6693 else 6694 Pb_base = *++regs; 6695 Pn_base = *++regs; 6696 Rlen = *++regs; 6697 inv = *++regs; 6698 Pm_base = *++regs; 6699 6700 // Working registers: 6701 Ra = *++regs; // The current digit of a, b, n, and m. 6702 Rb = *++regs; 6703 Rm = *++regs; 6704 Rn = *++regs; 6705 6706 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 6707 Pb = *++regs; 6708 Pm = *++regs; 6709 Pn = *++regs; 6710 6711 t0 = *++regs; // Three registers which form a 6712 t1 = *++regs; // triple-precision accumulator. 6713 t2 = *++regs; 6714 6715 Ri = *++regs; // Inner and outer loop indexes. 6716 Rj = *++regs; 6717 6718 Rhi_ab = *++regs; // Product registers: low and high parts 6719 Rlo_ab = *++regs; // of a*b and m*n. 6720 Rhi_mn = *++regs; 6721 Rlo_mn = *++regs; 6722 6723 // r19 and up are callee-saved.
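    // Record everything from r19 up to the last register allocated above,
    // plus Pm_base: the generators below repoint Pm_base at a working area
    // and rely on restore_regs() to bring back the caller's value so the
    // final result can be copied into it.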
6724 _toSave = RegSet::range(r19, *regs) + Pm_base; 6725 } 6726 6727 private: 6728 void save_regs() { 6729 push(_toSave, sp); 6730 } 6731 6732 void restore_regs() { 6733 pop(_toSave, sp); 6734 } 6735 6736 template <typename T> 6737 void unroll_2(Register count, T block) { 6738 Label loop, end, odd; 6739 tbnz(count, 0, odd); 6740 cbz(count, end); 6741 align(16); 6742 bind(loop); 6743 (this->*block)(); 6744 bind(odd); 6745 (this->*block)(); 6746 subs(count, count, 2); 6747 br(Assembler::GT, loop); 6748 bind(end); 6749 } 6750 6751 template <typename T> 6752 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 6753 Label loop, end, odd; 6754 tbnz(count, 0, odd); 6755 cbz(count, end); 6756 align(16); 6757 bind(loop); 6758 (this->*block)(d, s, tmp); 6759 bind(odd); 6760 (this->*block)(d, s, tmp); 6761 subs(count, count, 2); 6762 br(Assembler::GT, loop); 6763 bind(end); 6764 } 6765 6766 void pre1(RegisterOrConstant i) { 6767 block_comment("pre1"); 6768 // Pa = Pa_base; 6769 // Pb = Pb_base + i; 6770 // Pm = Pm_base; 6771 // Pn = Pn_base + i; 6772 // Ra = *Pa; 6773 // Rb = *Pb; 6774 // Rm = *Pm; 6775 // Rn = *Pn; 6776 ldr(Ra, Address(Pa_base)); 6777 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 6778 ldr(Rm, Address(Pm_base)); 6779 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 6780 lea(Pa, Address(Pa_base)); 6781 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 6782 lea(Pm, Address(Pm_base)); 6783 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 6784 6785 // Zero the m*n result. 6786 mov(Rhi_mn, zr); 6787 mov(Rlo_mn, zr); 6788 } 6789 6790 // The core multiply-accumulate step of a Montgomery 6791 // multiplication. The idea is to schedule operations as a 6792 // pipeline so that instructions with long latencies (loads and 6793 // multiplies) have time to complete before their results are 6794 // used. This most benefits in-order implementations of the 6795 // architecture but out-of-order ones also benefit. 6796 void step() { 6797 block_comment("step"); 6798 // MACC(Ra, Rb, t0, t1, t2); 6799 // Ra = *++Pa; 6800 // Rb = *--Pb; 6801 umulh(Rhi_ab, Ra, Rb); 6802 mul(Rlo_ab, Ra, Rb); 6803 ldr(Ra, pre(Pa, wordSize)); 6804 ldr(Rb, pre(Pb, -wordSize)); 6805 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 6806 // previous iteration. 6807 // MACC(Rm, Rn, t0, t1, t2); 6808 // Rm = *++Pm; 6809 // Rn = *--Pn; 6810 umulh(Rhi_mn, Rm, Rn); 6811 mul(Rlo_mn, Rm, Rn); 6812 ldr(Rm, pre(Pm, wordSize)); 6813 ldr(Rn, pre(Pn, -wordSize)); 6814 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 6815 } 6816 6817 void post1() { 6818 block_comment("post1"); 6819 6820 // MACC(Ra, Rb, t0, t1, t2); 6821 // Ra = *++Pa; 6822 // Rb = *--Pb; 6823 umulh(Rhi_ab, Ra, Rb); 6824 mul(Rlo_ab, Ra, Rb); 6825 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 6826 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 6827 6828 // *Pm = Rm = t0 * inv; 6829 mul(Rm, t0, inv); 6830 str(Rm, Address(Pm)); 6831 6832 // MACC(Rm, Rn, t0, t1, t2); 6833 // t0 = t1; t1 = t2; t2 = 0; 6834 umulh(Rhi_mn, Rm, Rn); 6835 6836 #ifndef PRODUCT 6837 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 6838 { 6839 mul(Rlo_mn, Rm, Rn); 6840 add(Rlo_mn, t0, Rlo_mn); 6841 Label ok; 6842 cbz(Rlo_mn, ok); { 6843 stop("broken Montgomery multiply"); 6844 } bind(ok); 6845 } 6846 #endif 6847 // We have very carefully set things up so that 6848 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 6849 // the lower half of Rm * Rn because we know the result already: 6850 // it must be -t0. 
t0 + (-t0) must generate a carry iff 6851 // t0 != 0. So, rather than do a mul and an adds we just set 6852 // the carry flag iff t0 is nonzero. 6853 // 6854 // mul(Rlo_mn, Rm, Rn); 6855 // adds(zr, t0, Rlo_mn); 6856 subs(zr, t0, 1); // Set carry iff t0 is nonzero 6857 adcs(t0, t1, Rhi_mn); 6858 adc(t1, t2, zr); 6859 mov(t2, zr); 6860 } 6861 6862 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 6863 block_comment("pre2"); 6864 // Pa = Pa_base + i-len; 6865 // Pb = Pb_base + len; 6866 // Pm = Pm_base + i-len; 6867 // Pn = Pn_base + len; 6868 6869 if (i.is_register()) { 6870 sub(Rj, i.as_register(), len); 6871 } else { 6872 mov(Rj, i.as_constant()); 6873 sub(Rj, Rj, len); 6874 } 6875 // Rj == i-len 6876 6877 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 6878 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 6879 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 6880 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 6881 6882 // Ra = *++Pa; 6883 // Rb = *--Pb; 6884 // Rm = *++Pm; 6885 // Rn = *--Pn; 6886 ldr(Ra, pre(Pa, wordSize)); 6887 ldr(Rb, pre(Pb, -wordSize)); 6888 ldr(Rm, pre(Pm, wordSize)); 6889 ldr(Rn, pre(Pn, -wordSize)); 6890 6891 mov(Rhi_mn, zr); 6892 mov(Rlo_mn, zr); 6893 } 6894 6895 void post2(RegisterOrConstant i, RegisterOrConstant len) { 6896 block_comment("post2"); 6897 if (i.is_constant()) { 6898 mov(Rj, i.as_constant()-len.as_constant()); 6899 } else { 6900 sub(Rj, i.as_register(), len); 6901 } 6902 6903 adds(t0, t0, Rlo_mn); // The pending m*n, low part 6904 6905 // As soon as we know the least significant digit of our result, 6906 // store it. 6907 // Pm_base[i-len] = t0; 6908 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 6909 6910 // t0 = t1; t1 = t2; t2 = 0; 6911 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 6912 adc(t1, t2, zr); 6913 mov(t2, zr); 6914 } 6915 6916 // A carry in t0 after Montgomery multiplication means that we 6917 // should subtract multiples of n from our result in m. We'll 6918 // keep doing that until there is no carry. 6919 void normalize(RegisterOrConstant len) { 6920 block_comment("normalize"); 6921 // while (t0) 6922 // t0 = sub(Pm_base, Pn_base, t0, len); 6923 Label loop, post, again; 6924 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 6925 cbz(t0, post); { 6926 bind(again); { 6927 mov(i, zr); 6928 mov(cnt, len); 6929 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 6930 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 6931 subs(zr, zr, zr); // set carry flag, i.e. no borrow 6932 align(16); 6933 bind(loop); { 6934 sbcs(Rm, Rm, Rn); 6935 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 6936 add(i, i, 1); 6937 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 6938 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 6939 sub(cnt, cnt, 1); 6940 } cbnz(cnt, loop); 6941 sbc(t0, t0, zr); 6942 } cbnz(t0, again); 6943 } bind(post); 6944 } 6945 6946 // Move memory at s to d, reversing words. 
6947 // Increments d to end of copied memory 6948 // Destroys tmp1, tmp2 6949 // Preserves len 6950 // Leaves s pointing to the address which was in d at start 6951 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 6952 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 6953 6954 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 6955 mov(tmp1, len); 6956 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 6957 sub(s, d, len, ext::uxtw, LogBytesPerWord); 6958 } 6959 // where 6960 void reverse1(Register d, Register s, Register tmp) { 6961 ldr(tmp, pre(s, -wordSize)); 6962 ror(tmp, tmp, 32); 6963 str(tmp, post(d, wordSize)); 6964 } 6965 6966 void step_squaring() { 6967 // An extra ACC 6968 step(); 6969 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 6970 } 6971 6972 void last_squaring(RegisterOrConstant i) { 6973 Label dont; 6974 // if ((i & 1) == 0) { 6975 tbnz(i.as_register(), 0, dont); { 6976 // MACC(Ra, Rb, t0, t1, t2); 6977 // Ra = *++Pa; 6978 // Rb = *--Pb; 6979 umulh(Rhi_ab, Ra, Rb); 6980 mul(Rlo_ab, Ra, Rb); 6981 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 6982 } bind(dont); 6983 } 6984 6985 void extra_step_squaring() { 6986 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 6987 6988 // MACC(Rm, Rn, t0, t1, t2); 6989 // Rm = *++Pm; 6990 // Rn = *--Pn; 6991 umulh(Rhi_mn, Rm, Rn); 6992 mul(Rlo_mn, Rm, Rn); 6993 ldr(Rm, pre(Pm, wordSize)); 6994 ldr(Rn, pre(Pn, -wordSize)); 6995 } 6996 6997 void post1_squaring() { 6998 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 6999 7000 // *Pm = Rm = t0 * inv; 7001 mul(Rm, t0, inv); 7002 str(Rm, Address(Pm)); 7003 7004 // MACC(Rm, Rn, t0, t1, t2); 7005 // t0 = t1; t1 = t2; t2 = 0; 7006 umulh(Rhi_mn, Rm, Rn); 7007 7008 #ifndef PRODUCT 7009 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7010 { 7011 mul(Rlo_mn, Rm, Rn); 7012 add(Rlo_mn, t0, Rlo_mn); 7013 Label ok; 7014 cbz(Rlo_mn, ok); { 7015 stop("broken Montgomery multiply"); 7016 } bind(ok); 7017 } 7018 #endif 7019 // We have very carefully set things up so that 7020 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7021 // the lower half of Rm * Rn because we know the result already: 7022 // it must be -t0. t0 + (-t0) must generate a carry iff 7023 // t0 != 0. So, rather than do a mul and an adds we just set 7024 // the carry flag iff t0 is nonzero. 7025 // 7026 // mul(Rlo_mn, Rm, Rn); 7027 // adds(zr, t0, Rlo_mn); 7028 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7029 adcs(t0, t1, Rhi_mn); 7030 adc(t1, t2, zr); 7031 mov(t2, zr); 7032 } 7033 7034 void acc(Register Rhi, Register Rlo, 7035 Register t0, Register t1, Register t2) { 7036 adds(t0, t0, Rlo); 7037 adcs(t1, t1, Rhi); 7038 adc(t2, t2, zr); 7039 } 7040 7041 public: 7042 /** 7043 * Fast Montgomery multiplication. The derivation of the 7044 * algorithm is in A Cryptographic Library for the Motorola 7045 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
7046 * 7047 * Arguments: 7048 * 7049 * Inputs for multiplication: 7050 * c_rarg0 - int array elements a 7051 * c_rarg1 - int array elements b 7052 * c_rarg2 - int array elements n (the modulus) 7053 * c_rarg3 - int length 7054 * c_rarg4 - int inv 7055 * c_rarg5 - int array elements m (the result) 7056 * 7057 * Inputs for squaring: 7058 * c_rarg0 - int array elements a 7059 * c_rarg1 - int array elements n (the modulus) 7060 * c_rarg2 - int length 7061 * c_rarg3 - int inv 7062 * c_rarg4 - int array elements m (the result) 7063 * 7064 */ 7065 address generate_multiply() { 7066 Label argh, nothing; 7067 bind(argh); 7068 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7069 7070 align(CodeEntryAlignment); 7071 address entry = pc(); 7072 7073 cbzw(Rlen, nothing); 7074 7075 enter(); 7076 7077 // Make room. 7078 cmpw(Rlen, 512); 7079 br(Assembler::HI, argh); 7080 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7081 andr(sp, Ra, -2 * wordSize); 7082 7083 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7084 7085 { 7086 // Copy input args, reversing as we go. We use Ra as a 7087 // temporary variable. 7088 reverse(Ra, Pa_base, Rlen, t0, t1); 7089 if (!_squaring) 7090 reverse(Ra, Pb_base, Rlen, t0, t1); 7091 reverse(Ra, Pn_base, Rlen, t0, t1); 7092 } 7093 7094 // Push all call-saved registers and also Pm_base which we'll need 7095 // at the end. 7096 save_regs(); 7097 7098 #ifndef PRODUCT 7099 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 7100 { 7101 ldr(Rn, Address(Pn_base, 0)); 7102 mul(Rlo_mn, Rn, inv); 7103 subs(zr, Rlo_mn, -1); 7104 Label ok; 7105 br(EQ, ok); { 7106 stop("broken inverse in Montgomery multiply"); 7107 } bind(ok); 7108 } 7109 #endif 7110 7111 mov(Pm_base, Ra); 7112 7113 mov(t0, zr); 7114 mov(t1, zr); 7115 mov(t2, zr); 7116 7117 block_comment("for (int i = 0; i < len; i++) {"); 7118 mov(Ri, zr); { 7119 Label loop, end; 7120 cmpw(Ri, Rlen); 7121 br(Assembler::GE, end); 7122 7123 bind(loop); 7124 pre1(Ri); 7125 7126 block_comment(" for (j = i; j; j--) {"); { 7127 movw(Rj, Ri); 7128 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7129 } block_comment(" } // j"); 7130 7131 post1(); 7132 addw(Ri, Ri, 1); 7133 cmpw(Ri, Rlen); 7134 br(Assembler::LT, loop); 7135 bind(end); 7136 block_comment("} // i"); 7137 } 7138 7139 block_comment("for (int i = len; i < 2*len; i++) {"); 7140 mov(Ri, Rlen); { 7141 Label loop, end; 7142 cmpw(Ri, Rlen, Assembler::LSL, 1); 7143 br(Assembler::GE, end); 7144 7145 bind(loop); 7146 pre2(Ri, Rlen); 7147 7148 block_comment(" for (j = len*2-i-1; j; j--) {"); { 7149 lslw(Rj, Rlen, 1); 7150 subw(Rj, Rj, Ri); 7151 subw(Rj, Rj, 1); 7152 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7153 } block_comment(" } // j"); 7154 7155 post2(Ri, Rlen); 7156 addw(Ri, Ri, 1); 7157 cmpw(Ri, Rlen, Assembler::LSL, 1); 7158 br(Assembler::LT, loop); 7159 bind(end); 7160 } 7161 block_comment("} // i"); 7162 7163 normalize(Rlen); 7164 7165 mov(Ra, Pm_base); // Save Pm_base in Ra 7166 restore_regs(); // Restore caller's Pm_base 7167 7168 // Copy our result into caller's Pm_base 7169 reverse(Pm_base, Ra, Rlen, t0, t1); 7170 7171 leave(); 7172 bind(nothing); 7173 ret(lr); 7174 7175 return entry; 7176 } 7177 // In C, approximately: 7178 7179 // void 7180 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 7181 // julong Pn_base[], julong Pm_base[], 7182 // julong inv, int len) { 7183 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7184 // julong *Pa, *Pb, *Pn, *Pm; 7185 // julong Ra, Rb, Rn, Rm; 7186 7187 // 
int i; 7188 7189 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7190 7191 // for (i = 0; i < len; i++) { 7192 // int j; 7193 7194 // Pa = Pa_base; 7195 // Pb = Pb_base + i; 7196 // Pm = Pm_base; 7197 // Pn = Pn_base + i; 7198 7199 // Ra = *Pa; 7200 // Rb = *Pb; 7201 // Rm = *Pm; 7202 // Rn = *Pn; 7203 7204 // int iters = i; 7205 // for (j = 0; iters--; j++) { 7206 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7207 // MACC(Ra, Rb, t0, t1, t2); 7208 // Ra = *++Pa; 7209 // Rb = *--Pb; 7210 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7211 // MACC(Rm, Rn, t0, t1, t2); 7212 // Rm = *++Pm; 7213 // Rn = *--Pn; 7214 // } 7215 7216 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 7217 // MACC(Ra, Rb, t0, t1, t2); 7218 // *Pm = Rm = t0 * inv; 7219 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7220 // MACC(Rm, Rn, t0, t1, t2); 7221 7222 // assert(t0 == 0, "broken Montgomery multiply"); 7223 7224 // t0 = t1; t1 = t2; t2 = 0; 7225 // } 7226 7227 // for (i = len; i < 2*len; i++) { 7228 // int j; 7229 7230 // Pa = Pa_base + i-len; 7231 // Pb = Pb_base + len; 7232 // Pm = Pm_base + i-len; 7233 // Pn = Pn_base + len; 7234 7235 // Ra = *++Pa; 7236 // Rb = *--Pb; 7237 // Rm = *++Pm; 7238 // Rn = *--Pn; 7239 7240 // int iters = len*2-i-1; 7241 // for (j = i-len+1; iters--; j++) { 7242 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7243 // MACC(Ra, Rb, t0, t1, t2); 7244 // Ra = *++Pa; 7245 // Rb = *--Pb; 7246 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7247 // MACC(Rm, Rn, t0, t1, t2); 7248 // Rm = *++Pm; 7249 // Rn = *--Pn; 7250 // } 7251 7252 // Pm_base[i-len] = t0; 7253 // t0 = t1; t1 = t2; t2 = 0; 7254 // } 7255 7256 // while (t0) 7257 // t0 = sub(Pm_base, Pn_base, t0, len); 7258 // } 7259 7260 /** 7261 * Fast Montgomery squaring. This uses asymptotically 25% fewer 7262 * multiplies than Montgomery multiplication so it should be up to 7263 * 25% faster. However, its loop control is more complex and it 7264 * may actually run slower on some machines. 7265 * 7266 * Arguments: 7267 * 7268 * Inputs: 7269 * c_rarg0 - int array elements a 7270 * c_rarg1 - int array elements n (the modulus) 7271 * c_rarg2 - int length 7272 * c_rarg3 - int inv 7273 * c_rarg4 - int array elements m (the result) 7274 * 7275 */ 7276 address generate_square() { 7277 Label argh; 7278 bind(argh); 7279 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7280 7281 align(CodeEntryAlignment); 7282 address entry = pc(); 7283 7284 enter(); 7285 7286 // Make room. 7287 cmpw(Rlen, 512); 7288 br(Assembler::HI, argh); 7289 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7290 andr(sp, Ra, -2 * wordSize); 7291 7292 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7293 7294 { 7295 // Copy input args, reversing as we go. We use Ra as a 7296 // temporary variable. 7297 reverse(Ra, Pa_base, Rlen, t0, t1); 7298 reverse(Ra, Pn_base, Rlen, t0, t1); 7299 } 7300 7301 // Push all call-saved registers and also Pm_base which we'll need 7302 // at the end. 
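    // (restore_regs() below brings back the caller's Pm_base so the
    // reversed result can be copied into the caller's array at the end.)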
7303 save_regs(); 7304 7305 mov(Pm_base, Ra); 7306 7307 mov(t0, zr); 7308 mov(t1, zr); 7309 mov(t2, zr); 7310 7311 block_comment("for (int i = 0; i < len; i++) {"); 7312 mov(Ri, zr); { 7313 Label loop, end; 7314 bind(loop); 7315 cmp(Ri, Rlen); 7316 br(Assembler::GE, end); 7317 7318 pre1(Ri); 7319 7320 block_comment("for (j = (i+1)/2; j; j--) {"); { 7321 add(Rj, Ri, 1); 7322 lsr(Rj, Rj, 1); 7323 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 7324 } block_comment(" } // j"); 7325 7326 last_squaring(Ri); 7327 7328 block_comment(" for (j = i/2; j; j--) {"); { 7329 lsr(Rj, Ri, 1); 7330 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 7331 } block_comment(" } // j"); 7332 7333 post1_squaring(); 7334 add(Ri, Ri, 1); 7335 cmp(Ri, Rlen); 7336 br(Assembler::LT, loop); 7337 7338 bind(end); 7339 block_comment("} // i"); 7340 } 7341 7342 block_comment("for (int i = len; i < 2*len; i++) {"); 7343 mov(Ri, Rlen); { 7344 Label loop, end; 7345 bind(loop); 7346 cmp(Ri, Rlen, Assembler::LSL, 1); 7347 br(Assembler::GE, end); 7348 7349 pre2(Ri, Rlen); 7350 7351 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 7352 lsl(Rj, Rlen, 1); 7353 sub(Rj, Rj, Ri); 7354 sub(Rj, Rj, 1); 7355 lsr(Rj, Rj, 1); 7356 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 7357 } block_comment(" } // j"); 7358 7359 last_squaring(Ri); 7360 7361 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 7362 lsl(Rj, Rlen, 1); 7363 sub(Rj, Rj, Ri); 7364 lsr(Rj, Rj, 1); 7365 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 7366 } block_comment(" } // j"); 7367 7368 post2(Ri, Rlen); 7369 add(Ri, Ri, 1); 7370 cmp(Ri, Rlen, Assembler::LSL, 1); 7371 7372 br(Assembler::LT, loop); 7373 bind(end); 7374 block_comment("} // i"); 7375 } 7376 7377 normalize(Rlen); 7378 7379 mov(Ra, Pm_base); // Save Pm_base in Ra 7380 restore_regs(); // Restore caller's Pm_base 7381 7382 // Copy our result into caller's Pm_base 7383 reverse(Pm_base, Ra, Rlen, t0, t1); 7384 7385 leave(); 7386 ret(lr); 7387 7388 return entry; 7389 } 7390 // In C, approximately: 7391 7392 // void 7393 // montgomery_square(julong Pa_base[], julong Pn_base[], 7394 // julong Pm_base[], julong inv, int len) { 7395 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7396 // julong *Pa, *Pb, *Pn, *Pm; 7397 // julong Ra, Rb, Rn, Rm; 7398 7399 // int i; 7400 7401 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7402 7403 // for (i = 0; i < len; i++) { 7404 // int j; 7405 7406 // Pa = Pa_base; 7407 // Pb = Pa_base + i; 7408 // Pm = Pm_base; 7409 // Pn = Pn_base + i; 7410 7411 // Ra = *Pa; 7412 // Rb = *Pb; 7413 // Rm = *Pm; 7414 // Rn = *Pn; 7415 7416 // int iters = (i+1)/2; 7417 // for (j = 0; iters--; j++) { 7418 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 7419 // MACC2(Ra, Rb, t0, t1, t2); 7420 // Ra = *++Pa; 7421 // Rb = *--Pb; 7422 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7423 // MACC(Rm, Rn, t0, t1, t2); 7424 // Rm = *++Pm; 7425 // Rn = *--Pn; 7426 // } 7427 // if ((i & 1) == 0) { 7428 // assert(Ra == Pa_base[j], "must be"); 7429 // MACC(Ra, Ra, t0, t1, t2); 7430 // } 7431 // iters = i/2; 7432 // assert(iters == i-j, "must be"); 7433 // for (; iters--; j++) { 7434 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7435 // MACC(Rm, Rn, t0, t1, t2); 7436 // Rm = *++Pm; 7437 // Rn = *--Pn; 7438 // } 7439 7440 // *Pm = Rm = t0 * inv; 7441 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7442 // MACC(Rm, Rn, t0, t1, t2); 7443 7444 // 
assert(t0 == 0, "broken Montgomery multiply"); 7445 7446 // t0 = t1; t1 = t2; t2 = 0; 7447 // } 7448 7449 // for (i = len; i < 2*len; i++) { 7450 // int start = i-len+1; 7451 // int end = start + (len - start)/2; 7452 // int j; 7453 7454 // Pa = Pa_base + i-len; 7455 // Pb = Pa_base + len; 7456 // Pm = Pm_base + i-len; 7457 // Pn = Pn_base + len; 7458 7459 // Ra = *++Pa; 7460 // Rb = *--Pb; 7461 // Rm = *++Pm; 7462 // Rn = *--Pn; 7463 7464 // int iters = (2*len-i-1)/2; 7465 // assert(iters == end-start, "must be"); 7466 // for (j = start; iters--; j++) { 7467 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 7468 // MACC2(Ra, Rb, t0, t1, t2); 7469 // Ra = *++Pa; 7470 // Rb = *--Pb; 7471 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7472 // MACC(Rm, Rn, t0, t1, t2); 7473 // Rm = *++Pm; 7474 // Rn = *--Pn; 7475 // } 7476 // if ((i & 1) == 0) { 7477 // assert(Ra == Pa_base[j], "must be"); 7478 // MACC(Ra, Ra, t0, t1, t2); 7479 // } 7480 // iters = (2*len-i)/2; 7481 // assert(iters == len-j, "must be"); 7482 // for (; iters--; j++) { 7483 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7484 // MACC(Rm, Rn, t0, t1, t2); 7485 // Rm = *++Pm; 7486 // Rn = *--Pn; 7487 // } 7488 // Pm_base[i-len] = t0; 7489 // t0 = t1; t1 = t2; t2 = 0; 7490 // } 7491 7492 // while (t0) 7493 // t0 = sub(Pm_base, Pn_base, t0, len); 7494 // } 7495 }; 7496 7497 7498 // Initialization 7499 void generate_initial() { 7500 // Generate initial stubs and initialize the entry points 7501 7502 // entry points that exist in all platforms. Note: This is code 7503 // that could be shared among different platforms - however the 7504 // benefit seems to be smaller than the disadvantage of having a 7505 // much more complicated generator structure. See also comment in 7506 // stubRoutines.hpp. 7507 7508 StubRoutines::_forward_exception_entry = generate_forward_exception(); 7509 7510 StubRoutines::_call_stub_entry = 7511 generate_call_stub(StubRoutines::_call_stub_return_address); 7512 7513 // is referenced by megamorphic call 7514 StubRoutines::_catch_exception_entry = generate_catch_exception(); 7515 7516 // Build this early so it's available for the interpreter. 7517 StubRoutines::_throw_StackOverflowError_entry = 7518 generate_throw_exception("StackOverflowError throw_exception", 7519 CAST_FROM_FN_PTR(address, 7520 SharedRuntime::throw_StackOverflowError)); 7521 StubRoutines::_throw_delayed_StackOverflowError_entry = 7522 generate_throw_exception("delayed StackOverflowError throw_exception", 7523 CAST_FROM_FN_PTR(address, 7524 SharedRuntime::throw_delayed_StackOverflowError)); 7525 if (UseCRC32Intrinsics) { 7526 // set table address before generating stubs which use it 7527 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 7528 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 7529 } 7530 7531 if (UseCRC32CIntrinsics) { 7532 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 7533 } 7534 7535 // Disabled until JDK-8210858 is fixed 7536 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 7537 // StubRoutines::_dlog = generate_dlog(); 7538 // } 7539 7540 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 7541 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 7542 } 7543 7544 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 7545 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 7546 } 7547 7548 // Safefetch stubs.
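    // SafeFetch32/SafeFetchN load from an address that may be unmapped; if
    // the load faults, the signal handler resumes at the recorded
    // continuation pc, which returns the caller-supplied default value
    // instead.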
7549 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 7550 &StubRoutines::_safefetch32_fault_pc, 7551 &StubRoutines::_safefetch32_continuation_pc); 7552 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 7553 &StubRoutines::_safefetchN_fault_pc, 7554 &StubRoutines::_safefetchN_continuation_pc); 7555 } 7556 7557 void generate_all() { 7558 // support for verify_oop (must happen after universe_init) 7559 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 7560 StubRoutines::_throw_AbstractMethodError_entry = 7561 generate_throw_exception("AbstractMethodError throw_exception", 7562 CAST_FROM_FN_PTR(address, 7563 SharedRuntime:: 7564 throw_AbstractMethodError)); 7565 7566 StubRoutines::_throw_IncompatibleClassChangeError_entry = 7567 generate_throw_exception("IncompatibleClassChangeError throw_exception", 7568 CAST_FROM_FN_PTR(address, 7569 SharedRuntime:: 7570 throw_IncompatibleClassChangeError)); 7571 7572 StubRoutines::_throw_NullPointerException_at_call_entry = 7573 generate_throw_exception("NullPointerException at call throw_exception", 7574 CAST_FROM_FN_PTR(address, 7575 SharedRuntime:: 7576 throw_NullPointerException_at_call)); 7577 7578 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices"); 7579 7580 // arraycopy stubs used by compilers 7581 generate_arraycopy_stubs(); 7582 7583 // has negatives stub for large arrays. 7584 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 7585 7586 // array equals stub for large arrays. 7587 if (!UseSimpleArrayEquals) { 7588 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 7589 } 7590 7591 generate_compare_long_strings(); 7592 7593 generate_string_indexof_stubs(); 7594 7595 // byte_array_inflate stub for large arrays. 7596 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 7597 7598 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 7599 if (bs_nm != NULL) { 7600 StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier(); 7601 } 7602 #ifdef COMPILER2 7603 if (UseMultiplyToLenIntrinsic) { 7604 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 7605 } 7606 7607 if (UseSquareToLenIntrinsic) { 7608 StubRoutines::_squareToLen = generate_squareToLen(); 7609 } 7610 7611 if (UseMulAddIntrinsic) { 7612 StubRoutines::_mulAdd = generate_mulAdd(); 7613 } 7614 7615 if (UseSIMDForBigIntegerShiftIntrinsics) { 7616 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 7617 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 7618 } 7619 7620 if (UseMontgomeryMultiplyIntrinsic) { 7621 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 7622 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 7623 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 7624 } 7625 7626 if (UseMontgomerySquareIntrinsic) { 7627 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 7628 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 7629 // We use generate_multiply() rather than generate_square() 7630 // because it's faster for the sizes of modulus we care about. 
7631 StubRoutines::_montgomerySquare = g.generate_multiply(); 7632 } 7633 #endif // COMPILER2 7634 7635 // generate GHASH intrinsics code 7636 if (UseGHASHIntrinsics) { 7637 if (UseAESCTRIntrinsics) { 7638 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 7639 } else { 7640 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 7641 } 7642 } 7643 7644 if (UseBASE64Intrinsics) { 7645 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 7646 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 7647 } 7648 7649 // data cache line writeback 7650 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 7651 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 7652 7653 if (UseAESIntrinsics) { 7654 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 7655 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 7656 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 7657 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 7658 } 7659 7660 if (UseAESCTRIntrinsics) { 7661 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 7662 } 7663 7664 if (UseMD5Intrinsics) { 7665 StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress"); 7666 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB"); 7667 } 7668 if (UseSHA1Intrinsics) { 7669 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 7670 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 7671 } 7672 if (UseSHA256Intrinsics) { 7673 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 7674 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 7675 } 7676 if (UseSHA512Intrinsics) { 7677 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); 7678 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); 7679 } 7680 if (UseSHA3Intrinsics) { 7681 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress"); 7682 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB"); 7683 } 7684 7685 // generate Adler32 intrinsics code 7686 if (UseAdler32Intrinsics) { 7687 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 7688 } 7689 7690 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 7691 7692 #ifdef LINUX 7693 7694 generate_atomic_entry_points(); 7695 7696 #endif // LINUX 7697 7698 StubRoutines::aarch64::set_completed(); 7699 } 7700 7701 public: 7702 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 7703 if (all) { 7704 generate_all(); 7705 } else { 7706 generate_initial(); 7707 } 7708 } 7709 }; // end class declaration 7710 7711 #define UCM_TABLE_MAX_ENTRIES 8 7712 void StubGenerator_generate(CodeBuffer* code, bool all) { 7713 if (UnsafeCopyMemory::_table == NULL) { 7714 UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES); 7715 } 7716 StubGenerator g(code, all); 7717 } 7718 7719 7720 #ifdef LINUX 7721 7722 // Define pointers to atomic stubs and initialize them to point to the 7723 // code in atomic_aarch64.S. 
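// For reference, a sketch of what one instantiation below expands to (the
// exact text of course comes from the preprocessor). DEFAULT_ATOMIC_OP(fetch_add, 4, )
// becomes approximately:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;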
7724 7725 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 7726 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 7727 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 7728 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 7729 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 7730 7731 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 7732 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 7733 DEFAULT_ATOMIC_OP(xchg, 4, ) 7734 DEFAULT_ATOMIC_OP(xchg, 8, ) 7735 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 7736 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 7737 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 7738 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 7739 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 7740 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 7741 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 7742 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 7743 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 7744 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 7745 7746 #undef DEFAULT_ATOMIC_OP 7747 7748 #endif // LINUX