1 /* 2 * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "compiler/oopMap.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "gc/shared/gc_globals.hpp" 34 #include "gc/shared/tlab_globals.hpp" 35 #include "interpreter/interpreter.hpp" 36 #include "memory/universe.hpp" 37 #include "nativeInst_aarch64.hpp" 38 #include "oops/instanceOop.hpp" 39 #include "oops/method.hpp" 40 #include "oops/objArrayKlass.hpp" 41 #include "oops/oop.inline.hpp" 42 #include "prims/methodHandles.hpp" 43 #include "runtime/atomic.hpp" 44 #include "runtime/frame.inline.hpp" 45 #include "runtime/handles.inline.hpp" 46 #include "runtime/sharedRuntime.hpp" 47 #include "runtime/stubCodeGenerator.hpp" 48 #include "runtime/stubRoutines.hpp" 49 #include "runtime/thread.inline.hpp" 50 #include "utilities/align.hpp" 51 #include "utilities/powerOfTwo.hpp" 52 #ifdef COMPILER2 53 #include "opto/runtime.hpp" 54 #endif 55 #if INCLUDE_ZGC 56 #include "gc/z/zThreadLocalData.hpp" 57 #endif 58 59 // Declaration and definition of StubGenerator (no .hpp file). 60 // For a more detailed description of the stub routine structure 61 // see the comment in stubRoutines.hpp 62 63 #undef __ 64 #define __ _masm-> 65 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 
4 : 8)) 66 67 #ifdef PRODUCT 68 #define BLOCK_COMMENT(str) /* nothing */ 69 #else 70 #define BLOCK_COMMENT(str) __ block_comment(str) 71 #endif 72 73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 74 75 // Stub Code definitions 76 77 class StubGenerator: public StubCodeGenerator { 78 private: 79 80 #ifdef PRODUCT 81 #define inc_counter_np(counter) ((void)0) 82 #else 83 void inc_counter_np_(int& counter) { 84 __ lea(rscratch2, ExternalAddress((address)&counter)); 85 __ ldrw(rscratch1, Address(rscratch2)); 86 __ addw(rscratch1, rscratch1, 1); 87 __ strw(rscratch1, Address(rscratch2)); 88 } 89 #define inc_counter_np(counter) \ 90 BLOCK_COMMENT("inc_counter " #counter); \ 91 inc_counter_np_(counter); 92 #endif 93 94 // Call stubs are used to call Java from C 95 // 96 // Arguments: 97 // c_rarg0: call wrapper address address 98 // c_rarg1: result address 99 // c_rarg2: result type BasicType 100 // c_rarg3: method Method* 101 // c_rarg4: (interpreter) entry point address 102 // c_rarg5: parameters intptr_t* 103 // c_rarg6: parameter size (in words) int 104 // c_rarg7: thread Thread* 105 // 106 // There is no return from the stub itself as any Java result 107 // is written to result 108 // 109 // we save r30 (lr) as the return PC at the base of the frame and 110 // link r29 (fp) below it as the frame pointer installing sp (r31) 111 // into fp. 112 // 113 // we save r0-r7, which accounts for all the c arguments. 114 // 115 // TODO: strictly do we need to save them all? they are treated as 116 // volatile by C so could we omit saving the ones we are going to 117 // place in global registers (thread? method?) or those we only use 118 // during setup of the Java call? 119 // 120 // we don't need to save r8 which C uses as an indirect result location 121 // return register. 122 // 123 // we don't need to save r9-r15 which both C and Java treat as 124 // volatile 125 // 126 // we don't need to save r16-18 because Java does not use them 127 // 128 // we save r19-r28 which Java uses as scratch registers and C 129 // expects to be callee-save 130 // 131 // we save the bottom 64 bits of each value stored in v8-v15; it is 132 // the responsibility of the caller to preserve larger values. 133 // 134 // so the stub frame looks like this when we enter Java code 135 // 136 // [ return_from_Java ] <--- sp 137 // [ argument word n ] 138 // ... 
139 // -27 [ argument word 1 ] 140 // -26 [ saved v15 ] <--- sp_after_call 141 // -25 [ saved v14 ] 142 // -24 [ saved v13 ] 143 // -23 [ saved v12 ] 144 // -22 [ saved v11 ] 145 // -21 [ saved v10 ] 146 // -20 [ saved v9 ] 147 // -19 [ saved v8 ] 148 // -18 [ saved r28 ] 149 // -17 [ saved r27 ] 150 // -16 [ saved r26 ] 151 // -15 [ saved r25 ] 152 // -14 [ saved r24 ] 153 // -13 [ saved r23 ] 154 // -12 [ saved r22 ] 155 // -11 [ saved r21 ] 156 // -10 [ saved r20 ] 157 // -9 [ saved r19 ] 158 // -8 [ call wrapper (r0) ] 159 // -7 [ result (r1) ] 160 // -6 [ result type (r2) ] 161 // -5 [ method (r3) ] 162 // -4 [ entry point (r4) ] 163 // -3 [ parameters (r5) ] 164 // -2 [ parameter size (r6) ] 165 // -1 [ thread (r7) ] 166 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 167 // 1 [ saved lr (r30) ] 168 169 // Call stub stack layout word offsets from fp 170 enum call_stub_layout { 171 sp_after_call_off = -26, 172 173 d15_off = -26, 174 d13_off = -24, 175 d11_off = -22, 176 d9_off = -20, 177 178 r28_off = -18, 179 r26_off = -16, 180 r24_off = -14, 181 r22_off = -12, 182 r20_off = -10, 183 call_wrapper_off = -8, 184 result_off = -7, 185 result_type_off = -6, 186 method_off = -5, 187 entry_point_off = -4, 188 parameter_size_off = -2, 189 thread_off = -1, 190 fp_f = 0, 191 retaddr_off = 1, 192 }; 193 194 address generate_call_stub(address& return_address) { 195 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 196 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 197 "adjust this code"); 198 199 StubCodeMark mark(this, "StubRoutines", "call_stub"); 200 address start = __ pc(); 201 202 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 203 204 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 205 const Address result (rfp, result_off * wordSize); 206 const Address result_type (rfp, result_type_off * wordSize); 207 const Address method (rfp, method_off * wordSize); 208 const Address entry_point (rfp, entry_point_off * wordSize); 209 const Address parameter_size(rfp, parameter_size_off * wordSize); 210 211 const Address thread (rfp, thread_off * wordSize); 212 213 const Address d15_save (rfp, d15_off * wordSize); 214 const Address d13_save (rfp, d13_off * wordSize); 215 const Address d11_save (rfp, d11_off * wordSize); 216 const Address d9_save (rfp, d9_off * wordSize); 217 218 const Address r28_save (rfp, r28_off * wordSize); 219 const Address r26_save (rfp, r26_off * wordSize); 220 const Address r24_save (rfp, r24_off * wordSize); 221 const Address r22_save (rfp, r22_off * wordSize); 222 const Address r20_save (rfp, r20_off * wordSize); 223 224 // stub code 225 226 address aarch64_entry = __ pc(); 227 228 // set up frame and move sp to end of save area 229 __ enter(); 230 __ sub(sp, rfp, -sp_after_call_off * wordSize); 231 232 // save register parameters and Java scratch/global registers 233 // n.b. 
we save thread even though it gets installed in 234 // rthread because we want to sanity check rthread later 235 __ str(c_rarg7, thread); 236 __ strw(c_rarg6, parameter_size); 237 __ stp(c_rarg4, c_rarg5, entry_point); 238 __ stp(c_rarg2, c_rarg3, result_type); 239 __ stp(c_rarg0, c_rarg1, call_wrapper); 240 241 __ stp(r20, r19, r20_save); 242 __ stp(r22, r21, r22_save); 243 __ stp(r24, r23, r24_save); 244 __ stp(r26, r25, r26_save); 245 __ stp(r28, r27, r28_save); 246 247 __ stpd(v9, v8, d9_save); 248 __ stpd(v11, v10, d11_save); 249 __ stpd(v13, v12, d13_save); 250 __ stpd(v15, v14, d15_save); 251 252 // install Java thread in global register now we have saved 253 // whatever value it held 254 __ mov(rthread, c_rarg7); 255 // And method 256 __ mov(rmethod, c_rarg3); 257 258 // set up the heapbase register 259 __ reinit_heapbase(); 260 261 #ifdef ASSERT 262 // make sure we have no pending exceptions 263 { 264 Label L; 265 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 266 __ cmp(rscratch1, (u1)NULL_WORD); 267 __ br(Assembler::EQ, L); 268 __ stop("StubRoutines::call_stub: entered with pending exception"); 269 __ BIND(L); 270 } 271 #endif 272 // pass parameters if any 273 __ mov(esp, sp); 274 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 275 __ andr(sp, rscratch1, -2 * wordSize); 276 277 BLOCK_COMMENT("pass parameters if any"); 278 Label parameters_done; 279 // parameter count is still in c_rarg6 280 // and parameter pointer identifying param 1 is in c_rarg5 281 __ cbzw(c_rarg6, parameters_done); 282 283 address loop = __ pc(); 284 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 285 __ subsw(c_rarg6, c_rarg6, 1); 286 __ push(rscratch1); 287 __ br(Assembler::GT, loop); 288 289 __ BIND(parameters_done); 290 291 // call Java entry -- passing methdoOop, and current sp 292 // rmethod: Method* 293 // r13: sender sp 294 BLOCK_COMMENT("call Java function"); 295 __ mov(r13, sp); 296 __ blr(c_rarg4); 297 298 // we do this here because the notify will already have been done 299 // if we get to the next instruction via an exception 300 // 301 // n.b. adding this instruction here affects the calculation of 302 // whether or not a routine returns to the call stub (used when 303 // doing stack walks) since the normal test is to check the return 304 // pc against the address saved below. so we may need to allow for 305 // this extra instruction in the check. 306 307 // save current address for use by exception handling code 308 309 return_address = __ pc(); 310 311 // store result depending on type (everything that is not 312 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 313 // n.b. 
this assumes Java returns an integral result in r0 314 // and a floating result in j_farg0 315 __ ldr(j_rarg2, result); 316 Label is_long, is_float, is_double, exit; 317 __ ldr(j_rarg1, result_type); 318 __ cmp(j_rarg1, (u1)T_OBJECT); 319 __ br(Assembler::EQ, is_long); 320 __ cmp(j_rarg1, (u1)T_LONG); 321 __ br(Assembler::EQ, is_long); 322 __ cmp(j_rarg1, (u1)T_FLOAT); 323 __ br(Assembler::EQ, is_float); 324 __ cmp(j_rarg1, (u1)T_DOUBLE); 325 __ br(Assembler::EQ, is_double); 326 327 // handle T_INT case 328 __ strw(r0, Address(j_rarg2)); 329 330 __ BIND(exit); 331 332 // pop parameters 333 __ sub(esp, rfp, -sp_after_call_off * wordSize); 334 335 #ifdef ASSERT 336 // verify that threads correspond 337 { 338 Label L, S; 339 __ ldr(rscratch1, thread); 340 __ cmp(rthread, rscratch1); 341 __ br(Assembler::NE, S); 342 __ get_thread(rscratch1); 343 __ cmp(rthread, rscratch1); 344 __ br(Assembler::EQ, L); 345 __ BIND(S); 346 __ stop("StubRoutines::call_stub: threads must correspond"); 347 __ BIND(L); 348 } 349 #endif 350 351 // restore callee-save registers 352 __ ldpd(v15, v14, d15_save); 353 __ ldpd(v13, v12, d13_save); 354 __ ldpd(v11, v10, d11_save); 355 __ ldpd(v9, v8, d9_save); 356 357 __ ldp(r28, r27, r28_save); 358 __ ldp(r26, r25, r26_save); 359 __ ldp(r24, r23, r24_save); 360 __ ldp(r22, r21, r22_save); 361 __ ldp(r20, r19, r20_save); 362 363 __ ldp(c_rarg0, c_rarg1, call_wrapper); 364 __ ldrw(c_rarg2, result_type); 365 __ ldr(c_rarg3, method); 366 __ ldp(c_rarg4, c_rarg5, entry_point); 367 __ ldp(c_rarg6, c_rarg7, parameter_size); 368 369 // leave frame and return to caller 370 __ leave(); 371 __ ret(lr); 372 373 // handle return types different from T_INT 374 375 __ BIND(is_long); 376 __ str(r0, Address(j_rarg2, 0)); 377 __ br(Assembler::AL, exit); 378 379 __ BIND(is_float); 380 __ strs(j_farg0, Address(j_rarg2, 0)); 381 __ br(Assembler::AL, exit); 382 383 __ BIND(is_double); 384 __ strd(j_farg0, Address(j_rarg2, 0)); 385 __ br(Assembler::AL, exit); 386 387 return start; 388 } 389 390 // Return point for a Java call if there's an exception thrown in 391 // Java code. The exception is caught and transformed into a 392 // pending exception stored in JavaThread that can be tested from 393 // within the VM. 394 // 395 // Note: Usually the parameters are removed by the callee. In case 396 // of an exception crossing an activation frame boundary, that is 397 // not the case if the callee is compiled code => need to setup the 398 // rsp. 
399 // 400 // r0: exception oop 401 402 address generate_catch_exception() { 403 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 404 address start = __ pc(); 405 406 // same as in generate_call_stub(): 407 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 408 const Address thread (rfp, thread_off * wordSize); 409 410 #ifdef ASSERT 411 // verify that threads correspond 412 { 413 Label L, S; 414 __ ldr(rscratch1, thread); 415 __ cmp(rthread, rscratch1); 416 __ br(Assembler::NE, S); 417 __ get_thread(rscratch1); 418 __ cmp(rthread, rscratch1); 419 __ br(Assembler::EQ, L); 420 __ bind(S); 421 __ stop("StubRoutines::catch_exception: threads must correspond"); 422 __ bind(L); 423 } 424 #endif 425 426 // set pending exception 427 __ verify_oop(r0); 428 429 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 430 __ mov(rscratch1, (address)__FILE__); 431 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 432 __ movw(rscratch1, (int)__LINE__); 433 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 434 435 // complete return to VM 436 assert(StubRoutines::_call_stub_return_address != NULL, 437 "_call_stub_return_address must have been generated before"); 438 __ b(StubRoutines::_call_stub_return_address); 439 440 return start; 441 } 442 443 // Continuation point for runtime calls returning with a pending 444 // exception. The pending exception check happened in the runtime 445 // or native call stub. The pending exception in Thread is 446 // converted into a Java-level exception. 447 // 448 // Contract with Java-level exception handlers: 449 // r0: exception 450 // r3: throwing pc 451 // 452 // NOTE: At entry of this stub, exception-pc must be in LR !! 453 454 // NOTE: this is always used as a jump target within generated code 455 // so it just needs to be generated code wiht no x86 prolog 456 457 address generate_forward_exception() { 458 StubCodeMark mark(this, "StubRoutines", "forward exception"); 459 address start = __ pc(); 460 461 // Upon entry, LR points to the return address returning into 462 // Java (interpreted or compiled) code; i.e., the return address 463 // becomes the throwing pc. 464 // 465 // Arguments pushed before the runtime call are still on the stack 466 // but the exception handler will reset the stack pointer -> 467 // ignore them. A potential result in registers can be ignored as 468 // well. 469 470 #ifdef ASSERT 471 // make sure this code is only executed if there is a pending exception 472 { 473 Label L; 474 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 475 __ cbnz(rscratch1, L); 476 __ stop("StubRoutines::forward exception: no pending exception (1)"); 477 __ bind(L); 478 } 479 #endif 480 481 // compute exception handler into r19 482 483 // call the VM to find the handler address associated with the 484 // caller address. pass thread in r0 and caller pc (ret address) 485 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 486 // the stack. 487 __ mov(c_rarg1, lr); 488 // lr will be trashed by the VM call so we move it to R19 489 // (callee-saved) because we also need to pass it to the handler 490 // returned by this call. 
491 __ mov(r19, lr); 492 BLOCK_COMMENT("call exception_handler_for_return_address"); 493 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 494 SharedRuntime::exception_handler_for_return_address), 495 rthread, c_rarg1); 496 // Reinitialize the ptrue predicate register, in case the external runtime 497 // call clobbers ptrue reg, as we may return to SVE compiled code. 498 __ reinitialize_ptrue(); 499 500 // we should not really care that lr is no longer the callee 501 // address. we saved the value the handler needs in r19 so we can 502 // just copy it to r3. however, the C2 handler will push its own 503 // frame and then calls into the VM and the VM code asserts that 504 // the PC for the frame above the handler belongs to a compiled 505 // Java method. So, we restore lr here to satisfy that assert. 506 __ mov(lr, r19); 507 // setup r0 & r3 & clear pending exception 508 __ mov(r3, r19); 509 __ mov(r19, r0); 510 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 511 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 512 513 #ifdef ASSERT 514 // make sure exception is set 515 { 516 Label L; 517 __ cbnz(r0, L); 518 __ stop("StubRoutines::forward exception: no pending exception (2)"); 519 __ bind(L); 520 } 521 #endif 522 523 // continue at exception handler 524 // r0: exception 525 // r3: throwing pc 526 // r19: exception handler 527 __ verify_oop(r0); 528 __ br(r19); 529 530 return start; 531 } 532 533 // Non-destructive plausibility checks for oops 534 // 535 // Arguments: 536 // r0: oop to verify 537 // rscratch1: error message 538 // 539 // Stack after saving c_rarg3: 540 // [tos + 0]: saved c_rarg3 541 // [tos + 1]: saved c_rarg2 542 // [tos + 2]: saved lr 543 // [tos + 3]: saved rscratch2 544 // [tos + 4]: saved r0 545 // [tos + 5]: saved rscratch1 546 address generate_verify_oop() { 547 548 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 549 address start = __ pc(); 550 551 Label exit, error; 552 553 // save c_rarg2 and c_rarg3 554 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 555 556 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 557 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 558 __ ldr(c_rarg3, Address(c_rarg2)); 559 __ add(c_rarg3, c_rarg3, 1); 560 __ str(c_rarg3, Address(c_rarg2)); 561 562 // object is in r0 563 // make sure object is 'reasonable' 564 __ cbz(r0, exit); // if obj is NULL it is OK 565 566 #if INCLUDE_ZGC 567 if (UseZGC) { 568 // Check if mask is good. 569 // verifies that ZAddressBadMask & r0 == 0 570 __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset())); 571 __ andr(c_rarg2, r0, c_rarg3); 572 __ cbnz(c_rarg2, error); 573 } 574 #endif 575 576 // Check if the oop is in the right area of memory 577 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask()); 578 __ andr(c_rarg2, r0, c_rarg3); 579 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits()); 580 581 // Compare c_rarg2 and c_rarg3. We don't use a compare 582 // instruction here because the flags register is live. 583 __ eor(c_rarg2, c_rarg2, c_rarg3); 584 __ cbnz(c_rarg2, error); 585 586 // make sure klass is 'reasonable', which is not zero. 
587 __ load_klass(r0, r0); // get klass 588 __ cbz(r0, error); // if klass is NULL it is broken 589 590 // return if everything seems ok 591 __ bind(exit); 592 593 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 594 __ ret(lr); 595 596 // handle errors 597 __ bind(error); 598 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 599 600 __ push(RegSet::range(r0, r29), sp); 601 // debug(char* msg, int64_t pc, int64_t regs[]) 602 __ mov(c_rarg0, rscratch1); // pass address of error message 603 __ mov(c_rarg1, lr); // pass return address 604 __ mov(c_rarg2, sp); // pass address of regs on stack 605 #ifndef PRODUCT 606 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 607 #endif 608 BLOCK_COMMENT("call MacroAssembler::debug"); 609 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 610 __ blr(rscratch1); 611 __ hlt(0); 612 613 return start; 614 } 615 616 void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); } 617 618 // Generate indices for iota vector. 619 address generate_iota_indices(const char *stub_name) { 620 __ align(CodeEntryAlignment); 621 StubCodeMark mark(this, "StubRoutines", stub_name); 622 address start = __ pc(); 623 __ emit_data64(0x0706050403020100, relocInfo::none); 624 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 625 return start; 626 } 627 628 // The inner part of zero_words(). This is the bulk operation, 629 // zeroing words in blocks, possibly using DC ZVA to do it. The 630 // caller is responsible for zeroing the last few words. 631 // 632 // Inputs: 633 // r10: the HeapWord-aligned base address of an array to zero. 634 // r11: the count in HeapWords, r11 > 0. 635 // 636 // Returns r10 and r11, adjusted for the caller to clear. 637 // r10: the base address of the tail of words left to clear. 638 // r11: the number of words in the tail. 639 // r11 < MacroAssembler::zero_words_block_size. 640 641 address generate_zero_blocks() { 642 Label done; 643 Label base_aligned; 644 645 Register base = r10, cnt = r11; 646 647 __ align(CodeEntryAlignment); 648 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 649 address start = __ pc(); 650 651 if (UseBlockZeroing) { 652 int zva_length = VM_Version::zva_length(); 653 654 // Ensure ZVA length can be divided by 16. This is required by 655 // the subsequent operations. 656 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 657 658 __ tbz(base, 3, base_aligned); 659 __ str(zr, Address(__ post(base, 8))); 660 __ sub(cnt, cnt, 1); 661 __ bind(base_aligned); 662 663 // Ensure count >= zva_length * 2 so that it still deserves a zva after 664 // alignment. 665 Label small; 666 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 667 __ subs(rscratch1, cnt, low_limit >> 3); 668 __ br(Assembler::LT, small); 669 __ zero_dcache_blocks(base, cnt); 670 __ bind(small); 671 } 672 673 { 674 // Number of stp instructions we'll unroll 675 const int unroll = 676 MacroAssembler::zero_words_block_size / 2; 677 // Clear the remaining blocks. 678 Label loop; 679 __ subs(cnt, cnt, unroll * 2); 680 __ br(Assembler::LT, done); 681 __ bind(loop); 682 for (int i = 0; i < unroll; i++) 683 __ stp(zr, zr, __ post(base, 16)); 684 __ subs(cnt, cnt, unroll * 2); 685 __ br(Assembler::GE, loop); 686 __ bind(done); 687 __ add(cnt, cnt, unroll * 2); 688 } 689 690 __ ret(lr); 691 692 return start; 693 } 694 695 696 typedef enum { 697 copy_forwards = 1, 698 copy_backwards = -1 699 } copy_direction; 700 701 // Bulk copy of blocks of 8 words. 
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
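
      // A rough C-level model of the block copy implemented here, for
      // illustration only (it is not part of the generated stub).  Words
      // are 64-bit longs and the tests mirror the tbz checks:
      //
      //   while (count >= 8) { copy 8 words; count -= 8; }
      //   if (count & 4)     { copy 4 words; }
      //   if (count & 2)     { copy 2 words; }
      //   // any final odd word is left for the caller, which is why
      //   // only the least significant bit of count is meaningful
      //   // on exit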
868 869 if (direction == copy_forwards) { 870 __ sub(s, s, 16); 871 __ sub(d, d, 8); 872 } 873 874 // Fill 8 registers 875 // 876 // for forwards copy s was offset by -16 from the original input 877 // value of s so the register contents are at these offsets 878 // relative to the 64 bit block addressed by that original input 879 // and so on for each successive 64 byte block when s is updated 880 // 881 // t0 at offset 0, t1 at offset 8 882 // t2 at offset 16, t3 at offset 24 883 // t4 at offset 32, t5 at offset 40 884 // t6 at offset 48, t7 at offset 56 885 886 // for backwards copy s was not offset so the register contents 887 // are at these offsets into the preceding 64 byte block 888 // relative to that original input and so on for each successive 889 // preceding 64 byte block when s is updated. this explains the 890 // slightly counter-intuitive looking pattern of register usage 891 // in the stp instructions for backwards copy. 892 // 893 // t0 at offset -16, t1 at offset -8 894 // t2 at offset -32, t3 at offset -24 895 // t4 at offset -48, t5 at offset -40 896 // t6 at offset -64, t7 at offset -56 897 898 __ ldp(t0, t1, Address(s, 2 * unit)); 899 __ ldp(t2, t3, Address(s, 4 * unit)); 900 __ ldp(t4, t5, Address(s, 6 * unit)); 901 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 902 903 __ subs(count, count, 16); 904 __ br(Assembler::LO, drain); 905 906 int prefetch = PrefetchCopyIntervalInBytes; 907 bool use_stride = false; 908 if (direction == copy_backwards) { 909 use_stride = prefetch > 256; 910 prefetch = -prefetch; 911 if (use_stride) __ mov(stride, prefetch); 912 } 913 914 __ bind(again); 915 916 if (PrefetchCopyIntervalInBytes > 0) 917 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 918 919 if (direction == copy_forwards) { 920 // allowing for the offset of -8 the store instructions place 921 // registers into the target 64 bit block at the following 922 // offsets 923 // 924 // t0 at offset 0 925 // t1 at offset 8, t2 at offset 16 926 // t3 at offset 24, t4 at offset 32 927 // t5 at offset 40, t6 at offset 48 928 // t7 at offset 56 929 930 __ str(t0, Address(d, 1 * unit)); 931 __ stp(t1, t2, Address(d, 2 * unit)); 932 __ ldp(t0, t1, Address(s, 2 * unit)); 933 __ stp(t3, t4, Address(d, 4 * unit)); 934 __ ldp(t2, t3, Address(s, 4 * unit)); 935 __ stp(t5, t6, Address(d, 6 * unit)); 936 __ ldp(t4, t5, Address(s, 6 * unit)); 937 __ str(t7, Address(__ pre(d, 8 * unit))); 938 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 939 } else { 940 // d was not offset when we started so the registers are 941 // written into the 64 bit block preceding d with the following 942 // offsets 943 // 944 // t1 at offset -8 945 // t3 at offset -24, t0 at offset -16 946 // t5 at offset -48, t2 at offset -32 947 // t7 at offset -56, t4 at offset -48 948 // t6 at offset -64 949 // 950 // note that this matches the offsets previously noted for the 951 // loads 952 953 __ str(t1, Address(d, 1 * unit)); 954 __ stp(t3, t0, Address(d, 3 * unit)); 955 __ ldp(t0, t1, Address(s, 2 * unit)); 956 __ stp(t5, t2, Address(d, 5 * unit)); 957 __ ldp(t2, t3, Address(s, 4 * unit)); 958 __ stp(t7, t4, Address(d, 7 * unit)); 959 __ ldp(t4, t5, Address(s, 6 * unit)); 960 __ str(t6, Address(__ pre(d, 8 * unit))); 961 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 962 } 963 964 __ subs(count, count, 8); 965 __ br(Assembler::HS, again); 966 967 // Drain 968 // 969 // this uses the same pattern of offsets and register arguments 970 // as above 971 __ bind(drain); 972 if (direction == copy_forwards) { 
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for a forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
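
    // An illustrative C-level model of the bit tests below (not compiled;
    // purely descriptive).  With granularity g == |step|, count is an
    // element count, so for a byte copy (g == 1) the tail reduces to:
    //
    //   if (count & 8) { move 8 bytes; }
    //   if (count & 4) { move 4 bytes; }
    //   if (count & 2) { move 2 bytes; }
    //   if (count & 1) { move 1 byte;  }
    //
    // For larger granularities the smaller moves are omitted, since a
    // tail finer than one element cannot occur, and the tested bit
    // numbers shift down by exact_log2(g) accordingly.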
1060 1061 __ tbz(count, 3 - exact_log2(granularity), Lword); 1062 __ ldr(tmp, Address(__ adjust(s, unit, is_backwards))); 1063 __ str(tmp, Address(__ adjust(d, unit, is_backwards))); 1064 __ bind(Lword); 1065 1066 if (granularity <= sizeof (jint)) { 1067 __ tbz(count, 2 - exact_log2(granularity), Lint); 1068 __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1069 __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1070 __ bind(Lint); 1071 } 1072 1073 if (granularity <= sizeof (jshort)) { 1074 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1075 __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1076 __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1077 __ bind(Lshort); 1078 } 1079 1080 if (granularity <= sizeof (jbyte)) { 1081 __ tbz(count, 0, Lbyte); 1082 __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1083 __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1084 __ bind(Lbyte); 1085 } 1086 } 1087 1088 Label copy_f, copy_b; 1089 1090 // All-singing all-dancing memory copy. 1091 // 1092 // Copy count units of memory from s to d. The size of a unit is 1093 // step, which can be positive or negative depending on the direction 1094 // of copy. If is_aligned is false, we align the source address. 1095 // 1096 1097 void copy_memory(bool is_aligned, Register s, Register d, 1098 Register count, Register tmp, int step) { 1099 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1100 bool is_backwards = step < 0; 1101 unsigned int granularity = uabs(step); 1102 const Register t0 = r3, t1 = r4; 1103 1104 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1105 // load all the data before writing anything 1106 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1107 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8; 1108 const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; 1109 const Register send = r17, dend = r16; 1110 1111 if (PrefetchCopyIntervalInBytes > 0) 1112 __ prfm(Address(s, 0), PLDL1KEEP); 1113 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)).  Pointers for arrays of jint are at
      // least 4 byte aligned; pointers for arrays of jlong are 8 byte
      // aligned.  The largest performance drop has been seen for the
      // 65-80 byte range.  For such cases, using a pair of ldp/stp
      // instead of a third pair of ldpq/stpq fixes the performance issue.
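
      // A rough model of the dispatch below, for illustration only
      // (assuming SIMD is in use and granularity < sizeof(jint);
      // 'byte_count' is just a descriptive name for count * granularity):
      //
      //   if (byte_count <= 80) {
      //     // two 32-byte stpq for the head plus one 16-byte ldp/stp
      //     // anchored at the end of the range
      //   } else {
      //     // three 32-byte ldpq/stpq, the last anchored at the end
      //   }
      //
      // As elsewhere in copy_memory, the accesses anchored at the end may
      // overlap the ones anchored at the start; that is safe because all
      // loads are issued before any stores.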
1168 if (granularity < sizeof (jint)) { 1169 Label copy96; 1170 __ cmp(count, u1(80/granularity)); 1171 __ br(Assembler::HI, copy96); 1172 __ ldp(t0, t1, Address(send, -16)); 1173 1174 __ stpq(v0, v1, Address(d, 0)); 1175 __ stpq(v2, v3, Address(d, 32)); 1176 __ stp(t0, t1, Address(dend, -16)); 1177 __ b(finish); 1178 1179 __ bind(copy96); 1180 } 1181 __ ldpq(v4, v5, Address(send, -32)); 1182 1183 __ stpq(v0, v1, Address(d, 0)); 1184 __ stpq(v2, v3, Address(d, 32)); 1185 __ stpq(v4, v5, Address(dend, -32)); 1186 } else { 1187 __ ldp(t0, t1, Address(s, 0)); 1188 __ ldp(t2, t3, Address(s, 16)); 1189 __ ldp(t4, t5, Address(s, 32)); 1190 __ ldp(t6, t7, Address(s, 48)); 1191 __ ldp(t8, t9, Address(send, -16)); 1192 1193 __ stp(t0, t1, Address(d, 0)); 1194 __ stp(t2, t3, Address(d, 16)); 1195 __ stp(t4, t5, Address(d, 32)); 1196 __ stp(t6, t7, Address(d, 48)); 1197 __ stp(t8, t9, Address(dend, -16)); 1198 } 1199 __ b(finish); 1200 1201 // 0..16 bytes 1202 __ bind(copy16); 1203 __ cmp(count, u1(8/granularity)); 1204 __ br(Assembler::LO, copy8); 1205 1206 // 8..16 bytes 1207 __ ldr(t0, Address(s, 0)); 1208 __ ldr(t1, Address(send, -8)); 1209 __ str(t0, Address(d, 0)); 1210 __ str(t1, Address(dend, -8)); 1211 __ b(finish); 1212 1213 if (granularity < 8) { 1214 // 4..7 bytes 1215 __ bind(copy8); 1216 __ tbz(count, 2 - exact_log2(granularity), copy4); 1217 __ ldrw(t0, Address(s, 0)); 1218 __ ldrw(t1, Address(send, -4)); 1219 __ strw(t0, Address(d, 0)); 1220 __ strw(t1, Address(dend, -4)); 1221 __ b(finish); 1222 if (granularity < 4) { 1223 // 0..3 bytes 1224 __ bind(copy4); 1225 __ cbz(count, finish); // get rid of 0 case 1226 if (granularity == 2) { 1227 __ ldrh(t0, Address(s, 0)); 1228 __ strh(t0, Address(d, 0)); 1229 } else { // granularity == 1 1230 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1231 // the first and last byte. 1232 // Handle the 3 byte case by loading and storing base + count/2 1233 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1234 // This does means in the 1 byte case we load/store the same 1235 // byte 3 times. 1236 __ lsr(count, count, 1); 1237 __ ldrb(t0, Address(s, 0)); 1238 __ ldrb(t1, Address(send, -1)); 1239 __ ldrb(t2, Address(s, count)); 1240 __ strb(t0, Address(d, 0)); 1241 __ strb(t1, Address(dend, -1)); 1242 __ strb(t2, Address(d, count)); 1243 } 1244 __ b(finish); 1245 } 1246 } 1247 1248 __ bind(copy_big); 1249 if (is_backwards) { 1250 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1251 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1252 } 1253 1254 // Now we've got the small case out of the way we can align the 1255 // source address on a 2-word boundary. 1256 1257 Label aligned; 1258 1259 if (is_aligned) { 1260 // We may have to adjust by 1 word to get s 2-word-aligned. 1261 __ tbz(s, exact_log2(wordSize), aligned); 1262 __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards))); 1263 __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards))); 1264 __ sub(count, count, wordSize/granularity); 1265 } else { 1266 if (is_backwards) { 1267 __ andr(rscratch2, s, 2 * wordSize - 1); 1268 } else { 1269 __ neg(rscratch2, s); 1270 __ andr(rscratch2, rscratch2, 2 * wordSize - 1); 1271 } 1272 // rscratch2 is the byte adjustment needed to align s. 1273 __ cbz(rscratch2, aligned); 1274 int shift = exact_log2(granularity); 1275 if (shift) __ lsr(rscratch2, rscratch2, shift); 1276 __ sub(count, count, rscratch2); 1277 1278 #if 0 1279 // ?? This code is only correct for a disjoint copy. 
It may or 1280 // may not make sense to use it in that case. 1281 1282 // Copy the first pair; s and d may not be aligned. 1283 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1284 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1285 1286 // Align s and d, adjust count 1287 if (is_backwards) { 1288 __ sub(s, s, rscratch2); 1289 __ sub(d, d, rscratch2); 1290 } else { 1291 __ add(s, s, rscratch2); 1292 __ add(d, d, rscratch2); 1293 } 1294 #else 1295 copy_memory_small(s, d, rscratch2, rscratch1, step); 1296 #endif 1297 } 1298 1299 __ bind(aligned); 1300 1301 // s is now 2-word-aligned. 1302 1303 // We have a count of units and some trailing bytes. Adjust the 1304 // count and do a bulk copy of words. 1305 __ lsr(rscratch2, count, exact_log2(wordSize/granularity)); 1306 if (direction == copy_forwards) 1307 __ bl(copy_f); 1308 else 1309 __ bl(copy_b); 1310 1311 // And the tail. 1312 copy_memory_small(s, d, count, tmp, step); 1313 1314 if (granularity >= 8) __ bind(copy8); 1315 if (granularity >= 4) __ bind(copy4); 1316 __ bind(finish); 1317 } 1318 1319 1320 void clobber_registers() { 1321 #ifdef ASSERT 1322 RegSet clobbered 1323 = MacroAssembler::call_clobbered_registers() - rscratch1; 1324 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1325 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1326 for (RegSetIterator<> it = clobbered.begin(); *it != noreg; ++it) { 1327 __ mov(*it, rscratch1); 1328 } 1329 #endif 1330 1331 } 1332 1333 // Scan over array at a for count oops, verifying each one. 1334 // Preserves a and count, clobbers rscratch1 and rscratch2. 1335 void verify_oop_array (int size, Register a, Register count, Register temp) { 1336 Label loop, end; 1337 __ mov(rscratch1, a); 1338 __ mov(rscratch2, zr); 1339 __ bind(loop); 1340 __ cmp(rscratch2, count); 1341 __ br(Assembler::HS, end); 1342 if (size == wordSize) { 1343 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1344 __ verify_oop(temp); 1345 } else { 1346 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1347 __ decode_heap_oop(temp); // calls verify_oop 1348 } 1349 __ add(rscratch2, rscratch2, 1); 1350 __ b(loop); 1351 __ bind(end); 1352 } 1353 1354 // Arguments: 1355 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1356 // ignored 1357 // is_oop - true => oop array, so generate store check code 1358 // name - stub name string 1359 // 1360 // Inputs: 1361 // c_rarg0 - source array address 1362 // c_rarg1 - destination array address 1363 // c_rarg2 - element count, treated as ssize_t, can be zero 1364 // 1365 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1366 // the hardware handle it. The two dwords within qwords that span 1367 // cache line boundaries will still be loaded and stored atomically. 1368 // 1369 // Side Effects: 1370 // disjoint_int_copy_entry is set to the no-overlap entry point 1371 // used by generate_conjoint_int_oop_copy(). 
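  //
  //   A rough C-level model of the stub generated below, for illustration
  //   only (register saving and the UnsafeCopyMemory marking are omitted):
  //
  //     int disjoint_copy(void* s, void* d, size_t count) {
  //       // GC arraycopy prologue
  //       // forward copy of count elements of 'size' bytes (copy_memory)
  //       // GC arraycopy epilogue
  //       return 0;   // r0 == 0 on return
  //     }
  //
  //   The address stored through 'entry' is the no-overlap entry point
  //   that the conjoint stubs branch to once a forward copy is known to
  //   be safe.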
1372 // 1373 address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry, 1374 const char *name, bool dest_uninitialized = false) { 1375 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1376 RegSet saved_reg = RegSet::of(s, d, count); 1377 __ align(CodeEntryAlignment); 1378 StubCodeMark mark(this, "StubRoutines", name); 1379 address start = __ pc(); 1380 __ enter(); 1381 1382 if (entry != NULL) { 1383 *entry = __ pc(); 1384 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1385 BLOCK_COMMENT("Entry:"); 1386 } 1387 1388 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1389 if (dest_uninitialized) { 1390 decorators |= IS_DEST_UNINITIALIZED; 1391 } 1392 if (aligned) { 1393 decorators |= ARRAYCOPY_ALIGNED; 1394 } 1395 1396 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1397 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1398 1399 if (is_oop) { 1400 // save regs before copy_memory 1401 __ push(RegSet::of(d, count), sp); 1402 } 1403 { 1404 // UnsafeCopyMemory page error: continue after ucm 1405 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1406 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1407 copy_memory(aligned, s, d, count, rscratch1, size); 1408 } 1409 1410 if (is_oop) { 1411 __ pop(RegSet::of(d, count), sp); 1412 if (VerifyOops) 1413 verify_oop_array(size, d, count, r16); 1414 } 1415 1416 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1417 1418 __ leave(); 1419 __ mov(r0, zr); // return 0 1420 __ ret(lr); 1421 return start; 1422 } 1423 1424 // Arguments: 1425 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1426 // ignored 1427 // is_oop - true => oop array, so generate store check code 1428 // name - stub name string 1429 // 1430 // Inputs: 1431 // c_rarg0 - source array address 1432 // c_rarg1 - destination array address 1433 // c_rarg2 - element count, treated as ssize_t, can be zero 1434 // 1435 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1436 // the hardware handle it. The two dwords within qwords that span 1437 // cache line boundaries will still be loaded and stored atomically. 
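  //
  //   The conjoint stub first decides whether a forward copy is safe.  A
  //   rough model of that test, for illustration only (the generated code
  //   uses an unsigned compare and branches HS):
  //
  //     if ((uintptr_t)d - (uintptr_t)s >= (uintptr_t)count * size) {
  //       goto nooverlap_target;   // forward (disjoint) copy
  //     }
  //     // otherwise fall through and copy backwards
  //
  //   When d is below s the unsigned difference wraps to a large value,
  //   so that case also takes the forward copy, which is what we want.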
1438 // 1439 address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target, 1440 address *entry, const char *name, 1441 bool dest_uninitialized = false) { 1442 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1443 RegSet saved_regs = RegSet::of(s, d, count); 1444 StubCodeMark mark(this, "StubRoutines", name); 1445 address start = __ pc(); 1446 __ enter(); 1447 1448 if (entry != NULL) { 1449 *entry = __ pc(); 1450 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1451 BLOCK_COMMENT("Entry:"); 1452 } 1453 1454 // use fwd copy when (d-s) above_equal (count*size) 1455 __ sub(rscratch1, d, s); 1456 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1457 __ br(Assembler::HS, nooverlap_target); 1458 1459 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1460 if (dest_uninitialized) { 1461 decorators |= IS_DEST_UNINITIALIZED; 1462 } 1463 if (aligned) { 1464 decorators |= ARRAYCOPY_ALIGNED; 1465 } 1466 1467 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1468 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1469 1470 if (is_oop) { 1471 // save regs before copy_memory 1472 __ push(RegSet::of(d, count), sp); 1473 } 1474 { 1475 // UnsafeCopyMemory page error: continue after ucm 1476 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1477 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1478 copy_memory(aligned, s, d, count, rscratch1, -size); 1479 } 1480 if (is_oop) { 1481 __ pop(RegSet::of(d, count), sp); 1482 if (VerifyOops) 1483 verify_oop_array(size, d, count, r16); 1484 } 1485 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1486 __ leave(); 1487 __ mov(r0, zr); // return 0 1488 __ ret(lr); 1489 return start; 1490 } 1491 1492 // Arguments: 1493 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1494 // ignored 1495 // name - stub name string 1496 // 1497 // Inputs: 1498 // c_rarg0 - source array address 1499 // c_rarg1 - destination array address 1500 // c_rarg2 - element count, treated as ssize_t, can be zero 1501 // 1502 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1503 // we let the hardware handle it. The one to eight bytes within words, 1504 // dwords or qwords that span cache line boundaries will still be loaded 1505 // and stored atomically. 1506 // 1507 // Side Effects: 1508 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1509 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1510 // we let the hardware handle it. The one to eight bytes within words, 1511 // dwords or qwords that span cache line boundaries will still be loaded 1512 // and stored atomically. 1513 // 1514 // Side Effects: 1515 // disjoint_byte_copy_entry is set to the no-overlap entry point 1516 // used by generate_conjoint_byte_copy(). 
1517 // 1518 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1519 const bool not_oop = false; 1520 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1521 } 1522 1523 // Arguments: 1524 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1525 // ignored 1526 // name - stub name string 1527 // 1528 // Inputs: 1529 // c_rarg0 - source array address 1530 // c_rarg1 - destination array address 1531 // c_rarg2 - element count, treated as ssize_t, can be zero 1532 // 1533 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1534 // we let the hardware handle it. The one to eight bytes within words, 1535 // dwords or qwords that span cache line boundaries will still be loaded 1536 // and stored atomically. 1537 // 1538 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1539 address* entry, const char *name) { 1540 const bool not_oop = false; 1541 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1542 } 1543 1544 // Arguments: 1545 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1546 // ignored 1547 // name - stub name string 1548 // 1549 // Inputs: 1550 // c_rarg0 - source array address 1551 // c_rarg1 - destination array address 1552 // c_rarg2 - element count, treated as ssize_t, can be zero 1553 // 1554 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1555 // let the hardware handle it. The two or four words within dwords 1556 // or qwords that span cache line boundaries will still be loaded 1557 // and stored atomically. 1558 // 1559 // Side Effects: 1560 // disjoint_short_copy_entry is set to the no-overlap entry point 1561 // used by generate_conjoint_short_copy(). 1562 // 1563 address generate_disjoint_short_copy(bool aligned, 1564 address* entry, const char *name) { 1565 const bool not_oop = false; 1566 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1567 } 1568 1569 // Arguments: 1570 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1571 // ignored 1572 // name - stub name string 1573 // 1574 // Inputs: 1575 // c_rarg0 - source array address 1576 // c_rarg1 - destination array address 1577 // c_rarg2 - element count, treated as ssize_t, can be zero 1578 // 1579 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1580 // let the hardware handle it. The two or four words within dwords 1581 // or qwords that span cache line boundaries will still be loaded 1582 // and stored atomically. 1583 // 1584 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1585 address *entry, const char *name) { 1586 const bool not_oop = false; 1587 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1588 1589 } 1590 // Arguments: 1591 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1592 // ignored 1593 // name - stub name string 1594 // 1595 // Inputs: 1596 // c_rarg0 - source array address 1597 // c_rarg1 - destination array address 1598 // c_rarg2 - element count, treated as ssize_t, can be zero 1599 // 1600 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1601 // the hardware handle it. The two dwords within qwords that span 1602 // cache line boundaries will still be loaded and stored atomically. 
1603 // 1604 // Side Effects: 1605 // disjoint_int_copy_entry is set to the no-overlap entry point 1606 // used by generate_conjoint_int_oop_copy(). 1607 // 1608 address generate_disjoint_int_copy(bool aligned, address *entry, 1609 const char *name, bool dest_uninitialized = false) { 1610 const bool not_oop = false; 1611 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1612 } 1613 1614 // Arguments: 1615 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1616 // ignored 1617 // name - stub name string 1618 // 1619 // Inputs: 1620 // c_rarg0 - source array address 1621 // c_rarg1 - destination array address 1622 // c_rarg2 - element count, treated as ssize_t, can be zero 1623 // 1624 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1625 // the hardware handle it. The two dwords within qwords that span 1626 // cache line boundaries will still be loaded and stored atomically. 1627 // 1628 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1629 address *entry, const char *name, 1630 bool dest_uninitialized = false) { 1631 const bool not_oop = false; 1632 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1633 } 1634 1635 1636 // Arguments: 1637 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1638 // ignored 1639 // name - stub name string 1640 // 1641 // Inputs: 1642 // c_rarg0 - source array address 1643 // c_rarg1 - destination array address 1644 // c_rarg2 - element count, treated as size_t, can be zero 1645 // 1646 // Side Effects: 1647 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1648 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1649 // 1650 address generate_disjoint_long_copy(bool aligned, address *entry, 1651 const char *name, bool dest_uninitialized = false) { 1652 const bool not_oop = false; 1653 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1654 } 1655 1656 // Arguments: 1657 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1658 // ignored 1659 // name - stub name string 1660 // 1661 // Inputs: 1662 // c_rarg0 - source array address 1663 // c_rarg1 - destination array address 1664 // c_rarg2 - element count, treated as size_t, can be zero 1665 // 1666 address generate_conjoint_long_copy(bool aligned, 1667 address nooverlap_target, address *entry, 1668 const char *name, bool dest_uninitialized = false) { 1669 const bool not_oop = false; 1670 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1671 } 1672 1673 // Arguments: 1674 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1675 // ignored 1676 // name - stub name string 1677 // 1678 // Inputs: 1679 // c_rarg0 - source array address 1680 // c_rarg1 - destination array address 1681 // c_rarg2 - element count, treated as size_t, can be zero 1682 // 1683 // Side Effects: 1684 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1685 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1686 // 1687 address generate_disjoint_oop_copy(bool aligned, address *entry, 1688 const char *name, bool dest_uninitialized) { 1689 const bool is_oop = true; 1690 const int size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1691 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1692 } 1693 1694 // Arguments: 1695 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1696 // ignored 1697 // name - stub name string 1698 // 1699 // Inputs: 1700 // c_rarg0 - source array address 1701 // c_rarg1 - destination array address 1702 // c_rarg2 - element count, treated as size_t, can be zero 1703 // 1704 address generate_conjoint_oop_copy(bool aligned, 1705 address nooverlap_target, address *entry, 1706 const char *name, bool dest_uninitialized) { 1707 const bool is_oop = true; 1708 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1709 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1710 name, dest_uninitialized); 1711 } 1712 1713 1714 // Helper for generating a dynamic type check. 1715 // Smashes rscratch1, rscratch2. 1716 void generate_type_check(Register sub_klass, 1717 Register super_check_offset, 1718 Register super_klass, 1719 Label& L_success) { 1720 assert_different_registers(sub_klass, super_check_offset, super_klass); 1721 1722 BLOCK_COMMENT("type_check:"); 1723 1724 Label L_miss; 1725 1726 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1727 super_check_offset); 1728 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1729 1730 // Fall through on failure! 1731 __ BIND(L_miss); 1732 } 1733 1734 // 1735 // Generate checkcasting array copy stub 1736 // 1737 // Input: 1738 // c_rarg0 - source array address 1739 // c_rarg1 - destination array address 1740 // c_rarg2 - element count, treated as ssize_t, can be zero 1741 // c_rarg3 - size_t ckoff (super_check_offset) 1742 // c_rarg4 - oop ckval (super_klass) 1743 // 1744 // Output: 1745 // r0 == 0 - success 1746 // r0 == -1^K - failure, where K is partial transfer count 1747 // 1748 address generate_checkcast_copy(const char *name, address *entry, 1749 bool dest_uninitialized = false) { 1750 1751 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1752 1753 // Input registers (after setup_arg_regs) 1754 const Register from = c_rarg0; // source array address 1755 const Register to = c_rarg1; // destination array address 1756 const Register count = c_rarg2; // elementscount 1757 const Register ckoff = c_rarg3; // super_check_offset 1758 const Register ckval = c_rarg4; // super_klass 1759 1760 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1761 RegSet wb_post_saved_regs = RegSet::of(count); 1762 1763 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1764 const Register copied_oop = r22; // actual oop copied 1765 const Register count_save = r21; // orig elementscount 1766 const Register start_to = r20; // destination array start address 1767 const Register r19_klass = r19; // oop._klass 1768 1769 //--------------------------------------------------------------- 1770 // Assembler stub will be used for this call to arraycopy 1771 // if the two arrays are subtypes of Object[] but the 1772 // destination array type is not equal to or a supertype 1773 // of the source type. Each element must be separately 1774 // checked. 
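    // Roughly, the stub implements the following Java-level logic
    // (illustrative sketch only; 'is_subtype_of' stands for the
    // super_check_offset/super_klass test emitted by generate_type_check):
    //
    //   int checkcast_copy(oop[] from, oop[] to, int count) {
    //     for (int i = 0; i < count; i++) {
    //       oop e = from[i];
    //       if (e != null && !is_subtype_of(e.klass(), ckval, ckoff)) {
    //         return ~i;       // i elements were copied; report -1 ^ i
    //       }
    //       to[i] = e;
    //     }
    //     return 0;            // all elements copied
    //   }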
1775 1776 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1777 copied_oop, r19_klass, count_save); 1778 1779 __ align(CodeEntryAlignment); 1780 StubCodeMark mark(this, "StubRoutines", name); 1781 address start = __ pc(); 1782 1783 __ enter(); // required for proper stackwalking of RuntimeStub frame 1784 1785 #ifdef ASSERT 1786 // caller guarantees that the arrays really are different 1787 // otherwise, we would have to make conjoint checks 1788 { Label L; 1789 array_overlap_test(L, TIMES_OOP); 1790 __ stop("checkcast_copy within a single array"); 1791 __ bind(L); 1792 } 1793 #endif //ASSERT 1794 1795 // Caller of this entry point must set up the argument registers. 1796 if (entry != NULL) { 1797 *entry = __ pc(); 1798 BLOCK_COMMENT("Entry:"); 1799 } 1800 1801 // Empty array: Nothing to do. 1802 __ cbz(count, L_done); 1803 __ push(RegSet::of(r19, r20, r21, r22), sp); 1804 1805 #ifdef ASSERT 1806 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1807 // The ckoff and ckval must be mutually consistent, 1808 // even though caller generates both. 1809 { Label L; 1810 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1811 __ ldrw(start_to, Address(ckval, sco_offset)); 1812 __ cmpw(ckoff, start_to); 1813 __ br(Assembler::EQ, L); 1814 __ stop("super_check_offset inconsistent"); 1815 __ bind(L); 1816 } 1817 #endif //ASSERT 1818 1819 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1820 bool is_oop = true; 1821 if (dest_uninitialized) { 1822 decorators |= IS_DEST_UNINITIALIZED; 1823 } 1824 1825 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1826 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1827 1828 // save the original count 1829 __ mov(count_save, count); 1830 1831 // Copy from low to high addresses 1832 __ mov(start_to, to); // Save destination array start address 1833 __ b(L_load_element); 1834 1835 // ======== begin loop ======== 1836 // (Loop is rotated; its entry is L_load_element.) 1837 // Loop control: 1838 // for (; count != 0; count--) { 1839 // copied_oop = load_heap_oop(from++); 1840 // ... generate_type_check ...; 1841 // store_heap_oop(to++, copied_oop); 1842 // } 1843 __ align(OptoLoopAlignment); 1844 1845 __ BIND(L_store_element); 1846 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1847 __ sub(count, count, 1); 1848 __ cbz(count, L_do_card_marks); 1849 1850 // ======== loop entry is here ======== 1851 __ BIND(L_load_element); 1852 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1853 __ cbz(copied_oop, L_store_element); 1854 1855 __ load_klass(r19_klass, copied_oop);// query the object klass 1856 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1857 // ======== end loop ======== 1858 1859 // It was a real error; we must depend on the caller to finish the job. 1860 // Register count = remaining oops, count_orig = total oops. 1861 // Emit GC store barriers for the oops we have copied and report 1862 // their number to the caller. 
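    // The return value is computed as follows (sketch):
    //
    //   K  = count_save - count;   // oops copied before the failing element
    //   r0 = ~K;                   // eon with zr is bitwise NOT, i.e. -1 ^ K
    //
    // so the caller can recover K as ~r0; K == 0 skips the card marks below.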
1863 1864 __ subs(count, count_save, count); // K = partially copied oop count 1865 __ eon(count, count, zr); // report (-1^K) to caller 1866 __ br(Assembler::EQ, L_done_pop); 1867 1868 __ BIND(L_do_card_marks); 1869 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1870 1871 __ bind(L_done_pop); 1872 __ pop(RegSet::of(r19, r20, r21, r22), sp); 1873 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1874 1875 __ bind(L_done); 1876 __ mov(r0, count); 1877 __ leave(); 1878 __ ret(lr); 1879 1880 return start; 1881 } 1882 1883 // Perform range checks on the proposed arraycopy. 1884 // Kills temp, but nothing else. 1885 // Also, clears the upper 32 bits of 'src_pos' and 'dst_pos'. 1886 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1887 Register src_pos, // source position (c_rarg1) 1888 Register dst, // destination array oop (c_rarg2) 1889 Register dst_pos, // destination position (c_rarg3) 1890 Register length, 1891 Register temp, 1892 Label& L_failed) { 1893 BLOCK_COMMENT("arraycopy_range_checks:"); 1894 1895 assert_different_registers(rscratch1, temp); 1896 1897 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1898 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1899 __ addw(temp, length, src_pos); 1900 __ cmpw(temp, rscratch1); 1901 __ br(Assembler::HI, L_failed); 1902 1903 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1904 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1905 __ addw(temp, length, dst_pos); 1906 __ cmpw(temp, rscratch1); 1907 __ br(Assembler::HI, L_failed); 1908 1909 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1910 __ movw(src_pos, src_pos); 1911 __ movw(dst_pos, dst_pos); 1912 1913 BLOCK_COMMENT("arraycopy_range_checks done"); 1914 } 1915 1916 // These stubs get called from some dumb test routine. 1917 // I'll write them properly when they're called from 1918 // something that's actually doing something. 1919 static void fake_arraycopy_stub(address src, address dst, int count) { 1920 assert(count == 0, "huh?"); 1921 } 1922 1923 1924 // 1925 // Generate 'unsafe' array copy stub 1926 // Though just as safe as the other stubs, it takes an unscaled 1927 // size_t argument instead of an element count. 1928 // 1929 // Input: 1930 // c_rarg0 - source array address 1931 // c_rarg1 - destination array address 1932 // c_rarg2 - byte count, treated as ssize_t, can be zero 1933 // 1934 // Examines the alignment of the operands and dispatches 1935 // to a long, int, short, or byte copy loop.
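  // The dispatch keys off the combined low bits of both addresses and the
  // byte count; roughly (C-style sketch, for illustration only):
  //
  //   size_t bits = (size_t)s | (size_t)d | (size_t)count;
  //   if      ((bits & (BytesPerLong - 1)) == 0) { count >>= 3; goto long_copy;  }
  //   else if ((bits & (BytesPerInt  - 1)) == 0) { count >>= 2; goto int_copy;   }
  //   else if ((bits & 1) == 0)                  { count >>= 1; goto short_copy; }
  //   else                                       {              goto byte_copy;  }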
1936 // 1937 address generate_unsafe_copy(const char *name, 1938 address byte_copy_entry, 1939 address short_copy_entry, 1940 address int_copy_entry, 1941 address long_copy_entry) { 1942 Label L_long_aligned, L_int_aligned, L_short_aligned; 1943 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1944 1945 __ align(CodeEntryAlignment); 1946 StubCodeMark mark(this, "StubRoutines", name); 1947 address start = __ pc(); 1948 __ enter(); // required for proper stackwalking of RuntimeStub frame 1949 1950 // bump this on entry, not on exit: 1951 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1952 1953 __ orr(rscratch1, s, d); 1954 __ orr(rscratch1, rscratch1, count); 1955 1956 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1957 __ cbz(rscratch1, L_long_aligned); 1958 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1959 __ cbz(rscratch1, L_int_aligned); 1960 __ tbz(rscratch1, 0, L_short_aligned); 1961 __ b(RuntimeAddress(byte_copy_entry)); 1962 1963 __ BIND(L_short_aligned); 1964 __ lsr(count, count, LogBytesPerShort); // size => short_count 1965 __ b(RuntimeAddress(short_copy_entry)); 1966 __ BIND(L_int_aligned); 1967 __ lsr(count, count, LogBytesPerInt); // size => int_count 1968 __ b(RuntimeAddress(int_copy_entry)); 1969 __ BIND(L_long_aligned); 1970 __ lsr(count, count, LogBytesPerLong); // size => long_count 1971 __ b(RuntimeAddress(long_copy_entry)); 1972 1973 return start; 1974 } 1975 1976 // 1977 // Generate generic array copy stubs 1978 // 1979 // Input: 1980 // c_rarg0 - src oop 1981 // c_rarg1 - src_pos (32-bits) 1982 // c_rarg2 - dst oop 1983 // c_rarg3 - dst_pos (32-bits) 1984 // c_rarg4 - element count (32-bits) 1985 // 1986 // Output: 1987 // r0 == 0 - success 1988 // r0 == -1^K - failure, where K is partial transfer count 1989 // 1990 address generate_generic_copy(const char *name, 1991 address byte_copy_entry, address short_copy_entry, 1992 address int_copy_entry, address oop_copy_entry, 1993 address long_copy_entry, address checkcast_copy_entry) { 1994 1995 Label L_failed, L_objArray; 1996 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1997 1998 // Input registers 1999 const Register src = c_rarg0; // source array oop 2000 const Register src_pos = c_rarg1; // source position 2001 const Register dst = c_rarg2; // destination array oop 2002 const Register dst_pos = c_rarg3; // destination position 2003 const Register length = c_rarg4; 2004 2005 2006 // Registers used as temps 2007 const Register dst_klass = c_rarg5; 2008 2009 __ align(CodeEntryAlignment); 2010 2011 StubCodeMark mark(this, "StubRoutines", name); 2012 2013 address start = __ pc(); 2014 2015 __ enter(); // required for proper stackwalking of RuntimeStub frame 2016 2017 // bump this on entry, not on exit: 2018 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2019 2020 //----------------------------------------------------------------------- 2021 // Assembler stub will be used for this call to arraycopy 2022 // if the following conditions are met: 2023 // 2024 // (1) src and dst must not be null. 2025 // (2) src_pos must not be negative. 2026 // (3) dst_pos must not be negative. 2027 // (4) length must not be negative. 2028 // (5) src klass and dst klass should be the same and not NULL. 2029 // (6) src and dst should be arrays. 2030 // (7) src_pos + length must not exceed length of src. 2031 // (8) dst_pos + length must not exceed length of dst. 
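    // In Java-like pseudocode the guards amount to (illustrative sketch;
    // the object-array case is refined further below at L_objArray):
    //
    //   if (src == null || dst == null)                  return -1;
    //   if (src_pos < 0 || dst_pos < 0 || length < 0)    return -1;
    //   if (src.klass() != dst.klass()
    //       && !(src and dst are both object arrays))    return -1;
    //   if (src_pos + length > src.length)               return -1;
    //   if (dst_pos + length > dst.length)               return -1;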
2032 // 2033 2034 // if (src == NULL) return -1; 2035 __ cbz(src, L_failed); 2036 2037 // if (src_pos < 0) return -1; 2038 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2039 2040 // if (dst == NULL) return -1; 2041 __ cbz(dst, L_failed); 2042 2043 // if (dst_pos < 0) return -1; 2044 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2045 2046 // registers used as temp 2047 const Register scratch_length = r16; // elements count to copy 2048 const Register scratch_src_klass = r17; // array klass 2049 const Register lh = r15; // layout helper 2050 2051 // if (length < 0) return -1; 2052 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2053 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2054 2055 __ load_klass(scratch_src_klass, src); 2056 #ifdef ASSERT 2057 // assert(src->klass() != NULL); 2058 { 2059 BLOCK_COMMENT("assert klasses not null {"); 2060 Label L1, L2; 2061 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2062 __ bind(L1); 2063 __ stop("broken null klass"); 2064 __ bind(L2); 2065 __ load_klass(rscratch1, dst); 2066 __ cbz(rscratch1, L1); // this would be broken also 2067 BLOCK_COMMENT("} assert klasses not null done"); 2068 } 2069 #endif 2070 2071 // Load layout helper (32-bits) 2072 // 2073 // |array_tag| | header_size | element_type | |log2_element_size| 2074 // 32 30 24 16 8 2 0 2075 // 2076 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2077 // 2078 2079 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2080 2081 // Handle objArrays completely differently... 2082 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2083 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2084 __ movw(rscratch1, objArray_lh); 2085 __ eorw(rscratch2, lh, rscratch1); 2086 __ cbzw(rscratch2, L_objArray); 2087 2088 // if (src->klass() != dst->klass()) return -1; 2089 __ load_klass(rscratch2, dst); 2090 __ eor(rscratch2, rscratch2, scratch_src_klass); 2091 __ cbnz(rscratch2, L_failed); 2092 2093 // if (!src->is_Array()) return -1; 2094 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2095 2096 // At this point, it is known to be a typeArray (array_tag 0x3). 
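    // The layout helper of a typeArray encodes both the header size and the
    // log2 element size, so the copy parameters reduce to (sketch only):
    //
    //   int header_bytes = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   int log2_elsize  =  lh & Klass::_lh_log2_element_size_mask;   // 0..3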
2097 #ifdef ASSERT 2098 { 2099 BLOCK_COMMENT("assert primitive array {"); 2100 Label L; 2101 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2102 __ cmpw(lh, rscratch2); 2103 __ br(Assembler::GE, L); 2104 __ stop("must be a primitive array"); 2105 __ bind(L); 2106 BLOCK_COMMENT("} assert primitive array done"); 2107 } 2108 #endif 2109 2110 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2111 rscratch2, L_failed); 2112 2113 // TypeArrayKlass 2114 // 2115 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2116 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2117 // 2118 2119 const Register rscratch1_offset = rscratch1; // array offset 2120 const Register r15_elsize = lh; // element size 2121 2122 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2123 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2124 __ add(src, src, rscratch1_offset); // src array offset 2125 __ add(dst, dst, rscratch1_offset); // dst array offset 2126 BLOCK_COMMENT("choose copy loop based on element size"); 2127 2128 // next registers should be set before the jump to corresponding stub 2129 const Register from = c_rarg0; // source array address 2130 const Register to = c_rarg1; // destination array address 2131 const Register count = c_rarg2; // elements count 2132 2133 // 'from', 'to', 'count' registers should be set in such order 2134 // since they are the same as 'src', 'src_pos', 'dst'. 2135 2136 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2137 2138 // The possible values of elsize are 0-3, i.e. exact_log2(element 2139 // size in bytes). We do a simple bitwise binary search. 2140 __ BIND(L_copy_bytes); 2141 __ tbnz(r15_elsize, 1, L_copy_ints); 2142 __ tbnz(r15_elsize, 0, L_copy_shorts); 2143 __ lea(from, Address(src, src_pos));// src_addr 2144 __ lea(to, Address(dst, dst_pos));// dst_addr 2145 __ movw(count, scratch_length); // length 2146 __ b(RuntimeAddress(byte_copy_entry)); 2147 2148 __ BIND(L_copy_shorts); 2149 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2150 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2151 __ movw(count, scratch_length); // length 2152 __ b(RuntimeAddress(short_copy_entry)); 2153 2154 __ BIND(L_copy_ints); 2155 __ tbnz(r15_elsize, 0, L_copy_longs); 2156 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2157 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2158 __ movw(count, scratch_length); // length 2159 __ b(RuntimeAddress(int_copy_entry)); 2160 2161 __ BIND(L_copy_longs); 2162 #ifdef ASSERT 2163 { 2164 BLOCK_COMMENT("assert long copy {"); 2165 Label L; 2166 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2167 __ cmpw(r15_elsize, LogBytesPerLong); 2168 __ br(Assembler::EQ, L); 2169 __ stop("must be long copy, but elsize is wrong"); 2170 __ bind(L); 2171 BLOCK_COMMENT("} assert long copy done"); 2172 } 2173 #endif 2174 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2175 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2176 __ movw(count, scratch_length); // length 2177 __ b(RuntimeAddress(long_copy_entry)); 2178 2179 // ObjArrayKlass 2180 __ BIND(L_objArray); 2181 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2182 2183 Label L_plain_copy, L_checkcast_copy; 2184 // test array classes for subtyping 2185 __ load_klass(r15, dst); 2186 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2187 __ br(Assembler::NE, L_checkcast_copy); 2188 2189 // Identically typed arrays can be copied without element-wise checks. 2190 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2191 rscratch2, L_failed); 2192 2193 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2194 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2195 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2196 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2197 __ movw(count, scratch_length); // length 2198 __ BIND(L_plain_copy); 2199 __ b(RuntimeAddress(oop_copy_entry)); 2200 2201 __ BIND(L_checkcast_copy); 2202 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2203 { 2204 // Before looking at dst.length, make sure dst is also an objArray. 2205 __ ldrw(rscratch1, Address(r15, lh_offset)); 2206 __ movw(rscratch2, objArray_lh); 2207 __ eorw(rscratch1, rscratch1, rscratch2); 2208 __ cbnzw(rscratch1, L_failed); 2209 2210 // It is safe to examine both src.length and dst.length. 2211 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2212 r15, L_failed); 2213 2214 __ load_klass(dst_klass, dst); // reload 2215 2216 // Marshal the base address arguments now, freeing registers. 2217 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2218 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2219 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2220 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2221 __ movw(count, length); // length (reloaded) 2222 Register sco_temp = c_rarg3; // this register is free now 2223 assert_different_registers(from, to, count, sco_temp, 2224 dst_klass, scratch_src_klass); 2225 // assert_clean_int(count, sco_temp); 2226 2227 // Generate the type check. 2228 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2229 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2230 2231 // Smashes rscratch1, rscratch2 2232 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2233 2234 // Fetch destination element klass from the ObjArrayKlass header. 2235 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2236 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2237 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2238 2239 // the checkcast_copy loop needs two extra arguments: 2240 assert(c_rarg3 == sco_temp, "#3 already in place"); 2241 // Set up arguments for checkcast_copy_entry. 2242 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2243 __ b(RuntimeAddress(checkcast_copy_entry)); 2244 } 2245 2246 __ BIND(L_failed); 2247 __ mov(r0, -1); 2248 __ leave(); // required for proper stackwalking of RuntimeStub frame 2249 __ ret(lr); 2250 2251 return start; 2252 } 2253 2254 // 2255 // Generate stub for array fill. If "aligned" is true, the 2256 // "to" address is assumed to be heapword aligned. 
2257 // 2258 // Arguments for generated stub: 2259 // to: c_rarg0 2260 // value: c_rarg1 2261 // count: c_rarg2 treated as signed 2262 // 2263 address generate_fill(BasicType t, bool aligned, const char *name) { 2264 __ align(CodeEntryAlignment); 2265 StubCodeMark mark(this, "StubRoutines", name); 2266 address start = __ pc(); 2267 2268 BLOCK_COMMENT("Entry:"); 2269 2270 const Register to = c_rarg0; // source array address 2271 const Register value = c_rarg1; // value 2272 const Register count = c_rarg2; // elements count 2273 2274 const Register bz_base = r10; // base for block_zero routine 2275 const Register cnt_words = r11; // temp register 2276 2277 __ enter(); 2278 2279 Label L_fill_elements, L_exit1; 2280 2281 int shift = -1; 2282 switch (t) { 2283 case T_BYTE: 2284 shift = 0; 2285 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2286 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2287 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2288 __ br(Assembler::LO, L_fill_elements); 2289 break; 2290 case T_SHORT: 2291 shift = 1; 2292 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2293 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2294 __ br(Assembler::LO, L_fill_elements); 2295 break; 2296 case T_INT: 2297 shift = 2; 2298 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2299 __ br(Assembler::LO, L_fill_elements); 2300 break; 2301 default: ShouldNotReachHere(); 2302 } 2303 2304 // Align source address at 8 bytes address boundary. 2305 Label L_skip_align1, L_skip_align2, L_skip_align4; 2306 if (!aligned) { 2307 switch (t) { 2308 case T_BYTE: 2309 // One byte misalignment happens only for byte arrays. 2310 __ tbz(to, 0, L_skip_align1); 2311 __ strb(value, Address(__ post(to, 1))); 2312 __ subw(count, count, 1); 2313 __ bind(L_skip_align1); 2314 // Fallthrough 2315 case T_SHORT: 2316 // Two bytes misalignment happens only for byte and short (char) arrays. 2317 __ tbz(to, 1, L_skip_align2); 2318 __ strh(value, Address(__ post(to, 2))); 2319 __ subw(count, count, 2 >> shift); 2320 __ bind(L_skip_align2); 2321 // Fallthrough 2322 case T_INT: 2323 // Align to 8 bytes, we know we are 4 byte aligned to start. 2324 __ tbz(to, 2, L_skip_align4); 2325 __ strw(value, Address(__ post(to, 4))); 2326 __ subw(count, count, 4 >> shift); 2327 __ bind(L_skip_align4); 2328 break; 2329 default: ShouldNotReachHere(); 2330 } 2331 } 2332 2333 // 2334 // Fill large chunks 2335 // 2336 __ lsrw(cnt_words, count, 3 - shift); // number of words 2337 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2338 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2339 if (UseBlockZeroing) { 2340 Label non_block_zeroing, rest; 2341 // If the fill value is zero we can use the fast zero_words(). 2342 __ cbnz(value, non_block_zeroing); 2343 __ mov(bz_base, to); 2344 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2345 __ zero_words(bz_base, cnt_words); 2346 __ b(rest); 2347 __ bind(non_block_zeroing); 2348 __ fill_words(to, cnt_words, value); 2349 __ bind(rest); 2350 } else { 2351 __ fill_words(to, cnt_words, value); 2352 } 2353 2354 // Remaining count is less than 8 bytes. Fill it by a single store. 2355 // Note that the total length is no less than 8 bytes. 
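    // Sketch of the tail store: advance 'to' to the end of the remaining
    // elements and issue one (possibly unaligned) 64-bit store ending there.
    // The bytes it re-writes in front of the tail already contain the fill
    // pattern, and nothing past the end of the fill region is touched:
    //
    //   to += count << shift;
    //   *(uint64_t*)(to - 8) = value;    // value holds the replicated pattern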
2356 if (t == T_BYTE || t == T_SHORT) { 2357 Label L_exit1; 2358 __ cbzw(count, L_exit1); 2359 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2360 __ str(value, Address(to, -8)); // overwrite some elements 2361 __ bind(L_exit1); 2362 __ leave(); 2363 __ ret(lr); 2364 } 2365 2366 // Handle copies less than 8 bytes. 2367 Label L_fill_2, L_fill_4, L_exit2; 2368 __ bind(L_fill_elements); 2369 switch (t) { 2370 case T_BYTE: 2371 __ tbz(count, 0, L_fill_2); 2372 __ strb(value, Address(__ post(to, 1))); 2373 __ bind(L_fill_2); 2374 __ tbz(count, 1, L_fill_4); 2375 __ strh(value, Address(__ post(to, 2))); 2376 __ bind(L_fill_4); 2377 __ tbz(count, 2, L_exit2); 2378 __ strw(value, Address(to)); 2379 break; 2380 case T_SHORT: 2381 __ tbz(count, 0, L_fill_4); 2382 __ strh(value, Address(__ post(to, 2))); 2383 __ bind(L_fill_4); 2384 __ tbz(count, 1, L_exit2); 2385 __ strw(value, Address(to)); 2386 break; 2387 case T_INT: 2388 __ cbzw(count, L_exit2); 2389 __ strw(value, Address(to)); 2390 break; 2391 default: ShouldNotReachHere(); 2392 } 2393 __ bind(L_exit2); 2394 __ leave(); 2395 __ ret(lr); 2396 return start; 2397 } 2398 2399 address generate_data_cache_writeback() { 2400 const Register line = c_rarg0; // address of line to write back 2401 2402 __ align(CodeEntryAlignment); 2403 2404 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2405 2406 address start = __ pc(); 2407 __ enter(); 2408 __ cache_wb(Address(line, 0)); 2409 __ leave(); 2410 __ ret(lr); 2411 2412 return start; 2413 } 2414 2415 address generate_data_cache_writeback_sync() { 2416 const Register is_pre = c_rarg0; // pre or post sync 2417 2418 __ align(CodeEntryAlignment); 2419 2420 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2421 2422 // pre wbsync is a no-op 2423 // post wbsync translates to an sfence 2424 2425 Label skip; 2426 address start = __ pc(); 2427 __ enter(); 2428 __ cbnz(is_pre, skip); 2429 __ cache_wbsync(false); 2430 __ bind(skip); 2431 __ leave(); 2432 __ ret(lr); 2433 2434 return start; 2435 } 2436 2437 void generate_arraycopy_stubs() { 2438 address entry; 2439 address entry_jbyte_arraycopy; 2440 address entry_jshort_arraycopy; 2441 address entry_jint_arraycopy; 2442 address entry_oop_arraycopy; 2443 address entry_jlong_arraycopy; 2444 address entry_checkcast_arraycopy; 2445 2446 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2447 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2448 2449 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2450 2451 //*** jbyte 2452 // Always need aligned and unaligned versions 2453 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2454 "jbyte_disjoint_arraycopy"); 2455 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2456 &entry_jbyte_arraycopy, 2457 "jbyte_arraycopy"); 2458 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2459 "arrayof_jbyte_disjoint_arraycopy"); 2460 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2461 "arrayof_jbyte_arraycopy"); 2462 2463 //*** jshort 2464 // Always need aligned and unaligned versions 2465 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2466 "jshort_disjoint_arraycopy"); 2467 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2468 &entry_jshort_arraycopy, 2469 "jshort_arraycopy"); 2470 StubRoutines::_arrayof_jshort_disjoint_arraycopy = 
generate_disjoint_short_copy(true, &entry, 2471 "arrayof_jshort_disjoint_arraycopy"); 2472 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2473 "arrayof_jshort_arraycopy"); 2474 2475 //*** jint 2476 // Aligned versions 2477 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2478 "arrayof_jint_disjoint_arraycopy"); 2479 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2480 "arrayof_jint_arraycopy"); 2481 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2482 // entry_jint_arraycopy always points to the unaligned version 2483 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2484 "jint_disjoint_arraycopy"); 2485 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2486 &entry_jint_arraycopy, 2487 "jint_arraycopy"); 2488 2489 //*** jlong 2490 // It is always aligned 2491 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2492 "arrayof_jlong_disjoint_arraycopy"); 2493 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2494 "arrayof_jlong_arraycopy"); 2495 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2496 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2497 2498 //*** oops 2499 { 2500 // With compressed oops we need unaligned versions; notice that 2501 // we overwrite entry_oop_arraycopy. 2502 bool aligned = !UseCompressedOops; 2503 2504 StubRoutines::_arrayof_oop_disjoint_arraycopy 2505 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2506 /*dest_uninitialized*/false); 2507 StubRoutines::_arrayof_oop_arraycopy 2508 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2509 /*dest_uninitialized*/false); 2510 // Aligned versions without pre-barriers 2511 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2512 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2513 /*dest_uninitialized*/true); 2514 StubRoutines::_arrayof_oop_arraycopy_uninit 2515 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2516 /*dest_uninitialized*/true); 2517 } 2518 2519 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2520 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2521 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2522 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2523 2524 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2525 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2526 /*dest_uninitialized*/true); 2527 2528 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2529 entry_jbyte_arraycopy, 2530 entry_jshort_arraycopy, 2531 entry_jint_arraycopy, 2532 entry_jlong_arraycopy); 2533 2534 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2535 entry_jbyte_arraycopy, 2536 entry_jshort_arraycopy, 2537 entry_jint_arraycopy, 2538 entry_oop_arraycopy, 2539 entry_jlong_arraycopy, 2540 entry_checkcast_arraycopy); 2541 2542 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 
2543 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2544 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2545 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2546 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2547 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2548 } 2549 2550 void generate_math_stubs() { Unimplemented(); } 2551 2552 // Arguments: 2553 // 2554 // Inputs: 2555 // c_rarg0 - source byte array address 2556 // c_rarg1 - destination byte array address 2557 // c_rarg2 - K (key) in little endian int array 2558 // 2559 address generate_aescrypt_encryptBlock() { 2560 __ align(CodeEntryAlignment); 2561 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2562 2563 Label L_doLast; 2564 2565 const Register from = c_rarg0; // source array address 2566 const Register to = c_rarg1; // destination array address 2567 const Register key = c_rarg2; // key array address 2568 const Register keylen = rscratch1; 2569 2570 address start = __ pc(); 2571 __ enter(); 2572 2573 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2574 2575 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2576 2577 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2578 __ rev32(v1, __ T16B, v1); 2579 __ rev32(v2, __ T16B, v2); 2580 __ rev32(v3, __ T16B, v3); 2581 __ rev32(v4, __ T16B, v4); 2582 __ aese(v0, v1); 2583 __ aesmc(v0, v0); 2584 __ aese(v0, v2); 2585 __ aesmc(v0, v0); 2586 __ aese(v0, v3); 2587 __ aesmc(v0, v0); 2588 __ aese(v0, v4); 2589 __ aesmc(v0, v0); 2590 2591 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2592 __ rev32(v1, __ T16B, v1); 2593 __ rev32(v2, __ T16B, v2); 2594 __ rev32(v3, __ T16B, v3); 2595 __ rev32(v4, __ T16B, v4); 2596 __ aese(v0, v1); 2597 __ aesmc(v0, v0); 2598 __ aese(v0, v2); 2599 __ aesmc(v0, v0); 2600 __ aese(v0, v3); 2601 __ aesmc(v0, v0); 2602 __ aese(v0, v4); 2603 __ aesmc(v0, v0); 2604 2605 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2606 __ rev32(v1, __ T16B, v1); 2607 __ rev32(v2, __ T16B, v2); 2608 2609 __ cmpw(keylen, 44); 2610 __ br(Assembler::EQ, L_doLast); 2611 2612 __ aese(v0, v1); 2613 __ aesmc(v0, v0); 2614 __ aese(v0, v2); 2615 __ aesmc(v0, v0); 2616 2617 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2618 __ rev32(v1, __ T16B, v1); 2619 __ rev32(v2, __ T16B, v2); 2620 2621 __ cmpw(keylen, 52); 2622 __ br(Assembler::EQ, L_doLast); 2623 2624 __ aese(v0, v1); 2625 __ aesmc(v0, v0); 2626 __ aese(v0, v2); 2627 __ aesmc(v0, v0); 2628 2629 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2630 __ rev32(v1, __ T16B, v1); 2631 __ rev32(v2, __ T16B, v2); 2632 2633 __ BIND(L_doLast); 2634 2635 __ aese(v0, v1); 2636 __ aesmc(v0, v0); 2637 __ aese(v0, v2); 2638 2639 __ ld1(v1, __ T16B, key); 2640 __ rev32(v1, __ T16B, v1); 2641 __ eor(v0, __ T16B, v0, v1); 2642 2643 __ st1(v0, __ T16B, to); 2644 2645 __ mov(r0, 0); 2646 2647 __ leave(); 2648 __ ret(lr); 2649 2650 return start; 2651 } 2652 2653 // Arguments: 2654 // 2655 // Inputs: 2656 // c_rarg0 - source byte array address 2657 // c_rarg1 - destination byte array address 2658 // c_rarg2 - K (key) in little endian int array 2659 // 2660 address generate_aescrypt_decryptBlock() { 2661 assert(UseAES, "need AES cryptographic extension support"); 2662 __ align(CodeEntryAlignment); 2663 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2664 Label L_doLast; 2665 2666 const Register 
from = c_rarg0; // source array address 2667 const Register to = c_rarg1; // destination array address 2668 const Register key = c_rarg2; // key array address 2669 const Register keylen = rscratch1; 2670 2671 address start = __ pc(); 2672 __ enter(); // required for proper stackwalking of RuntimeStub frame 2673 2674 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2675 2676 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2677 2678 __ ld1(v5, __ T16B, __ post(key, 16)); 2679 __ rev32(v5, __ T16B, v5); 2680 2681 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2682 __ rev32(v1, __ T16B, v1); 2683 __ rev32(v2, __ T16B, v2); 2684 __ rev32(v3, __ T16B, v3); 2685 __ rev32(v4, __ T16B, v4); 2686 __ aesd(v0, v1); 2687 __ aesimc(v0, v0); 2688 __ aesd(v0, v2); 2689 __ aesimc(v0, v0); 2690 __ aesd(v0, v3); 2691 __ aesimc(v0, v0); 2692 __ aesd(v0, v4); 2693 __ aesimc(v0, v0); 2694 2695 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2696 __ rev32(v1, __ T16B, v1); 2697 __ rev32(v2, __ T16B, v2); 2698 __ rev32(v3, __ T16B, v3); 2699 __ rev32(v4, __ T16B, v4); 2700 __ aesd(v0, v1); 2701 __ aesimc(v0, v0); 2702 __ aesd(v0, v2); 2703 __ aesimc(v0, v0); 2704 __ aesd(v0, v3); 2705 __ aesimc(v0, v0); 2706 __ aesd(v0, v4); 2707 __ aesimc(v0, v0); 2708 2709 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2710 __ rev32(v1, __ T16B, v1); 2711 __ rev32(v2, __ T16B, v2); 2712 2713 __ cmpw(keylen, 44); 2714 __ br(Assembler::EQ, L_doLast); 2715 2716 __ aesd(v0, v1); 2717 __ aesimc(v0, v0); 2718 __ aesd(v0, v2); 2719 __ aesimc(v0, v0); 2720 2721 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2722 __ rev32(v1, __ T16B, v1); 2723 __ rev32(v2, __ T16B, v2); 2724 2725 __ cmpw(keylen, 52); 2726 __ br(Assembler::EQ, L_doLast); 2727 2728 __ aesd(v0, v1); 2729 __ aesimc(v0, v0); 2730 __ aesd(v0, v2); 2731 __ aesimc(v0, v0); 2732 2733 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2734 __ rev32(v1, __ T16B, v1); 2735 __ rev32(v2, __ T16B, v2); 2736 2737 __ BIND(L_doLast); 2738 2739 __ aesd(v0, v1); 2740 __ aesimc(v0, v0); 2741 __ aesd(v0, v2); 2742 2743 __ eor(v0, __ T16B, v0, v5); 2744 2745 __ st1(v0, __ T16B, to); 2746 2747 __ mov(r0, 0); 2748 2749 __ leave(); 2750 __ ret(lr); 2751 2752 return start; 2753 } 2754 2755 // Arguments: 2756 // 2757 // Inputs: 2758 // c_rarg0 - source byte array address 2759 // c_rarg1 - destination byte array address 2760 // c_rarg2 - K (key) in little endian int array 2761 // c_rarg3 - r vector byte array address 2762 // c_rarg4 - input length 2763 // 2764 // Output: 2765 // x0 - input length 2766 // 2767 address generate_cipherBlockChaining_encryptAESCrypt() { 2768 assert(UseAES, "need AES cryptographic extension support"); 2769 __ align(CodeEntryAlignment); 2770 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2771 2772 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2773 2774 const Register from = c_rarg0; // source array address 2775 const Register to = c_rarg1; // destination array address 2776 const Register key = c_rarg2; // key array address 2777 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2778 // and left with the results of the last encryption block 2779 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2780 const Register keylen = rscratch1; 2781 2782 address start = __ pc(); 2783 2784 __ enter(); 2785 2786 __ movw(rscratch2, len_reg); 2787 2788 __ ldrw(keylen, Address(key, 
arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2789 2790 __ ld1(v0, __ T16B, rvec); 2791 2792 __ cmpw(keylen, 52); 2793 __ br(Assembler::CC, L_loadkeys_44); 2794 __ br(Assembler::EQ, L_loadkeys_52); 2795 2796 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2797 __ rev32(v17, __ T16B, v17); 2798 __ rev32(v18, __ T16B, v18); 2799 __ BIND(L_loadkeys_52); 2800 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2801 __ rev32(v19, __ T16B, v19); 2802 __ rev32(v20, __ T16B, v20); 2803 __ BIND(L_loadkeys_44); 2804 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2805 __ rev32(v21, __ T16B, v21); 2806 __ rev32(v22, __ T16B, v22); 2807 __ rev32(v23, __ T16B, v23); 2808 __ rev32(v24, __ T16B, v24); 2809 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2810 __ rev32(v25, __ T16B, v25); 2811 __ rev32(v26, __ T16B, v26); 2812 __ rev32(v27, __ T16B, v27); 2813 __ rev32(v28, __ T16B, v28); 2814 __ ld1(v29, v30, v31, __ T16B, key); 2815 __ rev32(v29, __ T16B, v29); 2816 __ rev32(v30, __ T16B, v30); 2817 __ rev32(v31, __ T16B, v31); 2818 2819 __ BIND(L_aes_loop); 2820 __ ld1(v1, __ T16B, __ post(from, 16)); 2821 __ eor(v0, __ T16B, v0, v1); 2822 2823 __ br(Assembler::CC, L_rounds_44); 2824 __ br(Assembler::EQ, L_rounds_52); 2825 2826 __ aese(v0, v17); __ aesmc(v0, v0); 2827 __ aese(v0, v18); __ aesmc(v0, v0); 2828 __ BIND(L_rounds_52); 2829 __ aese(v0, v19); __ aesmc(v0, v0); 2830 __ aese(v0, v20); __ aesmc(v0, v0); 2831 __ BIND(L_rounds_44); 2832 __ aese(v0, v21); __ aesmc(v0, v0); 2833 __ aese(v0, v22); __ aesmc(v0, v0); 2834 __ aese(v0, v23); __ aesmc(v0, v0); 2835 __ aese(v0, v24); __ aesmc(v0, v0); 2836 __ aese(v0, v25); __ aesmc(v0, v0); 2837 __ aese(v0, v26); __ aesmc(v0, v0); 2838 __ aese(v0, v27); __ aesmc(v0, v0); 2839 __ aese(v0, v28); __ aesmc(v0, v0); 2840 __ aese(v0, v29); __ aesmc(v0, v0); 2841 __ aese(v0, v30); 2842 __ eor(v0, __ T16B, v0, v31); 2843 2844 __ st1(v0, __ T16B, __ post(to, 16)); 2845 2846 __ subw(len_reg, len_reg, 16); 2847 __ cbnzw(len_reg, L_aes_loop); 2848 2849 __ st1(v0, __ T16B, rvec); 2850 2851 __ mov(r0, rscratch2); 2852 2853 __ leave(); 2854 __ ret(lr); 2855 2856 return start; 2857 } 2858 2859 // Arguments: 2860 // 2861 // Inputs: 2862 // c_rarg0 - source byte array address 2863 // c_rarg1 - destination byte array address 2864 // c_rarg2 - K (key) in little endian int array 2865 // c_rarg3 - r vector byte array address 2866 // c_rarg4 - input length 2867 // 2868 // Output: 2869 // r0 - input length 2870 // 2871 address generate_cipherBlockChaining_decryptAESCrypt() { 2872 assert(UseAES, "need AES cryptographic extension support"); 2873 __ align(CodeEntryAlignment); 2874 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2875 2876 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2877 2878 const Register from = c_rarg0; // source array address 2879 const Register to = c_rarg1; // destination array address 2880 const Register key = c_rarg2; // key array address 2881 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2882 // and left with the results of the last encryption block 2883 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2884 const Register keylen = rscratch1; 2885 2886 address start = __ pc(); 2887 2888 __ enter(); 2889 2890 __ movw(rscratch2, len_reg); 2891 2892 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2893 2894 __ ld1(v2, __ 
T16B, rvec); 2895 2896 __ ld1(v31, __ T16B, __ post(key, 16)); 2897 __ rev32(v31, __ T16B, v31); 2898 2899 __ cmpw(keylen, 52); 2900 __ br(Assembler::CC, L_loadkeys_44); 2901 __ br(Assembler::EQ, L_loadkeys_52); 2902 2903 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2904 __ rev32(v17, __ T16B, v17); 2905 __ rev32(v18, __ T16B, v18); 2906 __ BIND(L_loadkeys_52); 2907 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2908 __ rev32(v19, __ T16B, v19); 2909 __ rev32(v20, __ T16B, v20); 2910 __ BIND(L_loadkeys_44); 2911 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2912 __ rev32(v21, __ T16B, v21); 2913 __ rev32(v22, __ T16B, v22); 2914 __ rev32(v23, __ T16B, v23); 2915 __ rev32(v24, __ T16B, v24); 2916 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2917 __ rev32(v25, __ T16B, v25); 2918 __ rev32(v26, __ T16B, v26); 2919 __ rev32(v27, __ T16B, v27); 2920 __ rev32(v28, __ T16B, v28); 2921 __ ld1(v29, v30, __ T16B, key); 2922 __ rev32(v29, __ T16B, v29); 2923 __ rev32(v30, __ T16B, v30); 2924 2925 __ BIND(L_aes_loop); 2926 __ ld1(v0, __ T16B, __ post(from, 16)); 2927 __ orr(v1, __ T16B, v0, v0); 2928 2929 __ br(Assembler::CC, L_rounds_44); 2930 __ br(Assembler::EQ, L_rounds_52); 2931 2932 __ aesd(v0, v17); __ aesimc(v0, v0); 2933 __ aesd(v0, v18); __ aesimc(v0, v0); 2934 __ BIND(L_rounds_52); 2935 __ aesd(v0, v19); __ aesimc(v0, v0); 2936 __ aesd(v0, v20); __ aesimc(v0, v0); 2937 __ BIND(L_rounds_44); 2938 __ aesd(v0, v21); __ aesimc(v0, v0); 2939 __ aesd(v0, v22); __ aesimc(v0, v0); 2940 __ aesd(v0, v23); __ aesimc(v0, v0); 2941 __ aesd(v0, v24); __ aesimc(v0, v0); 2942 __ aesd(v0, v25); __ aesimc(v0, v0); 2943 __ aesd(v0, v26); __ aesimc(v0, v0); 2944 __ aesd(v0, v27); __ aesimc(v0, v0); 2945 __ aesd(v0, v28); __ aesimc(v0, v0); 2946 __ aesd(v0, v29); __ aesimc(v0, v0); 2947 __ aesd(v0, v30); 2948 __ eor(v0, __ T16B, v0, v31); 2949 __ eor(v0, __ T16B, v0, v2); 2950 2951 __ st1(v0, __ T16B, __ post(to, 16)); 2952 __ orr(v2, __ T16B, v1, v1); 2953 2954 __ subw(len_reg, len_reg, 16); 2955 __ cbnzw(len_reg, L_aes_loop); 2956 2957 __ st1(v2, __ T16B, rvec); 2958 2959 __ mov(r0, rscratch2); 2960 2961 __ leave(); 2962 __ ret(lr); 2963 2964 return start; 2965 } 2966 2967 // CTR AES crypt. 2968 // Arguments: 2969 // 2970 // Inputs: 2971 // c_rarg0 - source byte array address 2972 // c_rarg1 - destination byte array address 2973 // c_rarg2 - K (key) in little endian int array 2974 // c_rarg3 - counter vector byte array address 2975 // c_rarg4 - input length 2976 // c_rarg5 - saved encryptedCounter start 2977 // c_rarg6 - saved used length 2978 // 2979 // Output: 2980 // r0 - input length 2981 // 2982 address generate_counterMode_AESCrypt() { 2983 const Register in = c_rarg0; 2984 const Register out = c_rarg1; 2985 const Register key = c_rarg2; 2986 const Register counter = c_rarg3; 2987 const Register saved_len = c_rarg4, len = r10; 2988 const Register saved_encrypted_ctr = c_rarg5; 2989 const Register used_ptr = c_rarg6, used = r12; 2990 2991 const Register offset = r7; 2992 const Register keylen = r11; 2993 2994 const unsigned char block_size = 16; 2995 const int bulk_width = 4; 2996 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 2997 // performance with larger data sizes, but it also means that the 2998 // fast path isn't used until you have at least 8 blocks, and up 2999 // to 127 bytes of data will be executed on the slow path. For 3000 // that reason, and also so as not to blow away too much icache, 4 3001 // blocks seems like a sensible compromise. 
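    // For reference, CTR mode produces, per 16-byte block i (sketch only):
    //
    //   out[i] = in[i] ^ AES_encrypt(key, counter_i)
    //
    // where counter_i is the counter after i increments. The bulk path below
    // (CTR_large_block) materialises bulk_width consecutive counter values and
    // encrypts them in parallel; the scalar path handles everything else.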
3002 3003 // Algorithm: 3004 // 3005 // if (len == 0) { 3006 // goto DONE; 3007 // } 3008 // int result = len; 3009 // do { 3010 // if (used >= blockSize) { 3011 // if (len >= bulk_width * blockSize) { 3012 // CTR_large_block(); 3013 // if (len == 0) 3014 // goto DONE; 3015 // } 3016 // for (;;) { 3017 // 16ByteVector v0 = counter; 3018 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3019 // used = 0; 3020 // if (len < blockSize) 3021 // break; /* goto NEXT */ 3022 // 16ByteVector v1 = load16Bytes(in, offset); 3023 // v1 = v1 ^ encryptedCounter; 3024 // store16Bytes(out, offset); 3025 // used = blockSize; 3026 // offset += blockSize; 3027 // len -= blockSize; 3028 // if (len == 0) 3029 // goto DONE; 3030 // } 3031 // } 3032 // NEXT: 3033 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3034 // len--; 3035 // } while (len != 0); 3036 // DONE: 3037 // return result; 3038 // 3039 // CTR_large_block() 3040 // Wide bulk encryption of whole blocks. 3041 3042 __ align(CodeEntryAlignment); 3043 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 3044 const address start = __ pc(); 3045 __ enter(); 3046 3047 Label DONE, CTR_large_block, large_block_return; 3048 __ ldrw(used, Address(used_ptr)); 3049 __ cbzw(saved_len, DONE); 3050 3051 __ mov(len, saved_len); 3052 __ mov(offset, 0); 3053 3054 // Compute #rounds for AES based on the length of the key array 3055 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3056 3057 __ aesenc_loadkeys(key, keylen); 3058 3059 { 3060 Label L_CTR_loop, NEXT; 3061 3062 __ bind(L_CTR_loop); 3063 3064 __ cmp(used, block_size); 3065 __ br(__ LO, NEXT); 3066 3067 // Maybe we have a lot of data 3068 __ subsw(rscratch1, len, bulk_width * block_size); 3069 __ br(__ HS, CTR_large_block); 3070 __ BIND(large_block_return); 3071 __ cbzw(len, DONE); 3072 3073 // Setup the counter 3074 __ movi(v4, __ T4S, 0); 3075 __ movi(v5, __ T4S, 1); 3076 __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 } 3077 3078 __ ld1(v0, __ T16B, counter); // Load the counter into v0 3079 __ rev32(v16, __ T16B, v0); 3080 __ addv(v16, __ T4S, v16, v4); 3081 __ rev32(v16, __ T16B, v16); 3082 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3083 3084 { 3085 // We have fewer than bulk_width blocks of data left. Encrypt 3086 // them one by one until there is less than a full block 3087 // remaining, being careful to save both the encrypted counter 3088 // and the counter. 3089 3090 Label inner_loop; 3091 __ bind(inner_loop); 3092 // Counter to encrypt is in v0 3093 __ aesecb_encrypt(noreg, noreg, keylen); 3094 __ st1(v0, __ T16B, saved_encrypted_ctr); 3095 3096 // Do we have a remaining full block? 3097 3098 __ mov(used, 0); 3099 __ cmp(len, block_size); 3100 __ br(__ LO, NEXT); 3101 3102 // Yes, we have a full block 3103 __ ldrq(v1, Address(in, offset)); 3104 __ eor(v1, __ T16B, v1, v0); 3105 __ strq(v1, Address(out, offset)); 3106 __ mov(used, block_size); 3107 __ add(offset, offset, block_size); 3108 3109 __ subw(len, len, block_size); 3110 __ cbzw(len, DONE); 3111 3112 // Increment the counter, store it back 3113 __ orr(v0, __ T16B, v16, v16); 3114 __ rev32(v16, __ T16B, v16); 3115 __ addv(v16, __ T4S, v16, v4); 3116 __ rev32(v16, __ T16B, v16); 3117 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3118 3119 __ b(inner_loop); 3120 } 3121 3122 __ BIND(NEXT); 3123 3124 // Encrypt a single byte, and loop. 3125 // We expect this to be a rare event. 
3126 __ ldrb(rscratch1, Address(in, offset)); 3127 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3128 __ eor(rscratch1, rscratch1, rscratch2); 3129 __ strb(rscratch1, Address(out, offset)); 3130 __ add(offset, offset, 1); 3131 __ add(used, used, 1); 3132 __ subw(len, len,1); 3133 __ cbnzw(len, L_CTR_loop); 3134 } 3135 3136 __ bind(DONE); 3137 __ strw(used, Address(used_ptr)); 3138 __ mov(r0, saved_len); 3139 3140 __ leave(); // required for proper stackwalking of RuntimeStub frame 3141 __ ret(lr); 3142 3143 // Bulk encryption 3144 3145 __ BIND (CTR_large_block); 3146 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3147 3148 if (bulk_width == 8) { 3149 __ sub(sp, sp, 4 * 16); 3150 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3151 } 3152 __ sub(sp, sp, 4 * 16); 3153 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3154 RegSet saved_regs = (RegSet::of(in, out, offset) 3155 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3156 __ push(saved_regs, sp); 3157 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3158 __ add(in, in, offset); 3159 __ add(out, out, offset); 3160 3161 // Keys should already be loaded into the correct registers 3162 3163 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3164 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3165 3166 // AES/CTR loop 3167 { 3168 Label L_CTR_loop; 3169 __ BIND(L_CTR_loop); 3170 3171 // Setup the counters 3172 __ movi(v8, __ T4S, 0); 3173 __ movi(v9, __ T4S, 1); 3174 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3175 3176 for (FloatRegister f = v0; f < v0 + bulk_width; f++) { 3177 __ rev32(f, __ T16B, v16); 3178 __ addv(v16, __ T4S, v16, v8); 3179 } 3180 3181 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3182 3183 // Encrypt the counters 3184 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3185 3186 if (bulk_width == 8) { 3187 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3188 } 3189 3190 // XOR the encrypted counters with the inputs 3191 for (int i = 0; i < bulk_width; i++) { 3192 __ eor(v0 + i, __ T16B, v0 + i, v8 + i); 3193 } 3194 3195 // Write the encrypted data 3196 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3197 if (bulk_width == 8) { 3198 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3199 } 3200 3201 __ subw(len, len, 16 * bulk_width); 3202 __ cbnzw(len, L_CTR_loop); 3203 } 3204 3205 // Save the counter back where it goes 3206 __ rev32(v16, __ T16B, v16); 3207 __ st1(v16, __ T16B, counter); 3208 3209 __ pop(saved_regs, sp); 3210 3211 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3212 if (bulk_width == 8) { 3213 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3214 } 3215 3216 __ andr(rscratch1, len, -16 * bulk_width); 3217 __ sub(len, len, rscratch1); 3218 __ add(offset, offset, rscratch1); 3219 __ mov(used, 16); 3220 __ strw(used, Address(used_ptr)); 3221 __ b(large_block_return); 3222 3223 return start; 3224 } 3225 3226 // Arguments: 3227 // 3228 // Inputs: 3229 // c_rarg0 - byte[] source+offset 3230 // c_rarg1 - int[] SHA.state 3231 // c_rarg2 - int offset 3232 // c_rarg3 - int limit 3233 // 3234 address generate_md5_implCompress(bool multi_block, const char *name) { 3235 __ align(CodeEntryAlignment); 3236 StubCodeMark mark(this, "StubRoutines", name); 3237 address start = __ pc(); 3238 3239 Register buf = c_rarg0; 3240 Register state = c_rarg1; 3241 Register ofs = c_rarg2; 3242 Register limit = c_rarg3; 3243 Register a = r4; 3244 Register b = r5; 3245 Register c = r6; 
3246 Register d = r7; 3247 Register rscratch3 = r10; 3248 Register rscratch4 = r11; 3249 3250 Label keys; 3251 Label md5_loop; 3252 3253 __ BIND(md5_loop); 3254 3255 // Save hash values for addition after rounds 3256 __ ldrw(a, Address(state, 0)); 3257 __ ldrw(b, Address(state, 4)); 3258 __ ldrw(c, Address(state, 8)); 3259 __ ldrw(d, Address(state, 12)); 3260 3261 #define FF(r1, r2, r3, r4, k, s, t) \ 3262 __ eorw(rscratch3, r3, r4); \ 3263 __ movw(rscratch2, t); \ 3264 __ andw(rscratch3, rscratch3, r2); \ 3265 __ addw(rscratch4, r1, rscratch2); \ 3266 __ ldrw(rscratch1, Address(buf, k*4)); \ 3267 __ eorw(rscratch3, rscratch3, r4); \ 3268 __ addw(rscratch3, rscratch3, rscratch1); \ 3269 __ addw(rscratch3, rscratch3, rscratch4); \ 3270 __ rorw(rscratch2, rscratch3, 32 - s); \ 3271 __ addw(r1, rscratch2, r2); 3272 3273 #define GG(r1, r2, r3, r4, k, s, t) \ 3274 __ eorw(rscratch2, r2, r3); \ 3275 __ ldrw(rscratch1, Address(buf, k*4)); \ 3276 __ andw(rscratch3, rscratch2, r4); \ 3277 __ movw(rscratch2, t); \ 3278 __ eorw(rscratch3, rscratch3, r3); \ 3279 __ addw(rscratch4, r1, rscratch2); \ 3280 __ addw(rscratch3, rscratch3, rscratch1); \ 3281 __ addw(rscratch3, rscratch3, rscratch4); \ 3282 __ rorw(rscratch2, rscratch3, 32 - s); \ 3283 __ addw(r1, rscratch2, r2); 3284 3285 #define HH(r1, r2, r3, r4, k, s, t) \ 3286 __ eorw(rscratch3, r3, r4); \ 3287 __ movw(rscratch2, t); \ 3288 __ addw(rscratch4, r1, rscratch2); \ 3289 __ ldrw(rscratch1, Address(buf, k*4)); \ 3290 __ eorw(rscratch3, rscratch3, r2); \ 3291 __ addw(rscratch3, rscratch3, rscratch1); \ 3292 __ addw(rscratch3, rscratch3, rscratch4); \ 3293 __ rorw(rscratch2, rscratch3, 32 - s); \ 3294 __ addw(r1, rscratch2, r2); 3295 3296 #define II(r1, r2, r3, r4, k, s, t) \ 3297 __ movw(rscratch3, t); \ 3298 __ ornw(rscratch2, r2, r4); \ 3299 __ addw(rscratch4, r1, rscratch3); \ 3300 __ ldrw(rscratch1, Address(buf, k*4)); \ 3301 __ eorw(rscratch3, rscratch2, r3); \ 3302 __ addw(rscratch3, rscratch3, rscratch1); \ 3303 __ addw(rscratch3, rscratch3, rscratch4); \ 3304 __ rorw(rscratch2, rscratch3, 32 - s); \ 3305 __ addw(r1, rscratch2, r2); 3306 3307 // Round 1 3308 FF(a, b, c, d, 0, 7, 0xd76aa478) 3309 FF(d, a, b, c, 1, 12, 0xe8c7b756) 3310 FF(c, d, a, b, 2, 17, 0x242070db) 3311 FF(b, c, d, a, 3, 22, 0xc1bdceee) 3312 FF(a, b, c, d, 4, 7, 0xf57c0faf) 3313 FF(d, a, b, c, 5, 12, 0x4787c62a) 3314 FF(c, d, a, b, 6, 17, 0xa8304613) 3315 FF(b, c, d, a, 7, 22, 0xfd469501) 3316 FF(a, b, c, d, 8, 7, 0x698098d8) 3317 FF(d, a, b, c, 9, 12, 0x8b44f7af) 3318 FF(c, d, a, b, 10, 17, 0xffff5bb1) 3319 FF(b, c, d, a, 11, 22, 0x895cd7be) 3320 FF(a, b, c, d, 12, 7, 0x6b901122) 3321 FF(d, a, b, c, 13, 12, 0xfd987193) 3322 FF(c, d, a, b, 14, 17, 0xa679438e) 3323 FF(b, c, d, a, 15, 22, 0x49b40821) 3324 3325 // Round 2 3326 GG(a, b, c, d, 1, 5, 0xf61e2562) 3327 GG(d, a, b, c, 6, 9, 0xc040b340) 3328 GG(c, d, a, b, 11, 14, 0x265e5a51) 3329 GG(b, c, d, a, 0, 20, 0xe9b6c7aa) 3330 GG(a, b, c, d, 5, 5, 0xd62f105d) 3331 GG(d, a, b, c, 10, 9, 0x02441453) 3332 GG(c, d, a, b, 15, 14, 0xd8a1e681) 3333 GG(b, c, d, a, 4, 20, 0xe7d3fbc8) 3334 GG(a, b, c, d, 9, 5, 0x21e1cde6) 3335 GG(d, a, b, c, 14, 9, 0xc33707d6) 3336 GG(c, d, a, b, 3, 14, 0xf4d50d87) 3337 GG(b, c, d, a, 8, 20, 0x455a14ed) 3338 GG(a, b, c, d, 13, 5, 0xa9e3e905) 3339 GG(d, a, b, c, 2, 9, 0xfcefa3f8) 3340 GG(c, d, a, b, 7, 14, 0x676f02d9) 3341 GG(b, c, d, a, 12, 20, 0x8d2a4c8a) 3342 3343 // Round 3 3344 HH(a, b, c, d, 5, 4, 0xfffa3942) 3345 HH(d, a, b, c, 8, 11, 0x8771f681) 3346 HH(c, d, a, b, 11, 16, 0x6d9d6122) 3347 
HH(b, c, d, a, 14, 23, 0xfde5380c) 3348 HH(a, b, c, d, 1, 4, 0xa4beea44) 3349 HH(d, a, b, c, 4, 11, 0x4bdecfa9) 3350 HH(c, d, a, b, 7, 16, 0xf6bb4b60) 3351 HH(b, c, d, a, 10, 23, 0xbebfbc70) 3352 HH(a, b, c, d, 13, 4, 0x289b7ec6) 3353 HH(d, a, b, c, 0, 11, 0xeaa127fa) 3354 HH(c, d, a, b, 3, 16, 0xd4ef3085) 3355 HH(b, c, d, a, 6, 23, 0x04881d05) 3356 HH(a, b, c, d, 9, 4, 0xd9d4d039) 3357 HH(d, a, b, c, 12, 11, 0xe6db99e5) 3358 HH(c, d, a, b, 15, 16, 0x1fa27cf8) 3359 HH(b, c, d, a, 2, 23, 0xc4ac5665) 3360 3361 // Round 4 3362 II(a, b, c, d, 0, 6, 0xf4292244) 3363 II(d, a, b, c, 7, 10, 0x432aff97) 3364 II(c, d, a, b, 14, 15, 0xab9423a7) 3365 II(b, c, d, a, 5, 21, 0xfc93a039) 3366 II(a, b, c, d, 12, 6, 0x655b59c3) 3367 II(d, a, b, c, 3, 10, 0x8f0ccc92) 3368 II(c, d, a, b, 10, 15, 0xffeff47d) 3369 II(b, c, d, a, 1, 21, 0x85845dd1) 3370 II(a, b, c, d, 8, 6, 0x6fa87e4f) 3371 II(d, a, b, c, 15, 10, 0xfe2ce6e0) 3372 II(c, d, a, b, 6, 15, 0xa3014314) 3373 II(b, c, d, a, 13, 21, 0x4e0811a1) 3374 II(a, b, c, d, 4, 6, 0xf7537e82) 3375 II(d, a, b, c, 11, 10, 0xbd3af235) 3376 II(c, d, a, b, 2, 15, 0x2ad7d2bb) 3377 II(b, c, d, a, 9, 21, 0xeb86d391) 3378 3379 #undef FF 3380 #undef GG 3381 #undef HH 3382 #undef II 3383 3384 // write hash values back in the correct order 3385 __ ldrw(rscratch1, Address(state, 0)); 3386 __ addw(rscratch1, rscratch1, a); 3387 __ strw(rscratch1, Address(state, 0)); 3388 3389 __ ldrw(rscratch2, Address(state, 4)); 3390 __ addw(rscratch2, rscratch2, b); 3391 __ strw(rscratch2, Address(state, 4)); 3392 3393 __ ldrw(rscratch3, Address(state, 8)); 3394 __ addw(rscratch3, rscratch3, c); 3395 __ strw(rscratch3, Address(state, 8)); 3396 3397 __ ldrw(rscratch4, Address(state, 12)); 3398 __ addw(rscratch4, rscratch4, d); 3399 __ strw(rscratch4, Address(state, 12)); 3400 3401 if (multi_block) { 3402 __ add(buf, buf, 64); 3403 __ add(ofs, ofs, 64); 3404 __ cmp(ofs, limit); 3405 __ br(Assembler::LE, md5_loop); 3406 __ mov(c_rarg0, ofs); // return ofs 3407 } 3408 3409 __ ret(lr); 3410 3411 return start; 3412 } 3413 3414 // Arguments: 3415 // 3416 // Inputs: 3417 // c_rarg0 - byte[] source+offset 3418 // c_rarg1 - int[] SHA.state 3419 // c_rarg2 - int offset 3420 // c_rarg3 - int limit 3421 // 3422 address generate_sha1_implCompress(bool multi_block, const char *name) { 3423 __ align(CodeEntryAlignment); 3424 StubCodeMark mark(this, "StubRoutines", name); 3425 address start = __ pc(); 3426 3427 Register buf = c_rarg0; 3428 Register state = c_rarg1; 3429 Register ofs = c_rarg2; 3430 Register limit = c_rarg3; 3431 3432 Label keys; 3433 Label sha1_loop; 3434 3435 // load the keys into v0..v3 3436 __ adr(rscratch1, keys); 3437 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3438 // load 5 words state into v6, v7 3439 __ ldrq(v6, Address(state, 0)); 3440 __ ldrs(v7, Address(state, 16)); 3441 3442 3443 __ BIND(sha1_loop); 3444 // load 64 bytes of data into v16..v19 3445 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3446 __ rev32(v16, __ T16B, v16); 3447 __ rev32(v17, __ T16B, v17); 3448 __ rev32(v18, __ T16B, v18); 3449 __ rev32(v19, __ T16B, v19); 3450 3451 // do the sha1 3452 __ addv(v4, __ T4S, v16, v0); 3453 __ orr(v20, __ T16B, v6, v6); 3454 3455 FloatRegister d0 = v16; 3456 FloatRegister d1 = v17; 3457 FloatRegister d2 = v18; 3458 FloatRegister d3 = v19; 3459 3460 for (int round = 0; round < 20; round++) { 3461 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3462 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3463 FloatRegister tmp3 = round ? ((round & 1) ? 
v22 : v21) : v7; 3464 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3465 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3466 3467 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3468 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3469 __ sha1h(tmp2, __ T4S, v20); 3470 if (round < 5) 3471 __ sha1c(v20, __ T4S, tmp3, tmp4); 3472 else if (round < 10 || round >= 15) 3473 __ sha1p(v20, __ T4S, tmp3, tmp4); 3474 else 3475 __ sha1m(v20, __ T4S, tmp3, tmp4); 3476 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3477 3478 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3479 } 3480 3481 __ addv(v7, __ T2S, v7, v21); 3482 __ addv(v6, __ T4S, v6, v20); 3483 3484 if (multi_block) { 3485 __ add(ofs, ofs, 64); 3486 __ cmp(ofs, limit); 3487 __ br(Assembler::LE, sha1_loop); 3488 __ mov(c_rarg0, ofs); // return ofs 3489 } 3490 3491 __ strq(v6, Address(state, 0)); 3492 __ strs(v7, Address(state, 16)); 3493 3494 __ ret(lr); 3495 3496 __ bind(keys); 3497 __ emit_int32(0x5a827999); 3498 __ emit_int32(0x6ed9eba1); 3499 __ emit_int32(0x8f1bbcdc); 3500 __ emit_int32(0xca62c1d6); 3501 3502 return start; 3503 } 3504 3505 3506 // Arguments: 3507 // 3508 // Inputs: 3509 // c_rarg0 - byte[] source+offset 3510 // c_rarg1 - int[] SHA.state 3511 // c_rarg2 - int offset 3512 // c_rarg3 - int limit 3513 // 3514 address generate_sha256_implCompress(bool multi_block, const char *name) { 3515 static const uint32_t round_consts[64] = { 3516 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3517 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3518 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3519 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3520 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3521 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3522 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3523 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3524 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3525 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3526 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3527 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3528 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3529 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3530 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3531 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3532 }; 3533 __ align(CodeEntryAlignment); 3534 StubCodeMark mark(this, "StubRoutines", name); 3535 address start = __ pc(); 3536 3537 Register buf = c_rarg0; 3538 Register state = c_rarg1; 3539 Register ofs = c_rarg2; 3540 Register limit = c_rarg3; 3541 3542 Label sha1_loop; 3543 3544 __ stpd(v8, v9, __ pre(sp, -32)); 3545 __ stpd(v10, v11, Address(sp, 16)); 3546 3547 // dga == v0 3548 // dgb == v1 3549 // dg0 == v2 3550 // dg1 == v3 3551 // dg2 == v4 3552 // t0 == v6 3553 // t1 == v7 3554 3555 // load 16 keys to v16..v31 3556 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3557 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3558 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3559 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3560 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3561 3562 // load 8 words (256 bits) state 3563 __ ldpq(v0, v1, state); 3564 3565 __ BIND(sha1_loop); 3566 // load 64 bytes of data into v8..v11 3567 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3568 __ rev32(v8, __ T16B, v8); 3569 __ rev32(v9, __ T16B, v9); 3570 __ rev32(v10, __ T16B, v10); 3571 __ rev32(v11, __ T16B, v11); 3572 3573 __ addv(v6, __ T4S, v8, v16); 3574 __ orr(v2, __ T16B, v0, v0); 3575 __ orr(v3, __ T16B, v1, v1); 3576 3577 FloatRegister d0 = v8; 3578 FloatRegister d1 = v9; 3579 FloatRegister d2 = v10; 3580 FloatRegister d3 = v11; 3581 3582 3583 for (int round = 0; round < 16; round++) { 3584 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3585 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3586 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3587 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3588 3589 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3590 __ orr(v4, __ T16B, v2, v2); 3591 if (round < 15) 3592 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3593 __ sha256h(v2, __ T4S, v3, tmp2); 3594 __ sha256h2(v3, __ T4S, v4, tmp2); 3595 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3596 3597 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3598 } 3599 3600 __ addv(v0, __ T4S, v0, v2); 3601 __ addv(v1, __ T4S, v1, v3); 3602 3603 if (multi_block) { 3604 __ add(ofs, ofs, 64); 3605 __ cmp(ofs, limit); 3606 __ br(Assembler::LE, sha1_loop); 3607 __ mov(c_rarg0, ofs); // return ofs 3608 } 3609 3610 __ ldpd(v10, v11, Address(sp, 16)); 3611 __ ldpd(v8, v9, __ post(sp, 32)); 3612 3613 __ stpq(v0, v1, state); 3614 3615 __ ret(lr); 3616 3617 return start; 3618 } 3619 3620 // Arguments: 3621 // 3622 // Inputs: 3623 // c_rarg0 - byte[] source+offset 3624 // c_rarg1 - int[] SHA.state 3625 // c_rarg2 - int offset 3626 // c_rarg3 - int limit 3627 // 3628 address generate_sha512_implCompress(bool multi_block, const char *name) { 3629 static const uint64_t round_consts[80] = { 3630 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3631 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3632 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3633 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3634 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3635 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3636 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3637 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3638 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3639 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3640 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3641 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3642 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3643 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3644 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3645 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3646 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3647 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3648 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3649 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3650 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3651 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3652 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3653 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3654 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3655 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3656 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 
3657 }; 3658 3659 // Double rounds for sha512. 3660 #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \ 3661 if (dr < 36) \ 3662 __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16)); \ 3663 __ addv(v5, __ T2D, v##rc0, v##in0); \ 3664 __ ext(v6, __ T16B, v##i2, v##i3, 8); \ 3665 __ ext(v5, __ T16B, v5, v5, 8); \ 3666 __ ext(v7, __ T16B, v##i1, v##i2, 8); \ 3667 __ addv(v##i3, __ T2D, v##i3, v5); \ 3668 if (dr < 32) { \ 3669 __ ext(v5, __ T16B, v##in3, v##in4, 8); \ 3670 __ sha512su0(v##in0, __ T2D, v##in1); \ 3671 } \ 3672 __ sha512h(v##i3, __ T2D, v6, v7); \ 3673 if (dr < 32) \ 3674 __ sha512su1(v##in0, __ T2D, v##in2, v5); \ 3675 __ addv(v##i4, __ T2D, v##i1, v##i3); \ 3676 __ sha512h2(v##i3, __ T2D, v##i1, v##i0); \ 3677 3678 __ align(CodeEntryAlignment); 3679 StubCodeMark mark(this, "StubRoutines", name); 3680 address start = __ pc(); 3681 3682 Register buf = c_rarg0; 3683 Register state = c_rarg1; 3684 Register ofs = c_rarg2; 3685 Register limit = c_rarg3; 3686 3687 __ stpd(v8, v9, __ pre(sp, -64)); 3688 __ stpd(v10, v11, Address(sp, 16)); 3689 __ stpd(v12, v13, Address(sp, 32)); 3690 __ stpd(v14, v15, Address(sp, 48)); 3691 3692 Label sha512_loop; 3693 3694 // load state 3695 __ ld1(v8, v9, v10, v11, __ T2D, state); 3696 3697 // load first 4 round constants 3698 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3699 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3700 3701 __ BIND(sha512_loop); 3702 // load 128B of data into v12..v19 3703 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3704 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3705 __ rev64(v12, __ T16B, v12); 3706 __ rev64(v13, __ T16B, v13); 3707 __ rev64(v14, __ T16B, v14); 3708 __ rev64(v15, __ T16B, v15); 3709 __ rev64(v16, __ T16B, v16); 3710 __ rev64(v17, __ T16B, v17); 3711 __ rev64(v18, __ T16B, v18); 3712 __ rev64(v19, __ T16B, v19); 3713 3714 __ mov(rscratch2, rscratch1); 3715 3716 __ mov(v0, __ T16B, v8); 3717 __ mov(v1, __ T16B, v9); 3718 __ mov(v2, __ T16B, v10); 3719 __ mov(v3, __ T16B, v11); 3720 3721 sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17); 3722 sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18); 3723 sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19); 3724 sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12); 3725 sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13); 3726 sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14); 3727 sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15); 3728 sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16); 3729 sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17); 3730 sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18); 3731 sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19); 3732 sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12); 3733 sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13); 3734 sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14); 3735 sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15); 3736 sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16); 3737 sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17); 3738 sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18); 3739 sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19); 3740 sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12); 3741 sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13); 3742 sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14); 3743 
sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15); 3744 sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16); 3745 sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17); 3746 sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18); 3747 sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19); 3748 sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12); 3749 sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13); 3750 sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14); 3751 sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15); 3752 sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16); 3753 sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12, 0, 0, 0, 0); 3754 sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13, 0, 0, 0, 0); 3755 sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14, 0, 0, 0, 0); 3756 sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15, 0, 0, 0, 0); 3757 sha512_dround(36, 3, 0, 4, 2, 1, 24, 0, 16, 0, 0, 0, 0); 3758 sha512_dround(37, 2, 3, 1, 4, 0, 25, 0, 17, 0, 0, 0, 0); 3759 sha512_dround(38, 4, 2, 0, 1, 3, 26, 0, 18, 0, 0, 0, 0); 3760 sha512_dround(39, 1, 4, 3, 0, 2, 27, 0, 19, 0, 0, 0, 0); 3761 3762 __ addv(v8, __ T2D, v8, v0); 3763 __ addv(v9, __ T2D, v9, v1); 3764 __ addv(v10, __ T2D, v10, v2); 3765 __ addv(v11, __ T2D, v11, v3); 3766 3767 if (multi_block) { 3768 __ add(ofs, ofs, 128); 3769 __ cmp(ofs, limit); 3770 __ br(Assembler::LE, sha512_loop); 3771 __ mov(c_rarg0, ofs); // return ofs 3772 } 3773 3774 __ st1(v8, v9, v10, v11, __ T2D, state); 3775 3776 __ ldpd(v14, v15, Address(sp, 48)); 3777 __ ldpd(v12, v13, Address(sp, 32)); 3778 __ ldpd(v10, v11, Address(sp, 16)); 3779 __ ldpd(v8, v9, __ post(sp, 64)); 3780 3781 __ ret(lr); 3782 3783 return start; 3784 } 3785 3786 // Arguments: 3787 // 3788 // Inputs: 3789 // c_rarg0 - byte[] source+offset 3790 // c_rarg1 - byte[] SHA.state 3791 // c_rarg2 - int digest_length 3792 // c_rarg3 - int offset 3793 // c_rarg4 - int limit 3794 // 3795 address generate_sha3_implCompress(bool multi_block, const char *name) { 3796 static const uint64_t round_consts[24] = { 3797 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 3798 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 3799 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 3800 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 3801 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 3802 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 3803 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 3804 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 3805 }; 3806 3807 __ align(CodeEntryAlignment); 3808 StubCodeMark mark(this, "StubRoutines", name); 3809 address start = __ pc(); 3810 3811 Register buf = c_rarg0; 3812 Register state = c_rarg1; 3813 Register digest_length = c_rarg2; 3814 Register ofs = c_rarg3; 3815 Register limit = c_rarg4; 3816 3817 Label sha3_loop, rounds24_loop; 3818 Label sha3_512, sha3_384_or_224, sha3_256; 3819 3820 __ stpd(v8, v9, __ pre(sp, -64)); 3821 __ stpd(v10, v11, Address(sp, 16)); 3822 __ stpd(v12, v13, Address(sp, 32)); 3823 __ stpd(v14, v15, Address(sp, 48)); 3824 3825 // load state 3826 __ add(rscratch1, state, 32); 3827 __ ld1(v0, v1, v2, v3, __ T1D, state); 3828 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 3829 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 3830 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 3831 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 
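// For orientation: the Keccak-f[1600] state is 25 64-bit lanes (200 bytes),
// held here as one lane per register in v0..v24. Each block absorbed below
// XORs rate = 200 - 2*digest_length bytes of input into the leading lanes
// before running the 24-round permutation. A rough scalar sketch of the
// absorb step, assuming a uint64_t st[25] view of the state (illustrative
// names only, not this stub's):
//
//   for (int i = 0; i < rate_bytes / 8; i++) {   // rate_bytes is a multiple of 8
//     uint64_t lane;
//     memcpy(&lane, block + 8 * i, 8);           // little-endian input lane
//     st[i] ^= lane;
//   }
//   keccak_f1600(st);                            // theta/rho/pi/chi/iota x 24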
3832 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 3833 __ ld1(v24, __ T1D, rscratch1); 3834 3835 __ BIND(sha3_loop); 3836 3837 // 24 keccak rounds 3838 __ movw(rscratch2, 24); 3839 3840 // load round_constants base 3841 __ lea(rscratch1, ExternalAddress((address) round_consts)); 3842 3843 // load input 3844 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 3845 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 3846 __ eor(v0, __ T8B, v0, v25); 3847 __ eor(v1, __ T8B, v1, v26); 3848 __ eor(v2, __ T8B, v2, v27); 3849 __ eor(v3, __ T8B, v3, v28); 3850 __ eor(v4, __ T8B, v4, v29); 3851 __ eor(v5, __ T8B, v5, v30); 3852 __ eor(v6, __ T8B, v6, v31); 3853 3854 // digest_length == 64, SHA3-512 3855 __ tbnz(digest_length, 6, sha3_512); 3856 3857 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 3858 __ ld1(v29, v30, __ T8B, __ post(buf, 16)); 3859 __ eor(v7, __ T8B, v7, v25); 3860 __ eor(v8, __ T8B, v8, v26); 3861 __ eor(v9, __ T8B, v9, v27); 3862 __ eor(v10, __ T8B, v10, v28); 3863 __ eor(v11, __ T8B, v11, v29); 3864 __ eor(v12, __ T8B, v12, v30); 3865 3866 // digest_length == 28, SHA3-224; digest_length == 48, SHA3-384 3867 __ tbnz(digest_length, 4, sha3_384_or_224); 3868 3869 // SHA3-256 3870 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 3871 __ eor(v13, __ T8B, v13, v25); 3872 __ eor(v14, __ T8B, v14, v26); 3873 __ eor(v15, __ T8B, v15, v27); 3874 __ eor(v16, __ T8B, v16, v28); 3875 __ b(rounds24_loop); 3876 3877 __ BIND(sha3_384_or_224); 3878 __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA-384 3879 3880 // SHA3-224 3881 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 3882 __ ld1(v29, __ T8B, __ post(buf, 8)); 3883 __ eor(v13, __ T8B, v13, v25); 3884 __ eor(v14, __ T8B, v14, v26); 3885 __ eor(v15, __ T8B, v15, v27); 3886 __ eor(v16, __ T8B, v16, v28); 3887 __ eor(v17, __ T8B, v17, v29); 3888 __ b(rounds24_loop); 3889 3890 __ BIND(sha3_512); 3891 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 3892 __ eor(v7, __ T8B, v7, v25); 3893 __ eor(v8, __ T8B, v8, v26); 3894 3895 __ BIND(rounds24_loop); 3896 __ subw(rscratch2, rscratch2, 1); 3897 3898 __ eor3(v29, __ T16B, v4, v9, v14); 3899 __ eor3(v26, __ T16B, v1, v6, v11); 3900 __ eor3(v28, __ T16B, v3, v8, v13); 3901 __ eor3(v25, __ T16B, v0, v5, v10); 3902 __ eor3(v27, __ T16B, v2, v7, v12); 3903 __ eor3(v29, __ T16B, v29, v19, v24); 3904 __ eor3(v26, __ T16B, v26, v16, v21); 3905 __ eor3(v28, __ T16B, v28, v18, v23); 3906 __ eor3(v25, __ T16B, v25, v15, v20); 3907 __ eor3(v27, __ T16B, v27, v17, v22); 3908 3909 __ rax1(v30, __ T2D, v29, v26); 3910 __ rax1(v26, __ T2D, v26, v28); 3911 __ rax1(v28, __ T2D, v28, v25); 3912 __ rax1(v25, __ T2D, v25, v27); 3913 __ rax1(v27, __ T2D, v27, v29); 3914 3915 __ eor(v0, __ T16B, v0, v30); 3916 __ xar(v29, __ T2D, v1, v25, (64 - 1)); 3917 __ xar(v1, __ T2D, v6, v25, (64 - 44)); 3918 __ xar(v6, __ T2D, v9, v28, (64 - 20)); 3919 __ xar(v9, __ T2D, v22, v26, (64 - 61)); 3920 __ xar(v22, __ T2D, v14, v28, (64 - 39)); 3921 __ xar(v14, __ T2D, v20, v30, (64 - 18)); 3922 __ xar(v31, __ T2D, v2, v26, (64 - 62)); 3923 __ xar(v2, __ T2D, v12, v26, (64 - 43)); 3924 __ xar(v12, __ T2D, v13, v27, (64 - 25)); 3925 __ xar(v13, __ T2D, v19, v28, (64 - 8)); 3926 __ xar(v19, __ T2D, v23, v27, (64 - 56)); 3927 __ xar(v23, __ T2D, v15, v30, (64 - 41)); 3928 __ xar(v15, __ T2D, v4, v28, (64 - 27)); 3929 __ xar(v28, __ T2D, v24, v28, (64 - 14)); 3930 __ xar(v24, __ T2D, v21, v25, (64 - 2)); 3931 __ xar(v8, __ T2D, v8, v27, (64 - 55)); 3932 __ xar(v4, __ T2D, v16, v25, (64 - 45)); 3933 
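// (xar d, n, m, #r computes, per 64-bit lane, ror64(n ^ m, r); with r = 64 - s
// it is rotl64(n ^ m, s), so each xar above folds the theta column parity into
// a lane and applies that lane's rho rotation in one instruction. Per lane, in
// C, with illustrative names and s in 1..62 so the shifts are well defined:
//
//   uint64_t t = lane ^ parity;            // theta: add column parity
//   lane = (t << s) | (t >> (64 - s));     // rho: rotate by the lane's offset s
// )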
__ xar(v16, __ T2D, v5, v30, (64 - 36)); 3934 __ xar(v5, __ T2D, v3, v27, (64 - 28)); 3935 __ xar(v27, __ T2D, v18, v27, (64 - 21)); 3936 __ xar(v3, __ T2D, v17, v26, (64 - 15)); 3937 __ xar(v25, __ T2D, v11, v25, (64 - 10)); 3938 __ xar(v26, __ T2D, v7, v26, (64 - 6)); 3939 __ xar(v30, __ T2D, v10, v30, (64 - 3)); 3940 3941 __ bcax(v20, __ T16B, v31, v22, v8); 3942 __ bcax(v21, __ T16B, v8, v23, v22); 3943 __ bcax(v22, __ T16B, v22, v24, v23); 3944 __ bcax(v23, __ T16B, v23, v31, v24); 3945 __ bcax(v24, __ T16B, v24, v8, v31); 3946 3947 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); 3948 3949 __ bcax(v17, __ T16B, v25, v19, v3); 3950 __ bcax(v18, __ T16B, v3, v15, v19); 3951 __ bcax(v19, __ T16B, v19, v16, v15); 3952 __ bcax(v15, __ T16B, v15, v25, v16); 3953 __ bcax(v16, __ T16B, v16, v3, v25); 3954 3955 __ bcax(v10, __ T16B, v29, v12, v26); 3956 __ bcax(v11, __ T16B, v26, v13, v12); 3957 __ bcax(v12, __ T16B, v12, v14, v13); 3958 __ bcax(v13, __ T16B, v13, v29, v14); 3959 __ bcax(v14, __ T16B, v14, v26, v29); 3960 3961 __ bcax(v7, __ T16B, v30, v9, v4); 3962 __ bcax(v8, __ T16B, v4, v5, v9); 3963 __ bcax(v9, __ T16B, v9, v6, v5); 3964 __ bcax(v5, __ T16B, v5, v30, v6); 3965 __ bcax(v6, __ T16B, v6, v4, v30); 3966 3967 __ bcax(v3, __ T16B, v27, v0, v28); 3968 __ bcax(v4, __ T16B, v28, v1, v0); 3969 __ bcax(v0, __ T16B, v0, v2, v1); 3970 __ bcax(v1, __ T16B, v1, v27, v2); 3971 __ bcax(v2, __ T16B, v2, v28, v27); 3972 3973 __ eor(v0, __ T16B, v0, v31); 3974 3975 __ cbnzw(rscratch2, rounds24_loop); 3976 3977 if (multi_block) { 3978 // block_size = 200 - 2 * digest_length, ofs += block_size 3979 __ add(ofs, ofs, 200); 3980 __ sub(ofs, ofs, digest_length, Assembler::LSL, 1); 3981 3982 __ cmp(ofs, limit); 3983 __ br(Assembler::LE, sha3_loop); 3984 __ mov(c_rarg0, ofs); // return ofs 3985 } 3986 3987 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 3988 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 3989 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 3990 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 3991 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 3992 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 3993 __ st1(v24, __ T1D, state); 3994 3995 __ ldpd(v14, v15, Address(sp, 48)); 3996 __ ldpd(v12, v13, Address(sp, 32)); 3997 __ ldpd(v10, v11, Address(sp, 16)); 3998 __ ldpd(v8, v9, __ post(sp, 64)); 3999 4000 __ ret(lr); 4001 4002 return start; 4003 } 4004 4005 // Safefetch stubs. 4006 void generate_safefetch(const char* name, int size, address* entry, 4007 address* fault_pc, address* continuation_pc) { 4008 // safefetch signatures: 4009 // int SafeFetch32(int* adr, int errValue); 4010 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 4011 // 4012 // arguments: 4013 // c_rarg0 = adr 4014 // c_rarg1 = errValue 4015 // 4016 // result: 4017 // PPC_RET = *adr or errValue 4018 4019 StubCodeMark mark(this, "StubRoutines", name); 4020 4021 // Entry point, pc or function descriptor. 4022 *entry = __ pc(); 4023 4024 // Load *adr into c_rarg1, may fault. 
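// A fault on the following load is not fatal: the VM's signal handler
// recognizes a trap at *fault_pc and resumes execution at *continuation_pc,
// where the errValue argument becomes the return value. Conceptually the
// caller sees (illustrative usage only, based on the signatures listed above):
//
//   int      v = SafeFetch32(p, -1);   // *p, or -1 if p is unmapped
//   intptr_t w = SafeFetchN(q, 0);     // same idea for a pointer-sized load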
4025 *fault_pc = __ pc(); 4026 switch (size) { 4027 case 4: 4028 // int32_t 4029 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 4030 break; 4031 case 8: 4032 // int64_t 4033 __ ldr(c_rarg1, Address(c_rarg0, 0)); 4034 break; 4035 default: 4036 ShouldNotReachHere(); 4037 } 4038 4039 // return errValue or *adr 4040 *continuation_pc = __ pc(); 4041 __ mov(r0, c_rarg1); 4042 __ ret(lr); 4043 } 4044 4045 /** 4046 * Arguments: 4047 * 4048 * Inputs: 4049 * c_rarg0 - int crc 4050 * c_rarg1 - byte* buf 4051 * c_rarg2 - int length 4052 * 4053 * Output: 4054 * r0 - int crc result 4055 */ 4056 address generate_updateBytesCRC32() { 4057 assert(UseCRC32Intrinsics, "what are we doing here?"); 4058 4059 __ align(CodeEntryAlignment); 4060 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4061 4062 address start = __ pc(); 4063 4064 const Register crc = c_rarg0; // crc 4065 const Register buf = c_rarg1; // source java byte array address 4066 const Register len = c_rarg2; // length 4067 const Register table0 = c_rarg3; // crc_table address 4068 const Register table1 = c_rarg4; 4069 const Register table2 = c_rarg5; 4070 const Register table3 = c_rarg6; 4071 const Register tmp3 = c_rarg7; 4072 4073 BLOCK_COMMENT("Entry:"); 4074 __ enter(); // required for proper stackwalking of RuntimeStub frame 4075 4076 __ kernel_crc32(crc, buf, len, 4077 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4078 4079 __ leave(); // required for proper stackwalking of RuntimeStub frame 4080 __ ret(lr); 4081 4082 return start; 4083 } 4084 4085 /** 4086 * Arguments: 4087 * 4088 * Inputs: 4089 * c_rarg0 - int crc 4090 * c_rarg1 - byte* buf 4091 * c_rarg2 - int length 4092 * c_rarg3 - int* table 4093 * 4094 * Output: 4095 * r0 - int crc result 4096 */ 4097 address generate_updateBytesCRC32C() { 4098 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4099 4100 __ align(CodeEntryAlignment); 4101 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4102 4103 address start = __ pc(); 4104 4105 const Register crc = c_rarg0; // crc 4106 const Register buf = c_rarg1; // source java byte array address 4107 const Register len = c_rarg2; // length 4108 const Register table0 = c_rarg3; // crc_table address 4109 const Register table1 = c_rarg4; 4110 const Register table2 = c_rarg5; 4111 const Register table3 = c_rarg6; 4112 const Register tmp3 = c_rarg7; 4113 4114 BLOCK_COMMENT("Entry:"); 4115 __ enter(); // required for proper stackwalking of RuntimeStub frame 4116 4117 __ kernel_crc32c(crc, buf, len, 4118 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4119 4120 __ leave(); // required for proper stackwalking of RuntimeStub frame 4121 __ ret(lr); 4122 4123 return start; 4124 } 4125 4126 /*** 4127 * Arguments: 4128 * 4129 * Inputs: 4130 * c_rarg0 - int adler 4131 * c_rarg1 - byte* buff 4132 * c_rarg2 - int len 4133 * 4134 * Output: 4135 * c_rarg0 - int adler result 4136 */ 4137 address generate_updateBytesAdler32() { 4138 __ align(CodeEntryAlignment); 4139 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4140 address start = __ pc(); 4141 4142 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4143 4144 // Aliases 4145 Register adler = c_rarg0; 4146 Register s1 = c_rarg0; 4147 Register s2 = c_rarg3; 4148 Register buff = c_rarg1; 4149 Register len = c_rarg2; 4150 Register nmax = r4; 4151 Register base = r5; 4152 Register count = r6; 4153 Register temp0 = rscratch1; 4154 Register temp1 = rscratch2; 4155 FloatRegister vbytes = v0; 4156
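// For reference, the checksum computed here in its scalar form, and the
// reduction trick the code below relies on: 2^16 mod 65521 == 15, so
// "x mod BASE" is approached as (x & 0xffff) + 15 * (x >> 16), applied once or
// twice depending on the accumulator's range and finished with a conditional
// subtract of BASE. A minimal C sketch (illustrative only, not part of the stub):
//
//   uint32_t adler32_ref(uint32_t adler, const uint8_t* p, size_t n) {
//     uint32_t s1 = adler & 0xffff, s2 = adler >> 16;
//     for (size_t i = 0; i < n; i++) {
//       s1 = (s1 + p[i]) % 65521;
//       s2 = (s2 + s1)  % 65521;
//     }
//     return (s2 << 16) | s1;
//   }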
FloatRegister vs1acc = v1; 4157 FloatRegister vs2acc = v2; 4158 FloatRegister vtable = v3; 4159 4160 // Max number of bytes we can process before having to take the mod 4161 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4162 uint64_t BASE = 0xfff1; 4163 uint64_t NMAX = 0x15B0; 4164 4165 __ mov(base, BASE); 4166 __ mov(nmax, NMAX); 4167 4168 // Load accumulation coefficients for the upper 16 bits 4169 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4170 __ ld1(vtable, __ T16B, Address(temp0)); 4171 4172 // s1 is initialized to the lower 16 bits of adler 4173 // s2 is initialized to the upper 16 bits of adler 4174 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4175 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4176 4177 // The pipelined loop needs at least 16 elements for 1 iteration 4178 // It does check this, but it is more effective to skip to the cleanup loop 4179 __ cmp(len, (u1)16); 4180 __ br(Assembler::HS, L_nmax); 4181 __ cbz(len, L_combine); 4182 4183 __ bind(L_simple_by1_loop); 4184 __ ldrb(temp0, Address(__ post(buff, 1))); 4185 __ add(s1, s1, temp0); 4186 __ add(s2, s2, s1); 4187 __ subs(len, len, 1); 4188 __ br(Assembler::HI, L_simple_by1_loop); 4189 4190 // s1 = s1 % BASE 4191 __ subs(temp0, s1, base); 4192 __ csel(s1, temp0, s1, Assembler::HS); 4193 4194 // s2 = s2 % BASE 4195 __ lsr(temp0, s2, 16); 4196 __ lsl(temp1, temp0, 4); 4197 __ sub(temp1, temp1, temp0); 4198 __ add(s2, temp1, s2, ext::uxth); 4199 4200 __ subs(temp0, s2, base); 4201 __ csel(s2, temp0, s2, Assembler::HS); 4202 4203 __ b(L_combine); 4204 4205 __ bind(L_nmax); 4206 __ subs(len, len, nmax); 4207 __ sub(count, nmax, 16); 4208 __ br(Assembler::LO, L_by16); 4209 4210 __ bind(L_nmax_loop); 4211 4212 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4213 vbytes, vs1acc, vs2acc, vtable); 4214 4215 __ subs(count, count, 16); 4216 __ br(Assembler::HS, L_nmax_loop); 4217 4218 // s1 = s1 % BASE 4219 __ lsr(temp0, s1, 16); 4220 __ lsl(temp1, temp0, 4); 4221 __ sub(temp1, temp1, temp0); 4222 __ add(temp1, temp1, s1, ext::uxth); 4223 4224 __ lsr(temp0, temp1, 16); 4225 __ lsl(s1, temp0, 4); 4226 __ sub(s1, s1, temp0); 4227 __ add(s1, s1, temp1, ext:: uxth); 4228 4229 __ subs(temp0, s1, base); 4230 __ csel(s1, temp0, s1, Assembler::HS); 4231 4232 // s2 = s2 % BASE 4233 __ lsr(temp0, s2, 16); 4234 __ lsl(temp1, temp0, 4); 4235 __ sub(temp1, temp1, temp0); 4236 __ add(temp1, temp1, s2, ext::uxth); 4237 4238 __ lsr(temp0, temp1, 16); 4239 __ lsl(s2, temp0, 4); 4240 __ sub(s2, s2, temp0); 4241 __ add(s2, s2, temp1, ext:: uxth); 4242 4243 __ subs(temp0, s2, base); 4244 __ csel(s2, temp0, s2, Assembler::HS); 4245 4246 __ subs(len, len, nmax); 4247 __ sub(count, nmax, 16); 4248 __ br(Assembler::HS, L_nmax_loop); 4249 4250 __ bind(L_by16); 4251 __ adds(len, len, count); 4252 __ br(Assembler::LO, L_by1); 4253 4254 __ bind(L_by16_loop); 4255 4256 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4257 vbytes, vs1acc, vs2acc, vtable); 4258 4259 __ subs(len, len, 16); 4260 __ br(Assembler::HS, L_by16_loop); 4261 4262 __ bind(L_by1); 4263 __ adds(len, len, 15); 4264 __ br(Assembler::LO, L_do_mod); 4265 4266 __ bind(L_by1_loop); 4267 __ ldrb(temp0, Address(__ post(buff, 1))); 4268 __ add(s1, temp0, s1); 4269 __ add(s2, s2, s1); 4270 __ subs(len, len, 1); 4271 __ br(Assembler::HS, L_by1_loop); 4272 4273 __ bind(L_do_mod); 4274 // s1 = s1 % BASE 4275 __ lsr(temp0, s1, 16); 4276 __ lsl(temp1, temp0, 4); 4277 __ sub(temp1, 
temp1, temp0); 4278 __ add(temp1, temp1, s1, ext::uxth); 4279 4280 __ lsr(temp0, temp1, 16); 4281 __ lsl(s1, temp0, 4); 4282 __ sub(s1, s1, temp0); 4283 __ add(s1, s1, temp1, ext:: uxth); 4284 4285 __ subs(temp0, s1, base); 4286 __ csel(s1, temp0, s1, Assembler::HS); 4287 4288 // s2 = s2 % BASE 4289 __ lsr(temp0, s2, 16); 4290 __ lsl(temp1, temp0, 4); 4291 __ sub(temp1, temp1, temp0); 4292 __ add(temp1, temp1, s2, ext::uxth); 4293 4294 __ lsr(temp0, temp1, 16); 4295 __ lsl(s2, temp0, 4); 4296 __ sub(s2, s2, temp0); 4297 __ add(s2, s2, temp1, ext:: uxth); 4298 4299 __ subs(temp0, s2, base); 4300 __ csel(s2, temp0, s2, Assembler::HS); 4301 4302 // Combine lower bits and higher bits 4303 __ bind(L_combine); 4304 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 4305 4306 __ ret(lr); 4307 4308 return start; 4309 } 4310 4311 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 4312 Register temp0, Register temp1, FloatRegister vbytes, 4313 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 4314 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 4315 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 4316 // In non-vectorized code, we update s1 and s2 as: 4317 // s1 <- s1 + b1 4318 // s2 <- s2 + s1 4319 // s1 <- s1 + b2 4320 // s2 <- s2 + b1 4321 // ... 4322 // s1 <- s1 + b16 4323 // s2 <- s2 + s1 4324 // Putting above assignments together, we have: 4325 // s1_new = s1 + b1 + b2 + ... + b16 4326 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 4327 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 4328 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 4329 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 4330 4331 // s2 = s2 + s1 * 16 4332 __ add(s2, s2, s1, Assembler::LSL, 4); 4333 4334 // vs1acc = b1 + b2 + b3 + ... + b16 4335 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 4336 __ umullv(vs2acc, __ T8B, vtable, vbytes); 4337 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 4338 __ uaddlv(vs1acc, __ T16B, vbytes); 4339 __ uaddlv(vs2acc, __ T8H, vs2acc); 4340 4341 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 4342 __ fmovd(temp0, vs1acc); 4343 __ fmovd(temp1, vs2acc); 4344 __ add(s1, s1, temp0); 4345 __ add(s2, s2, temp1); 4346 } 4347 4348 /** 4349 * Arguments: 4350 * 4351 * Input: 4352 * c_rarg0 - x address 4353 * c_rarg1 - x length 4354 * c_rarg2 - y address 4355 * c_rarg3 - y lenth 4356 * c_rarg4 - z address 4357 * c_rarg5 - z length 4358 */ 4359 address generate_multiplyToLen() { 4360 __ align(CodeEntryAlignment); 4361 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 4362 4363 address start = __ pc(); 4364 const Register x = r0; 4365 const Register xlen = r1; 4366 const Register y = r2; 4367 const Register ylen = r3; 4368 const Register z = r4; 4369 const Register zlen = r5; 4370 4371 const Register tmp1 = r10; 4372 const Register tmp2 = r11; 4373 const Register tmp3 = r12; 4374 const Register tmp4 = r13; 4375 const Register tmp5 = r14; 4376 const Register tmp6 = r15; 4377 const Register tmp7 = r16; 4378 4379 BLOCK_COMMENT("Entry:"); 4380 __ enter(); // required for proper stackwalking of RuntimeStub frame 4381 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4382 __ leave(); // required for proper stackwalking of RuntimeStub frame 4383 __ ret(lr); 4384 4385 return start; 4386 } 4387 4388 address generate_squareToLen() { 4389 // squareToLen algorithm for sizes 1..127 described in java code works 4390 // faster than multiply_to_len on some CPUs and slower on others, but 4391 // multiply_to_len shows a bit better overall results 4392 __ align(CodeEntryAlignment); 4393 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 4394 address start = __ pc(); 4395 4396 const Register x = r0; 4397 const Register xlen = r1; 4398 const Register z = r2; 4399 const Register zlen = r3; 4400 const Register y = r4; // == x 4401 const Register ylen = r5; // == xlen 4402 4403 const Register tmp1 = r10; 4404 const Register tmp2 = r11; 4405 const Register tmp3 = r12; 4406 const Register tmp4 = r13; 4407 const Register tmp5 = r14; 4408 const Register tmp6 = r15; 4409 const Register tmp7 = r16; 4410 4411 RegSet spilled_regs = RegSet::of(y, ylen); 4412 BLOCK_COMMENT("Entry:"); 4413 __ enter(); 4414 __ push(spilled_regs, sp); 4415 __ mov(y, x); 4416 __ mov(ylen, xlen); 4417 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 4418 __ pop(spilled_regs, sp); 4419 __ leave(); 4420 __ ret(lr); 4421 return start; 4422 } 4423 4424 address generate_mulAdd() { 4425 __ align(CodeEntryAlignment); 4426 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 4427 4428 address start = __ pc(); 4429 4430 const Register out = r0; 4431 const Register in = r1; 4432 const Register offset = r2; 4433 const Register len = r3; 4434 const Register k = r4; 4435 4436 BLOCK_COMMENT("Entry:"); 4437 __ enter(); 4438 __ mul_add(out, in, offset, len, k); 4439 __ leave(); 4440 __ ret(lr); 4441 4442 return start; 4443 } 4444 4445 // Arguments: 4446 // 4447 // Input: 4448 // c_rarg0 - newArr address 4449 // c_rarg1 - oldArr address 4450 // c_rarg2 - newIdx 4451 // c_rarg3 - shiftCount 4452 // c_rarg4 - numIter 4453 // 4454 address generate_bigIntegerRightShift() { 4455 __ align(CodeEntryAlignment); 4456 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 4457 address start = __ pc(); 4458 4459 Label ShiftSIMDLoop, 
ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4460 4461 Register newArr = c_rarg0; 4462 Register oldArr = c_rarg1; 4463 Register newIdx = c_rarg2; 4464 Register shiftCount = c_rarg3; 4465 Register numIter = c_rarg4; 4466 Register idx = numIter; 4467 4468 Register newArrCur = rscratch1; 4469 Register shiftRevCount = rscratch2; 4470 Register oldArrCur = r13; 4471 Register oldArrNext = r14; 4472 4473 FloatRegister oldElem0 = v0; 4474 FloatRegister oldElem1 = v1; 4475 FloatRegister newElem = v2; 4476 FloatRegister shiftVCount = v3; 4477 FloatRegister shiftVRevCount = v4; 4478 4479 __ cbz(idx, Exit); 4480 4481 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4482 4483 // left shift count 4484 __ movw(shiftRevCount, 32); 4485 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4486 4487 // numIter too small to allow a 4-words SIMD loop, rolling back 4488 __ cmp(numIter, (u1)4); 4489 __ br(Assembler::LT, ShiftThree); 4490 4491 __ dup(shiftVCount, __ T4S, shiftCount); 4492 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4493 __ negr(shiftVCount, __ T4S, shiftVCount); 4494 4495 __ BIND(ShiftSIMDLoop); 4496 4497 // Calculate the load addresses 4498 __ sub(idx, idx, 4); 4499 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4500 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4501 __ add(oldArrCur, oldArrNext, 4); 4502 4503 // Load 4 words and process 4504 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4505 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4506 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4507 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4508 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4509 __ st1(newElem, __ T4S, Address(newArrCur)); 4510 4511 __ cmp(idx, (u1)4); 4512 __ br(Assembler::LT, ShiftTwoLoop); 4513 __ b(ShiftSIMDLoop); 4514 4515 __ BIND(ShiftTwoLoop); 4516 __ cbz(idx, Exit); 4517 __ cmp(idx, (u1)1); 4518 __ br(Assembler::EQ, ShiftOne); 4519 4520 // Calculate the load addresses 4521 __ sub(idx, idx, 2); 4522 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4523 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4524 __ add(oldArrCur, oldArrNext, 4); 4525 4526 // Load 2 words and process 4527 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 4528 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4529 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4530 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4531 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4532 __ st1(newElem, __ T2S, Address(newArrCur)); 4533 __ b(ShiftTwoLoop); 4534 4535 __ BIND(ShiftThree); 4536 __ tbz(idx, 1, ShiftOne); 4537 __ tbz(idx, 0, ShiftTwo); 4538 __ ldrw(r10, Address(oldArr, 12)); 4539 __ ldrw(r11, Address(oldArr, 8)); 4540 __ lsrvw(r10, r10, shiftCount); 4541 __ lslvw(r11, r11, shiftRevCount); 4542 __ orrw(r12, r10, r11); 4543 __ strw(r12, Address(newArr, 8)); 4544 4545 __ BIND(ShiftTwo); 4546 __ ldrw(r10, Address(oldArr, 8)); 4547 __ ldrw(r11, Address(oldArr, 4)); 4548 __ lsrvw(r10, r10, shiftCount); 4549 __ lslvw(r11, r11, shiftRevCount); 4550 __ orrw(r12, r10, r11); 4551 __ strw(r12, Address(newArr, 4)); 4552 4553 __ BIND(ShiftOne); 4554 __ ldrw(r10, Address(oldArr, 4)); 4555 __ ldrw(r11, Address(oldArr)); 4556 __ lsrvw(r10, r10, shiftCount); 4557 __ lslvw(r11, r11, shiftRevCount); 4558 __ orrw(r12, r10, r11); 4559 __ strw(r12, Address(newArr)); 4560 4561 __ BIND(Exit); 4562 __ ret(lr); 4563 4564 return start; 4565 } 4566 4567 // Arguments: 4568 // 4569 // Input: 4570 // c_rarg0 - newArr address 4571 // c_rarg1 - oldArr address 4572 // c_rarg2 - newIdx 4573 // c_rarg3 - 
shiftCount 4574 // c_rarg4 - numIter 4575 // 4576 address generate_bigIntegerLeftShift() { 4577 __ align(CodeEntryAlignment); 4578 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 4579 address start = __ pc(); 4580 4581 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4582 4583 Register newArr = c_rarg0; 4584 Register oldArr = c_rarg1; 4585 Register newIdx = c_rarg2; 4586 Register shiftCount = c_rarg3; 4587 Register numIter = c_rarg4; 4588 4589 Register shiftRevCount = rscratch1; 4590 Register oldArrNext = rscratch2; 4591 4592 FloatRegister oldElem0 = v0; 4593 FloatRegister oldElem1 = v1; 4594 FloatRegister newElem = v2; 4595 FloatRegister shiftVCount = v3; 4596 FloatRegister shiftVRevCount = v4; 4597 4598 __ cbz(numIter, Exit); 4599 4600 __ add(oldArrNext, oldArr, 4); 4601 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4602 4603 // right shift count 4604 __ movw(shiftRevCount, 32); 4605 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4606 4607 // numIter too small to allow a 4-words SIMD loop, rolling back 4608 __ cmp(numIter, (u1)4); 4609 __ br(Assembler::LT, ShiftThree); 4610 4611 __ dup(shiftVCount, __ T4S, shiftCount); 4612 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4613 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 4614 4615 __ BIND(ShiftSIMDLoop); 4616 4617 // load 4 words and process 4618 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 4619 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 4620 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4621 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4622 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4623 __ st1(newElem, __ T4S, __ post(newArr, 16)); 4624 __ sub(numIter, numIter, 4); 4625 4626 __ cmp(numIter, (u1)4); 4627 __ br(Assembler::LT, ShiftTwoLoop); 4628 __ b(ShiftSIMDLoop); 4629 4630 __ BIND(ShiftTwoLoop); 4631 __ cbz(numIter, Exit); 4632 __ cmp(numIter, (u1)1); 4633 __ br(Assembler::EQ, ShiftOne); 4634 4635 // load 2 words and process 4636 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 4637 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 4638 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4639 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4640 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4641 __ st1(newElem, __ T2S, __ post(newArr, 8)); 4642 __ sub(numIter, numIter, 2); 4643 __ b(ShiftTwoLoop); 4644 4645 __ BIND(ShiftThree); 4646 __ ldrw(r10, __ post(oldArr, 4)); 4647 __ ldrw(r11, __ post(oldArrNext, 4)); 4648 __ lslvw(r10, r10, shiftCount); 4649 __ lsrvw(r11, r11, shiftRevCount); 4650 __ orrw(r12, r10, r11); 4651 __ strw(r12, __ post(newArr, 4)); 4652 __ tbz(numIter, 1, Exit); 4653 __ tbz(numIter, 0, ShiftOne); 4654 4655 __ BIND(ShiftTwo); 4656 __ ldrw(r10, __ post(oldArr, 4)); 4657 __ ldrw(r11, __ post(oldArrNext, 4)); 4658 __ lslvw(r10, r10, shiftCount); 4659 __ lsrvw(r11, r11, shiftRevCount); 4660 __ orrw(r12, r10, r11); 4661 __ strw(r12, __ post(newArr, 4)); 4662 4663 __ BIND(ShiftOne); 4664 __ ldrw(r10, Address(oldArr)); 4665 __ ldrw(r11, Address(oldArrNext)); 4666 __ lslvw(r10, r10, shiftCount); 4667 __ lsrvw(r11, r11, shiftRevCount); 4668 __ orrw(r12, r10, r11); 4669 __ strw(r12, Address(newArr)); 4670 4671 __ BIND(Exit); 4672 __ ret(lr); 4673 4674 return start; 4675 } 4676 4677 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 4678 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 4679 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 4680 // Karatsuba multiplication performs a 
128*128 -> 256-bit 4681 // multiplication in three 128-bit multiplications and a few 4682 // additions. 4683 // 4684 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 4685 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 4686 // 4687 // Inputs: 4688 // 4689 // A0 in a.d[0] (subkey) 4690 // A1 in a.d[1] 4691 // (A1+A0) in a1_xor_a0.d[0] 4692 // 4693 // B0 in b.d[0] (state) 4694 // B1 in b.d[1] 4695 4696 __ ext(tmp1, __ T16B, b, b, 0x08); 4697 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 4698 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 4699 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 4700 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 4701 4702 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 4703 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 4704 __ eor(tmp2, __ T16B, tmp2, tmp4); 4705 __ eor(tmp2, __ T16B, tmp2, tmp3); 4706 4707 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 4708 __ ins(result_hi, __ D, tmp2, 0, 1); 4709 __ ins(result_lo, __ D, tmp2, 1, 0); 4710 } 4711 4712 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 4713 FloatRegister p, FloatRegister z, FloatRegister t1) { 4714 const FloatRegister t0 = result; 4715 4716 // The GCM field polynomial f is z^128 + p(z), where p = 4717 // z^7+z^2+z+1. 4718 // 4719 // z^128 === -p(z) (mod (z^128 + p(z))) 4720 // 4721 // so, given that the product we're reducing is 4722 // a == lo + hi * z^128 4723 // substituting, 4724 // === lo - hi * p(z) (mod (z^128 + p(z))) 4725 // 4726 // we reduce by multiplying hi by p(z) and subtracting the result 4727 // from (i.e. XORing it with) lo. Because p has no nonzero high 4728 // bits we can do this with two 64-bit multiplications, lo*p and 4729 // hi*p. 4730 4731 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 4732 __ ext(t1, __ T16B, t0, z, 8); 4733 __ eor(hi, __ T16B, hi, t1); 4734 __ ext(t1, __ T16B, z, t0, 8); 4735 __ eor(lo, __ T16B, lo, t1); 4736 __ pmull(t0, __ T1Q, hi, p, __ T1D); 4737 __ eor(result, __ T16B, lo, t0); 4738 } 4739 4740 address generate_has_negatives(address &has_negatives_long) { 4741 const u1 large_loop_size = 64; 4742 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 4743 int dcache_line = VM_Version::dcache_line_size(); 4744 4745 Register ary1 = r1, len = r2, result = r0; 4746 4747 __ align(CodeEntryAlignment); 4748 4749 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 4750 4751 address entry = __ pc(); 4752 4753 __ enter(); 4754 4755 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, 4756 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 4757 4758 __ cmp(len, (u1)15); 4759 __ br(Assembler::GT, LEN_OVER_15); 4760 // The only case when execution falls into this code is when pointer is near 4761 // the end of memory page and we have to avoid reading next page 4762 __ add(ary1, ary1, len); 4763 __ subs(len, len, 8); 4764 __ br(Assembler::GT, LEN_OVER_8); 4765 __ ldr(rscratch2, Address(ary1, -8)); 4766 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
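// At this point len holds (length - 8) <= 0, so rscratch1 = -(len << 3)
// equals 64 - 8*length: the number of low-order bits of the word just loaded
// that come from bytes before the start of the range. The lsrv below shifts
// them out so the sign-bit test only sees in-range bytes. In C, for the
// lengths this path handles (illustrative sketch, names not used by the stub):
//
//   // x = 8 bytes ending at the end of the range (little-endian load), length in 1..8
//   bool has_negative = ((x >> (64 - 8 * length)) & 0x8080808080808080ULL) != 0;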
4767 __ lsrv(rscratch2, rscratch2, rscratch1); 4768 __ tst(rscratch2, UPPER_BIT_MASK); 4769 __ cset(result, Assembler::NE); 4770 __ leave(); 4771 __ ret(lr); 4772 __ bind(LEN_OVER_8); 4773 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 4774 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 4775 __ tst(rscratch2, UPPER_BIT_MASK); 4776 __ br(Assembler::NE, RET_TRUE_NO_POP); 4777 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 4778 __ lsrv(rscratch1, rscratch1, rscratch2); 4779 __ tst(rscratch1, UPPER_BIT_MASK); 4780 __ cset(result, Assembler::NE); 4781 __ leave(); 4782 __ ret(lr); 4783 4784 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 4785 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 4786 4787 has_negatives_long = __ pc(); // 2nd entry point 4788 4789 __ enter(); 4790 4791 __ bind(LEN_OVER_15); 4792 __ push(spilled_regs, sp); 4793 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 4794 __ cbz(rscratch2, ALIGNED); 4795 __ ldp(tmp6, tmp1, Address(ary1)); 4796 __ mov(tmp5, 16); 4797 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 4798 __ add(ary1, ary1, rscratch1); 4799 __ sub(len, len, rscratch1); 4800 __ orr(tmp6, tmp6, tmp1); 4801 __ tst(tmp6, UPPER_BIT_MASK); 4802 __ br(Assembler::NE, RET_TRUE); 4803 4804 __ bind(ALIGNED); 4805 __ cmp(len, large_loop_size); 4806 __ br(Assembler::LT, CHECK_16); 4807 // Perform 16-byte load as early return in pre-loop to handle situation 4808 // when initially aligned large array has negative values at starting bytes, 4809 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 4810 // slower. Cases with negative bytes further ahead won't be affected that 4811 // much. In fact, it'll be faster due to early loads, less instructions and 4812 // less branches in LARGE_LOOP. 4813 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 4814 __ sub(len, len, 16); 4815 __ orr(tmp6, tmp6, tmp1); 4816 __ tst(tmp6, UPPER_BIT_MASK); 4817 __ br(Assembler::NE, RET_TRUE); 4818 __ cmp(len, large_loop_size); 4819 __ br(Assembler::LT, CHECK_16); 4820 4821 if (SoftwarePrefetchHintDistance >= 0 4822 && SoftwarePrefetchHintDistance >= dcache_line) { 4823 // initial prefetch 4824 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 4825 } 4826 __ bind(LARGE_LOOP); 4827 if (SoftwarePrefetchHintDistance >= 0) { 4828 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 4829 } 4830 // Issue load instructions first, since it can save few CPU/MEM cycles, also 4831 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 4832 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 4833 // instructions per cycle and have less branches, but this approach disables 4834 // early return, thus, all 64 bytes are loaded and checked every time. 
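// Prefetching aside, each 64-byte step below reduces, in scalar terms, to the
// following (illustrative sketch; the cast is safe because ary1 was brought to
// a 16-byte boundary above):
//
//   uint64_t acc = 0;
//   for (int i = 0; i < 8; i++) {
//     acc |= ((const uint64_t*)ary1)[i];            // fold all 8 words together
//   }
//   if (acc & 0x8080808080808080ULL) return true;   // some byte had bit 7 set
//   ary1 += 64; len -= 64;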
4835 __ ldp(tmp2, tmp3, Address(ary1)); 4836 __ ldp(tmp4, tmp5, Address(ary1, 16)); 4837 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 4838 __ ldp(tmp6, tmp1, Address(ary1, 48)); 4839 __ add(ary1, ary1, large_loop_size); 4840 __ sub(len, len, large_loop_size); 4841 __ orr(tmp2, tmp2, tmp3); 4842 __ orr(tmp4, tmp4, tmp5); 4843 __ orr(rscratch1, rscratch1, rscratch2); 4844 __ orr(tmp6, tmp6, tmp1); 4845 __ orr(tmp2, tmp2, tmp4); 4846 __ orr(rscratch1, rscratch1, tmp6); 4847 __ orr(tmp2, tmp2, rscratch1); 4848 __ tst(tmp2, UPPER_BIT_MASK); 4849 __ br(Assembler::NE, RET_TRUE); 4850 __ cmp(len, large_loop_size); 4851 __ br(Assembler::GE, LARGE_LOOP); 4852 4853 __ bind(CHECK_16); // small 16-byte load pre-loop 4854 __ cmp(len, (u1)16); 4855 __ br(Assembler::LT, POST_LOOP16); 4856 4857 __ bind(LOOP16); // small 16-byte load loop 4858 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 4859 __ sub(len, len, 16); 4860 __ orr(tmp2, tmp2, tmp3); 4861 __ tst(tmp2, UPPER_BIT_MASK); 4862 __ br(Assembler::NE, RET_TRUE); 4863 __ cmp(len, (u1)16); 4864 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 4865 4866 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 4867 __ cmp(len, (u1)8); 4868 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 4869 __ ldr(tmp3, Address(__ post(ary1, 8))); 4870 __ sub(len, len, 8); 4871 __ tst(tmp3, UPPER_BIT_MASK); 4872 __ br(Assembler::NE, RET_TRUE); 4873 4874 __ bind(POST_LOOP16_LOAD_TAIL); 4875 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 4876 __ ldr(tmp1, Address(ary1)); 4877 __ mov(tmp2, 64); 4878 __ sub(tmp4, tmp2, len, __ LSL, 3); 4879 __ lslv(tmp1, tmp1, tmp4); 4880 __ tst(tmp1, UPPER_BIT_MASK); 4881 __ br(Assembler::NE, RET_TRUE); 4882 // Fallthrough 4883 4884 __ bind(RET_FALSE); 4885 __ pop(spilled_regs, sp); 4886 __ leave(); 4887 __ mov(result, zr); 4888 __ ret(lr); 4889 4890 __ bind(RET_TRUE); 4891 __ pop(spilled_regs, sp); 4892 __ bind(RET_TRUE_NO_POP); 4893 __ leave(); 4894 __ mov(result, 1); 4895 __ ret(lr); 4896 4897 return entry; 4898 } 4899 4900 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 4901 bool usePrefetch, Label &NOT_EQUAL) { 4902 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 4903 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 4904 tmp7 = r12, tmp8 = r13; 4905 Label LOOP; 4906 4907 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 4908 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 4909 __ bind(LOOP); 4910 if (usePrefetch) { 4911 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 4912 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 4913 } 4914 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 4915 __ eor(tmp1, tmp1, tmp2); 4916 __ eor(tmp3, tmp3, tmp4); 4917 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 4918 __ orr(tmp1, tmp1, tmp3); 4919 __ cbnz(tmp1, NOT_EQUAL); 4920 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 4921 __ eor(tmp5, tmp5, tmp6); 4922 __ eor(tmp7, tmp7, tmp8); 4923 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 4924 __ orr(tmp5, tmp5, tmp7); 4925 __ cbnz(tmp5, NOT_EQUAL); 4926 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 4927 __ eor(tmp1, tmp1, tmp2); 4928 __ eor(tmp3, tmp3, tmp4); 4929 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 4930 __ orr(tmp1, tmp1, tmp3); 4931 __ cbnz(tmp1, NOT_EQUAL); 4932 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 4933 __ eor(tmp5, tmp5, tmp6); 4934 __ sub(cnt1, cnt1, 8 * wordSize); 4935 __ eor(tmp7, tmp7, tmp8); 4936 __ 
ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 4937 // tmp6 is not used. MacroAssembler::subs is used here (rather than 4938 // cmp) because subs allows an unlimited range of immediate operand. 4939 __ subs(tmp6, cnt1, loopThreshold); 4940 __ orr(tmp5, tmp5, tmp7); 4941 __ cbnz(tmp5, NOT_EQUAL); 4942 __ br(__ GE, LOOP); 4943 // post-loop 4944 __ eor(tmp1, tmp1, tmp2); 4945 __ eor(tmp3, tmp3, tmp4); 4946 __ orr(tmp1, tmp1, tmp3); 4947 __ sub(cnt1, cnt1, 2 * wordSize); 4948 __ cbnz(tmp1, NOT_EQUAL); 4949 } 4950 4951 void generate_large_array_equals_loop_simd(int loopThreshold, 4952 bool usePrefetch, Label &NOT_EQUAL) { 4953 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 4954 tmp2 = rscratch2; 4955 Label LOOP; 4956 4957 __ bind(LOOP); 4958 if (usePrefetch) { 4959 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 4960 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 4961 } 4962 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 4963 __ sub(cnt1, cnt1, 8 * wordSize); 4964 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 4965 __ subs(tmp1, cnt1, loopThreshold); 4966 __ eor(v0, __ T16B, v0, v4); 4967 __ eor(v1, __ T16B, v1, v5); 4968 __ eor(v2, __ T16B, v2, v6); 4969 __ eor(v3, __ T16B, v3, v7); 4970 __ orr(v0, __ T16B, v0, v1); 4971 __ orr(v1, __ T16B, v2, v3); 4972 __ orr(v0, __ T16B, v0, v1); 4973 __ umov(tmp1, v0, __ D, 0); 4974 __ umov(tmp2, v0, __ D, 1); 4975 __ orr(tmp1, tmp1, tmp2); 4976 __ cbnz(tmp1, NOT_EQUAL); 4977 __ br(__ GE, LOOP); 4978 } 4979 4980 // a1 = r1 - array1 address 4981 // a2 = r2 - array2 address 4982 // result = r0 - return value. Already contains "false" 4983 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 4984 // r3-r5 are reserved temporary registers 4985 address generate_large_array_equals() { 4986 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 4987 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 4988 tmp7 = r12, tmp8 = r13; 4989 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 4990 SMALL_LOOP, POST_LOOP; 4991 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 4992 // calculate if at least 32 prefetched bytes are used 4993 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 4994 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 4995 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 4996 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 4997 tmp5, tmp6, tmp7, tmp8); 4998 4999 __ align(CodeEntryAlignment); 5000 5001 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 5002 5003 address entry = __ pc(); 5004 __ enter(); 5005 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 5006 // also advance pointers to use post-increment instead of pre-increment 5007 __ add(a1, a1, wordSize); 5008 __ add(a2, a2, wordSize); 5009 if (AvoidUnalignedAccesses) { 5010 // both implementations (SIMD/nonSIMD) are using relatively large load 5011 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 5012 // on some CPUs in case of address is not at least 16-byte aligned. 5013 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 5014 // load if needed at least for 1st address and make if 16-byte aligned. 
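// Sketch of that peeling step in plain C (illustrative only; a1, a2 and cnt1
// mirror the registers used below, and "return false" stands for the
// not-equal exit):
//
//   if (((uintptr_t)a1 & 8) != 0) {        // a1 is 8- but not 16-byte aligned
//     if (*(const uint64_t*)a1 != *(const uint64_t*)a2) return false;
//     a1 += 8; a2 += 8; cnt1 -= 8;         // one 8-byte compare brings a1 to
//   }                                      // a 16-byte boundary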
5015 Label ALIGNED16; 5016 __ tbz(a1, 3, ALIGNED16); 5017 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5018 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5019 __ sub(cnt1, cnt1, wordSize); 5020 __ eor(tmp1, tmp1, tmp2); 5021 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 5022 __ bind(ALIGNED16); 5023 } 5024 if (UseSIMDForArrayEquals) { 5025 if (SoftwarePrefetchHintDistance >= 0) { 5026 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5027 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5028 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 5029 /* prfm = */ true, NOT_EQUAL); 5030 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5031 __ br(__ LT, TAIL); 5032 } 5033 __ bind(NO_PREFETCH_LARGE_LOOP); 5034 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 5035 /* prfm = */ false, NOT_EQUAL); 5036 } else { 5037 __ push(spilled_regs, sp); 5038 if (SoftwarePrefetchHintDistance >= 0) { 5039 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5040 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5041 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 5042 /* prfm = */ true, NOT_EQUAL); 5043 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5044 __ br(__ LT, TAIL); 5045 } 5046 __ bind(NO_PREFETCH_LARGE_LOOP); 5047 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 5048 /* prfm = */ false, NOT_EQUAL); 5049 } 5050 __ bind(TAIL); 5051 __ cbz(cnt1, EQUAL); 5052 __ subs(cnt1, cnt1, wordSize); 5053 __ br(__ LE, POST_LOOP); 5054 __ bind(SMALL_LOOP); 5055 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5056 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5057 __ subs(cnt1, cnt1, wordSize); 5058 __ eor(tmp1, tmp1, tmp2); 5059 __ cbnz(tmp1, NOT_EQUAL); 5060 __ br(__ GT, SMALL_LOOP); 5061 __ bind(POST_LOOP); 5062 __ ldr(tmp1, Address(a1, cnt1)); 5063 __ ldr(tmp2, Address(a2, cnt1)); 5064 __ eor(tmp1, tmp1, tmp2); 5065 __ cbnz(tmp1, NOT_EQUAL); 5066 __ bind(EQUAL); 5067 __ mov(result, true); 5068 __ bind(NOT_EQUAL); 5069 if (!UseSIMDForArrayEquals) { 5070 __ pop(spilled_regs, sp); 5071 } 5072 __ bind(NOT_EQUAL_NO_POP); 5073 __ leave(); 5074 __ ret(lr); 5075 return entry; 5076 } 5077 5078 address generate_dsin_dcos(bool isCos) { 5079 __ align(CodeEntryAlignment); 5080 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 5081 address start = __ pc(); 5082 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5083 (address)StubRoutines::aarch64::_two_over_pi, 5084 (address)StubRoutines::aarch64::_pio2, 5085 (address)StubRoutines::aarch64::_dsin_coef, 5086 (address)StubRoutines::aarch64::_dcos_coef); 5087 return start; 5088 } 5089 5090 address generate_dlog() { 5091 __ align(CodeEntryAlignment); 5092 StubCodeMark mark(this, "StubRoutines", "dlog"); 5093 address entry = __ pc(); 5094 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 5095 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 5096 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 5097 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 5098 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 5099 return entry; 5100 } 5101 5102 5103 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5104 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5105 Label &DIFF2) { 5106 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5107 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5108 5109 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5110 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5111 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5112 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5113 5114 __ fmovd(tmpL, vtmp3); 5115 __ eor(rscratch2, tmp3, tmpL); 5116 __ cbnz(rscratch2, DIFF2); 5117 5118 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5119 __ umov(tmpL, vtmp3, __ D, 1); 5120 __ eor(rscratch2, tmpU, tmpL); 5121 __ cbnz(rscratch2, DIFF1); 5122 5123 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5124 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5125 __ fmovd(tmpL, vtmp); 5126 __ eor(rscratch2, tmp3, tmpL); 5127 __ cbnz(rscratch2, DIFF2); 5128 5129 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5130 __ umov(tmpL, vtmp, __ D, 1); 5131 __ eor(rscratch2, tmpU, tmpL); 5132 __ cbnz(rscratch2, DIFF1); 5133 } 5134 5135 // r0 = result 5136 // r1 = str1 5137 // r2 = cnt1 5138 // r3 = str2 5139 // r4 = cnt2 5140 // r10 = tmp1 5141 // r11 = tmp2 5142 address generate_compare_long_string_different_encoding(bool isLU) { 5143 __ align(CodeEntryAlignment); 5144 StubCodeMark mark(this, "StubRoutines", isLU 5145 ? "compare_long_string_different_encoding LU" 5146 : "compare_long_string_different_encoding UL"); 5147 address entry = __ pc(); 5148 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5149 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5150 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5151 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5152 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5153 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5154 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5155 5156 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5157 5158 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5159 // cnt2 == amount of characters left to compare 5160 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5161 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5162 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5163 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5164 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5165 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5166 __ eor(rscratch2, tmp1, tmp2); 5167 __ mov(rscratch1, tmp2); 5168 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5169 Register tmpU = isLU ? 
rscratch1 : tmp1, // where to keep U for comparison 5170 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5171 __ push(spilled_regs, sp); 5172 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5173 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 5174 5175 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5176 5177 if (SoftwarePrefetchHintDistance >= 0) { 5178 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5179 __ br(__ LT, NO_PREFETCH); 5180 __ bind(LARGE_LOOP_PREFETCH); 5181 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 5182 __ mov(tmp4, 2); 5183 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5184 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 5185 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5186 __ subs(tmp4, tmp4, 1); 5187 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 5188 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5189 __ mov(tmp4, 2); 5190 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 5191 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5192 __ subs(tmp4, tmp4, 1); 5193 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 5194 __ sub(cnt2, cnt2, 64); 5195 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5196 __ br(__ GE, LARGE_LOOP_PREFETCH); 5197 } 5198 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 5199 __ bind(NO_PREFETCH); 5200 __ subs(cnt2, cnt2, 16); 5201 __ br(__ LT, TAIL); 5202 __ align(OptoLoopAlignment); 5203 __ bind(SMALL_LOOP); // smaller loop 5204 __ subs(cnt2, cnt2, 16); 5205 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5206 __ br(__ GE, SMALL_LOOP); 5207 __ cmn(cnt2, (u1)16); 5208 __ br(__ EQ, LOAD_LAST); 5209 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 5210 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 5211 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 5212 __ ldr(tmp3, Address(cnt1, -8)); 5213 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 5214 __ b(LOAD_LAST); 5215 __ bind(DIFF2); 5216 __ mov(tmpU, tmp3); 5217 __ bind(DIFF1); 5218 __ pop(spilled_regs, sp); 5219 __ b(CALCULATE_DIFFERENCE); 5220 __ bind(LOAD_LAST); 5221 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 5222 // No need to load it again 5223 __ mov(tmpU, tmp3); 5224 __ pop(spilled_regs, sp); 5225 5226 // tmp2 points to the address of the last 4 Latin1 characters right now 5227 __ ldrs(vtmp, Address(tmp2)); 5228 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5229 __ fmovd(tmpL, vtmp); 5230 5231 __ eor(rscratch2, tmpU, tmpL); 5232 __ cbz(rscratch2, DONE); 5233 5234 // Find the first different characters in the longwords and 5235 // compute their difference. 
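// A rough scalar equivalent of the CALCULATE_DIFFERENCE sequence below, for
// reference only (not generated code); a and b stand for the two 64-bit
// chunks being compared (tmp1 and rscratch1 here):
//
//   int shift = clz(byte_reverse(a ^ b)) & ~15;  // 16 * index of the first differing char
//   return (int)((a >> shift) & 0xffff) - (int)((b >> shift) & 0xffff);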
5236 __ bind(CALCULATE_DIFFERENCE); 5237 __ rev(rscratch2, rscratch2); 5238 __ clz(rscratch2, rscratch2); 5239 __ andr(rscratch2, rscratch2, -16); 5240 __ lsrv(tmp1, tmp1, rscratch2); 5241 __ uxthw(tmp1, tmp1); 5242 __ lsrv(rscratch1, rscratch1, rscratch2); 5243 __ uxthw(rscratch1, rscratch1); 5244 __ subw(result, tmp1, rscratch1); 5245 __ bind(DONE); 5246 __ ret(lr); 5247 return entry; 5248 } 5249 5250 address generate_method_entry_barrier() { 5251 __ align(CodeEntryAlignment); 5252 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 5253 5254 Label deoptimize_label; 5255 5256 address start = __ pc(); 5257 5258 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 5259 5260 __ enter(); 5261 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 5262 5263 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 5264 5265 __ push_call_clobbered_registers(); 5266 5267 __ mov(c_rarg0, rscratch2); 5268 __ call_VM_leaf 5269 (CAST_FROM_FN_PTR 5270 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 5271 5272 __ reset_last_Java_frame(true); 5273 5274 __ mov(rscratch1, r0); 5275 5276 __ pop_call_clobbered_registers(); 5277 5278 __ cbnz(rscratch1, deoptimize_label); 5279 5280 __ leave(); 5281 __ ret(lr); 5282 5283 __ BIND(deoptimize_label); 5284 5285 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 5286 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 5287 5288 __ mov(sp, rscratch1); 5289 __ br(rscratch2); 5290 5291 return start; 5292 } 5293 5294 address generate_check_lock_stack() { 5295 __ align(CodeEntryAlignment); 5296 StubCodeMark mark(this, "StubRoutines", "check_lock_stack"); 5297 5298 address start = __ pc(); 5299 5300 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 5301 __ enter(); 5302 __ push_call_clobbered_registers(); 5303 5304 __ mov(c_rarg0, r9); 5305 __ call_VM_leaf(CAST_FROM_FN_PTR(address, LockStack::ensure_lock_stack_size), 1); 5306 5307 5308 __ pop_call_clobbered_registers(); 5309 __ leave(); 5310 __ reset_last_Java_frame(true); 5311 5312 __ ret(lr); 5313 5314 return start; 5315 } 5316 5317 // r0 = result 5318 // r1 = str1 5319 // r2 = cnt1 5320 // r3 = str2 5321 // r4 = cnt2 5322 // r10 = tmp1 5323 // r11 = tmp2 5324 address generate_compare_long_string_same_encoding(bool isLL) { 5325 __ align(CodeEntryAlignment); 5326 StubCodeMark mark(this, "StubRoutines", isLL 5327 ? "compare_long_string_same_encoding LL" 5328 : "compare_long_string_same_encoding UU"); 5329 address entry = __ pc(); 5330 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5331 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 5332 5333 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 5334 5335 // exit from large loop when less than 64 bytes left to read or we're about 5336 // to prefetch memory behind array border 5337 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 5338 5339 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 5340 __ eor(rscratch2, tmp1, tmp2); 5341 __ cbnz(rscratch2, CAL_DIFFERENCE); 5342 5343 __ sub(cnt2, cnt2, wordSize/(isLL ? 
1 : 2)); 5344 // update pointers, because of previous read 5345 __ add(str1, str1, wordSize); 5346 __ add(str2, str2, wordSize); 5347 if (SoftwarePrefetchHintDistance >= 0) { 5348 __ align(OptoLoopAlignment); 5349 __ bind(LARGE_LOOP_PREFETCH); 5350 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 5351 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 5352 5353 for (int i = 0; i < 4; i++) { 5354 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 5355 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 5356 __ cmp(tmp1, tmp2); 5357 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5358 __ br(Assembler::NE, DIFF); 5359 } 5360 __ sub(cnt2, cnt2, isLL ? 64 : 32); 5361 __ add(str1, str1, 64); 5362 __ add(str2, str2, 64); 5363 __ subs(rscratch2, cnt2, largeLoopExitCondition); 5364 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 5365 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 5366 } 5367 5368 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 5369 __ br(Assembler::LE, LESS16); 5370 __ align(OptoLoopAlignment); 5371 __ bind(LOOP_COMPARE16); 5372 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5373 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5374 __ cmp(tmp1, tmp2); 5375 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5376 __ br(Assembler::NE, DIFF); 5377 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5378 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5379 __ br(Assembler::LT, LESS16); 5380 5381 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5382 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5383 __ cmp(tmp1, tmp2); 5384 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5385 __ br(Assembler::NE, DIFF); 5386 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5387 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5388 __ br(Assembler::GE, LOOP_COMPARE16); 5389 __ cbz(cnt2, LENGTH_DIFF); 5390 5391 __ bind(LESS16); 5392 // each 8 compare 5393 __ subs(cnt2, cnt2, isLL ? 8 : 4); 5394 __ br(Assembler::LE, LESS8); 5395 __ ldr(tmp1, Address(__ post(str1, 8))); 5396 __ ldr(tmp2, Address(__ post(str2, 8))); 5397 __ eor(rscratch2, tmp1, tmp2); 5398 __ cbnz(rscratch2, CAL_DIFFERENCE); 5399 __ sub(cnt2, cnt2, isLL ? 8 : 4); 5400 5401 __ bind(LESS8); // directly load last 8 bytes 5402 if (!isLL) { 5403 __ add(cnt2, cnt2, cnt2); 5404 } 5405 __ ldr(tmp1, Address(str1, cnt2)); 5406 __ ldr(tmp2, Address(str2, cnt2)); 5407 __ eor(rscratch2, tmp1, tmp2); 5408 __ cbz(rscratch2, LENGTH_DIFF); 5409 __ b(CAL_DIFFERENCE); 5410 5411 __ bind(DIFF); 5412 __ cmp(tmp1, tmp2); 5413 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 5414 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 5415 // reuse rscratch2 register for the result of eor instruction 5416 __ eor(rscratch2, tmp1, tmp2); 5417 5418 __ bind(CAL_DIFFERENCE); 5419 __ rev(rscratch2, rscratch2); 5420 __ clz(rscratch2, rscratch2); 5421 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 5422 __ lsrv(tmp1, tmp1, rscratch2);
5423 __ lsrv(tmp2, tmp2, rscratch2);
5424 if (isLL) {
5425 __ uxtbw(tmp1, tmp1);
5426 __ uxtbw(tmp2, tmp2);
5427 } else {
5428 __ uxthw(tmp1, tmp1);
5429 __ uxthw(tmp2, tmp2);
5430 }
5431 __ subw(result, tmp1, tmp2);
5432
5433 __ bind(LENGTH_DIFF);
5434 __ ret(lr);
5435 return entry;
5436 }
5437
5438 void generate_compare_long_strings() {
5439 StubRoutines::aarch64::_compare_long_string_LL
5440 = generate_compare_long_string_same_encoding(true);
5441 StubRoutines::aarch64::_compare_long_string_UU
5442 = generate_compare_long_string_same_encoding(false);
5443 StubRoutines::aarch64::_compare_long_string_LU
5444 = generate_compare_long_string_different_encoding(true);
5445 StubRoutines::aarch64::_compare_long_string_UL
5446 = generate_compare_long_string_different_encoding(false);
5447 }
5448
5449 // R0 = result
5450 // R1 = str2
5451 // R2 = cnt1
5452 // R3 = str1
5453 // R4 = cnt2
5454 // This generic linear code uses a few additional ideas which make it faster:
5455 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
5456 // in order to skip the initial load (helps on systems with 1 load pipeline)
5457 // 2) we can use a "fast" algorithm for finding the single character to search for:
5458 // the first symbol is located with fewer branches (1 branch per loaded register
5459 // instead of a branch per symbol); this is where constants like
5460 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
5461 // 3) after the 1st register of the source string is loaded and analyzed, it can be
5462 // used to search for every occurrence of the 1st character, saving a few loads
5463 // compared with a "simpler-but-slower" implementation
5464 // 4) in order to avoid lots of push/pop operations, the code below heavily
5465 // re-uses/re-initializes/compresses register values, which makes the code
5466 // larger and a bit less readable; however, most of the extra operations are
5467 // issued during loads or branches, so the penalty is minimal
5468 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5469 const char* stubName = str1_isL
5470 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5471 : "indexof_linear_uu";
5472 __ align(CodeEntryAlignment);
5473 StubCodeMark mark(this, "StubRoutines", stubName);
5474 address entry = __ pc();
5475
5476 int str1_chr_size = str1_isL ? 1 : 2;
5477 int str2_chr_size = str2_isL ? 1 : 2;
5478 int str1_chr_shift = str1_isL ? 0 : 1;
5479 int str2_chr_shift = str2_isL ? 0 : 1;
5480 bool isL = str1_isL && str2_isL;
5481 // parameters
5482 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5483 // temporary registers
5484 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5485 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5486 // redefinitions
5487 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5488
5489 __ push(spilled_regs, sp);
5490 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5491 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5492 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5493 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5494 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5495 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5496 // Read whole register from str1. It is safe, because length >=8 here
5497 __ ldr(ch1, Address(str1));
5498 // Read whole register from str2.
It is safe, because length >=8 here 5499 __ ldr(ch2, Address(str2)); 5500 __ sub(cnt2, cnt2, cnt1); 5501 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 5502 if (str1_isL != str2_isL) { 5503 __ eor(v0, __ T16B, v0, v0); 5504 } 5505 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 5506 __ mul(first, first, tmp1); 5507 // check if we have less than 1 register to check 5508 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 5509 if (str1_isL != str2_isL) { 5510 __ fmovd(v1, ch1); 5511 } 5512 __ br(__ LE, L_SMALL); 5513 __ eor(ch2, first, ch2); 5514 if (str1_isL != str2_isL) { 5515 __ zip1(v1, __ T16B, v1, v0); 5516 } 5517 __ sub(tmp2, ch2, tmp1); 5518 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5519 __ bics(tmp2, tmp2, ch2); 5520 if (str1_isL != str2_isL) { 5521 __ fmovd(ch1, v1); 5522 } 5523 __ br(__ NE, L_HAS_ZERO); 5524 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5525 __ add(result, result, wordSize/str2_chr_size); 5526 __ add(str2, str2, wordSize); 5527 __ br(__ LT, L_POST_LOOP); 5528 __ BIND(L_LOOP); 5529 __ ldr(ch2, Address(str2)); 5530 __ eor(ch2, first, ch2); 5531 __ sub(tmp2, ch2, tmp1); 5532 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5533 __ bics(tmp2, tmp2, ch2); 5534 __ br(__ NE, L_HAS_ZERO); 5535 __ BIND(L_LOOP_PROCEED); 5536 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 5537 __ add(str2, str2, wordSize); 5538 __ add(result, result, wordSize/str2_chr_size); 5539 __ br(__ GE, L_LOOP); 5540 __ BIND(L_POST_LOOP); 5541 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 5542 __ br(__ LE, NOMATCH); 5543 __ ldr(ch2, Address(str2)); 5544 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5545 __ eor(ch2, first, ch2); 5546 __ sub(tmp2, ch2, tmp1); 5547 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5548 __ mov(tmp4, -1); // all bits set 5549 __ b(L_SMALL_PROCEED); 5550 __ align(OptoLoopAlignment); 5551 __ BIND(L_SMALL); 5552 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 5553 __ eor(ch2, first, ch2); 5554 if (str1_isL != str2_isL) { 5555 __ zip1(v1, __ T16B, v1, v0); 5556 } 5557 __ sub(tmp2, ch2, tmp1); 5558 __ mov(tmp4, -1); // all bits set 5559 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 5560 if (str1_isL != str2_isL) { 5561 __ fmovd(ch1, v1); // move converted 4 symbols 5562 } 5563 __ BIND(L_SMALL_PROCEED); 5564 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 5565 __ bic(tmp2, tmp2, ch2); 5566 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 5567 __ rbit(tmp2, tmp2); 5568 __ br(__ EQ, NOMATCH); 5569 __ BIND(L_SMALL_HAS_ZERO_LOOP); 5570 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 5571 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 5572 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 5573 if (str2_isL) { // LL 5574 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5575 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5576 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5577 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5578 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5579 } else { 5580 __ mov(ch2, 0xE); // all bits in byte set except last one 5581 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5582 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
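// tmp4 at this point is clz() of the bit-reversed match word, i.e. the bit
// offset of the first remaining candidate, so tmp4 >> LogBitsPerByte is its
// byte offset within the current chunk. Masking with 0xE rounds that offset
// down to an even value so the load above stays aligned to a whole UTF-16
// char (explanatory note only; see the note on the 0x0101/0x7f7f match-bit
// trick further below).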
5583 __ lslv(tmp2, tmp2, tmp4); 5584 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5585 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5586 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5587 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5588 } 5589 __ cmp(ch1, ch2); 5590 __ mov(tmp4, wordSize/str2_chr_size); 5591 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5592 __ BIND(L_SMALL_CMP_LOOP); 5593 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5594 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5595 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5596 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5597 __ add(tmp4, tmp4, 1); 5598 __ cmp(tmp4, cnt1); 5599 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 5600 __ cmp(first, ch2); 5601 __ br(__ EQ, L_SMALL_CMP_LOOP); 5602 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 5603 __ cbz(tmp2, NOMATCH); // no more matches. exit 5604 __ clz(tmp4, tmp2); 5605 __ add(result, result, 1); // advance index 5606 __ add(str2, str2, str2_chr_size); // advance pointer 5607 __ b(L_SMALL_HAS_ZERO_LOOP); 5608 __ align(OptoLoopAlignment); 5609 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 5610 __ cmp(first, ch2); 5611 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5612 __ b(DONE); 5613 __ align(OptoLoopAlignment); 5614 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 5615 if (str2_isL) { // LL 5616 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 5617 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 5618 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 5619 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 5620 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5621 } else { 5622 __ mov(ch2, 0xE); // all bits in byte set except last one 5623 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5624 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5625 __ lslv(tmp2, tmp2, tmp4); 5626 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5627 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5628 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 5629 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5630 } 5631 __ cmp(ch1, ch2); 5632 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 5633 __ b(DONE); 5634 __ align(OptoLoopAlignment); 5635 __ BIND(L_HAS_ZERO); 5636 __ rbit(tmp2, tmp2); 5637 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 5638 // Now, perform compression of counters(cnt2 and cnt1) into one register. 5639 // It's fine because both counters are 32bit and are not changed in this 5640 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 5641 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 5642 __ sub(result, result, 1); 5643 __ BIND(L_HAS_ZERO_LOOP); 5644 __ mov(cnt1, wordSize/str2_chr_size); 5645 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 5646 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 5647 if (str2_isL) { 5648 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 5649 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
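// Note on the 0x0101.../0x7f7f... constants used throughout this stub
// (a sketch, not generated code). With 'first' holding the first pattern
// character replicated into every byte (LL) or halfword (UL/UU) lane,
//
//   x     = chunk ^ first;                                       // matching lanes become 0
//   match = (x - 0x0101010101010101) & ~x & 0x8080808080808080;  // byte lanes
//
// sets the top bit of every lane that matched; the UTF-16 variant uses
// 0x0001000100010001 and 0x7fff7fff7fff7fff instead. rbit + clz then yields
// the bit offset of the lowest such lane, i.e. the first candidate position.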
5650 __ lslv(tmp2, tmp2, tmp4); 5651 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5652 __ add(tmp4, tmp4, 1); 5653 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5654 __ lsl(tmp2, tmp2, 1); 5655 __ mov(tmp4, wordSize/str2_chr_size); 5656 } else { 5657 __ mov(ch2, 0xE); 5658 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5659 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5660 __ lslv(tmp2, tmp2, tmp4); 5661 __ add(tmp4, tmp4, 1); 5662 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5663 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 5664 __ lsl(tmp2, tmp2, 1); 5665 __ mov(tmp4, wordSize/str2_chr_size); 5666 __ sub(str2, str2, str2_chr_size); 5667 } 5668 __ cmp(ch1, ch2); 5669 __ mov(tmp4, wordSize/str2_chr_size); 5670 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5671 __ BIND(L_CMP_LOOP); 5672 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 5673 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 5674 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 5675 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 5676 __ add(tmp4, tmp4, 1); 5677 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 5678 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 5679 __ cmp(cnt1, ch2); 5680 __ br(__ EQ, L_CMP_LOOP); 5681 __ BIND(L_CMP_LOOP_NOMATCH); 5682 // here we're not matched 5683 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 5684 __ clz(tmp4, tmp2); 5685 __ add(str2, str2, str2_chr_size); // advance pointer 5686 __ b(L_HAS_ZERO_LOOP); 5687 __ align(OptoLoopAlignment); 5688 __ BIND(L_CMP_LOOP_LAST_CMP); 5689 __ cmp(cnt1, ch2); 5690 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5691 __ b(DONE); 5692 __ align(OptoLoopAlignment); 5693 __ BIND(L_CMP_LOOP_LAST_CMP2); 5694 if (str2_isL) { 5695 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 5696 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5697 __ lslv(tmp2, tmp2, tmp4); 5698 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5699 __ add(tmp4, tmp4, 1); 5700 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5701 __ lsl(tmp2, tmp2, 1); 5702 } else { 5703 __ mov(ch2, 0xE); 5704 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 5705 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 5706 __ lslv(tmp2, tmp2, tmp4); 5707 __ add(tmp4, tmp4, 1); 5708 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 5709 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 5710 __ lsl(tmp2, tmp2, 1); 5711 __ sub(str2, str2, str2_chr_size); 5712 } 5713 __ cmp(ch1, ch2); 5714 __ br(__ NE, L_CMP_LOOP_NOMATCH); 5715 __ b(DONE); 5716 __ align(OptoLoopAlignment); 5717 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 5718 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 5719 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 5720 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 5721 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 5722 // result by analyzed characters value, so, we can just reset lower bits 5723 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 5724 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 5725 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 5726 // index of last analyzed substring inside current octet. 
So, str2 in at 5727 // respective start address. We need to advance it to next octet 5728 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 5729 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 5730 __ bfm(result, zr, 0, 2 - str2_chr_shift); 5731 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 5732 __ movw(cnt2, cnt2); 5733 __ b(L_LOOP_PROCEED); 5734 __ align(OptoLoopAlignment); 5735 __ BIND(NOMATCH); 5736 __ mov(result, -1); 5737 __ BIND(DONE); 5738 __ pop(spilled_regs, sp); 5739 __ ret(lr); 5740 return entry; 5741 } 5742 5743 void generate_string_indexof_stubs() { 5744 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 5745 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 5746 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 5747 } 5748 5749 void inflate_and_store_2_fp_registers(bool generatePrfm, 5750 FloatRegister src1, FloatRegister src2) { 5751 Register dst = r1; 5752 __ zip1(v1, __ T16B, src1, v0); 5753 __ zip2(v2, __ T16B, src1, v0); 5754 if (generatePrfm) { 5755 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 5756 } 5757 __ zip1(v3, __ T16B, src2, v0); 5758 __ zip2(v4, __ T16B, src2, v0); 5759 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 5760 } 5761 5762 // R0 = src 5763 // R1 = dst 5764 // R2 = len 5765 // R3 = len >> 3 5766 // V0 = 0 5767 // v1 = loaded 8 bytes 5768 address generate_large_byte_array_inflate() { 5769 __ align(CodeEntryAlignment); 5770 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 5771 address entry = __ pc(); 5772 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 5773 Register src = r0, dst = r1, len = r2, octetCounter = r3; 5774 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 5775 5776 // do one more 8-byte read to have address 16-byte aligned in most cases 5777 // also use single store instruction 5778 __ ldrd(v2, __ post(src, 8)); 5779 __ sub(octetCounter, octetCounter, 2); 5780 __ zip1(v1, __ T16B, v1, v0); 5781 __ zip1(v2, __ T16B, v2, v0); 5782 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 5783 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 5784 __ subs(rscratch1, octetCounter, large_loop_threshold); 5785 __ br(__ LE, LOOP_START); 5786 __ b(LOOP_PRFM_START); 5787 __ bind(LOOP_PRFM); 5788 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 5789 __ bind(LOOP_PRFM_START); 5790 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 5791 __ sub(octetCounter, octetCounter, 8); 5792 __ subs(rscratch1, octetCounter, large_loop_threshold); 5793 inflate_and_store_2_fp_registers(true, v3, v4); 5794 inflate_and_store_2_fp_registers(true, v5, v6); 5795 __ br(__ GT, LOOP_PRFM); 5796 __ cmp(octetCounter, (u1)8); 5797 __ br(__ LT, DONE); 5798 __ bind(LOOP); 5799 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 5800 __ bind(LOOP_START); 5801 __ sub(octetCounter, octetCounter, 8); 5802 __ cmp(octetCounter, (u1)8); 5803 inflate_and_store_2_fp_registers(false, v3, v4); 5804 inflate_and_store_2_fp_registers(false, v5, v6); 5805 __ br(__ GE, LOOP); 5806 __ bind(DONE); 5807 __ ret(lr); 5808 return entry; 5809 } 5810 5811 /** 5812 * Arguments: 5813 * 5814 * Input: 5815 * c_rarg0 - current state address 5816 * c_rarg1 - H key address 5817 * c_rarg2 - data address 5818 * c_rarg3 - number of blocks 5819 * 5820 * Output: 5821 * Updated state at c_rarg0 5822 */ 5823 address 
generate_ghash_processBlocks() { 5824 // Bafflingly, GCM uses little-endian for the byte order, but 5825 // big-endian for the bit order. For example, the polynomial 1 is 5826 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 5827 // 5828 // So, we must either reverse the bytes in each word and do 5829 // everything big-endian or reverse the bits in each byte and do 5830 // it little-endian. On AArch64 it's more idiomatic to reverse 5831 // the bits in each byte (we have an instruction, RBIT, to do 5832 // that) and keep the data in little-endian bit order throught the 5833 // calculation, bit-reversing the inputs and outputs. 5834 5835 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 5836 __ align(wordSize * 2); 5837 address p = __ pc(); 5838 __ emit_int64(0x87); // The low-order bits of the field 5839 // polynomial (i.e. p = z^7+z^2+z+1) 5840 // repeated in the low and high parts of a 5841 // 128-bit vector 5842 __ emit_int64(0x87); 5843 5844 __ align(CodeEntryAlignment); 5845 address start = __ pc(); 5846 5847 Register state = c_rarg0; 5848 Register subkeyH = c_rarg1; 5849 Register data = c_rarg2; 5850 Register blocks = c_rarg3; 5851 5852 FloatRegister vzr = v30; 5853 __ eor(vzr, __ T16B, vzr, vzr); // zero register 5854 5855 __ ldrq(v0, Address(state)); 5856 __ ldrq(v1, Address(subkeyH)); 5857 5858 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 5859 __ rbit(v0, __ T16B, v0); 5860 __ rev64(v1, __ T16B, v1); 5861 __ rbit(v1, __ T16B, v1); 5862 5863 __ ldrq(v26, p); 5864 5865 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 5866 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 5867 5868 { 5869 Label L_ghash_loop; 5870 __ bind(L_ghash_loop); 5871 5872 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 5873 // reversing each byte 5874 __ rbit(v2, __ T16B, v2); 5875 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 5876 5877 // Multiply state in v2 by subkey in v1 5878 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 5879 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 5880 /*temps*/v6, v20, v18, v21); 5881 // Reduce v7:v5 by the field polynomial 5882 ghash_reduce(v0, v5, v7, v26, vzr, v20); 5883 5884 __ sub(blocks, blocks, 1); 5885 __ cbnz(blocks, L_ghash_loop); 5886 } 5887 5888 // The bit-reversed result is at this point in v0 5889 __ rev64(v1, __ T16B, v0); 5890 __ rbit(v1, __ T16B, v1); 5891 5892 __ st1(v1, __ T16B, state); 5893 __ ret(lr); 5894 5895 return start; 5896 } 5897 5898 void generate_base64_encode_simdround(Register src, Register dst, 5899 FloatRegister codec, u8 size) { 5900 5901 FloatRegister in0 = v4, in1 = v5, in2 = v6; 5902 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 5903 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 5904 5905 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 5906 5907 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 5908 5909 __ ushr(ind0, arrangement, in0, 2); 5910 5911 __ ushr(ind1, arrangement, in1, 2); 5912 __ shl(in0, arrangement, in0, 6); 5913 __ orr(ind1, arrangement, ind1, in0); 5914 __ ushr(ind1, arrangement, ind1, 2); 5915 5916 __ ushr(ind2, arrangement, in2, 4); 5917 __ shl(in1, arrangement, in1, 4); 5918 __ orr(ind2, arrangement, in1, ind2); 5919 __ ushr(ind2, arrangement, ind2, 2); 5920 5921 __ shl(ind3, arrangement, in2, 2); 5922 __ ushr(ind3, arrangement, ind3, 2); 5923 5924 __ tbl(out0, arrangement, codec, 4, ind0); 5925 __ tbl(out1, arrangement, codec, 4, ind1); 5926 __ tbl(out2, arrangement, codec, 4, ind2); 5927 __ tbl(out3, arrangement, codec, 4, ind3); 5928 5929 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 5930 } 5931 5932 /** 5933 * Arguments: 5934 * 5935 * Input: 5936 * c_rarg0 - src_start 5937 * c_rarg1 - src_offset 5938 * c_rarg2 - src_length 5939 * c_rarg3 - dest_start 5940 * c_rarg4 - dest_offset 5941 * c_rarg5 - isURL 5942 * 5943 */ 5944 address generate_base64_encodeBlock() { 5945 5946 static const char toBase64[64] = { 5947 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 5948 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 5949 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 5950 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 5951 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 5952 }; 5953 5954 static const char toBase64URL[64] = { 5955 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 5956 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 5957 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 5958 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 5959 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 5960 }; 5961 5962 __ align(CodeEntryAlignment); 5963 StubCodeMark mark(this, "StubRoutines", "encodeBlock"); 5964 address start = __ pc(); 5965 5966 Register src = c_rarg0; // source array 5967 Register soff = c_rarg1; // source start offset 5968 Register send = c_rarg2; // source end offset 5969 Register dst = c_rarg3; // dest array 5970 Register doff = c_rarg4; // position for writing to dest array 5971 Register isURL = c_rarg5; // Base64 or URL chracter set 5972 5973 // c_rarg6 and c_rarg7 are free to use as temps 5974 Register codec = c_rarg6; 5975 Register length = c_rarg7; 5976 5977 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 5978 5979 __ add(src, src, soff); 5980 __ add(dst, dst, doff); 5981 __ sub(length, send, soff); 5982 5983 // load the codec base address 5984 __ lea(codec, ExternalAddress((address) toBase64)); 5985 __ cbz(isURL, ProcessData); 5986 __ lea(codec, ExternalAddress((address) toBase64URL)); 5987 5988 __ BIND(ProcessData); 5989 5990 // too short to formup a SIMD loop, roll back 5991 __ cmp(length, (u1)24); 5992 __ br(Assembler::LT, Process3B); 5993 5994 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 5995 5996 __ BIND(Process48B); 5997 __ cmp(length, (u1)48); 5998 __ br(Assembler::LT, Process24B); 5999 generate_base64_encode_simdround(src, dst, v0, 16); 6000 __ sub(length, length, 48); 6001 __ b(Process48B); 6002 6003 __ BIND(Process24B); 6004 __ cmp(length, (u1)24); 6005 __ br(Assembler::LT, SIMDExit); 6006 generate_base64_encode_simdround(src, dst, v0, 8); 6007 __ sub(length, length, 24); 6008 6009 __ BIND(SIMDExit); 6010 __ cbz(length, Exit); 6011 6012 __ 
BIND(Process3B); 6013 // 3 src bytes, 24 bits 6014 __ ldrb(r10, __ post(src, 1)); 6015 __ ldrb(r11, __ post(src, 1)); 6016 __ ldrb(r12, __ post(src, 1)); 6017 __ orrw(r11, r11, r10, Assembler::LSL, 8); 6018 __ orrw(r12, r12, r11, Assembler::LSL, 8); 6019 // codec index 6020 __ ubfmw(r15, r12, 18, 23); 6021 __ ubfmw(r14, r12, 12, 17); 6022 __ ubfmw(r13, r12, 6, 11); 6023 __ andw(r12, r12, 63); 6024 // get the code based on the codec 6025 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 6026 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 6027 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 6028 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 6029 __ strb(r15, __ post(dst, 1)); 6030 __ strb(r14, __ post(dst, 1)); 6031 __ strb(r13, __ post(dst, 1)); 6032 __ strb(r12, __ post(dst, 1)); 6033 __ sub(length, length, 3); 6034 __ cbnz(length, Process3B); 6035 6036 __ BIND(Exit); 6037 __ ret(lr); 6038 6039 return start; 6040 } 6041 6042 void generate_base64_decode_simdround(Register src, Register dst, 6043 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 6044 6045 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 6046 FloatRegister out0 = v20, out1 = v21, out2 = v22; 6047 6048 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 6049 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 6050 6051 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 6052 6053 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 6054 6055 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 6056 6057 // we need unsigned saturating substract, to make sure all input values 6058 // in range [0, 63] will have 0U value in the higher half lookup 6059 __ uqsubv(decH0, __ T16B, in0, v27); 6060 __ uqsubv(decH1, __ T16B, in1, v27); 6061 __ uqsubv(decH2, __ T16B, in2, v27); 6062 __ uqsubv(decH3, __ T16B, in3, v27); 6063 6064 // lower half lookup 6065 __ tbl(decL0, arrangement, codecL, 4, in0); 6066 __ tbl(decL1, arrangement, codecL, 4, in1); 6067 __ tbl(decL2, arrangement, codecL, 4, in2); 6068 __ tbl(decL3, arrangement, codecL, 4, in3); 6069 6070 // higher half lookup 6071 __ tbx(decH0, arrangement, codecH, 4, decH0); 6072 __ tbx(decH1, arrangement, codecH, 4, decH1); 6073 __ tbx(decH2, arrangement, codecH, 4, decH2); 6074 __ tbx(decH3, arrangement, codecH, 4, decH3); 6075 6076 // combine lower and higher 6077 __ orr(decL0, arrangement, decL0, decH0); 6078 __ orr(decL1, arrangement, decL1, decH1); 6079 __ orr(decL2, arrangement, decL2, decH2); 6080 __ orr(decL3, arrangement, decL3, decH3); 6081 6082 // check illegal inputs, value larger than 63 (maximum of 6 bits) 6083 __ cmhi(decH0, arrangement, decL0, v27); 6084 __ cmhi(decH1, arrangement, decL1, v27); 6085 __ cmhi(decH2, arrangement, decL2, v27); 6086 __ cmhi(decH3, arrangement, decL3, v27); 6087 __ orr(in0, arrangement, decH0, decH1); 6088 __ orr(in1, arrangement, decH2, decH3); 6089 __ orr(in2, arrangement, in0, in1); 6090 __ umaxv(in3, arrangement, in2); 6091 __ umov(rscratch2, in3, __ B, 0); 6092 6093 // get the data to output 6094 __ shl(out0, arrangement, decL0, 2); 6095 __ ushr(out1, arrangement, decL1, 4); 6096 __ orr(out0, arrangement, out0, out1); 6097 __ shl(out1, arrangement, decL1, 4); 6098 __ ushr(out2, arrangement, decL2, 2); 6099 __ orr(out1, arrangement, out1, out2); 6100 __ shl(out2, arrangement, decL2, 6); 6101 __ orr(out2, arrangement, out2, decL3); 6102 6103 __ cbz(rscratch2, NoIllegalData); 6104 6105 // handle illegal input 6106 __ umov(r10, in2, __ 
D, 0); 6107 if (size == 16) { 6108 __ cbnz(r10, ErrorInLowerHalf); 6109 6110 // illegal input is in higher half, store the lower half now. 6111 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 6112 6113 __ umov(r10, in2, __ D, 1); 6114 __ umov(r11, out0, __ D, 1); 6115 __ umov(r12, out1, __ D, 1); 6116 __ umov(r13, out2, __ D, 1); 6117 __ b(StoreLegalData); 6118 6119 __ BIND(ErrorInLowerHalf); 6120 } 6121 __ umov(r11, out0, __ D, 0); 6122 __ umov(r12, out1, __ D, 0); 6123 __ umov(r13, out2, __ D, 0); 6124 6125 __ BIND(StoreLegalData); 6126 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 6127 __ strb(r11, __ post(dst, 1)); 6128 __ strb(r12, __ post(dst, 1)); 6129 __ strb(r13, __ post(dst, 1)); 6130 __ lsr(r10, r10, 8); 6131 __ lsr(r11, r11, 8); 6132 __ lsr(r12, r12, 8); 6133 __ lsr(r13, r13, 8); 6134 __ b(StoreLegalData); 6135 6136 __ BIND(NoIllegalData); 6137 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 6138 } 6139 6140 6141 /** 6142 * Arguments: 6143 * 6144 * Input: 6145 * c_rarg0 - src_start 6146 * c_rarg1 - src_offset 6147 * c_rarg2 - src_length 6148 * c_rarg3 - dest_start 6149 * c_rarg4 - dest_offset 6150 * c_rarg5 - isURL 6151 * c_rarg6 - isMIME 6152 * 6153 */ 6154 address generate_base64_decodeBlock() { 6155 6156 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 6157 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 6158 // titled "Base64 decoding". 6159 6160 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 6161 // except the trailing character '=' is also treated illegal value in this instrinsic. That 6162 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 6163 static const uint8_t fromBase64ForNoSIMD[256] = { 6164 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6165 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6166 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6167 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6168 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6169 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 6170 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6171 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6172 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6173 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6174 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6175 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6176 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6177 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6178 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6179 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6180 }; 6181 6182 static const uint8_t fromBase64URLForNoSIMD[256] = { 6183 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6184 255u, 
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6185 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6186 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6187 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6188 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 6189 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6190 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6191 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6192 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6193 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6194 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6195 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6196 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6197 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6198 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6199 }; 6200 6201 // A legal value of base64 code is in range [0, 127]. We need two lookups 6202 // with tbl/tbx and combine them to get the decode data. The 1st table vector 6203 // lookup use tbl, out of range indices are set to 0 in destination. The 2nd 6204 // table vector lookup use tbx, out of range indices are unchanged in 6205 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 6206 // The value of index 64 is set to 0, so that we know that we already get the 6207 // decoded data with the 1st lookup. 
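// Scalar sketch of the two-stage lookup described above (illustration only;
// v27 holds the constant 63, codecL/codecH are the lower/upper table halves):
//
//   hiIdx = sat_sub_u8(b, 63);                    // 0 for b < 64, b - 63 otherwise
//   lo    = (b     < 64) ? codecL[b]     : 0;     // tbl: out-of-range index -> 0
//   hi    = (hiIdx < 64) ? codecH[hiIdx] : hiIdx; // tbx: out-of-range -> unchanged
//   dec   = lo | hi;                              // codecH[0] == 0, so b < 64 keeps lo
//   if (dec > 63) the input byte was illegal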
6208 static const uint8_t fromBase64ForSIMD[128] = { 6209 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6210 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6211 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6212 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6213 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6214 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6215 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6216 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6217 }; 6218 6219 static const uint8_t fromBase64URLForSIMD[128] = { 6220 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6221 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6222 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6223 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6224 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6225 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6226 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6227 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6228 }; 6229 6230 __ align(CodeEntryAlignment); 6231 StubCodeMark mark(this, "StubRoutines", "decodeBlock"); 6232 address start = __ pc(); 6233 6234 Register src = c_rarg0; // source array 6235 Register soff = c_rarg1; // source start offset 6236 Register send = c_rarg2; // source end offset 6237 Register dst = c_rarg3; // dest array 6238 Register doff = c_rarg4; // position for writing to dest array 6239 Register isURL = c_rarg5; // Base64 or URL character set 6240 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 6241 6242 Register length = send; // reuse send as length of source data to process 6243 6244 Register simd_codec = c_rarg6; 6245 Register nosimd_codec = c_rarg7; 6246 6247 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 6248 6249 __ enter(); 6250 6251 __ add(src, src, soff); 6252 __ add(dst, dst, doff); 6253 6254 __ mov(doff, dst); 6255 6256 __ sub(length, send, soff); 6257 __ bfm(length, zr, 0, 1); 6258 6259 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 6260 __ cbz(isURL, ProcessData); 6261 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 6262 6263 __ BIND(ProcessData); 6264 __ mov(rscratch1, length); 6265 __ cmp(length, (u1)144); // 144 = 80 + 64 6266 __ br(Assembler::LT, Process4B); 6267 6268 // In the MIME case, the line length cannot be more than 76 6269 // bytes (see RFC 2045). This is too short a block for SIMD 6270 // to be worthwhile, so we use non-SIMD here. 
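// rscratch1 is primed to 79 so that the Process4B loop below (subsw ..., 4)
// runs exactly 20 times (80 bytes) and exits with rscratch1 == -1, which
// distinguishes this pre-pass from the plain short-input exit (rscratch1 == 0),
// as noted after the loop. For reference, one Process4B round is the usual
// scalar Base64 step (sketch only, assuming the standard packing; 255 in the
// table marks an illegal symbol):
//
//   int a = codec[src[0]], b = codec[src[1]], c = codec[src[2]], d = codec[src[3]];
//   if ((a | b | c | d) & 0x80) goto Exit;        // error detection
//   dst[0] = (a << 2) | (b >> 4);
//   dst[1] = (b << 4) | (c >> 2);
//   dst[2] = (c << 6) | d;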
6271 __ movw(rscratch1, 79); 6272 6273 __ BIND(Process4B); 6274 __ ldrw(r14, __ post(src, 4)); 6275 __ ubfxw(r10, r14, 0, 8); 6276 __ ubfxw(r11, r14, 8, 8); 6277 __ ubfxw(r12, r14, 16, 8); 6278 __ ubfxw(r13, r14, 24, 8); 6279 // get the de-code 6280 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 6281 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 6282 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 6283 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 6284 // error detection, 255u indicates an illegal input 6285 __ orrw(r14, r10, r11); 6286 __ orrw(r15, r12, r13); 6287 __ orrw(r14, r14, r15); 6288 __ tbnz(r14, 7, Exit); 6289 // recover the data 6290 __ lslw(r14, r10, 10); 6291 __ bfiw(r14, r11, 4, 6); 6292 __ bfmw(r14, r12, 2, 5); 6293 __ rev16w(r14, r14); 6294 __ bfiw(r13, r12, 6, 2); 6295 __ strh(r14, __ post(dst, 2)); 6296 __ strb(r13, __ post(dst, 1)); 6297 // non-simd loop 6298 __ subsw(rscratch1, rscratch1, 4); 6299 __ br(Assembler::GT, Process4B); 6300 6301 // if exiting from PreProcess80B, rscratch1 == -1; 6302 // otherwise, rscratch1 == 0. 6303 __ cbzw(rscratch1, Exit); 6304 __ sub(length, length, 80); 6305 6306 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 6307 __ cbz(isURL, SIMDEnter); 6308 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 6309 6310 __ BIND(SIMDEnter); 6311 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 6312 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 6313 __ mov(rscratch1, 63); 6314 __ dup(v27, __ T16B, rscratch1); 6315 6316 __ BIND(Process64B); 6317 __ cmp(length, (u1)64); 6318 __ br(Assembler::LT, Process32B); 6319 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 6320 __ sub(length, length, 64); 6321 __ b(Process64B); 6322 6323 __ BIND(Process32B); 6324 __ cmp(length, (u1)32); 6325 __ br(Assembler::LT, SIMDExit); 6326 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 6327 __ sub(length, length, 32); 6328 __ b(Process32B); 6329 6330 __ BIND(SIMDExit); 6331 __ cbz(length, Exit); 6332 __ movw(rscratch1, length); 6333 __ b(Process4B); 6334 6335 __ BIND(Exit); 6336 __ sub(c_rarg0, dst, doff); 6337 6338 __ leave(); 6339 __ ret(lr); 6340 6341 return start; 6342 } 6343 6344 address generate_ghash_processBlocks_wide() { 6345 address small = generate_ghash_processBlocks(); 6346 6347 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); 6348 __ align(wordSize * 2); 6349 address p = __ pc(); 6350 __ emit_int64(0x87); // The low-order bits of the field 6351 // polynomial (i.e. 
p = z^7+z^2+z+1) 6352 // repeated in the low and high parts of a 6353 // 128-bit vector 6354 __ emit_int64(0x87); 6355 6356 __ align(CodeEntryAlignment); 6357 address start = __ pc(); 6358 6359 Register state = c_rarg0; 6360 Register subkeyH = c_rarg1; 6361 Register data = c_rarg2; 6362 Register blocks = c_rarg3; 6363 6364 const int unroll = 4; 6365 6366 __ cmp(blocks, (unsigned char)(unroll * 2)); 6367 __ br(__ LT, small); 6368 6369 if (unroll > 1) { 6370 // Save state before entering routine 6371 __ sub(sp, sp, 4 * 16); 6372 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 6373 __ sub(sp, sp, 4 * 16); 6374 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 6375 } 6376 6377 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 6378 6379 if (unroll > 1) { 6380 // And restore state 6381 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 6382 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 6383 } 6384 6385 __ cmp(blocks, zr); 6386 __ br(__ GT, small); 6387 6388 __ ret(lr); 6389 6390 return start; 6391 } 6392 6393 // Support for spin waits. 6394 address generate_spin_wait() { 6395 __ align(CodeEntryAlignment); 6396 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 6397 address start = __ pc(); 6398 6399 __ spin_wait(); 6400 __ ret(lr); 6401 6402 return start; 6403 } 6404 6405 #ifdef LINUX 6406 6407 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 6408 // 6409 // If LSE is in use, generate LSE versions of all the stubs. The 6410 // non-LSE versions are in atomic_aarch64.S. 6411 6412 // class AtomicStubMark records the entry point of a stub and the 6413 // stub pointer which will point to it. The stub pointer is set to 6414 // the entry point when ~AtomicStubMark() is called, which must be 6415 // after ICache::invalidate_range. This ensures safe publication of 6416 // the generated code. 6417 class AtomicStubMark { 6418 address _entry_point; 6419 aarch64_atomic_stub_t *_stub; 6420 MacroAssembler *_masm; 6421 public: 6422 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 6423 _masm = masm; 6424 __ align(32); 6425 _entry_point = __ pc(); 6426 _stub = stub; 6427 } 6428 ~AtomicStubMark() { 6429 *_stub = (aarch64_atomic_stub_t)_entry_point; 6430 } 6431 }; 6432 6433 // NB: For memory_order_conservative we need a trailing membar after 6434 // LSE atomic operations but not a leading membar. 6435 // 6436 // We don't need a leading membar because a clause in the Arm ARM 6437 // says: 6438 // 6439 // Barrier-ordered-before 6440 // 6441 // Barrier instructions order prior Memory effects before subsequent 6442 // Memory effects generated by the same Observer. A read or a write 6443 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 6444 // Observer if and only if RW1 appears in program order before RW 2 6445 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 6446 // instruction with both Acquire and Release semantics. 6447 // 6448 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 6449 // and Release semantics, therefore we don't need a leading 6450 // barrier. However, there is no corresponding Barrier-ordered-after 6451 // relationship, therefore we need a trailing membar to prevent a 6452 // later store or load from being reordered with the store in an 6453 // atomic instruction. 
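// As a concrete illustration (approximate; see gen_ldaddal_entry() below for
// the authoritative code), the conservative 64-bit fetch-and-add stub boils
// down to:
//
//   ldaddal x1, x2, [x0]    // x2 <- old value; Acquire+Release semantics
//   dmb     ish             // trailing barrier for memory_order_conservative
//   mov     x0, x2
//   ret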
6454 // 6455 // This was checked by using the herd7 consistency model simulator 6456 // (http://diy.inria.fr/) with this test case: 6457 // 6458 // AArch64 LseCas 6459 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 6460 // P0 | P1; 6461 // LDR W4, [X2] | MOV W3, #0; 6462 // DMB LD | MOV W4, #1; 6463 // LDR W3, [X1] | CASAL W3, W4, [X1]; 6464 // | DMB ISH; 6465 // | STR W4, [X2]; 6466 // exists 6467 // (0:X3=0 /\ 0:X4=1) 6468 // 6469 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 6470 // with the store to x in P1. Without the DMB in P1 this may happen. 6471 // 6472 // At the time of writing we don't know of any AArch64 hardware that 6473 // reorders stores in this way, but the Reference Manual permits it. 6474 6475 void gen_cas_entry(Assembler::operand_size size, 6476 atomic_memory_order order) { 6477 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 6478 exchange_val = c_rarg2; 6479 bool acquire, release; 6480 switch (order) { 6481 case memory_order_relaxed: 6482 acquire = false; 6483 release = false; 6484 break; 6485 case memory_order_release: 6486 acquire = false; 6487 release = true; 6488 break; 6489 default: 6490 acquire = true; 6491 release = true; 6492 break; 6493 } 6494 __ mov(prev, compare_val); 6495 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 6496 if (order == memory_order_conservative) { 6497 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6498 } 6499 if (size == Assembler::xword) { 6500 __ mov(r0, prev); 6501 } else { 6502 __ movw(r0, prev); 6503 } 6504 __ ret(lr); 6505 } 6506 6507 void gen_ldaddal_entry(Assembler::operand_size size) { 6508 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6509 __ ldaddal(size, incr, prev, addr); 6510 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6511 if (size == Assembler::xword) { 6512 __ mov(r0, prev); 6513 } else { 6514 __ movw(r0, prev); 6515 } 6516 __ ret(lr); 6517 } 6518 6519 void gen_swpal_entry(Assembler::operand_size size) { 6520 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 6521 __ swpal(size, incr, prev, addr); 6522 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 6523 if (size == Assembler::xword) { 6524 __ mov(r0, prev); 6525 } else { 6526 __ movw(r0, prev); 6527 } 6528 __ ret(lr); 6529 } 6530 6531 void generate_atomic_entry_points() { 6532 if (! 
UseLSE) { 6533 return; 6534 } 6535 6536 __ align(CodeEntryAlignment); 6537 StubCodeMark mark(this, "StubRoutines", "atomic entry points"); 6538 address first_entry = __ pc(); 6539 6540 // All memory_order_conservative 6541 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 6542 gen_ldaddal_entry(Assembler::word); 6543 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 6544 gen_ldaddal_entry(Assembler::xword); 6545 6546 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 6547 gen_swpal_entry(Assembler::word); 6548 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 6549 gen_swpal_entry(Assembler::xword); 6550 6551 // CAS, memory_order_conservative 6552 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 6553 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 6554 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 6555 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 6556 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 6557 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 6558 6559 // CAS, memory_order_relaxed 6560 AtomicStubMark mark_cmpxchg_1_relaxed 6561 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 6562 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 6563 AtomicStubMark mark_cmpxchg_4_relaxed 6564 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 6565 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 6566 AtomicStubMark mark_cmpxchg_8_relaxed 6567 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 6568 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 6569 6570 AtomicStubMark mark_cmpxchg_4_release 6571 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 6572 gen_cas_entry(MacroAssembler::word, memory_order_release); 6573 AtomicStubMark mark_cmpxchg_8_release 6574 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 6575 gen_cas_entry(MacroAssembler::xword, memory_order_release); 6576 6577 AtomicStubMark mark_cmpxchg_4_seq_cst 6578 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 6579 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 6580 AtomicStubMark mark_cmpxchg_8_seq_cst 6581 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 6582 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 6583 6584 ICache::invalidate_range(first_entry, __ pc() - first_entry); 6585 } 6586 #endif // LINUX 6587 6588 // Continuation point for throwing of implicit exceptions that are 6589 // not handled in the current activation. Fabricates an exception 6590 // oop and initiates normal exception dispatching in this 6591 // frame. Since we need to preserve callee-saved values (currently 6592 // only for C2, but done for C1 as well) we need a callee-saved oop 6593 // map and therefore have to make these stubs into RuntimeStubs 6594 // rather than BufferBlobs. If the compiler needs all registers to 6595 // be preserved between the fault point and the exception handler 6596 // then it must assume responsibility for that in 6597 // AbstractCompiler::continuation_for_implicit_null_exception or 6598 // continuation_for_implicit_division_by_zero_exception. All other 6599 // implicit exceptions (e.g., NullPointerException or 6600 // AbstractMethodError on entry) are either at call sites or 6601 // otherwise assume that stack unwinding will be initiated, so 6602 // caller saved registers were assumed volatile in the compiler. 
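// For reference, the stubs produced by generate_throw_exception() below are
// installed along these lines (shape assumed for illustration; the actual
// registrations live in the generate_* driver methods of this generator):
//
//   StubRoutines::_throw_StackOverflowError_entry =
//     generate_throw_exception("StackOverflowError throw_exception",
//                              CAST_FROM_FN_PTR(address,
//                                               SharedRuntime::throw_StackOverflowError));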
6603 6604 #undef __ 6605 #define __ masm-> 6606 6607 address generate_throw_exception(const char* name, 6608 address runtime_entry, 6609 Register arg1 = noreg, 6610 Register arg2 = noreg) { 6611 // Information about frame layout at time of blocking runtime call. 6612 // Note that we only have to preserve callee-saved registers since 6613 // the compilers are responsible for supplying a continuation point 6614 // if they expect all registers to be preserved. 6615 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 6616 enum layout { 6617 rfp_off = 0, 6618 rfp_off2, 6619 return_off, 6620 return_off2, 6621 framesize // inclusive of return address 6622 }; 6623 6624 int insts_size = 512; 6625 int locs_size = 64; 6626 6627 CodeBuffer code(name, insts_size, locs_size); 6628 OopMapSet* oop_maps = new OopMapSet(); 6629 MacroAssembler* masm = new MacroAssembler(&code); 6630 6631 address start = __ pc(); 6632 6633 // This is an inlined and slightly modified version of call_VM 6634 // which has the ability to fetch the return PC out of 6635 // thread-local storage and also sets up last_Java_sp slightly 6636 // differently than the real call_VM 6637 6638 __ enter(); // Save FP and LR before call 6639 6640 assert(is_even(framesize/2), "sp not 16-byte aligned"); 6641 6642 // lr and fp are already in place 6643 __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog 6644 6645 int frame_complete = __ pc() - start; 6646 6647 // Set up last_Java_sp and last_Java_fp 6648 address the_pc = __ pc(); 6649 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 6650 6651 // Call runtime 6652 if (arg1 != noreg) { 6653 assert(arg2 != c_rarg1, "clobbered"); 6654 __ mov(c_rarg1, arg1); 6655 } 6656 if (arg2 != noreg) { 6657 __ mov(c_rarg2, arg2); 6658 } 6659 __ mov(c_rarg0, rthread); 6660 BLOCK_COMMENT("call runtime_entry"); 6661 __ mov(rscratch1, runtime_entry); 6662 __ blr(rscratch1); 6663 6664 // Generate oop map 6665 OopMap* map = new OopMap(framesize, 0); 6666 6667 oop_maps->add_gc_map(the_pc - start, map); 6668 6669 __ reset_last_Java_frame(true); 6670 6671 // Reinitialize the ptrue predicate register, in case the external runtime 6672 // call clobbers ptrue reg, as we may return to SVE compiled code. 
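// (ptrue here refers to the SVE predicate register (p7) that compiled SVE code
// expects to hold an all-true predicate; the call below essentially re-emits
// "ptrue p7.b" when SVE is in use -- assumed detail, see
// MacroAssembler::reinitialize_ptrue() for the authoritative sequence.)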
6673 __ reinitialize_ptrue(); 6674 6675 __ leave(); 6676 6677 // check for pending exceptions 6678 #ifdef ASSERT 6679 Label L; 6680 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 6681 __ cbnz(rscratch1, L); 6682 __ should_not_reach_here(); 6683 __ bind(L); 6684 #endif // ASSERT 6685 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 6686 6687 6688 // codeBlob framesize is in words (not VMRegImpl::slot_size) 6689 RuntimeStub* stub = 6690 RuntimeStub::new_runtime_stub(name, 6691 &code, 6692 frame_complete, 6693 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 6694 oop_maps, false); 6695 return stub->entry_point(); 6696 } 6697 6698 class MontgomeryMultiplyGenerator : public MacroAssembler { 6699 6700 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 6701 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 6702 6703 RegSet _toSave; 6704 bool _squaring; 6705 6706 public: 6707 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 6708 : MacroAssembler(as->code()), _squaring(squaring) { 6709 6710 // Register allocation 6711 6712 RegSetIterator<> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 6713 Pa_base = *regs; // Argument registers 6714 if (squaring) 6715 Pb_base = Pa_base; 6716 else 6717 Pb_base = *++regs; 6718 Pn_base = *++regs; 6719 Rlen = *++regs; 6720 inv = *++regs; 6721 Pm_base = *++regs; 6722 6723 // Working registers: 6724 Ra = *++regs; // The current digit of a, b, n, and m. 6725 Rb = *++regs; 6726 Rm = *++regs; 6727 Rn = *++regs; 6728 6729 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 6730 Pb = *++regs; 6731 Pm = *++regs; 6732 Pn = *++regs; 6733 6734 t0 = *++regs; // Three registers which form a 6735 t1 = *++regs; // triple-precision accumulator. 6736 t2 = *++regs; 6737 6738 Ri = *++regs; // Inner and outer loop indexes. 6739 Rj = *++regs; 6740 6741 Rhi_ab = *++regs; // Product registers: low and high parts 6742 Rlo_ab = *++regs; // of a*b and m*n. 6743 Rhi_mn = *++regs; 6744 Rlo_mn = *++regs; 6745 6746 // r19 and up are callee-saved.
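// Everything handed out from r19 up to the last register allocated above is
// callee-saved and must be preserved across the stub; Pm_base is added to the
// set explicitly because it is an argument register (below r19) which
// generate_multiply() repoints at the on-stack scratch area and whose
// caller-supplied value restore_regs() brings back at the end.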
6747 _toSave = RegSet::range(r19, *regs) + Pm_base; 6748 } 6749 6750 private: 6751 void save_regs() { 6752 push(_toSave, sp); 6753 } 6754 6755 void restore_regs() { 6756 pop(_toSave, sp); 6757 } 6758 6759 template <typename T> 6760 void unroll_2(Register count, T block) { 6761 Label loop, end, odd; 6762 tbnz(count, 0, odd); 6763 cbz(count, end); 6764 align(16); 6765 bind(loop); 6766 (this->*block)(); 6767 bind(odd); 6768 (this->*block)(); 6769 subs(count, count, 2); 6770 br(Assembler::GT, loop); 6771 bind(end); 6772 } 6773 6774 template <typename T> 6775 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 6776 Label loop, end, odd; 6777 tbnz(count, 0, odd); 6778 cbz(count, end); 6779 align(16); 6780 bind(loop); 6781 (this->*block)(d, s, tmp); 6782 bind(odd); 6783 (this->*block)(d, s, tmp); 6784 subs(count, count, 2); 6785 br(Assembler::GT, loop); 6786 bind(end); 6787 } 6788 6789 void pre1(RegisterOrConstant i) { 6790 block_comment("pre1"); 6791 // Pa = Pa_base; 6792 // Pb = Pb_base + i; 6793 // Pm = Pm_base; 6794 // Pn = Pn_base + i; 6795 // Ra = *Pa; 6796 // Rb = *Pb; 6797 // Rm = *Pm; 6798 // Rn = *Pn; 6799 ldr(Ra, Address(Pa_base)); 6800 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 6801 ldr(Rm, Address(Pm_base)); 6802 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 6803 lea(Pa, Address(Pa_base)); 6804 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 6805 lea(Pm, Address(Pm_base)); 6806 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 6807 6808 // Zero the m*n result. 6809 mov(Rhi_mn, zr); 6810 mov(Rlo_mn, zr); 6811 } 6812 6813 // The core multiply-accumulate step of a Montgomery 6814 // multiplication. The idea is to schedule operations as a 6815 // pipeline so that instructions with long latencies (loads and 6816 // multiplies) have time to complete before their results are 6817 // used. This most benefits in-order implementations of the 6818 // architecture but out-of-order ones also benefit. 6819 void step() { 6820 block_comment("step"); 6821 // MACC(Ra, Rb, t0, t1, t2); 6822 // Ra = *++Pa; 6823 // Rb = *--Pb; 6824 umulh(Rhi_ab, Ra, Rb); 6825 mul(Rlo_ab, Ra, Rb); 6826 ldr(Ra, pre(Pa, wordSize)); 6827 ldr(Rb, pre(Pb, -wordSize)); 6828 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 6829 // previous iteration. 6830 // MACC(Rm, Rn, t0, t1, t2); 6831 // Rm = *++Pm; 6832 // Rn = *--Pn; 6833 umulh(Rhi_mn, Rm, Rn); 6834 mul(Rlo_mn, Rm, Rn); 6835 ldr(Rm, pre(Pm, wordSize)); 6836 ldr(Rn, pre(Pn, -wordSize)); 6837 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 6838 } 6839 6840 void post1() { 6841 block_comment("post1"); 6842 6843 // MACC(Ra, Rb, t0, t1, t2); 6844 // Ra = *++Pa; 6845 // Rb = *--Pb; 6846 umulh(Rhi_ab, Ra, Rb); 6847 mul(Rlo_ab, Ra, Rb); 6848 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 6849 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 6850 6851 // *Pm = Rm = t0 * inv; 6852 mul(Rm, t0, inv); 6853 str(Rm, Address(Pm)); 6854 6855 // MACC(Rm, Rn, t0, t1, t2); 6856 // t0 = t1; t1 = t2; t2 = 0; 6857 umulh(Rhi_mn, Rm, Rn); 6858 6859 #ifndef PRODUCT 6860 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 6861 { 6862 mul(Rlo_mn, Rm, Rn); 6863 add(Rlo_mn, t0, Rlo_mn); 6864 Label ok; 6865 cbz(Rlo_mn, ok); { 6866 stop("broken Montgomery multiply"); 6867 } bind(ok); 6868 } 6869 #endif 6870 // We have very carefully set things up so that 6871 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 6872 // the lower half of Rm * Rn because we know the result already: 6873 // it must be -t0. 
t0 + (-t0) must generate a carry iff 6874 // t0 != 0. So, rather than do a mul and an adds we just set 6875 // the carry flag iff t0 is nonzero. 6876 // 6877 // mul(Rlo_mn, Rm, Rn); 6878 // adds(zr, t0, Rlo_mn); 6879 subs(zr, t0, 1); // Set carry iff t0 is nonzero 6880 adcs(t0, t1, Rhi_mn); 6881 adc(t1, t2, zr); 6882 mov(t2, zr); 6883 } 6884 6885 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 6886 block_comment("pre2"); 6887 // Pa = Pa_base + i-len; 6888 // Pb = Pb_base + len; 6889 // Pm = Pm_base + i-len; 6890 // Pn = Pn_base + len; 6891 6892 if (i.is_register()) { 6893 sub(Rj, i.as_register(), len); 6894 } else { 6895 mov(Rj, i.as_constant()); 6896 sub(Rj, Rj, len); 6897 } 6898 // Rj == i-len 6899 6900 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 6901 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 6902 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 6903 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 6904 6905 // Ra = *++Pa; 6906 // Rb = *--Pb; 6907 // Rm = *++Pm; 6908 // Rn = *--Pn; 6909 ldr(Ra, pre(Pa, wordSize)); 6910 ldr(Rb, pre(Pb, -wordSize)); 6911 ldr(Rm, pre(Pm, wordSize)); 6912 ldr(Rn, pre(Pn, -wordSize)); 6913 6914 mov(Rhi_mn, zr); 6915 mov(Rlo_mn, zr); 6916 } 6917 6918 void post2(RegisterOrConstant i, RegisterOrConstant len) { 6919 block_comment("post2"); 6920 if (i.is_constant()) { 6921 mov(Rj, i.as_constant()-len.as_constant()); 6922 } else { 6923 sub(Rj, i.as_register(), len); 6924 } 6925 6926 adds(t0, t0, Rlo_mn); // The pending m*n, low part 6927 6928 // As soon as we know the least significant digit of our result, 6929 // store it. 6930 // Pm_base[i-len] = t0; 6931 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 6932 6933 // t0 = t1; t1 = t2; t2 = 0; 6934 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 6935 adc(t1, t2, zr); 6936 mov(t2, zr); 6937 } 6938 6939 // A carry in t0 after Montgomery multiplication means that we 6940 // should subtract multiples of n from our result in m. We'll 6941 // keep doing that until there is no carry. 6942 void normalize(RegisterOrConstant len) { 6943 block_comment("normalize"); 6944 // while (t0) 6945 // t0 = sub(Pm_base, Pn_base, t0, len); 6946 Label loop, post, again; 6947 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 6948 cbz(t0, post); { 6949 bind(again); { 6950 mov(i, zr); 6951 mov(cnt, len); 6952 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 6953 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 6954 subs(zr, zr, zr); // set carry flag, i.e. no borrow 6955 align(16); 6956 bind(loop); { 6957 sbcs(Rm, Rm, Rn); 6958 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 6959 add(i, i, 1); 6960 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 6961 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 6962 sub(cnt, cnt, 1); 6963 } cbnz(cnt, loop); 6964 sbc(t0, t0, zr); 6965 } cbnz(t0, again); 6966 } bind(post); 6967 } 6968 6969 // Move memory at s to d, reversing words. 
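// (each 64-bit word is also rotated by 32 bits as it is copied, so the two
//  32-bit halves within it swap places -- see reverse1() below)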
6970 // Increments d to end of copied memory 6971 // Destroys tmp1, tmp2 6972 // Preserves len 6973 // Leaves s pointing to the address which was in d at start 6974 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 6975 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 6976 6977 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 6978 mov(tmp1, len); 6979 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 6980 sub(s, d, len, ext::uxtw, LogBytesPerWord); 6981 } 6982 // where 6983 void reverse1(Register d, Register s, Register tmp) { 6984 ldr(tmp, pre(s, -wordSize)); 6985 ror(tmp, tmp, 32); 6986 str(tmp, post(d, wordSize)); 6987 } 6988 6989 void step_squaring() { 6990 // An extra ACC 6991 step(); 6992 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 6993 } 6994 6995 void last_squaring(RegisterOrConstant i) { 6996 Label dont; 6997 // if ((i & 1) == 0) { 6998 tbnz(i.as_register(), 0, dont); { 6999 // MACC(Ra, Rb, t0, t1, t2); 7000 // Ra = *++Pa; 7001 // Rb = *--Pb; 7002 umulh(Rhi_ab, Ra, Rb); 7003 mul(Rlo_ab, Ra, Rb); 7004 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7005 } bind(dont); 7006 } 7007 7008 void extra_step_squaring() { 7009 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7010 7011 // MACC(Rm, Rn, t0, t1, t2); 7012 // Rm = *++Pm; 7013 // Rn = *--Pn; 7014 umulh(Rhi_mn, Rm, Rn); 7015 mul(Rlo_mn, Rm, Rn); 7016 ldr(Rm, pre(Pm, wordSize)); 7017 ldr(Rn, pre(Pn, -wordSize)); 7018 } 7019 7020 void post1_squaring() { 7021 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7022 7023 // *Pm = Rm = t0 * inv; 7024 mul(Rm, t0, inv); 7025 str(Rm, Address(Pm)); 7026 7027 // MACC(Rm, Rn, t0, t1, t2); 7028 // t0 = t1; t1 = t2; t2 = 0; 7029 umulh(Rhi_mn, Rm, Rn); 7030 7031 #ifndef PRODUCT 7032 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7033 { 7034 mul(Rlo_mn, Rm, Rn); 7035 add(Rlo_mn, t0, Rlo_mn); 7036 Label ok; 7037 cbz(Rlo_mn, ok); { 7038 stop("broken Montgomery multiply"); 7039 } bind(ok); 7040 } 7041 #endif 7042 // We have very carefully set things up so that 7043 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7044 // the lower half of Rm * Rn because we know the result already: 7045 // it must be -t0. t0 + (-t0) must generate a carry iff 7046 // t0 != 0. So, rather than do a mul and an adds we just set 7047 // the carry flag iff t0 is nonzero. 7048 // 7049 // mul(Rlo_mn, Rm, Rn); 7050 // adds(zr, t0, Rlo_mn); 7051 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7052 adcs(t0, t1, Rhi_mn); 7053 adc(t1, t2, zr); 7054 mov(t2, zr); 7055 } 7056 7057 void acc(Register Rhi, Register Rlo, 7058 Register t0, Register t1, Register t2) { 7059 adds(t0, t0, Rlo); 7060 adcs(t1, t1, Rhi); 7061 adc(t2, t2, zr); 7062 } 7063 7064 public: 7065 /** 7066 * Fast Montgomery multiplication. The derivation of the 7067 * algorithm is in A Cryptographic Library for the Motorola 7068 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
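 * In the block comments and the C sketches below, MACC(A, B, t0, t1, t2)
 * denotes the multiply-accumulate t2:t1:t0 += A * B on the triple-precision
 * accumulator, and MACC2 accumulates the same product twice (when squaring,
 * each cross term a[i]*a[j] with i != j occurs as a pair).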
7069 * 7070 * Arguments: 7071 * 7072 * Inputs for multiplication: 7073 * c_rarg0 - int array elements a 7074 * c_rarg1 - int array elements b 7075 * c_rarg2 - int array elements n (the modulus) 7076 * c_rarg3 - int length 7077 * c_rarg4 - int inv 7078 * c_rarg5 - int array elements m (the result) 7079 * 7080 * Inputs for squaring: 7081 * c_rarg0 - int array elements a 7082 * c_rarg1 - int array elements n (the modulus) 7083 * c_rarg2 - int length 7084 * c_rarg3 - int inv 7085 * c_rarg4 - int array elements m (the result) 7086 * 7087 */ 7088 address generate_multiply() { 7089 Label argh, nothing; 7090 bind(argh); 7091 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7092 7093 align(CodeEntryAlignment); 7094 address entry = pc(); 7095 7096 cbzw(Rlen, nothing); 7097 7098 enter(); 7099 7100 // Make room. 7101 cmpw(Rlen, 512); 7102 br(Assembler::HI, argh); 7103 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7104 andr(sp, Ra, -2 * wordSize); 7105 7106 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7107 7108 { 7109 // Copy input args, reversing as we go. We use Ra as a 7110 // temporary variable. 7111 reverse(Ra, Pa_base, Rlen, t0, t1); 7112 if (!_squaring) 7113 reverse(Ra, Pb_base, Rlen, t0, t1); 7114 reverse(Ra, Pn_base, Rlen, t0, t1); 7115 } 7116 7117 // Push all call-saved registers and also Pm_base which we'll need 7118 // at the end. 7119 save_regs(); 7120 7121 #ifndef PRODUCT 7122 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 7123 { 7124 ldr(Rn, Address(Pn_base, 0)); 7125 mul(Rlo_mn, Rn, inv); 7126 subs(zr, Rlo_mn, -1); 7127 Label ok; 7128 br(EQ, ok); { 7129 stop("broken inverse in Montgomery multiply"); 7130 } bind(ok); 7131 } 7132 #endif 7133 7134 mov(Pm_base, Ra); 7135 7136 mov(t0, zr); 7137 mov(t1, zr); 7138 mov(t2, zr); 7139 7140 block_comment("for (int i = 0; i < len; i++) {"); 7141 mov(Ri, zr); { 7142 Label loop, end; 7143 cmpw(Ri, Rlen); 7144 br(Assembler::GE, end); 7145 7146 bind(loop); 7147 pre1(Ri); 7148 7149 block_comment(" for (j = i; j; j--) {"); { 7150 movw(Rj, Ri); 7151 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7152 } block_comment(" } // j"); 7153 7154 post1(); 7155 addw(Ri, Ri, 1); 7156 cmpw(Ri, Rlen); 7157 br(Assembler::LT, loop); 7158 bind(end); 7159 block_comment("} // i"); 7160 } 7161 7162 block_comment("for (int i = len; i < 2*len; i++) {"); 7163 mov(Ri, Rlen); { 7164 Label loop, end; 7165 cmpw(Ri, Rlen, Assembler::LSL, 1); 7166 br(Assembler::GE, end); 7167 7168 bind(loop); 7169 pre2(Ri, Rlen); 7170 7171 block_comment(" for (j = len*2-i-1; j; j--) {"); { 7172 lslw(Rj, Rlen, 1); 7173 subw(Rj, Rj, Ri); 7174 subw(Rj, Rj, 1); 7175 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 7176 } block_comment(" } // j"); 7177 7178 post2(Ri, Rlen); 7179 addw(Ri, Ri, 1); 7180 cmpw(Ri, Rlen, Assembler::LSL, 1); 7181 br(Assembler::LT, loop); 7182 bind(end); 7183 } 7184 block_comment("} // i"); 7185 7186 normalize(Rlen); 7187 7188 mov(Ra, Pm_base); // Save Pm_base in Ra 7189 restore_regs(); // Restore caller's Pm_base 7190 7191 // Copy our result into caller's Pm_base 7192 reverse(Pm_base, Ra, Rlen, t0, t1); 7193 7194 leave(); 7195 bind(nothing); 7196 ret(lr); 7197 7198 return entry; 7199 } 7200 // In C, approximately: 7201 7202 // void 7203 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 7204 // julong Pn_base[], julong Pm_base[], 7205 // julong inv, int len) { 7206 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7207 // julong *Pa, *Pb, *Pn, *Pm; 7208 // julong Ra, Rb, Rn, Rm; 7209 7210 // 
int i; 7211 7212 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7213 7214 // for (i = 0; i < len; i++) { 7215 // int j; 7216 7217 // Pa = Pa_base; 7218 // Pb = Pb_base + i; 7219 // Pm = Pm_base; 7220 // Pn = Pn_base + i; 7221 7222 // Ra = *Pa; 7223 // Rb = *Pb; 7224 // Rm = *Pm; 7225 // Rn = *Pn; 7226 7227 // int iters = i; 7228 // for (j = 0; iters--; j++) { 7229 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7230 // MACC(Ra, Rb, t0, t1, t2); 7231 // Ra = *++Pa; 7232 // Rb = *--Pb; 7233 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7234 // MACC(Rm, Rn, t0, t1, t2); 7235 // Rm = *++Pm; 7236 // Rn = *--Pn; 7237 // } 7238 7239 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 7240 // MACC(Ra, Rb, t0, t1, t2); 7241 // *Pm = Rm = t0 * inv; 7242 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7243 // MACC(Rm, Rn, t0, t1, t2); 7244 7245 // assert(t0 == 0, "broken Montgomery multiply"); 7246 7247 // t0 = t1; t1 = t2; t2 = 0; 7248 // } 7249 7250 // for (i = len; i < 2*len; i++) { 7251 // int j; 7252 7253 // Pa = Pa_base + i-len; 7254 // Pb = Pb_base + len; 7255 // Pm = Pm_base + i-len; 7256 // Pn = Pn_base + len; 7257 7258 // Ra = *++Pa; 7259 // Rb = *--Pb; 7260 // Rm = *++Pm; 7261 // Rn = *--Pn; 7262 7263 // int iters = len*2-i-1; 7264 // for (j = i-len+1; iters--; j++) { 7265 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 7266 // MACC(Ra, Rb, t0, t1, t2); 7267 // Ra = *++Pa; 7268 // Rb = *--Pb; 7269 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7270 // MACC(Rm, Rn, t0, t1, t2); 7271 // Rm = *++Pm; 7272 // Rn = *--Pn; 7273 // } 7274 7275 // Pm_base[i-len] = t0; 7276 // t0 = t1; t1 = t2; t2 = 0; 7277 // } 7278 7279 // while (t0) 7280 // t0 = sub(Pm_base, Pn_base, t0, len); 7281 // } 7282 7283 /** 7284 * Fast Montgomery squaring. This uses asymptotically 25% fewer 7285 * multiplies than Montgomery multiplication so it should be up to 7286 * 25% faster. However, its loop control is more complex and it 7287 * may actually run slower on some machines. 7288 * 7289 * Arguments: 7290 * 7291 * Inputs: 7292 * c_rarg0 - int array elements a 7293 * c_rarg1 - int array elements n (the modulus) 7294 * c_rarg2 - int length 7295 * c_rarg3 - int inv 7296 * c_rarg4 - int array elements m (the result) 7297 * 7298 */ 7299 address generate_square() { 7300 Label argh; 7301 bind(argh); 7302 stop("MontgomeryMultiply total_allocation must be <= 8192"); 7303 7304 align(CodeEntryAlignment); 7305 address entry = pc(); 7306 7307 enter(); 7308 7309 // Make room. 7310 cmpw(Rlen, 512); 7311 br(Assembler::HI, argh); 7312 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 7313 andr(sp, Ra, -2 * wordSize); 7314 7315 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 7316 7317 { 7318 // Copy input args, reversing as we go. We use Ra as a 7319 // temporary variable. 7320 reverse(Ra, Pa_base, Rlen, t0, t1); 7321 reverse(Ra, Pn_base, Rlen, t0, t1); 7322 } 7323 7324 // Push all call-saved registers and also Pm_base which we'll need 7325 // at the end. 
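// At this point Pa_base and Pn_base already point at the reversed copies in
// the on-stack scratch area (reverse() leaves its source register pointing at
// the copy) and Ra points just past them; Ra becomes Pm_base, the result
// area, once the callee-saved registers are safe.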
7326 save_regs(); 7327 7328 mov(Pm_base, Ra); 7329 7330 mov(t0, zr); 7331 mov(t1, zr); 7332 mov(t2, zr); 7333 7334 block_comment("for (int i = 0; i < len; i++) {"); 7335 mov(Ri, zr); { 7336 Label loop, end; 7337 bind(loop); 7338 cmp(Ri, Rlen); 7339 br(Assembler::GE, end); 7340 7341 pre1(Ri); 7342 7343 block_comment("for (j = (i+1)/2; j; j--) {"); { 7344 add(Rj, Ri, 1); 7345 lsr(Rj, Rj, 1); 7346 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 7347 } block_comment(" } // j"); 7348 7349 last_squaring(Ri); 7350 7351 block_comment(" for (j = i/2; j; j--) {"); { 7352 lsr(Rj, Ri, 1); 7353 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 7354 } block_comment(" } // j"); 7355 7356 post1_squaring(); 7357 add(Ri, Ri, 1); 7358 cmp(Ri, Rlen); 7359 br(Assembler::LT, loop); 7360 7361 bind(end); 7362 block_comment("} // i"); 7363 } 7364 7365 block_comment("for (int i = len; i < 2*len; i++) {"); 7366 mov(Ri, Rlen); { 7367 Label loop, end; 7368 bind(loop); 7369 cmp(Ri, Rlen, Assembler::LSL, 1); 7370 br(Assembler::GE, end); 7371 7372 pre2(Ri, Rlen); 7373 7374 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 7375 lsl(Rj, Rlen, 1); 7376 sub(Rj, Rj, Ri); 7377 sub(Rj, Rj, 1); 7378 lsr(Rj, Rj, 1); 7379 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 7380 } block_comment(" } // j"); 7381 7382 last_squaring(Ri); 7383 7384 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 7385 lsl(Rj, Rlen, 1); 7386 sub(Rj, Rj, Ri); 7387 lsr(Rj, Rj, 1); 7388 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 7389 } block_comment(" } // j"); 7390 7391 post2(Ri, Rlen); 7392 add(Ri, Ri, 1); 7393 cmp(Ri, Rlen, Assembler::LSL, 1); 7394 7395 br(Assembler::LT, loop); 7396 bind(end); 7397 block_comment("} // i"); 7398 } 7399 7400 normalize(Rlen); 7401 7402 mov(Ra, Pm_base); // Save Pm_base in Ra 7403 restore_regs(); // Restore caller's Pm_base 7404 7405 // Copy our result into caller's Pm_base 7406 reverse(Pm_base, Ra, Rlen, t0, t1); 7407 7408 leave(); 7409 ret(lr); 7410 7411 return entry; 7412 } 7413 // In C, approximately: 7414 7415 // void 7416 // montgomery_square(julong Pa_base[], julong Pn_base[], 7417 // julong Pm_base[], julong inv, int len) { 7418 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 7419 // julong *Pa, *Pb, *Pn, *Pm; 7420 // julong Ra, Rb, Rn, Rm; 7421 7422 // int i; 7423 7424 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 7425 7426 // for (i = 0; i < len; i++) { 7427 // int j; 7428 7429 // Pa = Pa_base; 7430 // Pb = Pa_base + i; 7431 // Pm = Pm_base; 7432 // Pn = Pn_base + i; 7433 7434 // Ra = *Pa; 7435 // Rb = *Pb; 7436 // Rm = *Pm; 7437 // Rn = *Pn; 7438 7439 // int iters = (i+1)/2; 7440 // for (j = 0; iters--; j++) { 7441 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 7442 // MACC2(Ra, Rb, t0, t1, t2); 7443 // Ra = *++Pa; 7444 // Rb = *--Pb; 7445 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7446 // MACC(Rm, Rn, t0, t1, t2); 7447 // Rm = *++Pm; 7448 // Rn = *--Pn; 7449 // } 7450 // if ((i & 1) == 0) { 7451 // assert(Ra == Pa_base[j], "must be"); 7452 // MACC(Ra, Ra, t0, t1, t2); 7453 // } 7454 // iters = i/2; 7455 // assert(iters == i-j, "must be"); 7456 // for (; iters--; j++) { 7457 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7458 // MACC(Rm, Rn, t0, t1, t2); 7459 // Rm = *++Pm; 7460 // Rn = *--Pn; 7461 // } 7462 7463 // *Pm = Rm = t0 * inv; 7464 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 7465 // MACC(Rm, Rn, t0, t1, t2); 7466 7467 // 
assert(t0 == 0, "broken Montgomery multiply"); 7468 7469 // t0 = t1; t1 = t2; t2 = 0; 7470 // } 7471 7472 // for (i = len; i < 2*len; i++) { 7473 // int start = i-len+1; 7474 // int end = start + (len - start)/2; 7475 // int j; 7476 7477 // Pa = Pa_base + i-len; 7478 // Pb = Pa_base + len; 7479 // Pm = Pm_base + i-len; 7480 // Pn = Pn_base + len; 7481 7482 // Ra = *++Pa; 7483 // Rb = *--Pb; 7484 // Rm = *++Pm; 7485 // Rn = *--Pn; 7486 7487 // int iters = (2*len-i-1)/2; 7488 // assert(iters == end-start, "must be"); 7489 // for (j = start; iters--; j++) { 7490 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 7491 // MACC2(Ra, Rb, t0, t1, t2); 7492 // Ra = *++Pa; 7493 // Rb = *--Pb; 7494 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7495 // MACC(Rm, Rn, t0, t1, t2); 7496 // Rm = *++Pm; 7497 // Rn = *--Pn; 7498 // } 7499 // if ((i & 1) == 0) { 7500 // assert(Ra == Pa_base[j], "must be"); 7501 // MACC(Ra, Ra, t0, t1, t2); 7502 // } 7503 // iters = (2*len-i)/2; 7504 // assert(iters == len-j, "must be"); 7505 // for (; iters--; j++) { 7506 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 7507 // MACC(Rm, Rn, t0, t1, t2); 7508 // Rm = *++Pm; 7509 // Rn = *--Pn; 7510 // } 7511 // Pm_base[i-len] = t0; 7512 // t0 = t1; t1 = t2; t2 = 0; 7513 // } 7514 7515 // while (t0) 7516 // t0 = sub(Pm_base, Pn_base, t0, len); 7517 // } 7518 }; 7519 7520 7521 // Initialization 7522 void generate_initial() { 7523 // Generate initial stubs and initializes the entry points 7524 7525 // entry points that exist in all platforms Note: This is code 7526 // that could be shared among different platforms - however the 7527 // benefit seems to be smaller than the disadvantage of having a 7528 // much more complicated generator structure. See also comment in 7529 // stubRoutines.hpp. 7530 7531 StubRoutines::_forward_exception_entry = generate_forward_exception(); 7532 7533 StubRoutines::_call_stub_entry = 7534 generate_call_stub(StubRoutines::_call_stub_return_address); 7535 7536 // is referenced by megamorphic call 7537 StubRoutines::_catch_exception_entry = generate_catch_exception(); 7538 7539 // Build this early so it's available for the interpreter. 7540 StubRoutines::_throw_StackOverflowError_entry = 7541 generate_throw_exception("StackOverflowError throw_exception", 7542 CAST_FROM_FN_PTR(address, 7543 SharedRuntime::throw_StackOverflowError)); 7544 StubRoutines::_throw_delayed_StackOverflowError_entry = 7545 generate_throw_exception("delayed StackOverflowError throw_exception", 7546 CAST_FROM_FN_PTR(address, 7547 SharedRuntime::throw_delayed_StackOverflowError)); 7548 if (UseCRC32Intrinsics) { 7549 // set table address before stub generation which use it 7550 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 7551 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 7552 } 7553 7554 if (UseCRC32CIntrinsics) { 7555 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 7556 } 7557 7558 // Disabled until JDK-8210858 is fixed 7559 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 7560 // StubRoutines::_dlog = generate_dlog(); 7561 // } 7562 7563 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 7564 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 7565 } 7566 7567 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 7568 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 7569 } 7570 7571 // Safefetch stubs. 
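// SafeFetch32/SafeFetchN load a word from an address that may be unmapped;
// if the load faults, the signal handler resumes execution at the recorded
// continuation PC, where the caller-supplied error value is returned instead.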
7572 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 7573 &StubRoutines::_safefetch32_fault_pc, 7574 &StubRoutines::_safefetch32_continuation_pc); 7575 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 7576 &StubRoutines::_safefetchN_fault_pc, 7577 &StubRoutines::_safefetchN_continuation_pc); 7578 } 7579 7580 void generate_all() { 7581 // support for verify_oop (must happen after universe_init) 7582 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 7583 StubRoutines::_throw_AbstractMethodError_entry = 7584 generate_throw_exception("AbstractMethodError throw_exception", 7585 CAST_FROM_FN_PTR(address, 7586 SharedRuntime:: 7587 throw_AbstractMethodError)); 7588 7589 StubRoutines::_throw_IncompatibleClassChangeError_entry = 7590 generate_throw_exception("IncompatibleClassChangeError throw_exception", 7591 CAST_FROM_FN_PTR(address, 7592 SharedRuntime:: 7593 throw_IncompatibleClassChangeError)); 7594 7595 StubRoutines::_throw_NullPointerException_at_call_entry = 7596 generate_throw_exception("NullPointerException at call throw_exception", 7597 CAST_FROM_FN_PTR(address, 7598 SharedRuntime:: 7599 throw_NullPointerException_at_call)); 7600 7601 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices"); 7602 7603 // arraycopy stubs used by compilers 7604 generate_arraycopy_stubs(); 7605 7606 // has negatives stub for large arrays. 7607 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 7608 7609 // array equals stub for large arrays. 7610 if (!UseSimpleArrayEquals) { 7611 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 7612 } 7613 7614 generate_compare_long_strings(); 7615 7616 generate_string_indexof_stubs(); 7617 7618 // byte_array_inflate stub for large arrays. 7619 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 7620 7621 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 7622 if (bs_nm != NULL) { 7623 StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier(); 7624 } 7625 if (UseFastLocking) { 7626 StubRoutines::aarch64::_check_lock_stack = generate_check_lock_stack(); 7627 } 7628 #ifdef COMPILER2 7629 if (UseMultiplyToLenIntrinsic) { 7630 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 7631 } 7632 7633 if (UseSquareToLenIntrinsic) { 7634 StubRoutines::_squareToLen = generate_squareToLen(); 7635 } 7636 7637 if (UseMulAddIntrinsic) { 7638 StubRoutines::_mulAdd = generate_mulAdd(); 7639 } 7640 7641 if (UseSIMDForBigIntegerShiftIntrinsics) { 7642 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 7643 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 7644 } 7645 7646 if (UseMontgomeryMultiplyIntrinsic) { 7647 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 7648 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 7649 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 7650 } 7651 7652 if (UseMontgomerySquareIntrinsic) { 7653 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 7654 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 7655 // We use generate_multiply() rather than generate_square() 7656 // because it's faster for the sizes of modulus we care about. 
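// (this generator was constructed with squaring == true, so Pb_base aliases
//  Pa_base and the second reverse-copy of b is skipped)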
7657 StubRoutines::_montgomerySquare = g.generate_multiply(); 7658 } 7659 #endif // COMPILER2 7660 7661 // generate GHASH intrinsics code 7662 if (UseGHASHIntrinsics) { 7663 if (UseAESCTRIntrinsics) { 7664 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 7665 } else { 7666 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 7667 } 7668 } 7669 7670 if (UseBASE64Intrinsics) { 7671 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 7672 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 7673 } 7674 7675 // data cache line writeback 7676 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 7677 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 7678 7679 if (UseAESIntrinsics) { 7680 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 7681 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 7682 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 7683 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 7684 } 7685 7686 if (UseAESCTRIntrinsics) { 7687 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 7688 } 7689 7690 if (UseMD5Intrinsics) { 7691 StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress"); 7692 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB"); 7693 } 7694 if (UseSHA1Intrinsics) { 7695 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 7696 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 7697 } 7698 if (UseSHA256Intrinsics) { 7699 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 7700 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 7701 } 7702 if (UseSHA512Intrinsics) { 7703 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); 7704 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); 7705 } 7706 if (UseSHA3Intrinsics) { 7707 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress"); 7708 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB"); 7709 } 7710 7711 // generate Adler32 intrinsics code 7712 if (UseAdler32Intrinsics) { 7713 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 7714 } 7715 7716 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 7717 7718 #ifdef LINUX 7719 7720 generate_atomic_entry_points(); 7721 7722 #endif // LINUX 7723 7724 StubRoutines::aarch64::set_completed(); 7725 } 7726 7727 public: 7728 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 7729 if (all) { 7730 generate_all(); 7731 } else { 7732 generate_initial(); 7733 } 7734 } 7735 }; // end class declaration 7736 7737 #define UCM_TABLE_MAX_ENTRIES 8 7738 void StubGenerator_generate(CodeBuffer* code, bool all) { 7739 if (UnsafeCopyMemory::_table == NULL) { 7740 UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES); 7741 } 7742 StubGenerator g(code, all); 7743 } 7744 7745 7746 #ifdef LINUX 7747 7748 // Define pointers to atomic stubs and initialize them to point to the 7749 // code in atomic_aarch64.S. 
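//
// Each DEFAULT_ATOMIC_OP invocation below declares the out-of-line default
// implementation (defined in atomic_aarch64.S) and defines a function pointer
// that initially refers to it.  For example, DEFAULT_ATOMIC_OP(fetch_add, 4, )
// expands, roughly, to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//       (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//       = aarch64_atomic_fetch_add_4_default_impl;
//
// generate_atomic_entry_points() above is then expected to repoint each
// *_impl pointer at the stub code it emits, so the defaults are only used
// until (or unless) those stubs are generated.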
7750 7751 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 7752 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 7753 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 7754 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 7755 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 7756 7757 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 7758 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 7759 DEFAULT_ATOMIC_OP(xchg, 4, ) 7760 DEFAULT_ATOMIC_OP(xchg, 8, ) 7761 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 7762 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 7763 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 7764 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 7765 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 7766 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 7767 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 7768 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 7769 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 7770 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 7771 7772 #undef DEFAULT_ATOMIC_OP 7773 7774 #endif // LINUX