1 /* 2 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved. 4 * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved. 5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 6 * 7 * This code is free software; you can redistribute it and/or modify it 8 * under the terms of the GNU General Public License version 2 only, as 9 * published by the Free Software Foundation. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 * 25 */ 26 27 #include "precompiled.hpp" 28 #include "asm/macroAssembler.hpp" 29 #include "asm/macroAssembler.inline.hpp" 30 #include "compiler/oopMap.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "interpreter/interpreter.hpp" 34 #include "memory/universe.hpp" 35 #include "nativeInst_riscv.hpp" 36 #include "oops/instanceOop.hpp" 37 #include "oops/method.hpp" 38 #include "oops/objArrayKlass.hpp" 39 #include "oops/oop.inline.hpp" 40 #include "prims/methodHandles.hpp" 41 #include "prims/upcallLinker.hpp" 42 #include "runtime/continuation.hpp" 43 #include "runtime/continuationEntry.inline.hpp" 44 #include "runtime/frame.inline.hpp" 45 #include "runtime/handles.inline.hpp" 46 #include "runtime/javaThread.hpp" 47 #include "runtime/sharedRuntime.hpp" 48 #include "runtime/stubCodeGenerator.hpp" 49 #include "runtime/stubRoutines.hpp" 50 #include "utilities/align.hpp" 51 #include "utilities/powerOfTwo.hpp" 52 #ifdef COMPILER2 53 #include "opto/runtime.hpp" 54 #endif 55 56 // Declaration and definition of StubGenerator (no .hpp file). 
57 // For a more detailed description of the stub routine structure 58 // see the comment in stubRoutines.hpp 59 60 #undef __ 61 #define __ _masm-> 62 63 #ifdef PRODUCT 64 #define BLOCK_COMMENT(str) /* nothing */ 65 #else 66 #define BLOCK_COMMENT(str) __ block_comment(str) 67 #endif 68 69 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 70 71 // Stub Code definitions 72 73 class StubGenerator: public StubCodeGenerator { 74 private: 75 76 #ifdef PRODUCT 77 #define inc_counter_np(counter) ((void)0) 78 #else 79 void inc_counter_np_(uint& counter) { 80 __ incrementw(ExternalAddress((address)&counter)); 81 } 82 #define inc_counter_np(counter) \ 83 BLOCK_COMMENT("inc_counter " #counter); \ 84 inc_counter_np_(counter); 85 #endif 86 87 // Call stubs are used to call Java from C 88 // 89 // Arguments: 90 // c_rarg0: call wrapper address address 91 // c_rarg1: result address 92 // c_rarg2: result type BasicType 93 // c_rarg3: method Method* 94 // c_rarg4: (interpreter) entry point address 95 // c_rarg5: parameters intptr_t* 96 // c_rarg6: parameter size (in words) int 97 // c_rarg7: thread Thread* 98 // 99 // There is no return from the stub itself as any Java result 100 // is written to result 101 // 102 // we save x1 (ra) as the return PC at the base of the frame and 103 // link x8 (fp) below it as the frame pointer installing sp (x2) 104 // into fp. 105 // 106 // we save x10-x17, which accounts for all the c arguments. 107 // 108 // TODO: strictly do we need to save them all? they are treated as 109 // volatile by C so could we omit saving the ones we are going to 110 // place in global registers (thread? method?) or those we only use 111 // during setup of the Java call? 112 // 113 // we don't need to save x5 which C uses as an indirect result location 114 // return register. 115 // 116 // we don't need to save x6-x7 and x28-x31 which both C and Java treat as 117 // volatile 118 // 119 // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary 120 // registers and C expects to be callee-save 121 // 122 // so the stub frame looks like this when we enter Java code 123 // 124 // [ return_from_Java ] <--- sp 125 // [ argument word n ] 126 // ... 
127 // -35 [ argument word 1 ] 128 // -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call 129 // -33 [ saved f27 ] 130 // -32 [ saved f26 ] 131 // -31 [ saved f25 ] 132 // -30 [ saved f24 ] 133 // -29 [ saved f23 ] 134 // -28 [ saved f22 ] 135 // -27 [ saved f21 ] 136 // -26 [ saved f20 ] 137 // -25 [ saved f19 ] 138 // -24 [ saved f18 ] 139 // -23 [ saved f9 ] 140 // -22 [ saved f8 ] 141 // -21 [ saved x27 ] 142 // -20 [ saved x26 ] 143 // -19 [ saved x25 ] 144 // -18 [ saved x24 ] 145 // -17 [ saved x23 ] 146 // -16 [ saved x22 ] 147 // -15 [ saved x21 ] 148 // -14 [ saved x20 ] 149 // -13 [ saved x19 ] 150 // -12 [ saved x18 ] 151 // -11 [ saved x9 ] 152 // -10 [ call wrapper (x10) ] 153 // -9 [ result (x11) ] 154 // -8 [ result type (x12) ] 155 // -7 [ method (x13) ] 156 // -6 [ entry point (x14) ] 157 // -5 [ parameters (x15) ] 158 // -4 [ parameter size (x16) ] 159 // -3 [ thread (x17) ] 160 // -2 [ saved fp (x8) ] 161 // -1 [ saved ra (x1) ] 162 // 0 [ ] <--- fp == saved sp (x2) 163 164 // Call stub stack layout word offsets from fp 165 enum call_stub_layout { 166 sp_after_call_off = -34, 167 168 frm_off = sp_after_call_off, 169 f27_off = -33, 170 f26_off = -32, 171 f25_off = -31, 172 f24_off = -30, 173 f23_off = -29, 174 f22_off = -28, 175 f21_off = -27, 176 f20_off = -26, 177 f19_off = -25, 178 f18_off = -24, 179 f9_off = -23, 180 f8_off = -22, 181 182 x27_off = -21, 183 x26_off = -20, 184 x25_off = -19, 185 x24_off = -18, 186 x23_off = -17, 187 x22_off = -16, 188 x21_off = -15, 189 x20_off = -14, 190 x19_off = -13, 191 x18_off = -12, 192 x9_off = -11, 193 194 call_wrapper_off = -10, 195 result_off = -9, 196 result_type_off = -8, 197 method_off = -7, 198 entry_point_off = -6, 199 parameters_off = -5, 200 parameter_size_off = -4, 201 thread_off = -3, 202 fp_f = -2, 203 retaddr_off = -1, 204 }; 205 206 address generate_call_stub(address& return_address) { 207 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 208 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 209 "adjust this code"); 210 211 StubCodeMark mark(this, "StubRoutines", "call_stub"); 212 address start = __ pc(); 213 214 const Address sp_after_call (fp, sp_after_call_off * wordSize); 215 216 const Address frm_save (fp, frm_off * wordSize); 217 const Address call_wrapper (fp, call_wrapper_off * wordSize); 218 const Address result (fp, result_off * wordSize); 219 const Address result_type (fp, result_type_off * wordSize); 220 const Address method (fp, method_off * wordSize); 221 const Address entry_point (fp, entry_point_off * wordSize); 222 const Address parameters (fp, parameters_off * wordSize); 223 const Address parameter_size(fp, parameter_size_off * wordSize); 224 225 const Address thread (fp, thread_off * wordSize); 226 227 const Address f27_save (fp, f27_off * wordSize); 228 const Address f26_save (fp, f26_off * wordSize); 229 const Address f25_save (fp, f25_off * wordSize); 230 const Address f24_save (fp, f24_off * wordSize); 231 const Address f23_save (fp, f23_off * wordSize); 232 const Address f22_save (fp, f22_off * wordSize); 233 const Address f21_save (fp, f21_off * wordSize); 234 const Address f20_save (fp, f20_off * wordSize); 235 const Address f19_save (fp, f19_off * wordSize); 236 const Address f18_save (fp, f18_off * wordSize); 237 const Address f9_save (fp, f9_off * wordSize); 238 const Address f8_save (fp, f8_off * wordSize); 239 240 const Address x27_save (fp, x27_off * wordSize); 241 const Address x26_save (fp, x26_off 
* wordSize); 242 const Address x25_save (fp, x25_off * wordSize); 243 const Address x24_save (fp, x24_off * wordSize); 244 const Address x23_save (fp, x23_off * wordSize); 245 const Address x22_save (fp, x22_off * wordSize); 246 const Address x21_save (fp, x21_off * wordSize); 247 const Address x20_save (fp, x20_off * wordSize); 248 const Address x19_save (fp, x19_off * wordSize); 249 const Address x18_save (fp, x18_off * wordSize); 250 251 const Address x9_save (fp, x9_off * wordSize); 252 253 // stub code 254 255 address riscv_entry = __ pc(); 256 257 // set up frame and move sp to end of save area 258 __ enter(); 259 __ addi(sp, fp, sp_after_call_off * wordSize); 260 261 // save register parameters and Java temporary/global registers 262 // n.b. we save thread even though it gets installed in 263 // xthread because we want to sanity check tp later 264 __ sd(c_rarg7, thread); 265 __ sw(c_rarg6, parameter_size); 266 __ sd(c_rarg5, parameters); 267 __ sd(c_rarg4, entry_point); 268 __ sd(c_rarg3, method); 269 __ sd(c_rarg2, result_type); 270 __ sd(c_rarg1, result); 271 __ sd(c_rarg0, call_wrapper); 272 273 __ sd(x9, x9_save); 274 275 __ sd(x18, x18_save); 276 __ sd(x19, x19_save); 277 __ sd(x20, x20_save); 278 __ sd(x21, x21_save); 279 __ sd(x22, x22_save); 280 __ sd(x23, x23_save); 281 __ sd(x24, x24_save); 282 __ sd(x25, x25_save); 283 __ sd(x26, x26_save); 284 __ sd(x27, x27_save); 285 286 __ fsd(f8, f8_save); 287 __ fsd(f9, f9_save); 288 __ fsd(f18, f18_save); 289 __ fsd(f19, f19_save); 290 __ fsd(f20, f20_save); 291 __ fsd(f21, f21_save); 292 __ fsd(f22, f22_save); 293 __ fsd(f23, f23_save); 294 __ fsd(f24, f24_save); 295 __ fsd(f25, f25_save); 296 __ fsd(f26, f26_save); 297 __ fsd(f27, f27_save); 298 299 __ frrm(t0); 300 __ sd(t0, frm_save); 301 // Set frm to the state we need. We do want Round to Nearest. We 302 // don't want non-IEEE rounding modes. 
303 Label skip_fsrmi; 304 guarantee(__ RoundingMode::rne == 0, "must be"); 305 __ beqz(t0, skip_fsrmi); 306 __ fsrmi(__ RoundingMode::rne); 307 __ bind(skip_fsrmi); 308 309 // install Java thread in global register now we have saved 310 // whatever value it held 311 __ mv(xthread, c_rarg7); 312 313 // And method 314 __ mv(xmethod, c_rarg3); 315 316 // set up the heapbase register 317 __ reinit_heapbase(); 318 319 #ifdef ASSERT 320 // make sure we have no pending exceptions 321 { 322 Label L; 323 __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset()))); 324 __ beqz(t0, L); 325 __ stop("StubRoutines::call_stub: entered with pending exception"); 326 __ BIND(L); 327 } 328 #endif 329 // pass parameters if any 330 __ mv(esp, sp); 331 __ slli(t0, c_rarg6, LogBytesPerWord); 332 __ sub(t0, sp, t0); // Move SP out of the way 333 __ andi(sp, t0, -2 * wordSize); 334 335 BLOCK_COMMENT("pass parameters if any"); 336 Label parameters_done; 337 // parameter count is still in c_rarg6 338 // and parameter pointer identifying param 1 is in c_rarg5 339 __ beqz(c_rarg6, parameters_done); 340 341 address loop = __ pc(); 342 __ ld(t0, Address(c_rarg5, 0)); 343 __ addi(c_rarg5, c_rarg5, wordSize); 344 __ addi(c_rarg6, c_rarg6, -1); 345 __ push_reg(t0); 346 __ bgtz(c_rarg6, loop); 347 348 __ BIND(parameters_done); 349 350 // call Java entry -- passing methdoOop, and current sp 351 // xmethod: Method* 352 // x19_sender_sp: sender sp 353 BLOCK_COMMENT("call Java function"); 354 __ mv(x19_sender_sp, sp); 355 __ jalr(c_rarg4); 356 357 // save current address for use by exception handling code 358 359 return_address = __ pc(); 360 361 // store result depending on type (everything that is not 362 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 363 // n.b. 
this assumes Java returns an integral result in x10 364 // and a floating result in j_farg0 365 __ ld(j_rarg2, result); 366 Label is_long, is_float, is_double, exit; 367 __ ld(j_rarg1, result_type); 368 __ mv(t0, (u1)T_OBJECT); 369 __ beq(j_rarg1, t0, is_long); 370 __ mv(t0, (u1)T_LONG); 371 __ beq(j_rarg1, t0, is_long); 372 __ mv(t0, (u1)T_FLOAT); 373 __ beq(j_rarg1, t0, is_float); 374 __ mv(t0, (u1)T_DOUBLE); 375 __ beq(j_rarg1, t0, is_double); 376 377 // handle T_INT case 378 __ sw(x10, Address(j_rarg2)); 379 380 __ BIND(exit); 381 382 // pop parameters 383 __ addi(esp, fp, sp_after_call_off * wordSize); 384 385 #ifdef ASSERT 386 // verify that threads correspond 387 { 388 Label L, S; 389 __ ld(t0, thread); 390 __ bne(xthread, t0, S); 391 __ get_thread(t0); 392 __ beq(xthread, t0, L); 393 __ BIND(S); 394 __ stop("StubRoutines::call_stub: threads must correspond"); 395 __ BIND(L); 396 } 397 #endif 398 399 __ pop_cont_fastpath(xthread); 400 401 // restore callee-save registers 402 __ fld(f27, f27_save); 403 __ fld(f26, f26_save); 404 __ fld(f25, f25_save); 405 __ fld(f24, f24_save); 406 __ fld(f23, f23_save); 407 __ fld(f22, f22_save); 408 __ fld(f21, f21_save); 409 __ fld(f20, f20_save); 410 __ fld(f19, f19_save); 411 __ fld(f18, f18_save); 412 __ fld(f9, f9_save); 413 __ fld(f8, f8_save); 414 415 __ ld(x27, x27_save); 416 __ ld(x26, x26_save); 417 __ ld(x25, x25_save); 418 __ ld(x24, x24_save); 419 __ ld(x23, x23_save); 420 __ ld(x22, x22_save); 421 __ ld(x21, x21_save); 422 __ ld(x20, x20_save); 423 __ ld(x19, x19_save); 424 __ ld(x18, x18_save); 425 426 __ ld(x9, x9_save); 427 428 // restore frm 429 Label skip_fsrm; 430 __ ld(t0, frm_save); 431 __ frrm(t1); 432 __ beq(t0, t1, skip_fsrm); 433 __ fsrm(t0); 434 __ bind(skip_fsrm); 435 436 __ ld(c_rarg0, call_wrapper); 437 __ ld(c_rarg1, result); 438 __ ld(c_rarg2, result_type); 439 __ ld(c_rarg3, method); 440 __ ld(c_rarg4, entry_point); 441 __ ld(c_rarg5, parameters); 442 __ ld(c_rarg6, parameter_size); 443 __ ld(c_rarg7, thread); 444 445 // leave frame and return to caller 446 __ leave(); 447 __ ret(); 448 449 // handle return types different from T_INT 450 451 __ BIND(is_long); 452 __ sd(x10, Address(j_rarg2, 0)); 453 __ j(exit); 454 455 __ BIND(is_float); 456 __ fsw(j_farg0, Address(j_rarg2, 0), t0); 457 __ j(exit); 458 459 __ BIND(is_double); 460 __ fsd(j_farg0, Address(j_rarg2, 0), t0); 461 __ j(exit); 462 463 return start; 464 } 465 466 // Return point for a Java call if there's an exception thrown in 467 // Java code. The exception is caught and transformed into a 468 // pending exception stored in JavaThread that can be tested from 469 // within the VM. 470 // 471 // Note: Usually the parameters are removed by the callee. In case 472 // of an exception crossing an activation frame boundary, that is 473 // not the case if the callee is compiled code => need to setup the 474 // sp. 
475 // 476 // x10: exception oop 477 478 address generate_catch_exception() { 479 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 480 address start = __ pc(); 481 482 // same as in generate_call_stub(): 483 const Address thread(fp, thread_off * wordSize); 484 485 #ifdef ASSERT 486 // verify that threads correspond 487 { 488 Label L, S; 489 __ ld(t0, thread); 490 __ bne(xthread, t0, S); 491 __ get_thread(t0); 492 __ beq(xthread, t0, L); 493 __ bind(S); 494 __ stop("StubRoutines::catch_exception: threads must correspond"); 495 __ bind(L); 496 } 497 #endif 498 499 // set pending exception 500 __ verify_oop(x10); 501 502 __ sd(x10, Address(xthread, Thread::pending_exception_offset())); 503 __ mv(t0, (address)__FILE__); 504 __ sd(t0, Address(xthread, Thread::exception_file_offset())); 505 __ mv(t0, (int)__LINE__); 506 __ sw(t0, Address(xthread, Thread::exception_line_offset())); 507 508 // complete return to VM 509 assert(StubRoutines::_call_stub_return_address != nullptr, 510 "_call_stub_return_address must have been generated before"); 511 __ j(StubRoutines::_call_stub_return_address); 512 513 return start; 514 } 515 516 // Continuation point for runtime calls returning with a pending 517 // exception. The pending exception check happened in the runtime 518 // or native call stub. The pending exception in Thread is 519 // converted into a Java-level exception. 520 // 521 // Contract with Java-level exception handlers: 522 // x10: exception 523 // x13: throwing pc 524 // 525 // NOTE: At entry of this stub, exception-pc must be in RA !! 526 527 // NOTE: this is always used as a jump target within generated code 528 // so it just needs to be generated code with no x86 prolog 529 530 address generate_forward_exception() { 531 StubCodeMark mark(this, "StubRoutines", "forward exception"); 532 address start = __ pc(); 533 534 // Upon entry, RA points to the return address returning into 535 // Java (interpreted or compiled) code; i.e., the return address 536 // becomes the throwing pc. 537 // 538 // Arguments pushed before the runtime call are still on the stack 539 // but the exception handler will reset the stack pointer -> 540 // ignore them. A potential result in registers can be ignored as 541 // well. 542 543 #ifdef ASSERT 544 // make sure this code is only executed if there is a pending exception 545 { 546 Label L; 547 __ ld(t0, Address(xthread, Thread::pending_exception_offset())); 548 __ bnez(t0, L); 549 __ stop("StubRoutines::forward exception: no pending exception (1)"); 550 __ bind(L); 551 } 552 #endif 553 554 // compute exception handler into x9 555 556 // call the VM to find the handler address associated with the 557 // caller address. pass thread in x10 and caller pc (ret address) 558 // in x11. n.b. the caller pc is in ra, unlike x86 where it is on 559 // the stack. 560 __ mv(c_rarg1, ra); 561 // ra will be trashed by the VM call so we move it to x9 562 // (callee-saved) because we also need to pass it to the handler 563 // returned by this call. 564 __ mv(x9, ra); 565 BLOCK_COMMENT("call exception_handler_for_return_address"); 566 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 567 SharedRuntime::exception_handler_for_return_address), 568 xthread, c_rarg1); 569 // we should not really care that ra is no longer the callee 570 // address. we saved the value the handler needs in x9 so we can 571 // just copy it to x13. 
however, the C2 handler will push its own 572 // frame and then calls into the VM and the VM code asserts that 573 // the PC for the frame above the handler belongs to a compiled 574 // Java method. So, we restore ra here to satisfy that assert. 575 __ mv(ra, x9); 576 // setup x10 & x13 & clear pending exception 577 __ mv(x13, x9); 578 __ mv(x9, x10); 579 __ ld(x10, Address(xthread, Thread::pending_exception_offset())); 580 __ sd(zr, Address(xthread, Thread::pending_exception_offset())); 581 582 #ifdef ASSERT 583 // make sure exception is set 584 { 585 Label L; 586 __ bnez(x10, L); 587 __ stop("StubRoutines::forward exception: no pending exception (2)"); 588 __ bind(L); 589 } 590 #endif 591 592 // continue at exception handler 593 // x10: exception 594 // x13: throwing pc 595 // x9: exception handler 596 __ verify_oop(x10); 597 __ jr(x9); 598 599 return start; 600 } 601 602 // Non-destructive plausibility checks for oops 603 // 604 // Arguments: 605 // x10: oop to verify 606 // t0: error message 607 // 608 // Stack after saving c_rarg3: 609 // [tos + 0]: saved c_rarg3 610 // [tos + 1]: saved c_rarg2 611 // [tos + 2]: saved ra 612 // [tos + 3]: saved t1 613 // [tos + 4]: saved x10 614 // [tos + 5]: saved t0 615 address generate_verify_oop() { 616 617 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 618 address start = __ pc(); 619 620 Label exit, error; 621 622 __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3 623 624 __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 625 __ ld(c_rarg3, Address(c_rarg2)); 626 __ add(c_rarg3, c_rarg3, 1); 627 __ sd(c_rarg3, Address(c_rarg2)); 628 629 // object is in x10 630 // make sure object is 'reasonable' 631 __ beqz(x10, exit); // if obj is null it is OK 632 633 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 634 bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error); 635 636 // return if everything seems ok 637 __ bind(exit); 638 639 __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3 640 __ ret(); 641 642 // handle errors 643 __ bind(error); 644 __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3 645 646 __ push_reg(RegSet::range(x0, x31), sp); 647 // debug(char* msg, int64_t pc, int64_t regs[]) 648 __ mv(c_rarg0, t0); // pass address of error message 649 __ mv(c_rarg1, ra); // pass return address 650 __ mv(c_rarg2, sp); // pass address of regs on stack 651 #ifndef PRODUCT 652 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 653 #endif 654 BLOCK_COMMENT("call MacroAssembler::debug"); 655 __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 656 __ ebreak(); 657 658 return start; 659 } 660 661 // The inner part of zero_words(). 662 // 663 // Inputs: 664 // x28: the HeapWord-aligned base address of an array to zero. 665 // x29: the count in HeapWords, x29 > 0. 666 // 667 // Returns x28 and x29, adjusted for the caller to clear. 668 // x28: the base address of the tail of words left to clear. 669 // x29: the number of words in the tail. 670 // x29 < MacroAssembler::zero_words_block_size. 671 672 address generate_zero_blocks() { 673 Label done; 674 675 const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31; 676 677 __ align(CodeEntryAlignment); 678 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 679 address start = __ pc(); 680 681 if (UseBlockZeroing) { 682 // Ensure count >= 2*CacheLineSize so that it still deserves a cbo.zero 683 // after alignment. 
684 Label small; 685 int low_limit = MAX2(2 * CacheLineSize, BlockZeroingLowLimit) / wordSize; 686 __ mv(tmp1, low_limit); 687 __ blt(cnt, tmp1, small); 688 __ zero_dcache_blocks(base, cnt, tmp1, tmp2); 689 __ bind(small); 690 } 691 692 { 693 // Clear the remaining blocks. 694 Label loop; 695 __ mv(tmp1, MacroAssembler::zero_words_block_size); 696 __ blt(cnt, tmp1, done); 697 __ bind(loop); 698 for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) { 699 __ sd(zr, Address(base, i * wordSize)); 700 } 701 __ add(base, base, MacroAssembler::zero_words_block_size * wordSize); 702 __ sub(cnt, cnt, MacroAssembler::zero_words_block_size); 703 __ bge(cnt, tmp1, loop); 704 __ bind(done); 705 } 706 707 __ ret(); 708 709 return start; 710 } 711 712 typedef enum { 713 copy_forwards = 1, 714 copy_backwards = -1 715 } copy_direction; 716 717 // Bulk copy of blocks of 8 words. 718 // 719 // count is a count of words. 720 // 721 // Precondition: count >= 8 722 // 723 // Postconditions: 724 // 725 // The least significant bit of count contains the remaining count 726 // of words to copy. The rest of count is trash. 727 // 728 // s and d are adjusted to point to the remaining words to copy 729 // 730 void generate_copy_longs(Label &start, Register s, Register d, Register count, 731 copy_direction direction) { 732 int unit = wordSize * direction; 733 int bias = wordSize; 734 735 const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16, 736 tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29; 737 738 const Register stride = x30; 739 740 assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3, 741 tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7); 742 assert_different_registers(s, d, count, t0); 743 744 Label again, drain; 745 const char* stub_name = nullptr; 746 if (direction == copy_forwards) { 747 stub_name = "forward_copy_longs"; 748 } else { 749 stub_name = "backward_copy_longs"; 750 } 751 StubCodeMark mark(this, "StubRoutines", stub_name); 752 __ align(CodeEntryAlignment); 753 __ bind(start); 754 755 if (direction == copy_forwards) { 756 __ sub(s, s, bias); 757 __ sub(d, d, bias); 758 } 759 760 #ifdef ASSERT 761 // Make sure we are never given < 8 words 762 { 763 Label L; 764 765 __ mv(t0, 8); 766 __ bge(count, t0, L); 767 __ stop("genrate_copy_longs called with < 8 words"); 768 __ bind(L); 769 } 770 #endif 771 772 __ ld(tmp_reg0, Address(s, 1 * unit)); 773 __ ld(tmp_reg1, Address(s, 2 * unit)); 774 __ ld(tmp_reg2, Address(s, 3 * unit)); 775 __ ld(tmp_reg3, Address(s, 4 * unit)); 776 __ ld(tmp_reg4, Address(s, 5 * unit)); 777 __ ld(tmp_reg5, Address(s, 6 * unit)); 778 __ ld(tmp_reg6, Address(s, 7 * unit)); 779 __ ld(tmp_reg7, Address(s, 8 * unit)); 780 __ addi(s, s, 8 * unit); 781 782 __ sub(count, count, 16); 783 __ bltz(count, drain); 784 785 __ bind(again); 786 787 __ sd(tmp_reg0, Address(d, 1 * unit)); 788 __ sd(tmp_reg1, Address(d, 2 * unit)); 789 __ sd(tmp_reg2, Address(d, 3 * unit)); 790 __ sd(tmp_reg3, Address(d, 4 * unit)); 791 __ sd(tmp_reg4, Address(d, 5 * unit)); 792 __ sd(tmp_reg5, Address(d, 6 * unit)); 793 __ sd(tmp_reg6, Address(d, 7 * unit)); 794 __ sd(tmp_reg7, Address(d, 8 * unit)); 795 796 __ ld(tmp_reg0, Address(s, 1 * unit)); 797 __ ld(tmp_reg1, Address(s, 2 * unit)); 798 __ ld(tmp_reg2, Address(s, 3 * unit)); 799 __ ld(tmp_reg3, Address(s, 4 * unit)); 800 __ ld(tmp_reg4, Address(s, 5 * unit)); 801 __ ld(tmp_reg5, Address(s, 6 * unit)); 802 __ ld(tmp_reg6, Address(s, 7 * unit)); 803 __ ld(tmp_reg7, Address(s, 8 * unit)); 804 805 __ addi(s, 
s, 8 * unit); 806 __ addi(d, d, 8 * unit); 807 808 __ sub(count, count, 8); 809 __ bgez(count, again); 810 811 // Drain 812 __ bind(drain); 813 814 __ sd(tmp_reg0, Address(d, 1 * unit)); 815 __ sd(tmp_reg1, Address(d, 2 * unit)); 816 __ sd(tmp_reg2, Address(d, 3 * unit)); 817 __ sd(tmp_reg3, Address(d, 4 * unit)); 818 __ sd(tmp_reg4, Address(d, 5 * unit)); 819 __ sd(tmp_reg5, Address(d, 6 * unit)); 820 __ sd(tmp_reg6, Address(d, 7 * unit)); 821 __ sd(tmp_reg7, Address(d, 8 * unit)); 822 __ addi(d, d, 8 * unit); 823 824 { 825 Label L1, L2; 826 __ test_bit(t0, count, 2); 827 __ beqz(t0, L1); 828 829 __ ld(tmp_reg0, Address(s, 1 * unit)); 830 __ ld(tmp_reg1, Address(s, 2 * unit)); 831 __ ld(tmp_reg2, Address(s, 3 * unit)); 832 __ ld(tmp_reg3, Address(s, 4 * unit)); 833 __ addi(s, s, 4 * unit); 834 835 __ sd(tmp_reg0, Address(d, 1 * unit)); 836 __ sd(tmp_reg1, Address(d, 2 * unit)); 837 __ sd(tmp_reg2, Address(d, 3 * unit)); 838 __ sd(tmp_reg3, Address(d, 4 * unit)); 839 __ addi(d, d, 4 * unit); 840 841 __ bind(L1); 842 843 if (direction == copy_forwards) { 844 __ addi(s, s, bias); 845 __ addi(d, d, bias); 846 } 847 848 __ test_bit(t0, count, 1); 849 __ beqz(t0, L2); 850 if (direction == copy_backwards) { 851 __ addi(s, s, 2 * unit); 852 __ ld(tmp_reg0, Address(s)); 853 __ ld(tmp_reg1, Address(s, wordSize)); 854 __ addi(d, d, 2 * unit); 855 __ sd(tmp_reg0, Address(d)); 856 __ sd(tmp_reg1, Address(d, wordSize)); 857 } else { 858 __ ld(tmp_reg0, Address(s)); 859 __ ld(tmp_reg1, Address(s, wordSize)); 860 __ addi(s, s, 2 * unit); 861 __ sd(tmp_reg0, Address(d)); 862 __ sd(tmp_reg1, Address(d, wordSize)); 863 __ addi(d, d, 2 * unit); 864 } 865 __ bind(L2); 866 } 867 868 __ ret(); 869 } 870 871 Label copy_f, copy_b; 872 873 typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp); 874 875 void copy_memory_v(Register s, Register d, Register count, int step) { 876 bool is_backward = step < 0; 877 int granularity = uabs(step); 878 879 const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17; 880 assert_different_registers(s, d, cnt, vl, tmp1, tmp2); 881 Assembler::SEW sew = Assembler::elembytes_to_sew(granularity); 882 Label loop_forward, loop_backward, done; 883 884 __ mv(dst, d); 885 __ mv(src, s); 886 __ mv(cnt, count); 887 888 __ bind(loop_forward); 889 __ vsetvli(vl, cnt, sew, Assembler::m8); 890 if (is_backward) { 891 __ bne(vl, cnt, loop_backward); 892 } 893 894 __ vlex_v(v0, src, sew); 895 __ sub(cnt, cnt, vl); 896 if (sew != Assembler::e8) { 897 // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary 898 __ slli(vl, vl, sew); 899 } 900 __ add(src, src, vl); 901 902 __ vsex_v(v0, dst, sew); 903 __ add(dst, dst, vl); 904 __ bnez(cnt, loop_forward); 905 906 if (is_backward) { 907 __ j(done); 908 909 __ bind(loop_backward); 910 __ sub(t0, cnt, vl); 911 if (sew != Assembler::e8) { 912 // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary 913 __ slli(t0, t0, sew); 914 } 915 __ add(tmp1, s, t0); 916 __ vlex_v(v0, tmp1, sew); 917 __ add(tmp2, d, t0); 918 __ vsex_v(v0, tmp2, sew); 919 __ sub(cnt, cnt, vl); 920 __ bnez(cnt, loop_forward); 921 __ bind(done); 922 } 923 } 924 925 // All-singing all-dancing memory copy. 926 // 927 // Copy count units of memory from s to d. The size of a unit is 928 // step, which can be positive or negative depending on the direction 929 // of copy. 
930 // 931 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 932 Register s, Register d, Register count, int step) { 933 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 934 if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) { 935 return copy_memory_v(s, d, count, step); 936 } 937 938 bool is_backwards = step < 0; 939 int granularity = uabs(step); 940 941 const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13; 942 const Register gct1 = x28, gct2 = x29, gct3 = t2; 943 944 Label same_aligned; 945 Label copy_big, copy32_loop, copy8_loop, copy_small, done; 946 947 // The size of copy32_loop body increases significantly with ZGC GC barriers. 948 // Need conditional far branches to reach a point beyond the loop in this case. 949 bool is_far = UseZGC && ZGenerational; 950 951 __ beqz(count, done, is_far); 952 __ slli(cnt, count, exact_log2(granularity)); 953 if (is_backwards) { 954 __ add(src, s, cnt); 955 __ add(dst, d, cnt); 956 } else { 957 __ mv(src, s); 958 __ mv(dst, d); 959 } 960 961 if (is_aligned) { 962 __ addi(t0, cnt, -32); 963 __ bgez(t0, copy32_loop); 964 __ addi(t0, cnt, -8); 965 __ bgez(t0, copy8_loop, is_far); 966 __ j(copy_small); 967 } else { 968 __ mv(t0, 16); 969 __ blt(cnt, t0, copy_small, is_far); 970 971 __ xorr(t0, src, dst); 972 __ andi(t0, t0, 0b111); 973 __ bnez(t0, copy_small, is_far); 974 975 __ bind(same_aligned); 976 __ andi(t0, src, 0b111); 977 __ beqz(t0, copy_big); 978 if (is_backwards) { 979 __ addi(src, src, step); 980 __ addi(dst, dst, step); 981 } 982 bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1); 983 bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3); 984 if (!is_backwards) { 985 __ addi(src, src, step); 986 __ addi(dst, dst, step); 987 } 988 __ addi(cnt, cnt, -granularity); 989 __ beqz(cnt, done, is_far); 990 __ j(same_aligned); 991 992 __ bind(copy_big); 993 __ mv(t0, 32); 994 __ blt(cnt, t0, copy8_loop, is_far); 995 } 996 997 __ bind(copy32_loop); 998 if (is_backwards) { 999 __ addi(src, src, -wordSize * 4); 1000 __ addi(dst, dst, -wordSize * 4); 1001 } 1002 // we first load 32 bytes, then write it, so the direction here doesn't matter 1003 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1); 1004 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8), gct1); 1005 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1); 1006 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1); 1007 1008 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3); 1009 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8), tmp4, gct1, gct2, gct3); 1010 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3); 1011 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3); 1012 1013 if (!is_backwards) { 1014 __ addi(src, src, wordSize * 4); 1015 __ addi(dst, dst, wordSize * 4); 1016 } 1017 __ addi(t0, cnt, -(32 + wordSize * 4)); 1018 __ addi(cnt, cnt, -wordSize * 4); 1019 __ bgez(t0, copy32_loop); // cnt >= 32, do next loop 1020 1021 __ beqz(cnt, done); // if that's all - done 1022 1023 __ addi(t0, cnt, -8); // if not - copy the reminder 1024 __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop 1025 1026 __ bind(copy8_loop); 1027 if 
(is_backwards) { 1028 __ addi(src, src, -wordSize); 1029 __ addi(dst, dst, -wordSize); 1030 } 1031 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1); 1032 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3); 1033 1034 if (!is_backwards) { 1035 __ addi(src, src, wordSize); 1036 __ addi(dst, dst, wordSize); 1037 } 1038 __ addi(t0, cnt, -(8 + wordSize)); 1039 __ addi(cnt, cnt, -wordSize); 1040 __ bgez(t0, copy8_loop); // cnt >= 8, do next loop 1041 1042 __ beqz(cnt, done); // if that's all - done 1043 1044 __ bind(copy_small); 1045 if (is_backwards) { 1046 __ addi(src, src, step); 1047 __ addi(dst, dst, step); 1048 } 1049 1050 bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1); 1051 bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3); 1052 1053 if (!is_backwards) { 1054 __ addi(src, src, step); 1055 __ addi(dst, dst, step); 1056 } 1057 __ addi(cnt, cnt, -granularity); 1058 __ bgtz(cnt, copy_small); 1059 1060 __ bind(done); 1061 } 1062 1063 // Scan over array at a for count oops, verifying each one. 1064 // Preserves a and count, clobbers t0 and t1. 1065 void verify_oop_array(size_t size, Register a, Register count, Register temp) { 1066 Label loop, end; 1067 __ mv(t1, zr); 1068 __ slli(t0, count, exact_log2(size)); 1069 __ bind(loop); 1070 __ bgeu(t1, t0, end); 1071 1072 __ add(temp, a, t1); 1073 if (size == (size_t)wordSize) { 1074 __ ld(temp, Address(temp, 0)); 1075 __ verify_oop(temp); 1076 } else { 1077 __ lwu(temp, Address(temp, 0)); 1078 __ decode_heap_oop(temp); // calls verify_oop 1079 } 1080 __ add(t1, t1, size); 1081 __ j(loop); 1082 __ bind(end); 1083 } 1084 1085 // Arguments: 1086 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1087 // ignored 1088 // is_oop - true => oop array, so generate store check code 1089 // name - stub name string 1090 // 1091 // Inputs: 1092 // c_rarg0 - source array address 1093 // c_rarg1 - destination array address 1094 // c_rarg2 - element count, treated as ssize_t, can be zero 1095 // 1096 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1097 // the hardware handle it. The two dwords within qwords that span 1098 // cache line boundaries will still be loaded and stored atomically. 1099 // 1100 // Side Effects: 1101 // disjoint_int_copy_entry is set to the no-overlap entry point 1102 // used by generate_conjoint_int_oop_copy(). 
1103 // 1104 address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address* entry, 1105 const char* name, bool dest_uninitialized = false) { 1106 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1107 RegSet saved_reg = RegSet::of(s, d, count); 1108 __ align(CodeEntryAlignment); 1109 StubCodeMark mark(this, "StubRoutines", name); 1110 address start = __ pc(); 1111 __ enter(); 1112 1113 if (entry != nullptr) { 1114 *entry = __ pc(); 1115 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1116 BLOCK_COMMENT("Entry:"); 1117 } 1118 1119 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1120 if (dest_uninitialized) { 1121 decorators |= IS_DEST_UNINITIALIZED; 1122 } 1123 if (aligned) { 1124 decorators |= ARRAYCOPY_ALIGNED; 1125 } 1126 1127 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1128 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1129 1130 if (is_oop) { 1131 // save regs before copy_memory 1132 __ push_reg(RegSet::of(d, count), sp); 1133 } 1134 1135 { 1136 // UnsafeMemoryAccess page error: continue after unsafe access 1137 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1138 UnsafeMemoryAccessMark umam(this, add_entry, true); 1139 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1140 } 1141 1142 if (is_oop) { 1143 __ pop_reg(RegSet::of(d, count), sp); 1144 if (VerifyOops) { 1145 verify_oop_array(size, d, count, t2); 1146 } 1147 } 1148 1149 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet()); 1150 1151 __ leave(); 1152 __ mv(x10, zr); // return 0 1153 __ ret(); 1154 return start; 1155 } 1156 1157 // Arguments: 1158 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1159 // ignored 1160 // is_oop - true => oop array, so generate store check code 1161 // name - stub name string 1162 // 1163 // Inputs: 1164 // c_rarg0 - source array address 1165 // c_rarg1 - destination array address 1166 // c_rarg2 - element count, treated as ssize_t, can be zero 1167 // 1168 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1169 // the hardware handle it. The two dwords within qwords that span 1170 // cache line boundaries will still be loaded and stored atomically. 
1171 // 1172 address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target, 1173 address* entry, const char* name, 1174 bool dest_uninitialized = false) { 1175 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1176 RegSet saved_regs = RegSet::of(s, d, count); 1177 StubCodeMark mark(this, "StubRoutines", name); 1178 address start = __ pc(); 1179 __ enter(); 1180 1181 if (entry != nullptr) { 1182 *entry = __ pc(); 1183 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1184 BLOCK_COMMENT("Entry:"); 1185 } 1186 1187 // use fwd copy when (d-s) above_equal (count*size) 1188 __ sub(t0, d, s); 1189 __ slli(t1, count, exact_log2(size)); 1190 Label L_continue; 1191 __ bltu(t0, t1, L_continue); 1192 __ j(nooverlap_target); 1193 __ bind(L_continue); 1194 1195 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1196 if (dest_uninitialized) { 1197 decorators |= IS_DEST_UNINITIALIZED; 1198 } 1199 if (aligned) { 1200 decorators |= ARRAYCOPY_ALIGNED; 1201 } 1202 1203 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1204 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1205 1206 if (is_oop) { 1207 // save regs before copy_memory 1208 __ push_reg(RegSet::of(d, count), sp); 1209 } 1210 1211 { 1212 // UnsafeMemoryAccess page error: continue after unsafe access 1213 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1214 UnsafeMemoryAccessMark umam(this, add_entry, true); 1215 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1216 } 1217 1218 if (is_oop) { 1219 __ pop_reg(RegSet::of(d, count), sp); 1220 if (VerifyOops) { 1221 verify_oop_array(size, d, count, t2); 1222 } 1223 } 1224 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet()); 1225 __ leave(); 1226 __ mv(x10, zr); // return 0 1227 __ ret(); 1228 return start; 1229 } 1230 1231 // Arguments: 1232 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1233 // ignored 1234 // name - stub name string 1235 // 1236 // Inputs: 1237 // c_rarg0 - source array address 1238 // c_rarg1 - destination array address 1239 // c_rarg2 - element count, treated as ssize_t, can be zero 1240 // 1241 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1242 // we let the hardware handle it. The one to eight bytes within words, 1243 // dwords or qwords that span cache line boundaries will still be loaded 1244 // and stored atomically. 1245 // 1246 // Side Effects: 1247 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1248 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1249 // we let the hardware handle it. The one to eight bytes within words, 1250 // dwords or qwords that span cache line boundaries will still be loaded 1251 // and stored atomically. 1252 // 1253 // Side Effects: 1254 // disjoint_byte_copy_entry is set to the no-overlap entry point 1255 // used by generate_conjoint_byte_copy(). 
1256 // 1257 address generate_disjoint_byte_copy(bool aligned, address* entry, const char* name) { 1258 const bool not_oop = false; 1259 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1260 } 1261 1262 // Arguments: 1263 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1264 // ignored 1265 // name - stub name string 1266 // 1267 // Inputs: 1268 // c_rarg0 - source array address 1269 // c_rarg1 - destination array address 1270 // c_rarg2 - element count, treated as ssize_t, can be zero 1271 // 1272 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1273 // we let the hardware handle it. The one to eight bytes within words, 1274 // dwords or qwords that span cache line boundaries will still be loaded 1275 // and stored atomically. 1276 // 1277 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1278 address* entry, const char* name) { 1279 const bool not_oop = false; 1280 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1281 } 1282 1283 // Arguments: 1284 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1285 // ignored 1286 // name - stub name string 1287 // 1288 // Inputs: 1289 // c_rarg0 - source array address 1290 // c_rarg1 - destination array address 1291 // c_rarg2 - element count, treated as ssize_t, can be zero 1292 // 1293 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1294 // let the hardware handle it. The two or four words within dwords 1295 // or qwords that span cache line boundaries will still be loaded 1296 // and stored atomically. 1297 // 1298 // Side Effects: 1299 // disjoint_short_copy_entry is set to the no-overlap entry point 1300 // used by generate_conjoint_short_copy(). 1301 // 1302 address generate_disjoint_short_copy(bool aligned, 1303 address* entry, const char* name) { 1304 const bool not_oop = false; 1305 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1306 } 1307 1308 // Arguments: 1309 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1310 // ignored 1311 // name - stub name string 1312 // 1313 // Inputs: 1314 // c_rarg0 - source array address 1315 // c_rarg1 - destination array address 1316 // c_rarg2 - element count, treated as ssize_t, can be zero 1317 // 1318 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1319 // let the hardware handle it. The two or four words within dwords 1320 // or qwords that span cache line boundaries will still be loaded 1321 // and stored atomically. 1322 // 1323 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1324 address* entry, const char* name) { 1325 const bool not_oop = false; 1326 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1327 } 1328 1329 // Arguments: 1330 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1331 // ignored 1332 // name - stub name string 1333 // 1334 // Inputs: 1335 // c_rarg0 - source array address 1336 // c_rarg1 - destination array address 1337 // c_rarg2 - element count, treated as ssize_t, can be zero 1338 // 1339 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1340 // the hardware handle it. The two dwords within qwords that span 1341 // cache line boundaries will still be loaded and stored atomically. 
1342 // 1343 // Side Effects: 1344 // disjoint_int_copy_entry is set to the no-overlap entry point 1345 // used by generate_conjoint_int_oop_copy(). 1346 // 1347 address generate_disjoint_int_copy(bool aligned, address* entry, 1348 const char* name, bool dest_uninitialized = false) { 1349 const bool not_oop = false; 1350 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1351 } 1352 1353 // Arguments: 1354 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1355 // ignored 1356 // name - stub name string 1357 // 1358 // Inputs: 1359 // c_rarg0 - source array address 1360 // c_rarg1 - destination array address 1361 // c_rarg2 - element count, treated as ssize_t, can be zero 1362 // 1363 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1364 // the hardware handle it. The two dwords within qwords that span 1365 // cache line boundaries will still be loaded and stored atomically. 1366 // 1367 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1368 address* entry, const char* name, 1369 bool dest_uninitialized = false) { 1370 const bool not_oop = false; 1371 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1372 } 1373 1374 1375 // Arguments: 1376 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1377 // ignored 1378 // name - stub name string 1379 // 1380 // Inputs: 1381 // c_rarg0 - source array address 1382 // c_rarg1 - destination array address 1383 // c_rarg2 - element count, treated as size_t, can be zero 1384 // 1385 // Side Effects: 1386 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1387 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1388 // 1389 address generate_disjoint_long_copy(bool aligned, address* entry, 1390 const char* name, bool dest_uninitialized = false) { 1391 const bool not_oop = false; 1392 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1393 } 1394 1395 // Arguments: 1396 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1397 // ignored 1398 // name - stub name string 1399 // 1400 // Inputs: 1401 // c_rarg0 - source array address 1402 // c_rarg1 - destination array address 1403 // c_rarg2 - element count, treated as size_t, can be zero 1404 // 1405 address generate_conjoint_long_copy(bool aligned, 1406 address nooverlap_target, address* entry, 1407 const char* name, bool dest_uninitialized = false) { 1408 const bool not_oop = false; 1409 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1410 } 1411 1412 // Arguments: 1413 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1414 // ignored 1415 // name - stub name string 1416 // 1417 // Inputs: 1418 // c_rarg0 - source array address 1419 // c_rarg1 - destination array address 1420 // c_rarg2 - element count, treated as size_t, can be zero 1421 // 1422 // Side Effects: 1423 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1424 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1425 // 1426 address generate_disjoint_oop_copy(bool aligned, address* entry, 1427 const char* name, bool dest_uninitialized) { 1428 const bool is_oop = true; 1429 const size_t size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1430 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1431 } 1432 1433 // Arguments: 1434 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1435 // ignored 1436 // name - stub name string 1437 // 1438 // Inputs: 1439 // c_rarg0 - source array address 1440 // c_rarg1 - destination array address 1441 // c_rarg2 - element count, treated as size_t, can be zero 1442 // 1443 address generate_conjoint_oop_copy(bool aligned, 1444 address nooverlap_target, address* entry, 1445 const char* name, bool dest_uninitialized) { 1446 const bool is_oop = true; 1447 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1448 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1449 name, dest_uninitialized); 1450 } 1451 1452 // Helper for generating a dynamic type check. 1453 // Smashes t0, t1. 1454 void generate_type_check(Register sub_klass, 1455 Register super_check_offset, 1456 Register super_klass, 1457 Label& L_success) { 1458 assert_different_registers(sub_klass, super_check_offset, super_klass); 1459 1460 BLOCK_COMMENT("type_check:"); 1461 1462 Label L_miss; 1463 1464 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset); 1465 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr); 1466 1467 // Fall through on failure! 1468 __ BIND(L_miss); 1469 } 1470 1471 // 1472 // Generate checkcasting array copy stub 1473 // 1474 // Input: 1475 // c_rarg0 - source array address 1476 // c_rarg1 - destination array address 1477 // c_rarg2 - element count, treated as ssize_t, can be zero 1478 // c_rarg3 - size_t ckoff (super_check_offset) 1479 // c_rarg4 - oop ckval (super_klass) 1480 // 1481 // Output: 1482 // x10 == 0 - success 1483 // x10 == -1^K - failure, where K is partial transfer count 1484 // 1485 address generate_checkcast_copy(const char* name, address* entry, 1486 bool dest_uninitialized = false) { 1487 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1488 1489 // Input registers (after setup_arg_regs) 1490 const Register from = c_rarg0; // source array address 1491 const Register to = c_rarg1; // destination array address 1492 const Register count = c_rarg2; // elementscount 1493 const Register ckoff = c_rarg3; // super_check_offset 1494 const Register ckval = c_rarg4; // super_klass 1495 1496 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1497 RegSet wb_post_saved_regs = RegSet::of(count); 1498 1499 // Registers used as temps (x7, x9, x18 are save-on-entry) 1500 const Register count_save = x19; // orig elementscount 1501 const Register start_to = x18; // destination array start address 1502 const Register copied_oop = x7; // actual oop copied 1503 const Register r9_klass = x9; // oop._klass 1504 1505 // Registers used as gc temps (x15, x16, x17 are save-on-call) 1506 const Register gct1 = x15, gct2 = x16, gct3 = x17; 1507 1508 //--------------------------------------------------------------- 1509 // Assembler stub will be used for this call to arraycopy 1510 // if the two arrays are subtypes of Object[] but the 1511 // destination array type is not equal to or a supertype 1512 // of the source type. Each element must be separately 1513 // checked. 
1514 1515 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1516 copied_oop, r9_klass, count_save); 1517 1518 __ align(CodeEntryAlignment); 1519 StubCodeMark mark(this, "StubRoutines", name); 1520 address start = __ pc(); 1521 1522 __ enter(); // required for proper stackwalking of RuntimeStub frame 1523 1524 // Caller of this entry point must set up the argument registers. 1525 if (entry != nullptr) { 1526 *entry = __ pc(); 1527 BLOCK_COMMENT("Entry:"); 1528 } 1529 1530 // Empty array: Nothing to do 1531 __ beqz(count, L_done); 1532 1533 __ push_reg(RegSet::of(x7, x9, x18, x19), sp); 1534 1535 #ifdef ASSERT 1536 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1537 // The ckoff and ckval must be mutually consistent, 1538 // even though caller generates both. 1539 { Label L; 1540 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1541 __ lwu(start_to, Address(ckval, sco_offset)); 1542 __ beq(ckoff, start_to, L); 1543 __ stop("super_check_offset inconsistent"); 1544 __ bind(L); 1545 } 1546 #endif //ASSERT 1547 1548 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1549 if (dest_uninitialized) { 1550 decorators |= IS_DEST_UNINITIALIZED; 1551 } 1552 1553 bool is_oop = true; 1554 int element_size = UseCompressedOops ? 4 : 8; 1555 1556 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1557 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1558 1559 // save the original count 1560 __ mv(count_save, count); 1561 1562 // Copy from low to high addresses 1563 __ mv(start_to, to); // Save destination array start address 1564 __ j(L_load_element); 1565 1566 // ======== begin loop ======== 1567 // (Loop is rotated; its entry is L_load_element.) 1568 // Loop control: 1569 // for count to 0 do 1570 // copied_oop = load_heap_oop(from++) 1571 // ... generate_type_check ... 1572 // store_heap_oop(to++, copied_oop) 1573 // end 1574 1575 __ align(OptoLoopAlignment); 1576 1577 __ BIND(L_store_element); 1578 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1579 Address(to, 0), copied_oop, 1580 gct1, gct2, gct3); 1581 __ add(to, to, UseCompressedOops ? 4 : 8); 1582 __ sub(count, count, 1); 1583 __ beqz(count, L_do_card_marks); 1584 1585 // ======== loop entry is here ======== 1586 __ BIND(L_load_element); 1587 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1588 copied_oop, Address(from, 0), 1589 gct1); 1590 __ add(from, from, UseCompressedOops ? 4 : 8); 1591 __ beqz(copied_oop, L_store_element); 1592 1593 __ load_klass(r9_klass, copied_oop);// query the object klass 1594 generate_type_check(r9_klass, ckoff, ckval, L_store_element); 1595 // ======== end loop ======== 1596 1597 // It was a real error; we must depend on the caller to finish the job. 1598 // Register count = remaining oops, count_orig = total oops. 1599 // Emit GC store barriers for the oops we have copied and report 1600 // their number to the caller. 
1601 1602 __ sub(count, count_save, count); // K = partially copied oop count 1603 __ xori(count, count, -1); // report (-1^K) to caller 1604 __ beqz(count, L_done_pop); 1605 1606 __ BIND(L_do_card_marks); 1607 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs); 1608 1609 __ bind(L_done_pop); 1610 __ pop_reg(RegSet::of(x7, x9, x18, x19), sp); 1611 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1612 1613 __ bind(L_done); 1614 __ mv(x10, count); 1615 __ leave(); 1616 __ ret(); 1617 1618 return start; 1619 } 1620 1621 // Perform range checks on the proposed arraycopy. 1622 // Kills temp, but nothing else. 1623 // Also, clean the sign bits of src_pos and dst_pos. 1624 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1625 Register src_pos, // source position (c_rarg1) 1626 Register dst, // destination array oo (c_rarg2) 1627 Register dst_pos, // destination position (c_rarg3) 1628 Register length, 1629 Register temp, 1630 Label& L_failed) { 1631 BLOCK_COMMENT("arraycopy_range_checks:"); 1632 1633 assert_different_registers(t0, temp); 1634 1635 // if [src_pos + length > arrayOop(src)->length()] then FAIL 1636 __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes())); 1637 __ addw(temp, length, src_pos); 1638 __ bgtu(temp, t0, L_failed); 1639 1640 // if [dst_pos + length > arrayOop(dst)->length()] then FAIL 1641 __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1642 __ addw(temp, length, dst_pos); 1643 __ bgtu(temp, t0, L_failed); 1644 1645 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1646 __ zero_extend(src_pos, src_pos, 32); 1647 __ zero_extend(dst_pos, dst_pos, 32); 1648 1649 BLOCK_COMMENT("arraycopy_range_checks done"); 1650 } 1651 1652 // 1653 // Generate 'unsafe' array copy stub 1654 // Though just as safe as the other stubs, it takes an unscaled 1655 // size_t argument instead of an element count. 1656 // 1657 // Input: 1658 // c_rarg0 - source array address 1659 // c_rarg1 - destination array address 1660 // c_rarg2 - byte count, treated as ssize_t, can be zero 1661 // 1662 // Examines the alignment of the operands and dispatches 1663 // to a long, int, short, or byte copy loop. 
1664 // 1665 address generate_unsafe_copy(const char* name, 1666 address byte_copy_entry, 1667 address short_copy_entry, 1668 address int_copy_entry, 1669 address long_copy_entry) { 1670 assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr && 1671 int_copy_entry != nullptr && long_copy_entry != nullptr); 1672 Label L_long_aligned, L_int_aligned, L_short_aligned; 1673 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1674 1675 __ align(CodeEntryAlignment); 1676 StubCodeMark mark(this, "StubRoutines", name); 1677 address start = __ pc(); 1678 __ enter(); // required for proper stackwalking of RuntimeStub frame 1679 1680 // bump this on entry, not on exit: 1681 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1682 1683 __ orr(t0, s, d); 1684 __ orr(t0, t0, count); 1685 1686 __ andi(t0, t0, BytesPerLong - 1); 1687 __ beqz(t0, L_long_aligned); 1688 __ andi(t0, t0, BytesPerInt - 1); 1689 __ beqz(t0, L_int_aligned); 1690 __ test_bit(t0, t0, 0); 1691 __ beqz(t0, L_short_aligned); 1692 __ j(RuntimeAddress(byte_copy_entry)); 1693 1694 __ BIND(L_short_aligned); 1695 __ srli(count, count, LogBytesPerShort); // size => short_count 1696 __ j(RuntimeAddress(short_copy_entry)); 1697 __ BIND(L_int_aligned); 1698 __ srli(count, count, LogBytesPerInt); // size => int_count 1699 __ j(RuntimeAddress(int_copy_entry)); 1700 __ BIND(L_long_aligned); 1701 __ srli(count, count, LogBytesPerLong); // size => long_count 1702 __ j(RuntimeAddress(long_copy_entry)); 1703 1704 return start; 1705 } 1706 1707 // 1708 // Generate generic array copy stubs 1709 // 1710 // Input: 1711 // c_rarg0 - src oop 1712 // c_rarg1 - src_pos (32-bits) 1713 // c_rarg2 - dst oop 1714 // c_rarg3 - dst_pos (32-bits) 1715 // c_rarg4 - element count (32-bits) 1716 // 1717 // Output: 1718 // x10 == 0 - success 1719 // x10 == -1^K - failure, where K is partial transfer count 1720 // 1721 address generate_generic_copy(const char* name, 1722 address byte_copy_entry, address short_copy_entry, 1723 address int_copy_entry, address oop_copy_entry, 1724 address long_copy_entry, address checkcast_copy_entry) { 1725 assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr && 1726 int_copy_entry != nullptr && oop_copy_entry != nullptr && 1727 long_copy_entry != nullptr && checkcast_copy_entry != nullptr); 1728 Label L_failed, L_failed_0, L_objArray; 1729 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1730 1731 // Input registers 1732 const Register src = c_rarg0; // source array oop 1733 const Register src_pos = c_rarg1; // source position 1734 const Register dst = c_rarg2; // destination array oop 1735 const Register dst_pos = c_rarg3; // destination position 1736 const Register length = c_rarg4; 1737 1738 // Registers used as temps 1739 const Register dst_klass = c_rarg5; 1740 1741 __ align(CodeEntryAlignment); 1742 1743 StubCodeMark mark(this, "StubRoutines", name); 1744 1745 address start = __ pc(); 1746 1747 __ enter(); // required for proper stackwalking of RuntimeStub frame 1748 1749 // bump this on entry, not on exit: 1750 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1751 1752 //----------------------------------------------------------------------- 1753 // Assembler stub will be used for this call to arraycopy 1754 // if the following conditions are met: 1755 // 1756 // (1) src and dst must not be null. 1757 // (2) src_pos must not be negative. 1758 // (3) dst_pos must not be negative. 1759 // (4) length must not be negative. 
1760 // (5) src klass and dst klass should be the same and not null.
1761 // (6) src and dst should be arrays.
1762 // (7) src_pos + length must not exceed length of src.
1763 // (8) dst_pos + length must not exceed length of dst.
1764 //
1765 
1766 // if src is null then return -1
1767 __ beqz(src, L_failed);
1768 
1769 // if [src_pos < 0] then return -1
1770 __ sign_extend(t0, src_pos, 32);
1771 __ bltz(t0, L_failed);
1772 
1773 // if dst is null then return -1
1774 __ beqz(dst, L_failed);
1775 
1776 // if [dst_pos < 0] then return -1
1777 __ sign_extend(t0, dst_pos, 32);
1778 __ bltz(t0, L_failed);
1779 
1780 // registers used as temps
1781 const Register scratch_length = x28; // elements count to copy
1782 const Register scratch_src_klass = x29; // array klass
1783 const Register lh = x30; // layout helper
1784 
1785 // if [length < 0] then return -1
1786 __ sign_extend(scratch_length, length, 32); // length (elements count, 32-bit value)
1787 __ bltz(scratch_length, L_failed);
1788 
1789 __ load_klass(scratch_src_klass, src);
1790 #ifdef ASSERT
1791 {
1792 BLOCK_COMMENT("assert klasses not null {");
1793 Label L1, L2;
1794 __ bnez(scratch_src_klass, L2); // it is broken if klass is null
1795 __ bind(L1);
1796 __ stop("broken null klass");
1797 __ bind(L2);
1798 __ load_klass(t0, dst, t1);
1799 __ beqz(t0, L1); // this would be broken also
1800 BLOCK_COMMENT("} assert klasses not null done");
1801 }
1802 #endif
1803 
1804 // Load layout helper (32-bits)
1805 //
1806 // |array_tag| | header_size | element_type | |log2_element_size|
1807 // 32 30 24 16 8 2 0
1808 //
1809 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1810 //
1811 
1812 const int lh_offset = in_bytes(Klass::layout_helper_offset());
1813 
1814 // Handle objArrays completely differently...
1815 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1816 __ lw(lh, Address(scratch_src_klass, lh_offset));
1817 __ mv(t0, objArray_lh);
1818 __ beq(lh, t0, L_objArray);
1819 
1820 // if [src->klass() != dst->klass()] then return -1
1821 __ load_klass(t1, dst);
1822 __ bne(t1, scratch_src_klass, L_failed);
1823 
1824 // if !src->is_Array() then return -1
1825 // i.e. (lh >= 0)
1826 __ bgez(lh, L_failed);
1827 
1828 // At this point, it is known to be a typeArray (array_tag 0x3).
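//
// Worked example of decoding a layout helper (an illustration only; the
// 16-byte header assumes a typical 64-bit build with compressed class
// pointers). For a jint[]:
//
//   lh = (0x3 << 30) | (16 << 16) | (T_INT << 8) | 2 = 0xC0100A02
//     array_tag         (bits 31..30) = 0x3   -> typeArray
//     header_size       (bits 23..16) = 16    -> byte offset of element 0
//     element_type      (bits 15..8)  = 0x0A  -> T_INT
//     log2_element_size (bits 7..0)   = 2     -> 4-byte elements
//
// Both array tags (0x3 and 0x2) set the sign bit, which is why the bgez above
// can treat "lh >= 0" as "not an array".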
1829 #ifdef ASSERT 1830 { 1831 BLOCK_COMMENT("assert primitive array {"); 1832 Label L; 1833 __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift)); 1834 __ bge(lh, t1, L); 1835 __ stop("must be a primitive array"); 1836 __ bind(L); 1837 BLOCK_COMMENT("} assert primitive array done"); 1838 } 1839 #endif 1840 1841 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1842 t1, L_failed); 1843 1844 // TypeArrayKlass 1845 // 1846 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize) 1847 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize) 1848 // 1849 1850 const Register t0_offset = t0; // array offset 1851 const Register x30_elsize = lh; // element size 1852 1853 // Get array_header_in_bytes() 1854 int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1); 1855 int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width; 1856 __ slli(t0_offset, lh, XLEN - lh_header_size_msb); // left shift to remove 24 ~ 32; 1857 __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset 1858 1859 __ add(src, src, t0_offset); // src array offset 1860 __ add(dst, dst, t0_offset); // dst array offset 1861 BLOCK_COMMENT("choose copy loop based on element size"); 1862 1863 // next registers should be set before the jump to corresponding stub 1864 const Register from = c_rarg0; // source array address 1865 const Register to = c_rarg1; // destination array address 1866 const Register count = c_rarg2; // elements count 1867 1868 // 'from', 'to', 'count' registers should be set in such order 1869 // since they are the same as 'src', 'src_pos', 'dst'. 1870 1871 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 1872 1873 // The possible values of elsize are 0-3, i.e. exact_log2(element 1874 // size in bytes). We do a simple bitwise binary search. 
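//
// Dispatch picture for the two test_bit checks below (illustration only):
//
//   elsize   bit 1   bit 0   copy loop taken
//     0        0       0     byte_copy_entry
//     1        0       1     short_copy_entry
//     2        1       0     int_copy_entry
//     3        1       1     long_copy_entry
//
// i.e. bit 1 splits {byte, short} from {int, long}, then bit 0 picks within
// the pair.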
1875 __ BIND(L_copy_bytes); 1876 __ test_bit(t0, x30_elsize, 1); 1877 __ bnez(t0, L_copy_ints); 1878 __ test_bit(t0, x30_elsize, 0); 1879 __ bnez(t0, L_copy_shorts); 1880 __ add(from, src, src_pos); // src_addr 1881 __ add(to, dst, dst_pos); // dst_addr 1882 __ sign_extend(count, scratch_length, 32); // length 1883 __ j(RuntimeAddress(byte_copy_entry)); 1884 1885 __ BIND(L_copy_shorts); 1886 __ shadd(from, src_pos, src, t0, 1); // src_addr 1887 __ shadd(to, dst_pos, dst, t0, 1); // dst_addr 1888 __ sign_extend(count, scratch_length, 32); // length 1889 __ j(RuntimeAddress(short_copy_entry)); 1890 1891 __ BIND(L_copy_ints); 1892 __ test_bit(t0, x30_elsize, 0); 1893 __ bnez(t0, L_copy_longs); 1894 __ shadd(from, src_pos, src, t0, 2); // src_addr 1895 __ shadd(to, dst_pos, dst, t0, 2); // dst_addr 1896 __ sign_extend(count, scratch_length, 32); // length 1897 __ j(RuntimeAddress(int_copy_entry)); 1898 1899 __ BIND(L_copy_longs); 1900 #ifdef ASSERT 1901 { 1902 BLOCK_COMMENT("assert long copy {"); 1903 Label L; 1904 __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize 1905 __ sign_extend(lh, lh, 32); 1906 __ mv(t0, LogBytesPerLong); 1907 __ beq(x30_elsize, t0, L); 1908 __ stop("must be long copy, but elsize is wrong"); 1909 __ bind(L); 1910 BLOCK_COMMENT("} assert long copy done"); 1911 } 1912 #endif 1913 __ shadd(from, src_pos, src, t0, 3); // src_addr 1914 __ shadd(to, dst_pos, dst, t0, 3); // dst_addr 1915 __ sign_extend(count, scratch_length, 32); // length 1916 __ j(RuntimeAddress(long_copy_entry)); 1917 1918 // ObjArrayKlass 1919 __ BIND(L_objArray); 1920 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 1921 1922 Label L_plain_copy, L_checkcast_copy; 1923 // test array classes for subtyping 1924 __ load_klass(t2, dst); 1925 __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality 1926 1927 // Identically typed arrays can be copied without element-wise checks. 1928 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1929 t1, L_failed); 1930 1931 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop); 1932 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1933 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop); 1934 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1935 __ sign_extend(count, scratch_length, 32); // length 1936 __ BIND(L_plain_copy); 1937 __ j(RuntimeAddress(oop_copy_entry)); 1938 1939 __ BIND(L_checkcast_copy); 1940 // live at this point: scratch_src_klass, scratch_length, t2 (dst_klass) 1941 { 1942 // Before looking at dst.length, make sure dst is also an objArray. 1943 __ lwu(t0, Address(t2, lh_offset)); 1944 __ mv(t1, objArray_lh); 1945 __ bne(t0, t1, L_failed); 1946 1947 // It is safe to examine both src.length and dst.length. 1948 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1949 t2, L_failed); 1950 1951 __ load_klass(dst_klass, dst); // reload 1952 1953 // Marshal the base address arguments now, freeing registers. 1954 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop); 1955 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1956 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop); 1957 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1958 __ sign_extend(count, length, 32); // length (reloaded) 1959 const Register sco_temp = c_rarg3; // this register is free now 1960 assert_different_registers(from, to, count, sco_temp, 1961 dst_klass, scratch_src_klass); 1962 1963 // Generate the type check. 
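// What the next few instructions arrange: first ask whether the source array
// klass is a subtype of the destination array klass (e.g. a String[] copied
// into an Object[]); if so, branch back to L_plain_copy and skip per-element
// checks. Otherwise fall through, fetch the destination *element* klass and
// its super_check_offset, and jump to the checkcast stub, which validates
// every element as it is stored.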
1964 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
1965 __ lwu(sco_temp, Address(dst_klass, sco_offset));
1966 
1967 // Smashes t0, t1
1968 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
1969 
1970 // Fetch destination element klass from the ObjArrayKlass header.
1971 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1972 __ ld(dst_klass, Address(dst_klass, ek_offset));
1973 __ lwu(sco_temp, Address(dst_klass, sco_offset));
1974 
1975 // the checkcast_copy loop needs two extra arguments:
1976 assert(c_rarg3 == sco_temp, "#3 already in place");
1977 // Set up arguments for checkcast_copy_entry.
1978 __ mv(c_rarg4, dst_klass); // dst.klass.element_klass
1979 __ j(RuntimeAddress(checkcast_copy_entry));
1980 }
1981 
1982 __ BIND(L_failed);
1983 __ mv(x10, -1);
1984 __ leave(); // required for proper stackwalking of RuntimeStub frame
1985 __ ret();
1986 
1987 return start;
1988 }
1989 
1990 //
1991 // Generate stub for array fill. If "aligned" is true, the
1992 // "to" address is assumed to be heapword aligned.
1993 //
1994 // Arguments for generated stub:
1995 // to: c_rarg0
1996 // value: c_rarg1
1997 // count: c_rarg2 treated as signed
1998 //
1999 address generate_fill(BasicType t, bool aligned, const char* name) {
2000 __ align(CodeEntryAlignment);
2001 StubCodeMark mark(this, "StubRoutines", name);
2002 address start = __ pc();
2003 
2004 BLOCK_COMMENT("Entry:");
2005 
2006 const Register to = c_rarg0; // destination array address
2007 const Register value = c_rarg1; // value
2008 const Register count = c_rarg2; // elements count
2009 
2010 const Register bz_base = x28; // base for block_zero routine
2011 const Register cnt_words = x29; // temp register
2012 const Register tmp_reg = t1;
2013 
2014 __ enter();
2015 
2016 Label L_fill_elements, L_exit1;
2017 
2018 int shift = -1;
2019 switch (t) {
2020 case T_BYTE:
2021 shift = 0;
2022 
2023 // Zero extend value
2024 // 8 bit -> 16 bit
2025 __ andi(value, value, 0xff);
2026 __ mv(tmp_reg, value);
2027 __ slli(tmp_reg, tmp_reg, 8);
2028 __ orr(value, value, tmp_reg);
2029 
2030 // 16 bit -> 32 bit
2031 __ mv(tmp_reg, value);
2032 __ slli(tmp_reg, tmp_reg, 16);
2033 __ orr(value, value, tmp_reg);
2034 
2035 __ mv(tmp_reg, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2036 __ bltu(count, tmp_reg, L_fill_elements);
2037 break;
2038 case T_SHORT:
2039 shift = 1;
2040 // Zero extend value
2041 // 16 bit -> 32 bit
2042 __ andi(value, value, 0xffff);
2043 __ mv(tmp_reg, value);
2044 __ slli(tmp_reg, tmp_reg, 16);
2045 __ orr(value, value, tmp_reg);
2046 
2047 // Short arrays (< 8 bytes) fill by element
2048 __ mv(tmp_reg, 8 >> shift);
2049 __ bltu(count, tmp_reg, L_fill_elements);
2050 break;
2051 case T_INT:
2052 shift = 2;
2053 
2054 // Short arrays (< 8 bytes) fill by element
2055 __ mv(tmp_reg, 8 >> shift);
2056 __ bltu(count, tmp_reg, L_fill_elements);
2057 break;
2058 default: ShouldNotReachHere();
2059 }
2060 
2061 // Align destination address at an 8-byte boundary.
2062 Label L_skip_align1, L_skip_align2, L_skip_align4;
2063 if (!aligned) {
2064 switch (t) {
2065 case T_BYTE:
2066 // One byte misalignment happens only for byte arrays.
2067 __ test_bit(t0, to, 0);
2068 __ beqz(t0, L_skip_align1);
2069 __ sb(value, Address(to, 0));
2070 __ addi(to, to, 1);
2071 __ addiw(count, count, -1);
2072 __ bind(L_skip_align1);
2073 // Fallthrough
2074 case T_SHORT:
2075 // Two bytes misalignment happens only for byte and short (char) arrays.
2076 __ test_bit(t0, to, 1); 2077 __ beqz(t0, L_skip_align2); 2078 __ sh(value, Address(to, 0)); 2079 __ addi(to, to, 2); 2080 __ addiw(count, count, -(2 >> shift)); 2081 __ bind(L_skip_align2); 2082 // Fallthrough 2083 case T_INT: 2084 // Align to 8 bytes, we know we are 4 byte aligned to start. 2085 __ test_bit(t0, to, 2); 2086 __ beqz(t0, L_skip_align4); 2087 __ sw(value, Address(to, 0)); 2088 __ addi(to, to, 4); 2089 __ addiw(count, count, -(4 >> shift)); 2090 __ bind(L_skip_align4); 2091 break; 2092 default: ShouldNotReachHere(); 2093 } 2094 } 2095 2096 // 2097 // Fill large chunks 2098 // 2099 __ srliw(cnt_words, count, 3 - shift); // number of words 2100 2101 // 32 bit -> 64 bit 2102 __ andi(value, value, 0xffffffff); 2103 __ mv(tmp_reg, value); 2104 __ slli(tmp_reg, tmp_reg, 32); 2105 __ orr(value, value, tmp_reg); 2106 2107 __ slli(tmp_reg, cnt_words, 3 - shift); 2108 __ subw(count, count, tmp_reg); 2109 { 2110 __ fill_words(to, cnt_words, value); 2111 } 2112 2113 // Remaining count is less than 8 bytes. Fill it by a single store. 2114 // Note that the total length is no less than 8 bytes. 2115 if (t == T_BYTE || t == T_SHORT) { 2116 __ beqz(count, L_exit1); 2117 __ shadd(to, count, to, tmp_reg, shift); // points to the end 2118 __ sd(value, Address(to, -8)); // overwrite some elements 2119 __ bind(L_exit1); 2120 __ leave(); 2121 __ ret(); 2122 } 2123 2124 // Handle copies less than 8 bytes. 2125 Label L_fill_2, L_fill_4, L_exit2; 2126 __ bind(L_fill_elements); 2127 switch (t) { 2128 case T_BYTE: 2129 __ test_bit(t0, count, 0); 2130 __ beqz(t0, L_fill_2); 2131 __ sb(value, Address(to, 0)); 2132 __ addi(to, to, 1); 2133 __ bind(L_fill_2); 2134 __ test_bit(t0, count, 1); 2135 __ beqz(t0, L_fill_4); 2136 __ sh(value, Address(to, 0)); 2137 __ addi(to, to, 2); 2138 __ bind(L_fill_4); 2139 __ test_bit(t0, count, 2); 2140 __ beqz(t0, L_exit2); 2141 __ sw(value, Address(to, 0)); 2142 break; 2143 case T_SHORT: 2144 __ test_bit(t0, count, 0); 2145 __ beqz(t0, L_fill_4); 2146 __ sh(value, Address(to, 0)); 2147 __ addi(to, to, 2); 2148 __ bind(L_fill_4); 2149 __ test_bit(t0, count, 1); 2150 __ beqz(t0, L_exit2); 2151 __ sw(value, Address(to, 0)); 2152 break; 2153 case T_INT: 2154 __ beqz(count, L_exit2); 2155 __ sw(value, Address(to, 0)); 2156 break; 2157 default: ShouldNotReachHere(); 2158 } 2159 __ bind(L_exit2); 2160 __ leave(); 2161 __ ret(); 2162 return start; 2163 } 2164 2165 void generate_arraycopy_stubs() { 2166 address entry = nullptr; 2167 address entry_jbyte_arraycopy = nullptr; 2168 address entry_jshort_arraycopy = nullptr; 2169 address entry_jint_arraycopy = nullptr; 2170 address entry_oop_arraycopy = nullptr; 2171 address entry_jlong_arraycopy = nullptr; 2172 address entry_checkcast_arraycopy = nullptr; 2173 2174 generate_copy_longs(copy_f, c_rarg0, c_rarg1, t1, copy_forwards); 2175 generate_copy_longs(copy_b, c_rarg0, c_rarg1, t1, copy_backwards); 2176 2177 StubRoutines::riscv::_zero_blocks = generate_zero_blocks(); 2178 2179 //*** jbyte 2180 // Always need aligned and unaligned versions 2181 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2182 "jbyte_disjoint_arraycopy"); 2183 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2184 &entry_jbyte_arraycopy, 2185 "jbyte_arraycopy"); 2186 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2187 "arrayof_jbyte_disjoint_arraycopy"); 2188 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2189 
"arrayof_jbyte_arraycopy"); 2190 2191 //*** jshort 2192 // Always need aligned and unaligned versions 2193 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2194 "jshort_disjoint_arraycopy"); 2195 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2196 &entry_jshort_arraycopy, 2197 "jshort_arraycopy"); 2198 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2199 "arrayof_jshort_disjoint_arraycopy"); 2200 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2201 "arrayof_jshort_arraycopy"); 2202 2203 //*** jint 2204 // Aligned versions 2205 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2206 "arrayof_jint_disjoint_arraycopy"); 2207 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2208 "arrayof_jint_arraycopy"); 2209 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2210 // entry_jint_arraycopy always points to the unaligned version 2211 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2212 "jint_disjoint_arraycopy"); 2213 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2214 &entry_jint_arraycopy, 2215 "jint_arraycopy"); 2216 2217 //*** jlong 2218 // It is always aligned 2219 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2220 "arrayof_jlong_disjoint_arraycopy"); 2221 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2222 "arrayof_jlong_arraycopy"); 2223 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2224 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2225 2226 //*** oops 2227 { 2228 // With compressed oops we need unaligned versions; notice that 2229 // we overwrite entry_oop_arraycopy. 
2230 bool aligned = !UseCompressedOops; 2231 2232 StubRoutines::_arrayof_oop_disjoint_arraycopy 2233 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2234 /*dest_uninitialized*/false); 2235 StubRoutines::_arrayof_oop_arraycopy 2236 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2237 /*dest_uninitialized*/false); 2238 // Aligned versions without pre-barriers 2239 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2240 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2241 /*dest_uninitialized*/true); 2242 StubRoutines::_arrayof_oop_arraycopy_uninit 2243 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2244 /*dest_uninitialized*/true); 2245 } 2246 2247 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2248 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2249 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2250 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2251 2252 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2253 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2254 /*dest_uninitialized*/true); 2255 2256 2257 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2258 entry_jbyte_arraycopy, 2259 entry_jshort_arraycopy, 2260 entry_jint_arraycopy, 2261 entry_jlong_arraycopy); 2262 2263 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2264 entry_jbyte_arraycopy, 2265 entry_jshort_arraycopy, 2266 entry_jint_arraycopy, 2267 entry_oop_arraycopy, 2268 entry_jlong_arraycopy, 2269 entry_checkcast_arraycopy); 2270 2271 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2272 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2273 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2274 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2275 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2276 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2277 } 2278 2279 // code for comparing 16 bytes of strings with same encoding 2280 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 2281 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, tmp1 = x28, tmp2 = x29, tmp4 = x7, tmp5 = x31; 2282 __ ld(tmp5, Address(str1)); 2283 __ addi(str1, str1, 8); 2284 __ xorr(tmp4, tmp1, tmp2); 2285 __ ld(cnt1, Address(str2)); 2286 __ addi(str2, str2, 8); 2287 __ bnez(tmp4, DIFF1); 2288 __ ld(tmp1, Address(str1)); 2289 __ addi(str1, str1, 8); 2290 __ xorr(tmp4, tmp5, cnt1); 2291 __ ld(tmp2, Address(str2)); 2292 __ addi(str2, str2, 8); 2293 __ bnez(tmp4, DIFF2); 2294 } 2295 2296 // code for comparing 8 characters of strings with Latin1 and Utf16 encoding 2297 void compare_string_8_x_LU(Register tmpL, Register tmpU, Register strL, Register strU, Label& DIFF) { 2298 const Register tmp = x30, tmpLval = x12; 2299 __ ld(tmpLval, Address(strL)); 2300 __ addi(strL, strL, wordSize); 2301 __ ld(tmpU, Address(strU)); 2302 __ addi(strU, strU, wordSize); 2303 __ inflate_lo32(tmpL, tmpLval); 2304 __ xorr(tmp, tmpU, tmpL); 2305 __ bnez(tmp, DIFF); 2306 2307 __ ld(tmpU, 
Address(strU)); 2308 __ addi(strU, strU, wordSize); 2309 __ inflate_hi32(tmpL, tmpLval); 2310 __ xorr(tmp, tmpU, tmpL); 2311 __ bnez(tmp, DIFF); 2312 } 2313 2314 // x10 = result 2315 // x11 = str1 2316 // x12 = cnt1 2317 // x13 = str2 2318 // x14 = cnt2 2319 // x28 = tmp1 2320 // x29 = tmp2 2321 // x30 = tmp3 2322 address generate_compare_long_string_different_encoding(bool isLU) { 2323 __ align(CodeEntryAlignment); 2324 StubCodeMark mark(this, "StubRoutines", isLU ? "compare_long_string_different_encoding LU" : "compare_long_string_different_encoding UL"); 2325 address entry = __ pc(); 2326 Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE; 2327 const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14, 2328 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12; 2329 2330 // cnt2 == amount of characters left to compare 2331 // Check already loaded first 4 symbols 2332 __ inflate_lo32(tmp3, isLU ? tmp1 : tmp2); 2333 __ mv(isLU ? tmp1 : tmp2, tmp3); 2334 __ addi(str1, str1, isLU ? wordSize / 2 : wordSize); 2335 __ addi(str2, str2, isLU ? wordSize : wordSize / 2); 2336 __ sub(cnt2, cnt2, wordSize / 2); // Already loaded 4 symbols 2337 2338 __ xorr(tmp3, tmp1, tmp2); 2339 __ bnez(tmp3, CALCULATE_DIFFERENCE); 2340 2341 Register strU = isLU ? str2 : str1, 2342 strL = isLU ? str1 : str2, 2343 tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison 2344 tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison 2345 2346 // make sure main loop is 8 byte-aligned, we should load another 4 bytes from strL 2347 // cnt2 is >= 68 here, no need to check it for >= 0 2348 __ lwu(tmpL, Address(strL)); 2349 __ addi(strL, strL, wordSize / 2); 2350 __ ld(tmpU, Address(strU)); 2351 __ addi(strU, strU, wordSize); 2352 __ inflate_lo32(tmp3, tmpL); 2353 __ mv(tmpL, tmp3); 2354 __ xorr(tmp3, tmpU, tmpL); 2355 __ bnez(tmp3, CALCULATE_DIFFERENCE); 2356 __ addi(cnt2, cnt2, -wordSize / 2); 2357 2358 // we are now 8-bytes aligned on strL 2359 __ sub(cnt2, cnt2, wordSize * 2); 2360 __ bltz(cnt2, TAIL); 2361 __ bind(SMALL_LOOP); // smaller loop 2362 __ sub(cnt2, cnt2, wordSize * 2); 2363 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE); 2364 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE); 2365 __ bgez(cnt2, SMALL_LOOP); 2366 __ addi(t0, cnt2, wordSize * 2); 2367 __ beqz(t0, DONE); 2368 __ bind(TAIL); // 1..15 characters left 2369 // Aligned access. Load bytes in portions - 4, 2, 1. 
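// (Concretely, below: if 8 or more characters remain, do one more
// 8-character LU compare; the final 1..7 characters are then covered by
// re-reading the last 8 Latin1 bytes and the last 16 UTF-16 bytes with
// misaligned loads, deliberately overlapping data that was already compared.)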
2370 
2371 __ addi(t0, cnt2, wordSize);
2372 __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
2373 __ bltz(t0, LOAD_LAST);
2374 // remaining characters are greater than or equal to 8, so we can do one compare_string_8_x_LU
2375 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2376 __ addi(cnt2, cnt2, -wordSize);
2377 __ beqz(cnt2, DONE); // no characters left
2378 __ bind(LOAD_LAST); // cnt2 = 1..7 characters left
2379 
2380 __ addi(cnt2, cnt2, -wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
2381 __ slli(t0, cnt2, 1); // t0 is now an offset in strU which points to last 16 bytes
2382 __ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string
2383 __ add(strU, strU, t0); // Address of last 16 bytes in UTF-16 string
2384 __ load_int_misaligned(tmpL, Address(strL), t0, false);
2385 __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2386 __ inflate_lo32(tmp3, tmpL);
2387 __ mv(tmpL, tmp3);
2388 __ xorr(tmp3, tmpU, tmpL);
2389 __ bnez(tmp3, CALCULATE_DIFFERENCE);
2390 
2391 __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
2392 __ addi(strU, strU, wordSize); // Address of last 8 bytes in UTF-16 string
2393 __ load_int_misaligned(tmpL, Address(strL), t0, false);
2394 __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2395 __ inflate_lo32(tmp3, tmpL);
2396 __ mv(tmpL, tmp3);
2397 __ xorr(tmp3, tmpU, tmpL);
2398 __ bnez(tmp3, CALCULATE_DIFFERENCE);
2399 __ j(DONE); // no characters left
2400 
2401 // Find the first different characters in the longwords and
2402 // compute their difference.
2403 __ bind(CALCULATE_DIFFERENCE);
2404 __ ctzc_bit(tmp4, tmp3);
2405 __ srl(tmp1, tmp1, tmp4);
2406 __ srl(tmp2, tmp2, tmp4);
2407 __ andi(tmp1, tmp1, 0xFFFF);
2408 __ andi(tmp2, tmp2, 0xFFFF);
2409 __ sub(result, tmp1, tmp2);
2410 __ bind(DONE);
2411 __ ret();
2412 return entry;
2413 }
2414 
2415 address generate_method_entry_barrier() {
2416 __ align(CodeEntryAlignment);
2417 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
2418 
2419 Label deoptimize_label;
2420 
2421 address start = __ pc();
2422 
2423 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
2424 
2425 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
2426 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
2427 Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
2428 __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
2429 __ lwu(t1, t1);
2430 __ sw(t1, thread_epoch_addr);
2431 // There are two ways this can work:
2432 // - The writer did a system icache shootdown after the instruction stream update.
2433 // Hence do nothing.
2434 // - The writer trusts us to make sure our icache is in sync before entering.
2435 // Hence use cmodx fence (fence.i, may change).
2436 if (UseCtxFencei) { 2437 __ cmodx_fence(); 2438 } 2439 __ membar(__ LoadLoad); 2440 } 2441 2442 __ set_last_Java_frame(sp, fp, ra); 2443 2444 __ enter(); 2445 __ add(t1, sp, wordSize); 2446 2447 __ sub(sp, sp, 4 * wordSize); 2448 2449 __ push_call_clobbered_registers(); 2450 2451 __ mv(c_rarg0, t1); 2452 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 2453 2454 __ reset_last_Java_frame(true); 2455 2456 __ mv(t0, x10); 2457 2458 __ pop_call_clobbered_registers(); 2459 2460 __ bnez(t0, deoptimize_label); 2461 2462 __ leave(); 2463 __ ret(); 2464 2465 __ BIND(deoptimize_label); 2466 2467 __ ld(t0, Address(sp, 0)); 2468 __ ld(fp, Address(sp, wordSize)); 2469 __ ld(ra, Address(sp, wordSize * 2)); 2470 __ ld(t1, Address(sp, wordSize * 3)); 2471 2472 __ mv(sp, t0); 2473 __ jr(t1); 2474 2475 return start; 2476 } 2477 2478 // x10 = result 2479 // x11 = str1 2480 // x12 = cnt1 2481 // x13 = str2 2482 // x14 = cnt2 2483 // x28 = tmp1 2484 // x29 = tmp2 2485 // x30 = tmp3 2486 // x31 = tmp4 2487 address generate_compare_long_string_same_encoding(bool isLL) { 2488 __ align(CodeEntryAlignment); 2489 StubCodeMark mark(this, "StubRoutines", isLL ? 2490 "compare_long_string_same_encoding LL" : "compare_long_string_same_encoding UU"); 2491 address entry = __ pc(); 2492 Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL, 2493 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF; 2494 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14, 2495 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31; 2496 RegSet spilled_regs = RegSet::of(tmp4, tmp5); 2497 2498 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 2499 // update cnt2 counter with already loaded 8 bytes 2500 __ sub(cnt2, cnt2, wordSize / (isLL ? 1 : 2)); 2501 // update pointers, because of previous read 2502 __ add(str1, str1, wordSize); 2503 __ add(str2, str2, wordSize); 2504 // less than 16 bytes left? 2505 __ sub(cnt2, cnt2, isLL ? 16 : 8); 2506 __ push_reg(spilled_regs, sp); 2507 __ bltz(cnt2, TAIL); 2508 __ bind(SMALL_LOOP); 2509 compare_string_16_bytes_same(DIFF, DIFF2); 2510 __ sub(cnt2, cnt2, isLL ? 16 : 8); 2511 __ bgez(cnt2, SMALL_LOOP); 2512 __ bind(TAIL); 2513 __ addi(cnt2, cnt2, isLL ? 16 : 8); 2514 __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); 2515 __ sub(cnt2, cnt2, isLL ? 8 : 4); 2516 __ blez(cnt2, CHECK_LAST); 2517 __ xorr(tmp4, tmp1, tmp2); 2518 __ bnez(tmp4, DIFF); 2519 __ ld(tmp1, Address(str1)); 2520 __ addi(str1, str1, 8); 2521 __ ld(tmp2, Address(str2)); 2522 __ addi(str2, str2, 8); 2523 __ sub(cnt2, cnt2, isLL ? 8 : 4); 2524 __ bind(CHECK_LAST); 2525 if (!isLL) { 2526 __ add(cnt2, cnt2, cnt2); // now in bytes 2527 } 2528 __ xorr(tmp4, tmp1, tmp2); 2529 __ bnez(tmp4, DIFF); 2530 __ add(str1, str1, cnt2); 2531 __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2); 2532 __ add(str2, str2, cnt2); 2533 __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2); 2534 __ xorr(tmp4, tmp5, cnt1); 2535 __ beqz(tmp4, LENGTH_DIFF); 2536 // Find the first different characters in the longwords and 2537 // compute their difference. 
2538 __ bind(DIFF2); 2539 __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb 2540 __ srl(tmp5, tmp5, tmp3); 2541 __ srl(cnt1, cnt1, tmp3); 2542 if (isLL) { 2543 __ andi(tmp5, tmp5, 0xFF); 2544 __ andi(cnt1, cnt1, 0xFF); 2545 } else { 2546 __ andi(tmp5, tmp5, 0xFFFF); 2547 __ andi(cnt1, cnt1, 0xFFFF); 2548 } 2549 __ sub(result, tmp5, cnt1); 2550 __ j(LENGTH_DIFF); 2551 __ bind(DIFF); 2552 __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb 2553 __ srl(tmp1, tmp1, tmp3); 2554 __ srl(tmp2, tmp2, tmp3); 2555 if (isLL) { 2556 __ andi(tmp1, tmp1, 0xFF); 2557 __ andi(tmp2, tmp2, 0xFF); 2558 } else { 2559 __ andi(tmp1, tmp1, 0xFFFF); 2560 __ andi(tmp2, tmp2, 0xFFFF); 2561 } 2562 __ sub(result, tmp1, tmp2); 2563 __ j(LENGTH_DIFF); 2564 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 2565 __ xorr(tmp4, tmp1, tmp2); 2566 __ bnez(tmp4, DIFF); 2567 __ bind(LENGTH_DIFF); 2568 __ pop_reg(spilled_regs, sp); 2569 __ ret(); 2570 return entry; 2571 } 2572 2573 void generate_compare_long_strings() { 2574 StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(true); 2575 StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(false); 2576 StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(true); 2577 StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(false); 2578 } 2579 2580 // x10 result 2581 // x11 src 2582 // x12 src count 2583 // x13 pattern 2584 // x14 pattern count 2585 address generate_string_indexof_linear(bool needle_isL, bool haystack_isL) 2586 { 2587 const char* stubName = needle_isL 2588 ? (haystack_isL ? "indexof_linear_ll" : "indexof_linear_ul") 2589 : "indexof_linear_uu"; 2590 __ align(CodeEntryAlignment); 2591 StubCodeMark mark(this, "StubRoutines", stubName); 2592 address entry = __ pc(); 2593 2594 int needle_chr_size = needle_isL ? 1 : 2; 2595 int haystack_chr_size = haystack_isL ? 1 : 2; 2596 int needle_chr_shift = needle_isL ? 0 : 1; 2597 int haystack_chr_shift = haystack_isL ? 0 : 1; 2598 bool isL = needle_isL && haystack_isL; 2599 // parameters 2600 Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14; 2601 // temporary registers 2602 Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25; 2603 // redefinitions 2604 Register ch1 = x28, ch2 = x29; 2605 RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29); 2606 2607 __ push_reg(spilled_regs, sp); 2608 2609 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 2610 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 2611 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 2612 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 2613 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 2614 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 2615 2616 __ ld(ch1, Address(needle)); 2617 __ ld(ch2, Address(haystack)); 2618 // src.length - pattern.length 2619 __ sub(haystack_len, haystack_len, needle_len); 2620 2621 // first is needle[0] 2622 __ andi(first, ch1, needle_isL ? 0xFF : 0xFFFF, first); 2623 uint64_t mask0101 = UCONST64(0x0101010101010101); 2624 uint64_t mask0001 = UCONST64(0x0001000100010001); 2625 __ mv(mask1, haystack_isL ? mask0101 : mask0001); 2626 __ mul(first, first, mask1); 2627 uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f); 2628 uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff); 2629 __ mv(mask2, haystack_isL ? 
mask7f7f : mask7fff); 2630 if (needle_isL != haystack_isL) { 2631 __ mv(tmp, ch1); 2632 } 2633 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size - 1); 2634 __ blez(haystack_len, L_SMALL); 2635 2636 if (needle_isL != haystack_isL) { 2637 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros); 2638 } 2639 // xorr, sub, orr, notr, andr 2640 // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i] 2641 // eg: 2642 // first: aa aa aa aa aa aa aa aa 2643 // ch2: aa aa li nx jd ka aa aa 2644 // match_mask: 80 80 00 00 00 00 80 80 2645 __ compute_match_mask(ch2, first, match_mask, mask1, mask2); 2646 2647 // search first char of needle, if success, goto L_HAS_ZERO; 2648 __ bnez(match_mask, L_HAS_ZERO); 2649 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size); 2650 __ add(result, result, wordSize / haystack_chr_size); 2651 __ add(haystack, haystack, wordSize); 2652 __ bltz(haystack_len, L_POST_LOOP); 2653 2654 __ bind(L_LOOP); 2655 __ ld(ch2, Address(haystack)); 2656 __ compute_match_mask(ch2, first, match_mask, mask1, mask2); 2657 __ bnez(match_mask, L_HAS_ZERO); 2658 2659 __ bind(L_LOOP_PROCEED); 2660 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size); 2661 __ add(haystack, haystack, wordSize); 2662 __ add(result, result, wordSize / haystack_chr_size); 2663 __ bgez(haystack_len, L_LOOP); 2664 2665 __ bind(L_POST_LOOP); 2666 __ mv(ch2, -wordSize / haystack_chr_size); 2667 __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check 2668 __ ld(ch2, Address(haystack)); 2669 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift); 2670 __ neg(haystack_len, haystack_len); 2671 __ xorr(ch2, first, ch2); 2672 __ sub(match_mask, ch2, mask1); 2673 __ orr(ch2, ch2, mask2); 2674 __ mv(trailing_zeros, -1); // all bits set 2675 __ j(L_SMALL_PROCEED); 2676 2677 __ align(OptoLoopAlignment); 2678 __ bind(L_SMALL); 2679 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift); 2680 __ neg(haystack_len, haystack_len); 2681 if (needle_isL != haystack_isL) { 2682 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros); 2683 } 2684 __ xorr(ch2, first, ch2); 2685 __ sub(match_mask, ch2, mask1); 2686 __ orr(ch2, ch2, mask2); 2687 __ mv(trailing_zeros, -1); // all bits set 2688 2689 __ bind(L_SMALL_PROCEED); 2690 __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits. 2691 __ notr(ch2, ch2); 2692 __ andr(match_mask, match_mask, ch2); 2693 __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check 2694 __ beqz(match_mask, NOMATCH); 2695 2696 __ bind(L_SMALL_HAS_ZERO_LOOP); 2697 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, ch2, tmp); // count trailing zeros 2698 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2699 __ mv(ch2, wordSize / haystack_chr_size); 2700 __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2); 2701 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2702 __ mv(trailing_zeros, wordSize / haystack_chr_size); 2703 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2704 2705 __ bind(L_SMALL_CMP_LOOP); 2706 __ shadd(first, trailing_zeros, needle, first, needle_chr_shift); 2707 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift); 2708 needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first)); 2709 haystack_isL ? 
__ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2)); 2710 __ add(trailing_zeros, trailing_zeros, 1); 2711 __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP); 2712 __ beq(first, ch2, L_SMALL_CMP_LOOP); 2713 2714 __ bind(L_SMALL_CMP_LOOP_NOMATCH); 2715 __ beqz(match_mask, NOMATCH); 2716 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2); 2717 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2718 __ add(result, result, 1); 2719 __ add(haystack, haystack, haystack_chr_size); 2720 __ j(L_SMALL_HAS_ZERO_LOOP); 2721 2722 __ align(OptoLoopAlignment); 2723 __ bind(L_SMALL_CMP_LOOP_LAST_CMP); 2724 __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2725 __ j(DONE); 2726 2727 __ align(OptoLoopAlignment); 2728 __ bind(L_SMALL_CMP_LOOP_LAST_CMP2); 2729 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2730 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2731 __ j(DONE); 2732 2733 __ align(OptoLoopAlignment); 2734 __ bind(L_HAS_ZERO); 2735 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2); 2736 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2737 __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2); 2738 __ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits) 2739 __ sub(result, result, 1); // array index from 0, so result -= 1 2740 2741 __ bind(L_HAS_ZERO_LOOP); 2742 __ mv(needle_len, wordSize / haystack_chr_size); 2743 __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2); 2744 __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2); 2745 // load next 8 bytes from haystack, and increase result index 2746 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2747 __ add(result, result, 1); 2748 __ mv(trailing_zeros, wordSize / haystack_chr_size); 2749 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH); 2750 2751 // compare one char 2752 __ bind(L_CMP_LOOP); 2753 __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift); 2754 needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len)); 2755 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift); 2756 haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2)); 2757 __ add(trailing_zeros, trailing_zeros, 1); // next char index 2758 __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2); 2759 __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP); 2760 __ beq(needle_len, ch2, L_CMP_LOOP); 2761 2762 __ bind(L_CMP_LOOP_NOMATCH); 2763 __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH); 2764 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, needle_len, ch2); // find next "first" char index 2765 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2766 __ add(haystack, haystack, haystack_chr_size); 2767 __ j(L_HAS_ZERO_LOOP); 2768 2769 __ align(OptoLoopAlignment); 2770 __ bind(L_CMP_LOOP_LAST_CMP); 2771 __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH); 2772 __ j(DONE); 2773 2774 __ align(OptoLoopAlignment); 2775 __ bind(L_CMP_LOOP_LAST_CMP2); 2776 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2777 __ add(result, result, 1); 2778 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH); 2779 __ j(DONE); 2780 2781 __ align(OptoLoopAlignment); 2782 __ bind(L_HAS_ZERO_LOOP_NOMATCH); 2783 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 2784 // L_HAS_ZERO block. 
Byte octet was analyzed in L_HAS_ZERO_LOOP, 2785 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 2786 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 2787 // result by analyzed characters value, so, we can just reset lower bits 2788 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 2789 // 2) restore needle_len and haystack_len values from "compressed" haystack_len 2790 // 3) advance haystack value to represent next haystack octet. result & 7/3 is 2791 // index of last analyzed substring inside current octet. So, haystack in at 2792 // respective start address. We need to advance it to next octet 2793 __ andi(match_mask, result, wordSize / haystack_chr_size - 1); 2794 __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2); 2795 __ andi(result, result, haystack_isL ? -8 : -4); 2796 __ slli(tmp, match_mask, haystack_chr_shift); 2797 __ sub(haystack, haystack, tmp); 2798 __ sign_extend(haystack_len, haystack_len, 32); 2799 __ j(L_LOOP_PROCEED); 2800 2801 __ align(OptoLoopAlignment); 2802 __ bind(NOMATCH); 2803 __ mv(result, -1); 2804 2805 __ bind(DONE); 2806 __ pop_reg(spilled_regs, sp); 2807 __ ret(); 2808 return entry; 2809 } 2810 2811 void generate_string_indexof_stubs() 2812 { 2813 StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 2814 StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 2815 StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 2816 } 2817 2818 #ifdef COMPILER2 2819 address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) { 2820 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table"); 2821 2822 address start = __ pc(); 2823 const Register 2824 r_super_klass = x10, 2825 r_array_base = x11, 2826 r_array_length = x12, 2827 r_array_index = x13, 2828 r_sub_klass = x14, 2829 result = x15, 2830 r_bitmap = x16; 2831 2832 Label L_success; 2833 __ enter(); 2834 __ lookup_secondary_supers_table(r_sub_klass, r_super_klass, result, 2835 r_array_base, r_array_length, r_array_index, 2836 r_bitmap, super_klass_index, /*stub_is_near*/true); 2837 __ leave(); 2838 __ ret(); 2839 2840 return start; 2841 } 2842 2843 // Slow path implementation for UseSecondarySupersTable. 
2844 address generate_lookup_secondary_supers_table_slow_path_stub() { 2845 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path"); 2846 2847 address start = __ pc(); 2848 const Register 2849 r_super_klass = x10, // argument 2850 r_array_base = x11, // argument 2851 temp1 = x12, // tmp 2852 r_array_index = x13, // argument 2853 result = x15, // argument 2854 r_bitmap = x16; // argument 2855 2856 2857 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1); 2858 __ ret(); 2859 2860 return start; 2861 } 2862 2863 address generate_mulAdd() 2864 { 2865 __ align(CodeEntryAlignment); 2866 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 2867 2868 address entry = __ pc(); 2869 2870 const Register out = x10; 2871 const Register in = x11; 2872 const Register offset = x12; 2873 const Register len = x13; 2874 const Register k = x14; 2875 const Register tmp = x28; 2876 2877 BLOCK_COMMENT("Entry:"); 2878 __ enter(); 2879 __ mul_add(out, in, offset, len, k, tmp); 2880 __ leave(); 2881 __ ret(); 2882 2883 return entry; 2884 } 2885 2886 /** 2887 * Arguments: 2888 * 2889 * Input: 2890 * c_rarg0 - x address 2891 * c_rarg1 - x length 2892 * c_rarg2 - y address 2893 * c_rarg3 - y length 2894 * c_rarg4 - z address 2895 */ 2896 address generate_multiplyToLen() 2897 { 2898 __ align(CodeEntryAlignment); 2899 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 2900 address entry = __ pc(); 2901 2902 const Register x = x10; 2903 const Register xlen = x11; 2904 const Register y = x12; 2905 const Register ylen = x13; 2906 const Register z = x14; 2907 2908 const Register tmp0 = x15; 2909 const Register tmp1 = x16; 2910 const Register tmp2 = x17; 2911 const Register tmp3 = x7; 2912 const Register tmp4 = x28; 2913 const Register tmp5 = x29; 2914 const Register tmp6 = x30; 2915 const Register tmp7 = x31; 2916 2917 BLOCK_COMMENT("Entry:"); 2918 __ enter(); // required for proper stackwalking of RuntimeStub frame 2919 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 2920 __ leave(); // required for proper stackwalking of RuntimeStub frame 2921 __ ret(); 2922 2923 return entry; 2924 } 2925 2926 address generate_squareToLen() 2927 { 2928 __ align(CodeEntryAlignment); 2929 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 2930 address entry = __ pc(); 2931 2932 const Register x = x10; 2933 const Register xlen = x11; 2934 const Register z = x12; 2935 const Register y = x14; // == x 2936 const Register ylen = x15; // == xlen 2937 2938 const Register tmp0 = x13; // zlen, unused 2939 const Register tmp1 = x16; 2940 const Register tmp2 = x17; 2941 const Register tmp3 = x7; 2942 const Register tmp4 = x28; 2943 const Register tmp5 = x29; 2944 const Register tmp6 = x30; 2945 const Register tmp7 = x31; 2946 2947 BLOCK_COMMENT("Entry:"); 2948 __ enter(); 2949 __ mv(y, x); 2950 __ mv(ylen, xlen); 2951 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 2952 __ leave(); 2953 __ ret(); 2954 2955 return entry; 2956 } 2957 2958 // Arguments: 2959 // 2960 // Input: 2961 // c_rarg0 - newArr address 2962 // c_rarg1 - oldArr address 2963 // c_rarg2 - newIdx 2964 // c_rarg3 - shiftCount 2965 // c_rarg4 - numIter 2966 // 2967 address generate_bigIntegerLeftShift() { 2968 __ align(CodeEntryAlignment); 2969 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 2970 address entry = __ pc(); 2971 2972 Label loop, exit; 2973 2974 Register newArr = c_rarg0; 2975 
Register oldArr = c_rarg1; 2976 Register newIdx = c_rarg2; 2977 Register shiftCount = c_rarg3; 2978 Register numIter = c_rarg4; 2979 2980 Register shiftRevCount = c_rarg5; 2981 Register oldArrNext = t1; 2982 2983 __ beqz(numIter, exit); 2984 __ shadd(newArr, newIdx, newArr, t0, 2); 2985 2986 __ mv(shiftRevCount, 32); 2987 __ sub(shiftRevCount, shiftRevCount, shiftCount); 2988 2989 __ bind(loop); 2990 __ addi(oldArrNext, oldArr, 4); 2991 __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4); 2992 __ vle32_v(v0, oldArr); 2993 __ vle32_v(v4, oldArrNext); 2994 __ vsll_vx(v0, v0, shiftCount); 2995 __ vsrl_vx(v4, v4, shiftRevCount); 2996 __ vor_vv(v0, v0, v4); 2997 __ vse32_v(v0, newArr); 2998 __ sub(numIter, numIter, t0); 2999 __ shadd(oldArr, t0, oldArr, t1, 2); 3000 __ shadd(newArr, t0, newArr, t1, 2); 3001 __ bnez(numIter, loop); 3002 3003 __ bind(exit); 3004 __ ret(); 3005 3006 return entry; 3007 } 3008 3009 // Arguments: 3010 // 3011 // Input: 3012 // c_rarg0 - newArr address 3013 // c_rarg1 - oldArr address 3014 // c_rarg2 - newIdx 3015 // c_rarg3 - shiftCount 3016 // c_rarg4 - numIter 3017 // 3018 address generate_bigIntegerRightShift() { 3019 __ align(CodeEntryAlignment); 3020 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 3021 address entry = __ pc(); 3022 3023 Label loop, exit; 3024 3025 Register newArr = c_rarg0; 3026 Register oldArr = c_rarg1; 3027 Register newIdx = c_rarg2; 3028 Register shiftCount = c_rarg3; 3029 Register numIter = c_rarg4; 3030 Register idx = numIter; 3031 3032 Register shiftRevCount = c_rarg5; 3033 Register oldArrNext = c_rarg6; 3034 Register newArrCur = t0; 3035 Register oldArrCur = t1; 3036 3037 __ beqz(idx, exit); 3038 __ shadd(newArr, newIdx, newArr, t0, 2); 3039 3040 __ mv(shiftRevCount, 32); 3041 __ sub(shiftRevCount, shiftRevCount, shiftCount); 3042 3043 __ bind(loop); 3044 __ vsetvli(t0, idx, Assembler::e32, Assembler::m4); 3045 __ sub(idx, idx, t0); 3046 __ shadd(oldArrNext, idx, oldArr, t1, 2); 3047 __ shadd(newArrCur, idx, newArr, t1, 2); 3048 __ addi(oldArrCur, oldArrNext, 4); 3049 __ vle32_v(v0, oldArrCur); 3050 __ vle32_v(v4, oldArrNext); 3051 __ vsrl_vx(v0, v0, shiftCount); 3052 __ vsll_vx(v4, v4, shiftRevCount); 3053 __ vor_vv(v0, v0, v4); 3054 __ vse32_v(v0, newArrCur); 3055 __ bnez(idx, loop); 3056 3057 __ bind(exit); 3058 __ ret(); 3059 3060 return entry; 3061 } 3062 #endif 3063 3064 #ifdef COMPILER2 3065 class MontgomeryMultiplyGenerator : public MacroAssembler { 3066 3067 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 3068 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj; 3069 3070 RegSet _toSave; 3071 bool _squaring; 3072 3073 public: 3074 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 3075 : MacroAssembler(as->code()), _squaring(squaring) { 3076 3077 // Register allocation 3078 3079 RegSetIterator<Register> regs = RegSet::range(x10, x26).begin(); 3080 Pa_base = *regs; // Argument registers 3081 if (squaring) { 3082 Pb_base = Pa_base; 3083 } else { 3084 Pb_base = *++regs; 3085 } 3086 Pn_base = *++regs; 3087 Rlen= *++regs; 3088 inv = *++regs; 3089 Pm_base = *++regs; 3090 3091 // Working registers: 3092 Ra = *++regs; // The current digit of a, b, n, and m. 3093 Rb = *++regs; 3094 Rm = *++regs; 3095 Rn = *++regs; 3096 3097 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 3098 Pb = *++regs; 3099 Pm = *++regs; 3100 Pn = *++regs; 3101 3102 tmp0 = *++regs; // Three registers which form a 3103 tmp1 = *++regs; // triple-precision accumuator. 
3104 tmp2 = *++regs; 3105 3106 Ri = x6; // Inner and outer loop indexes. 3107 Rj = x7; 3108 3109 Rhi_ab = x28; // Product registers: low and high parts 3110 Rlo_ab = x29; // of a*b and m*n. 3111 Rhi_mn = x30; 3112 Rlo_mn = x31; 3113 3114 // x18 and up are callee-saved. 3115 _toSave = RegSet::range(x18, *regs) + Pm_base; 3116 } 3117 3118 private: 3119 void save_regs() { 3120 push_reg(_toSave, sp); 3121 } 3122 3123 void restore_regs() { 3124 pop_reg(_toSave, sp); 3125 } 3126 3127 template <typename T> 3128 void unroll_2(Register count, T block) { 3129 Label loop, end, odd; 3130 beqz(count, end); 3131 test_bit(t0, count, 0); 3132 bnez(t0, odd); 3133 align(16); 3134 bind(loop); 3135 (this->*block)(); 3136 bind(odd); 3137 (this->*block)(); 3138 addi(count, count, -2); 3139 bgtz(count, loop); 3140 bind(end); 3141 } 3142 3143 template <typename T> 3144 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 3145 Label loop, end, odd; 3146 beqz(count, end); 3147 test_bit(tmp, count, 0); 3148 bnez(tmp, odd); 3149 align(16); 3150 bind(loop); 3151 (this->*block)(d, s, tmp); 3152 bind(odd); 3153 (this->*block)(d, s, tmp); 3154 addi(count, count, -2); 3155 bgtz(count, loop); 3156 bind(end); 3157 } 3158 3159 void pre1(RegisterOrConstant i) { 3160 block_comment("pre1"); 3161 // Pa = Pa_base; 3162 // Pb = Pb_base + i; 3163 // Pm = Pm_base; 3164 // Pn = Pn_base + i; 3165 // Ra = *Pa; 3166 // Rb = *Pb; 3167 // Rm = *Pm; 3168 // Rn = *Pn; 3169 if (i.is_register()) { 3170 slli(t0, i.as_register(), LogBytesPerWord); 3171 } else { 3172 mv(t0, i.as_constant()); 3173 slli(t0, t0, LogBytesPerWord); 3174 } 3175 3176 mv(Pa, Pa_base); 3177 add(Pb, Pb_base, t0); 3178 mv(Pm, Pm_base); 3179 add(Pn, Pn_base, t0); 3180 3181 ld(Ra, Address(Pa)); 3182 ld(Rb, Address(Pb)); 3183 ld(Rm, Address(Pm)); 3184 ld(Rn, Address(Pn)); 3185 3186 // Zero the m*n result. 3187 mv(Rhi_mn, zr); 3188 mv(Rlo_mn, zr); 3189 } 3190 3191 // The core multiply-accumulate step of a Montgomery 3192 // multiplication. The idea is to schedule operations as a 3193 // pipeline so that instructions with long latencies (loads and 3194 // multiplies) have time to complete before their results are 3195 // used. This most benefits in-order implementations of the 3196 // architecture but out-of-order ones also benefit. 3197 void step() { 3198 block_comment("step"); 3199 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3200 // Ra = *++Pa; 3201 // Rb = *--Pb; 3202 mulhu(Rhi_ab, Ra, Rb); 3203 mul(Rlo_ab, Ra, Rb); 3204 addi(Pa, Pa, wordSize); 3205 ld(Ra, Address(Pa)); 3206 addi(Pb, Pb, -wordSize); 3207 ld(Rb, Address(Pb)); 3208 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the 3209 // previous iteration. 
3210 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3211 // Rm = *++Pm; 3212 // Rn = *--Pn; 3213 mulhu(Rhi_mn, Rm, Rn); 3214 mul(Rlo_mn, Rm, Rn); 3215 addi(Pm, Pm, wordSize); 3216 ld(Rm, Address(Pm)); 3217 addi(Pn, Pn, -wordSize); 3218 ld(Rn, Address(Pn)); 3219 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3220 } 3221 3222 void post1() { 3223 block_comment("post1"); 3224 3225 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3226 // Ra = *++Pa; 3227 // Rb = *--Pb; 3228 mulhu(Rhi_ab, Ra, Rb); 3229 mul(Rlo_ab, Ra, Rb); 3230 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3231 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3232 3233 // *Pm = Rm = tmp0 * inv; 3234 mul(Rm, tmp0, inv); 3235 sd(Rm, Address(Pm)); 3236 3237 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3238 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3239 mulhu(Rhi_mn, Rm, Rn); 3240 3241 #ifndef PRODUCT 3242 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply"); 3243 { 3244 mul(Rlo_mn, Rm, Rn); 3245 add(Rlo_mn, tmp0, Rlo_mn); 3246 Label ok; 3247 beqz(Rlo_mn, ok); 3248 stop("broken Montgomery multiply"); 3249 bind(ok); 3250 } 3251 #endif 3252 // We have very carefully set things up so that 3253 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate 3254 // the lower half of Rm * Rn because we know the result already: 3255 // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff 3256 // tmp0 != 0. So, rather than do a mul and an cad we just set 3257 // the carry flag iff tmp0 is nonzero. 3258 // 3259 // mul(Rlo_mn, Rm, Rn); 3260 // cad(zr, tmp0, Rlo_mn); 3261 addi(t0, tmp0, -1); 3262 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero 3263 cadc(tmp0, tmp1, Rhi_mn, t0); 3264 adc(tmp1, tmp2, zr, t0); 3265 mv(tmp2, zr); 3266 } 3267 3268 void pre2(Register i, Register len) { 3269 block_comment("pre2"); 3270 // Pa = Pa_base + i-len; 3271 // Pb = Pb_base + len; 3272 // Pm = Pm_base + i-len; 3273 // Pn = Pn_base + len; 3274 3275 sub(Rj, i, len); 3276 // Rj == i-len 3277 3278 // Ra as temp register 3279 slli(Ra, Rj, LogBytesPerWord); 3280 add(Pa, Pa_base, Ra); 3281 add(Pm, Pm_base, Ra); 3282 slli(Ra, len, LogBytesPerWord); 3283 add(Pb, Pb_base, Ra); 3284 add(Pn, Pn_base, Ra); 3285 3286 // Ra = *++Pa; 3287 // Rb = *--Pb; 3288 // Rm = *++Pm; 3289 // Rn = *--Pn; 3290 add(Pa, Pa, wordSize); 3291 ld(Ra, Address(Pa)); 3292 add(Pb, Pb, -wordSize); 3293 ld(Rb, Address(Pb)); 3294 add(Pm, Pm, wordSize); 3295 ld(Rm, Address(Pm)); 3296 add(Pn, Pn, -wordSize); 3297 ld(Rn, Address(Pn)); 3298 3299 mv(Rhi_mn, zr); 3300 mv(Rlo_mn, zr); 3301 } 3302 3303 void post2(Register i, Register len) { 3304 block_comment("post2"); 3305 sub(Rj, i, len); 3306 3307 cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part 3308 3309 // As soon as we know the least significant digit of our result, 3310 // store it. 3311 // Pm_base[i-len] = tmp0; 3312 // Rj as temp register 3313 slli(Rj, Rj, LogBytesPerWord); 3314 add(Rj, Pm_base, Rj); 3315 sd(tmp0, Address(Rj)); 3316 3317 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3318 cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part 3319 adc(tmp1, tmp2, zr, t0); 3320 mv(tmp2, zr); 3321 } 3322 3323 // A carry in tmp0 after Montgomery multiplication means that we 3324 // should subtract multiples of n from our result in m. We'll 3325 // keep doing that until there is no carry. 
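//
// Reference sketch (illustration only; names are invented here, the real loop
// is the hand-scheduled assembly in normalize() below). One pass of that
// subtraction in plain C:
//
//   // m -= n over 'len' 64-bit words; returns the updated carry
//   static unsigned long long sub_once(unsigned long long m[],
//                                      const unsigned long long n[],
//                                      unsigned long long carry, long len) {
//     unsigned long long borrow = 0;
//     for (long i = 0; i < len; i++) {
//       unsigned long long x = m[i];
//       unsigned long long y = n[i] + borrow;   // borrow is 0 or 1
//       borrow = (y < borrow) || (x < y);       // wrap-around => borrow out
//       m[i] = x - y;
//     }
//     return carry - borrow;
//   }
//   // normalize(): while (carry != 0) carry = sub_once(m, n, carry, len);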
3326 void normalize(Register len) { 3327 block_comment("normalize"); 3328 // while (tmp0) 3329 // tmp0 = sub(Pm_base, Pn_base, tmp0, len); 3330 Label loop, post, again; 3331 Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now 3332 beqz(tmp0, post); { 3333 bind(again); { 3334 mv(i, zr); 3335 mv(cnt, len); 3336 slli(Rn, i, LogBytesPerWord); 3337 add(Rm, Pm_base, Rn); 3338 ld(Rm, Address(Rm)); 3339 add(Rn, Pn_base, Rn); 3340 ld(Rn, Address(Rn)); 3341 mv(t0, 1); // set carry flag, i.e. no borrow 3342 align(16); 3343 bind(loop); { 3344 notr(Rn, Rn); 3345 add(Rm, Rm, t0); 3346 add(Rm, Rm, Rn); 3347 sltu(t0, Rm, Rn); 3348 slli(Rn, i, LogBytesPerWord); // Rn as temp register 3349 add(Rn, Pm_base, Rn); 3350 sd(Rm, Address(Rn)); 3351 add(i, i, 1); 3352 slli(Rn, i, LogBytesPerWord); 3353 add(Rm, Pm_base, Rn); 3354 ld(Rm, Address(Rm)); 3355 add(Rn, Pn_base, Rn); 3356 ld(Rn, Address(Rn)); 3357 sub(cnt, cnt, 1); 3358 } bnez(cnt, loop); 3359 addi(tmp0, tmp0, -1); 3360 add(tmp0, tmp0, t0); 3361 } bnez(tmp0, again); 3362 } bind(post); 3363 } 3364 3365 // Move memory at s to d, reversing words. 3366 // Increments d to end of copied memory 3367 // Destroys tmp1, tmp2 3368 // Preserves len 3369 // Leaves s pointing to the address which was in d at start 3370 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 3371 assert(tmp1->encoding() < x28->encoding(), "register corruption"); 3372 assert(tmp2->encoding() < x28->encoding(), "register corruption"); 3373 3374 shadd(s, len, s, tmp1, LogBytesPerWord); 3375 mv(tmp1, len); 3376 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 3377 slli(tmp1, len, LogBytesPerWord); 3378 sub(s, d, tmp1); 3379 } 3380 // [63...0] -> [31...0][63...32] 3381 void reverse1(Register d, Register s, Register tmp) { 3382 addi(s, s, -wordSize); 3383 ld(tmp, Address(s)); 3384 ror_imm(tmp, tmp, 32, t0); 3385 sd(tmp, Address(d)); 3386 addi(d, d, wordSize); 3387 } 3388 3389 void step_squaring() { 3390 // An extra ACC 3391 step(); 3392 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3393 } 3394 3395 void last_squaring(Register i) { 3396 Label dont; 3397 // if ((i & 1) == 0) { 3398 test_bit(t0, i, 0); 3399 bnez(t0, dont); { 3400 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3401 // Ra = *++Pa; 3402 // Rb = *--Pb; 3403 mulhu(Rhi_ab, Ra, Rb); 3404 mul(Rlo_ab, Ra, Rb); 3405 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3406 } bind(dont); 3407 } 3408 3409 void extra_step_squaring() { 3410 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3411 3412 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3413 // Rm = *++Pm; 3414 // Rn = *--Pn; 3415 mulhu(Rhi_mn, Rm, Rn); 3416 mul(Rlo_mn, Rm, Rn); 3417 addi(Pm, Pm, wordSize); 3418 ld(Rm, Address(Pm)); 3419 addi(Pn, Pn, -wordSize); 3420 ld(Rn, Address(Pn)); 3421 } 3422 3423 void post1_squaring() { 3424 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3425 3426 // *Pm = Rm = tmp0 * inv; 3427 mul(Rm, tmp0, inv); 3428 sd(Rm, Address(Pm)); 3429 3430 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3431 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3432 mulhu(Rhi_mn, Rm, Rn); 3433 3434 #ifndef PRODUCT 3435 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply"); 3436 { 3437 mul(Rlo_mn, Rm, Rn); 3438 add(Rlo_mn, tmp0, Rlo_mn); 3439 Label ok; 3440 beqz(Rlo_mn, ok); { 3441 stop("broken Montgomery multiply"); 3442 } bind(ok); 3443 } 3444 #endif 3445 // We have very carefully set things up so that 3446 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate 3447 // the lower half of Rm * Rn because we know the result already: 
3448 // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff 3449 // tmp0 != 0. So, rather than do a mul and a cad we just set 3450 // the carry flag iff tmp0 is nonzero. 3451 // 3452 // mul(Rlo_mn, Rm, Rn); 3453 // cad(zr, tmp, Rlo_mn); 3454 addi(t0, tmp0, -1); 3455 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero 3456 cadc(tmp0, tmp1, Rhi_mn, t0); 3457 adc(tmp1, tmp2, zr, t0); 3458 mv(tmp2, zr); 3459 } 3460 3461 // use t0 as carry 3462 void acc(Register Rhi, Register Rlo, 3463 Register tmp0, Register tmp1, Register tmp2) { 3464 cad(tmp0, tmp0, Rlo, t0); 3465 cadc(tmp1, tmp1, Rhi, t0); 3466 adc(tmp2, tmp2, zr, t0); 3467 } 3468 3469 public: 3470 /** 3471 * Fast Montgomery multiplication. The derivation of the 3472 * algorithm is in A Cryptographic Library for the Motorola 3473 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3474 * 3475 * Arguments: 3476 * 3477 * Inputs for multiplication: 3478 * c_rarg0 - int array elements a 3479 * c_rarg1 - int array elements b 3480 * c_rarg2 - int array elements n (the modulus) 3481 * c_rarg3 - int length 3482 * c_rarg4 - int inv 3483 * c_rarg5 - int array elements m (the result) 3484 * 3485 * Inputs for squaring: 3486 * c_rarg0 - int array elements a 3487 * c_rarg1 - int array elements n (the modulus) 3488 * c_rarg2 - int length 3489 * c_rarg3 - int inv 3490 * c_rarg4 - int array elements m (the result) 3491 * 3492 */ 3493 address generate_multiply() { 3494 Label argh, nothing; 3495 bind(argh); 3496 stop("MontgomeryMultiply total_allocation must be <= 8192"); 3497 3498 align(CodeEntryAlignment); 3499 address entry = pc(); 3500 3501 beqz(Rlen, nothing); 3502 3503 enter(); 3504 3505 // Make room. 3506 mv(Ra, 512); 3507 bgt(Rlen, Ra, argh); 3508 slli(Ra, Rlen, exact_log2(4 * sizeof(jint))); 3509 sub(Ra, sp, Ra); 3510 andi(sp, Ra, -2 * wordSize); 3511 3512 srliw(Rlen, Rlen, 1); // length in longwords = len/2 3513 3514 { 3515 // Copy input args, reversing as we go. We use Ra as a 3516 // temporary variable. 3517 reverse(Ra, Pa_base, Rlen, Ri, Rj); 3518 if (!_squaring) 3519 reverse(Ra, Pb_base, Rlen, Ri, Rj); 3520 reverse(Ra, Pn_base, Rlen, Ri, Rj); 3521 } 3522 3523 // Push all call-saved registers and also Pm_base which we'll need 3524 // at the end. 
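// (The caller's Pm_base is needed again after restore_regs(): the product is
// accumulated in a scratch area on the stack, and is only copied back into
// the caller's m[] by the final reverse() once the saved Pm_base has been
// restored.)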
3525 save_regs(); 3526 3527 #ifndef PRODUCT 3528 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 3529 { 3530 ld(Rn, Address(Pn_base)); 3531 mul(Rlo_mn, Rn, inv); 3532 mv(t0, -1); 3533 Label ok; 3534 beq(Rlo_mn, t0, ok); 3535 stop("broken inverse in Montgomery multiply"); 3536 bind(ok); 3537 } 3538 #endif 3539 3540 mv(Pm_base, Ra); 3541 3542 mv(tmp0, zr); 3543 mv(tmp1, zr); 3544 mv(tmp2, zr); 3545 3546 block_comment("for (int i = 0; i < len; i++) {"); 3547 mv(Ri, zr); { 3548 Label loop, end; 3549 bge(Ri, Rlen, end); 3550 3551 bind(loop); 3552 pre1(Ri); 3553 3554 block_comment(" for (j = i; j; j--) {"); { 3555 mv(Rj, Ri); 3556 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3557 } block_comment(" } // j"); 3558 3559 post1(); 3560 addw(Ri, Ri, 1); 3561 blt(Ri, Rlen, loop); 3562 bind(end); 3563 block_comment("} // i"); 3564 } 3565 3566 block_comment("for (int i = len; i < 2*len; i++) {"); 3567 mv(Ri, Rlen); { 3568 Label loop, end; 3569 slli(t0, Rlen, 1); 3570 bge(Ri, t0, end); 3571 3572 bind(loop); 3573 pre2(Ri, Rlen); 3574 3575 block_comment(" for (j = len*2-i-1; j; j--) {"); { 3576 slliw(Rj, Rlen, 1); 3577 subw(Rj, Rj, Ri); 3578 subw(Rj, Rj, 1); 3579 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3580 } block_comment(" } // j"); 3581 3582 post2(Ri, Rlen); 3583 addw(Ri, Ri, 1); 3584 slli(t0, Rlen, 1); 3585 blt(Ri, t0, loop); 3586 bind(end); 3587 } 3588 block_comment("} // i"); 3589 3590 normalize(Rlen); 3591 3592 mv(Ra, Pm_base); // Save Pm_base in Ra 3593 restore_regs(); // Restore caller's Pm_base 3594 3595 // Copy our result into caller's Pm_base 3596 reverse(Pm_base, Ra, Rlen, Ri, Rj); 3597 3598 leave(); 3599 bind(nothing); 3600 ret(); 3601 3602 return entry; 3603 } 3604 3605 /** 3606 * 3607 * Arguments: 3608 * 3609 * Inputs: 3610 * c_rarg0 - int array elements a 3611 * c_rarg1 - int array elements n (the modulus) 3612 * c_rarg2 - int length 3613 * c_rarg3 - int inv 3614 * c_rarg4 - int array elements m (the result) 3615 * 3616 */ 3617 address generate_square() { 3618 Label argh; 3619 bind(argh); 3620 stop("MontgomeryMultiply total_allocation must be <= 8192"); 3621 3622 align(CodeEntryAlignment); 3623 address entry = pc(); 3624 3625 enter(); 3626 3627 // Make room. 3628 mv(Ra, 512); 3629 bgt(Rlen, Ra, argh); 3630 slli(Ra, Rlen, exact_log2(4 * sizeof(jint))); 3631 sub(Ra, sp, Ra); 3632 andi(sp, Ra, -2 * wordSize); 3633 3634 srliw(Rlen, Rlen, 1); // length in longwords = len/2 3635 3636 { 3637 // Copy input args, reversing as we go. We use Ra as a 3638 // temporary variable. 3639 reverse(Ra, Pa_base, Rlen, Ri, Rj); 3640 reverse(Ra, Pn_base, Rlen, Ri, Rj); 3641 } 3642 3643 // Push all call-saved registers and also Pm_base which we'll need 3644 // at the end. 
3645 save_regs(); 3646 3647 mv(Pm_base, Ra); 3648 3649 mv(tmp0, zr); 3650 mv(tmp1, zr); 3651 mv(tmp2, zr); 3652 3653 block_comment("for (int i = 0; i < len; i++) {"); 3654 mv(Ri, zr); { 3655 Label loop, end; 3656 bind(loop); 3657 bge(Ri, Rlen, end); 3658 3659 pre1(Ri); 3660 3661 block_comment("for (j = (i+1)/2; j; j--) {"); { 3662 addi(Rj, Ri, 1); 3663 srliw(Rj, Rj, 1); 3664 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 3665 } block_comment(" } // j"); 3666 3667 last_squaring(Ri); 3668 3669 block_comment(" for (j = i/2; j; j--) {"); { 3670 srliw(Rj, Ri, 1); 3671 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 3672 } block_comment(" } // j"); 3673 3674 post1_squaring(); 3675 addi(Ri, Ri, 1); 3676 blt(Ri, Rlen, loop); 3677 3678 bind(end); 3679 block_comment("} // i"); 3680 } 3681 3682 block_comment("for (int i = len; i < 2*len; i++) {"); 3683 mv(Ri, Rlen); { 3684 Label loop, end; 3685 bind(loop); 3686 slli(t0, Rlen, 1); 3687 bge(Ri, t0, end); 3688 3689 pre2(Ri, Rlen); 3690 3691 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 3692 slli(Rj, Rlen, 1); 3693 sub(Rj, Rj, Ri); 3694 sub(Rj, Rj, 1); 3695 srliw(Rj, Rj, 1); 3696 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 3697 } block_comment(" } // j"); 3698 3699 last_squaring(Ri); 3700 3701 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 3702 slli(Rj, Rlen, 1); 3703 sub(Rj, Rj, Ri); 3704 srliw(Rj, Rj, 1); 3705 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 3706 } block_comment(" } // j"); 3707 3708 post2(Ri, Rlen); 3709 addi(Ri, Ri, 1); 3710 slli(t0, Rlen, 1); 3711 blt(Ri, t0, loop); 3712 3713 bind(end); 3714 block_comment("} // i"); 3715 } 3716 3717 normalize(Rlen); 3718 3719 mv(Ra, Pm_base); // Save Pm_base in Ra 3720 restore_regs(); // Restore caller's Pm_base 3721 3722 // Copy our result into caller's Pm_base 3723 reverse(Pm_base, Ra, Rlen, Ri, Rj); 3724 3725 leave(); 3726 ret(); 3727 3728 return entry; 3729 } 3730 }; 3731 3732 #endif // COMPILER2 3733 3734 address generate_cont_thaw(Continuation::thaw_kind kind) { 3735 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 3736 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 3737 3738 address start = __ pc(); 3739 3740 if (return_barrier) { 3741 __ ld(sp, Address(xthread, JavaThread::cont_entry_offset())); 3742 } 3743 3744 #ifndef PRODUCT 3745 { 3746 Label OK; 3747 __ ld(t0, Address(xthread, JavaThread::cont_entry_offset())); 3748 __ beq(sp, t0, OK); 3749 __ stop("incorrect sp"); 3750 __ bind(OK); 3751 } 3752 #endif 3753 3754 if (return_barrier) { 3755 // preserve possible return value from a method returning to the return barrier 3756 __ sub(sp, sp, 2 * wordSize); 3757 __ fsd(f10, Address(sp, 0 * wordSize)); 3758 __ sd(x10, Address(sp, 1 * wordSize)); 3759 } 3760 3761 __ mv(c_rarg1, (return_barrier ? 
1 : 0)); 3762 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1); 3763 __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames 3764 3765 if (return_barrier) { 3766 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 3767 __ ld(x10, Address(sp, 1 * wordSize)); 3768 __ fld(f10, Address(sp, 0 * wordSize)); 3769 __ add(sp, sp, 2 * wordSize); 3770 } 3771 3772 #ifndef PRODUCT 3773 { 3774 Label OK; 3775 __ ld(t0, Address(xthread, JavaThread::cont_entry_offset())); 3776 __ beq(sp, t0, OK); 3777 __ stop("incorrect sp"); 3778 __ bind(OK); 3779 } 3780 #endif 3781 3782 Label thaw_success; 3783 // t1 contains the size of the frames to thaw, 0 if overflow or no more frames 3784 __ bnez(t1, thaw_success); 3785 __ la(t0, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 3786 __ jr(t0); 3787 __ bind(thaw_success); 3788 3789 // make room for the thawed frames 3790 __ sub(t0, sp, t1); 3791 __ andi(sp, t0, -16); // align 3792 3793 if (return_barrier) { 3794 // save original return value -- again 3795 __ sub(sp, sp, 2 * wordSize); 3796 __ fsd(f10, Address(sp, 0 * wordSize)); 3797 __ sd(x10, Address(sp, 1 * wordSize)); 3798 } 3799 3800 // If we want, we can templatize thaw by kind, and have three different entries 3801 __ mv(c_rarg1, kind); 3802 3803 __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1); 3804 __ mv(t1, x10); // x10 is the sp of the yielding frame 3805 3806 if (return_barrier) { 3807 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 3808 __ ld(x10, Address(sp, 1 * wordSize)); 3809 __ fld(f10, Address(sp, 0 * wordSize)); 3810 __ add(sp, sp, 2 * wordSize); 3811 } else { 3812 __ mv(x10, zr); // return 0 (success) from doYield 3813 } 3814 3815 // we're now on the yield frame (which is in an address above us b/c sp has been pushed down) 3816 __ mv(fp, t1); 3817 __ sub(sp, t1, 2 * wordSize); // now pointing to fp spill 3818 3819 if (return_barrier_exception) { 3820 __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address 3821 __ verify_oop(x10); 3822 __ mv(x9, x10); // save return value contaning the exception oop in callee-saved x9 3823 3824 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1); 3825 3826 // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc 3827 3828 __ mv(x11, x10); // the exception handler 3829 __ mv(x10, x9); // restore return value contaning the exception oop 3830 __ verify_oop(x10); 3831 3832 __ leave(); 3833 __ mv(x13, ra); 3834 __ jr(x11); // the exception handler 3835 } else { 3836 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 3837 __ leave(); 3838 __ ret(); 3839 } 3840 3841 return start; 3842 } 3843 3844 address generate_cont_thaw() { 3845 if (!Continuations::enabled()) return nullptr; 3846 3847 StubCodeMark mark(this, "StubRoutines", "Cont thaw"); 3848 address start = __ pc(); 3849 generate_cont_thaw(Continuation::thaw_top); 3850 return start; 3851 } 3852 3853 address generate_cont_returnBarrier() { 3854 if (!Continuations::enabled()) return nullptr; 3855 3856 // TODO: will probably need multiple return barriers depending on return type 3857 StubCodeMark mark(this, "StubRoutines", "cont return barrier"); 3858 address start = __ pc(); 3859 3860 generate_cont_thaw(Continuation::thaw_return_barrier); 3861 3862 return start; 3863 } 3864 3865 address 
generate_cont_returnBarrier_exception() { 3866 if (!Continuations::enabled()) return nullptr; 3867 3868 StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler"); 3869 address start = __ pc(); 3870 3871 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 3872 3873 return start; 3874 } 3875 3876 address generate_cont_preempt_stub() { 3877 if (!Continuations::enabled()) return nullptr; 3878 StubCodeMark mark(this, "StubRoutines","Continuation preempt stub"); 3879 address start = __ pc(); 3880 3881 __ reset_last_Java_frame(true); 3882 3883 // reset the flag 3884 __ sb(zr, Address(xthread, JavaThread::preempting_offset())); 3885 3886 // Set sp to enterSpecial frame and then remove it from the stack 3887 __ ld(sp, Address(xthread, JavaThread::cont_entry_offset())); 3888 3889 Label preemption_cancelled; 3890 __ lbu(t0, Address(xthread, JavaThread::preemption_cancelled_offset())); 3891 __ bnez(t0, preemption_cancelled); 3892 3893 // Remove enterSpecial frame from the stack and return to Continuation.run() 3894 SharedRuntime::continuation_enter_cleanup(_masm); 3895 __ leave(); 3896 __ ret(); 3897 3898 __ bind(preemption_cancelled); 3899 __ sb(zr, Address(xthread, JavaThread::preemption_cancelled_offset())); 3900 __ la(fp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size() + 2 * wordSize))); 3901 __ la(t0, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 3902 __ ld(t0, Address(t0)); 3903 __ jr(t0); 3904 3905 return start; 3906 } 3907 3908 #if COMPILER2_OR_JVMCI 3909 3910 #undef __ 3911 #define __ this-> 3912 3913 class Sha2Generator : public MacroAssembler { 3914 StubCodeGenerator* _cgen; 3915 public: 3916 Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {} 3917 address generate_sha256_implCompress(bool multi_block) { 3918 return generate_sha2_implCompress(Assembler::e32, multi_block); 3919 } 3920 address generate_sha512_implCompress(bool multi_block) { 3921 return generate_sha2_implCompress(Assembler::e64, multi_block); 3922 } 3923 private: 3924 3925 void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) { 3926 if (vset_sew == Assembler::e32) __ vle32_v(vr, sr); 3927 else __ vle64_v(vr, sr); 3928 } 3929 3930 void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) { 3931 if (vset_sew == Assembler::e32) __ vse32_v(vr, sr); 3932 else __ vse64_v(vr, sr); 3933 } 3934 3935 // Overview of the logic in each "quad round". 3936 // 3937 // The code below repeats 16/20 times the logic implementing four rounds 3938 // of the SHA-256/512 core loop as documented by NIST. 16/20 "quad rounds" 3939 // to implementing the 64/80 single rounds. 3940 // 3941 // // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0]) 3942 // // Output: 3943 // // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]} 3944 // vl1reXX.v vTmp1, ofs 3945 // 3946 // // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b) 3947 // addi ofs, ofs, 16/32 3948 // 3949 // // Add constants to message schedule words: 3950 // // Input 3951 // // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]} 3952 // // vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0]; 3953 // // Output 3954 // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} 3955 // vadd.vv vTmp0, vTmp1, vW0 3956 // 3957 // // 2 rounds of working variables updates. 
3958 // // vState1[t+4] <- vState1[t], vState0[t], vTmp0[t] 3959 // // Input: 3960 // // vState1 = {c[t],d[t],g[t],h[t]} " = vState1[t] " 3961 // // vState0 = {a[t],b[t],e[t],f[t]} 3962 // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} 3963 // // Output: 3964 // // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] " 3965 // // = {h[t+4],g[t+4],d[t+4],c[t+4]} " = vState1[t+4] " 3966 // vsha2cl.vv vState1, vState0, vTmp0 3967 // 3968 // // 2 rounds of working variables updates. 3969 // // vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t] 3970 // // Input 3971 // // vState0 = {a[t],b[t],e[t],f[t]} " = vState0[t] " 3972 // // = {h[t+2],g[t+2],d[t+2],c[t+2]} " = vState1[t+2] " 3973 // // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] " 3974 // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} 3975 // // Output: 3976 // // vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]} " = vState0[t+4] " 3977 // vsha2ch.vv vState0, vState1, vTmp0 3978 // 3979 // // Combine 2QW into 1QW 3980 // // 3981 // // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs 3982 // // vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3] 3983 // // and it can only take 3 vectors as inputs. Hence we need to combine 3984 // // vW1[0] and vW2[1..3] in a single vector. 3985 // // 3986 // // vmerge Vt4, Vt1, Vt2, V0 3987 // // Input 3988 // // V0 = mask // first word from vW2, 1..3 words from vW1 3989 // // vW2 = {Wt-8, Wt-7, Wt-6, Wt-5} 3990 // // vW1 = {Wt-12, Wt-11, Wt-10, Wt-9} 3991 // // Output 3992 // // Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5} 3993 // vmerge.vvm vTmp0, vW2, vW1, v0 3994 // 3995 // // Generate next Four Message Schedule Words (hence allowing for 4 more rounds) 3996 // // Input 3997 // // vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]} W[ 3: 0] 3998 // // vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]} W[15:12] 3999 // // vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]} W[11: 9,4] 4000 // // Output (next four message schedule words) 4001 // // vW0 = {W[t+19], W[t+18], W[t+17], W[t+16]} W[19:16] 4002 // vsha2ms.vv vW0, vTmp0, vW3 4003 // 4004 // BEFORE 4005 // vW0 - vW3 hold the message schedule words (initially the block words) 4006 // vW0 = W[ 3: 0] "oldest" 4007 // vW1 = W[ 7: 4] 4008 // vW2 = W[11: 8] 4009 // vW3 = W[15:12] "newest" 4010 // 4011 // vt6 - vt7 hold the working state variables 4012 // vState0 = {a[t],b[t],e[t],f[t]} // initially {H5,H4,H1,H0} 4013 // vState1 = {c[t],d[t],g[t],h[t]} // initially {H7,H6,H3,H2} 4014 // 4015 // AFTER 4016 // vW0 - vW3 hold the message schedule words (initially the block words) 4017 // vW1 = W[ 7: 4] "oldest" 4018 // vW2 = W[11: 8] 4019 // vW3 = W[15:12] 4020 // vW0 = W[19:16] "newest" 4021 // 4022 // vState0 and vState1 hold the working state variables 4023 // vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]} 4024 // vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]} 4025 // 4026 // The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round, 4027 // hence the uses of those vectors rotate in each round, and we get back to the 4028 // initial configuration every 4 quad-rounds. We could avoid those changes at 4029 // the cost of moving those vectors at the end of each quad-rounds. 
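// For orientation, a rough scalar equivalent of one quad-round in the NIST
// formulation (an illustrative sketch only: the vadd.vv above supplies
// W[r]+K[r], and the vsha2cl/vsha2ch pair performs the four working-variable
// updates; Sigma0/Sigma1/Ch/Maj are as defined in FIPS 180-4):
//
//   for (int r = t; r < t + 4; r++) {
//     T1 = h + Sigma1(e) + Ch(e, f, g) + K[r] + W[r];
//     T2 = Sigma0(a) + Maj(a, b, c);
//     h = g; g = f; f = e; e = d + T1;
//     d = c; c = b; b = a; a = T1 + T2;
//   }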
4030 void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4, 4031 Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh, 4032 bool gen_words = true, bool step_const = true) { 4033 __ vleXX_v(vset_sew, vtemp, scalarconst); 4034 if (step_const) { 4035 __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32); 4036 } 4037 __ vadd_vv(vtemp2, vtemp, rot1); 4038 __ vsha2cl_vv(v_cdgh, v_abef, vtemp2); 4039 __ vsha2ch_vv(v_abef, v_cdgh, vtemp2); 4040 if (gen_words) { 4041 __ vmerge_vvm(vtemp2, rot3, rot2); 4042 __ vsha2ms_vv(rot1, vtemp2, rot4); 4043 } 4044 } 4045 4046 const char* stub_name(Assembler::SEW vset_sew, bool multi_block) { 4047 if (vset_sew == Assembler::e32 && !multi_block) return "sha256_implCompress"; 4048 if (vset_sew == Assembler::e32 && multi_block) return "sha256_implCompressMB"; 4049 if (vset_sew == Assembler::e64 && !multi_block) return "sha512_implCompress"; 4050 if (vset_sew == Assembler::e64 && multi_block) return "sha512_implCompressMB"; 4051 ShouldNotReachHere(); 4052 return "bad name lookup"; 4053 } 4054 4055 // Arguments: 4056 // 4057 // Inputs: 4058 // c_rarg0 - byte[] source+offset 4059 // c_rarg1 - int[] SHA.state 4060 // c_rarg2 - int offset 4061 // c_rarg3 - int limit 4062 // 4063 address generate_sha2_implCompress(Assembler::SEW vset_sew, bool multi_block) { 4064 alignas(64) static const uint32_t round_consts_256[64] = { 4065 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 4066 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 4067 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 4068 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 4069 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 4070 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 4071 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 4072 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 4073 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 4074 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 4075 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 4076 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 4077 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 4078 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 4079 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 4080 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 4081 }; 4082 alignas(64) static const uint64_t round_consts_512[80] = { 4083 0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl, 4084 0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l, 4085 0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l, 4086 0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l, 4087 0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l, 4088 0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l, 4089 0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l, 4090 0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l, 4091 0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl, 4092 0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l, 4093 0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl, 4094 0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl, 4095 0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l, 4096 0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l, 4097 0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l, 4098 0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l, 4099 0x19a4c116b8d2d0c8l, 
0x1e376c085141ab53l, 0x2748774cdf8eeb99l, 4100 0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl, 4101 0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl, 4102 0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl, 4103 0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l, 4104 0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l, 4105 0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal, 4106 0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl, 4107 0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl, 4108 0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al, 4109 0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l 4110 }; 4111 const int const_add = vset_sew == Assembler::e32 ? 16 : 32; 4112 4113 __ align(CodeEntryAlignment); 4114 StubCodeMark mark(_cgen, "StubRoutines", stub_name(vset_sew, multi_block)); 4115 address start = __ pc(); 4116 4117 Register buf = c_rarg0; 4118 Register state = c_rarg1; 4119 Register ofs = c_rarg2; 4120 Register limit = c_rarg3; 4121 Register consts = t2; // caller saved 4122 Register state_c = x28; // caller saved 4123 VectorRegister vindex = v2; 4124 VectorRegister vW0 = v4; 4125 VectorRegister vW1 = v6; 4126 VectorRegister vW2 = v8; 4127 VectorRegister vW3 = v10; 4128 VectorRegister vState0 = v12; 4129 VectorRegister vState1 = v14; 4130 VectorRegister vHash0 = v16; 4131 VectorRegister vHash1 = v18; 4132 VectorRegister vTmp0 = v20; 4133 VectorRegister vTmp1 = v22; 4134 4135 Label multi_block_loop; 4136 4137 __ enter(); 4138 4139 address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512; 4140 la(consts, ExternalAddress(constant_table)); 4141 4142 // Register use in this function: 4143 // 4144 // VECTORS 4145 // vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/65 bits), hold the message 4146 // schedule words (Wt). They start with the message block 4147 // content (W0 to W15), then further words in the message 4148 // schedule generated via vsha2ms from previous Wt. 4149 // Initially: 4150 // vW0 = W[ 3:0] = { W3, W2, W1, W0} 4151 // vW1 = W[ 7:4] = { W7, W6, W5, W4} 4152 // vW2 = W[ 11:8] = {W11, W10, W9, W8} 4153 // vW3 = W[15:12] = {W15, W14, W13, W12} 4154 // 4155 // vState0 - vState1 hold the working state variables (a, b, ..., h) 4156 // vState0 = {f[t],e[t],b[t],a[t]} 4157 // vState1 = {h[t],g[t],d[t],c[t]} 4158 // Initially: 4159 // vState0 = {H5i-1, H4i-1, H1i-1 , H0i-1} 4160 // vState1 = {H7i-i, H6i-1, H3i-1 , H2i-1} 4161 // 4162 // v0 = masks for vrgather/vmerge. Single value during the 16 rounds. 4163 // 4164 // vTmp0 = temporary, Wt+Kt 4165 // vTmp1 = temporary, Kt 4166 // 4167 // vHash0/vHash1 = hold the initial values of the hash, byte-swapped. 4168 // 4169 // During most of the function the vector state is configured so that each 4170 // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits). 4171 4172 // vsha2ch/vsha2cl uses EGW of 4*SEW. 4173 // SHA256 SEW = e32, EGW = 128-bits 4174 // SHA512 SEW = e64, EGW = 256-bits 4175 // 4176 // VLEN is required to be at least 128. 4177 // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256) 4178 // 4179 // m1: LMUL=1/2 4180 // ta: tail agnostic (don't care about those lanes) 4181 // ma: mask agnostic (don't care about those lanes) 4182 // x0 is not written, we known the number of vector elements. 
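// For example (illustrative): with VLEN = 128 and SHA-256 this selects
// SEW=e32/LMUL=m1, so a single vector register holds exactly the four
// 32-bit words of one element group; with VLEN = 128 and SHA-512 it selects
// SEW=e64/LMUL=m2, so a two-register group holds the four 64-bit words
// (EGW = 256 bits).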
4183 4184 if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128 4185 __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta); 4186 } else { 4187 __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta); 4188 } 4189 4190 int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul; 4191 __ li(t0, indexes); 4192 __ vmv_v_x(vindex, t0); 4193 4194 // Step-over a,b, so we are pointing to c. 4195 // const_add is equal to 4x state variable, div by 2 is thus 2, a,b 4196 __ addi(state_c, state, const_add/2); 4197 4198 // Use index-load to get {f,e,b,a},{h,g,d,c} 4199 __ vluxei8_v(vState0, state, vindex); 4200 __ vluxei8_v(vState1, state_c, vindex); 4201 4202 __ bind(multi_block_loop); 4203 4204 // Capture the initial H values in vHash0 and vHash1 to allow for computing 4205 // the resulting H', since H' = H+{a',b',c',...,h'}. 4206 __ vmv_v_v(vHash0, vState0); 4207 __ vmv_v_v(vHash1, vState1); 4208 4209 // Load the 512/1024-bits of the message block in vW0-vW3 and perform 4210 // an endian swap on each 4/8 bytes element. 4211 // 4212 // If Zvkb is not implemented one can use vrgather 4213 // with an index sequence to byte-swap. 4214 // sequence = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12] 4215 // <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate 4216 // this sequence. 'vid' gives us the N. 4217 __ vleXX_v(vset_sew, vW0, buf); 4218 __ vrev8_v(vW0, vW0); 4219 __ addi(buf, buf, const_add); 4220 __ vleXX_v(vset_sew, vW1, buf); 4221 __ vrev8_v(vW1, vW1); 4222 __ addi(buf, buf, const_add); 4223 __ vleXX_v(vset_sew, vW2, buf); 4224 __ vrev8_v(vW2, vW2); 4225 __ addi(buf, buf, const_add); 4226 __ vleXX_v(vset_sew, vW3, buf); 4227 __ vrev8_v(vW3, vW3); 4228 __ addi(buf, buf, const_add); 4229 4230 // Set v0 up for the vmerge that replaces the first word (idx==0) 4231 __ vid_v(v0); 4232 __ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0) 4233 4234 VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3}; 4235 int rot_pos = 0; 4236 // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2) 4237 const int qr_end = vset_sew == Assembler::e32 ? 12 : 16; 4238 for (int i = 0; i < qr_end; i++) { 4239 sha2_quad_round(vset_sew, 4240 rotation_regs[(rot_pos + 0) & 0x3], 4241 rotation_regs[(rot_pos + 1) & 0x3], 4242 rotation_regs[(rot_pos + 2) & 0x3], 4243 rotation_regs[(rot_pos + 3) & 0x3], 4244 consts, 4245 vTmp1, vTmp0, vState0, vState1); 4246 ++rot_pos; 4247 } 4248 // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2) 4249 // Note that we stop generating new message schedule words (Wt, vW0-13) 4250 // as we already generated all the words we end up consuming (i.e., W[63:60]). 
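// (For SHA-512, where qr_end is 16, these are quad-rounds #16..#19 and the
// last words consumed are W[79:76].)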
4251 const int qr_c_end = qr_end + 4; 4252 for (int i = qr_end; i < qr_c_end; i++) { 4253 sha2_quad_round(vset_sew, 4254 rotation_regs[(rot_pos + 0) & 0x3], 4255 rotation_regs[(rot_pos + 1) & 0x3], 4256 rotation_regs[(rot_pos + 2) & 0x3], 4257 rotation_regs[(rot_pos + 3) & 0x3], 4258 consts, 4259 vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1)); 4260 ++rot_pos; 4261 } 4262 4263 //-------------------------------------------------------------------------------- 4264 // Compute the updated hash value H' 4265 // H' = H + {h',g',...,b',a'} 4266 // = {h,g,...,b,a} + {h',g',...,b',a'} 4267 // = {h+h',g+g',...,b+b',a+a'} 4268 4269 // H' = H+{a',b',c',...,h'} 4270 __ vadd_vv(vState0, vHash0, vState0); 4271 __ vadd_vv(vState1, vHash1, vState1); 4272 4273 if (multi_block) { 4274 int total_adds = vset_sew == Assembler::e32 ? 240 : 608; 4275 __ addi(consts, consts, -total_adds); 4276 __ add(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128); 4277 __ ble(ofs, limit, multi_block_loop); 4278 __ mv(c_rarg0, ofs); // return ofs 4279 } 4280 4281 // Store H[0..8] = {a,b,c,d,e,f,g,h} from 4282 // vState0 = {f,e,b,a} 4283 // vState1 = {h,g,d,c} 4284 __ vsuxei8_v(vState0, state, vindex); 4285 __ vsuxei8_v(vState1, state_c, vindex); 4286 4287 __ leave(); 4288 __ ret(); 4289 4290 return start; 4291 } 4292 }; 4293 4294 #undef __ 4295 #define __ _masm-> 4296 4297 // Set of L registers that correspond to a contiguous memory area. 4298 // Each 64-bit register typically corresponds to 2 32-bit integers. 4299 template <uint L> 4300 class RegCache { 4301 private: 4302 MacroAssembler *_masm; 4303 Register _regs[L]; 4304 4305 public: 4306 RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) { 4307 assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L); 4308 auto it = rs.begin(); 4309 for (auto &r: _regs) { 4310 r = *it; 4311 ++it; 4312 } 4313 } 4314 4315 // generate load for the i'th register 4316 void gen_load(uint i, Register base) { 4317 assert(i < L, "invalid i: %u", i); 4318 __ ld(_regs[i], Address(base, 8 * i)); 4319 } 4320 4321 // add i'th 32-bit integer to dest 4322 void add_u32(const Register dest, uint i, const Register rtmp = t0) { 4323 assert(i < 2 * L, "invalid i: %u", i); 4324 4325 if (is_even(i)) { 4326 // Use the bottom 32 bits. No need to mask off the top 32 bits 4327 // as addw will do the right thing. 4328 __ addw(dest, dest, _regs[i / 2]); 4329 } else { 4330 // Use the top 32 bits by right-shifting them. 
4331 __ srli(rtmp, _regs[i / 2], 32); 4332 __ addw(dest, dest, rtmp); 4333 } 4334 } 4335 }; 4336 4337 typedef RegCache<8> BufRegCache; 4338 4339 // a += value + x + ac; 4340 // a = Integer.rotateLeft(a, s) + b; 4341 void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache, 4342 Register a, Register b, Register c, Register d, 4343 int k, int s, int t, 4344 Register value) { 4345 // a += ac 4346 __ addw(a, a, t, t1); 4347 4348 // a += x; 4349 reg_cache.add_u32(a, k); 4350 // a += value; 4351 __ addw(a, a, value); 4352 4353 // a = Integer.rotateLeft(a, s) + b; 4354 __ rolw_imm(a, a, s); 4355 __ addw(a, a, b); 4356 } 4357 4358 // a += ((b & c) | ((~b) & d)) + x + ac; 4359 // a = Integer.rotateLeft(a, s) + b; 4360 void md5_FF(BufRegCache& reg_cache, 4361 Register a, Register b, Register c, Register d, 4362 int k, int s, int t, 4363 Register rtmp1, Register rtmp2) { 4364 // rtmp1 = b & c 4365 __ andr(rtmp1, b, c); 4366 4367 // rtmp2 = (~b) & d 4368 __ andn(rtmp2, d, b); 4369 4370 // rtmp1 = (b & c) | ((~b) & d) 4371 __ orr(rtmp1, rtmp1, rtmp2); 4372 4373 m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); 4374 } 4375 4376 // a += ((b & d) | (c & (~d))) + x + ac; 4377 // a = Integer.rotateLeft(a, s) + b; 4378 void md5_GG(BufRegCache& reg_cache, 4379 Register a, Register b, Register c, Register d, 4380 int k, int s, int t, 4381 Register rtmp1, Register rtmp2) { 4382 // rtmp1 = b & d 4383 __ andr(rtmp1, b, d); 4384 4385 // rtmp2 = c & (~d) 4386 __ andn(rtmp2, c, d); 4387 4388 // rtmp1 = (b & d) | (c & (~d)) 4389 __ orr(rtmp1, rtmp1, rtmp2); 4390 4391 m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); 4392 } 4393 4394 // a += ((b ^ c) ^ d) + x + ac; 4395 // a = Integer.rotateLeft(a, s) + b; 4396 void md5_HH(BufRegCache& reg_cache, 4397 Register a, Register b, Register c, Register d, 4398 int k, int s, int t, 4399 Register rtmp1, Register rtmp2) { 4400 // rtmp1 = (b ^ c) ^ d 4401 __ xorr(rtmp2, b, c); 4402 __ xorr(rtmp1, rtmp2, d); 4403 4404 m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); 4405 } 4406 4407 // a += (c ^ (b | (~d))) + x + ac; 4408 // a = Integer.rotateLeft(a, s) + b; 4409 void md5_II(BufRegCache& reg_cache, 4410 Register a, Register b, Register c, Register d, 4411 int k, int s, int t, 4412 Register rtmp1, Register rtmp2) { 4413 // rtmp1 = c ^ (b | (~d)) 4414 __ orn(rtmp2, b, d); 4415 __ xorr(rtmp1, c, rtmp2); 4416 4417 m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); 4418 } 4419 4420 // Arguments: 4421 // 4422 // Inputs: 4423 // c_rarg0 - byte[] source+offset 4424 // c_rarg1 - int[] SHA.state 4425 // c_rarg2 - int offset (multi_block == True) 4426 // c_rarg3 - int limit (multi_block == True) 4427 // 4428 // Registers: 4429 // x0 zero (zero) 4430 // x1 ra (return address) 4431 // x2 sp (stack pointer) 4432 // x3 gp (global pointer) 4433 // x4 tp (thread pointer) 4434 // x5 t0 (tmp register) 4435 // x6 t1 (tmp register) 4436 // x7 t2 state0 4437 // x8 f0/s0 (frame pointer) 4438 // x9 s1 4439 // x10 a0 rtmp1 / c_rarg0 4440 // x11 a1 rtmp2 / c_rarg1 4441 // x12 a2 a / c_rarg2 4442 // x13 a3 b / c_rarg3 4443 // x14 a4 c 4444 // x15 a5 d 4445 // x16 a6 buf 4446 // x17 a7 state 4447 // x18 s2 ofs [saved-reg] (multi_block == True) 4448 // x19 s3 limit [saved-reg] (multi_block == True) 4449 // x20 s4 state1 [saved-reg] 4450 // x21 s5 state2 [saved-reg] 4451 // x22 s6 state3 [saved-reg] 4452 // x23 s7 4453 // x24 s8 buf0 [saved-reg] 4454 // x25 s9 buf1 [saved-reg] 4455 // x26 s10 buf2 [saved-reg] 4456 // x27 s11 buf3 [saved-reg] 4457 // x28 t3 
buf4 4458 // x29 t4 buf5 4459 // x30 t5 buf6 4460 // x31 t6 buf7 4461 address generate_md5_implCompress(bool multi_block, const char *name) { 4462 __ align(CodeEntryAlignment); 4463 StubCodeMark mark(this, "StubRoutines", name); 4464 address start = __ pc(); 4465 4466 // rotation constants 4467 const int S11 = 7; 4468 const int S12 = 12; 4469 const int S13 = 17; 4470 const int S14 = 22; 4471 const int S21 = 5; 4472 const int S22 = 9; 4473 const int S23 = 14; 4474 const int S24 = 20; 4475 const int S31 = 4; 4476 const int S32 = 11; 4477 const int S33 = 16; 4478 const int S34 = 23; 4479 const int S41 = 6; 4480 const int S42 = 10; 4481 const int S43 = 15; 4482 const int S44 = 21; 4483 4484 const int64_t mask32 = 0xffffffff; 4485 4486 Register buf_arg = c_rarg0; // a0 4487 Register state_arg = c_rarg1; // a1 4488 Register ofs_arg = c_rarg2; // a2 4489 Register limit_arg = c_rarg3; // a3 4490 4491 // we'll copy the args to these registers to free up a0-a3 4492 // to use for other values manipulated by instructions 4493 // that can be compressed 4494 Register buf = x16; // a6 4495 Register state = x17; // a7 4496 Register ofs = x18; // s2 4497 Register limit = x19; // s3 4498 4499 // using x12->15 to allow compressed instructions 4500 Register a = x12; // a2 4501 Register b = x13; // a3 4502 Register c = x14; // a4 4503 Register d = x15; // a5 4504 4505 Register state0 = x7; // t2 4506 Register state1 = x20; // s4 4507 Register state2 = x21; // s5 4508 Register state3 = x22; // s6 4509 4510 // using x10->x11 to allow compressed instructions 4511 Register rtmp1 = x10; // a0 4512 Register rtmp2 = x11; // a1 4513 4514 RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11 4515 RegSet reg_cache_regs; 4516 reg_cache_regs += reg_cache_saved_regs; 4517 reg_cache_regs += RegSet::of(x28, x29, x30, x31); // t3, t4, t5, t6 4518 BufRegCache reg_cache(_masm, reg_cache_regs); 4519 4520 RegSet saved_regs; 4521 if (multi_block) { 4522 saved_regs += RegSet::of(ofs, limit); 4523 } 4524 saved_regs += RegSet::of(state1, state2, state3); 4525 saved_regs += reg_cache_saved_regs; 4526 4527 __ push_reg(saved_regs, sp); 4528 4529 __ mv(buf, buf_arg); 4530 __ mv(state, state_arg); 4531 if (multi_block) { 4532 __ mv(ofs, ofs_arg); 4533 __ mv(limit, limit_arg); 4534 } 4535 4536 // to minimize the number of memory operations: 4537 // read the 4 state 4-byte values in pairs, with a single ld, 4538 // and split them into 2 registers. 4539 // 4540 // And, as the core algorithm of md5 works on 32-bits words, so 4541 // in the following code, it does not care about the content of 4542 // higher 32-bits in state[x]. Based on this observation, 4543 // we can apply further optimization, which is to just ignore the 4544 // higher 32-bits in state0/state2, rather than set the higher 4545 // 32-bits of state0/state2 to zero explicitly with extra instructions. 
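// Illustration (little-endian 64-bit loads): after the two ld's below,
//   state0 = ((uint64_t)state[1] << 32) | state[0], so state1 = state[1]
//   state2 = ((uint64_t)state[3] << 32) | state[2], so state3 = state[3]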
4546 __ ld(state0, Address(state)); 4547 __ srli(state1, state0, 32); 4548 __ ld(state2, Address(state, 8)); 4549 __ srli(state3, state2, 32); 4550 4551 Label md5_loop; 4552 __ BIND(md5_loop); 4553 4554 __ mv(a, state0); 4555 __ mv(b, state1); 4556 __ mv(c, state2); 4557 __ mv(d, state3); 4558 4559 // Round 1 4560 reg_cache.gen_load(0, buf); 4561 md5_FF(reg_cache, a, b, c, d, 0, S11, 0xd76aa478, rtmp1, rtmp2); 4562 md5_FF(reg_cache, d, a, b, c, 1, S12, 0xe8c7b756, rtmp1, rtmp2); 4563 reg_cache.gen_load(1, buf); 4564 md5_FF(reg_cache, c, d, a, b, 2, S13, 0x242070db, rtmp1, rtmp2); 4565 md5_FF(reg_cache, b, c, d, a, 3, S14, 0xc1bdceee, rtmp1, rtmp2); 4566 reg_cache.gen_load(2, buf); 4567 md5_FF(reg_cache, a, b, c, d, 4, S11, 0xf57c0faf, rtmp1, rtmp2); 4568 md5_FF(reg_cache, d, a, b, c, 5, S12, 0x4787c62a, rtmp1, rtmp2); 4569 reg_cache.gen_load(3, buf); 4570 md5_FF(reg_cache, c, d, a, b, 6, S13, 0xa8304613, rtmp1, rtmp2); 4571 md5_FF(reg_cache, b, c, d, a, 7, S14, 0xfd469501, rtmp1, rtmp2); 4572 reg_cache.gen_load(4, buf); 4573 md5_FF(reg_cache, a, b, c, d, 8, S11, 0x698098d8, rtmp1, rtmp2); 4574 md5_FF(reg_cache, d, a, b, c, 9, S12, 0x8b44f7af, rtmp1, rtmp2); 4575 reg_cache.gen_load(5, buf); 4576 md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2); 4577 md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2); 4578 reg_cache.gen_load(6, buf); 4579 md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2); 4580 md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2); 4581 reg_cache.gen_load(7, buf); 4582 md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2); 4583 md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2); 4584 4585 // Round 2 4586 md5_GG(reg_cache, a, b, c, d, 1, S21, 0xf61e2562, rtmp1, rtmp2); 4587 md5_GG(reg_cache, d, a, b, c, 6, S22, 0xc040b340, rtmp1, rtmp2); 4588 md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2); 4589 md5_GG(reg_cache, b, c, d, a, 0, S24, 0xe9b6c7aa, rtmp1, rtmp2); 4590 md5_GG(reg_cache, a, b, c, d, 5, S21, 0xd62f105d, rtmp1, rtmp2); 4591 md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2); 4592 md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2); 4593 md5_GG(reg_cache, b, c, d, a, 4, S24, 0xe7d3fbc8, rtmp1, rtmp2); 4594 md5_GG(reg_cache, a, b, c, d, 9, S21, 0x21e1cde6, rtmp1, rtmp2); 4595 md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2); 4596 md5_GG(reg_cache, c, d, a, b, 3, S23, 0xf4d50d87, rtmp1, rtmp2); 4597 md5_GG(reg_cache, b, c, d, a, 8, S24, 0x455a14ed, rtmp1, rtmp2); 4598 md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2); 4599 md5_GG(reg_cache, d, a, b, c, 2, S22, 0xfcefa3f8, rtmp1, rtmp2); 4600 md5_GG(reg_cache, c, d, a, b, 7, S23, 0x676f02d9, rtmp1, rtmp2); 4601 md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2); 4602 4603 // Round 3 4604 md5_HH(reg_cache, a, b, c, d, 5, S31, 0xfffa3942, rtmp1, rtmp2); 4605 md5_HH(reg_cache, d, a, b, c, 8, S32, 0x8771f681, rtmp1, rtmp2); 4606 md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2); 4607 md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2); 4608 md5_HH(reg_cache, a, b, c, d, 1, S31, 0xa4beea44, rtmp1, rtmp2); 4609 md5_HH(reg_cache, d, a, b, c, 4, S32, 0x4bdecfa9, rtmp1, rtmp2); 4610 md5_HH(reg_cache, c, d, a, b, 7, S33, 0xf6bb4b60, rtmp1, rtmp2); 4611 md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2); 4612 md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2); 4613 md5_HH(reg_cache, d, a, b, c, 0, S32, 
0xeaa127fa, rtmp1, rtmp2); 4614 md5_HH(reg_cache, c, d, a, b, 3, S33, 0xd4ef3085, rtmp1, rtmp2); 4615 md5_HH(reg_cache, b, c, d, a, 6, S34, 0x04881d05, rtmp1, rtmp2); 4616 md5_HH(reg_cache, a, b, c, d, 9, S31, 0xd9d4d039, rtmp1, rtmp2); 4617 md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2); 4618 md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2); 4619 md5_HH(reg_cache, b, c, d, a, 2, S34, 0xc4ac5665, rtmp1, rtmp2); 4620 4621 // Round 4 4622 md5_II(reg_cache, a, b, c, d, 0, S41, 0xf4292244, rtmp1, rtmp2); 4623 md5_II(reg_cache, d, a, b, c, 7, S42, 0x432aff97, rtmp1, rtmp2); 4624 md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2); 4625 md5_II(reg_cache, b, c, d, a, 5, S44, 0xfc93a039, rtmp1, rtmp2); 4626 md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2); 4627 md5_II(reg_cache, d, a, b, c, 3, S42, 0x8f0ccc92, rtmp1, rtmp2); 4628 md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2); 4629 md5_II(reg_cache, b, c, d, a, 1, S44, 0x85845dd1, rtmp1, rtmp2); 4630 md5_II(reg_cache, a, b, c, d, 8, S41, 0x6fa87e4f, rtmp1, rtmp2); 4631 md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2); 4632 md5_II(reg_cache, c, d, a, b, 6, S43, 0xa3014314, rtmp1, rtmp2); 4633 md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2); 4634 md5_II(reg_cache, a, b, c, d, 4, S41, 0xf7537e82, rtmp1, rtmp2); 4635 md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2); 4636 md5_II(reg_cache, c, d, a, b, 2, S43, 0x2ad7d2bb, rtmp1, rtmp2); 4637 md5_II(reg_cache, b, c, d, a, 9, S44, 0xeb86d391, rtmp1, rtmp2); 4638 4639 __ addw(state0, state0, a); 4640 __ addw(state1, state1, b); 4641 __ addw(state2, state2, c); 4642 __ addw(state3, state3, d); 4643 4644 if (multi_block) { 4645 __ addi(buf, buf, 64); 4646 __ addi(ofs, ofs, 64); 4647 // if (ofs <= limit) goto m5_loop 4648 __ bge(limit, ofs, md5_loop); 4649 __ mv(c_rarg0, ofs); // return ofs 4650 } 4651 4652 // to minimize the number of memory operations: 4653 // write back the 4 state 4-byte values in pairs, with a single sd 4654 __ mv(t0, mask32); 4655 __ andr(state0, state0, t0); 4656 __ slli(state1, state1, 32); 4657 __ orr(state0, state0, state1); 4658 __ sd(state0, Address(state)); 4659 __ andr(state2, state2, t0); 4660 __ slli(state3, state3, 32); 4661 __ orr(state2, state2, state3); 4662 __ sd(state2, Address(state, 8)); 4663 4664 __ pop_reg(saved_regs, sp); 4665 __ ret(); 4666 4667 return (address) start; 4668 } 4669 4670 /** 4671 * Perform the quarter round calculations on values contained within four vector registers. 4672 * 4673 * @param aVec the SIMD register containing only the "a" values 4674 * @param bVec the SIMD register containing only the "b" values 4675 * @param cVec the SIMD register containing only the "c" values 4676 * @param dVec the SIMD register containing only the "d" values 4677 * @param tmp_vr temporary vector register holds intermedia values. 
4678 */ 4679 void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec, 4680 VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) { 4681 // a += b, d ^= a, d <<<= 16 4682 __ vadd_vv(aVec, aVec, bVec); 4683 __ vxor_vv(dVec, dVec, aVec); 4684 __ vrole32_vi(dVec, 16, tmp_vr); 4685 4686 // c += d, b ^= c, b <<<= 12 4687 __ vadd_vv(cVec, cVec, dVec); 4688 __ vxor_vv(bVec, bVec, cVec); 4689 __ vrole32_vi(bVec, 12, tmp_vr); 4690 4691 // a += b, d ^= a, d <<<= 8 4692 __ vadd_vv(aVec, aVec, bVec); 4693 __ vxor_vv(dVec, dVec, aVec); 4694 __ vrole32_vi(dVec, 8, tmp_vr); 4695 4696 // c += d, b ^= c, b <<<= 7 4697 __ vadd_vv(cVec, cVec, dVec); 4698 __ vxor_vv(bVec, bVec, cVec); 4699 __ vrole32_vi(bVec, 7, tmp_vr); 4700 } 4701 4702 /** 4703 * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result) 4704 * 4705 * Input arguments: 4706 * c_rarg0 - state, the starting state 4707 * c_rarg1 - key_stream, the array that will hold the result of the ChaCha20 block function 4708 * 4709 * Implementation Note: 4710 * Parallelization is achieved by loading individual state elements into vectors for N blocks. 4711 * N depends on single vector register length. 4712 */ 4713 address generate_chacha20Block() { 4714 Label L_Rounds; 4715 4716 __ align(CodeEntryAlignment); 4717 StubCodeMark mark(this, "StubRoutines", "chacha20Block"); 4718 address start = __ pc(); 4719 __ enter(); 4720 4721 const int states_len = 16; 4722 const int step = 4; 4723 const Register state = c_rarg0; 4724 const Register key_stream = c_rarg1; 4725 const Register tmp_addr = t0; 4726 const Register length = t1; 4727 4728 // Organize vector registers in an array that facilitates 4729 // putting repetitive opcodes into loop structures below. 4730 const VectorRegister work_vrs[16] = { 4731 v0, v1, v2, v3, v4, v5, v6, v7, 4732 v8, v9, v10, v11, v12, v13, v14, v15 4733 }; 4734 const VectorRegister tmp_vr = v16; 4735 const VectorRegister counter_vr = v17; 4736 4737 { 4738 // Put 16 here, as com.sun.crypto.providerChaCha20Cipher.KS_MAX_LEN is 1024 4739 // in java level. 4740 __ vsetivli(length, 16, Assembler::e32, Assembler::m1); 4741 } 4742 4743 // Load from source state. 4744 // Every element in source state is duplicated to all elements in the corresponding vector. 4745 __ mv(tmp_addr, state); 4746 for (int i = 0; i < states_len; i += 1) { 4747 __ vlse32_v(work_vrs[i], tmp_addr, zr); 4748 __ addi(tmp_addr, tmp_addr, step); 4749 } 4750 // Adjust counter for every individual block. 4751 __ vid_v(counter_vr); 4752 __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr); 4753 4754 // Perform 10 iterations of the 8 quarter round set 4755 { 4756 const Register loop = t2; // share t2 with other non-overlapping usages. 
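// Each iteration of this loop is one "double round": the four column
// quarter-rounds followed by the four diagonal quarter-rounds, so 10
// iterations give the 20 rounds of the ChaCha20 block function.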
4757 __ mv(loop, 10); 4758 __ BIND(L_Rounds); 4759 4760 chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8], work_vrs[12], tmp_vr); 4761 chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9], work_vrs[13], tmp_vr); 4762 chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr); 4763 chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr); 4764 4765 chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr); 4766 chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr); 4767 chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8], work_vrs[13], tmp_vr); 4768 chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9], work_vrs[14], tmp_vr); 4769 4770 __ sub(loop, loop, 1); 4771 __ bnez(loop, L_Rounds); 4772 } 4773 4774 // Add the original state into the end working state. 4775 // We do this by first duplicating every element in source state array to the corresponding 4776 // vector, then adding it to the post-loop working state. 4777 __ mv(tmp_addr, state); 4778 for (int i = 0; i < states_len; i += 1) { 4779 __ vlse32_v(tmp_vr, tmp_addr, zr); 4780 __ addi(tmp_addr, tmp_addr, step); 4781 __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr); 4782 } 4783 // Add the counter overlay onto work_vrs[12] at the end. 4784 __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr); 4785 4786 // Store result to key stream. 4787 { 4788 const Register stride = t2; // share t2 with other non-overlapping usages. 4789 // Every block occupies 64 bytes, so we use 64 as stride of the vector store. 4790 __ mv(stride, 64); 4791 for (int i = 0; i < states_len; i += 1) { 4792 __ vsse32_v(work_vrs[i], key_stream, stride); 4793 __ addi(key_stream, key_stream, step); 4794 } 4795 } 4796 4797 // Return length of output key_stream 4798 __ slli(c_rarg0, length, 6); 4799 4800 __ leave(); 4801 __ ret(); 4802 4803 return (address) start; 4804 } 4805 4806 4807 // ------------------------ SHA-1 intrinsic ------------------------ 4808 4809 // K't = 4810 // 5a827999, 0 <= t <= 19 4811 // 6ed9eba1, 20 <= t <= 39 4812 // 8f1bbcdc, 40 <= t <= 59 4813 // ca62c1d6, 60 <= t <= 79 4814 void sha1_prepare_k(Register cur_k, int round) { 4815 assert(round >= 0 && round < 80, "must be"); 4816 4817 static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6}; 4818 if ((round % 20) == 0) { 4819 __ mv(cur_k, ks[round/20]); 4820 } 4821 } 4822 4823 // W't = 4824 // M't, 0 <= t <= 15 4825 // ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79 4826 void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) { 4827 assert(round >= 0 && round < 80, "must be"); 4828 4829 if (round < 16) { 4830 // in the first 16 rounds, in ws[], every register contains 2 W't, e.g. 4831 // in ws[0], high part contains W't-0, low part contains W't-1, 4832 // in ws[1], high part contains W't-2, low part contains W't-3, 4833 // ... 4834 // in ws[7], high part contains W't-14, low part contains W't-15. 4835 4836 if ((round % 2) == 0) { 4837 __ ld(ws[round/2], Address(buf, (round/2) * 8)); 4838 // reverse bytes, as SHA-1 is defined in big-endian. 
4839 __ revb(ws[round/2], ws[round/2]); 4840 __ srli(cur_w, ws[round/2], 32); 4841 } else { 4842 __ mv(cur_w, ws[round/2]); 4843 } 4844 4845 return; 4846 } 4847 4848 if ((round % 2) == 0) { 4849 int idx = 16; 4850 // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79 4851 __ srli(t1, ws[(idx-8)/2], 32); 4852 __ xorr(t0, ws[(idx-3)/2], t1); 4853 4854 __ srli(t1, ws[(idx-14)/2], 32); 4855 __ srli(cur_w, ws[(idx-16)/2], 32); 4856 __ xorr(cur_w, cur_w, t1); 4857 4858 __ xorr(cur_w, cur_w, t0); 4859 __ rolw_imm(cur_w, cur_w, 1, t0); 4860 4861 // copy the cur_w value to ws[8]. 4862 // now, valid w't values are at: 4863 // w0: ws[0]'s lower 32 bits 4864 // w1 ~ w14: ws[1] ~ ws[7] 4865 // w15: ws[8]'s higher 32 bits 4866 __ slli(ws[idx/2], cur_w, 32); 4867 4868 return; 4869 } 4870 4871 int idx = 17; 4872 // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79 4873 __ srli(t1, ws[(idx-3)/2], 32); 4874 __ xorr(t0, t1, ws[(idx-8)/2]); 4875 4876 __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]); 4877 4878 __ xorr(cur_w, cur_w, t0); 4879 __ rolw_imm(cur_w, cur_w, 1, t0); 4880 4881 // copy the cur_w value to ws[8] 4882 __ zero_extend(cur_w, cur_w, 32); 4883 __ orr(ws[idx/2], ws[idx/2], cur_w); 4884 4885 // shift the w't registers, so they start from ws[0] again. 4886 // now, valid w't values are at: 4887 // w0 ~ w15: ws[0] ~ ws[7] 4888 Register ws_0 = ws[0]; 4889 for (int i = 0; i < 16/2; i++) { 4890 ws[i] = ws[i+1]; 4891 } 4892 ws[8] = ws_0; 4893 } 4894 4895 // f't(x, y, z) = 4896 // Ch(x, y, z) = (x & y) ^ (~x & z) , 0 <= t <= 19 4897 // Parity(x, y, z) = x ^ y ^ z , 20 <= t <= 39 4898 // Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) , 40 <= t <= 59 4899 // Parity(x, y, z) = x ^ y ^ z , 60 <= t <= 79 4900 void sha1_f(Register dst, Register x, Register y, Register z, int round) { 4901 assert(round >= 0 && round < 80, "must be"); 4902 assert_different_registers(dst, x, y, z, t0, t1); 4903 4904 if (round < 20) { 4905 // (x & y) ^ (~x & z) 4906 __ andr(t0, x, y); 4907 __ andn(dst, z, x); 4908 __ xorr(dst, dst, t0); 4909 } else if (round >= 40 && round < 60) { 4910 // (x & y) ^ (x & z) ^ (y & z) 4911 __ andr(t0, x, y); 4912 __ andr(t1, x, z); 4913 __ andr(dst, y, z); 4914 __ xorr(dst, dst, t0); 4915 __ xorr(dst, dst, t1); 4916 } else { 4917 // x ^ y ^ z 4918 __ xorr(dst, x, y); 4919 __ xorr(dst, dst, z); 4920 } 4921 } 4922 4923 // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't 4924 // e = d 4925 // d = c 4926 // c = ROTL'30(b) 4927 // b = a 4928 // a = T 4929 void sha1_process_round(Register a, Register b, Register c, Register d, Register e, 4930 Register cur_k, Register cur_w, Register tmp, int round) { 4931 assert(round >= 0 && round < 80, "must be"); 4932 assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0); 4933 4934 // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't 4935 4936 // cur_w will be recalculated at the beginning of each round, 4937 // so, we can reuse it as a temp register here. 
4938 Register tmp2 = cur_w; 4939 4940 // reuse e as a temporary register, as we will mv new value into it later 4941 Register tmp3 = e; 4942 __ add(tmp2, cur_k, tmp2); 4943 __ add(tmp3, tmp3, tmp2); 4944 __ rolw_imm(tmp2, a, 5, t0); 4945 4946 sha1_f(tmp, b, c, d, round); 4947 4948 __ add(tmp2, tmp2, tmp); 4949 __ add(tmp2, tmp2, tmp3); 4950 4951 // e = d 4952 // d = c 4953 // c = ROTL'30(b) 4954 // b = a 4955 // a = T 4956 __ mv(e, d); 4957 __ mv(d, c); 4958 4959 __ rolw_imm(c, b, 30); 4960 __ mv(b, a); 4961 __ mv(a, tmp2); 4962 } 4963 4964 // H(i)0 = a + H(i-1)0 4965 // H(i)1 = b + H(i-1)1 4966 // H(i)2 = c + H(i-1)2 4967 // H(i)3 = d + H(i-1)3 4968 // H(i)4 = e + H(i-1)4 4969 void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e, 4970 Register prev_ab, Register prev_cd, Register prev_e) { 4971 assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e); 4972 4973 __ add(a, a, prev_ab); 4974 __ srli(prev_ab, prev_ab, 32); 4975 __ add(b, b, prev_ab); 4976 4977 __ add(c, c, prev_cd); 4978 __ srli(prev_cd, prev_cd, 32); 4979 __ add(d, d, prev_cd); 4980 4981 __ add(e, e, prev_e); 4982 } 4983 4984 void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e, 4985 Register prev_ab, Register prev_cd, Register prev_e) { 4986 assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0); 4987 4988 __ slli(t0, b, 32); 4989 __ zero_extend(prev_ab, a, 32); 4990 __ orr(prev_ab, prev_ab, t0); 4991 4992 __ slli(t0, d, 32); 4993 __ zero_extend(prev_cd, c, 32); 4994 __ orr(prev_cd, prev_cd, t0); 4995 4996 __ mv(prev_e, e); 4997 } 4998 4999 // Intrinsic for: 5000 // void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs) 5001 // void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit) 5002 // 5003 // Arguments: 5004 // 5005 // Inputs: 5006 // c_rarg0: byte[] src array + offset 5007 // c_rarg1: int[] SHA.state 5008 // - - - - - - below are only for implCompressMultiBlock0 - - - - - - 5009 // c_rarg2: int offset 5010 // c_rarg3: int limit 5011 // 5012 // Outputs: 5013 // - - - - - - below are only for implCompressMultiBlock0 - - - - - - 5014 // c_rarg0: int offset, when (multi_block == true) 5015 // 5016 address generate_sha1_implCompress(bool multi_block, const char *name) { 5017 __ align(CodeEntryAlignment); 5018 StubCodeMark mark(this, "StubRoutines", name); 5019 5020 address start = __ pc(); 5021 __ enter(); 5022 5023 RegSet saved_regs = RegSet::range(x18, x27); 5024 if (multi_block) { 5025 // use x9 as src below. 5026 saved_regs += RegSet::of(x9); 5027 } 5028 __ push_reg(saved_regs, sp); 5029 5030 // c_rarg0 - c_rarg3: x10 - x13 5031 Register buf = c_rarg0; 5032 Register state = c_rarg1; 5033 Register offset = c_rarg2; 5034 Register limit = c_rarg3; 5035 // use src to contain the original start point of the array. 5036 Register src = x9; 5037 5038 if (multi_block) { 5039 __ sub(limit, limit, offset); 5040 __ add(limit, limit, buf); 5041 __ sub(src, buf, offset); 5042 } 5043 5044 // [args-reg]: x14 - x17 5045 // [temp-reg]: x28 - x31 5046 // [saved-reg]: x18 - x27 5047 5048 // h0/1/2/3/4 5049 const Register a = x14, b = x15, c = x16, d = x17, e = x28; 5050 // w0, w1, ... w15 5051 // put two adjecent w's in one register: 5052 // one at high word part, another at low word part 5053 // at different round (even or odd), w't value reside in different items in ws[]. 
5054 // w0 ~ w15, either reside in 5055 // ws[0] ~ ws[7], where 5056 // w0 at higher 32 bits of ws[0], 5057 // w1 at lower 32 bits of ws[0], 5058 // ... 5059 // w14 at higher 32 bits of ws[7], 5060 // w15 at lower 32 bits of ws[7]. 5061 // or, reside in 5062 // w0: ws[0]'s lower 32 bits 5063 // w1 ~ w14: ws[1] ~ ws[7] 5064 // w15: ws[8]'s higher 32 bits 5065 Register ws[9] = {x29, x30, x31, x18, 5066 x19, x20, x21, x22, 5067 x23}; // auxiliary register for calculating w's value 5068 // current k't's value 5069 const Register cur_k = x24; 5070 // current w't's value 5071 const Register cur_w = x25; 5072 // values of a, b, c, d, e in the previous round 5073 const Register prev_ab = x26, prev_cd = x27; 5074 const Register prev_e = offset; // reuse offset/c_rarg2 5075 5076 // load 5 words state into a, b, c, d, e. 5077 // 5078 // To minimize the number of memory operations, we apply following 5079 // optimization: read the states (a/b/c/d) of 4-byte values in pairs, 5080 // with a single ld, and split them into 2 registers. 5081 // 5082 // And, as the core algorithm of SHA-1 works on 32-bits words, so 5083 // in the following code, it does not care about the content of 5084 // higher 32-bits in a/b/c/d/e. Based on this observation, 5085 // we can apply further optimization, which is to just ignore the 5086 // higher 32-bits in a/c/e, rather than set the higher 5087 // 32-bits of a/c/e to zero explicitly with extra instructions. 5088 __ ld(a, Address(state, 0)); 5089 __ srli(b, a, 32); 5090 __ ld(c, Address(state, 8)); 5091 __ srli(d, c, 32); 5092 __ lw(e, Address(state, 16)); 5093 5094 Label L_sha1_loop; 5095 if (multi_block) { 5096 __ BIND(L_sha1_loop); 5097 } 5098 5099 sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e); 5100 5101 for (int round = 0; round < 80; round++) { 5102 // prepare K't value 5103 sha1_prepare_k(cur_k, round); 5104 5105 // prepare W't value 5106 sha1_prepare_w(cur_w, ws, buf, round); 5107 5108 // one round process 5109 sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round); 5110 } 5111 5112 // compute the intermediate hash value 5113 sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e); 5114 5115 if (multi_block) { 5116 int64_t block_bytes = 16 * 4; 5117 __ addi(buf, buf, block_bytes); 5118 5119 __ bge(limit, buf, L_sha1_loop, true); 5120 } 5121 5122 // store back the state. 
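// Mirroring the paired load above: re-pack a/b and c/d into two 64-bit
// values ((b << 32) | a and (d << 32) | c), write them back with two sd's,
// and store e with a 32-bit sw.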
5123     __ zero_extend(a, a, 32);
5124     __ slli(b, b, 32);
5125     __ orr(a, a, b);
5126     __ sd(a, Address(state, 0));
5127     __ zero_extend(c, c, 32);
5128     __ slli(d, d, 32);
5129     __ orr(c, c, d);
5130     __ sd(c, Address(state, 8));
5131     __ sw(e, Address(state, 16));
5132
5133     // return offset
5134     if (multi_block) {
5135       __ sub(c_rarg0, buf, src);
5136     }
5137
5138     __ pop_reg(saved_regs, sp);
5139
5140     __ leave();
5141     __ ret();
5142
5143     return (address) start;
5144   }
5145
5146   /**
5147    * vector registers:
5148    * input VectorRegisters: inputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
5149    * index VectorRegisters: idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
5150    * output VectorRegisters: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
5151    *
5152    * NOTE: each field will occupy a vector register group
5153    */
5154   void base64_vector_encode_round(Register src, Register dst, Register codec,
5155                                   Register size, Register stepSrc, Register stepDst,
5156                                   VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
5157                                   VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
5158                                   VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
5159                                   Assembler::LMUL lmul) {
5160     // set vector register type/len
5161     __ vsetvli(x0, size, Assembler::e8, lmul);
5162
5163     // segmented load src into v registers: mem(src) => vr(3)
5164     __ vlseg3e8_v(inputV1, src);
5165
5166     // src = src + register_group_len_bytes * 3
5167     __ add(src, src, stepSrc);
5168
5169     // encoding
5170     // 1. compute index into lookup table: vr(3) => vr(4)
5171     __ vsrl_vi(idxV1, inputV1, 2);
5172
5173     __ vsrl_vi(idxV2, inputV2, 2);
5174     __ vsll_vi(inputV1, inputV1, 6);
5175     __ vor_vv(idxV2, idxV2, inputV1);
5176     __ vsrl_vi(idxV2, idxV2, 2);
5177
5178     __ vsrl_vi(idxV3, inputV3, 4);
5179     __ vsll_vi(inputV2, inputV2, 4);
5180     __ vor_vv(idxV3, inputV2, idxV3);
5181     __ vsrl_vi(idxV3, idxV3, 2);
5182
5183     __ vsll_vi(idxV4, inputV3, 2);
5184     __ vsrl_vi(idxV4, idxV4, 2);
5185
5186     // 2.
indexed load: vr(4) => vr(4) 5187 __ vluxei8_v(outputV1, codec, idxV1); 5188 __ vluxei8_v(outputV2, codec, idxV2); 5189 __ vluxei8_v(outputV3, codec, idxV3); 5190 __ vluxei8_v(outputV4, codec, idxV4); 5191 5192 // segmented store encoded data in v registers back to dst: vr(4) => mem(dst) 5193 __ vsseg4e8_v(outputV1, dst); 5194 5195 // dst = dst + register_group_len_bytes * 4 5196 __ add(dst, dst, stepDst); 5197 } 5198 5199 /** 5200 * void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) 5201 * 5202 * Input arguments: 5203 * c_rarg0 - src, source array 5204 * c_rarg1 - sp, src start offset 5205 * c_rarg2 - sl, src end offset 5206 * c_rarg3 - dst, dest array 5207 * c_rarg4 - dp, dst start offset 5208 * c_rarg5 - isURL, Base64 or URL character set 5209 */ 5210 address generate_base64_encodeBlock() { 5211 alignas(64) static const char toBase64[64] = { 5212 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 5213 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 5214 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 5215 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 5216 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 5217 }; 5218 5219 alignas(64) static const char toBase64URL[64] = { 5220 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 5221 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 5222 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 5223 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 5224 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 5225 }; 5226 5227 __ align(CodeEntryAlignment); 5228 StubCodeMark mark(this, "StubRoutines", "encodeBlock"); 5229 address start = __ pc(); 5230 __ enter(); 5231 5232 Register src = c_rarg0; 5233 Register soff = c_rarg1; 5234 Register send = c_rarg2; 5235 Register dst = c_rarg3; 5236 Register doff = c_rarg4; 5237 Register isURL = c_rarg5; 5238 5239 Register codec = c_rarg6; 5240 Register length = c_rarg7; // total length of src data in bytes 5241 5242 Label ProcessData, Exit; 5243 5244 // length should be multiple of 3 5245 __ sub(length, send, soff); 5246 // real src/dst to process data 5247 __ add(src, src, soff); 5248 __ add(dst, dst, doff); 5249 5250 // load the codec base address 5251 __ la(codec, ExternalAddress((address) toBase64)); 5252 __ beqz(isURL, ProcessData); 5253 __ la(codec, ExternalAddress((address) toBase64URL)); 5254 __ BIND(ProcessData); 5255 5256 // vector version 5257 if (UseRVV) { 5258 Label ProcessM2, ProcessM1, ProcessScalar; 5259 5260 Register size = soff; 5261 Register stepSrcM1 = send; 5262 Register stepSrcM2 = doff; 5263 Register stepDst = isURL; 5264 5265 __ mv(size, MaxVectorSize * 2); 5266 __ mv(stepSrcM1, MaxVectorSize * 3); 5267 __ slli(stepSrcM2, stepSrcM1, 1); 5268 __ mv(stepDst, MaxVectorSize * 2 * 4); 5269 5270 __ blt(length, stepSrcM2, ProcessM1); 5271 5272 __ BIND(ProcessM2); 5273 base64_vector_encode_round(src, dst, codec, 5274 size, stepSrcM2, stepDst, 5275 v2, v4, v6, // inputs 5276 v8, v10, v12, v14, // indexes 5277 v16, v18, v20, v22, // outputs 5278 Assembler::m2); 5279 5280 __ sub(length, length, stepSrcM2); 5281 __ bge(length, stepSrcM2, ProcessM2); 5282 5283 __ BIND(ProcessM1); 5284 __ blt(length, stepSrcM1, ProcessScalar); 5285 5286 __ srli(size, size, 1); 5287 __ srli(stepDst, stepDst, 1); 5288 base64_vector_encode_round(src, dst, codec, 5289 size, stepSrcM1, stepDst, 5290 v1, v2, v3, // inputs 5291 v4, v5, v6, 
v7, // indexes
5292                                v8, v9, v10, v11, // outputs
5293                                Assembler::m1);
5294       __ sub(length, length, stepSrcM1);
5295
5296       __ BIND(ProcessScalar);
5297     }
5298
5299     // scalar version
5300     {
5301       Register byte1 = soff, byte0 = send, byte2 = doff;
5302       Register combined24Bits = isURL;
5303
5304       __ beqz(length, Exit);
5305
5306       Label ScalarLoop;
5307       __ BIND(ScalarLoop);
5308       {
5309         // plain:   [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
5310         // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]
5311
5312         // load 3 bytes src data
5313         __ lbu(byte0, Address(src, 0));
5314         __ lbu(byte1, Address(src, 1));
5315         __ lbu(byte2, Address(src, 2));
5316         __ addi(src, src, 3);
5317
5318         // construct 24 bits from 3 bytes
5319         __ slliw(byte0, byte0, 16);
5320         __ slliw(byte1, byte1, 8);
5321         __ orr(combined24Bits, byte0, byte1);
5322         __ orr(combined24Bits, combined24Bits, byte2);
5323
5324         // get codec index and encode (i.e. load from codec by index)
5325         __ slliw(byte0, combined24Bits, 8);
5326         __ srliw(byte0, byte0, 26);
5327         __ add(byte0, codec, byte0);
5328         __ lbu(byte0, byte0);
5329
5330         __ slliw(byte1, combined24Bits, 14);
5331         __ srliw(byte1, byte1, 26);
5332         __ add(byte1, codec, byte1);
5333         __ lbu(byte1, byte1);
5334
5335         __ slliw(byte2, combined24Bits, 20);
5336         __ srliw(byte2, byte2, 26);
5337         __ add(byte2, codec, byte2);
5338         __ lbu(byte2, byte2);
5339
5340         __ andi(combined24Bits, combined24Bits, 0x3f);
5341         __ add(combined24Bits, codec, combined24Bits);
5342         __ lbu(combined24Bits, combined24Bits);
5343
5344         // store 4 bytes encoded data
5345         __ sb(byte0, Address(dst, 0));
5346         __ sb(byte1, Address(dst, 1));
5347         __ sb(byte2, Address(dst, 2));
5348         __ sb(combined24Bits, Address(dst, 3));
5349
5350         __ sub(length, length, 3);
5351         __ addi(dst, dst, 4);
5352         // loop back
5353         __ bnez(length, ScalarLoop);
5354       }
5355     }
5356
5357     __ BIND(Exit);
5358
5359     __ leave();
5360     __ ret();
5361
5362     return (address) start;
5363   }
5364
5365   /**
5366    * vector registers:
5367    * input VectorRegisters: inputV1-V4, for m2 they could be v2, v4, v6, v8, for m1 they could be v1, v2, v3, v4
5368    * index VectorRegisters: idxV1-V4, for m2 they could be v10, v12, v14, v16, for m1 they could be v5, v6, v7, v8
5369    * output VectorRegisters: outputV1-V3, for m2 they could be v18, v20, v22, for m1 they could be v9, v10, v11
5370    *
5371    * NOTE: each field will occupy a single vector register group
5372    */
5373   void base64_vector_decode_round(Register src, Register dst, Register codec,
5374                                   Register size, Register stepSrc, Register stepDst, Register failedIdx,
5375                                   VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
5376                                   VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
5377                                   VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
5378                                   Assembler::LMUL lmul) {
5379     // set vector register type/len
5380     __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);
5381
5382     // segmented load src into v registers: mem(src) => vr(4)
5383     __ vlseg4e8_v(inputV1, src);
5384
5385     // src = src + register_group_len_bytes * 4
5386     __ add(src, src, stepSrc);
5387
5388     // decoding
5389     // 1. indexed load: vr(4) => vr(4)
5390     __ vluxei8_v(idxV1, codec, inputV1);
5391     __ vluxei8_v(idxV2, codec, inputV2);
5392     __ vluxei8_v(idxV3, codec, inputV3);
5393     __ vluxei8_v(idxV4, codec, inputV4);
5394
5395     // 2. check for invalid input bytes
5396     __ vor_vv(outputV1, idxV1, idxV2);
5397     __ vor_vv(outputV2, idxV3, idxV4);
5398     __ vor_vv(outputV1, outputV1, outputV2);
5399     __ vmseq_vi(v0, outputV1, -1);
5400     __ vfirst_m(failedIdx, v0);
5401     Label NoFailure, FailureAtIdx0;
5402     // the only negative value failedIdx can hold is -1, which means no invalid byte was found
5403     __ bltz(failedIdx, NoFailure);
5404     // if the very first byte (at index 0) is invalid, there is no data left to process
5405     __ beqz(failedIdx, FailureAtIdx0);
5406     __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
5407     __ slli(stepDst, failedIdx, 1);
5408     __ add(stepDst, failedIdx, stepDst);
5409     __ BIND(NoFailure);
5410
5411     // 3. compute the decoded data: vr(4) => vr(3)
5412     __ vsll_vi(idxV1, idxV1, 2);
5413     __ vsrl_vi(outputV1, idxV2, 4);
5414     __ vor_vv(outputV1, outputV1, idxV1);
5415
5416     __ vsll_vi(idxV2, idxV2, 4);
5417     __ vsrl_vi(outputV2, idxV3, 2);
5418     __ vor_vv(outputV2, outputV2, idxV2);
5419
5420     __ vsll_vi(idxV3, idxV3, 6);
5421     __ vor_vv(outputV3, idxV4, idxV3);
5422
5423     // segmented store decoded data in v registers back to dst: vr(3) => mem(dst)
5424     __ vsseg3e8_v(outputV1, dst);
5425
5426     // dst = dst + register_group_len_bytes * 3
5427     __ add(dst, dst, stepDst);
5428     __ BIND(FailureAtIdx0);
5429   }
5430
5431   /**
5432    * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
5433    *
5434    * Input arguments:
5435    * c_rarg0 - src, source array
5436    * c_rarg1 - sp, src start offset
5437    * c_rarg2 - sl, src end offset
5438    * c_rarg3 - dst, dest array
5439    * c_rarg4 - dp, dst start offset
5440    * c_rarg5 - isURL, Base64 or URL character set
5441    * c_rarg6 - isMIME, Decoding MIME block
5442    */
5443   address generate_base64_decodeBlock() {
5444
5445     static const uint8_t fromBase64[256] = {
5446       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5447       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5448       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
5449       52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
5450       255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
5451       15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
5452       255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
5453       41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
5454       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5455       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5456       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5457       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5458       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5459       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5460       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5461       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5462     };
5463
5464     static const uint8_t fromBase64URL[256] = {
5465       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5466       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5467       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
5468       52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
5469       255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
5470       15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
5471       255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
5472       41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
5473       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5474       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5475       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5476       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5477       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5478       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5479       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5480       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5481     };
5482
5483     __ align(CodeEntryAlignment);
5484     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
5485     address start = __ pc();
5486     __ enter();
5487
5488     Register src = c_rarg0;
5489     Register soff = c_rarg1;
5490     Register send = c_rarg2;
5491     Register dst = c_rarg3;
5492     Register doff = c_rarg4;
5493     Register isURL = c_rarg5;
5494     Register isMIME = c_rarg6;
5495
5496     Register codec = c_rarg7;
5497     Register dstBackup = x31;
5498     Register length = x28; // t3, total length of src data in bytes
5499
5500     Label ProcessData, Exit;
5501     Label ProcessScalar, ScalarLoop;
5502
5503     // The length passed in (send - soff) is guaranteed to be > 4.
5504     // This intrinsic only processes data whose length is a multiple of 4;
5505     // the Java level does not guarantee that, so enforce it explicitly here.
5506     __ sub(length, send, soff);
5507     __ andi(length, length, -4);
5508     // real src/dst to process data
5509     __ add(src, src, soff);
5510     __ add(dst, dst, doff);
5511     // backup of dst, used to calculate the return value at exit
5512     __ mv(dstBackup, dst);
5513
5514     // load the codec base address
5515     __ la(codec, ExternalAddress((address) fromBase64));
5516     __ beqz(isURL, ProcessData);
5517     __ la(codec, ExternalAddress((address) fromBase64URL));
5518     __ BIND(ProcessData);
5519
5520     // vector version
5521     if (UseRVV) {
5522       // The MIME case has a default line-length limit of 76, which can be
5523       // smaller than (send - soff), so for MIME we go through
5524       // the scalar code path directly.
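      //
      // As a reference for that scalar path (the ScalarLoop below), a C++ sketch of
      // decoding one 4-character group (illustrative only, not part of the stub;
      // `table` stands for fromBase64/fromBase64URL, where invalid characters map to 255):
      //
      //   #include <cstdint>
      //
      //   // Returns false if any of the four characters is invalid, otherwise writes 3 bytes.
      //   static bool decode4(const uint8_t table[256], const uint8_t in[4], uint8_t out[3]) {
      //     // 255 sign-extends to -1, so any invalid byte makes the combined value negative.
      //     int32_t v = ((int32_t)(int8_t)table[in[0]] << 18) |
      //                 ((int32_t)(int8_t)table[in[1]] << 12) |
      //                 ((int32_t)(int8_t)table[in[2]] << 6)  |
      //                  (int32_t)(int8_t)table[in[3]];
      //     if (v < 0) return false;
      //     out[0] = (uint8_t)(v >> 16);
      //     out[1] = (uint8_t)(v >> 8);
      //     out[2] = (uint8_t)v;
      //     return true;
      //   }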
5525       __ bnez(isMIME, ScalarLoop);
5526
5527       Label ProcessM1, ProcessM2;
5528
5529       Register failedIdx = soff;
5530       Register stepSrcM1 = send;
5531       Register stepSrcM2 = doff;
5532       Register stepDst = isURL;
5533       Register size = x29; // t4
5534
5535       __ mv(size, MaxVectorSize * 2);
5536       __ mv(stepSrcM1, MaxVectorSize * 4);
5537       __ slli(stepSrcM2, stepSrcM1, 1);
5538       __ mv(stepDst, MaxVectorSize * 2 * 3);
5539
5540       __ blt(length, stepSrcM2, ProcessM1);
5541
5542
5543       // Assembler::m2
5544       __ BIND(ProcessM2);
5545       base64_vector_decode_round(src, dst, codec,
5546                                  size, stepSrcM2, stepDst, failedIdx,
5547                                  v2, v4, v6, v8, // inputs
5548                                  v10, v12, v14, v16, // indexes
5549                                  v18, v20, v22, // outputs
5550                                  Assembler::m2);
5551       __ sub(length, length, stepSrcM2);
5552
5553       // error check
5554       // the only negative value failedIdx can hold is -1, which means no invalid byte was found
5555       __ bgez(failedIdx, Exit);
5556
5557       __ bge(length, stepSrcM2, ProcessM2);
5558
5559
5560       // Assembler::m1
5561       __ BIND(ProcessM1);
5562       __ blt(length, stepSrcM1, ProcessScalar);
5563
5564       __ srli(size, size, 1);
5565       __ srli(stepDst, stepDst, 1);
5566       base64_vector_decode_round(src, dst, codec,
5567                                  size, stepSrcM1, stepDst, failedIdx,
5568                                  v1, v2, v3, v4, // inputs
5569                                  v5, v6, v7, v8, // indexes
5570                                  v9, v10, v11, // outputs
5571                                  Assembler::m1);
5572       __ sub(length, length, stepSrcM1);
5573
5574       // error check
5575       // the only negative value failedIdx can hold is -1, which means no invalid byte was found
5576       __ bgez(failedIdx, Exit);
5577
5578       __ BIND(ProcessScalar);
5579       __ beqz(length, Exit);
5580     }
5581
5582     // scalar version
5583     {
5584       Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
5585       Register combined32Bits = x29; // t4
5586
5587       // encoded: [byte0[5:0] : byte1[5:0] : byte2[5:0] : byte3[5:0]] =>
5588       // plain:   [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
5589       __ BIND(ScalarLoop);
5590
5591       // load 4 bytes encoded src data
5592       __ lbu(byte0, Address(src, 0));
5593       __ lbu(byte1, Address(src, 1));
5594       __ lbu(byte2, Address(src, 2));
5595       __ lbu(byte3, Address(src, 3));
5596       __ addi(src, src, 4);
5597
5598       // get codec index and decode (i.e. load from codec by index)
5599       __ add(byte0, codec, byte0);
5600       __ add(byte1, codec, byte1);
5601       __ lb(byte0, Address(byte0, 0));
5602       __ lb(byte1, Address(byte1, 0));
5603       __ add(byte2, codec, byte2);
5604       __ add(byte3, codec, byte3);
5605       __ lb(byte2, Address(byte2, 0));
5606       __ lb(byte3, Address(byte3, 0));
5607       __ slliw(byte0, byte0, 18);
5608       __ slliw(byte1, byte1, 12);
5609       __ orr(byte0, byte0, byte1);
5610       __ orr(byte0, byte0, byte3);
5611       __ slliw(byte2, byte2, 6);
5612       // For performance, `combined32Bits` is constructed to serve two purposes at the same time:
5613       // 1. the error check below
5614       // 2. the decode below
5615       __ orr(combined32Bits, byte0, byte2);
5616
5617       // error check
5618       __ bltz(combined32Bits, Exit);
5619
5620       // store 3 bytes decoded data
5621       __ sraiw(byte0, combined32Bits, 16);
5622       __ sraiw(byte1, combined32Bits, 8);
5623       __ sb(byte0, Address(dst, 0));
5624       __ sb(byte1, Address(dst, 1));
5625       __ sb(combined32Bits, Address(dst, 2));
5626
5627       __ sub(length, length, 4);
5628       __ addi(dst, dst, 3);
5629       // loop back
5630       __ bnez(length, ScalarLoop);
5631     }
5632
5633     __ BIND(Exit);
5634     __ sub(c_rarg0, dst, dstBackup);
5635
5636     __ leave();
5637     __ ret();
5638
5639     return (address) start;
5640   }
5641
5642   void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
5643                              VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
5644                              Register temp0, Register temp1, Register temp2, Register temp3,
5645                              VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {
5646
5647     assert((lmul == Assembler::m4 && step == 64) ||
5648            (lmul == Assembler::m2 && step == 32) ||
5649            (lmul == Assembler::m1 && step == 16),
5650            "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
5651     // Below is the function for calculating the Adler32 checksum with a 64-, 32- or 16-byte step. LMUL = m4, m2 or m1 is used.
5652     // The results are in v12, v13, ..., v22, v23. The example below is for the 64-byte step case.
5653     // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
5654     // In non-vectorized code, we update s1 and s2 as:
5655     //   s1 <- s1 + b1
5656     //   s2 <- s2 + s1
5657     //   s1 <- s1 + b2
5658     //   s2 <- s2 + s1
5659     //   ...
5660     //   s1 <- s1 + b64
5661     //   s2 <- s2 + s1
5662     // Putting the above assignments together, we have:
5663     //   s1_new = s1 + b1 + b2 + ... + b64
5664     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
5665     //          = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
5666     //          = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)
5667
5668     __ mv(temp3, step);
5669     // Load data
5670     __ vsetvli(temp0, temp3, Assembler::e8, lmul);
5671     __ vle8_v(vbytes, buff);
5672     __ addi(buff, buff, step);
5673
5674     // Upper bound reduction sum for s1_new:
5675     // 0xFF * 64 = 0x3FC0, so:
5676     // 1. Need to do vector-widening reduction sum
5677     // 2. It is safe to perform sign-extension during vmv.x.s with 16-bit elements
5678     __ vwredsumu_vs(vs1acc, vbytes, vzero);
5679     // Multiplication for s2_new
5680     __ vwmulu_vv(vs2acc, vtable, vbytes);
5681
5682     // s2 = s2 + s1 * step
5683     __ slli(temp1, s1, exact_log2(step));
5684     __ add(s2, s2, temp1);
5685
5686     // Summing up the calculated results for s2_new
5687     if (MaxVectorSize > 16) {
5688       __ vsetvli(temp0, temp3, Assembler::e16, lmul);
5689     } else {
5690       // Half of the vector-widening multiplication result is in the successor of the
5691       // vs2acc group for vlen == 16, in which case we need to double the vector register
5692       // group width in order to reduction-sum all of it
5693       Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
5694                                (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
5695       __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
5696     }
5697     // Upper bound for the reduction sum:
5698     // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for the whole register group, so:
5699     // 1. Need to do vector-widening reduction sum
5700     // 2.
It is safe to perform sign-extension during vmv.x.s with 32-bits elements 5701 __ vwredsumu_vs(vtemp1, vs2acc, vzero); 5702 5703 // Extracting results for: 5704 // s1_new 5705 __ vmv_x_s(temp0, vs1acc); 5706 __ add(s1, s1, temp0); 5707 // s2_new 5708 __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1); 5709 __ vmv_x_s(temp1, vtemp1); 5710 __ add(s2, s2, temp1); 5711 } 5712 5713 /*** 5714 * int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len) 5715 * 5716 * Arguments: 5717 * 5718 * Inputs: 5719 * c_rarg0 - int adler 5720 * c_rarg1 - byte* buff (b + off) 5721 * c_rarg2 - int len 5722 * 5723 * Output: 5724 * c_rarg0 - int adler result 5725 */ 5726 address generate_updateBytesAdler32() { 5727 __ align(CodeEntryAlignment); 5728 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 5729 address start = __ pc(); 5730 5731 Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop, 5732 L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1; 5733 5734 // Aliases 5735 Register adler = c_rarg0; 5736 Register s1 = c_rarg0; 5737 Register s2 = c_rarg3; 5738 Register buff = c_rarg1; 5739 Register len = c_rarg2; 5740 Register nmax = c_rarg4; 5741 Register base = c_rarg5; 5742 Register count = c_rarg6; 5743 Register temp0 = x28; // t3 5744 Register temp1 = x29; // t4 5745 Register temp2 = x30; // t5 5746 Register temp3 = x31; // t6 5747 5748 VectorRegister vzero = v31; 5749 VectorRegister vbytes = v8; // group: v8, v9, v10, v11 5750 VectorRegister vs1acc = v12; // group: v12, v13, v14, v15 5751 VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23 5752 VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27 5753 VectorRegister vtable_32 = v4; // group: v4, v5 5754 VectorRegister vtable_16 = v30; 5755 VectorRegister vtemp1 = v28; 5756 VectorRegister vtemp2 = v29; 5757 5758 // Max number of bytes we can process before having to take the mod 5759 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 5760 const uint64_t BASE = 0xfff1; 5761 const uint64_t NMAX = 0x15B0; 5762 5763 // Loops steps 5764 int step_64 = 64; 5765 int step_32 = 32; 5766 int step_16 = 16; 5767 int step_1 = 1; 5768 5769 __ enter(); // Required for proper stackwalking of RuntimeStub frame 5770 __ mv(temp1, 64); 5771 __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4); 5772 5773 // Generating accumulation coefficients for further calculations 5774 // vtable_64: 5775 __ vid_v(vtemp1); 5776 __ vrsub_vx(vtable_64, vtemp1, temp1); 5777 // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 } 5778 5779 // vtable_32: 5780 __ mv(temp1, 32); 5781 __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2); 5782 __ vid_v(vtemp1); 5783 __ vrsub_vx(vtable_32, vtemp1, temp1); 5784 // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 } 5785 5786 __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1); 5787 // vtable_16: 5788 __ mv(temp1, 16); 5789 __ vid_v(vtemp1); 5790 __ vrsub_vx(vtable_16, vtemp1, temp1); 5791 // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 } 5792 5793 __ vmv_v_i(vzero, 0); 5794 5795 __ mv(base, BASE); 5796 __ mv(nmax, NMAX); 5797 5798 // s1 is initialized to the lower 16 bits of adler 5799 // s2 is initialized to the upper 16 bits of adler 5800 __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff) 5801 __ zero_extend(s1, adler, 16); // s1 = (adler & 0xffff) 5802 5803 // The pipelined loop needs at least 16 elements for 1 iteration 5804 // It does check this, but it is more 
effective to skip to the cleanup loop 5805 __ mv(temp0, step_16); 5806 __ bgeu(len, temp0, L_nmax); 5807 __ beqz(len, L_combine); 5808 5809 // Jumping to L_by1_loop 5810 __ sub(len, len, step_1); 5811 __ j(L_by1_loop); 5812 5813 __ bind(L_nmax); 5814 __ sub(len, len, nmax); 5815 __ sub(count, nmax, 16); 5816 __ bltz(len, L_by16); 5817 5818 // Align L_nmax loop by 64 5819 __ bind(L_nmax_loop_entry); 5820 __ sub(count, count, 32); 5821 5822 __ bind(L_nmax_loop); 5823 adler32_process_bytes(buff, s1, s2, vtable_64, vzero, 5824 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 5825 vtemp1, vtemp2, step_64, Assembler::m4); 5826 __ sub(count, count, step_64); 5827 __ bgtz(count, L_nmax_loop); 5828 5829 // There are three iterations left to do 5830 adler32_process_bytes(buff, s1, s2, vtable_32, vzero, 5831 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 5832 vtemp1, vtemp2, step_32, Assembler::m2); 5833 adler32_process_bytes(buff, s1, s2, vtable_16, vzero, 5834 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 5835 vtemp1, vtemp2, step_16, Assembler::m1); 5836 5837 // s1 = s1 % BASE 5838 __ remuw(s1, s1, base); 5839 // s2 = s2 % BASE 5840 __ remuw(s2, s2, base); 5841 5842 __ sub(len, len, nmax); 5843 __ sub(count, nmax, 16); 5844 __ bgez(len, L_nmax_loop_entry); 5845 5846 __ bind(L_by16); 5847 __ add(len, len, count); 5848 __ bltz(len, L_by1); 5849 // Trying to unroll 5850 __ mv(temp3, step_64); 5851 __ blt(len, temp3, L_by16_loop); 5852 5853 __ bind(L_by16_loop_unroll); 5854 adler32_process_bytes(buff, s1, s2, vtable_64, vzero, 5855 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 5856 vtemp1, vtemp2, step_64, Assembler::m4); 5857 __ sub(len, len, step_64); 5858 // By now the temp3 should still be 64 5859 __ bge(len, temp3, L_by16_loop_unroll); 5860 5861 __ bind(L_by16_loop); 5862 adler32_process_bytes(buff, s1, s2, vtable_16, vzero, 5863 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 5864 vtemp1, vtemp2, step_16, Assembler::m1); 5865 __ sub(len, len, step_16); 5866 __ bgez(len, L_by16_loop); 5867 5868 __ bind(L_by1); 5869 __ add(len, len, 15); 5870 __ bltz(len, L_do_mod); 5871 5872 __ bind(L_by1_loop); 5873 __ lbu(temp0, Address(buff, 0)); 5874 __ addi(buff, buff, step_1); 5875 __ add(s1, temp0, s1); 5876 __ add(s2, s2, s1); 5877 __ sub(len, len, step_1); 5878 __ bgez(len, L_by1_loop); 5879 5880 __ bind(L_do_mod); 5881 // s1 = s1 % BASE 5882 __ remuw(s1, s1, base); 5883 // s2 = s2 % BASE 5884 __ remuw(s2, s2, base); 5885 5886 // Combine lower bits and higher bits 5887 // adler = s1 | (s2 << 16) 5888 __ bind(L_combine); 5889 __ slli(s2, s2, 16); 5890 __ orr(s1, s1, s2); 5891 5892 __ leave(); // Required for proper stackwalking of RuntimeStub frame 5893 __ ret(); 5894 5895 return start; 5896 } 5897 5898 #endif // COMPILER2_OR_JVMCI 5899 5900 #ifdef COMPILER2 5901 5902 static const int64_t right_2_bits = right_n_bits(2); 5903 static const int64_t right_3_bits = right_n_bits(3); 5904 5905 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 5906 // are represented as long[5], with BITS_PER_LIMB = 26. 5907 // Pack five 26-bit limbs into three 64-bit registers. 
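  //
  // For reference, a C++ sketch of this packing (illustrative only, not part of the
  // stub; it assumes the limbs are already reduced so each element holds just its
  // 26 payload bits, whereas the code below uses add and therefore also tolerates
  // slightly wider limbs):
  //
  //   #include <cstdint>
  //
  //   // limbs[0..4] are 26-bit limbs, least significant first; result is d2:d1:d0.
  //   static void pack_26(const uint64_t limbs[5], uint64_t* d0, uint64_t* d1, uint64_t* d2) {
  //     *d0 = limbs[0] | (limbs[1] << 26) | (limbs[2] << 52);          // bits   0..63
  //     *d1 = (limbs[2] >> 12) | (limbs[3] << 14) | (limbs[4] << 40);  // bits  64..127
  //     *d2 = limbs[4] >> 24;                                          // bits 128..129
  //   }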
5908 void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) { 5909 assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2); 5910 5911 // The goal is to have 128-bit value in dest2:dest1:dest0 5912 __ ld(dest0, Address(src, 0)); // 26 bits in dest0 5913 5914 __ ld(tmp1, Address(src, sizeof(jlong))); 5915 __ slli(tmp1, tmp1, 26); 5916 __ add(dest0, dest0, tmp1); // 52 bits in dest0 5917 5918 __ ld(tmp2, Address(src, 2 * sizeof(jlong))); 5919 __ slli(tmp1, tmp2, 52); 5920 __ add(dest0, dest0, tmp1); // dest0 is full 5921 5922 __ srli(dest1, tmp2, 12); // 14-bit in dest1 5923 5924 __ ld(tmp1, Address(src, 3 * sizeof(jlong))); 5925 __ slli(tmp1, tmp1, 14); 5926 __ add(dest1, dest1, tmp1); // 40-bit in dest1 5927 5928 __ ld(tmp1, Address(src, 4 * sizeof(jlong))); 5929 __ slli(tmp2, tmp1, 40); 5930 __ add(dest1, dest1, tmp2); // dest1 is full 5931 5932 if (dest2->is_valid()) { 5933 __ srli(tmp1, tmp1, 24); 5934 __ mv(dest2, tmp1); // 2 bits in dest2 5935 } else { 5936 #ifdef ASSERT 5937 Label OK; 5938 __ srli(tmp1, tmp1, 24); 5939 __ beq(zr, tmp1, OK); // 2 bits 5940 __ stop("high bits of Poly1305 integer should be zero"); 5941 __ should_not_reach_here(); 5942 __ bind(OK); 5943 #endif 5944 } 5945 } 5946 5947 // As above, but return only a 128-bit integer, packed into two 5948 // 64-bit registers. 5949 void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) { 5950 poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2); 5951 } 5952 5953 // U_2:U_1:U_0: += (U_2 >> 2) * 5 5954 void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) { 5955 assert_different_registers(U_2, U_1, U_0, tmp1, tmp2); 5956 5957 // First, U_2:U_1:U_0 += (U_2 >> 2) 5958 __ srli(tmp1, U_2, 2); 5959 __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2 5960 __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits 5961 __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2 5962 __ add(U_2, U_2, tmp2); 5963 5964 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 5965 __ slli(tmp1, tmp1, 2); 5966 __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2 5967 __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2 5968 __ add(U_2, U_2, tmp2); 5969 } 5970 5971 // Poly1305, RFC 7539 5972 // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs) 5973 5974 // Arguments: 5975 // c_rarg0: input_start -- where the input is stored 5976 // c_rarg1: length 5977 // c_rarg2: acc_start -- where the output will be stored 5978 // c_rarg3: r_start -- where the randomly generated 128-bit key is stored 5979 5980 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 5981 // description of the tricks used to simplify and accelerate this 5982 // computation. 
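  //
  // A small reference sketch of the reduction identity used below (illustrative only;
  // fold_130 is a hypothetical helper): since 2^130 = 5 (mod 2^130 - 5), any part of
  // a value that sits at or above bit 130 can be folded back into the low 130 bits as
  // "high * 5". This is why RR_n is precomputed as (R_n >> 2) * 5 and why
  // poly1305_reduce adds (U_2 >> 2) * 5 while keeping only the low two bits of U_2.
  //
  //   #include <cstdint>
  //
  //   // low130 is the value's low 130 bits; high is the part already shifted down by 130.
  //   // (Both fit in 64 bits here purely for illustration.)
  //   static inline uint64_t fold_130(uint64_t low130, uint64_t high) {
  //     return low130 + high * 5;   // high * 2^130 = high * 5 (mod 2^130 - 5)
  //   }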
5983 5984 address generate_poly1305_processBlocks() { 5985 __ align(CodeEntryAlignment); 5986 StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); 5987 address start = __ pc(); 5988 __ enter(); 5989 Label here; 5990 5991 RegSet saved_regs = RegSet::range(x18, x21); 5992 RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin(); 5993 __ push_reg(saved_regs, sp); 5994 5995 // Arguments 5996 const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3; 5997 5998 // R_n is the 128-bit randomly-generated key, packed into two 5999 // registers. The caller passes this key to us as long[5], with 6000 // BITS_PER_LIMB = 26. 6001 const Register R_0 = *regs, R_1 = *++regs; 6002 poly1305_pack_26(R_0, R_1, r_start, t1, t2); 6003 6004 // RR_n is (R_n >> 2) * 5 6005 const Register RR_0 = *++regs, RR_1 = *++regs; 6006 __ srli(t1, R_0, 2); 6007 __ shadd(RR_0, t1, t1, t2, 2); 6008 __ srli(t1, R_1, 2); 6009 __ shadd(RR_1, t1, t1, t2, 2); 6010 6011 // U_n is the current checksum 6012 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 6013 poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2); 6014 6015 static constexpr int BLOCK_LENGTH = 16; 6016 Label DONE, LOOP; 6017 6018 __ mv(t1, BLOCK_LENGTH); 6019 __ blt(length, t1, DONE); { 6020 __ bind(LOOP); 6021 6022 // S_n is to be the sum of U_n and the next block of data 6023 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 6024 __ ld(S_0, Address(input_start, 0)); 6025 __ ld(S_1, Address(input_start, wordSize)); 6026 6027 __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1 6028 __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1 6029 __ add(S_2, U_2, t1); 6030 6031 __ addi(S_2, S_2, 1); 6032 6033 const Register U_0HI = *++regs, U_1HI = *++regs; 6034 6035 // NB: this logic depends on some of the special properties of 6036 // Poly1305 keys. In particular, because we know that the top 6037 // four bits of R_0 and R_1 are zero, we can add together 6038 // partial products without any risk of needing to propagate a 6039 // carry out. 6040 __ wide_mul(U_0, U_0HI, S_0, R_0); 6041 __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2); 6042 __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2); 6043 6044 __ wide_mul(U_1, U_1HI, S_0, R_1); 6045 __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2); 6046 __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2); 6047 6048 __ andi(U_2, R_0, right_2_bits); 6049 __ mul(U_2, S_2, U_2); 6050 6051 // Partial reduction mod 2**130 - 5 6052 __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1 6053 __ adc(U_2, U_2, U_1HI, t1); 6054 // Sum is now in U_2:U_1:U_0. 6055 6056 // U_2:U_1:U_0: += (U_2 >> 2) * 5 6057 poly1305_reduce(U_2, U_1, U_0, t1, t2); 6058 6059 __ sub(length, length, BLOCK_LENGTH); 6060 __ addi(input_start, input_start, BLOCK_LENGTH); 6061 __ mv(t1, BLOCK_LENGTH); 6062 __ bge(length, t1, LOOP); 6063 } 6064 6065 // Further reduce modulo 2^130 - 5 6066 poly1305_reduce(U_2, U_1, U_0, t1, t2); 6067 6068 // Unpack the sum into five 26-bit limbs and write to memory. 
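    //
    // For reference, a C++ sketch of the unpacking performed below (illustrative only,
    // not part of the stub; each limb is stored as a 64-bit jlong, and the last limb
    // may carry a few spare bits of U_2 beyond bit 129 because the accumulator is kept
    // only partially reduced):
    //
    //   #include <cstdint>
    //
    //   static void unpack_26(uint64_t u0, uint64_t u1, uint64_t u2, uint64_t limbs[5]) {
    //     limbs[0] =  u0        & 0x3ffffff;                // bits   0..25
    //     limbs[1] = (u0 >> 26) & 0x3ffffff;                // bits  26..51
    //     limbs[2] = (u0 >> 52) | ((u1 & 0x3fff) << 12);    // bits  52..77
    //     limbs[3] = (u1 >> 14) & 0x3ffffff;                // bits  78..103
    //     limbs[4] = (u1 >> 40) | ((u2 & 0x7) << 24);       // bits 104..129 (+ spare bits)
    //   }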
6069 // First 26 bits is the first limb 6070 __ slli(t1, U_0, 38); // Take lowest 26 bits 6071 __ srli(t1, t1, 38); 6072 __ sd(t1, Address(acc_start)); // First 26-bit limb 6073 6074 // 27-52 bits of U_0 is the second limb 6075 __ slli(t1, U_0, 12); // Take next 27-52 bits 6076 __ srli(t1, t1, 38); 6077 __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb 6078 6079 // Getting 53-64 bits of U_0 and 1-14 bits of U_1 in one register 6080 __ srli(t1, U_0, 52); 6081 __ slli(t2, U_1, 50); 6082 __ srli(t2, t2, 38); 6083 __ add(t1, t1, t2); 6084 __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb 6085 6086 // Storing 15-40 bits of U_1 6087 __ slli(t1, U_1, 24); // Already used up 14 bits 6088 __ srli(t1, t1, 38); // Clear all other bits from t1 6089 __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb 6090 6091 // Storing 41-64 bits of U_1 and first three bits from U_2 in one register 6092 __ srli(t1, U_1, 40); 6093 __ andi(t2, U_2, right_3_bits); 6094 __ slli(t2, t2, 24); 6095 __ add(t1, t1, t2); 6096 __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb 6097 6098 __ bind(DONE); 6099 __ pop_reg(saved_regs, sp); 6100 __ leave(); // Required for proper stackwalking 6101 __ ret(); 6102 6103 return start; 6104 } 6105 6106 #endif // COMPILER2 6107 6108 /** 6109 * Arguments: 6110 * 6111 * Inputs: 6112 * c_rarg0 - int crc 6113 * c_rarg1 - byte* buf 6114 * c_rarg2 - int length 6115 * 6116 * Output: 6117 * c_rarg0 - int crc result 6118 */ 6119 address generate_updateBytesCRC32() { 6120 assert(UseCRC32Intrinsics, "what are we doing here?"); 6121 6122 __ align(CodeEntryAlignment); 6123 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 6124 6125 address start = __ pc(); 6126 6127 // input parameters 6128 const Register crc = c_rarg0; // crc 6129 const Register buf = c_rarg1; // source java byte array address 6130 const Register len = c_rarg2; // length 6131 6132 BLOCK_COMMENT("Entry:"); 6133 __ enter(); // required for proper stackwalking of RuntimeStub frame 6134 6135 __ kernel_crc32(crc, buf, len, 6136 c_rarg3, c_rarg4, c_rarg5, c_rarg6, // tmp's for tables 6137 c_rarg7, t2, x28, x29, x30, x31); // misc tmps 6138 6139 __ leave(); // required for proper stackwalking of RuntimeStub frame 6140 __ ret(); 6141 6142 return start; 6143 } 6144 6145 // exception handler for upcall stubs 6146 address generate_upcall_stub_exception_handler() { 6147 StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler"); 6148 address start = __ pc(); 6149 6150 // Native caller has no idea how to handle exceptions, 6151 // so we just crash here. Up to callee to catch exceptions. 
__ verify_oop(x10); // the returned exception oop is in a0
6153     __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception));
6154     __ should_not_reach_here();
6155
6156     return start;
6157   }
6158
6159   // load Method* target of MethodHandle
6160   // j_rarg0 = jobject receiver
6161   // xmethod = Method* result
6162   address generate_upcall_stub_load_target() {
6163
6164     StubCodeMark mark(this, "StubRoutines", "upcall_stub_load_target");
6165     address start = __ pc();
6166
6167     __ resolve_global_jobject(j_rarg0, t0, t1);
6168     // Load target method from receiver
6169     __ load_heap_oop(xmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), t0, t1);
6170     __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_LambdaForm::vmentry_offset()), t0, t1);
6171     __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_MemberName::method_offset()), t0, t1);
6172     __ access_load_at(T_ADDRESS, IN_HEAP, xmethod,
6173                       Address(xmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
6174                       noreg, noreg);
6175     __ sd(xmethod, Address(xthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
6176
6177     __ ret();
6178
6179     return start;
6180   }
6181
6182 #undef __
6183
6184   // Initialization
6185   void generate_initial_stubs() {
6186     // Generates the initial stubs and initializes the entry points
6187
6188     // Entry points that exist on all platforms. Note: this is code
6189     // that could be shared among different platforms - however the
6190     // benefit seems to be smaller than the disadvantage of having a
6191     // much more complicated generator structure. See also the comment in
6192     // stubRoutines.hpp.
6193
6194     StubRoutines::_forward_exception_entry = generate_forward_exception();
6195
6196     if (UnsafeMemoryAccess::_table == nullptr) {
6197       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
6198     }
6199
6200     StubRoutines::_call_stub_entry =
6201       generate_call_stub(StubRoutines::_call_stub_return_address);
6202
6203     // is referenced by megamorphic call
6204     StubRoutines::_catch_exception_entry = generate_catch_exception();
6205
6206     if (UseCRC32Intrinsics) {
6207       // set the table address before generating the stubs that use it
6208       StubRoutines::_crc_table_adr = (address)StubRoutines::riscv::_crc_table;
6209       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
6210     }
6211   }
6212
6213   void generate_continuation_stubs() {
6214     // Continuation stubs:
6215     StubRoutines::_cont_thaw = generate_cont_thaw();
6216     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
6217     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
6218     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
6219   }
6220
6221   void generate_final_stubs() {
6222     // support for verify_oop (must happen after universe_init)
6223     if (VerifyOops) {
6224       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
6225     }
6226
6227     // arraycopy stubs used by compilers
6228     generate_arraycopy_stubs();
6229
6230     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
6231     if (bs_nm != nullptr) {
6232       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
6233     }
6234
6235 #ifdef COMPILER2
6236     if (UseSecondarySupersTable) {
6237       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
6238       if (!InlineSecondarySupersTest) {
6239         for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
6240
StubRoutines::_lookup_secondary_supers_table_stubs[slot] 6241 = generate_lookup_secondary_supers_table_stub(slot); 6242 } 6243 } 6244 } 6245 #endif // COMPILER2 6246 6247 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 6248 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 6249 6250 StubRoutines::riscv::set_completed(); 6251 } 6252 6253 void generate_compiler_stubs() { 6254 #ifdef COMPILER2 6255 if (UseMulAddIntrinsic) { 6256 StubRoutines::_mulAdd = generate_mulAdd(); 6257 } 6258 6259 if (UseMultiplyToLenIntrinsic) { 6260 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 6261 } 6262 6263 if (UseSquareToLenIntrinsic) { 6264 StubRoutines::_squareToLen = generate_squareToLen(); 6265 } 6266 6267 if (UseMontgomeryMultiplyIntrinsic) { 6268 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 6269 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 6270 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 6271 } 6272 6273 if (UseMontgomerySquareIntrinsic) { 6274 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 6275 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 6276 StubRoutines::_montgomerySquare = g.generate_square(); 6277 } 6278 6279 if (UsePoly1305Intrinsics) { 6280 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 6281 } 6282 6283 if (UseRVVForBigIntegerShiftIntrinsics) { 6284 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 6285 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 6286 } 6287 6288 if (UseSHA256Intrinsics) { 6289 Sha2Generator sha2(_masm, this); 6290 StubRoutines::_sha256_implCompress = sha2.generate_sha256_implCompress(false); 6291 StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(true); 6292 } 6293 6294 if (UseSHA512Intrinsics) { 6295 Sha2Generator sha2(_masm, this); 6296 StubRoutines::_sha512_implCompress = sha2.generate_sha512_implCompress(false); 6297 StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(true); 6298 } 6299 6300 if (UseMD5Intrinsics) { 6301 StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress"); 6302 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB"); 6303 } 6304 6305 if (UseChaCha20Intrinsics) { 6306 StubRoutines::_chacha20Block = generate_chacha20Block(); 6307 } 6308 6309 if (UseSHA1Intrinsics) { 6310 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 6311 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 6312 } 6313 6314 if (UseBASE64Intrinsics) { 6315 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 6316 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 6317 } 6318 6319 if (UseAdler32Intrinsics) { 6320 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 6321 } 6322 6323 generate_compare_long_strings(); 6324 6325 generate_string_indexof_stubs(); 6326 6327 #endif // COMPILER2 6328 } 6329 6330 public: 6331 StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) { 6332 switch(kind) { 6333 case Initial_stubs: 6334 generate_initial_stubs(); 6335 break; 6336 case Continuation_stubs: 6337 generate_continuation_stubs(); 6338 break; 6339 case Compiler_stubs: 6340 generate_compiler_stubs(); 6341 break; 6342 case Final_stubs: 6343 generate_final_stubs(); 6344 break; 6345 default: 6346 fatal("unexpected stubs kind: 
%d", kind); 6347 break; 6348 }; 6349 } 6350 }; // end class declaration 6351 6352 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) { 6353 StubGenerator g(code, kind); 6354 }