1 /* 2 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved. 4 * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved. 5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 6 * 7 * This code is free software; you can redistribute it and/or modify it 8 * under the terms of the GNU General Public License version 2 only, as 9 * published by the Free Software Foundation. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 
24 * 25 */ 26 27 #include "precompiled.hpp" 28 #include "asm/macroAssembler.hpp" 29 #include "asm/macroAssembler.inline.hpp" 30 #include "compiler/oopMap.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "interpreter/interpreter.hpp" 34 #include "memory/universe.hpp" 35 #include "nativeInst_riscv.hpp" 36 #include "oops/instanceOop.hpp" 37 #include "oops/method.hpp" 38 #include "oops/objArrayKlass.hpp" 39 #include "oops/oop.inline.hpp" 40 #include "prims/methodHandles.hpp" 41 #include "prims/upcallLinker.hpp" 42 #include "runtime/continuation.hpp" 43 #include "runtime/continuationEntry.inline.hpp" 44 #include "runtime/frame.inline.hpp" 45 #include "runtime/handles.inline.hpp" 46 #include "runtime/javaThread.hpp" 47 #include "runtime/sharedRuntime.hpp" 48 #include "runtime/stubCodeGenerator.hpp" 49 #include "runtime/stubRoutines.hpp" 50 #include "utilities/align.hpp" 51 #include "utilities/powerOfTwo.hpp" 52 #ifdef COMPILER2 53 #include "opto/runtime.hpp" 54 #endif 55 56 // Declaration and definition of StubGenerator (no .hpp file). 
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

// Bind a label and, in debug builds, emit a block comment naming it
// so the label shows up in disassembly listings.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  // Emit code that increments the 32-bit statistics counter located at
  // the (fixed) external address of 'counter'.
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save x1 (ra) as the return PC at the base of the frame and
  // link x8 (fp) below it as the frame pointer installing sp (x2)
  // into fp.
  //
  // we save x10-x17, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save x5 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
  // volatile
  //
  // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
  // registers and C expects to be callee-save
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -35 [ argument word 1      ]
  // -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call
  // -33 [ saved f27            ]
  // -32 [ saved f26            ]
  // -31 [ saved f25            ]
  // -30 [ saved f24            ]
  // -29 [ saved f23            ]
  // -28 [ saved f22            ]
  // -27 [ saved f21            ]
  // -26 [ saved f20            ]
  // -25 [ saved f19            ]
  // -24 [ saved f18            ]
  // -23 [ saved f9             ]
  // -22 [ saved f8             ]
  // -21 [ saved x27            ]
  // -20 [ saved x26            ]
  // -19 [ saved x25            ]
  // -18 [ saved x24            ]
  // -17 [ saved x23            ]
  // -16 [ saved x22            ]
  // -15 [ saved x21            ]
  // -14 [ saved x20            ]
  // -13 [ saved x19            ]
  // -12 [ saved x18            ]
  // -11 [ saved x9             ]
  // -10 [ call wrapper   (x10) ]
  //  -9 [ result         (x11) ]
  //  -8 [ result type    (x12) ]
  //  -7 [ method         (x13) ]
  //  -6 [ entry point    (x14) ]
  //  -5 [ parameters     (x15) ]
  //  -4 [ parameter size (x16) ]
  //  -3 [ thread         (x17) ]
  //  -2 [ saved fp       (x8)  ]
  //  -1 [ saved ra       (x1)  ]
  //   0 [                      ] <--- fp == saved sp (x2)

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -34,

    frm_off            = sp_after_call_off,
    f27_off            = -33,
    f26_off            = -32,
    f25_off            = -31,
    f24_off            = -30,
    f23_off            = -29,
    f22_off            = -28,
    f21_off            = -27,
    f20_off            = -26,
    f19_off            = -25,
    f18_off            = -24,
    f9_off             = -23,
    f8_off             = -22,

    x27_off            = -21,
    x26_off            = -20,
    x25_off            = -19,
    x24_off            = -18,
    x23_off            = -17,
    x22_off            = -16,
    x21_off            = -15,
    x20_off            = -14,
    x19_off            = -13,
    x18_off            = -12,
    x9_off             = -11,

    call_wrapper_off   = -10,
    result_off         = -9,
    result_type_off    = -8,
    method_off         = -7,
    entry_point_off    = -6,
    parameters_off     = -5,
    parameter_size_off = -4,
    thread_off         = -3,
    fp_f               = -2,
    retaddr_off        = -1,
  };

  // Generate the "call stub": the single entry point through which the VM
  // calls into Java code.  Sets up the frame described above, saves all
  // callee-saved registers Java may clobber, pushes the Java arguments,
  // jumps to the interpreter/compiled entry point, and on return stores
  // the Java result (dispatched on its BasicType) through the result
  // pointer before restoring the C caller's registers.
  address generate_call_stub(address& return_address) {
    // The frame code elsewhere hard-codes this layout; keep them in sync.
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call (fp, sp_after_call_off * wordSize);

    const Address frm_save      (fp, frm_off           * wordSize);
    const Address call_wrapper  (fp, call_wrapper_off  * wordSize);
    const Address result        (fp, result_off        * wordSize);
    const Address result_type   (fp, result_type_off   * wordSize);
    const Address method        (fp, method_off        * wordSize);
    const Address entry_point   (fp, entry_point_off   * wordSize);
    const Address parameters    (fp, parameters_off    * wordSize);
    const Address parameter_size(fp, parameter_size_off * wordSize);

    const Address thread        (fp, thread_off        * wordSize);

    const Address f27_save      (fp, f27_off           * wordSize);
    const Address f26_save      (fp, f26_off           * wordSize);
    const Address f25_save      (fp, f25_off           * wordSize);
    const Address f24_save      (fp, f24_off           * wordSize);
    const Address f23_save      (fp, f23_off           * wordSize);
    const Address f22_save      (fp, f22_off           * wordSize);
    const Address f21_save      (fp, f21_off           * wordSize);
    const Address f20_save      (fp, f20_off           * wordSize);
    const Address f19_save      (fp, f19_off           * wordSize);
    const Address f18_save      (fp, f18_off           * wordSize);
    const Address f9_save       (fp, f9_off            * wordSize);
    const Address f8_save       (fp, f8_off            * wordSize);

    const Address x27_save      (fp, x27_off           * wordSize);
    const Address x26_save      (fp, x26_off           * wordSize);
    const Address x25_save      (fp, x25_off           * wordSize);
    const Address x24_save      (fp, x24_off           * wordSize);
    const Address x23_save      (fp, x23_off           * wordSize);
    const Address x22_save      (fp, x22_off           * wordSize);
    const Address x21_save      (fp, x21_off           * wordSize);
    const Address x20_save      (fp, x20_off           * wordSize);
    const Address x19_save      (fp, x19_off           * wordSize);
    const Address x18_save      (fp, x18_off           * wordSize);

    const Address x9_save       (fp, x9_off            * wordSize);

    // stub code

    address riscv_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ addi(sp, fp, sp_after_call_off * wordSize);

    // save register parameters and Java temporary/global registers
    // n.b. we save thread even though it gets installed in
    // xthread because we want to sanity check tp later
    __ sd(c_rarg7, thread);
    __ sw(c_rarg6, parameter_size);
    __ sd(c_rarg5, parameters);
    __ sd(c_rarg4, entry_point);
    __ sd(c_rarg3, method);
    __ sd(c_rarg2, result_type);
    __ sd(c_rarg1, result);
    __ sd(c_rarg0, call_wrapper);

    __ sd(x9, x9_save);

    __ sd(x18, x18_save);
    __ sd(x19, x19_save);
    __ sd(x20, x20_save);
    __ sd(x21, x21_save);
    __ sd(x22, x22_save);
    __ sd(x23, x23_save);
    __ sd(x24, x24_save);
    __ sd(x25, x25_save);
    __ sd(x26, x26_save);
    __ sd(x27, x27_save);

    __ fsd(f8,  f8_save);
    __ fsd(f9,  f9_save);
    __ fsd(f18, f18_save);
    __ fsd(f19, f19_save);
    __ fsd(f20, f20_save);
    __ fsd(f21, f21_save);
    __ fsd(f22, f22_save);
    __ fsd(f23, f23_save);
    __ fsd(f24, f24_save);
    __ fsd(f25, f25_save);
    __ fsd(f26, f26_save);
    __ fsd(f27, f27_save);

    // Save the C caller's FP rounding mode (frm) so we can restore it on exit.
    __ frrm(t0);
    __ sd(t0, frm_save);
    // Set frm to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes.
    Label skip_fsrmi;
    // rne encodes as 0, so "frm already zero" means "already round-to-nearest"
    // and the fsrmi write can be skipped.
    guarantee(__ RoundingMode::rne == 0, "must be");
    __ beqz(t0, skip_fsrmi);
    __ fsrmi(__ RoundingMode::rne);
    __ bind(skip_fsrmi);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mv(xthread, c_rarg7);

    // And method
    __ mv(xmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
      __ beqz(t0, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mv(esp, sp);
    __ slli(t0, c_rarg6, LogBytesPerWord);
    __ sub(t0, sp, t0); // Move SP out of the way
    // keep sp 16-byte aligned as required by the ABI
    __ andi(sp, t0, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ beqz(c_rarg6, parameters_done);

    // copy parameters from the parameter array onto the Java expression stack
    address loop = __ pc();
    __ ld(t0, Address(c_rarg5, 0));
    __ addi(c_rarg5, c_rarg5, wordSize);
    __ addi(c_rarg6, c_rarg6, -1);
    __ push_reg(t0);
    __ bgtz(c_rarg6, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      xmethod: Method*
    //      x19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mv(x19_sender_sp, sp);
    __ jalr(c_rarg4);

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in x10
    // and a floating result in j_farg0
    __ ld(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ld(j_rarg1, result_type);
    // T_OBJECT results are also a full 64-bit word, so they share the
    // is_long (sd) store path.
    __ mv(t0, (u1)T_OBJECT);
    __ beq(j_rarg1, t0, is_long);
    __ mv(t0, (u1)T_LONG);
    __ beq(j_rarg1, t0, is_long);
    __ mv(t0, (u1)T_FLOAT);
    __ beq(j_rarg1, t0, is_float);
    __ mv(t0, (u1)T_DOUBLE);
    __ beq(j_rarg1, t0, is_double);

    // handle T_INT case
    __ sw(x10, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ addi(esp, fp, sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ld(t0, thread);
      __ bne(xthread, t0, S);
      __ get_thread(t0);
      __ beq(xthread, t0, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(xthread);

    // restore callee-save registers
    __ fld(f27, f27_save);
    __ fld(f26, f26_save);
    __ fld(f25, f25_save);
    __ fld(f24, f24_save);
    __ fld(f23, f23_save);
    __ fld(f22, f22_save);
    __ fld(f21, f21_save);
    __ fld(f20, f20_save);
    __ fld(f19, f19_save);
    __ fld(f18, f18_save);
    __ fld(f9,  f9_save);
    __ fld(f8,  f8_save);

    __ ld(x27, x27_save);
    __ ld(x26, x26_save);
    __ ld(x25, x25_save);
    __ ld(x24, x24_save);
    __ ld(x23, x23_save);
    __ ld(x22, x22_save);
    __ ld(x21, x21_save);
    __ ld(x20, x20_save);
    __ ld(x19, x19_save);
    __ ld(x18, x18_save);

    __ ld(x9, x9_save);

    // restore frm (skip the CSR write when it is already the saved value)
    Label skip_fsrm;
    __ ld(t0, frm_save);
    __ frrm(t1);
    __ beq(t0, t1, skip_fsrm);
    __ fsrm(t0);
    __ bind(skip_fsrm);

    // reload the original C argument registers
    __ ld(c_rarg0, call_wrapper);
    __ ld(c_rarg1, result);
    __ ld(c_rarg2, result_type);
    __ ld(c_rarg3, method);
    __ ld(c_rarg4, entry_point);
    __ ld(c_rarg5, parameters);
    // NOTE(review): parameter_size was saved with sw (32-bit) above but is
    // reloaded with ld (64-bit), so the upper bits may be stale -- confirm
    // no caller relies on the exact reloaded value.
    __ ld(c_rarg6, parameter_size);
    __ ld(c_rarg7, thread);

    // leave frame and return to caller
    __ leave();
    __ ret();

    // handle return types different from T_INT

    __ BIND(is_long);
    __ sd(x10, Address(j_rarg2, 0));
    __ j(exit);

    __ BIND(is_float);
    __ fsw(j_farg0, Address(j_rarg2, 0), t0);
    __ j(exit);

    __ BIND(is_double);
    __ fsd(j_farg0, Address(j_rarg2, 0), t0);
    __ j(exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // sp.
  //
  // x10: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address thread(fp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ld(t0, thread);
      __ bne(xthread, t0, S);
      __ get_thread(t0);
      __ beq(xthread, t0, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(x10);

    // record the exception oop plus the file/line of this stub for diagnostics
    __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
    __ mv(t0, (address)__FILE__);
    __ sd(t0, Address(xthread, Thread::exception_file_offset()));
    __ mv(t0, (int)__LINE__);
    __ sw(t0, Address(xthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __
j(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // x10: exception
  // x13: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in RA !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, RA points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
      __ bnez(t0, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into x9

    // call the VM to find the handler address associated with the
    // caller address. pass thread in x10 and caller pc (ret address)
    // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
    // the stack.
    __ mv(c_rarg1, ra);
    // ra will be trashed by the VM call so we move it to x9
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mv(x9, ra);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    xthread, c_rarg1);
    // we should not really care that ra is no longer the callee
    // address. we saved the value the handler needs in x9 so we can
    // just copy it to x13. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore ra here to satisfy that assert.
    __ mv(ra, x9);
    // setup x10 & x13 & clear pending exception
    __ mv(x13, x9);
    __ mv(x9, x10);
    __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
    __ sd(zr, Address(xthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ bnez(x10, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // x10: exception
    // x13: throwing pc
    // x9: exception handler
    __ verify_oop(x10);
    __ jr(x9);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    x10: oop to verify
  //    t0: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved ra
  //    [tos + 3]: saved t1
  //    [tos + 4]: saved x10
  //    [tos + 5]: saved t0
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3

    // bump the global verify_oop counter
    __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ld(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ sd(c_rarg3, Address(c_rarg2));

    // object is in x10
    // make sure object is 'reasonable'
    __ beqz(x10, exit); // if obj is null it is OK

    // delegate the actual oop sanity checks to the active GC barrier set
    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3
    __ ret();

    // handle errors
    __ bind(error);
    __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3

    // save the full register state so MacroAssembler::debug64 can dump it
    __ push_reg(RegSet::range(x0, x31), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mv(c_rarg0, t0);             // pass address of error message
    __ mv(c_rarg1, ra);             // pass return address
    __ mv(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ ebreak();

    return start;
  }

  // The inner part of zero_words().
  //
  // Inputs:
  // x28: the HeapWord-aligned base address of an array to zero.
  // x29: the count in HeapWords, x29 > 0.
  //
  // Returns x28 and x29, adjusted for the caller to clear.
  // x28: the base address of the tail of words left to clear.
  // x29: the number of words in the tail.
  //      x29 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;

    const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      // Ensure count >= 2*CacheLineSize so that it still deserves a cbo.zero
      // after alignment.
      Label small;
      int low_limit = MAX2(2 * CacheLineSize, BlockZeroingLowLimit) / wordSize;
      __ mv(tmp1, low_limit);
      __ blt(cnt, tmp1, small);
      __ zero_dcache_blocks(base, cnt, tmp1, tmp2);
      __ bind(small);
    }

    {
      // Clear the remaining blocks, zero_words_block_size words per iteration.
      Label loop;
      __ mv(tmp1, MacroAssembler::zero_words_block_size);
      __ blt(cnt, tmp1, done);
      __ bind(loop);
      for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
        __ sd(zr, Address(base, i * wordSize));
      }
      __ add(base, base, MacroAssembler::zero_words_block_size * wordSize);
      __ sub(cnt, cnt, MacroAssembler::zero_words_block_size);
      __ bge(cnt, tmp1, loop);
      __ bind(done);
    }

    __ ret();

    return start;
  }

  // Direction of a bulk copy; the value doubles as the per-element step sign.
  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.
727 // 728 // s and d are adjusted to point to the remaining words to copy 729 // 730 void generate_copy_longs(Label &start, Register s, Register d, Register count, 731 copy_direction direction) { 732 int unit = wordSize * direction; 733 int bias = wordSize; 734 735 const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16, 736 tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29; 737 738 const Register stride = x30; 739 740 assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3, 741 tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7); 742 assert_different_registers(s, d, count, t0); 743 744 Label again, drain; 745 const char* stub_name = nullptr; 746 if (direction == copy_forwards) { 747 stub_name = "forward_copy_longs"; 748 } else { 749 stub_name = "backward_copy_longs"; 750 } 751 StubCodeMark mark(this, "StubRoutines", stub_name); 752 __ align(CodeEntryAlignment); 753 __ bind(start); 754 755 if (direction == copy_forwards) { 756 __ sub(s, s, bias); 757 __ sub(d, d, bias); 758 } 759 760 #ifdef ASSERT 761 // Make sure we are never given < 8 words 762 { 763 Label L; 764 765 __ mv(t0, 8); 766 __ bge(count, t0, L); 767 __ stop("genrate_copy_longs called with < 8 words"); 768 __ bind(L); 769 } 770 #endif 771 772 __ ld(tmp_reg0, Address(s, 1 * unit)); 773 __ ld(tmp_reg1, Address(s, 2 * unit)); 774 __ ld(tmp_reg2, Address(s, 3 * unit)); 775 __ ld(tmp_reg3, Address(s, 4 * unit)); 776 __ ld(tmp_reg4, Address(s, 5 * unit)); 777 __ ld(tmp_reg5, Address(s, 6 * unit)); 778 __ ld(tmp_reg6, Address(s, 7 * unit)); 779 __ ld(tmp_reg7, Address(s, 8 * unit)); 780 __ addi(s, s, 8 * unit); 781 782 __ sub(count, count, 16); 783 __ bltz(count, drain); 784 785 __ bind(again); 786 787 __ sd(tmp_reg0, Address(d, 1 * unit)); 788 __ sd(tmp_reg1, Address(d, 2 * unit)); 789 __ sd(tmp_reg2, Address(d, 3 * unit)); 790 __ sd(tmp_reg3, Address(d, 4 * unit)); 791 __ sd(tmp_reg4, Address(d, 5 * unit)); 792 __ sd(tmp_reg5, Address(d, 6 * unit)); 793 __ sd(tmp_reg6, 
Address(d, 7 * unit)); 794 __ sd(tmp_reg7, Address(d, 8 * unit)); 795 796 __ ld(tmp_reg0, Address(s, 1 * unit)); 797 __ ld(tmp_reg1, Address(s, 2 * unit)); 798 __ ld(tmp_reg2, Address(s, 3 * unit)); 799 __ ld(tmp_reg3, Address(s, 4 * unit)); 800 __ ld(tmp_reg4, Address(s, 5 * unit)); 801 __ ld(tmp_reg5, Address(s, 6 * unit)); 802 __ ld(tmp_reg6, Address(s, 7 * unit)); 803 __ ld(tmp_reg7, Address(s, 8 * unit)); 804 805 __ addi(s, s, 8 * unit); 806 __ addi(d, d, 8 * unit); 807 808 __ sub(count, count, 8); 809 __ bgez(count, again); 810 811 // Drain 812 __ bind(drain); 813 814 __ sd(tmp_reg0, Address(d, 1 * unit)); 815 __ sd(tmp_reg1, Address(d, 2 * unit)); 816 __ sd(tmp_reg2, Address(d, 3 * unit)); 817 __ sd(tmp_reg3, Address(d, 4 * unit)); 818 __ sd(tmp_reg4, Address(d, 5 * unit)); 819 __ sd(tmp_reg5, Address(d, 6 * unit)); 820 __ sd(tmp_reg6, Address(d, 7 * unit)); 821 __ sd(tmp_reg7, Address(d, 8 * unit)); 822 __ addi(d, d, 8 * unit); 823 824 { 825 Label L1, L2; 826 __ test_bit(t0, count, 2); 827 __ beqz(t0, L1); 828 829 __ ld(tmp_reg0, Address(s, 1 * unit)); 830 __ ld(tmp_reg1, Address(s, 2 * unit)); 831 __ ld(tmp_reg2, Address(s, 3 * unit)); 832 __ ld(tmp_reg3, Address(s, 4 * unit)); 833 __ addi(s, s, 4 * unit); 834 835 __ sd(tmp_reg0, Address(d, 1 * unit)); 836 __ sd(tmp_reg1, Address(d, 2 * unit)); 837 __ sd(tmp_reg2, Address(d, 3 * unit)); 838 __ sd(tmp_reg3, Address(d, 4 * unit)); 839 __ addi(d, d, 4 * unit); 840 841 __ bind(L1); 842 843 if (direction == copy_forwards) { 844 __ addi(s, s, bias); 845 __ addi(d, d, bias); 846 } 847 848 __ test_bit(t0, count, 1); 849 __ beqz(t0, L2); 850 if (direction == copy_backwards) { 851 __ addi(s, s, 2 * unit); 852 __ ld(tmp_reg0, Address(s)); 853 __ ld(tmp_reg1, Address(s, wordSize)); 854 __ addi(d, d, 2 * unit); 855 __ sd(tmp_reg0, Address(d)); 856 __ sd(tmp_reg1, Address(d, wordSize)); 857 } else { 858 __ ld(tmp_reg0, Address(s)); 859 __ ld(tmp_reg1, Address(s, wordSize)); 860 __ addi(s, s, 2 * unit); 861 __ 
sd(tmp_reg0, Address(d)); 862 __ sd(tmp_reg1, Address(d, wordSize)); 863 __ addi(d, d, 2 * unit); 864 } 865 __ bind(L2); 866 } 867 868 __ ret(); 869 } 870 871 Label copy_f, copy_b; 872 873 typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp); 874 875 void copy_memory_v(Register s, Register d, Register count, int step) { 876 bool is_backward = step < 0; 877 int granularity = uabs(step); 878 879 const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17; 880 assert_different_registers(s, d, cnt, vl, tmp1, tmp2); 881 Assembler::SEW sew = Assembler::elembytes_to_sew(granularity); 882 Label loop_forward, loop_backward, done; 883 884 __ mv(dst, d); 885 __ mv(src, s); 886 __ mv(cnt, count); 887 888 __ bind(loop_forward); 889 __ vsetvli(vl, cnt, sew, Assembler::m8); 890 if (is_backward) { 891 __ bne(vl, cnt, loop_backward); 892 } 893 894 __ vlex_v(v0, src, sew); 895 __ sub(cnt, cnt, vl); 896 if (sew != Assembler::e8) { 897 // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary 898 __ slli(vl, vl, sew); 899 } 900 __ add(src, src, vl); 901 902 __ vsex_v(v0, dst, sew); 903 __ add(dst, dst, vl); 904 __ bnez(cnt, loop_forward); 905 906 if (is_backward) { 907 __ j(done); 908 909 __ bind(loop_backward); 910 __ sub(t0, cnt, vl); 911 if (sew != Assembler::e8) { 912 // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary 913 __ slli(t0, t0, sew); 914 } 915 __ add(tmp1, s, t0); 916 __ vlex_v(v0, tmp1, sew); 917 __ add(tmp2, d, t0); 918 __ vsex_v(v0, tmp2, sew); 919 __ sub(cnt, cnt, vl); 920 __ bnez(cnt, loop_forward); 921 __ bind(done); 922 } 923 } 924 925 // All-singing all-dancing memory copy. 926 // 927 // Copy count units of memory from s to d. The size of a unit is 928 // step, which can be positive or negative depending on the direction 929 // of copy. 
  //
  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    // Prefer the vector implementation when RVV is available and the GC
    // barrier set permits vectorized arraycopy of this element type.
    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
      return copy_memory_v(s, d, count, step);
    }

    bool is_backwards = step < 0;
    int granularity = uabs(step);

    const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
    const Register gct1 = x28, gct2 = x29, gct3 = t2;

    Label same_aligned;
    Label copy_big, copy32_loop, copy8_loop, copy_small, done;

    // The size of copy32_loop body increases significantly with ZGC GC barriers.
    // Need conditional far branches to reach a point beyond the loop in this case.
    bool is_far = UseZGC && ZGenerational;

    __ beqz(count, done, is_far);
    // cnt = byte count; for backwards copies src/dst start one-past-the-end
    __ slli(cnt, count, exact_log2(granularity));
    if (is_backwards) {
      __ add(src, s, cnt);
      __ add(dst, d, cnt);
    } else {
      __ mv(src, s);
      __ mv(dst, d);
    }

    if (is_aligned) {
      // caller guarantees 8-byte alignment: pick a loop purely by size
      __ addi(t0, cnt, -32);
      __ bgez(t0, copy32_loop);
      __ addi(t0, cnt, -8);
      __ bgez(t0, copy8_loop, is_far);
      __ j(copy_small);
    } else {
      // short copies aren't worth aligning
      __ mv(t0, 16);
      __ blt(cnt, t0, copy_small, is_far);

      // src and dst must be mutually alignable (same address bits 0..2)
      // for the word loops; otherwise copy element-by-element
      __ xorr(t0, src, dst);
      __ andi(t0, t0, 0b111);
      __ bnez(t0, copy_small, is_far);

      // copy single elements until src (and hence dst) is 8-byte aligned
      __ bind(same_aligned);
      __ andi(t0, src, 0b111);
      __ beqz(t0, copy_big);
      if (is_backwards) {
        __ addi(src, src, step);
        __ addi(dst, dst, step);
      }
      bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
      bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
      if (!is_backwards) {
        __ addi(src, src, step);
        __ addi(dst, dst, step);
      }
      __ addi(cnt, cnt, -granularity);
      __ beqz(cnt, done, is_far);
      __ j(same_aligned);

      __ bind(copy_big);
      __ mv(t0, 32);
      __ blt(cnt, t0, copy8_loop, is_far);
    }

    // main loop: 32 bytes (4 words) per iteration
    __ bind(copy32_loop);
    if (is_backwards) {
      __ addi(src, src, -wordSize * 4);
      __ addi(dst, dst, -wordSize * 4);
    }
    // we first load 32 bytes, then write it, so the direction here doesn't matter
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8), gct1);
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);

    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8), tmp4, gct1, gct2, gct3);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);

    if (!is_backwards) {
      __ addi(src, src, wordSize * 4);
      __ addi(dst, dst, wordSize * 4);
    }
    __ addi(t0, cnt, -(32 + wordSize * 4));
    __ addi(cnt, cnt, -wordSize * 4);
    __ bgez(t0, copy32_loop); // cnt >= 32, do next loop

    __ beqz(cnt, done); // if that's all - done

    __ addi(t0, cnt, -8); // if not - copy the remainder
    __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop

    // one word (8 bytes) per iteration
    __ bind(copy8_loop);
    if (is_backwards) {
      __ addi(src, src, -wordSize);
      __ addi(dst, dst, -wordSize);
    }
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);

    if (!is_backwards) {
      __ addi(src, src, wordSize);
      __ addi(dst, dst, wordSize);
    }
    __ addi(t0, cnt, -(8 + wordSize));
    __ addi(cnt, cnt, -wordSize);
    __ bgez(t0, copy8_loop); // cnt >= 8, do next loop

    __ beqz(cnt, done); // if that's all - done

    // sub-word tail: one element of 'granularity' bytes per iteration
    __ bind(copy_small);
    if (is_backwards) {
      __ addi(src, src, step);
      __ addi(dst, dst, step);
    }

    bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
    bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);

    if (!is_backwards) {
      __ addi(src, src, step);
      __ addi(dst, dst, step);
    }
    __ addi(cnt, cnt, -granularity);
    __ bgtz(cnt, copy_small);

    __ bind(done);
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers t0 and t1.
  void verify_oop_array(size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mv(t1, zr);
    // t0 = total byte length; t1 walks the byte offset
    __ slli(t0, count, exact_log2(size));
    __ bind(loop);
    __ bgeu(t1, t0, end);

    __ add(temp, a, t1);
    if (size == (size_t)wordSize) {
      // uncompressed oop
      __ ld(temp, Address(temp, 0));
      __ verify_oop(temp);
    } else {
      // narrow (compressed) oop
      __ lwu(temp, Address(temp, 0));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(t1, t1, size);
    __ j(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address* entry,
                                 const char* name, bool dest_uninitialized = false) {
    const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory; d and count are needed again
      // for the epilogue barrier and oop verification below.
      __ push_reg(RegSet::of(d, count), sp);
    }

    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      // Positive 'size' selects a forward (low-to-high) copy.
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop_reg(RegSet::of(d, count), sp);
      if (VerifyOops) {
        verify_oop_array(size, d, count, t2);
      }
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());

    __ leave();
    __ mv(x10, zr); // return 0
    __ ret();
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address* entry, const char* name,
                                 bool dest_uninitialized = false) {
    const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    // (unsigned compare also handles d < s, where d-s wraps to a huge value
    // only when the regions do not overlap in the copy direction)
    __ sub(t0, d, s);
    __ slli(t1, count, exact_log2(size));
    Label L_continue;
    __ bltu(t0, t1, L_continue);
    __ j(nooverlap_target);
    __ bind(L_continue);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push_reg(RegSet::of(d, count), sp);
    }

    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      // Negative 'size' selects a backward (high-to-low) copy, which is safe
      // for the overlapping case that falls through to here.
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
    }

    if (is_oop) {
      __ pop_reg(RegSet::of(d, count), sp);
      if (VerifyOops) {
        verify_oop_array(size, d, count, t2);
      }
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
    __ leave();
    __ mv(x10, zr); // return 0
    __ ret();
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
1252 // 1253 // Side Effects: 1254 // disjoint_byte_copy_entry is set to the no-overlap entry point 1255 // used by generate_conjoint_byte_copy(). 1256 // 1257 address generate_disjoint_byte_copy(bool aligned, address* entry, const char* name) { 1258 const bool not_oop = false; 1259 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1260 } 1261 1262 // Arguments: 1263 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1264 // ignored 1265 // name - stub name string 1266 // 1267 // Inputs: 1268 // c_rarg0 - source array address 1269 // c_rarg1 - destination array address 1270 // c_rarg2 - element count, treated as ssize_t, can be zero 1271 // 1272 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1273 // we let the hardware handle it. The one to eight bytes within words, 1274 // dwords or qwords that span cache line boundaries will still be loaded 1275 // and stored atomically. 1276 // 1277 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1278 address* entry, const char* name) { 1279 const bool not_oop = false; 1280 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1281 } 1282 1283 // Arguments: 1284 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1285 // ignored 1286 // name - stub name string 1287 // 1288 // Inputs: 1289 // c_rarg0 - source array address 1290 // c_rarg1 - destination array address 1291 // c_rarg2 - element count, treated as ssize_t, can be zero 1292 // 1293 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1294 // let the hardware handle it. The two or four words within dwords 1295 // or qwords that span cache line boundaries will still be loaded 1296 // and stored atomically. 1297 // 1298 // Side Effects: 1299 // disjoint_short_copy_entry is set to the no-overlap entry point 1300 // used by generate_conjoint_short_copy(). 
1301 // 1302 address generate_disjoint_short_copy(bool aligned, 1303 address* entry, const char* name) { 1304 const bool not_oop = false; 1305 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1306 } 1307 1308 // Arguments: 1309 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1310 // ignored 1311 // name - stub name string 1312 // 1313 // Inputs: 1314 // c_rarg0 - source array address 1315 // c_rarg1 - destination array address 1316 // c_rarg2 - element count, treated as ssize_t, can be zero 1317 // 1318 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1319 // let the hardware handle it. The two or four words within dwords 1320 // or qwords that span cache line boundaries will still be loaded 1321 // and stored atomically. 1322 // 1323 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1324 address* entry, const char* name) { 1325 const bool not_oop = false; 1326 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1327 } 1328 1329 // Arguments: 1330 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1331 // ignored 1332 // name - stub name string 1333 // 1334 // Inputs: 1335 // c_rarg0 - source array address 1336 // c_rarg1 - destination array address 1337 // c_rarg2 - element count, treated as ssize_t, can be zero 1338 // 1339 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1340 // the hardware handle it. The two dwords within qwords that span 1341 // cache line boundaries will still be loaded and stored atomically. 1342 // 1343 // Side Effects: 1344 // disjoint_int_copy_entry is set to the no-overlap entry point 1345 // used by generate_conjoint_int_oop_copy(). 
1346 // 1347 address generate_disjoint_int_copy(bool aligned, address* entry, 1348 const char* name, bool dest_uninitialized = false) { 1349 const bool not_oop = false; 1350 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1351 } 1352 1353 // Arguments: 1354 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1355 // ignored 1356 // name - stub name string 1357 // 1358 // Inputs: 1359 // c_rarg0 - source array address 1360 // c_rarg1 - destination array address 1361 // c_rarg2 - element count, treated as ssize_t, can be zero 1362 // 1363 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1364 // the hardware handle it. The two dwords within qwords that span 1365 // cache line boundaries will still be loaded and stored atomically. 1366 // 1367 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1368 address* entry, const char* name, 1369 bool dest_uninitialized = false) { 1370 const bool not_oop = false; 1371 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1372 } 1373 1374 1375 // Arguments: 1376 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1377 // ignored 1378 // name - stub name string 1379 // 1380 // Inputs: 1381 // c_rarg0 - source array address 1382 // c_rarg1 - destination array address 1383 // c_rarg2 - element count, treated as size_t, can be zero 1384 // 1385 // Side Effects: 1386 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1387 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 
  //
  address generate_disjoint_long_copy(bool aligned, address* entry,
                                      const char* name, bool dest_uninitialized = false) {
    // dest_uninitialized is unused here: long copies emit no store barriers.
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address* entry,
                                      const char* name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address* entry,
                                     const char* name, bool dest_uninitialized) {
    const bool is_oop = true;
    // Element width depends on the compressed-oops mode of the VM.
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address* entry,
                                     const char* name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }

  // Helper for generating a dynamic type check.
  // Smashes t0, t1.
  // Branches to L_success when sub_klass is a subtype of super_klass;
  // falls through to the caller's code on failure.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    // Fast path first (cache/offset check), then the slow path scan.
    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  // Generate checkcasting array copy stub
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //   c_rarg3   - size_t ckoff (super_check_offset)
  //   c_rarg4   - oop ckval (super_klass)
  //
  // Output:
  //   x10 ==  0  -  success
  //   x10 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char* name, address* entry,
                                  bool dest_uninitialized = false) {
    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from = c_rarg0;  // source array address
    const Register to = c_rarg1;    // destination array address
    const Register count = c_rarg2; // elements count
    const Register ckoff = c_rarg3; // super_check_offset
    const Register ckval = c_rarg4; // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (x7, x9, x18 are save-on-entry)
    const Register count_save = x19; // orig elements count
    const Register start_to = x18;   // destination array start address
    const Register copied_oop = x7;  // actual oop copied
    const Register r9_klass = x9;    // oop._klass

    // Registers used as gc temps (x15, x16, x17 are save-on-call)
    const Register gct1 = x15, gct2 = x16, gct3 = x17;

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r9_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // Caller of this entry point must set up the argument registers.
    if (entry != nullptr) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array: Nothing to do
    __ beqz(count, L_done);

    __ push_reg(RegSet::of(x7, x9, x18, x19), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ lwu(start_to, Address(ckval, sco_offset));
      __ beq(ckoff, start_to, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }

    bool is_oop = true;
    int element_size = UseCompressedOops ? 4 : 8;

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);

    // save the original count
    __ mv(count_save, count);

    // Copy from low to high addresses
    __ mv(start_to, to); // Save destination array start address
    __ j(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for count to 0 do
    //     copied_oop = load_heap_oop(from++)
    //     ... generate_type_check ...
    //     store_heap_oop(to++, copied_oop)
    //   end

    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
                      Address(to, 0), copied_oop,
                      gct1, gct2, gct3);
    __ add(to, to, UseCompressedOops ? 4 : 8);
    __ sub(count, count, 1);
    __ beqz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
                     copied_oop, Address(from, 0),
                     gct1);
    __ add(from, from, UseCompressedOops ? 4 : 8);
    // null elements need no type check; store immediately.
    __ beqz(copied_oop, L_store_element);

    __ load_klass(r9_klass, copied_oop); // query the object klass
    generate_type_check(r9_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ sub(count, count_save, count); // K = partially copied oop count
    __ xori(count, count, -1);        // report (-1^K) to caller
    __ beqz(count, L_done_pop);

    __ BIND(L_do_card_marks);
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    // x10 holds 0 on full success, or -1^K after a partial transfer.
    __ mv(x10, count);
    __ leave();
    __ ret();

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                            Register src_pos, // source position (c_rarg1)
                            Register dst,     // destination array oop (c_rarg2)
                            Register dst_pos, // destination position (c_rarg3)
                            Register length,
                            Register temp,
                            Label& L_failed) {
  BLOCK_COMMENT("arraycopy_range_checks:");

  assert_different_registers(t0, temp);

  // if [src_pos + length > arrayOop(src)->length()] then FAIL
  // (32-bit add; unsigned compare also catches a wrapped/negative sum)
  __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
  __ addw(temp, length, src_pos);
  __ bgtu(temp, t0, L_failed);

  // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
  __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
  __ addw(temp, length, dst_pos);
  __ bgtu(temp, t0, L_failed);

  // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
  // They are known non-negative here, so zero extension is correct.
  __ zero_extend(src_pos, src_pos, 32);
  __ zero_extend(dst_pos, dst_pos, 32);

  BLOCK_COMMENT("arraycopy_range_checks done");
}

//
// Generate 'unsafe' array copy stub
// Though just as safe as the other stubs, it takes an unscaled
// size_t argument instead of an element count.
//
// Input:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - byte count, treated as ssize_t, can be zero
//
// Examines the alignment of the operands and dispatches
// to a long, int, short, or byte copy loop.
//
address generate_unsafe_copy(const char* name,
                             address byte_copy_entry,
                             address short_copy_entry,
                             address int_copy_entry,
                             address long_copy_entry) {
  assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
              int_copy_entry != nullptr && long_copy_entry != nullptr);
  Label L_long_aligned, L_int_aligned, L_short_aligned;
  const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();
  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // bump this on entry, not on exit:
  inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

  // OR the two addresses and the byte count: the low bits of the result
  // reflect the coarsest alignment shared by all three.
  __ orr(t0, s, d);
  __ orr(t0, t0, count);

  __ andi(t0, t0, BytesPerLong - 1);
  __ beqz(t0, L_long_aligned);
  __ andi(t0, t0, BytesPerInt - 1);
  __ beqz(t0, L_int_aligned);
  __ test_bit(t0, t0, 0);
  __ beqz(t0, L_short_aligned);
  // No common alignment at all: byte-wise copy.
  __ j(RuntimeAddress(byte_copy_entry));

  __ BIND(L_short_aligned);
  __ srli(count, count, LogBytesPerShort); // size => short_count
  __ j(RuntimeAddress(short_copy_entry));
  __ BIND(L_int_aligned);
  __ srli(count, count, LogBytesPerInt); // size => int_count
  __ j(RuntimeAddress(int_copy_entry));
  __ BIND(L_long_aligned);
  __ srli(count, count, LogBytesPerLong); // size => long_count
  __ j(RuntimeAddress(long_copy_entry));

  return start;
}

//
// Generate generic array copy stubs
//
// Input:
//   c_rarg0    -  src oop
//   c_rarg1    -  src_pos (32-bits)
//   c_rarg2    -  dst oop
//   c_rarg3    -  dst_pos (32-bits)
//   c_rarg4    -  element count (32-bits)
//
// Output:
//   x10 ==  0  -  success
//   x10 == -1^K - failure, where K is partial transfer count
//
address generate_generic_copy(const char* name,
                              address byte_copy_entry, address short_copy_entry,
                              address int_copy_entry, address oop_copy_entry,
                              address long_copy_entry, address checkcast_copy_entry) {
  assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
              int_copy_entry != nullptr && oop_copy_entry != nullptr &&
              long_copy_entry != nullptr && checkcast_copy_entry != nullptr);
  Label L_failed, L_failed_0, L_objArray;
  Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

  // Input registers
  const Register src = c_rarg0;     // source array oop
  const Register src_pos = c_rarg1; // source position
  const Register dst = c_rarg2;     // destination array oop
  const Register dst_pos = c_rarg3; // destination position
  const Register length = c_rarg4;

  // Registers used as temps
  const Register dst_klass = c_rarg5;

  __ align(CodeEntryAlignment);

  StubCodeMark mark(this, "StubRoutines", name);

  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // bump this on entry, not on exit:
  inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

  //-----------------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the following conditions are met:
  //
  // (1) src and dst must not be null.
  // (2) src_pos must not be negative.
  // (3) dst_pos must not be negative.
  // (4) length  must not be negative.
  // (5) src klass and dst klass should be the same and not null.
  // (6) src and dst should be arrays.
  // (7) src_pos + length must not exceed length of src.
  // (8) dst_pos + length must not exceed length of dst.
  //

  // if src is null then return -1
  __ beqz(src, L_failed);

  // if [src_pos < 0] then return -1
  __ sign_extend(t0, src_pos, 32);
  __ bltz(t0, L_failed);

  // if dst is null then return -1
  __ beqz(dst, L_failed);

  // if [dst_pos < 0] then return -1
  __ sign_extend(t0, dst_pos, 32);
  __ bltz(t0, L_failed);

  // registers used as temp
  const Register scratch_length = x28;    // elements count to copy
  const Register scratch_src_klass = x29; // array klass
  const Register lh = x30;                // layout helper

  // if [length < 0] then return -1
  __ sign_extend(scratch_length, length, 32); // length (elements count, 32-bits value)
  __ bltz(scratch_length, L_failed);

  __ load_klass(scratch_src_klass, src);
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert klasses not null {");
    Label L1, L2;
    __ bnez(scratch_src_klass, L2); // it is broken if klass is null
    __ bind(L1);
    __ stop("broken null klass");
    __ bind(L2);
    __ load_klass(t0, dst, t1);
    __ beqz(t0, L1); // this would be broken also
    BLOCK_COMMENT("} assert klasses not null done");
  }
#endif

  // Load layout helper (32-bits)
  //
  //  |array_tag|     | header_size | element_type |     |log2_element_size|
  // 32        30    24            16              8     2                 0
  //
  //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
  //

  const int lh_offset = in_bytes(Klass::layout_helper_offset());

  // Handle objArrays completely differently...
  const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
  __ lw(lh, Address(scratch_src_klass, lh_offset));
  __ mv(t0, objArray_lh);
  __ beq(lh, t0, L_objArray);

  // if [src->klass() != dst->klass()] then return -1
  __ load_klass(t1, dst);
  __ bne(t1, scratch_src_klass, L_failed);

  // if src->is_Array() isn't null then return -1
  // i.e. (lh >= 0)
  __ bgez(lh, L_failed);

  // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert primitive array {");
    Label L;
    __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
    __ bge(lh, t1, L);
    __ stop("must be a primitive array");
    __ bind(L);
    BLOCK_COMMENT("} assert primitive array done");
  }
#endif

  arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                         t1, L_failed);

  // TypeArrayKlass
  //
  // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
  // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
  //

  const Register t0_offset = t0;  // array offset
  const Register x30_elsize = lh; // element size

  // Get array_header_in_bytes()
  int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
  int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
  // Isolate the header-size bit field via a left-then-right shift pair.
  __ slli(t0_offset, lh, XLEN - lh_header_size_msb);          // left shift to remove 24 ~ 32;
  __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset

  __ add(src, src, t0_offset); // src array offset
  __ add(dst, dst, t0_offset); // dst array offset
  BLOCK_COMMENT("choose copy loop based on element size");

  // next registers should be set before the jump to corresponding stub
  const Register from = c_rarg0;  // source array address
  const Register to = c_rarg1;    // destination array address
  const Register count = c_rarg2; // elements count

  // 'from', 'to', 'count' registers should be set in such order
  // since they are the same as 'src', 'src_pos', 'dst'.

  assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");

  // The possible values of elsize are 0-3, i.e. exact_log2(element
  // size in bytes).  We do a simple bitwise binary search.
  __ BIND(L_copy_bytes);
  __ test_bit(t0, x30_elsize, 1);
  __ bnez(t0, L_copy_ints);
  __ test_bit(t0, x30_elsize, 0);
  __ bnez(t0, L_copy_shorts);
  __ add(from, src, src_pos); // src_addr
  __ add(to, dst, dst_pos);   // dst_addr
  __ sign_extend(count, scratch_length, 32); // length
  __ j(RuntimeAddress(byte_copy_entry));

  __ BIND(L_copy_shorts);
  __ shadd(from, src_pos, src, t0, 1); // src_addr
  __ shadd(to, dst_pos, dst, t0, 1);   // dst_addr
  __ sign_extend(count, scratch_length, 32); // length
  __ j(RuntimeAddress(short_copy_entry));

  __ BIND(L_copy_ints);
  __ test_bit(t0, x30_elsize, 0);
  __ bnez(t0, L_copy_longs);
  __ shadd(from, src_pos, src, t0, 2); // src_addr
  __ shadd(to, dst_pos, dst, t0, 2);   // dst_addr
  __ sign_extend(count, scratch_length, 32); // length
  __ j(RuntimeAddress(int_copy_entry));

  __ BIND(L_copy_longs);
#ifdef ASSERT
  {
    BLOCK_COMMENT("assert long copy {");
    Label L;
    __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize
    __ sign_extend(lh, lh, 32);
    __ mv(t0, LogBytesPerLong);
    __ beq(x30_elsize, t0, L);
    __ stop("must be long copy, but elsize is wrong");
    __ bind(L);
    BLOCK_COMMENT("} assert long copy done");
  }
#endif
  __ shadd(from, src_pos, src, t0, 3); // src_addr
  __ shadd(to, dst_pos, dst, t0, 3);   // dst_addr
  __ sign_extend(count, scratch_length, 32); // length
  __ j(RuntimeAddress(long_copy_entry));

  // ObjArrayKlass
  __ BIND(L_objArray);
  // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]

  Label L_plain_copy, L_checkcast_copy;
  // test array classes for subtyping
  __ load_klass(t2, dst);
  __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality

  // Identically typed arrays can be copied without element-wise checks.
  arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                         t1, L_failed);

  __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
  __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
  __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
  __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
  __ sign_extend(count, scratch_length, 32); // length
  __ BIND(L_plain_copy);
  __ j(RuntimeAddress(oop_copy_entry));

  __ BIND(L_checkcast_copy);
  // live at this point:  scratch_src_klass, scratch_length, t2 (dst_klass)
  {
    // Before looking at dst.length, make sure dst is also an objArray.
    __ lwu(t0, Address(t2, lh_offset));
    __ mv(t1, objArray_lh);
    __ bne(t0, t1, L_failed);

    // It is safe to examine both src.length and dst.length.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           t2, L_failed);

    __ load_klass(dst_klass, dst); // reload

    // Marshal the base address arguments now, freeing registers.
    __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ sign_extend(count, length, 32); // length (reloaded)
    const Register sco_temp = c_rarg3; // this register is free now
    assert_different_registers(from, to, count, sco_temp,
                               dst_klass, scratch_src_klass);

    // Generate the type check.
    const int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ lwu(sco_temp, Address(dst_klass, sco_offset));

    // Smashes t0, t1
    generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);

    // Fetch destination element klass from the ObjArrayKlass header.
    int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
    __ ld(dst_klass, Address(dst_klass, ek_offset));
    __ lwu(sco_temp, Address(dst_klass, sco_offset));

    // the checkcast_copy loop needs two extra arguments:
    assert(c_rarg3 == sco_temp, "#3 already in place");
    // Set up arguments for checkcast_copy_entry.
    __ mv(c_rarg4, dst_klass); // dst.klass.element_klass
    __ j(RuntimeAddress(checkcast_copy_entry));
  }

  __ BIND(L_failed);
  __ mv(x10, -1);
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret();

  return start;
}

//
// Generate stub for array fill. If "aligned" is true, the
// "to" address is assumed to be heapword aligned.
//
// Arguments for generated stub:
//   to:    c_rarg0
//   value: c_rarg1
//   count: c_rarg2 treated as signed
//
address generate_fill(BasicType t, bool aligned, const char* name) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  BLOCK_COMMENT("Entry:");

  const Register to = c_rarg0;    // destination array address
  const Register value = c_rarg1; // value
  const Register count = c_rarg2; // elements count

  const Register bz_base = x28;   // base for block_zero routine
  const Register cnt_words = x29; // temp register
  const Register tmp_reg = t1;

  __ enter();

  Label L_fill_elements, L_exit1;

  int shift = -1;
  switch (t) {
    case T_BYTE:
      shift = 0;

      // Zero extend value
      // 8 bit -> 16 bit
      __ andi(value, value, 0xff);
      __ mv(tmp_reg, value);
      __ slli(tmp_reg, tmp_reg, 8);
      __ orr(value, value, tmp_reg);

      // 16 bit -> 32 bit
      __ mv(tmp_reg, value);
      __ slli(tmp_reg, tmp_reg, 16);
      __ orr(value, value, tmp_reg);

      __ mv(tmp_reg, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2036 __ bltu(count, tmp_reg, L_fill_elements); 2037 break; 2038 case T_SHORT: 2039 shift = 1; 2040 // Zero extend value 2041 // 16 bit -> 32 bit 2042 __ andi(value, value, 0xffff); 2043 __ mv(tmp_reg, value); 2044 __ slli(tmp_reg, tmp_reg, 16); 2045 __ orr(value, value, tmp_reg); 2046 2047 // Short arrays (< 8 bytes) fill by element 2048 __ mv(tmp_reg, 8 >> shift); 2049 __ bltu(count, tmp_reg, L_fill_elements); 2050 break; 2051 case T_INT: 2052 shift = 2; 2053 2054 // Short arrays (< 8 bytes) fill by element 2055 __ mv(tmp_reg, 8 >> shift); 2056 __ bltu(count, tmp_reg, L_fill_elements); 2057 break; 2058 default: ShouldNotReachHere(); 2059 } 2060 2061 // Align source address at 8 bytes address boundary. 2062 Label L_skip_align1, L_skip_align2, L_skip_align4; 2063 if (!aligned) { 2064 switch (t) { 2065 case T_BYTE: 2066 // One byte misalignment happens only for byte arrays. 2067 __ test_bit(t0, to, 0); 2068 __ beqz(t0, L_skip_align1); 2069 __ sb(value, Address(to, 0)); 2070 __ addi(to, to, 1); 2071 __ addiw(count, count, -1); 2072 __ bind(L_skip_align1); 2073 // Fallthrough 2074 case T_SHORT: 2075 // Two bytes misalignment happens only for byte and short (char) arrays. 2076 __ test_bit(t0, to, 1); 2077 __ beqz(t0, L_skip_align2); 2078 __ sh(value, Address(to, 0)); 2079 __ addi(to, to, 2); 2080 __ addiw(count, count, -(2 >> shift)); 2081 __ bind(L_skip_align2); 2082 // Fallthrough 2083 case T_INT: 2084 // Align to 8 bytes, we know we are 4 byte aligned to start. 
2085 __ test_bit(t0, to, 2); 2086 __ beqz(t0, L_skip_align4); 2087 __ sw(value, Address(to, 0)); 2088 __ addi(to, to, 4); 2089 __ addiw(count, count, -(4 >> shift)); 2090 __ bind(L_skip_align4); 2091 break; 2092 default: ShouldNotReachHere(); 2093 } 2094 } 2095 2096 // 2097 // Fill large chunks 2098 // 2099 __ srliw(cnt_words, count, 3 - shift); // number of words 2100 2101 // 32 bit -> 64 bit 2102 __ andi(value, value, 0xffffffff); 2103 __ mv(tmp_reg, value); 2104 __ slli(tmp_reg, tmp_reg, 32); 2105 __ orr(value, value, tmp_reg); 2106 2107 __ slli(tmp_reg, cnt_words, 3 - shift); 2108 __ subw(count, count, tmp_reg); 2109 { 2110 __ fill_words(to, cnt_words, value); 2111 } 2112 2113 // Remaining count is less than 8 bytes. Fill it by a single store. 2114 // Note that the total length is no less than 8 bytes. 2115 if (t == T_BYTE || t == T_SHORT) { 2116 __ beqz(count, L_exit1); 2117 __ shadd(to, count, to, tmp_reg, shift); // points to the end 2118 __ sd(value, Address(to, -8)); // overwrite some elements 2119 __ bind(L_exit1); 2120 __ leave(); 2121 __ ret(); 2122 } 2123 2124 // Handle copies less than 8 bytes. 
2125 Label L_fill_2, L_fill_4, L_exit2; 2126 __ bind(L_fill_elements); 2127 switch (t) { 2128 case T_BYTE: 2129 __ test_bit(t0, count, 0); 2130 __ beqz(t0, L_fill_2); 2131 __ sb(value, Address(to, 0)); 2132 __ addi(to, to, 1); 2133 __ bind(L_fill_2); 2134 __ test_bit(t0, count, 1); 2135 __ beqz(t0, L_fill_4); 2136 __ sh(value, Address(to, 0)); 2137 __ addi(to, to, 2); 2138 __ bind(L_fill_4); 2139 __ test_bit(t0, count, 2); 2140 __ beqz(t0, L_exit2); 2141 __ sw(value, Address(to, 0)); 2142 break; 2143 case T_SHORT: 2144 __ test_bit(t0, count, 0); 2145 __ beqz(t0, L_fill_4); 2146 __ sh(value, Address(to, 0)); 2147 __ addi(to, to, 2); 2148 __ bind(L_fill_4); 2149 __ test_bit(t0, count, 1); 2150 __ beqz(t0, L_exit2); 2151 __ sw(value, Address(to, 0)); 2152 break; 2153 case T_INT: 2154 __ beqz(count, L_exit2); 2155 __ sw(value, Address(to, 0)); 2156 break; 2157 default: ShouldNotReachHere(); 2158 } 2159 __ bind(L_exit2); 2160 __ leave(); 2161 __ ret(); 2162 return start; 2163 } 2164 2165 void generate_arraycopy_stubs() { 2166 address entry = nullptr; 2167 address entry_jbyte_arraycopy = nullptr; 2168 address entry_jshort_arraycopy = nullptr; 2169 address entry_jint_arraycopy = nullptr; 2170 address entry_oop_arraycopy = nullptr; 2171 address entry_jlong_arraycopy = nullptr; 2172 address entry_checkcast_arraycopy = nullptr; 2173 2174 generate_copy_longs(copy_f, c_rarg0, c_rarg1, t1, copy_forwards); 2175 generate_copy_longs(copy_b, c_rarg0, c_rarg1, t1, copy_backwards); 2176 2177 StubRoutines::riscv::_zero_blocks = generate_zero_blocks(); 2178 2179 //*** jbyte 2180 // Always need aligned and unaligned versions 2181 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2182 "jbyte_disjoint_arraycopy"); 2183 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2184 &entry_jbyte_arraycopy, 2185 "jbyte_arraycopy"); 2186 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2187 
"arrayof_jbyte_disjoint_arraycopy"); 2188 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2189 "arrayof_jbyte_arraycopy"); 2190 2191 //*** jshort 2192 // Always need aligned and unaligned versions 2193 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2194 "jshort_disjoint_arraycopy"); 2195 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2196 &entry_jshort_arraycopy, 2197 "jshort_arraycopy"); 2198 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2199 "arrayof_jshort_disjoint_arraycopy"); 2200 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2201 "arrayof_jshort_arraycopy"); 2202 2203 //*** jint 2204 // Aligned versions 2205 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2206 "arrayof_jint_disjoint_arraycopy"); 2207 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2208 "arrayof_jint_arraycopy"); 2209 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2210 // entry_jint_arraycopy always points to the unaligned version 2211 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2212 "jint_disjoint_arraycopy"); 2213 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2214 &entry_jint_arraycopy, 2215 "jint_arraycopy"); 2216 2217 //*** jlong 2218 // It is always aligned 2219 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2220 "arrayof_jlong_disjoint_arraycopy"); 2221 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2222 "arrayof_jlong_arraycopy"); 2223 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2224 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2225 2226 //*** oops 2227 { 2228 // With compressed oops we need unaligned versions; notice that 2229 // we overwrite entry_oop_arraycopy. 2230 bool aligned = !UseCompressedOops; 2231 2232 StubRoutines::_arrayof_oop_disjoint_arraycopy 2233 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2234 /*dest_uninitialized*/false); 2235 StubRoutines::_arrayof_oop_arraycopy 2236 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2237 /*dest_uninitialized*/false); 2238 // Aligned versions without pre-barriers 2239 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2240 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2241 /*dest_uninitialized*/true); 2242 StubRoutines::_arrayof_oop_arraycopy_uninit 2243 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2244 /*dest_uninitialized*/true); 2245 } 2246 2247 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2248 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2249 StubRoutines::_oop_disjoint_arraycopy_uninit = 
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2250 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2251 2252 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2253 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2254 /*dest_uninitialized*/true); 2255 2256 2257 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2258 entry_jbyte_arraycopy, 2259 entry_jshort_arraycopy, 2260 entry_jint_arraycopy, 2261 entry_jlong_arraycopy); 2262 2263 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2264 entry_jbyte_arraycopy, 2265 entry_jshort_arraycopy, 2266 entry_jint_arraycopy, 2267 entry_oop_arraycopy, 2268 entry_jlong_arraycopy, 2269 entry_checkcast_arraycopy); 2270 2271 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2272 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2273 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2274 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2275 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2276 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2277 } 2278 2279 // code for comparing 16 bytes of strings with same encoding 2280 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 2281 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, tmp1 = x28, tmp2 = x29, tmp4 = x7, tmp5 = x31; 2282 __ ld(tmp5, Address(str1)); 2283 __ addi(str1, str1, 8); 2284 __ xorr(tmp4, tmp1, tmp2); 2285 __ ld(cnt1, Address(str2)); 2286 __ addi(str2, str2, 8); 2287 __ bnez(tmp4, DIFF1); 2288 __ ld(tmp1, Address(str1)); 2289 __ addi(str1, str1, 8); 2290 __ xorr(tmp4, tmp5, cnt1); 2291 __ ld(tmp2, Address(str2)); 2292 __ addi(str2, str2, 8); 2293 __ 
bnez(tmp4, DIFF2); 2294 } 2295 2296 // code for comparing 8 characters of strings with Latin1 and Utf16 encoding 2297 void compare_string_8_x_LU(Register tmpL, Register tmpU, Register strL, Register strU, Label& DIFF) { 2298 const Register tmp = x30, tmpLval = x12; 2299 __ ld(tmpLval, Address(strL)); 2300 __ addi(strL, strL, wordSize); 2301 __ ld(tmpU, Address(strU)); 2302 __ addi(strU, strU, wordSize); 2303 __ inflate_lo32(tmpL, tmpLval); 2304 __ xorr(tmp, tmpU, tmpL); 2305 __ bnez(tmp, DIFF); 2306 2307 __ ld(tmpU, Address(strU)); 2308 __ addi(strU, strU, wordSize); 2309 __ inflate_hi32(tmpL, tmpLval); 2310 __ xorr(tmp, tmpU, tmpL); 2311 __ bnez(tmp, DIFF); 2312 } 2313 2314 // x10 = result 2315 // x11 = str1 2316 // x12 = cnt1 2317 // x13 = str2 2318 // x14 = cnt2 2319 // x28 = tmp1 2320 // x29 = tmp2 2321 // x30 = tmp3 2322 address generate_compare_long_string_different_encoding(bool isLU) { 2323 __ align(CodeEntryAlignment); 2324 StubCodeMark mark(this, "StubRoutines", isLU ? "compare_long_string_different_encoding LU" : "compare_long_string_different_encoding UL"); 2325 address entry = __ pc(); 2326 Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE; 2327 const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14, 2328 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12; 2329 2330 // cnt2 == amount of characters left to compare 2331 // Check already loaded first 4 symbols 2332 __ inflate_lo32(tmp3, isLU ? tmp1 : tmp2); 2333 __ mv(isLU ? tmp1 : tmp2, tmp3); 2334 __ addi(str1, str1, isLU ? wordSize / 2 : wordSize); 2335 __ addi(str2, str2, isLU ? wordSize : wordSize / 2); 2336 __ sub(cnt2, cnt2, wordSize / 2); // Already loaded 4 symbols 2337 2338 __ xorr(tmp3, tmp1, tmp2); 2339 __ bnez(tmp3, CALCULATE_DIFFERENCE); 2340 2341 Register strU = isLU ? str2 : str1, 2342 strL = isLU ? str1 : str2, 2343 tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison 2344 tmpL = isLU ? 
tmp1 : tmp2; // where to keep L for comparison 2345 2346 // make sure main loop is 8 byte-aligned, we should load another 4 bytes from strL 2347 // cnt2 is >= 68 here, no need to check it for >= 0 2348 __ lwu(tmpL, Address(strL)); 2349 __ addi(strL, strL, wordSize / 2); 2350 __ ld(tmpU, Address(strU)); 2351 __ addi(strU, strU, wordSize); 2352 __ inflate_lo32(tmp3, tmpL); 2353 __ mv(tmpL, tmp3); 2354 __ xorr(tmp3, tmpU, tmpL); 2355 __ bnez(tmp3, CALCULATE_DIFFERENCE); 2356 __ addi(cnt2, cnt2, -wordSize / 2); 2357 2358 // we are now 8-bytes aligned on strL 2359 __ sub(cnt2, cnt2, wordSize * 2); 2360 __ bltz(cnt2, TAIL); 2361 __ bind(SMALL_LOOP); // smaller loop 2362 __ sub(cnt2, cnt2, wordSize * 2); 2363 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE); 2364 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE); 2365 __ bgez(cnt2, SMALL_LOOP); 2366 __ addi(t0, cnt2, wordSize * 2); 2367 __ beqz(t0, DONE); 2368 __ bind(TAIL); // 1..15 characters left 2369 // Aligned access. Load bytes in portions - 4, 2, 1. 
2370 2371 __ addi(t0, cnt2, wordSize); 2372 __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process 2373 __ bltz(t0, LOAD_LAST); 2374 // remaining characters are greater than or equals to 8, we can do one compare_string_8_x_LU 2375 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE); 2376 __ addi(cnt2, cnt2, -wordSize); 2377 __ beqz(cnt2, DONE); // no character left 2378 __ bind(LOAD_LAST); // cnt2 = 1..7 characters left 2379 2380 __ addi(cnt2, cnt2, -wordSize); // cnt2 is now an offset in strL which points to last 8 bytes 2381 __ slli(t0, cnt2, 1); // t0 is now an offset in strU which points to last 16 bytes 2382 __ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string 2383 __ add(strU, strU, t0); // Address of last 16 bytes in UTF-16 string 2384 __ load_int_misaligned(tmpL, Address(strL), t0, false); 2385 __ load_long_misaligned(tmpU, Address(strU), t0, 2); 2386 __ inflate_lo32(tmp3, tmpL); 2387 __ mv(tmpL, tmp3); 2388 __ xorr(tmp3, tmpU, tmpL); 2389 __ bnez(tmp3, CALCULATE_DIFFERENCE); 2390 2391 __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string 2392 __ addi(strU, strU, wordSize); // Address of last 8 bytes in UTF-16 string 2393 __ load_int_misaligned(tmpL, Address(strL), t0, false); 2394 __ load_long_misaligned(tmpU, Address(strU), t0, 2); 2395 __ inflate_lo32(tmp3, tmpL); 2396 __ mv(tmpL, tmp3); 2397 __ xorr(tmp3, tmpU, tmpL); 2398 __ bnez(tmp3, CALCULATE_DIFFERENCE); 2399 __ j(DONE); // no character left 2400 2401 // Find the first different characters in the longwords and 2402 // compute their difference. 
2403 __ bind(CALCULATE_DIFFERENCE); 2404 __ ctzc_bit(tmp4, tmp3); 2405 __ srl(tmp1, tmp1, tmp4); 2406 __ srl(tmp2, tmp2, tmp4); 2407 __ andi(tmp1, tmp1, 0xFFFF); 2408 __ andi(tmp2, tmp2, 0xFFFF); 2409 __ sub(result, tmp1, tmp2); 2410 __ bind(DONE); 2411 __ ret(); 2412 return entry; 2413 } 2414 2415 address generate_method_entry_barrier() { 2416 __ align(CodeEntryAlignment); 2417 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 2418 2419 Label deoptimize_label; 2420 2421 address start = __ pc(); 2422 2423 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 2424 2425 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 2426 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 2427 Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 2428 __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr())); 2429 __ lwu(t1, t1); 2430 __ sw(t1, thread_epoch_addr); 2431 __ membar(__ LoadLoad); 2432 } 2433 2434 __ set_last_Java_frame(sp, fp, ra); 2435 2436 __ enter(); 2437 __ add(t1, sp, wordSize); 2438 2439 __ sub(sp, sp, 4 * wordSize); 2440 2441 __ push_call_clobbered_registers(); 2442 2443 __ mv(c_rarg0, t1); 2444 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 2445 2446 __ reset_last_Java_frame(true); 2447 2448 __ mv(t0, x10); 2449 2450 __ pop_call_clobbered_registers(); 2451 2452 __ bnez(t0, deoptimize_label); 2453 2454 __ leave(); 2455 __ ret(); 2456 2457 __ BIND(deoptimize_label); 2458 2459 __ ld(t0, Address(sp, 0)); 2460 __ ld(fp, Address(sp, wordSize)); 2461 __ ld(ra, Address(sp, wordSize * 2)); 2462 __ ld(t1, Address(sp, wordSize * 3)); 2463 2464 __ mv(sp, t0); 2465 __ jr(t1); 2466 2467 return start; 2468 } 2469 2470 // x10 = result 2471 // x11 = str1 2472 // x12 = cnt1 2473 // x13 = str2 2474 // x14 = cnt2 2475 // x28 = tmp1 2476 // x29 = tmp2 2477 // x30 = tmp3 2478 // x31 
= tmp4 2479 address generate_compare_long_string_same_encoding(bool isLL) { 2480 __ align(CodeEntryAlignment); 2481 StubCodeMark mark(this, "StubRoutines", isLL ? 2482 "compare_long_string_same_encoding LL" : "compare_long_string_same_encoding UU"); 2483 address entry = __ pc(); 2484 Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL, 2485 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF; 2486 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14, 2487 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31; 2488 RegSet spilled_regs = RegSet::of(tmp4, tmp5); 2489 2490 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 2491 // update cnt2 counter with already loaded 8 bytes 2492 __ sub(cnt2, cnt2, wordSize / (isLL ? 1 : 2)); 2493 // update pointers, because of previous read 2494 __ add(str1, str1, wordSize); 2495 __ add(str2, str2, wordSize); 2496 // less than 16 bytes left? 2497 __ sub(cnt2, cnt2, isLL ? 16 : 8); 2498 __ push_reg(spilled_regs, sp); 2499 __ bltz(cnt2, TAIL); 2500 __ bind(SMALL_LOOP); 2501 compare_string_16_bytes_same(DIFF, DIFF2); 2502 __ sub(cnt2, cnt2, isLL ? 16 : 8); 2503 __ bgez(cnt2, SMALL_LOOP); 2504 __ bind(TAIL); 2505 __ addi(cnt2, cnt2, isLL ? 16 : 8); 2506 __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); 2507 __ sub(cnt2, cnt2, isLL ? 8 : 4); 2508 __ blez(cnt2, CHECK_LAST); 2509 __ xorr(tmp4, tmp1, tmp2); 2510 __ bnez(tmp4, DIFF); 2511 __ ld(tmp1, Address(str1)); 2512 __ addi(str1, str1, 8); 2513 __ ld(tmp2, Address(str2)); 2514 __ addi(str2, str2, 8); 2515 __ sub(cnt2, cnt2, isLL ? 8 : 4); 2516 __ bind(CHECK_LAST); 2517 if (!isLL) { 2518 __ add(cnt2, cnt2, cnt2); // now in bytes 2519 } 2520 __ xorr(tmp4, tmp1, tmp2); 2521 __ bnez(tmp4, DIFF); 2522 __ add(str1, str1, cnt2); 2523 __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2); 2524 __ add(str2, str2, cnt2); 2525 __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 
1 : 2); 2526 __ xorr(tmp4, tmp5, cnt1); 2527 __ beqz(tmp4, LENGTH_DIFF); 2528 // Find the first different characters in the longwords and 2529 // compute their difference. 2530 __ bind(DIFF2); 2531 __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb 2532 __ srl(tmp5, tmp5, tmp3); 2533 __ srl(cnt1, cnt1, tmp3); 2534 if (isLL) { 2535 __ andi(tmp5, tmp5, 0xFF); 2536 __ andi(cnt1, cnt1, 0xFF); 2537 } else { 2538 __ andi(tmp5, tmp5, 0xFFFF); 2539 __ andi(cnt1, cnt1, 0xFFFF); 2540 } 2541 __ sub(result, tmp5, cnt1); 2542 __ j(LENGTH_DIFF); 2543 __ bind(DIFF); 2544 __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb 2545 __ srl(tmp1, tmp1, tmp3); 2546 __ srl(tmp2, tmp2, tmp3); 2547 if (isLL) { 2548 __ andi(tmp1, tmp1, 0xFF); 2549 __ andi(tmp2, tmp2, 0xFF); 2550 } else { 2551 __ andi(tmp1, tmp1, 0xFFFF); 2552 __ andi(tmp2, tmp2, 0xFFFF); 2553 } 2554 __ sub(result, tmp1, tmp2); 2555 __ j(LENGTH_DIFF); 2556 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 2557 __ xorr(tmp4, tmp1, tmp2); 2558 __ bnez(tmp4, DIFF); 2559 __ bind(LENGTH_DIFF); 2560 __ pop_reg(spilled_regs, sp); 2561 __ ret(); 2562 return entry; 2563 } 2564 2565 void generate_compare_long_strings() { 2566 StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(true); 2567 StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(false); 2568 StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(true); 2569 StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(false); 2570 } 2571 2572 // x10 result 2573 // x11 src 2574 // x12 src count 2575 // x13 pattern 2576 // x14 pattern count 2577 address generate_string_indexof_linear(bool needle_isL, bool haystack_isL) 2578 { 2579 const char* stubName = needle_isL 2580 ? (haystack_isL ? 
"indexof_linear_ll" : "indexof_linear_ul") 2581 : "indexof_linear_uu"; 2582 __ align(CodeEntryAlignment); 2583 StubCodeMark mark(this, "StubRoutines", stubName); 2584 address entry = __ pc(); 2585 2586 int needle_chr_size = needle_isL ? 1 : 2; 2587 int haystack_chr_size = haystack_isL ? 1 : 2; 2588 int needle_chr_shift = needle_isL ? 0 : 1; 2589 int haystack_chr_shift = haystack_isL ? 0 : 1; 2590 bool isL = needle_isL && haystack_isL; 2591 // parameters 2592 Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14; 2593 // temporary registers 2594 Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25; 2595 // redefinitions 2596 Register ch1 = x28, ch2 = x29; 2597 RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29); 2598 2599 __ push_reg(spilled_regs, sp); 2600 2601 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 2602 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 2603 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 2604 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 2605 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 2606 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 2607 2608 __ ld(ch1, Address(needle)); 2609 __ ld(ch2, Address(haystack)); 2610 // src.length - pattern.length 2611 __ sub(haystack_len, haystack_len, needle_len); 2612 2613 // first is needle[0] 2614 __ andi(first, ch1, needle_isL ? 0xFF : 0xFFFF, first); 2615 uint64_t mask0101 = UCONST64(0x0101010101010101); 2616 uint64_t mask0001 = UCONST64(0x0001000100010001); 2617 __ mv(mask1, haystack_isL ? mask0101 : mask0001); 2618 __ mul(first, first, mask1); 2619 uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f); 2620 uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff); 2621 __ mv(mask2, haystack_isL ? 
mask7f7f : mask7fff); 2622 if (needle_isL != haystack_isL) { 2623 __ mv(tmp, ch1); 2624 } 2625 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size - 1); 2626 __ blez(haystack_len, L_SMALL); 2627 2628 if (needle_isL != haystack_isL) { 2629 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros); 2630 } 2631 // xorr, sub, orr, notr, andr 2632 // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i] 2633 // eg: 2634 // first: aa aa aa aa aa aa aa aa 2635 // ch2: aa aa li nx jd ka aa aa 2636 // match_mask: 80 80 00 00 00 00 80 80 2637 __ compute_match_mask(ch2, first, match_mask, mask1, mask2); 2638 2639 // search first char of needle, if success, goto L_HAS_ZERO; 2640 __ bnez(match_mask, L_HAS_ZERO); 2641 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size); 2642 __ add(result, result, wordSize / haystack_chr_size); 2643 __ add(haystack, haystack, wordSize); 2644 __ bltz(haystack_len, L_POST_LOOP); 2645 2646 __ bind(L_LOOP); 2647 __ ld(ch2, Address(haystack)); 2648 __ compute_match_mask(ch2, first, match_mask, mask1, mask2); 2649 __ bnez(match_mask, L_HAS_ZERO); 2650 2651 __ bind(L_LOOP_PROCEED); 2652 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size); 2653 __ add(haystack, haystack, wordSize); 2654 __ add(result, result, wordSize / haystack_chr_size); 2655 __ bgez(haystack_len, L_LOOP); 2656 2657 __ bind(L_POST_LOOP); 2658 __ mv(ch2, -wordSize / haystack_chr_size); 2659 __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check 2660 __ ld(ch2, Address(haystack)); 2661 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift); 2662 __ neg(haystack_len, haystack_len); 2663 __ xorr(ch2, first, ch2); 2664 __ sub(match_mask, ch2, mask1); 2665 __ orr(ch2, ch2, mask2); 2666 __ mv(trailing_zeros, -1); // all bits set 2667 __ j(L_SMALL_PROCEED); 2668 2669 __ align(OptoLoopAlignment); 2670 __ bind(L_SMALL); 2671 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift); 
2672 __ neg(haystack_len, haystack_len); 2673 if (needle_isL != haystack_isL) { 2674 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros); 2675 } 2676 __ xorr(ch2, first, ch2); 2677 __ sub(match_mask, ch2, mask1); 2678 __ orr(ch2, ch2, mask2); 2679 __ mv(trailing_zeros, -1); // all bits set 2680 2681 __ bind(L_SMALL_PROCEED); 2682 __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits. 2683 __ notr(ch2, ch2); 2684 __ andr(match_mask, match_mask, ch2); 2685 __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check 2686 __ beqz(match_mask, NOMATCH); 2687 2688 __ bind(L_SMALL_HAS_ZERO_LOOP); 2689 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, ch2, tmp); // count trailing zeros 2690 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2691 __ mv(ch2, wordSize / haystack_chr_size); 2692 __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2); 2693 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2694 __ mv(trailing_zeros, wordSize / haystack_chr_size); 2695 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2696 2697 __ bind(L_SMALL_CMP_LOOP); 2698 __ shadd(first, trailing_zeros, needle, first, needle_chr_shift); 2699 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift); 2700 needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first)); 2701 haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2)); 2702 __ add(trailing_zeros, trailing_zeros, 1); 2703 __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP); 2704 __ beq(first, ch2, L_SMALL_CMP_LOOP); 2705 2706 __ bind(L_SMALL_CMP_LOOP_NOMATCH); 2707 __ beqz(match_mask, NOMATCH); 2708 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2); 2709 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 
7 : 15); 2710 __ add(result, result, 1); 2711 __ add(haystack, haystack, haystack_chr_size); 2712 __ j(L_SMALL_HAS_ZERO_LOOP); 2713 2714 __ align(OptoLoopAlignment); 2715 __ bind(L_SMALL_CMP_LOOP_LAST_CMP); 2716 __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2717 __ j(DONE); 2718 2719 __ align(OptoLoopAlignment); 2720 __ bind(L_SMALL_CMP_LOOP_LAST_CMP2); 2721 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2722 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2723 __ j(DONE); 2724 2725 __ align(OptoLoopAlignment); 2726 __ bind(L_HAS_ZERO); 2727 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2); 2728 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2729 __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2); 2730 __ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits) 2731 __ sub(result, result, 1); // array index from 0, so result -= 1 2732 2733 __ bind(L_HAS_ZERO_LOOP); 2734 __ mv(needle_len, wordSize / haystack_chr_size); 2735 __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2); 2736 __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2); 2737 // load next 8 bytes from haystack, and increase result index 2738 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2739 __ add(result, result, 1); 2740 __ mv(trailing_zeros, wordSize / haystack_chr_size); 2741 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH); 2742 2743 // compare one char 2744 __ bind(L_CMP_LOOP); 2745 __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift); 2746 needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len)); 2747 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift); 2748 haystack_isL ? 
__ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2)); 2749 __ add(trailing_zeros, trailing_zeros, 1); // next char index 2750 __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2); 2751 __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP); 2752 __ beq(needle_len, ch2, L_CMP_LOOP); 2753 2754 __ bind(L_CMP_LOOP_NOMATCH); 2755 __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH); 2756 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, needle_len, ch2); // find next "first" char index 2757 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2758 __ add(haystack, haystack, haystack_chr_size); 2759 __ j(L_HAS_ZERO_LOOP); 2760 2761 __ align(OptoLoopAlignment); 2762 __ bind(L_CMP_LOOP_LAST_CMP); 2763 __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH); 2764 __ j(DONE); 2765 2766 __ align(OptoLoopAlignment); 2767 __ bind(L_CMP_LOOP_LAST_CMP2); 2768 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2769 __ add(result, result, 1); 2770 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH); 2771 __ j(DONE); 2772 2773 __ align(OptoLoopAlignment); 2774 __ bind(L_HAS_ZERO_LOOP_NOMATCH); 2775 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 2776 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 2777 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 2778 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 2779 // result by analyzed characters value, so, we can just reset lower bits 2780 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 2781 // 2) restore needle_len and haystack_len values from "compressed" haystack_len 2782 // 3) advance haystack value to represent next haystack octet. result & 7/3 is 2783 // index of last analyzed substring inside current octet. So, haystack in at 2784 // respective start address. 
// We need to advance it to next octet
__ andi(match_mask, result, wordSize / haystack_chr_size - 1);
__ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
__ andi(result, result, haystack_isL ? -8 : -4);
__ slli(tmp, match_mask, haystack_chr_shift);
__ sub(haystack, haystack, tmp);
__ sign_extend(haystack_len, haystack_len, 32);
__ j(L_LOOP_PROCEED);

__ align(OptoLoopAlignment);
__ bind(NOMATCH);
__ mv(result, -1);

__ bind(DONE);
__ pop_reg(spilled_regs, sp);
__ ret();
return entry;
}

// Generate and install the three linear string-indexof stub variants.
// NOTE(review): the two bool flags select the Latin1/UTF-16 encodings of
// needle and haystack (LL, UU, UL) — confirm flag order against the
// signature of generate_string_indexof_linear (not visible in this chunk).
void generate_string_indexof_stubs()
{
  StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
  StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
  StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
}

#ifdef COMPILER2
// Stub for the secondary-supers hashed table lookup (one stub per
// super-klass index). Sets up the fixed register assignment expected by
// MacroAssembler::lookup_secondary_supers_table and wraps the lookup in
// an enter()/leave() frame.
address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
  StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table");

  address start = __ pc();
  const Register
      r_super_klass  = x10,
      r_array_base   = x11,
      r_array_length = x12,
      r_array_index  = x13,
      r_sub_klass    = x14,
      result         = x15,
      r_bitmap       = x16;

  Label L_success;
  __ enter();
  __ lookup_secondary_supers_table(r_sub_klass, r_super_klass, result,
                                   r_array_base, r_array_length, r_array_index,
                                   r_bitmap, super_klass_index, /*stub_is_near*/true);
  __ leave();
  __ ret();

  return start;
}

// Slow path implementation for UseSecondarySupersTable.
address generate_lookup_secondary_supers_table_slow_path_stub() {
  StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path");

  address start = __ pc();
  const Register
      r_super_klass = x10, // argument
      r_array_base  = x11, // argument
      temp1         = x12, // tmp
      r_array_index = x13, // argument
      result        = x15, // argument
      r_bitmap      = x16; // argument

  __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
  __ ret();

  return start;
}

// Thin stub wrapper around MacroAssembler::mul_add (BigInteger.mulAdd
// intrinsic). Arguments arrive in x10..x14; x28 is scratch.
address generate_mulAdd()
{
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "mulAdd");

  address entry = __ pc();

  const Register out    = x10;
  const Register in     = x11;
  const Register offset = x12;
  const Register len    = x13;
  const Register k      = x14;
  const Register tmp    = x28;

  BLOCK_COMMENT("Entry:");
  __ enter();
  __ mul_add(out, in, offset, len, k, tmp);
  __ leave();
  __ ret();

  return entry;
}

/**
 * Arguments:
 *
 * Input:
 *   c_rarg0   - x address
 *   c_rarg1   - x length
 *   c_rarg2   - y address
 *   c_rarg3   - y length
 *   c_rarg4   - z address
 */
address generate_multiplyToLen()
{
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
  address entry = __ pc();

  const Register x    = x10;
  const Register xlen = x11;
  const Register y    = x12;
  const Register ylen = x13;
  const Register z    = x14;

  const Register tmp0 = x15;
  const Register tmp1 = x16;
  const Register tmp2 = x17;
  const Register tmp3 = x7;
  const Register tmp4 = x28;
  const Register tmp5 = x29;
  const Register tmp6 = x30;
  const Register tmp7 = x31;

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame
  __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret();

  return entry;
}

// BigInteger.squareToLen intrinsic: implemented as multiply_to_len with
// y aliased to x (and ylen to xlen) via the mv()s below.
address generate_squareToLen()
{
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "squareToLen");
  address entry = __ pc();

  const Register x    = x10;
  const Register xlen = x11;
  const Register z    = x12;
  const Register y    = x14; // == x
  const Register ylen = x15; // == xlen

  const Register tmp0 = x13; // zlen, unused
  const Register tmp1 = x16;
  const Register tmp2 = x17;
  const Register tmp3 = x7;
  const Register tmp4 = x28;
  const Register tmp5 = x29;
  const Register tmp6 = x30;
  const Register tmp7 = x31;

  BLOCK_COMMENT("Entry:");
  __ enter();
  __ mv(y, x);
  __ mv(ylen, xlen);
  __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  __ leave();
  __ ret();

  return entry;
}

// Arguments:
//
// Input:
//   c_rarg0   - newArr address
//   c_rarg1   - oldArr address
//   c_rarg2   - newIdx
//   c_rarg3   - shiftCount
//   c_rarg4   - numIter
//
address generate_bigIntegerLeftShift() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
  address entry = __ pc();

  Label loop, exit;

  Register newArr     = c_rarg0;
  Register oldArr     = c_rarg1;
  Register newIdx     = c_rarg2;
  Register shiftCount = c_rarg3;
  Register numIter    = c_rarg4;

  Register shiftRevCount = c_rarg5;
  Register oldArrNext    = t1;

  __ beqz(numIter, exit);
  __ shadd(newArr, newIdx, newArr, t0, 2);

  // shiftRevCount = 32 - shiftCount: right-shift amount for the bits
  // carried in from the next (lower) 32-bit word.
  __ mv(shiftRevCount, 32);
  __ sub(shiftRevCount, shiftRevCount, shiftCount);

  // RVV strip-mined loop: vsetvli puts the number of 32-bit elements
  // processed this iteration (vl) in t0; each word is built from
  // (oldArr[i] << shiftCount) | (oldArr[i+1] >> (32 - shiftCount)).
  __ bind(loop);
  __ addi(oldArrNext, oldArr, 4);
  __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
  __ vle32_v(v0, oldArr);
  __ vle32_v(v4, oldArrNext);
  __ vsll_vx(v0, v0, shiftCount);
  __ vsrl_vx(v4, v4, shiftRevCount);
  __ vor_vv(v0, v0, v4);
  __ vse32_v(v0, newArr);
  __ sub(numIter, numIter, t0);
  __ shadd(oldArr, t0, oldArr, t1, 2);
  __ shadd(newArr, t0, newArr, t1, 2);
  __ bnez(numIter, loop);

  __ bind(exit);
  __ ret();

  return entry;
}

// Arguments:
//
// Input:
//   c_rarg0   - newArr address
//   c_rarg1   - oldArr address
//   c_rarg2   - newIdx
//   c_rarg3   - shiftCount
//   c_rarg4   - numIter
//
address generate_bigIntegerRightShift() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
  address entry = __ pc();

  Label loop, exit;

  Register newArr     = c_rarg0;
  Register oldArr     = c_rarg1;
  Register newIdx     = c_rarg2;
  Register shiftCount = c_rarg3;
  Register numIter    = c_rarg4;
  Register idx        = numIter;

  Register shiftRevCount = c_rarg5;
  Register oldArrNext    = c_rarg6;
  Register newArrCur     = t0;
  Register oldArrCur     = t1;

  __ beqz(idx, exit);
  __ shadd(newArr, newIdx, newArr, t0, 2);

  // shiftRevCount = 32 - shiftCount: left-shift amount for bits carried
  // in from the preceding (higher-index) word.
  __ mv(shiftRevCount, 32);
  __ sub(shiftRevCount, shiftRevCount, shiftCount);

  // RVV strip-mined loop, walking idx down from numIter to 0; addresses
  // for this strip are recomputed from idx each iteration.
  __ bind(loop);
  __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
  __ sub(idx, idx, t0);
  __ shadd(oldArrNext, idx, oldArr, t1, 2);
  __ shadd(newArrCur, idx, newArr, t1, 2);
  __ addi(oldArrCur, oldArrNext, 4);
  __ vle32_v(v0, oldArrCur);
  __ vle32_v(v4, oldArrNext);
  __ vsrl_vx(v0, v0, shiftCount);
  __ vsll_vx(v4, v4, shiftRevCount);
  __ vor_vv(v0, v0, v4);
  __ vse32_v(v0, newArrCur);
  __ bnez(idx, loop);

  __ bind(exit);
  __ ret();

  return entry;
}
#endif

#ifdef COMPILER2
class
MontgomeryMultiplyGenerator : public MacroAssembler {

  // Base pointers for the a, b, n and m (result) arrays; the modular
  // inverse; the length; the current digits (Ra..Rn); the roving
  // pointers (Pa..Pm); the a*b and m*n product halves; the
  // triple-precision accumulator (tmp0..tmp2); and the loop indexes.
  Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
    Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;

  RegSet _toSave;
  bool _squaring;

public:
  MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
    : MacroAssembler(as->code()), _squaring(squaring) {

    // Register allocation

    RegSetIterator<Register> regs = RegSet::range(x10, x26).begin();
    Pa_base = *regs;       // Argument registers
    if (squaring) {
      Pb_base = Pa_base;   // squaring: b aliases a, freeing one register
    } else {
      Pb_base = *++regs;
    }
    Pn_base = *++regs;
    Rlen= *++regs;
    inv = *++regs;
    Pm_base = *++regs;

    // Working registers:
    Ra = *++regs;          // The current digit of a, b, n, and m.
    Rb = *++regs;
    Rm = *++regs;
    Rn = *++regs;

    Pa = *++regs;          // Pointers to the current/next digit of a, b, n, and m.
    Pb = *++regs;
    Pm = *++regs;
    Pn = *++regs;

    tmp0 = *++regs;        // Three registers which form a
    tmp1 = *++regs;        // triple-precision accumuator.
    tmp2 = *++regs;

    Ri = x6;               // Inner and outer loop indexes.
    Rj = x7;

    Rhi_ab = x28;          // Product registers: low and high parts
    Rlo_ab = x29;          // of a*b and m*n.
    Rhi_mn = x30;
    Rlo_mn = x31;

    // x18 and up are callee-saved.
    _toSave = RegSet::range(x18, *regs) + Pm_base;
  }

private:
  void save_regs() {
    push_reg(_toSave, sp);
  }

  void restore_regs() {
    pop_reg(_toSave, sp);
  }

  // Emit the member-function `block` twice per iteration, counting
  // `count` down by 2; an odd initial count enters at the second copy.
  template <typename T>
  void unroll_2(Register count, T block) {
    Label loop, end, odd;
    beqz(count, end);
    test_bit(t0, count, 0);
    bnez(t0, odd);
    align(16);
    bind(loop);
    (this->*block)();
    bind(odd);
    (this->*block)();
    addi(count, count, -2);
    bgtz(count, loop);
    bind(end);
  }

  // Same 2x-unrolled driver for member functions taking (d, s, tmp).
  template <typename T>
  void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
    Label loop, end, odd;
    beqz(count, end);
    test_bit(tmp, count, 0);
    bnez(tmp, odd);
    align(16);
    bind(loop);
    (this->*block)(d, s, tmp);
    bind(odd);
    (this->*block)(d, s, tmp);
    addi(count, count, -2);
    bgtz(count, loop);
    bind(end);
  }

  void pre1(RegisterOrConstant i) {
    block_comment("pre1");
    // Pa = Pa_base;
    // Pb = Pb_base + i;
    // Pm = Pm_base;
    // Pn = Pn_base + i;
    // Ra = *Pa;
    // Rb = *Pb;
    // Rm = *Pm;
    // Rn = *Pn;
    if (i.is_register()) {
      slli(t0, i.as_register(), LogBytesPerWord);
    } else {
      mv(t0, i.as_constant());
      slli(t0, t0, LogBytesPerWord);
    }

    mv(Pa, Pa_base);
    add(Pb, Pb_base, t0);
    mv(Pm, Pm_base);
    add(Pn, Pn_base, t0);

    ld(Ra, Address(Pa));
    ld(Rb, Address(Pb));
    ld(Rm, Address(Pm));
    ld(Rn, Address(Pn));

    // Zero the m*n result.
    mv(Rhi_mn, zr);
    mv(Rlo_mn, zr);
  }

  // The core multiply-accumulate step of a Montgomery
  // multiplication. The idea is to schedule operations as a
  // pipeline so that instructions with long latencies (loads and
  // multiplies) have time to complete before their results are
  // used. This most benefits in-order implementations of the
  // architecture but out-of-order ones also benefit.
  void step() {
    block_comment("step");
    // MACC(Ra, Rb, tmp0, tmp1, tmp2);
    // Ra = *++Pa;
    // Rb = *--Pb;
    mulhu(Rhi_ab, Ra, Rb);
    mul(Rlo_ab, Ra, Rb);
    addi(Pa, Pa, wordSize);
    ld(Ra, Address(Pa));
    addi(Pb, Pb, -wordSize);
    ld(Rb, Address(Pb));
    acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
                                           // previous iteration.
    // MACC(Rm, Rn, tmp0, tmp1, tmp2);
    // Rm = *++Pm;
    // Rn = *--Pn;
    mulhu(Rhi_mn, Rm, Rn);
    mul(Rlo_mn, Rm, Rn);
    addi(Pm, Pm, wordSize);
    ld(Rm, Address(Pm));
    addi(Pn, Pn, -wordSize);
    ld(Rn, Address(Pn));
    acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
  }

  void post1() {
    block_comment("post1");

    // MACC(Ra, Rb, tmp0, tmp1, tmp2);
    // Ra = *++Pa;
    // Rb = *--Pb;
    mulhu(Rhi_ab, Ra, Rb);
    mul(Rlo_ab, Ra, Rb);
    acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n
    acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);

    // *Pm = Rm = tmp0 * inv;
    mul(Rm, tmp0, inv);
    sd(Rm, Address(Pm));

    // MACC(Rm, Rn, tmp0, tmp1, tmp2);
    // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
    mulhu(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
    // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
    {
      mul(Rlo_mn, Rm, Rn);
      add(Rlo_mn, tmp0, Rlo_mn);
      Label ok;
      beqz(Rlo_mn, ok);
      stop("broken Montgomery multiply");
      bind(ok);
    }
#endif
    // We have very carefully set things up so that
    // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
    // the lower half of Rm * Rn because we know the result already:
    // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff
    // tmp0 != 0. So, rather than do a mul and an cad we just set
    // the carry flag iff tmp0 is nonzero.
    //
    // mul(Rlo_mn, Rm, Rn);
    // cad(zr, tmp0, Rlo_mn);
    addi(t0, tmp0, -1);
    sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
    cadc(tmp0, tmp1, Rhi_mn, t0);
    adc(tmp1, tmp2, zr, t0);
    mv(tmp2, zr);
  }

  void pre2(Register i, Register len) {
    block_comment("pre2");
    // Pa = Pa_base + i-len;
    // Pb = Pb_base + len;
    // Pm = Pm_base + i-len;
    // Pn = Pn_base + len;

    sub(Rj, i, len);
    // Rj == i-len

    // Ra as temp register
    slli(Ra, Rj, LogBytesPerWord);
    add(Pa, Pa_base, Ra);
    add(Pm, Pm_base, Ra);
    slli(Ra, len, LogBytesPerWord);
    add(Pb, Pb_base, Ra);
    add(Pn, Pn_base, Ra);

    // Ra = *++Pa;
    // Rb = *--Pb;
    // Rm = *++Pm;
    // Rn = *--Pn;
    add(Pa, Pa, wordSize);
    ld(Ra, Address(Pa));
    add(Pb, Pb, -wordSize);
    ld(Rb, Address(Pb));
    add(Pm, Pm, wordSize);
    ld(Rm, Address(Pm));
    add(Pn, Pn, -wordSize);
    ld(Rn, Address(Pn));

    mv(Rhi_mn, zr);
    mv(Rlo_mn, zr);
  }

  void post2(Register i, Register len) {
    block_comment("post2");
    sub(Rj, i, len);

    cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part

    // As soon as we know the least significant digit of our result,
    // store it.
    // Pm_base[i-len] = tmp0;
    // Rj as temp register
    slli(Rj, Rj, LogBytesPerWord);
    add(Rj, Pm_base, Rj);
    sd(tmp0, Address(Rj));

    // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
    cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
    adc(tmp1, tmp2, zr, t0);
    mv(tmp2, zr);
  }

  // A carry in tmp0 after Montgomery multiplication means that we
  // should subtract multiples of n from our result in m. We'll
  // keep doing that until there is no carry.
  void normalize(Register len) {
    block_comment("normalize");
    // while (tmp0)
    //   tmp0 = sub(Pm_base, Pn_base, tmp0, len);
    Label loop, post, again;
    Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
    beqz(tmp0, post); {
      bind(again); {
        mv(i, zr);
        mv(cnt, len);
        slli(Rn, i, LogBytesPerWord);
        add(Rm, Pm_base, Rn);
        ld(Rm, Address(Rm));
        add(Rn, Pn_base, Rn);
        ld(Rn, Address(Rn));
        mv(t0, 1); // set carry flag, i.e. no borrow
        align(16);
        bind(loop); {
          // m[i] = m[i] + ~n[i] + carry  ==  m[i] - n[i] - borrow
          notr(Rn, Rn);
          add(Rm, Rm, t0);
          add(Rm, Rm, Rn);
          sltu(t0, Rm, Rn);
          slli(Rn, i, LogBytesPerWord); // Rn as temp register
          add(Rn, Pm_base, Rn);
          sd(Rm, Address(Rn));
          add(i, i, 1);
          slli(Rn, i, LogBytesPerWord);
          add(Rm, Pm_base, Rn);
          ld(Rm, Address(Rm));
          add(Rn, Pn_base, Rn);
          ld(Rn, Address(Rn));
          sub(cnt, cnt, 1);
        } bnez(cnt, loop);
        addi(tmp0, tmp0, -1);
        add(tmp0, tmp0, t0);
      } bnez(tmp0, again);
    } bind(post);
  }

  // Move memory at s to d, reversing words.
  //    Increments d to end of copied memory
  //    Destroys tmp1, tmp2
  //    Preserves len
  //    Leaves s pointing to the address which was in d at start
  void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
    assert(tmp1->encoding() < x28->encoding(), "register corruption");
    assert(tmp2->encoding() < x28->encoding(), "register corruption");

    shadd(s, len, s, tmp1, LogBytesPerWord);
    mv(tmp1, len);
    unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
    slli(tmp1, len, LogBytesPerWord);
    sub(s, d, tmp1);
  }
  // [63...0] -> [31...0][63...32]
  void reverse1(Register d, Register s, Register tmp) {
    addi(s, s, -wordSize);
    ld(tmp, Address(s));
    ror_imm(tmp, tmp, 32, t0);
    sd(tmp, Address(d));
    addi(d, d, wordSize);
  }

  void step_squaring() {
    // An extra ACC
    step();
    acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
  }

  void last_squaring(Register i) {
    Label dont;
    // if ((i & 1) == 0) {
    test_bit(t0, i, 0);
    bnez(t0, dont); {
      // MACC(Ra, Rb, tmp0, tmp1, tmp2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      mulhu(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
    } bind(dont);
  }

  void extra_step_squaring() {
    acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n

    // MACC(Rm, Rn, tmp0, tmp1, tmp2);
    // Rm = *++Pm;
    // Rn = *--Pn;
    mulhu(Rhi_mn, Rm, Rn);
    mul(Rlo_mn, Rm, Rn);
    addi(Pm, Pm, wordSize);
    ld(Rm, Address(Pm));
    addi(Pn, Pn, -wordSize);
    ld(Rn, Address(Pn));
  }

  void post1_squaring() {
    acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n

    // *Pm = Rm = tmp0 * inv;
    mul(Rm, tmp0, inv);
    sd(Rm, Address(Pm));

    // MACC(Rm, Rn, tmp0, tmp1, tmp2);
    // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
    mulhu(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
    // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
    {
      mul(Rlo_mn, Rm, Rn);
      add(Rlo_mn, tmp0, Rlo_mn);
      Label ok;
      beqz(Rlo_mn, ok); {
        stop("broken Montgomery multiply");
      } bind(ok);
    }
#endif
    // We have very carefully set things up so that
    // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
    // the lower half of Rm * Rn because we know the result already:
    // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff
    // tmp0 != 0. So, rather than do a mul and a cad we just set
    // the carry flag iff tmp0 is nonzero.
    //
    // mul(Rlo_mn, Rm, Rn);
    // cad(zr, tmp, Rlo_mn);
    addi(t0, tmp0, -1);
    sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
    cadc(tmp0, tmp1, Rhi_mn, t0);
    adc(tmp1, tmp2, zr, t0);
    mv(tmp2, zr);
  }

  // use t0 as carry
  void acc(Register Rhi, Register Rlo,
           Register tmp0, Register tmp1, Register tmp2) {
    cad(tmp0, tmp0, Rlo, t0);
    cadc(tmp1, tmp1, Rhi, t0);
    adc(tmp2, tmp2, zr, t0);
  }

public:
  /**
   * Fast Montgomery multiplication. The derivation of the
   * algorithm is in A Cryptographic Library for the Motorola
   * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
   *
   * Arguments:
   *
   * Inputs for multiplication:
   *   c_rarg0   - int array elements a
   *   c_rarg1   - int array elements b
   *   c_rarg2   - int array elements n (the modulus)
   *   c_rarg3   - int length
   *   c_rarg4   - int inv
   *   c_rarg5   - int array elements m (the result)
   *
   * Inputs for squaring:
   *   c_rarg0   - int array elements a
   *   c_rarg1   - int array elements n (the modulus)
   *   c_rarg2   - int length
   *   c_rarg3   - int inv
   *   c_rarg4   - int array elements m (the result)
   *
   */
  address generate_multiply() {
    Label argh, nothing;
    bind(argh);
    stop("MontgomeryMultiply total_allocation must be <= 8192");

    align(CodeEntryAlignment);
    address entry = pc();

    beqz(Rlen, nothing);

    enter();

    // Make room.
    mv(Ra, 512);
    bgt(Rlen, Ra, argh);
    slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
    sub(Ra, sp, Ra);
    andi(sp, Ra, -2 * wordSize);

    srliw(Rlen, Rlen, 1); // length in longwords = len/2

    {
      // Copy input args, reversing as we go. We use Ra as a
      // temporary variable.
      reverse(Ra, Pa_base, Rlen, Ri, Rj);
      if (!_squaring)
        reverse(Ra, Pb_base, Rlen, Ri, Rj);
      reverse(Ra, Pn_base, Rlen, Ri, Rj);
    }

    // Push all call-saved registers and also Pm_base which we'll need
    // at the end.
    save_regs();

#ifndef PRODUCT
    // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
    {
      ld(Rn, Address(Pn_base));
      mul(Rlo_mn, Rn, inv);
      mv(t0, -1);
      Label ok;
      beq(Rlo_mn, t0, ok);
      stop("broken inverse in Montgomery multiply");
      bind(ok);
    }
#endif

    mv(Pm_base, Ra);

    mv(tmp0, zr);
    mv(tmp1, zr);
    mv(tmp2, zr);

    block_comment("for (int i = 0; i < len; i++) {");
    mv(Ri, zr); {
      Label loop, end;
      bge(Ri, Rlen, end);

      bind(loop);
      pre1(Ri);

      block_comment("  for (j = i; j; j--) {"); {
        mv(Rj, Ri);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
      } block_comment("  } // j");

      post1();
      addw(Ri, Ri, 1);
      blt(Ri, Rlen, loop);
      bind(end);
      block_comment("} // i");
    }

    block_comment("for (int i = len; i < 2*len; i++) {");
    mv(Ri, Rlen); {
      Label loop, end;
      slli(t0, Rlen, 1);
      bge(Ri, t0, end);

      bind(loop);
      pre2(Ri, Rlen);

      block_comment("  for (j = len*2-i-1; j; j--) {"); {
        slliw(Rj, Rlen, 1);
        subw(Rj, Rj, Ri);
        subw(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
      } block_comment("  } // j");

      post2(Ri, Rlen);
      addw(Ri, Ri, 1);
      slli(t0, Rlen, 1);
      blt(Ri, t0, loop);
      bind(end);
    }
    block_comment("} // i");

    normalize(Rlen);

    mv(Ra, Pm_base); // Save Pm_base in Ra
    restore_regs();  // Restore caller's Pm_base

    // Copy our result into caller's Pm_base
    reverse(Pm_base, Ra, Rlen, Ri, Rj);

    leave();
    bind(nothing);
    ret();

    return entry;
  }

  /**
   *
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int array elements a
   *   c_rarg1   - int array elements n (the modulus)
   *   c_rarg2   - int length
   *   c_rarg3   - int inv
   *   c_rarg4   - int array elements m (the result)
   *
   */
  address generate_square() {
    Label argh;
    bind(argh);
    stop("MontgomeryMultiply total_allocation must be <= 8192");

    align(CodeEntryAlignment);
    address entry = pc();

    enter();

    // Make room.
    mv(Ra, 512);
    bgt(Rlen, Ra, argh);
    slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
    sub(Ra, sp, Ra);
    andi(sp, Ra, -2 * wordSize);

    srliw(Rlen, Rlen, 1); // length in longwords = len/2

    {
      // Copy input args, reversing as we go. We use Ra as a
      // temporary variable.
      reverse(Ra, Pa_base, Rlen, Ri, Rj);
      reverse(Ra, Pn_base, Rlen, Ri, Rj);
    }

    // Push all call-saved registers and also Pm_base which we'll need
    // at the end.
    save_regs();

    mv(Pm_base, Ra);

    mv(tmp0, zr);
    mv(tmp1, zr);
    mv(tmp2, zr);

    block_comment("for (int i = 0; i < len; i++) {");
    mv(Ri, zr); {
      Label loop, end;
      bind(loop);
      bge(Ri, Rlen, end);

      pre1(Ri);

      block_comment("for (j = (i+1)/2; j; j--) {"); {
        addi(Rj, Ri, 1);
        srliw(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
      } block_comment("  } // j");

      last_squaring(Ri);

      block_comment("  for (j = i/2; j; j--) {"); {
        srliw(Rj, Ri, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
      } block_comment("  } // j");

      post1_squaring();
      addi(Ri, Ri, 1);
      blt(Ri, Rlen, loop);

      bind(end);
      block_comment("} // i");
    }

    block_comment("for (int i = len; i < 2*len; i++) {");
    mv(Ri, Rlen); {
      Label loop, end;
      bind(loop);
      slli(t0, Rlen, 1);
      bge(Ri, t0, end);

      pre2(Ri, Rlen);

      block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
        slli(Rj, Rlen, 1);
        sub(Rj, Rj, Ri);
        sub(Rj, Rj, 1);
        srliw(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
      } block_comment("  } // j");

      last_squaring(Ri);

      block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
        slli(Rj, Rlen, 1);
        sub(Rj, Rj, Ri);
        srliw(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
      } block_comment("  } // j");

      post2(Ri, Rlen);
      addi(Ri, Ri, 1);
      slli(t0, Rlen, 1);
      blt(Ri, t0, loop);

      bind(end);
      block_comment("} // i");
    }

    normalize(Rlen);

    mv(Ra, Pm_base); // Save Pm_base in Ra
    restore_regs();  // Restore caller's Pm_base

    // Copy our result into caller's Pm_base
    reverse(Pm_base, Ra, Rlen, Ri, Rj);

    leave();
    ret();

    return entry;
  }
};

#endif // COMPILER2

// Shared body of the continuation-thaw stubs; `kind` selects the
// thaw-top / return-barrier / return-barrier-exception variants.
address generate_cont_thaw(Continuation::thaw_kind kind) {
  bool return_barrier = Continuation::is_thaw_return_barrier(kind);
  bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);

  address start = __ pc();

  if (return_barrier) {
    __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
  }

#ifndef PRODUCT
  {
    Label OK;
    __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
    __ beq(sp, t0, OK);
    __ stop("incorrect sp");
    __ bind(OK);
  }
#endif

  if (return_barrier) {
    // preserve possible return value from a method returning to the return barrier
    __ sub(sp, sp, 2 * wordSize);
    __ fsd(f10, Address(sp, 0 * wordSize));
    __ sd(x10, Address(sp, 1 * wordSize));
  }

  __ mv(c_rarg1, (return_barrier ?
1 : 0));
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
  __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames

  if (return_barrier) {
    // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
    __ ld(x10, Address(sp, 1 * wordSize));
    __ fld(f10, Address(sp, 0 * wordSize));
    __ add(sp, sp, 2 * wordSize);
  }

#ifndef PRODUCT
  {
    Label OK;
    __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
    __ beq(sp, t0, OK);
    __ stop("incorrect sp");
    __ bind(OK);
  }
#endif

  Label thaw_success;
  // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
  __ bnez(t1, thaw_success);
  __ la(t0, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
  __ jr(t0);
  __ bind(thaw_success);

  // make room for the thawed frames
  __ sub(t0, sp, t1);
  __ andi(sp, t0, -16); // align

  if (return_barrier) {
    // save original return value -- again
    __ sub(sp, sp, 2 * wordSize);
    __ fsd(f10, Address(sp, 0 * wordSize));
    __ sd(x10, Address(sp, 1 * wordSize));
  }

  // If we want, we can templatize thaw by kind, and have three different entries
  __ mv(c_rarg1, kind);

  __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
  __ mv(t1, x10); // x10 is the sp of the yielding frame

  if (return_barrier) {
    // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
    __ ld(x10, Address(sp, 1 * wordSize));
    __ fld(f10, Address(sp, 0 * wordSize));
    __ add(sp, sp, 2 * wordSize);
  } else {
    __ mv(x10, zr); // return 0 (success) from doYield
  }

  // we're now on the yield frame (which is in an address above us b/c sp has been pushed down)
  __ mv(fp, t1);
  __ sub(sp, t1, 2 * wordSize); // now pointing to fp spill

  if (return_barrier_exception) {
    __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
    __ verify_oop(x10);
    __ mv(x9, x10); // save return value contaning the exception oop in callee-saved x9

    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);

    // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc

    __ mv(x11, x10); // the exception handler
    __ mv(x10, x9);  // restore return value contaning the exception oop
    __ verify_oop(x10);

    __ leave();
    __ mv(x13, ra);
    __ jr(x11); // the exception handler
  } else {
    // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
    __ leave();
    __ ret();
  }

  return start;
}

// Entry used when thawing the top of a frozen continuation.
address generate_cont_thaw() {
  if (!Continuations::enabled()) return nullptr;

  StubCodeMark mark(this, "StubRoutines", "Cont thaw");
  address start = __ pc();
  generate_cont_thaw(Continuation::thaw_top);
  return start;
}

address generate_cont_returnBarrier() {
  if (!Continuations::enabled()) return nullptr;

  // TODO: will probably need multiple return barriers depending on return type
  StubCodeMark mark(this, "StubRoutines", "cont return barrier");
  address start = __ pc();

  generate_cont_thaw(Continuation::thaw_return_barrier);

  return start;
}

address generate_cont_returnBarrier_exception() {
  if (!Continuations::enabled()) return nullptr;

  StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
  address start = __ pc();

  generate_cont_thaw(Continuation::thaw_return_barrier_exception);

  return start;
}

#if COMPILER2_OR_JVMCI

#undef __
#define __ this->

class Sha2Generator : public
MacroAssembler {
  StubCodeGenerator* _cgen;
 public:
  Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
  // SHA-256 entry point: four 32-bit message words per element group (SEW = e32).
  address generate_sha256_implCompress(bool multi_block) {
    return generate_sha2_implCompress(Assembler::e32, multi_block);
  }
  // SHA-512 entry point: four 64-bit message words per element group (SEW = e64).
  address generate_sha512_implCompress(bool multi_block) {
    return generate_sha2_implCompress(Assembler::e64, multi_block);
  }
 private:

  // Unit-stride vector load dispatched on element width (vle32.v / vle64.v).
  void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
    if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
    else                            __ vle64_v(vr, sr);
  }

  // Unit-stride vector store dispatched on element width (vse32.v / vse64.v).
  void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
    if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
    else                            __ vse64_v(vr, sr);
  }

  // Overview of the logic in each "quad round".
  //
  // The code below repeats 16/20 times the logic implementing four rounds
  // of the SHA-256/512 core loop as documented by NIST. 16/20 "quad rounds"
  // to implement the 64/80 single rounds.
  //
  //  // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
  //  // Output:
  //  //   vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
  //  vl1reXX.v vTmp1, ofs
  //
  //  // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
  //  addi ofs, ofs, 16/32
  //
  //  // Add constants to message schedule words:
  //  //  Input
  //  //    vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
  //  //    vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
  //  //  Output
  //  //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
  //  vadd.vv vTmp0, vTmp1, vW0
  //
  //  //  2 rounds of working variables updates.
  //  //     vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
  //  //  Input:
  //  //    vState1 = {c[t],d[t],g[t],h[t]}   " = vState1[t] "
  //  //    vState0 = {a[t],b[t],e[t],f[t]}
  //  //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
  //  //  Output:
  //  //    vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}  " = vState0[t+2] "
  //  //        = {h[t+4],g[t+4],d[t+4],c[t+4]}  " = vState1[t+4] "
  //  vsha2cl.vv vState1, vState0, vTmp0
  //
  //  //  2 rounds of working variables updates.
  //  //     vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
  //  //  Input
  //  //   vState0 = {a[t],b[t],e[t],f[t]}       " = vState0[t] "
  //  //       = {h[t+2],g[t+2],d[t+2],c[t+2]}   " = vState1[t+2] "
  //  //   vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}   " = vState0[t+2] "
  //  //   vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
  //  //  Output:
  //  //   vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]}   " = vState0[t+4] "
  //  vsha2ch.vv vState0, vState1, vTmp0
  //
  //  // Combine 2QW into 1QW
  //  //
  //  // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
  //  //     vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
  //  // and it can only take 3 vectors as inputs. Hence we need to combine
  //  // vW1[0] and vW2[1..3] in a single vector.
  //  //
  //  // vmerge Vt4, Vt1, Vt2, V0
  //  // Input
  //  //  V0 = mask // first word from vW2, 1..3 words from vW1
  //  //  vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
  //  //  vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
  //  // Output
  //  //  Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5}
  //  vmerge.vvm vTmp0, vW2, vW1, v0
  //
  //  // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
  //  // Input
  //  //  vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]}     W[ 3: 0]
  //  //  vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]}     W[15:12]
  //  //  vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]}   W[11: 9,4]
  //  // Output (next four message schedule words)
  //  //  vW0 = {W[t+19],  W[t+18],  W[t+17],  W[t+16]}  W[19:16]
  //  vsha2ms.vv vW0, vTmp0, vW3
  //
  // BEFORE
  //  vW0 - vW3 hold the message schedule words (initially the block words)
  //    vW0 = W[ 3: 0]   "oldest"
  //    vW1 = W[ 7: 4]
  //    vW2 = W[11: 8]
  //    vW3 = W[15:12]   "newest"
  //
  //  vState0 - vState1 hold the working state variables
  //    vState0 = {a[t],b[t],e[t],f[t]}   // initially {H5,H4,H1,H0}
  //    vState1 = {c[t],d[t],g[t],h[t]}   // initially {H7,H6,H3,H2}
  //
  // AFTER
  //  vW0 - vW3 hold the message schedule words (initially the block words)
  //    vW1 = W[ 7: 4]   "oldest"
  //    vW2 = W[11: 8]
  //    vW3 = W[15:12]
  //    vW0 = W[19:16]   "newest"
  //
  //  vState0 and vState1 hold the working state variables
  //    vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
  //    vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
  //
  // The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
  // hence the uses of those vectors rotate in each round, and we get back to the
  // initial configuration every 4 quad-rounds. We could avoid those changes at
  // the cost of moving those vectors at the end of each quad-rounds.
  // Emits one "quad round" (four SHA-2 rounds): load round constants, add them to
  // the current message-schedule group, update the working state (vsha2cl+vsha2ch),
  // and optionally generate the next four schedule words (vmerge+vsha2ms).
  //   rot1..rot4  - current rotation of the schedule groups; rot1 is the "oldest"
  //   scalarconst - pointer into the round-constant table, advanced when step_const
  //   gen_words   - false in the last 4 quad rounds (no more schedule words needed)
  //   step_const  - false in the very last quad round (no more constants needed)
  void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
                       Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
                       bool gen_words = true, bool step_const = true) {
    __ vleXX_v(vset_sew, vtemp, scalarconst);
    if (step_const) {
      // Constant stride: 4 elements of 4/8 bytes each.
      __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
    }
    __ vadd_vv(vtemp2, vtemp, rot1);
    __ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
    __ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
    if (gen_words) {
      __ vmerge_vvm(vtemp2, rot3, rot2);
      __ vsha2ms_vv(rot1, vtemp2, rot4);
    }
  }

  // Canonical stub name for the (element width, multi_block) combination.
  const char* stub_name(Assembler::SEW vset_sew, bool multi_block) {
    if (vset_sew == Assembler::e32 && !multi_block) return "sha256_implCompress";
    if (vset_sew == Assembler::e32 && multi_block)  return "sha256_implCompressMB";
    if (vset_sew == Assembler::e64 && !multi_block) return "sha512_implCompress";
    if (vset_sew == Assembler::e64 && multi_block)  return "sha512_implCompressMB";
    ShouldNotReachHere();
    return "bad name lookup";
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha2_implCompress(Assembler::SEW vset_sew, bool multi_block) {
    // SHA-256 round constants (FIPS 180-4, section 4.2.2).
    alignas(64) static const uint32_t round_consts_256[64] = {
      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };
    // SHA-512 round constants (FIPS 180-4, section 4.2.3).
    alignas(64) static const uint64_t round_consts_512[80] = {
      0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
      0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
      0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
      0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
      0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
      0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
      0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
      0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
      0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
      0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
      0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
      0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
      0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
      0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
      0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
      0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
      0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
      0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
      0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
      0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
      0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
      0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
      0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
      0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
      0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
      0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
      0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
    };
    // Stride for a 4-element group: 4*4 bytes (SHA-256) or 4*8 bytes (SHA-512).
    const int const_add = vset_sew == Assembler::e32 ? 16 : 32;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(_cgen, "StubRoutines", stub_name(vset_sew, multi_block));
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;
    Register consts  =  t2; // caller saved
    Register state_c = x28; // caller saved
    VectorRegister vindex = v2;
    VectorRegister vW0 = v4;
    VectorRegister vW1 = v6;
    VectorRegister vW2 = v8;
    VectorRegister vW3 = v10;
    VectorRegister vState0 = v12;
    VectorRegister vState1 = v14;
    VectorRegister vHash0  = v16;
    VectorRegister vHash1  = v18;
    VectorRegister vTmp0   = v20;
    VectorRegister vTmp1   = v22;

    Label multi_block_loop;

    __ enter();

    address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
    la(consts, ExternalAddress(constant_table));

    // Register use in this function:
    //
    // VECTORS
    //  vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/64 bits), hold the message
    //             schedule words (Wt). They start with the message block
    //             content (W0 to W15), then further words in the message
    //             schedule generated via vsha2ms from previous Wt.
    //   Initially:
    //     vW0 = W[  3:0] = { W3,  W2,  W1,  W0}
    //     vW1 = W[  7:4] = { W7,  W6,  W5,  W4}
    //     vW2 = W[ 11:8] = {W11, W10,  W9,  W8}
    //     vW3 = W[15:12] = {W15, W14, W13, W12}
    //
    //  vState0 - vState1 hold the working state variables (a, b, ..., h)
    //    vState0 = {f[t],e[t],b[t],a[t]}
    //    vState1 = {h[t],g[t],d[t],c[t]}
    //   Initially:
    //    vState0 = {H5i-1, H4i-1, H1i-1 , H0i-1}
    //    vState1 = {H7i-1, H6i-1, H3i-1 , H2i-1}
    //
    //  v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
    //
    //  vTmp0 = temporary, Wt+Kt
    //  vTmp1 = temporary, Kt
    //
    //  vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
    //
    // During most of the function the vector state is configured so that each
    // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).

    // vsha2ch/vsha2cl uses EGW of 4*SEW.
    //   SHA256 SEW = e32, EGW = 128-bits
    //   SHA512 SEW = e64, EGW = 256-bits
    //
    //   VLEN is required to be at least 128.
    //   For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
    //
    // m1: LMUL=1/2
    // ta: tail agnostic (don't care about those lanes)
    // ma: mask agnostic (don't care about those lanes)
    // x0 is not written, we know the number of vector elements.

    if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
      __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
    } else {
      __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
    }

    // Byte offsets (per element) used by the indexed load/store below to gather
    // the state words in {f,e,b,a}/{h,g,d,c} order.
    int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
    __ li(t0, indexes);
    __ vmv_v_x(vindex, t0);

    // Step-over a,b, so we are pointing to c.
    // const_add is equal to 4x state variable, div by 2 is thus 2, a,b
    __ addi(state_c, state, const_add/2);

    // Use index-load to get {f,e,b,a},{h,g,d,c}
    __ vluxei8_v(vState0, state, vindex);
    __ vluxei8_v(vState1, state_c, vindex);

    __ bind(multi_block_loop);

    // Capture the initial H values in vHash0 and vHash1 to allow for computing
    // the resulting H', since H' = H+{a',b',c',...,h'}.
    __ vmv_v_v(vHash0, vState0);
    __ vmv_v_v(vHash1, vState1);

    // Load the 512/1024-bits of the message block in vW0-vW3 and perform
    // an endian swap on each 4/8 bytes element.
    //
    // If Zvkb is not implemented one can use vrgather
    // with an index sequence to byte-swap.
    //  sequence = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12]
    //   <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
    //  this sequence. 'vid' gives us the N.
    __ vleXX_v(vset_sew, vW0, buf);
    __ vrev8_v(vW0, vW0);
    __ addi(buf, buf, const_add);
    __ vleXX_v(vset_sew, vW1, buf);
    __ vrev8_v(vW1, vW1);
    __ addi(buf, buf, const_add);
    __ vleXX_v(vset_sew, vW2, buf);
    __ vrev8_v(vW2, vW2);
    __ addi(buf, buf, const_add);
    __ vleXX_v(vset_sew, vW3, buf);
    __ vrev8_v(vW3, vW3);
    __ addi(buf, buf, const_add);

    // Set v0 up for the vmerge that replaces the first word (idx==0)
    __ vid_v(v0);
    __ vmseq_vi(v0, v0, 0x0);  // v0.mask[i] = (i == 0 ? 1 : 0)

    VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
    int rot_pos = 0;
    // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2)
    const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
    for (int i = 0; i < qr_end; i++) {
      sha2_quad_round(vset_sew,
                      rotation_regs[(rot_pos + 0) & 0x3],
                      rotation_regs[(rot_pos + 1) & 0x3],
                      rotation_regs[(rot_pos + 2) & 0x3],
                      rotation_regs[(rot_pos + 3) & 0x3],
                      consts,
                      vTmp1, vTmp0, vState0, vState1);
      ++rot_pos;
    }
    // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2)
    // Note that we stop generating new message schedule words (Wt, vW0-vW3)
    // as we already generated all the words we end up consuming (i.e., W[63:60] / W[79:76]).
    const int qr_c_end = qr_end + 4;
    for (int i = qr_end; i < qr_c_end; i++) {
      sha2_quad_round(vset_sew,
                      rotation_regs[(rot_pos + 0) & 0x3],
                      rotation_regs[(rot_pos + 1) & 0x3],
                      rotation_regs[(rot_pos + 2) & 0x3],
                      rotation_regs[(rot_pos + 3) & 0x3],
                      consts,
                      vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
      ++rot_pos;
    }

    //--------------------------------------------------------------------------------
    // Compute the updated hash value H'
    //   H' = H + {h',g',...,b',a'}
    //      = {h,g,...,b,a} + {h',g',...,b',a'}
    //      = {h+h',g+g',...,b+b',a+a'}

    // H' = H+{a',b',c',...,h'}
    __ vadd_vv(vState0, vHash0, vState0);
    __ vadd_vv(vState1, vHash1, vState1);

    if (multi_block) {
      // Rewind the constant pointer to the start of the table: the constants were
      // stepped 15/19 times by const_add bytes (the last quad round does not step).
      int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
      __ addi(consts, consts, -total_adds);
      __ add(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
      __ ble(ofs, limit, multi_block_loop);
      __ mv(c_rarg0, ofs); // return ofs
    }

    // Store H[0..7] = {a,b,c,d,e,f,g,h} from
    //  vState0 = {f,e,b,a}
    //  vState1 = {h,g,d,c}
    __ vsuxei8_v(vState0, state, vindex);
    __ vsuxei8_v(vState1, state_c, vindex);

    __ leave();
    __ ret();

    return start;
  }
};

#undef __
#define __ _masm->

// Set of L registers that correspond to a contiguous memory area.
// Each 64-bit register typically corresponds to 2 32-bit integers.
template <uint L>
class RegCache {
private:
  MacroAssembler *_masm;
  Register _regs[L];

public:
  RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) {
    assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L);
    auto it = rs.begin();
    for (auto &r: _regs) {
      r = *it;
      ++it;
    }
  }

  // generate load for the i'th register
  void gen_load(uint i, Register base) {
    assert(i < L, "invalid i: %u", i);
    __ ld(_regs[i], Address(base, 8 * i));
  }

  // add i'th 32-bit integer to dest
  void add_u32(const Register dest, uint i, const Register rtmp = t0) {
    assert(i < 2 * L, "invalid i: %u", i);

    if (is_even(i)) {
      // Use the bottom 32 bits. No need to mask off the top 32 bits
      // as addw will do the right thing.
      __ addw(dest, dest, _regs[i / 2]);
    } else {
      // Use the top 32 bits by right-shifting them.
      __ srli(rtmp, _regs[i / 2], 32);
      __ addw(dest, dest, rtmp);
    }
  }
};

// The 16 4-byte words of an MD5 block, cached in 8 64-bit registers.
typedef RegCache<8> BufRegCache;

// a += value + x + ac;
// a = Integer.rotateLeft(a, s) + b;
void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache,
                             Register a, Register b, Register c, Register d,
                             int k, int s, int t,
                             Register value) {
  // a += ac
  __ addw(a, a, t, t1);

  // a += x;
  reg_cache.add_u32(a, k);
  // a += value;
  __ addw(a, a, value);

  // a = Integer.rotateLeft(a, s) + b;
  __ rolw_imm(a, a, s);
  __ addw(a, a, b);
}

// a += ((b & c) | ((~b) & d)) + x + ac;
// a = Integer.rotateLeft(a, s) + b;
void md5_FF(BufRegCache& reg_cache,
            Register a, Register b, Register c, Register d,
            int k, int s, int t,
            Register rtmp1, Register rtmp2) {
  // rtmp1 = b & c
  __ andr(rtmp1, b, c);

  // rtmp2 = (~b) & d
  __ andn(rtmp2, d, b);

  // rtmp1 = (b & c) | ((~b) & d)
  __ orr(rtmp1, rtmp1, rtmp2);

  m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
}

// a += ((b & d) | (c & (~d))) + x + ac;
// a = Integer.rotateLeft(a, s) + b;
void md5_GG(BufRegCache& reg_cache,
            Register a, Register b, Register c, Register d,
            int k, int s, int t,
            Register rtmp1, Register rtmp2) {
  // rtmp1 = b & d
  __ andr(rtmp1, b, d);

  // rtmp2 = c & (~d)
  __ andn(rtmp2, c, d);

  // rtmp1 = (b & d) | (c & (~d))
  __ orr(rtmp1, rtmp1, rtmp2);

  m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
}

// a += ((b ^ c) ^ d) + x + ac;
// a = Integer.rotateLeft(a, s) + b;
void md5_HH(BufRegCache& reg_cache,
            Register a, Register b, Register c, Register d,
            int k, int s, int t,
            Register rtmp1, Register rtmp2) {
  // rtmp1 = (b ^ c) ^ d
  __ xorr(rtmp2, b, c);
  __ xorr(rtmp1, rtmp2, d);

  m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
}

// a += (c ^ (b | (~d))) + x + ac;
// a = Integer.rotateLeft(a, s) + b;
void md5_II(BufRegCache& reg_cache,
            Register a, Register b, Register c, Register d,
            int k, int s, int t,
            Register rtmp1, Register rtmp2) {
  // rtmp1 = c ^ (b | (~d))
  __ orn(rtmp2, b, d);
  __ xorr(rtmp1, c, rtmp2);

  m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
}

// Arguments:
//
// Inputs:
//   c_rarg0   - byte[]  source+offset
//   c_rarg1   - int[]   SHA.state
//   c_rarg2   - int     offset  (multi_block == True)
//   c_rarg3   - int     limit   (multi_block == True)
//
// Registers:
//    x0   zero  (zero)
//    x1    ra   (return address)
//    x2    sp   (stack pointer)
//    x3    gp   (global pointer)
//    x4    tp   (thread pointer)
//    x5    t0   (tmp register)
//    x6    t1   (tmp register)
//    x7    t2   state0
//    x8  f0/s0  (frame pointer)
//    x9    s1
//   x10    a0   rtmp1 / c_rarg0
//   x11    a1   rtmp2 / c_rarg1
//   x12    a2   a     / c_rarg2
//   x13    a3   b     / c_rarg3
//   x14    a4   c
//   x15    a5   d
//   x16    a6   buf
//   x17    a7   state
//   x18    s2   ofs     [saved-reg]  (multi_block == True)
//   x19    s3   limit   [saved-reg]  (multi_block == True)
//   x20    s4   state1  [saved-reg]
//   x21    s5   state2  [saved-reg]
//   x22    s6   state3  [saved-reg]
//   x23    s7
//   x24    s8   buf0    [saved-reg]
//   x25    s9   buf1    [saved-reg]
//   x26   s10   buf2    [saved-reg]
//   x27   s11   buf3    [saved-reg]
//   x28    t3   buf4
//   x29    t4   buf5
//   x30    t5   buf6
//   x31    t6   buf7
address generate_md5_implCompress(bool multi_block, const char *name) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  // rotation constants (RFC 1321, per-round left-rotate amounts)
  const int S11 = 7;
  const int S12 = 12;
  const int S13 = 17;
  const int S14 = 22;
  const int S21 = 5;
  const int S22 = 9;
  const int S23 = 14;
  const int S24 = 20;
  const int S31 = 4;
  const int S32 = 11;
  const int S33 = 16;
  const int S34 = 23;
  const int S41 = 6;
  const int S42 = 10;
  const int S43 = 15;
  const int S44 = 21;

  const int64_t mask32 = 0xffffffff;

  Register buf_arg   = c_rarg0; // a0
  Register state_arg = c_rarg1; // a1
  Register ofs_arg   = c_rarg2; // a2
  Register limit_arg = c_rarg3; // a3

  // we'll copy the args to these registers to free up a0-a3
  // to use for other values manipulated by instructions
  // that can be compressed
  Register buf   = x16; // a6
  Register state = x17; // a7
  Register ofs   = x18; // s2
  Register limit = x19; // s3

  // using x12->15 to allow compressed instructions
  Register a = x12; // a2
  Register b = x13; // a3
  Register c = x14; // a4
  Register d = x15; // a5

  Register state0 = x7;  // t2
  Register state1 = x20; // s4
  Register state2 = x21; // s5
  Register state3 = x22; // s6

  // using x10->x11 to allow compressed instructions
  Register rtmp1 = x10; // a0
  Register rtmp2 = x11; // a1

  RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11
  RegSet reg_cache_regs;
  reg_cache_regs += reg_cache_saved_regs;
  reg_cache_regs += RegSet::of(x28, x29, x30, x31); // t3, t4, t5, t6
  BufRegCache reg_cache(_masm, reg_cache_regs);

  RegSet saved_regs;
  if (multi_block) {
    saved_regs += RegSet::of(ofs, limit);
  }
  saved_regs += RegSet::of(state1, state2, state3);
  saved_regs += reg_cache_saved_regs;

  __ push_reg(saved_regs, sp);

  __ mv(buf, buf_arg);
  __ mv(state, state_arg);
  if (multi_block) {
    __ mv(ofs, ofs_arg);
    __ mv(limit, limit_arg);
  }

  // to minimize the number of memory operations:
  // read the 4 state 4-byte values in pairs, with a single ld,
  // and split them into 2 registers.
  //
  // And, as the core algorithm of md5 works on 32-bits words, so
  // in the following code, it does not care about the content of
  // higher 32-bits in state[x]. Based on this observation,
  // we can apply further optimization, which is to just ignore the
  // higher 32-bits in state0/state2, rather than set the higher
  // 32-bits of state0/state2 to zero explicitly with extra instructions.
  __ ld(state0, Address(state));
  __ srli(state1, state0, 32);
  __ ld(state2, Address(state, 8));
  __ srli(state3, state2, 32);

  Label md5_loop;
  __ BIND(md5_loop);

  __ mv(a, state0);
  __ mv(b, state1);
  __ mv(c, state2);
  __ mv(d, state3);

  // Round 1
  reg_cache.gen_load(0, buf);
  md5_FF(reg_cache, a, b, c, d,  0, S11, 0xd76aa478, rtmp1, rtmp2);
  md5_FF(reg_cache, d, a, b, c,  1, S12, 0xe8c7b756, rtmp1, rtmp2);
  reg_cache.gen_load(1, buf);
  md5_FF(reg_cache, c, d, a, b,  2, S13, 0x242070db, rtmp1, rtmp2);
  md5_FF(reg_cache, b, c, d, a,  3, S14, 0xc1bdceee, rtmp1, rtmp2);
  reg_cache.gen_load(2, buf);
  md5_FF(reg_cache, a, b, c, d,  4, S11, 0xf57c0faf, rtmp1, rtmp2);
  md5_FF(reg_cache, d, a, b, c,  5, S12, 0x4787c62a, rtmp1, rtmp2);
  reg_cache.gen_load(3, buf);
  md5_FF(reg_cache, c, d, a, b,  6, S13, 0xa8304613, rtmp1, rtmp2);
  md5_FF(reg_cache, b, c, d, a,  7, S14, 0xfd469501, rtmp1, rtmp2);
  reg_cache.gen_load(4, buf);
  md5_FF(reg_cache, a, b, c, d,  8, S11, 0x698098d8, rtmp1, rtmp2);
  md5_FF(reg_cache, d, a, b, c,  9, S12, 0x8b44f7af, rtmp1, rtmp2);
  reg_cache.gen_load(5, buf);
  md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2);
  md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2);
  reg_cache.gen_load(6, buf);
  md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2);
  md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2);
  reg_cache.gen_load(7, buf);
  md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2);
  md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2);

  // Round 2
  md5_GG(reg_cache, a, b, c, d,  1, S21, 0xf61e2562, rtmp1, rtmp2);
  md5_GG(reg_cache, d, a, b, c,  6, S22, 0xc040b340, rtmp1, rtmp2);
  md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2);
  md5_GG(reg_cache, b, c, d, a,  0, S24, 0xe9b6c7aa, rtmp1, rtmp2);
  md5_GG(reg_cache, a, b, c, d,  5, S21, 0xd62f105d, rtmp1, rtmp2);
  md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2);
  md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2);
  md5_GG(reg_cache, b, c, d, a,  4, S24, 0xe7d3fbc8, rtmp1, rtmp2);
  md5_GG(reg_cache, a, b, c, d,  9, S21, 0x21e1cde6, rtmp1, rtmp2);
  md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2);
  md5_GG(reg_cache, c, d, a, b,  3, S23, 0xf4d50d87, rtmp1, rtmp2);
  md5_GG(reg_cache, b, c, d, a,  8, S24, 0x455a14ed, rtmp1, rtmp2);
  md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2);
  md5_GG(reg_cache, d, a, b, c,  2, S22, 0xfcefa3f8, rtmp1, rtmp2);
  md5_GG(reg_cache, c, d, a, b,  7, S23, 0x676f02d9, rtmp1, rtmp2);
  md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2);

  // Round 3
  md5_HH(reg_cache, a, b, c, d,  5, S31, 0xfffa3942, rtmp1, rtmp2);
  md5_HH(reg_cache, d, a, b, c,  8, S32, 0x8771f681, rtmp1, rtmp2);
  md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2);
  md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2);
  md5_HH(reg_cache, a, b, c, d,  1, S31, 0xa4beea44, rtmp1, rtmp2);
  md5_HH(reg_cache, d, a, b, c,  4, S32, 0x4bdecfa9, rtmp1, rtmp2);
  md5_HH(reg_cache, c, d, a, b,  7, S33, 0xf6bb4b60, rtmp1, rtmp2);
  md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2);
  md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2);
  md5_HH(reg_cache, d, a, b, c,  0, S32, 0xeaa127fa, rtmp1, rtmp2);
  md5_HH(reg_cache, c, d, a, b,  3, S33, 0xd4ef3085, rtmp1, rtmp2);
  md5_HH(reg_cache, b, c, d, a,  6, S34, 0x04881d05, rtmp1, rtmp2);
  md5_HH(reg_cache, a, b, c, d,  9, S31, 0xd9d4d039, rtmp1, rtmp2);
  md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
  md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
  md5_HH(reg_cache, b, c, d, a,  2, S34, 0xc4ac5665, rtmp1, rtmp2);

  // Round 4
  md5_II(reg_cache, a, b, c, d,  0, S41, 0xf4292244, rtmp1, rtmp2);
  md5_II(reg_cache, d, a, b, c,  7, S42, 0x432aff97, rtmp1, rtmp2);
  md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
  md5_II(reg_cache, b, c, d, a,  5, S44, 0xfc93a039, rtmp1, rtmp2);
  md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
  md5_II(reg_cache, d, a, b, c,  3, S42, 0x8f0ccc92, rtmp1, rtmp2);
  md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
  md5_II(reg_cache, b, c, d, a,  1, S44, 0x85845dd1, rtmp1, rtmp2);
  md5_II(reg_cache, a, b, c, d,  8, S41, 0x6fa87e4f, rtmp1, rtmp2);
  md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
  md5_II(reg_cache, c, d, a, b,  6, S43, 0xa3014314, rtmp1, rtmp2);
  md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
  md5_II(reg_cache, a, b, c, d,  4, S41, 0xf7537e82, rtmp1, rtmp2);
  md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
  md5_II(reg_cache, c, d, a, b,  2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
  md5_II(reg_cache, b, c, d, a,  9, S44, 0xeb86d391, rtmp1, rtmp2);

  __ addw(state0, state0, a);
  __ addw(state1, state1, b);
  __ addw(state2, state2, c);
  __ addw(state3, state3, d);

  if (multi_block) {
    __ addi(buf, buf, 64);
    __ addi(ofs, ofs, 64);
    // if (ofs <= limit) goto md5_loop
    __ bge(limit, ofs, md5_loop);
    __ mv(c_rarg0, ofs); // return ofs
  }

  // to minimize the number of memory operations:
  // write back the 4 state 4-byte values in pairs, with a single sd
  __ mv(t0, mask32);
  __ andr(state0, state0, t0);
  __ slli(state1, state1, 32);
  __ orr(state0, state0, state1);
  __ sd(state0, Address(state));
  __ andr(state2, state2, t0);
  __ slli(state3, state3, 32);
  __ orr(state2, state2, state3);
  __ sd(state2, Address(state, 8));

  __ pop_reg(saved_regs, sp);
  __ ret();

  return (address) start;
}

/**
 * Perform the quarter round calculations on values contained within four vector registers.
 *
 * @param aVec   the SIMD register containing only the "a" values
 * @param bVec   the SIMD register containing only the "b" values
 * @param cVec   the SIMD register containing only the "c" values
 * @param dVec   the SIMD register containing only the "d" values
 * @param tmp_vr temporary vector register holds intermediate values.
 */
void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
                            VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
  // a += b, d ^= a, d <<<= 16
  __ vadd_vv(aVec, aVec, bVec);
  __ vxor_vv(dVec, dVec, aVec);
  __ vrole32_vi(dVec, 16, tmp_vr);

  // c += d, b ^= c, b <<<= 12
  __ vadd_vv(cVec, cVec, dVec);
  __ vxor_vv(bVec, bVec, cVec);
  __ vrole32_vi(bVec, 12, tmp_vr);

  // a += b, d ^= a, d <<<= 8
  __ vadd_vv(aVec, aVec, bVec);
  __ vxor_vv(dVec, dVec, aVec);
  __ vrole32_vi(dVec, 8, tmp_vr);

  // c += d, b ^= c, b <<<= 7
  __ vadd_vv(cVec, cVec, dVec);
  __ vxor_vv(bVec, bVec, cVec);
  __ vrole32_vi(bVec, 7, tmp_vr);
}

/**
 * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
 *
 *  Input arguments:
 *  c_rarg0   - state, the starting state
 *  c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function
 *
Implementation Note: 4670 * Parallelization is achieved by loading individual state elements into vectors for N blocks. 4671 * N depends on single vector register length. 4672 */ 4673 address generate_chacha20Block() { 4674 Label L_Rounds; 4675 4676 __ align(CodeEntryAlignment); 4677 StubCodeMark mark(this, "StubRoutines", "chacha20Block"); 4678 address start = __ pc(); 4679 __ enter(); 4680 4681 const int states_len = 16; 4682 const int step = 4; 4683 const Register state = c_rarg0; 4684 const Register key_stream = c_rarg1; 4685 const Register tmp_addr = t0; 4686 const Register length = t1; 4687 4688 // Organize vector registers in an array that facilitates 4689 // putting repetitive opcodes into loop structures below. 4690 const VectorRegister work_vrs[16] = { 4691 v0, v1, v2, v3, v4, v5, v6, v7, 4692 v8, v9, v10, v11, v12, v13, v14, v15 4693 }; 4694 const VectorRegister tmp_vr = v16; 4695 const VectorRegister counter_vr = v17; 4696 4697 { 4698 // Put 16 here, as com.sun.crypto.providerChaCha20Cipher.KS_MAX_LEN is 1024 4699 // in java level. 4700 __ vsetivli(length, 16, Assembler::e32, Assembler::m1); 4701 } 4702 4703 // Load from source state. 4704 // Every element in source state is duplicated to all elements in the corresponding vector. 4705 __ mv(tmp_addr, state); 4706 for (int i = 0; i < states_len; i += 1) { 4707 __ vlse32_v(work_vrs[i], tmp_addr, zr); 4708 __ addi(tmp_addr, tmp_addr, step); 4709 } 4710 // Adjust counter for every individual block. 4711 __ vid_v(counter_vr); 4712 __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr); 4713 4714 // Perform 10 iterations of the 8 quarter round set 4715 { 4716 const Register loop = t2; // share t2 with other non-overlapping usages. 
4717 __ mv(loop, 10); 4718 __ BIND(L_Rounds); 4719 4720 chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8], work_vrs[12], tmp_vr); 4721 chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9], work_vrs[13], tmp_vr); 4722 chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr); 4723 chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr); 4724 4725 chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr); 4726 chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr); 4727 chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8], work_vrs[13], tmp_vr); 4728 chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9], work_vrs[14], tmp_vr); 4729 4730 __ sub(loop, loop, 1); 4731 __ bnez(loop, L_Rounds); 4732 } 4733 4734 // Add the original state into the end working state. 4735 // We do this by first duplicating every element in source state array to the corresponding 4736 // vector, then adding it to the post-loop working state. 4737 __ mv(tmp_addr, state); 4738 for (int i = 0; i < states_len; i += 1) { 4739 __ vlse32_v(tmp_vr, tmp_addr, zr); 4740 __ addi(tmp_addr, tmp_addr, step); 4741 __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr); 4742 } 4743 // Add the counter overlay onto work_vrs[12] at the end. 4744 __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr); 4745 4746 // Store result to key stream. 4747 { 4748 const Register stride = t2; // share t2 with other non-overlapping usages. 4749 // Every block occupies 64 bytes, so we use 64 as stride of the vector store. 
      __ mv(stride, 64);
      for (int i = 0; i < states_len; i += 1) {
        __ vsse32_v(work_vrs[i], key_stream, stride);
        __ addi(key_stream, key_stream, step);
      }
    }

    // Return length of output key_stream
    __ slli(c_rarg0, length, 6);

    __ leave();
    __ ret();

    return (address) start;
  }


  // ------------------------ SHA-1 intrinsic ------------------------

  // Emit code loading the SHA-1 round constant K't for `round` into cur_k.
  // K't =
  //    5a827999, 0  <= t <= 19
  //    6ed9eba1, 20 <= t <= 39
  //    8f1bbcdc, 40 <= t <= 59
  //    ca62c1d6, 60 <= t <= 79
  // The constant only changes every 20 rounds, so no instruction is emitted
  // for the other rounds (cur_k keeps its previous value).
  void sha1_prepare_k(Register cur_k, int round) {
    assert(round >= 0 && round < 80, "must be");

    static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
    if ((round % 20) == 0) {
      __ mv(cur_k, ks[round/20]);
    }
  }

  // Emit code computing the message-schedule word W't for `round` into cur_w.
  // ws[] is the sliding window of the most recent 16 W values, packed two
  // 32-bit words per 64-bit register (see layout comments below).
  // W't =
  //    M't,                                      0  <= t <= 15
  //    ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
  void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) {
    assert(round >= 0 && round < 80, "must be");

    if (round < 16) {
      // in the first 16 rounds, in ws[], every register contains 2 W't, e.g.
      //   in ws[0], high part contains W't-0, low part contains W't-1,
      //   in ws[1], high part contains W't-2, low part contains W't-3,
      //   ...
      //   in ws[7], high part contains W't-14, low part contains W't-15.

      if ((round % 2) == 0) {
        // even round: load the next two message words at once and use the
        // high half now; the low half is consumed by the following odd round.
        __ ld(ws[round/2], Address(buf, (round/2) * 8));
        // reverse bytes, as SHA-1 is defined in big-endian.
        __ revb(ws[round/2], ws[round/2]);
        __ srli(cur_w, ws[round/2], 32);
      } else {
        // odd round: the word is already loaded; it sits in the low 32 bits.
        // (callers only consume the low 32 bits of cur_w.)
        __ mv(cur_w, ws[round/2]);
      }

      return;
    }

    if ((round % 2) == 0) {
      int idx = 16;
      // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
      __ srli(t1, ws[(idx-8)/2], 32);
      __ xorr(t0, ws[(idx-3)/2], t1);

      __ srli(t1, ws[(idx-14)/2], 32);
      __ srli(cur_w, ws[(idx-16)/2], 32);
      __ xorr(cur_w, cur_w, t1);

      __ xorr(cur_w, cur_w, t0);
      __ rolw_imm(cur_w, cur_w, 1, t0);

      // copy the cur_w value to ws[8].
      // now, valid w't values are at:
      //   w0:       ws[0]'s lower 32 bits
      //   w1 ~ w14: ws[1] ~ ws[7]
      //   w15:      ws[8]'s higher 32 bits
      __ slli(ws[idx/2], cur_w, 32);

      return;
    }

    int idx = 17;
    // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
    __ srli(t1, ws[(idx-3)/2], 32);
    __ xorr(t0, t1, ws[(idx-8)/2]);

    __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]);

    __ xorr(cur_w, cur_w, t0);
    __ rolw_imm(cur_w, cur_w, 1, t0);

    // copy the cur_w value to ws[8]
    __ zero_extend(cur_w, cur_w, 32);
    __ orr(ws[idx/2], ws[idx/2], cur_w);

    // shift the w't registers, so they start from ws[0] again.
    // now, valid w't values are at:
    //   w0 ~ w15: ws[0] ~ ws[7]
    Register ws_0 = ws[0];
    for (int i = 0; i < 16/2; i++) {
      ws[i] = ws[i+1];
    }
    ws[8] = ws_0;
  }

  // Emit code computing the SHA-1 logical function f't(x, y, z) into dst.
  // Clobbers t0 and t1.
  // f't(x, y, z) =
  //    Ch(x, y, z)     = (x & y) ^ (~x & z)          , 0  <= t <= 19
  //    Parity(x, y, z) = x ^ y ^ z                   , 20 <= t <= 39
  //    Maj(x, y, z)    = (x & y) ^ (x & z) ^ (y & z) , 40 <= t <= 59
  //    Parity(x, y, z) = x ^ y ^ z                   , 60 <= t <= 79
  void sha1_f(Register dst, Register x, Register y, Register z, int round) {
    assert(round >= 0 && round < 80, "must be");
    assert_different_registers(dst, x, y, z, t0, t1);

    if (round < 20) {
      // (x & y) ^ (~x & z)
      // andn computes z & ~x, so operand order below is (z, x).
      __ andr(t0, x, y);
      __ andn(dst, z, x);
      __ xorr(dst, dst, t0);
    } else if (round >= 40 && round < 60) {
      // (x & y) ^ (x & z) ^ (y & z)
      __ andr(t0, x, y);
      __ andr(t1, x, z);
      __ andr(dst, y, z);
      __ xorr(dst, dst, t0);
      __ xorr(dst, dst, t1);
    } else {
      // x ^ y ^ z
      __ xorr(dst, x, y);
      __ xorr(dst, dst, z);
    }
  }

  // Emit code for one SHA-1 round: compute T, then rotate the five working
  // variables.
  // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
  // e = d
  // d = c
  // c = ROTL'30(b)
  // b = a
  // a = T
  void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
                          Register cur_k, Register cur_w, Register tmp, int round) {
    assert(round >= 0 && round < 80, "must be");
    assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0);

    // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't

    // cur_w will be recalculated at the beginning of each round,
    // so, we can reuse it as a temp register here.
    Register tmp2 = cur_w;

    // reuse e as a temporary register, as we will mv new value into it later
    Register tmp3 = e;
    __ add(tmp2, cur_k, tmp2);    // tmp2 = K't + W't
    __ add(tmp3, tmp3, tmp2);     // tmp3 = e + K't + W't
    __ rolw_imm(tmp2, a, 5, t0);  // tmp2 = ROTL'5(a)

    sha1_f(tmp, b, c, d, round);  // tmp  = f't(b, c, d)

    __ add(tmp2, tmp2, tmp);
    __ add(tmp2, tmp2, tmp3);     // tmp2 = T

    // e = d
    // d = c
    // c = ROTL'30(b)
    // b = a
    // a = T
    __ mv(e, d);
    __ mv(d, c);

    __ rolw_imm(c, b, 30);
    __ mv(b, a);
    __ mv(a, tmp2);
  }

  // Emit code adding the preserved previous-block state (packed into
  // prev_ab/prev_cd/prev_e by sha1_preserve_prev_abcde) into the working
  // variables, producing the intermediate hash. Consumes (shifts) prev_ab
  // and prev_cd.
  // H(i)0 = a + H(i-1)0
  // H(i)1 = b + H(i-1)1
  // H(i)2 = c + H(i-1)2
  // H(i)3 = d + H(i-1)3
  // H(i)4 = e + H(i-1)4
  void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
                              Register prev_ab, Register prev_cd, Register prev_e) {
    assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);

    // prev_ab packs a in the low 32 bits and b in the high 32 bits;
    // prev_cd packs c/d the same way. Only the low 32 bits of each
    // working variable are meaningful (see the loading comments in
    // generate_sha1_implCompress).
    __ add(a, a, prev_ab);
    __ srli(prev_ab, prev_ab, 32);
    __ add(b, b, prev_ab);

    __ add(c, c, prev_cd);
    __ srli(prev_cd, prev_cd, 32);
    __ add(d, d, prev_cd);

    __ add(e, e, prev_e);
  }

  // Emit code saving the current a/b/c/d/e into prev_ab (a low, b high),
  // prev_cd (c low, d high) and prev_e, to be added back by
  // sha1_calculate_im_hash after the 80 rounds. Clobbers t0.
  void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
                                Register prev_ab, Register prev_cd, Register prev_e) {
    assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);

    __ slli(t0, b, 32);
    __ zero_extend(prev_ab, a, 32);
    __ orr(prev_ab, prev_ab, t0);

    __ slli(t0, d, 32);
    __ zero_extend(prev_cd, c, 32);
    __ orr(prev_cd, prev_cd, t0);

    __ mv(prev_e, e);
  }

  // Intrinsic for:
  //   void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
  //   void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0: byte[]  src array + offset
  //   c_rarg1: int[]   SHA.state
  //   - - - - - - below are only for
  //   implCompressMultiBlock0 - - - - - -
  //   c_rarg2: int     offset
  //   c_rarg3: int     limit
  //
  // Outputs:
  //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
  //   c_rarg0: int offset, when (multi_block == true)
  //
  address generate_sha1_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);

    address start = __ pc();
    __ enter();

    RegSet saved_regs = RegSet::range(x18, x27);
    if (multi_block) {
      // use x9 as src below.
      saved_regs += RegSet::of(x9);
    }
    __ push_reg(saved_regs, sp);

    // c_rarg0 - c_rarg3: x10 - x13
    Register buf    = c_rarg0;
    Register state  = c_rarg1;
    Register offset = c_rarg2;
    Register limit  = c_rarg3;
    // use src to contain the original start point of the array.
    Register src    = x9;

    if (multi_block) {
      // turn `limit` into the end address of the data, and rewind `src`
      // to the start of the array, so the returned offset can be computed
      // at exit as (buf - src).
      __ sub(limit, limit, offset);
      __ add(limit, limit, buf);
      __ sub(src, buf, offset);
    }

    // [args-reg]:  x14 - x17
    // [temp-reg]:  x28 - x31
    // [saved-reg]: x18 - x27

    // h0/1/2/3/4
    const Register a = x14, b = x15, c = x16, d = x17, e = x28;
    // w0, w1, ... w15
    // put two adjacent w's in one register:
    //   one at high word part, another at low word part
    // at different round (even or odd), w't value reside in different items in ws[].
    // w0 ~ w15, either reside in
    //   ws[0] ~ ws[7], where
    //     w0 at higher 32 bits of ws[0],
    //     w1 at lower 32 bits of ws[0],
    //     ...
    //     w14 at higher 32 bits of ws[7],
    //     w15 at lower 32 bits of ws[7].
    // or, reside in
    //   w0:       ws[0]'s lower 32 bits
    //   w1 ~ w14: ws[1] ~ ws[7]
    //   w15:      ws[8]'s higher 32 bits
    Register ws[9] = {x29, x30, x31, x18,
                      x19, x20, x21, x22,
                      x23}; // auxiliary register for calculating w's value
    // current k't's value
    const Register cur_k = x24;
    // current w't's value
    const Register cur_w = x25;
    // values of a, b, c, d, e in the previous round
    const Register prev_ab = x26, prev_cd = x27;
    const Register prev_e = offset; // reuse offset/c_rarg2

    // load 5 words state into a, b, c, d, e.
    //
    // To minimize the number of memory operations, we apply following
    // optimization: read the states (a/b/c/d) of 4-byte values in pairs,
    // with a single ld, and split them into 2 registers.
    //
    // And, as the core algorithm of SHA-1 works on 32-bits words, so
    // in the following code, it does not care about the content of
    // higher 32-bits in a/b/c/d/e. Based on this observation,
    // we can apply further optimization, which is to just ignore the
    // higher 32-bits in a/c/e, rather than set the higher
    // 32-bits of a/c/e to zero explicitly with extra instructions.
    __ ld(a, Address(state, 0));
    __ srli(b, a, 32);
    __ ld(c, Address(state, 8));
    __ srli(d, c, 32);
    __ lw(e, Address(state, 16));

    Label L_sha1_loop;
    if (multi_block) {
      __ BIND(L_sha1_loop);
    }

    // save the state before this block, to be added back afterwards
    sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);

    // fully unrolled: 80 rounds emitted inline
    for (int round = 0; round < 80; round++) {
      // prepare K't value
      sha1_prepare_k(cur_k, round);

      // prepare W't value
      sha1_prepare_w(cur_w, ws, buf, round);

      // one round process
      sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
    }

    // compute the intermediate hash value
    sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);

    if (multi_block) {
      // advance to the next 64-byte block and loop while buf <= limit
      int64_t block_bytes = 16 * 4;
      __ addi(buf, buf, block_bytes);

      __ bge(limit, buf, L_sha1_loop, true);
    }

    // store back the state.
    // a/b (and c/d) are repacked into one 64-bit register each so the
    // 5-word state is written with two sd's and one sw.
    __ zero_extend(a, a, 32);
    __ slli(b, b, 32);
    __ orr(a, a, b);
    __ sd(a, Address(state, 0));
    __ zero_extend(c, c, 32);
    __ slli(d, d, 32);
    __ orr(c, c, d);
    __ sd(c, Address(state, 8));
    __ sw(e, Address(state, 16));

    // return offset
    if (multi_block) {
      __ sub(c_rarg0, buf, src);
    }

    __ pop_reg(saved_regs, sp);

    __ leave();
    __ ret();

    return (address) start;
  }

  /**
   * vector registers:
   *   input VectorRegister's:  inputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
   *   index VectorRegister's:  idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
   *   output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
   *
   * NOTE: each field will occupy a vector register group
   */
  void base64_vector_encode_round(Register src, Register dst, Register codec,
                                  Register size, Register stepSrc, Register stepDst,
                                  VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
                                  VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
                                  VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
                                  Assembler::LMUL lmul) {
    // set vector register type/len
    __ vsetvli(x0, size, Assembler::e8, lmul);

    // segmented load src into v registers: mem(src) => vr(3)
    __ vlseg3e8_v(inputV1, src);

    // src = src + register_group_len_bytes * 3
    __ add(src, src, stepSrc);

    // encoding
    //   1. compute index into lookup table: vr(3) => vr(4)
    //      each output index holds one 6-bit group of the 24-bit input triple
    __ vsrl_vi(idxV1, inputV1, 2);

    __ vsrl_vi(idxV2, inputV2, 2);
    __ vsll_vi(inputV1, inputV1, 6);
    __ vor_vv(idxV2, idxV2, inputV1);
    __ vsrl_vi(idxV2, idxV2, 2);

    __ vsrl_vi(idxV3, inputV3, 4);
    __ vsll_vi(inputV2, inputV2, 4);
    __ vor_vv(idxV3, inputV2, idxV3);
    __ vsrl_vi(idxV3, idxV3, 2);

    // shift left then right to keep only the low 6 bits
    __ vsll_vi(idxV4, inputV3, 2);
    __ vsrl_vi(idxV4, idxV4, 2);

    //   2. indexed load: vr(4) => vr(4)
    __ vluxei8_v(outputV1, codec, idxV1);
    __ vluxei8_v(outputV2, codec, idxV2);
    __ vluxei8_v(outputV3, codec, idxV3);
    __ vluxei8_v(outputV4, codec, idxV4);

    // segmented store encoded data in v registers back to dst: vr(4) => mem(dst)
    __ vsseg4e8_v(outputV1, dst);

    // dst = dst + register_group_len_bytes * 4
    __ add(dst, dst, stepDst);
  }

  /**
   * void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
   *
   * Input arguments:
   *   c_rarg0   - src, source array
   *   c_rarg1   - sp, src start offset
   *   c_rarg2   - sl, src end offset
   *   c_rarg3   - dst, dest array
   *   c_rarg4   - dp, dst start offset
   *   c_rarg5   - isURL, Base64 or URL character set
   */
  address generate_base64_encodeBlock() {
    alignas(64) static const char toBase64[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
    };

    alignas(64) static const char toBase64URL[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "encodeBlock");
    address start = __ pc();
    __ enter();

    Register src   = c_rarg0;
    Register soff  = c_rarg1;
    Register send  = c_rarg2;
    Register dst   = c_rarg3;
    Register doff  = c_rarg4;
    Register isURL = c_rarg5;

    Register codec  = c_rarg6;
    Register length = c_rarg7; // total length of src data in bytes

    Label ProcessData, Exit;

    // length should be multiple of 3
    __ sub(length, send, soff);
    // real src/dst to process data
    __ add(src, src, soff);
    __ add(dst, dst, doff);

    // load the codec base address
    __ la(codec, ExternalAddress((address) toBase64));
    __ beqz(isURL, ProcessData);
    __ la(codec, ExternalAddress((address) toBase64URL));
    __ BIND(ProcessData);

    // vector version
    if (UseRVV) {
      Label ProcessM2, ProcessM1, ProcessScalar;

      // the argument registers consumed above are reused as scratch here
      Register size      = soff;
      Register stepSrcM1 = send;
      Register stepSrcM2 = doff;
      Register stepDst   = isURL;

      __ mv(size, MaxVectorSize * 2);
      __ mv(stepSrcM1, MaxVectorSize * 3);
      __ slli(stepSrcM2, stepSrcM1, 1);
      __ mv(stepDst, MaxVectorSize * 2 * 4);

      __ blt(length, stepSrcM2, ProcessM1);

      // m2 path: consume 3 bytes / produce 4 bytes per element group, LMUL=2
      __ BIND(ProcessM2);
      base64_vector_encode_round(src, dst, codec,
                                 size, stepSrcM2, stepDst,
                                 v2, v4, v6,         // inputs
                                 v8, v10, v12, v14,  // indexes
                                 v16, v18, v20, v22, // outputs
                                 Assembler::m2);

      __ sub(length, length, stepSrcM2);
      __ bge(length, stepSrcM2, ProcessM2);

      // m1 path: at most one more vector round with half the group size
      __ BIND(ProcessM1);
      __ blt(length, stepSrcM1, ProcessScalar);

      __ srli(size, size, 1);
      __ srli(stepDst, stepDst, 1);
      base64_vector_encode_round(src, dst, codec,
                                 size, stepSrcM1, stepDst,
                                 v1, v2, v3,         // inputs
                                 v4, v5, v6, v7,     // indexes
                                 v8, v9, v10, v11,   // outputs
                                 Assembler::m1);
      __ sub(length, length, stepSrcM1);

      __ BIND(ProcessScalar);
    }

    // scalar version
    {
      Register byte1 = soff, byte0 = send, byte2 = doff;
      Register combined24Bits = isURL;

      __ beqz(length, Exit);

      Label ScalarLoop;
      __ BIND(ScalarLoop);
      {
        // plain:   [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
        // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]

        // load 3 bytes src data
        __ lbu(byte0, Address(src, 0));
        __ lbu(byte1, Address(src, 1));
        __ lbu(byte2, Address(src, 2));
        __ addi(src, src, 3);

        // construct 24 bits from 3 bytes
        __ slliw(byte0, byte0, 16);
        __ slliw(byte1, byte1, 8);
        __ orr(combined24Bits, byte0, byte1);
        __ orr(combined24Bits, combined24Bits, byte2);

        // get codec index and encode(ie. load from codec by index)
        // each 6-bit group is isolated by a left shift followed by a
        // logical right shift of 26 (32 - 6)
        __ slliw(byte0, combined24Bits, 8);
        __ srliw(byte0, byte0, 26);
        __ add(byte0, codec, byte0);
        __ lbu(byte0, byte0);

        __ slliw(byte1, combined24Bits, 14);
        __ srliw(byte1, byte1, 26);
        __ add(byte1, codec, byte1);
        __ lbu(byte1, byte1);

        __ slliw(byte2, combined24Bits, 20);
        __ srliw(byte2, byte2, 26);
        __ add(byte2, codec, byte2);
        __ lbu(byte2, byte2);

        __ andi(combined24Bits, combined24Bits, 0x3f);
        __ add(combined24Bits, codec, combined24Bits);
        __ lbu(combined24Bits, combined24Bits);

        // store 4 bytes encoded data
        __ sb(byte0, Address(dst, 0));
        __ sb(byte1, Address(dst, 1));
        __ sb(byte2, Address(dst, 2));
        __ sb(combined24Bits, Address(dst, 3));

        __ sub(length, length, 3);
        __ addi(dst, dst, 4);
        // loop back
        __ bnez(length, ScalarLoop);
      }
    }

    __ BIND(Exit);

    __ leave();
    __ ret();

    return (address) start;
  }

  /**
   * vector registers:
   *   input VectorRegister's:  inputV1-V4, for m2 they could be v2, v4, v6, v8, for m1 they could be v1, v2, v3, v4
   *   index VectorRegister's:  idxV1-V4, for m2 they could be v10, v12, v14, v16, for m1 they could be v5, v6, v7, v8
   *   output VectorRegister's: outputV1-V3, for m2 they could be v18, v20, v22, for m1 they could be v9, v10, v11
   *
   * NOTE: each field will occupy a single vector register group
   */
  void base64_vector_decode_round(Register src, Register dst, Register codec,
                                  Register size, Register stepSrc, Register stepDst, Register failedIdx, Register minusOne,
                                  VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
                                  VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
                                  VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
                                  Assembler::LMUL lmul) {
    // set vector register type/len
    __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);

    // segmented load src into v registers: mem(src) => vr(4)
    __ vlseg4e8_v(inputV1, src);

    // src = src + register_group_len_bytes * 4
    __ add(src, src, stepSrc);

    // decoding
    //   1. indexed load: vr(4) => vr(4)
    __ vluxei8_v(idxV1, codec, inputV1);
    __ vluxei8_v(idxV2, codec, inputV2);
    __ vluxei8_v(idxV3, codec, inputV3);
    __ vluxei8_v(idxV4, codec, inputV4);

    //   2. check wrong data
    //      the codec tables map invalid characters to 255; OR-ing all four
    //      lookups together and comparing against -1 finds the first lane
    //      containing any invalid character.
    __ vor_vv(outputV1, idxV1, idxV2);
    __ vor_vv(outputV2, idxV3, idxV4);
    __ vor_vv(outputV1, outputV1, outputV2);
    __ vmseq_vi(v0, outputV1, -1);
    __ vfirst_m(failedIdx, v0);
    Label NoFailure;
    __ beq(failedIdx, minusOne, NoFailure);
    // some lane failed: shrink vl to the valid prefix (failedIdx lanes),
    // and shrink the dst advance to match: stepDst = failedIdx * 3.
    __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
    __ slli(stepDst, failedIdx, 1);
    __ add(stepDst, failedIdx, stepDst);
    __ BIND(NoFailure);

    //   3. compute the decoded data: vr(4) => vr(3)
    __ vsll_vi(idxV1, idxV1, 2);
    __ vsrl_vi(outputV1, idxV2, 4);
    __ vor_vv(outputV1, outputV1, idxV1);

    __ vsll_vi(idxV2, idxV2, 4);
    __ vsrl_vi(outputV2, idxV3, 2);
    __ vor_vv(outputV2, outputV2, idxV2);

    __ vsll_vi(idxV3, idxV3, 6);
    __ vor_vv(outputV3, idxV4, idxV3);

    // segmented store encoded data in v registers back to dst: vr(3) => mem(dst)
    __ vsseg3e8_v(outputV1, dst);

    // dst = dst + register_group_len_bytes * 3
    __ add(dst, dst, stepDst);
  }

  /**
   * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
   *
   * Input arguments:
   *   c_rarg0   - src, source array
   *   c_rarg1   - sp, src start offset
   *   c_rarg2   - sl, src end offset
   *   c_rarg3   - dst, dest array
   *   c_rarg4   - dp, dst start offset
   *   c_rarg5   - isURL, Base64 or URL character set
   *   c_rarg6   - isMIME, Decoding MIME block
   */
  address generate_base64_decodeBlock() {

    // 255 marks characters that are not part of the alphabet.
    static const uint8_t fromBase64[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
      52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
      15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
      255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
      41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    // same table for the URL-safe alphabet: '-' => 62, '_' => 63.
    static const uint8_t fromBase64URL[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
      52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
      15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
      255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
      41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "decodeBlock");
    address start = __ pc();
    __ enter();

    Register src    = c_rarg0;
    Register soff   = c_rarg1;
    Register send   = c_rarg2;
    Register dst    = c_rarg3;
    Register doff   = c_rarg4;
    Register isURL  = c_rarg5;
    Register isMIME = c_rarg6;

    Register codec     = c_rarg7;
    Register dstBackup = x31;
    Register length    = x28; // t3, total length of src data in bytes

    Label ProcessData, Exit;
    Label ProcessScalar, ScalarLoop;

    // passed in length (send - soff) is guaranteed to be > 4,
    // and in this intrinsic we only process data of length in multiple of 4,
    // it's not guaranteed to be multiple of 4 by java level, so do it explicitly
    __ sub(length, send, soff);
    __ andi(length, length, -4);
    // real src/dst to process data
    __ add(src, src, soff);
    __ add(dst, dst, doff);
    // backup of dst, used to calculate the return value at exit
    __ mv(dstBackup, dst);

    // load the codec base address
    __ la(codec, ExternalAddress((address) fromBase64));
    __ beqz(isURL, ProcessData);
    __ la(codec, ExternalAddress((address) fromBase64URL));
    __ BIND(ProcessData);

    // vector version
    if (UseRVV) {
      // for MIME case, it has a default length limit of 76 which could be
      // different(smaller) from (send - soff), so in MIME case, we go through
      // the scalar code path directly.
      __ bnez(isMIME, ScalarLoop);

      Label ProcessM1, ProcessM2;

      // the argument registers consumed above are reused as scratch here
      Register failedIdx = soff;
      Register stepSrcM1 = send;
      Register stepSrcM2 = doff;
      Register stepDst   = isURL;
      Register size      = x29; // t4
      Register minusOne  = x30; // t5

      __ mv(minusOne, -1);
      __ mv(size, MaxVectorSize * 2);
      __ mv(stepSrcM1, MaxVectorSize * 4);
      __ slli(stepSrcM2, stepSrcM1, 1);
      __ mv(stepDst, MaxVectorSize * 2 * 3);

      __ blt(length, stepSrcM2, ProcessM1);


      // Assembler::m2
      __ BIND(ProcessM2);
      base64_vector_decode_round(src, dst, codec,
                                 size, stepSrcM2, stepDst, failedIdx, minusOne,
                                 v2, v4, v6, v8,     // inputs
                                 v10, v12, v14, v16, // indexes
                                 v18, v20, v22,      // outputs
                                 Assembler::m2);
      __ sub(length, length, stepSrcM2);

      // error check
      __ bne(failedIdx, minusOne, Exit);

      __ bge(length, stepSrcM2, ProcessM2);


      // Assembler::m1
      __ BIND(ProcessM1);
      __ blt(length, stepSrcM1, ProcessScalar);

      __ srli(size, size, 1);
      __ srli(stepDst, stepDst, 1);
      base64_vector_decode_round(src, dst, codec,
                                 size, stepSrcM1, stepDst, failedIdx, minusOne,
                                 v1, v2, v3, v4, // inputs
                                 v5, v6, v7, v8, // indexes
                                 v9, v10, v11,   // outputs
                                 Assembler::m1);
      __ sub(length, length, stepSrcM1);

      // error check
      __ bne(failedIdx, minusOne, Exit);

      __ BIND(ProcessScalar);
      __ beqz(length, Exit);
    }

    // scalar version
    {
      Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
      Register combined32Bits = x29; // t4

      // encoded:  [byte0[5:0] : byte1[5:0] : byte2[5:0]] : byte3[5:0]] =>
      // plain:    [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
      __ BIND(ScalarLoop);

      // load 4 bytes encoded src data
      __ lbu(byte0, Address(src, 0));
      __ lbu(byte1, Address(src, 1));
      __ lbu(byte2, Address(src, 2));
      __ lbu(byte3, Address(src, 3));
      __ addi(src, src, 4);

      // get codec index and decode (ie. load from codec by index)
      // signed lb is used on purpose: an invalid character decodes to 255,
      // which loads as -1 and makes combined32Bits negative (checked below).
      __ add(byte0, codec, byte0);
      __ add(byte1, codec, byte1);
      __ lb(byte0, Address(byte0, 0));
      __ lb(byte1, Address(byte1, 0));
      __ add(byte2, codec, byte2);
      __ add(byte3, codec, byte3);
      __ lb(byte2, Address(byte2, 0));
      __ lb(byte3, Address(byte3, 0));
      __ slliw(byte0, byte0, 18);
      __ slliw(byte1, byte1, 12);
      __ orr(byte0, byte0, byte1);
      __ orr(byte0, byte0, byte3);
      __ slliw(byte2, byte2, 6);
      // For performance consideration, `combined32Bits` is constructed for 2 purposes at the same time,
      //   1. error check below
      //   2. decode below
      __ orr(combined32Bits, byte0, byte2);

      // error check: negative iff any of the four table lookups returned 255
      __ bltz(combined32Bits, Exit);

      // store 3 bytes decoded data
      __ sraiw(byte0, combined32Bits, 16);
      __ sraiw(byte1, combined32Bits, 8);
      __ sb(byte0, Address(dst, 0));
      __ sb(byte1, Address(dst, 1));
      __ sb(combined32Bits, Address(dst, 2));

      __ sub(length, length, 4);
      __ addi(dst, dst, 3);
      // loop back
      __ bnez(length, ScalarLoop);
    }

    __ BIND(Exit);
    // return the number of bytes written (dst - original dst)
    __ sub(c_rarg0, dst, dstBackup);

    __ leave();
    __ ret();

    return (address) start;
  }

  void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
    VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
    Register temp0, Register temp1, Register temp2, Register temp3,
    VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {

    assert((lmul == Assembler::m4 && step == 64) ||
           (lmul == Assembler::m2 && step == 32) ||
           (lmul == Assembler::m1 && step == 16),
           "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
    // Below is function for calculating Adler32 checksum with 64-, 32- or 16-byte step. LMUL=m4, m2 or m1 is used.
    // The results are in v12, v13, ..., v22, v23. Example below is for 64-byte step case.
    // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
    // In non-vectorized code, we update s1 and s2 as:
    //   s1 <- s1 + b1
    //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + b1
    //   ...
    //   s1 <- s1 + b64
    //   s2 <- s2 + s1
    // Putting above assignments together, we have:
    //   s1_new = s1 + b1 + b2 + ... + b64
    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
    //          = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
    //          = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)

    __ mv(temp3, step);
    // Load data
    __ vsetvli(temp0, temp3, Assembler::e8, lmul);
    __ vle8_v(vbytes, buff);
    __ addi(buff, buff, step);

    // Upper bound reduction sum for s1_new:
    //   0xFF * 64 = 0x3FC0, so:
    //   1. Need to do vector-widening reduction sum
    //   2. It is safe to perform sign-extension during vmv.x.s with 16-bits elements
    __ vwredsumu_vs(vs1acc, vbytes, vzero);
    // Multiplication for s2_new: per-byte product with the (step, step-1, ..., 1)
    // coefficient table
    __ vwmulu_vv(vs2acc, vtable, vbytes);

    // s2 = s2 + s1 * step (shift amount is log2(step))
    __ slli(temp1, s1, exact_log2(step));
    __ add(s2, s2, temp1);

    // Summing up calculated results for s2_new
    if (MaxVectorSize > 16) {
      __ vsetvli(temp0, temp3, Assembler::e16, lmul);
    } else {
      // Half of vector-widening multiplication result is in successor of vs2acc
      // group for vlen == 16, in which case we need to double vector register
      // group width in order to reduction sum all of them
      Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
                               (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
      __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
    }
    // Upper bound for reduction sum:
    //   0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for whole register group, so:
    //   1. Need to do vector-widening reduction sum
    //   2. It is safe to perform sign-extension during vmv.x.s with 32-bits elements
    __ vwredsumu_vs(vtemp1, vs2acc, vzero);

    // Extracting results for:
    //   s1_new
    __ vmv_x_s(temp0, vs1acc);
    __ add(s1, s1, temp0);
    //   s2_new
    __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
    __ vmv_x_s(temp1, vtemp1);
    __ add(s2, s2, temp1);
  }

  /***
   * int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
   *
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int   adler
   *   c_rarg1   - byte* buff (b + off)
   *   c_rarg2   - int   len
   *
   * Output:
   *   c_rarg0   - int adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
          L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    Register adler = c_rarg0;
    Register s1    = c_rarg0;
    Register s2    = c_rarg3;
    Register buff  = c_rarg1;
    Register len   = c_rarg2;
    Register nmax  = c_rarg4;
    Register base  = c_rarg5;
    Register count = c_rarg6;
    Register temp0 = x28; // t3
    Register temp1 = x29; // t4
    Register temp2 = x30; // t5
    Register temp3 = x31; // t6

    VectorRegister vzero     = v31;
    VectorRegister vbytes    = v8;  // group: v8, v9, v10, v11
    VectorRegister vs1acc    = v12; // group: v12, v13, v14, v15
    VectorRegister vs2acc    = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
    VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
    VectorRegister vtable_32 = v4;  // group: v4, v5
    VectorRegister vtable_16 = v30;
    VectorRegister vtemp1    = v28;
    VectorRegister vtemp2    = v29;

    // Max number of bytes we can process before having to take the mod
bytes we can process before having to take the mod 5715 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 5716 const uint64_t BASE = 0xfff1; 5717 const uint64_t NMAX = 0x15B0; 5718 5719 // Loops steps 5720 int step_64 = 64; 5721 int step_32 = 32; 5722 int step_16 = 16; 5723 int step_1 = 1; 5724 5725 __ enter(); // Required for proper stackwalking of RuntimeStub frame 5726 __ mv(temp1, 64); 5727 __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4); 5728 5729 // Generating accumulation coefficients for further calculations 5730 // vtable_64: 5731 __ vid_v(vtemp1); 5732 __ vrsub_vx(vtable_64, vtemp1, temp1); 5733 // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 } 5734 5735 // vtable_32: 5736 __ mv(temp1, 32); 5737 __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2); 5738 __ vid_v(vtemp1); 5739 __ vrsub_vx(vtable_32, vtemp1, temp1); 5740 // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 } 5741 5742 __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1); 5743 // vtable_16: 5744 __ mv(temp1, 16); 5745 __ vid_v(vtemp1); 5746 __ vrsub_vx(vtable_16, vtemp1, temp1); 5747 // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 } 5748 5749 __ vmv_v_i(vzero, 0); 5750 5751 __ mv(base, BASE); 5752 __ mv(nmax, NMAX); 5753 5754 // s1 is initialized to the lower 16 bits of adler 5755 // s2 is initialized to the upper 16 bits of adler 5756 __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff) 5757 __ zero_extend(s1, adler, 16); // s1 = (adler & 0xffff) 5758 5759 // The pipelined loop needs at least 16 elements for 1 iteration 5760 // It does check this, but it is more effective to skip to the cleanup loop 5761 __ mv(temp0, step_16); 5762 __ bgeu(len, temp0, L_nmax); 5763 __ beqz(len, L_combine); 5764 5765 // Jumping to L_by1_loop 5766 __ sub(len, len, step_1); 5767 __ j(L_by1_loop); 5768 5769 __ bind(L_nmax); 5770 __ sub(len, len, nmax); 5771 __ sub(count, nmax, 16); 5772 __ bltz(len, 
L_by16); 5773 5774 // Align L_nmax loop by 64 5775 __ bind(L_nmax_loop_entry); 5776 __ sub(count, count, 32); 5777 5778 __ bind(L_nmax_loop); 5779 adler32_process_bytes(buff, s1, s2, vtable_64, vzero, 5780 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 5781 vtemp1, vtemp2, step_64, Assembler::m4); 5782 __ sub(count, count, step_64); 5783 __ bgtz(count, L_nmax_loop); 5784 5785 // There are three iterations left to do 5786 adler32_process_bytes(buff, s1, s2, vtable_32, vzero, 5787 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 5788 vtemp1, vtemp2, step_32, Assembler::m2); 5789 adler32_process_bytes(buff, s1, s2, vtable_16, vzero, 5790 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 5791 vtemp1, vtemp2, step_16, Assembler::m1); 5792 5793 // s1 = s1 % BASE 5794 __ remuw(s1, s1, base); 5795 // s2 = s2 % BASE 5796 __ remuw(s2, s2, base); 5797 5798 __ sub(len, len, nmax); 5799 __ sub(count, nmax, 16); 5800 __ bgez(len, L_nmax_loop_entry); 5801 5802 __ bind(L_by16); 5803 __ add(len, len, count); 5804 __ bltz(len, L_by1); 5805 // Trying to unroll 5806 __ mv(temp3, step_64); 5807 __ blt(len, temp3, L_by16_loop); 5808 5809 __ bind(L_by16_loop_unroll); 5810 adler32_process_bytes(buff, s1, s2, vtable_64, vzero, 5811 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 5812 vtemp1, vtemp2, step_64, Assembler::m4); 5813 __ sub(len, len, step_64); 5814 // By now the temp3 should still be 64 5815 __ bge(len, temp3, L_by16_loop_unroll); 5816 5817 __ bind(L_by16_loop); 5818 adler32_process_bytes(buff, s1, s2, vtable_16, vzero, 5819 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 5820 vtemp1, vtemp2, step_16, Assembler::m1); 5821 __ sub(len, len, step_16); 5822 __ bgez(len, L_by16_loop); 5823 5824 __ bind(L_by1); 5825 __ add(len, len, 15); 5826 __ bltz(len, L_do_mod); 5827 5828 __ bind(L_by1_loop); 5829 __ lbu(temp0, Address(buff, 0)); 5830 __ addi(buff, buff, step_1); 5831 __ add(s1, temp0, s1); 5832 __ add(s2, s2, s1); 5833 __ sub(len, len, step_1); 5834 __ 
bgez(len, L_by1_loop); 5835 5836 __ bind(L_do_mod); 5837 // s1 = s1 % BASE 5838 __ remuw(s1, s1, base); 5839 // s2 = s2 % BASE 5840 __ remuw(s2, s2, base); 5841 5842 // Combine lower bits and higher bits 5843 // adler = s1 | (s2 << 16) 5844 __ bind(L_combine); 5845 __ slli(s2, s2, 16); 5846 __ orr(s1, s1, s2); 5847 5848 __ leave(); // Required for proper stackwalking of RuntimeStub frame 5849 __ ret(); 5850 5851 return start; 5852 } 5853 5854 #endif // COMPILER2_OR_JVMCI 5855 5856 #ifdef COMPILER2 5857 5858 static const int64_t right_2_bits = right_n_bits(2); 5859 static const int64_t right_3_bits = right_n_bits(3); 5860 5861 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 5862 // are represented as long[5], with BITS_PER_LIMB = 26. 5863 // Pack five 26-bit limbs into three 64-bit registers. 5864 void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) { 5865 assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2); 5866 5867 // The goal is to have 128-bit value in dest2:dest1:dest0 5868 __ ld(dest0, Address(src, 0)); // 26 bits in dest0 5869 5870 __ ld(tmp1, Address(src, sizeof(jlong))); 5871 __ slli(tmp1, tmp1, 26); 5872 __ add(dest0, dest0, tmp1); // 52 bits in dest0 5873 5874 __ ld(tmp2, Address(src, 2 * sizeof(jlong))); 5875 __ slli(tmp1, tmp2, 52); 5876 __ add(dest0, dest0, tmp1); // dest0 is full 5877 5878 __ srli(dest1, tmp2, 12); // 14-bit in dest1 5879 5880 __ ld(tmp1, Address(src, 3 * sizeof(jlong))); 5881 __ slli(tmp1, tmp1, 14); 5882 __ add(dest1, dest1, tmp1); // 40-bit in dest1 5883 5884 __ ld(tmp1, Address(src, 4 * sizeof(jlong))); 5885 __ slli(tmp2, tmp1, 40); 5886 __ add(dest1, dest1, tmp2); // dest1 is full 5887 5888 if (dest2->is_valid()) { 5889 __ srli(tmp1, tmp1, 24); 5890 __ mv(dest2, tmp1); // 2 bits in dest2 5891 } else { 5892 #ifdef ASSERT 5893 Label OK; 5894 __ srli(tmp1, tmp1, 24); 5895 __ beq(zr, tmp1, OK); // 2 bits 5896 __ stop("high bits of Poly1305 
integer should be zero"); 5897 __ should_not_reach_here(); 5898 __ bind(OK); 5899 #endif 5900 } 5901 } 5902 5903 // As above, but return only a 128-bit integer, packed into two 5904 // 64-bit registers. 5905 void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) { 5906 poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2); 5907 } 5908 5909 // U_2:U_1:U_0: += (U_2 >> 2) * 5 5910 void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) { 5911 assert_different_registers(U_2, U_1, U_0, tmp1, tmp2); 5912 5913 // First, U_2:U_1:U_0 += (U_2 >> 2) 5914 __ srli(tmp1, U_2, 2); 5915 __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2 5916 __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits 5917 __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2 5918 __ add(U_2, U_2, tmp2); 5919 5920 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 5921 __ slli(tmp1, tmp1, 2); 5922 __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2 5923 __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2 5924 __ add(U_2, U_2, tmp2); 5925 } 5926 5927 // Poly1305, RFC 7539 5928 // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs) 5929 5930 // Arguments: 5931 // c_rarg0: input_start -- where the input is stored 5932 // c_rarg1: length 5933 // c_rarg2: acc_start -- where the output will be stored 5934 // c_rarg3: r_start -- where the randomly generated 128-bit key is stored 5935 5936 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 5937 // description of the tricks used to simplify and accelerate this 5938 // computation. 
  address generate_poly1305_processBlocks() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
    address start = __ pc();
    __ enter();
    Label here;

    // x18..x21 are callee-saved; everything else we draw from is scratch.
    RegSet saved_regs = RegSet::range(x18, x21);
    RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin();
    __ push_reg(saved_regs, sp);

    // Arguments
    const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;

    // R_n is the 128-bit randomly-generated key, packed into two
    // registers. The caller passes this key to us as long[5], with
    // BITS_PER_LIMB = 26.
    const Register R_0 = *regs, R_1 = *++regs;
    poly1305_pack_26(R_0, R_1, r_start, t1, t2);

    // RR_n is (R_n >> 2) * 5
    // shadd computes (t1 << 2) + t1, i.e. t1 * 5.
    const Register RR_0 = *++regs, RR_1 = *++regs;
    __ srli(t1, R_0, 2);
    __ shadd(RR_0, t1, t1, t2, 2);
    __ srli(t1, R_1, 2);
    __ shadd(RR_1, t1, t1, t2, 2);

    // U_n is the current checksum
    const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
    poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);

    static constexpr int BLOCK_LENGTH = 16;
    Label DONE, LOOP;

    __ mv(t1, BLOCK_LENGTH);
    __ blt(length, t1, DONE); {
      __ bind(LOOP);

      // S_n is to be the sum of U_n and the next block of data
      const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
      __ ld(S_0, Address(input_start, 0));
      __ ld(S_1, Address(input_start, wordSize));

      __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1
      __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1
      __ add(S_2, U_2, t1);

      // Append the 2^128 bit to the block, as required for every full
      // 16-byte block by the Poly1305 definition (RFC 7539).
      __ addi(S_2, S_2, 1);

      const Register U_0HI = *++regs, U_1HI = *++regs;

      // NB: this logic depends on some of the special properties of
      // Poly1305 keys. In particular, because we know that the top
      // four bits of R_0 and R_1 are zero, we can add together
      // partial products without any risk of needing to propagate a
      // carry out.
      __ wide_mul(U_0, U_0HI, S_0, R_0);
      __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
      __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);

      __ wide_mul(U_1, U_1HI, S_0, R_1);
      __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
      __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);

      __ andi(U_2, R_0, right_2_bits);
      __ mul(U_2, S_2, U_2);

      // Partial reduction mod 2**130 - 5
      __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
      __ adc(U_2, U_2, U_1HI, t1);
      // Sum is now in U_2:U_1:U_0.

      // U_2:U_1:U_0: += (U_2 >> 2) * 5
      poly1305_reduce(U_2, U_1, U_0, t1, t2);

      __ sub(length, length, BLOCK_LENGTH);
      __ addi(input_start, input_start, BLOCK_LENGTH);
      __ mv(t1, BLOCK_LENGTH);
      __ bge(length, t1, LOOP);
    }

    // Further reduce modulo 2^130 - 5
    poly1305_reduce(U_2, U_1, U_0, t1, t2);

    // Unpack the sum into five 26-bit limbs and write to memory.
    // Each limb is isolated with a shift-left/shift-right pair
    // (slli by (64 - 26 - pos), then srli by 38 = 64 - 26).
    // First 26 bits is the first limb
    __ slli(t1, U_0, 38); // Take lowest 26 bits
    __ srli(t1, t1, 38);
    __ sd(t1, Address(acc_start)); // First 26-bit limb

    // 27-52 bits of U_0 is the second limb
    __ slli(t1, U_0, 12); // Take next 27-52 bits
    __ srli(t1, t1, 38);
    __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb

    // Getting 53-64 bits of U_0 and 1-14 bits of U_1 in one register
    __ srli(t1, U_0, 52);
    __ slli(t2, U_1, 50);
    __ srli(t2, t2, 38);
    __ add(t1, t1, t2);
    __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb

    // Storing 15-40 bits of U_1
    __ slli(t1, U_1, 24); // Already used up 14 bits
    __ srli(t1, t1, 38); // Clear all other bits from t1
    __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb

    // Storing 41-64 bits of U_1 and first three bits from U_2 in one register
    __ srli(t1, U_1, 40);
    __ andi(t2, U_2, right_3_bits);
    __ slli(t2, t2, 24);
    __ add(t1, t1, t2);
    __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb

    __ bind(DONE);
    __ pop_reg(saved_regs, sp);
    __ leave(); // Required for proper stackwalking
    __ ret();

    return start;
  }

#endif // COMPILER2

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *   c_rarg0   - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc    = c_rarg0; // crc
    const Register buf    = c_rarg1; // source java byte array address
    const Register len    = c_rarg2; // length
    const Register table0 = c_rarg3; // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;

    const Register tmp1 = c_rarg7;
    const Register tmp2 = t2;
    const Register tmp3 = x28; // t3
    const Register tmp4 = x29; // t4
    const Register tmp5 = x30; // t5
    const Register tmp6 = x31; // t6

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // All the heavy lifting is done by the table-driven kernel in the
    // macro assembler; this stub only sets up the frame around it.
    __ kernel_crc32(crc, buf, len, table0, table1, table2,
                    table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret();

    return start;
  }

  // exception handler for upcall stubs
  address generate_upcall_stub_exception_handler() {
    StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
    address start = __ pc();

    // Native caller has no idea how to handle exceptions,
    // so we just crash here. Up to callee to catch exceptions.
    __ verify_oop(x10); // return a exception oop in a0
    __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception));
    __ should_not_reach_here();

    return start;
  }

#undef __

  // Initialization
  void generate_initial_stubs() {
    // Generate initial stubs and initializes the entry points

    // entry points that exist in all platforms Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.
    StubRoutines::_forward_exception_entry = generate_forward_exception();

    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    if (UseCRC32Intrinsics) {
      // set table address before stub generation which use it
      StubRoutines::_crc_table_adr = (address)StubRoutines::riscv::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }
  }

  // Stubs needed once loom continuations are initialized.
  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw             = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
  }

  // Stubs generated last, after the universe and barrier set exist.
  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != nullptr) {
      StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
    }

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (!InlineSecondarySupersTest) {
        // One stub per hash slot of the secondary-supers table.
        for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
          StubRoutines::_lookup_secondary_supers_table_stubs[slot]
            = generate_lookup_secondary_supers_table_stub(slot);
        }
      }
    }
#endif // COMPILER2

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();

    StubRoutines::riscv::set_completed();
  }

  // Intrinsic stubs used only by the C2 compiler; each is generated only
  // when its controlling flag is enabled.
  void generate_compiler_stubs() {
#ifdef COMPILER2
    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      StubRoutines::_montgomerySquare = g.generate_square();
    }

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    if (UseRVVForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
    }

    if (UseSHA256Intrinsics) {
      Sha2Generator sha2(_masm, this);
      StubRoutines::_sha256_implCompress   = sha2.generate_sha256_implCompress(false);
      StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(true);
    }

    if (UseSHA512Intrinsics) {
      Sha2Generator sha2(_masm, this);
      StubRoutines::_sha512_implCompress   = sha2.generate_sha512_implCompress(false);
      StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(true);
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress   = generate_md5_implCompress(false, "md5_implCompress");
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
    }

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#endif // COMPILER2
  }

 public:
  // Dispatches to the generator phase selected by the VM; each phase runs
  // at a different point during startup.
  StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
    switch(kind) {
    case Initial_stubs:
      generate_initial_stubs();
      break;
    case Continuation_stubs:
      generate_continuation_stubs();
      break;
    case Compiler_stubs:
      generate_compiler_stubs();
      break;
    case Final_stubs:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected stubs kind: %d", kind);
      break;
    };
  }
}; // end class declaration

// Entry point called by the shared runtime to generate one stub phase.
void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
  StubGenerator g(code, kind);
}