/*
 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
 * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_riscv.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address            address
  //    c_rarg1:   result                          address
  //    c_rarg2:   result type                     BasicType
  //    c_rarg3:   method                          Method*
  //    c_rarg4:   (interpreter) entry point       address
  //    c_rarg5:   parameters                      intptr_t*
  //    c_rarg6:   parameter size (in words)       int
  //    c_rarg7:   thread                          Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save x1 (ra) as the return PC at the base of the frame and
  // link x8 (fp) below it as the frame pointer installing sp (x2)
  // into fp.
  //
  // we save x10-x17, which accounts for all the C arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save x5 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
  // volatile
  //
  // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
  // registers and C expects to be callee-save
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -35 [ argument word 1      ]
  // -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call
  // -33 [ saved f27             ]
  // -32 [ saved f26             ]
  // -31 [ saved f25             ]
  // -30 [ saved f24             ]
  // -29 [ saved f23             ]
  // -28 [ saved f22             ]
  // -27 [ saved f21             ]
  // -26 [ saved f20             ]
  // -25 [ saved f19             ]
  // -24 [ saved f18             ]
  // -23 [ saved f9              ]
  // -22 [ saved f8              ]
  // -21 [ saved x27             ]
  // -20 [ saved x26             ]
  // -19 [ saved x25             ]
  // -18 [ saved x24             ]
  // -17 [ saved x23             ]
  // -16 [ saved x22             ]
  // -15 [ saved x21             ]
  // -14 [ saved x20             ]
  // -13 [ saved x19             ]
  // -12 [ saved x18             ]
  // -11 [ saved x9              ]
  // -10 [ call wrapper    (x10) ]
  //  -9 [ result          (x11) ]
  //  -8 [ result type     (x12) ]
  //  -7 [ method          (x13) ]
  //  -6 [ entry point     (x14) ]
  //  -5 [ parameters      (x15) ]
  //  -4 [ parameter size  (x16) ]
  //  -3 [ thread          (x17) ]
  //  -2 [ saved fp        (x8)  ]
  //  -1 [ saved ra        (x1)  ]
  //   0 [                       ] <--- fp == saved sp (x2)

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -34,

    frm_off            = sp_after_call_off,
    f27_off            = -33,
    f26_off            = -32,
    f25_off            = -31,
    f24_off            = -30,
    f23_off            = -29,
    f22_off            = -28,
    f21_off            = -27,
    f20_off            = -26,
    f19_off            = -25,
    f18_off            = -24,
    f9_off             = -23,
    f8_off             = -22,

    x27_off            = -21,
    x26_off            = -20,
    x25_off            = -19,
    x24_off            = -18,
    x23_off            = -17,
    x22_off            = -16,
    x21_off            = -15,
    x20_off            = -14,
    x19_off            = -13,
    x18_off            = -12,
    x9_off             = -11,

    call_wrapper_off   = -10,
    result_off         = -9,
    result_type_off    = -8,
    method_off         = -7,
    entry_point_off    = -6,
    parameters_off     = -5,
    parameter_size_off = -4,
    thread_off         = -3,
    fp_f               = -2,
    retaddr_off        = -1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call (fp, sp_after_call_off  * wordSize);

    const Address frm_save      (fp, frm_off            * wordSize);
    const Address call_wrapper  (fp, call_wrapper_off   * wordSize);
    const Address result        (fp, result_off         * wordSize);
    const Address result_type   (fp, result_type_off    * wordSize);
    const Address method        (fp, method_off         * wordSize);
    const Address entry_point   (fp, entry_point_off    * wordSize);
    const Address parameters    (fp, parameters_off     * wordSize);
    const Address parameter_size(fp, parameter_size_off * wordSize);

    const Address thread        (fp, thread_off         * wordSize);

    const Address f27_save      (fp, f27_off            * wordSize);
    const Address f26_save      (fp, f26_off            * wordSize);
    const Address f25_save      (fp, f25_off            * wordSize);
    const Address f24_save      (fp, f24_off            * wordSize);
    const Address f23_save      (fp, f23_off            * wordSize);
    const Address f22_save      (fp, f22_off            * wordSize);
    const Address f21_save      (fp, f21_off            * wordSize);
    const Address f20_save      (fp, f20_off            * wordSize);
    const Address f19_save      (fp, f19_off            * wordSize);
    const Address f18_save      (fp, f18_off            * wordSize);
    const Address f9_save       (fp, f9_off             * wordSize);
    const Address f8_save       (fp, f8_off             * wordSize);

    const Address x27_save      (fp, x27_off            * wordSize);
    const Address x26_save      (fp, x26_off            * wordSize);
    const Address x25_save      (fp, x25_off            * wordSize);
    const Address x24_save      (fp, x24_off            * wordSize);
    const Address x23_save      (fp, x23_off            * wordSize);
    const Address x22_save      (fp, x22_off            * wordSize);
    const Address x21_save      (fp, x21_off            * wordSize);
    const Address x20_save      (fp, x20_off            * wordSize);
    const Address x19_save      (fp, x19_off            * wordSize);
    const Address x18_save      (fp, x18_off            * wordSize);

    const Address x9_save       (fp, x9_off             * wordSize);

    // stub code

    address riscv_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ addi(sp, fp, sp_after_call_off * wordSize);

    // save register parameters and Java temporary/global registers
    // n.b. we save thread even though it gets installed in
    // xthread because we want to sanity check tp later
    __ sd(c_rarg7, thread);
    __ sw(c_rarg6, parameter_size);
    __ sd(c_rarg5, parameters);
    __ sd(c_rarg4, entry_point);
    __ sd(c_rarg3, method);
    __ sd(c_rarg2, result_type);
    __ sd(c_rarg1, result);
    __ sd(c_rarg0, call_wrapper);

    __ sd(x9, x9_save);

    __ sd(x18, x18_save);
    __ sd(x19, x19_save);
    __ sd(x20, x20_save);
    __ sd(x21, x21_save);
    __ sd(x22, x22_save);
    __ sd(x23, x23_save);
    __ sd(x24, x24_save);
    __ sd(x25, x25_save);
    __ sd(x26, x26_save);
    __ sd(x27, x27_save);

    __ fsd(f8,  f8_save);
    __ fsd(f9,  f9_save);
    __ fsd(f18, f18_save);
    __ fsd(f19, f19_save);
    __ fsd(f20, f20_save);
    __ fsd(f21, f21_save);
    __ fsd(f22, f22_save);
    __ fsd(f23, f23_save);
    __ fsd(f24, f24_save);
    __ fsd(f25, f25_save);
    __ fsd(f26, f26_save);
    __ fsd(f27, f27_save);

    __ frrm(t0);
    __ sd(t0, frm_save);
    // Set frm to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes.
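    // (n.b. in the RISC-V F extension RNE, round-to-nearest-even, is
    // encoded as 0b000, so a zero value read from frm above means the
    // rounding mode is already the one we want and fsrmi can be skipped.)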
    Label skip_fsrmi;
    guarantee(__ RoundingMode::rne == 0, "must be");
    __ beqz(t0, skip_fsrmi);
    __ fsrmi(__ RoundingMode::rne);
    __ bind(skip_fsrmi);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mv(xthread, c_rarg7);

    // And method
    __ mv(xmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
      __ beqz(t0, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mv(esp, sp);
    __ slli(t0, c_rarg6, LogBytesPerWord);
    __ sub(t0, sp, t0); // Move SP out of the way
    __ andi(sp, t0, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ beqz(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ld(t0, Address(c_rarg5, 0));
    __ addi(c_rarg5, c_rarg5, wordSize);
    __ addi(c_rarg6, c_rarg6, -1);
    __ push_reg(t0);
    __ bgtz(c_rarg6, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      xmethod: Method*
    //      x19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mv(x19_sender_sp, sp);
    __ jalr(c_rarg4);

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in x10
    // and a floating result in j_farg0
    __ ld(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ld(j_rarg1, result_type);
    __ mv(t0, (u1)T_OBJECT);
    __ beq(j_rarg1, t0, is_long);
    __ mv(t0, (u1)T_LONG);
    __ beq(j_rarg1, t0, is_long);
    __ mv(t0, (u1)T_FLOAT);
    __ beq(j_rarg1, t0, is_float);
    __ mv(t0, (u1)T_DOUBLE);
    __ beq(j_rarg1, t0, is_double);

    // handle T_INT case
    __ sw(x10, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ addi(esp, fp, sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ld(t0, thread);
      __ bne(xthread, t0, S);
      __ get_thread(t0);
      __ beq(xthread, t0, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(xthread);

    // restore callee-save registers
    __ fld(f27, f27_save);
    __ fld(f26, f26_save);
    __ fld(f25, f25_save);
    __ fld(f24, f24_save);
    __ fld(f23, f23_save);
    __ fld(f22, f22_save);
    __ fld(f21, f21_save);
    __ fld(f20, f20_save);
    __ fld(f19, f19_save);
    __ fld(f18, f18_save);
    __ fld(f9,  f9_save);
    __ fld(f8,  f8_save);

    __ ld(x27, x27_save);
    __ ld(x26, x26_save);
    __ ld(x25, x25_save);
    __ ld(x24, x24_save);
    __ ld(x23, x23_save);
    __ ld(x22, x22_save);
    __ ld(x21, x21_save);
    __ ld(x20, x20_save);
    __ ld(x19, x19_save);
    __ ld(x18, x18_save);

    __ ld(x9, x9_save);

    // restore frm
    Label skip_fsrm;
    __ ld(t0, frm_save);
    __ frrm(t1);
    __ beq(t0, t1, skip_fsrm);
    __ fsrm(t0);
    __ bind(skip_fsrm);

    __ ld(c_rarg0, call_wrapper);
    __ ld(c_rarg1, result);
    __ ld(c_rarg2, result_type);
    __ ld(c_rarg3, method);
    __ ld(c_rarg4, entry_point);
    __ ld(c_rarg5, parameters);
    __ ld(c_rarg6, parameter_size);
    __ ld(c_rarg7, thread);

    // leave frame and return to caller
    __ leave();
    __ ret();

    // handle return types different from T_INT

    __ BIND(is_long);
    __ sd(x10, Address(j_rarg2, 0));
    __ j(exit);

    __ BIND(is_float);
    __ fsw(j_farg0, Address(j_rarg2, 0), t0);
    __ j(exit);

    __ BIND(is_double);
    __ fsd(j_farg0, Address(j_rarg2, 0), t0);
    __ j(exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code. The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // sp.
  //
  // x10: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address thread(fp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ld(t0, thread);
      __ bne(xthread, t0, S);
      __ get_thread(t0);
      __ beq(xthread, t0, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(x10);

    __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
    __ mv(t0, (address)__FILE__);
    __ sd(t0, Address(xthread, Thread::exception_file_offset()));
    __ mv(t0, (int)__LINE__);
    __ sw(t0, Address(xthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ j(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception. The pending exception check happened in the runtime
  // or native call stub. The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // x10: exception
  // x13: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in RA !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, RA points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them. A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
      __ bnez(t0, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into x9

    // call the VM to find the handler address associated with the
    // caller address. pass thread in x10 and caller pc (ret address)
    // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
    // the stack.
    __ mv(c_rarg1, ra);
    // ra will be trashed by the VM call so we move it to x9
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mv(x9, ra);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                                     SharedRuntime::exception_handler_for_return_address),
                    xthread, c_rarg1);
    // we should not really care that ra is no longer the callee
    // address. we saved the value the handler needs in x9 so we can
    // just copy it to x13. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore ra here to satisfy that assert.
    __ mv(ra, x9);
    // setup x10 & x13 & clear pending exception
    __ mv(x13, x9);
    __ mv(x9, x10);
    __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
    __ sd(zr, Address(xthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ bnez(x10, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // x10: exception
    // x13: throwing pc
    // x9: exception handler
    __ verify_oop(x10);
    __ jr(x9);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    x10: oop to verify
  //    t0: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved ra
  //    [tos + 3]: saved t1
  //    [tos + 4]: saved x10
  //    [tos + 5]: saved t0
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3

    __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ld(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ sd(c_rarg3, Address(c_rarg2));

    // object is in x10
    // make sure object is 'reasonable'
    __ beqz(x10, exit); // if obj is null it is OK

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3
    __ ret();

    // handle errors
    __ bind(error);
    __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3

    __ push_reg(RegSet::range(x0, x31), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mv(c_rarg0, t0);             // pass address of error message
    __ mv(c_rarg1, ra);             // pass return address
    __ mv(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ ebreak();

    return start;
  }

  // The inner part of zero_words().
  //
  // Inputs:
  // x28: the HeapWord-aligned base address of an array to zero.
  // x29: the count in HeapWords, x29 > 0.
  //
  // Returns x28 and x29, adjusted for the caller to clear.
  // x28: the base address of the tail of words left to clear.
  // x29: the number of words in the tail.
  //      x29 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;

    const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      // Ensure count >= 2*CacheLineSize so that it still deserves a cbo.zero
      // after alignment.
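      // (cbo.zero is provided by the Zicboz extension and zeroes one cache
      // line per instruction; zero_dcache_blocks aligns the base to a cache
      // line first, hence the requirement below of at least two cache
      // lines' worth of words.)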
      Label small;
      int low_limit = MAX2(2 * CacheLineSize, BlockZeroingLowLimit) / wordSize;
      __ mv(tmp1, low_limit);
      __ blt(cnt, tmp1, small);
      __ zero_dcache_blocks(base, cnt, tmp1, tmp2);
      __ bind(small);
    }

    {
      // Clear the remaining blocks.
      Label loop;
      __ mv(tmp1, MacroAssembler::zero_words_block_size);
      __ blt(cnt, tmp1, done);
      __ bind(loop);
      for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
        __ sd(zr, Address(base, i * wordSize));
      }
      __ add(base, base, MacroAssembler::zero_words_block_size * wordSize);
      __ sub(cnt, cnt, MacroAssembler::zero_words_block_size);
      __ bge(cnt, tmp1, loop);
      __ bind(done);
    }

    __ ret();

    return start;
  }

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy. The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = wordSize;

    const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
      tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;

    const Register stride = x30;

    assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
                               tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
    assert_different_registers(s, d, count, t0);

    Label again, drain;
    const char* stub_name = nullptr;
    if (direction == copy_forwards) {
      stub_name = "forward_copy_longs";
    } else {
      stub_name = "backward_copy_longs";
    }
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;

      __ mv(t0, 8);
      __ bge(count, t0, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    __ ld(tmp_reg0, Address(s, 1 * unit));
    __ ld(tmp_reg1, Address(s, 2 * unit));
    __ ld(tmp_reg2, Address(s, 3 * unit));
    __ ld(tmp_reg3, Address(s, 4 * unit));
    __ ld(tmp_reg4, Address(s, 5 * unit));
    __ ld(tmp_reg5, Address(s, 6 * unit));
    __ ld(tmp_reg6, Address(s, 7 * unit));
    __ ld(tmp_reg7, Address(s, 8 * unit));
    __ addi(s, s, 8 * unit);

    __ sub(count, count, 16);
    __ bltz(count, drain);

    __ bind(again);

    __ sd(tmp_reg0, Address(d, 1 * unit));
    __ sd(tmp_reg1, Address(d, 2 * unit));
    __ sd(tmp_reg2, Address(d, 3 * unit));
    __ sd(tmp_reg3, Address(d, 4 * unit));
    __ sd(tmp_reg4, Address(d, 5 * unit));
    __ sd(tmp_reg5, Address(d, 6 * unit));
    __ sd(tmp_reg6, Address(d, 7 * unit));
    __ sd(tmp_reg7, Address(d, 8 * unit));

    __ ld(tmp_reg0, Address(s, 1 * unit));
    __ ld(tmp_reg1, Address(s, 2 * unit));
    __ ld(tmp_reg2, Address(s, 3 * unit));
    __ ld(tmp_reg3, Address(s, 4 * unit));
    __ ld(tmp_reg4, Address(s, 5 * unit));
    __ ld(tmp_reg5, Address(s, 6 * unit));
    __ ld(tmp_reg6, Address(s, 7 * unit));
    __ ld(tmp_reg7, Address(s, 8 * unit));

    __ addi(s, s, 8 * unit);
    __ addi(d, d, 8 * unit);

    __ sub(count, count, 8);
    __ bgez(count, again);

    // Drain
    __ bind(drain);

    __ sd(tmp_reg0, Address(d, 1 * unit));
    __ sd(tmp_reg1, Address(d, 2 * unit));
    __ sd(tmp_reg2, Address(d, 3 * unit));
    __ sd(tmp_reg3, Address(d, 4 * unit));
    __ sd(tmp_reg4, Address(d, 5 * unit));
    __ sd(tmp_reg5, Address(d, 6 * unit));
    __ sd(tmp_reg6, Address(d, 7 * unit));
    __ sd(tmp_reg7, Address(d, 8 * unit));
    __ addi(d, d, 8 * unit);

    {
      Label L1, L2;
      __ test_bit(t0, count, 2);
      __ beqz(t0, L1);

      __ ld(tmp_reg0, Address(s, 1 * unit));
      __ ld(tmp_reg1, Address(s, 2 * unit));
      __ ld(tmp_reg2, Address(s, 3 * unit));
      __ ld(tmp_reg3, Address(s, 4 * unit));
      __ addi(s, s, 4 * unit);

      __ sd(tmp_reg0, Address(d, 1 * unit));
      __ sd(tmp_reg1, Address(d, 2 * unit));
      __ sd(tmp_reg2, Address(d, 3 * unit));
      __ sd(tmp_reg3, Address(d, 4 * unit));
      __ addi(d, d, 4 * unit);

      __ bind(L1);

      if (direction == copy_forwards) {
        __ addi(s, s, bias);
        __ addi(d, d, bias);
      }

      __ test_bit(t0, count, 1);
      __ beqz(t0, L2);
      if (direction == copy_backwards) {
        __ addi(s, s, 2 * unit);
        __ ld(tmp_reg0, Address(s));
        __ ld(tmp_reg1, Address(s, wordSize));
        __ addi(d, d, 2 * unit);
        __ sd(tmp_reg0, Address(d));
        __ sd(tmp_reg1, Address(d, wordSize));
      } else {
        __ ld(tmp_reg0, Address(s));
        __ ld(tmp_reg1, Address(s, wordSize));
        __ addi(s, s, 2 * unit);
        __ sd(tmp_reg0, Address(d));
        __ sd(tmp_reg1, Address(d, wordSize));
        __ addi(d, d, 2 * unit);
      }
      __ bind(L2);
    }

    __ ret();
  }

  Label copy_f, copy_b;

  typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);

  void copy_memory_v(Register s, Register d, Register count, int step) {
    bool is_backward = step < 0;
    int granularity = uabs(step);

    const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
    assert_different_registers(s, d, cnt, vl, tmp1, tmp2);
    Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
    Label loop_forward, loop_backward, done;

    __ mv(dst, d);
    __ mv(src, s);
    __ mv(cnt, count);

    __ bind(loop_forward);
    __ vsetvli(vl, cnt, sew, Assembler::m8);
    if (is_backward) {
      __ bne(vl, cnt, loop_backward);
    }

    __ vlex_v(v0, src, sew);
    __ sub(cnt, cnt, vl);
    if (sew != Assembler::e8) {
      // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
      __ slli(vl, vl, sew);
    }
    __ add(src, src, vl);

    __ vsex_v(v0, dst, sew);
    __ add(dst, dst, vl);
    __ bnez(cnt, loop_forward);

    if (is_backward) {
      __ j(done);

      __ bind(loop_backward);
      __ sub(t0, cnt, vl);
      if (sew != Assembler::e8) {
        // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
        __ slli(t0, t0, sew);
      }
      __ add(tmp1, s, t0);
      __ vlex_v(v0, tmp1, sew);
      __ add(tmp2, d, t0);
      __ vsex_v(v0, tmp2, sew);
      __ sub(cnt, cnt, vl);
      __ bnez(cnt, loop_forward);
      __ bind(done);
    }
  }

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d. The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.
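  // (A negative step copies backwards: the source and destination cursors
  // start past the end of the range and walk downwards, which is what a
  // conjoint copy with an overlapping, higher destination requires.)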
  //
  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
      return copy_memory_v(s, d, count, step);
    }

    bool is_backwards = step < 0;
    int granularity = uabs(step);

    const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
    const Register gct1 = x28, gct2 = x29, gct3 = t2;

    Label same_aligned;
    Label copy_big, copy32_loop, copy8_loop, copy_small, done;

    // The size of copy32_loop body increases significantly with ZGC GC barriers.
    // Need conditional far branches to reach a point beyond the loop in this case.
    bool is_far = UseZGC && ZGenerational;

    __ beqz(count, done, is_far);
    __ slli(cnt, count, exact_log2(granularity));
    if (is_backwards) {
      __ add(src, s, cnt);
      __ add(dst, d, cnt);
    } else {
      __ mv(src, s);
      __ mv(dst, d);
    }

    if (is_aligned) {
      __ addi(t0, cnt, -32);
      __ bgez(t0, copy32_loop);
      __ addi(t0, cnt, -8);
      __ bgez(t0, copy8_loop, is_far);
      __ j(copy_small);
    } else {
      __ mv(t0, 16);
      __ blt(cnt, t0, copy_small, is_far);

      __ xorr(t0, src, dst);
      __ andi(t0, t0, 0b111);
      __ bnez(t0, copy_small, is_far);

      __ bind(same_aligned);
      __ andi(t0, src, 0b111);
      __ beqz(t0, copy_big);
      if (is_backwards) {
        __ addi(src, src, step);
        __ addi(dst, dst, step);
      }
      bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
      bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
      if (!is_backwards) {
        __ addi(src, src, step);
        __ addi(dst, dst, step);
      }
      __ addi(cnt, cnt, -granularity);
      __ beqz(cnt, done, is_far);
      __ j(same_aligned);

      __ bind(copy_big);
      __ mv(t0, 32);
      __ blt(cnt, t0, copy8_loop, is_far);
    }

    __ bind(copy32_loop);
    if (is_backwards) {
      __ addi(src, src, -wordSize * 4);
      __ addi(dst, dst, -wordSize * 4);
    }
    // we first load 32 bytes, then write it, so the direction here doesn't matter
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src),     gct1);
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8),  gct1);
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);

    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst),     tmp3, gct1, gct2, gct3);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8),  tmp4, gct1, gct2, gct3);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);

    if (!is_backwards) {
      __ addi(src, src, wordSize * 4);
      __ addi(dst, dst, wordSize * 4);
    }
    __ addi(t0, cnt, -(32 + wordSize * 4));
    __ addi(cnt, cnt, -wordSize * 4);
    __ bgez(t0, copy32_loop); // cnt >= 32, do next loop

    __ beqz(cnt, done); // if that's all - done

    __ addi(t0, cnt, -8); // if not - copy the remainder
    __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop

    __ bind(copy8_loop);
    if (is_backwards) {
      __ addi(src, src, -wordSize);
      __ addi(dst, dst, -wordSize);
    }
    bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
    bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);

    if (!is_backwards) {
      __ addi(src, src, wordSize);
      __ addi(dst, dst, wordSize);
    }
    __ addi(t0, cnt, -(8 + wordSize));
    __ addi(cnt, cnt, -wordSize);
    __ bgez(t0, copy8_loop); // cnt >= 8, do next loop

    __ beqz(cnt, done); // if that's all - done

    __ bind(copy_small);
    if (is_backwards) {
      __ addi(src, src, step);
      __ addi(dst, dst, step);
    }

    bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
    bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);

    if (!is_backwards) {
      __ addi(src, src, step);
      __ addi(dst, dst, step);
    }
    __ addi(cnt, cnt, -granularity);
    __ bgtz(cnt, copy_small);

    __ bind(done);
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers t0 and t1.
  void verify_oop_array(size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mv(t1, zr);
    __ slli(t0, count, exact_log2(size));
    __ bind(loop);
    __ bgeu(t1, t0, end);

    __ add(temp, a, t1);
    if (size == (size_t)wordSize) {
      __ ld(temp, Address(temp, 0));
      __ verify_oop(temp);
    } else {
      __ lwu(temp, Address(temp, 0));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(t1, t1, size);
    __ j(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it. The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
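  // (n.b. on success every stub generated here returns 0 in x10; only the
  // checkcast stub can report a partial-transfer count.)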
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address* entry,
                                 const char* name, bool dest_uninitialized = false) {
    const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push_reg(RegSet::of(d, count), sp);
    }

    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop_reg(RegSet::of(d, count), sp);
      if (VerifyOops) {
        verify_oop_array(size, d, count, t2);
      }
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());

    __ leave();
    __ mv(x10, zr); // return 0
    __ ret();
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it. The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
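  // (If the regions turn out not to overlap, i.e. (d - s) viewed as an
  // unsigned value is at least count * size, the stub tail-jumps to
  // nooverlap_target and copies forwards instead.)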
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address* entry, const char* name,
                                 bool dest_uninitialized = false) {
    const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(t0, d, s);
    __ slli(t1, count, exact_log2(size));
    Label L_continue;
    __ bltu(t0, t1, L_continue);
    __ j(nooverlap_target);
    __ bind(L_continue);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push_reg(RegSet::of(d, count), sp);
    }

    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
    }

    if (is_oop) {
      __ pop_reg(RegSet::of(d, count), sp);
      if (VerifyOops) {
        verify_oop_array(size, d, count, t2);
      }
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
    __ leave();
    __ mv(x10, zr); // return 0
    __ ret();
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it. The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char* name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it. The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char* name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it. The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char* name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it. The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address* entry, const char* name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it. The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address* entry,
                                     const char* name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it. The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address* entry, const char* name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address* entry,
                                      const char* name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address* entry,
                                      const char* name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address* entry,
                                     const char* name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address* entry,
                                     const char* name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }

  // Helper for generating a dynamic type check.
  // Smashes t0, t1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  // Generate checkcasting array copy stub
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //   c_rarg3   - size_t ckoff (super_check_offset)
  //   c_rarg4   - oop ckval (super_klass)
  //
  // Output:
  //   x10 ==  0   - success
  //   x10 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char* name, address* entry,
                                  bool dest_uninitialized = false) {
    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from  = c_rarg0; // source array address
    const Register to    = c_rarg1; // destination array address
    const Register count = c_rarg2; // elements count
    const Register ckoff = c_rarg3; // super_check_offset
    const Register ckval = c_rarg4; // super_klass

    RegSet wb_pre_saved_regs  = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (x7, x9, x18 are save-on-entry)
    const Register count_save = x19; // orig elements count
    const Register start_to   = x18; // destination array start address
    const Register copied_oop = x7;  // actual oop copied
    const Register r9_klass   = x9;  // oop._klass

    // Registers used as gc temps (x15, x16, x17 are save-on-call)
    const Register gct1 = x15, gct2 = x16, gct3 = x17;

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type. Each element must be separately
    // checked.
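    // (This is the slow, element-at-a-time path: each oop is loaded,
    // type-checked against ckval/ckoff, and only then stored, so a failed
    // check still leaves the already-copied destination prefix fully
    // initialized.)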

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r9_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // Caller of this entry point must set up the argument registers.
    if (entry != nullptr) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array: Nothing to do
    __ beqz(count, L_done);

    __ push_reg(RegSet::of(x7, x9, x18, x19), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ lwu(start_to, Address(ckval, sco_offset));
      __ beq(ckoff, start_to, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }

    bool is_oop = true;
    int element_size = UseCompressedOops ? 4 : 8;

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);

    // save the original count
    __ mv(count_save, count);

    // Copy from low to high addresses
    __ mv(start_to, to); // Save destination array start address
    __ j(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for count to 0 do
    //     copied_oop = load_heap_oop(from++)
    //     ... generate_type_check ...
    //     store_heap_oop(to++, copied_oop)
    //   end

    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
                      Address(to, 0), copied_oop,
                      gct1, gct2, gct3);
    __ add(to, to, UseCompressedOops ? 4 : 8);
    __ sub(count, count, 1);
    __ beqz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
                     copied_oop, Address(from, 0),
                     gct1);
    __ add(from, from, UseCompressedOops ? 4 : 8);
    __ beqz(copied_oop, L_store_element);

    __ load_klass(r9_klass, copied_oop); // query the object klass
    generate_type_check(r9_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.
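    // (The failure value is the bitwise complement of the number of
    // elements actually copied: -1 ^ K == ~K, computed below with xori.)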

    __ sub(count, count_save, count); // K = partially copied oop count
    __ xori(count, count, -1);        // report (-1^K) to caller
    __ beqz(count, L_done_pop);

    __ BIND(L_do_card_marks);
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mv(x10, count);
    __ leave();
    __ ret();

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(t0, temp);

    // if [src_pos + length > arrayOop(src)->length()] then FAIL
    __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ bgtu(temp, t0, L_failed);

    // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
    __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ bgtu(temp, t0, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ zero_extend(src_pos, src_pos, 32);
    __ zero_extend(dst_pos, dst_pos, 32);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  //
  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
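  // (The dispatch below only tests the alignment of src | dst | count, so
  // the widest element size it chooses always divides all three values.)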
  //
  address generate_unsafe_copy(const char* name,
                               address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {
    assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
                int_copy_entry != nullptr && long_copy_entry != nullptr);
    Label L_long_aligned, L_int_aligned, L_short_aligned;
    const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    __ orr(t0, s, d);
    __ orr(t0, t0, count);

    __ andi(t0, t0, BytesPerLong - 1);
    __ beqz(t0, L_long_aligned);
    __ andi(t0, t0, BytesPerInt - 1);
    __ beqz(t0, L_int_aligned);
    __ test_bit(t0, t0, 0);
    __ beqz(t0, L_short_aligned);
    __ j(RuntimeAddress(byte_copy_entry));

    __ BIND(L_short_aligned);
    __ srli(count, count, LogBytesPerShort); // size => short_count
    __ j(RuntimeAddress(short_copy_entry));
    __ BIND(L_int_aligned);
    __ srli(count, count, LogBytesPerInt); // size => int_count
    __ j(RuntimeAddress(int_copy_entry));
    __ BIND(L_long_aligned);
    __ srli(count, count, LogBytesPerLong); // size => long_count
    __ j(RuntimeAddress(long_copy_entry));

    return start;
  }

  //
  // Generate generic array copy stubs
  //
  // Input:
  //   c_rarg0    -  src oop
  //   c_rarg1    -  src_pos (32-bits)
  //   c_rarg2    -  dst oop
  //   c_rarg3    -  dst_pos (32-bits)
  //   c_rarg4    -  element count (32-bits)
  //
  // Output:
  //   x10 ==  0   - success
  //   x10 == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(const char* name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {
    assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
                int_copy_entry != nullptr && oop_copy_entry != nullptr &&
                long_copy_entry != nullptr && checkcast_copy_entry != nullptr);
    Label L_failed, L_failed_0, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src     = c_rarg0; // source array oop
    const Register src_pos = c_rarg1; // source position
    const Register dst     = c_rarg2; // destination array oop
    const Register dst_pos = c_rarg3; // destination position
    const Register length  = c_rarg4;

    // Registers used as temps
    const Register dst_klass = c_rarg5;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", name);

    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
1760 // (5) src klass and dst klass should be the same and not null.
1761 // (6) src and dst should be arrays.
1762 // (7) src_pos + length must not exceed length of src.
1763 // (8) dst_pos + length must not exceed length of dst.
1764 //
1765
1766 // if src is null then return -1
1767 __ beqz(src, L_failed);
1768
1769 // if [src_pos < 0] then return -1
1770 __ sign_extend(t0, src_pos, 32);
1771 __ bltz(t0, L_failed);
1772
1773 // if dst is null then return -1
1774 __ beqz(dst, L_failed);
1775
1776 // if [dst_pos < 0] then return -1
1777 __ sign_extend(t0, dst_pos, 32);
1778 __ bltz(t0, L_failed);
1779
1780 // registers used as temp
1781 const Register scratch_length = x28; // elements count to copy
1782 const Register scratch_src_klass = x29; // array klass
1783 const Register lh = x30; // layout helper
1784
1785 // if [length < 0] then return -1
1786 __ sign_extend(scratch_length, length, 32); // length (elements count, 32-bit value)
1787 __ bltz(scratch_length, L_failed);
1788
1789 __ load_klass(scratch_src_klass, src);
1790 #ifdef ASSERT
1791 {
1792 BLOCK_COMMENT("assert klasses not null {");
1793 Label L1, L2;
1794 __ bnez(scratch_src_klass, L2); // it is broken if klass is null
1795 __ bind(L1);
1796 __ stop("broken null klass");
1797 __ bind(L2);
1798 __ load_klass(t0, dst, t1);
1799 __ beqz(t0, L1); // this would be broken also
1800 BLOCK_COMMENT("} assert klasses not null done");
1801 }
1802 #endif
1803
1804 // Load layout helper (32-bits)
1805 //
1806 //  |array_tag|     | header_size | element_type |     |log2_element_size|
1807 // 32        30    24            16              8     2                 0
1808 //
1809 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1810 //
1811
1812 const int lh_offset = in_bytes(Klass::layout_helper_offset());
1813
1814 // Handle objArrays completely differently...
1815 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1816 __ lw(lh, Address(scratch_src_klass, lh_offset));
1817 __ mv(t0, objArray_lh);
1818 __ beq(lh, t0, L_objArray);
1819
1820 // if [src->klass() != dst->klass()] then return -1
1821 __ load_klass(t1, dst);
1822 __ bne(t1, scratch_src_klass, L_failed);
1823
1824 // if src is not an array then return -1,
1825 // i.e. (lh >= 0) means non-array
1826 __ bgez(lh, L_failed);
1827
1828 // At this point, it is known to be a typeArray (array_tag 0x3).
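// The layout helper fields used below decode, in C terms, roughly as
// (an illustrative sketch; Klass::layout_helper_*() are the real accessors):
//
//   int hsize  = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
//   int elsize = lh & Klass::_lh_log2_element_size_mask; // log2 of the element size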
1829 #ifdef ASSERT
1830 {
1831 BLOCK_COMMENT("assert primitive array {");
1832 Label L;
1833 __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1834 __ bge(lh, t1, L);
1835 __ stop("must be a primitive array");
1836 __ bind(L);
1837 BLOCK_COMMENT("} assert primitive array done");
1838 }
1839 #endif
1840
1841 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1842 t1, L_failed);
1843
1844 // TypeArrayKlass
1845 //
1846 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1847 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1848 //
1849
1850 const Register t0_offset = t0; // array offset
1851 const Register x30_elsize = lh; // element size
1852
1853 // Get array_header_in_bytes()
1854 int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1855 int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
1856 __ slli(t0_offset, lh, XLEN - lh_header_size_msb); // shift left to drop the bits above the header size field
1857 __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset
1858
1859 __ add(src, src, t0_offset); // src array offset
1860 __ add(dst, dst, t0_offset); // dst array offset
1861 BLOCK_COMMENT("choose copy loop based on element size");
1862
1863 // the following registers must be set before jumping to the corresponding stub
1864 const Register from = c_rarg0; // source array address
1865 const Register to = c_rarg1; // destination array address
1866 const Register count = c_rarg2; // elements count
1867
1868 // 'from', 'to' and 'count' must be assigned in this order, since they
1869 // occupy the same registers as 'src', 'src_pos' and 'dst'.
1870
1871 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1872
1873 // The possible values of elsize are 0-3, i.e. exact_log2(element
1874 // size in bytes). We do a simple bitwise binary search.
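// That is, in C terms (illustrative only):
//
//   if (elsize & 2) { (elsize & 1) ? long_copy()  : int_copy();  }
//   else            { (elsize & 1) ? short_copy() : byte_copy(); }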
1875 __ BIND(L_copy_bytes); 1876 __ test_bit(t0, x30_elsize, 1); 1877 __ bnez(t0, L_copy_ints); 1878 __ test_bit(t0, x30_elsize, 0); 1879 __ bnez(t0, L_copy_shorts); 1880 __ add(from, src, src_pos); // src_addr 1881 __ add(to, dst, dst_pos); // dst_addr 1882 __ sign_extend(count, scratch_length, 32); // length 1883 __ j(RuntimeAddress(byte_copy_entry)); 1884 1885 __ BIND(L_copy_shorts); 1886 __ shadd(from, src_pos, src, t0, 1); // src_addr 1887 __ shadd(to, dst_pos, dst, t0, 1); // dst_addr 1888 __ sign_extend(count, scratch_length, 32); // length 1889 __ j(RuntimeAddress(short_copy_entry)); 1890 1891 __ BIND(L_copy_ints); 1892 __ test_bit(t0, x30_elsize, 0); 1893 __ bnez(t0, L_copy_longs); 1894 __ shadd(from, src_pos, src, t0, 2); // src_addr 1895 __ shadd(to, dst_pos, dst, t0, 2); // dst_addr 1896 __ sign_extend(count, scratch_length, 32); // length 1897 __ j(RuntimeAddress(int_copy_entry)); 1898 1899 __ BIND(L_copy_longs); 1900 #ifdef ASSERT 1901 { 1902 BLOCK_COMMENT("assert long copy {"); 1903 Label L; 1904 __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize 1905 __ sign_extend(lh, lh, 32); 1906 __ mv(t0, LogBytesPerLong); 1907 __ beq(x30_elsize, t0, L); 1908 __ stop("must be long copy, but elsize is wrong"); 1909 __ bind(L); 1910 BLOCK_COMMENT("} assert long copy done"); 1911 } 1912 #endif 1913 __ shadd(from, src_pos, src, t0, 3); // src_addr 1914 __ shadd(to, dst_pos, dst, t0, 3); // dst_addr 1915 __ sign_extend(count, scratch_length, 32); // length 1916 __ j(RuntimeAddress(long_copy_entry)); 1917 1918 // ObjArrayKlass 1919 __ BIND(L_objArray); 1920 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 1921 1922 Label L_plain_copy, L_checkcast_copy; 1923 // test array classes for subtyping 1924 __ load_klass(t2, dst); 1925 __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality 1926 1927 // Identically typed arrays can be copied without element-wise checks. 1928 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1929 t1, L_failed); 1930 1931 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop); 1932 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1933 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop); 1934 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1935 __ sign_extend(count, scratch_length, 32); // length 1936 __ BIND(L_plain_copy); 1937 __ j(RuntimeAddress(oop_copy_entry)); 1938 1939 __ BIND(L_checkcast_copy); 1940 // live at this point: scratch_src_klass, scratch_length, t2 (dst_klass) 1941 { 1942 // Before looking at dst.length, make sure dst is also an objArray. 1943 __ lwu(t0, Address(t2, lh_offset)); 1944 __ mv(t1, objArray_lh); 1945 __ bne(t0, t1, L_failed); 1946 1947 // It is safe to examine both src.length and dst.length. 1948 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1949 t2, L_failed); 1950 1951 __ load_klass(dst_klass, dst); // reload 1952 1953 // Marshal the base address arguments now, freeing registers. 1954 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop); 1955 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1956 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop); 1957 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1958 __ sign_extend(count, length, 32); // length (reloaded) 1959 const Register sco_temp = c_rarg3; // this register is free now 1960 assert_different_registers(from, to, count, sco_temp, 1961 dst_klass, scratch_src_klass); 1962 1963 // Generate the type check. 
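// (Roughly: if the source array klass is a subtype of the destination
// array klass, the copy needs no per-element checks and we jump to
// L_plain_copy; otherwise we fall through and set up the per-element
// checkcast copy below. An illustrative reading of generate_type_check.)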
1964 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
1965 __ lwu(sco_temp, Address(dst_klass, sco_offset));
1966
1967 // Smashes t0, t1
1968 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
1969
1970 // Fetch destination element klass from the ObjArrayKlass header.
1971 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1972 __ ld(dst_klass, Address(dst_klass, ek_offset));
1973 __ lwu(sco_temp, Address(dst_klass, sco_offset));
1974
1975 // the checkcast_copy loop needs two extra arguments:
1976 assert(c_rarg3 == sco_temp, "#3 already in place");
1977 // Set up arguments for checkcast_copy_entry.
1978 __ mv(c_rarg4, dst_klass); // dst.klass.element_klass
1979 __ j(RuntimeAddress(checkcast_copy_entry));
1980 }
1981
1982 __ BIND(L_failed);
1983 __ mv(x10, -1);
1984 __ leave(); // required for proper stackwalking of RuntimeStub frame
1985 __ ret();
1986
1987 return start;
1988 }
1989
1990 //
1991 // Generate stub for array fill. If "aligned" is true, the
1992 // "to" address is assumed to be heapword aligned.
1993 //
1994 // Arguments for generated stub:
1995 // to: c_rarg0
1996 // value: c_rarg1
1997 // count: c_rarg2 treated as signed
1998 //
1999 address generate_fill(BasicType t, bool aligned, const char* name) {
2000 __ align(CodeEntryAlignment);
2001 StubCodeMark mark(this, "StubRoutines", name);
2002 address start = __ pc();
2003
2004 BLOCK_COMMENT("Entry:");
2005
2006 const Register to = c_rarg0; // destination array address
2007 const Register value = c_rarg1; // value
2008 const Register count = c_rarg2; // elements count
2009
2010 const Register bz_base = x28; // base for block_zero routine
2011 const Register cnt_words = x29; // temp register
2012 const Register tmp_reg = t1;
2013
2014 __ enter();
2015
2016 Label L_fill_elements, L_exit1;
2017
2018 int shift = -1;
2019 switch (t) {
2020 case T_BYTE:
2021 shift = 0;
2022
2023 // Zero extend value
2024 // 8 bit -> 16 bit
2025 __ andi(value, value, 0xff);
2026 __ mv(tmp_reg, value);
2027 __ slli(tmp_reg, tmp_reg, 8);
2028 __ orr(value, value, tmp_reg);
2029
2030 // 16 bit -> 32 bit
2031 __ mv(tmp_reg, value);
2032 __ slli(tmp_reg, tmp_reg, 16);
2033 __ orr(value, value, tmp_reg);
2034
2035 __ mv(tmp_reg, 8 >> shift); // Short arrays (< 8 bytes) are filled by element
2036 __ bltu(count, tmp_reg, L_fill_elements);
2037 break;
2038 case T_SHORT:
2039 shift = 1;
2040 // Zero extend value
2041 // 16 bit -> 32 bit
2042 __ andi(value, value, 0xffff);
2043 __ mv(tmp_reg, value);
2044 __ slli(tmp_reg, tmp_reg, 16);
2045 __ orr(value, value, tmp_reg);
2046
2047 // Short arrays (< 8 bytes) are filled by element
2048 __ mv(tmp_reg, 8 >> shift);
2049 __ bltu(count, tmp_reg, L_fill_elements);
2050 break;
2051 case T_INT:
2052 shift = 2;
2053
2054 // Short arrays (< 8 bytes) are filled by element
2055 __ mv(tmp_reg, 8 >> shift);
2056 __ bltu(count, tmp_reg, L_fill_elements);
2057 break;
2058 default: ShouldNotReachHere();
2059 }
2060
2061 // Align the destination address to an 8-byte boundary.
2062 Label L_skip_align1, L_skip_align2, L_skip_align4;
2063 if (!aligned) {
2064 switch (t) {
2065 case T_BYTE:
2066 // One byte misalignment happens only for byte arrays.
2067 __ test_bit(t0, to, 0);
2068 __ beqz(t0, L_skip_align1);
2069 __ sb(value, Address(to, 0));
2070 __ addi(to, to, 1);
2071 __ addiw(count, count, -1);
2072 __ bind(L_skip_align1);
2073 // Fallthrough
2074 case T_SHORT:
2075 // Two-byte misalignment happens only for byte and short (char) arrays.
2076 __ test_bit(t0, to, 1); 2077 __ beqz(t0, L_skip_align2); 2078 __ sh(value, Address(to, 0)); 2079 __ addi(to, to, 2); 2080 __ addiw(count, count, -(2 >> shift)); 2081 __ bind(L_skip_align2); 2082 // Fallthrough 2083 case T_INT: 2084 // Align to 8 bytes, we know we are 4 byte aligned to start. 2085 __ test_bit(t0, to, 2); 2086 __ beqz(t0, L_skip_align4); 2087 __ sw(value, Address(to, 0)); 2088 __ addi(to, to, 4); 2089 __ addiw(count, count, -(4 >> shift)); 2090 __ bind(L_skip_align4); 2091 break; 2092 default: ShouldNotReachHere(); 2093 } 2094 } 2095 2096 // 2097 // Fill large chunks 2098 // 2099 __ srliw(cnt_words, count, 3 - shift); // number of words 2100 2101 // 32 bit -> 64 bit 2102 __ andi(value, value, 0xffffffff); 2103 __ mv(tmp_reg, value); 2104 __ slli(tmp_reg, tmp_reg, 32); 2105 __ orr(value, value, tmp_reg); 2106 2107 __ slli(tmp_reg, cnt_words, 3 - shift); 2108 __ subw(count, count, tmp_reg); 2109 { 2110 __ fill_words(to, cnt_words, value); 2111 } 2112 2113 // Remaining count is less than 8 bytes. Fill it by a single store. 2114 // Note that the total length is no less than 8 bytes. 2115 if (t == T_BYTE || t == T_SHORT) { 2116 __ beqz(count, L_exit1); 2117 __ shadd(to, count, to, tmp_reg, shift); // points to the end 2118 __ sd(value, Address(to, -8)); // overwrite some elements 2119 __ bind(L_exit1); 2120 __ leave(); 2121 __ ret(); 2122 } 2123 2124 // Handle copies less than 8 bytes. 2125 Label L_fill_2, L_fill_4, L_exit2; 2126 __ bind(L_fill_elements); 2127 switch (t) { 2128 case T_BYTE: 2129 __ test_bit(t0, count, 0); 2130 __ beqz(t0, L_fill_2); 2131 __ sb(value, Address(to, 0)); 2132 __ addi(to, to, 1); 2133 __ bind(L_fill_2); 2134 __ test_bit(t0, count, 1); 2135 __ beqz(t0, L_fill_4); 2136 __ sh(value, Address(to, 0)); 2137 __ addi(to, to, 2); 2138 __ bind(L_fill_4); 2139 __ test_bit(t0, count, 2); 2140 __ beqz(t0, L_exit2); 2141 __ sw(value, Address(to, 0)); 2142 break; 2143 case T_SHORT: 2144 __ test_bit(t0, count, 0); 2145 __ beqz(t0, L_fill_4); 2146 __ sh(value, Address(to, 0)); 2147 __ addi(to, to, 2); 2148 __ bind(L_fill_4); 2149 __ test_bit(t0, count, 1); 2150 __ beqz(t0, L_exit2); 2151 __ sw(value, Address(to, 0)); 2152 break; 2153 case T_INT: 2154 __ beqz(count, L_exit2); 2155 __ sw(value, Address(to, 0)); 2156 break; 2157 default: ShouldNotReachHere(); 2158 } 2159 __ bind(L_exit2); 2160 __ leave(); 2161 __ ret(); 2162 return start; 2163 } 2164 2165 void generate_arraycopy_stubs() { 2166 address entry = nullptr; 2167 address entry_jbyte_arraycopy = nullptr; 2168 address entry_jshort_arraycopy = nullptr; 2169 address entry_jint_arraycopy = nullptr; 2170 address entry_oop_arraycopy = nullptr; 2171 address entry_jlong_arraycopy = nullptr; 2172 address entry_checkcast_arraycopy = nullptr; 2173 2174 generate_copy_longs(copy_f, c_rarg0, c_rarg1, t1, copy_forwards); 2175 generate_copy_longs(copy_b, c_rarg0, c_rarg1, t1, copy_backwards); 2176 2177 StubRoutines::riscv::_zero_blocks = generate_zero_blocks(); 2178 2179 //*** jbyte 2180 // Always need aligned and unaligned versions 2181 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2182 "jbyte_disjoint_arraycopy"); 2183 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2184 &entry_jbyte_arraycopy, 2185 "jbyte_arraycopy"); 2186 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2187 "arrayof_jbyte_disjoint_arraycopy"); 2188 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2189 
"arrayof_jbyte_arraycopy"); 2190 2191 //*** jshort 2192 // Always need aligned and unaligned versions 2193 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2194 "jshort_disjoint_arraycopy"); 2195 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2196 &entry_jshort_arraycopy, 2197 "jshort_arraycopy"); 2198 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2199 "arrayof_jshort_disjoint_arraycopy"); 2200 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2201 "arrayof_jshort_arraycopy"); 2202 2203 //*** jint 2204 // Aligned versions 2205 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2206 "arrayof_jint_disjoint_arraycopy"); 2207 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2208 "arrayof_jint_arraycopy"); 2209 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2210 // entry_jint_arraycopy always points to the unaligned version 2211 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2212 "jint_disjoint_arraycopy"); 2213 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2214 &entry_jint_arraycopy, 2215 "jint_arraycopy"); 2216 2217 //*** jlong 2218 // It is always aligned 2219 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2220 "arrayof_jlong_disjoint_arraycopy"); 2221 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2222 "arrayof_jlong_arraycopy"); 2223 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2224 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2225 2226 //*** oops 2227 { 2228 // With compressed oops we need unaligned versions; notice that 2229 // we overwrite entry_oop_arraycopy. 
2230 bool aligned = !UseCompressedOops; 2231 2232 StubRoutines::_arrayof_oop_disjoint_arraycopy 2233 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2234 /*dest_uninitialized*/false); 2235 StubRoutines::_arrayof_oop_arraycopy 2236 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2237 /*dest_uninitialized*/false); 2238 // Aligned versions without pre-barriers 2239 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2240 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2241 /*dest_uninitialized*/true); 2242 StubRoutines::_arrayof_oop_arraycopy_uninit 2243 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2244 /*dest_uninitialized*/true); 2245 } 2246 2247 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2248 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2249 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2250 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2251 2252 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2253 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2254 /*dest_uninitialized*/true); 2255 2256 2257 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2258 entry_jbyte_arraycopy, 2259 entry_jshort_arraycopy, 2260 entry_jint_arraycopy, 2261 entry_jlong_arraycopy); 2262 2263 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2264 entry_jbyte_arraycopy, 2265 entry_jshort_arraycopy, 2266 entry_jint_arraycopy, 2267 entry_oop_arraycopy, 2268 entry_jlong_arraycopy, 2269 entry_checkcast_arraycopy); 2270 2271 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2272 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2273 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2274 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2275 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2276 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2277 } 2278 2279 // code for comparing 16 bytes of strings with same encoding 2280 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 2281 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, tmp1 = x28, tmp2 = x29, tmp4 = x7, tmp5 = x31; 2282 __ ld(tmp5, Address(str1)); 2283 __ addi(str1, str1, 8); 2284 __ xorr(tmp4, tmp1, tmp2); 2285 __ ld(cnt1, Address(str2)); 2286 __ addi(str2, str2, 8); 2287 __ bnez(tmp4, DIFF1); 2288 __ ld(tmp1, Address(str1)); 2289 __ addi(str1, str1, 8); 2290 __ xorr(tmp4, tmp5, cnt1); 2291 __ ld(tmp2, Address(str2)); 2292 __ addi(str2, str2, 8); 2293 __ bnez(tmp4, DIFF2); 2294 } 2295 2296 // code for comparing 8 characters of strings with Latin1 and Utf16 encoding 2297 void compare_string_8_x_LU(Register tmpL, Register tmpU, Register strL, Register strU, Label& DIFF) { 2298 const Register tmp = x30, tmpLval = x12; 2299 __ ld(tmpLval, Address(strL)); 2300 __ addi(strL, strL, wordSize); 2301 __ ld(tmpU, Address(strU)); 2302 __ addi(strU, strU, wordSize); 2303 __ inflate_lo32(tmpL, tmpLval); 2304 __ xorr(tmp, tmpU, tmpL); 2305 __ bnez(tmp, DIFF); 2306 2307 __ ld(tmpU, 
Address(strU)); 2308 __ addi(strU, strU, wordSize); 2309 __ inflate_hi32(tmpL, tmpLval); 2310 __ xorr(tmp, tmpU, tmpL); 2311 __ bnez(tmp, DIFF); 2312 } 2313 2314 // x10 = result 2315 // x11 = str1 2316 // x12 = cnt1 2317 // x13 = str2 2318 // x14 = cnt2 2319 // x28 = tmp1 2320 // x29 = tmp2 2321 // x30 = tmp3 2322 address generate_compare_long_string_different_encoding(bool isLU) { 2323 __ align(CodeEntryAlignment); 2324 StubCodeMark mark(this, "StubRoutines", isLU ? "compare_long_string_different_encoding LU" : "compare_long_string_different_encoding UL"); 2325 address entry = __ pc(); 2326 Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE; 2327 const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14, 2328 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12; 2329 2330 // cnt2 == amount of characters left to compare 2331 // Check already loaded first 4 symbols 2332 __ inflate_lo32(tmp3, isLU ? tmp1 : tmp2); 2333 __ mv(isLU ? tmp1 : tmp2, tmp3); 2334 __ addi(str1, str1, isLU ? wordSize / 2 : wordSize); 2335 __ addi(str2, str2, isLU ? wordSize : wordSize / 2); 2336 __ sub(cnt2, cnt2, wordSize / 2); // Already loaded 4 symbols 2337 2338 __ xorr(tmp3, tmp1, tmp2); 2339 __ bnez(tmp3, CALCULATE_DIFFERENCE); 2340 2341 Register strU = isLU ? str2 : str1, 2342 strL = isLU ? str1 : str2, 2343 tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison 2344 tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison 2345 2346 // make sure main loop is 8 byte-aligned, we should load another 4 bytes from strL 2347 // cnt2 is >= 68 here, no need to check it for >= 0 2348 __ lwu(tmpL, Address(strL)); 2349 __ addi(strL, strL, wordSize / 2); 2350 __ ld(tmpU, Address(strU)); 2351 __ addi(strU, strU, wordSize); 2352 __ inflate_lo32(tmp3, tmpL); 2353 __ mv(tmpL, tmp3); 2354 __ xorr(tmp3, tmpU, tmpL); 2355 __ bnez(tmp3, CALCULATE_DIFFERENCE); 2356 __ addi(cnt2, cnt2, -wordSize / 2); 2357 2358 // we are now 8-bytes aligned on strL 2359 __ sub(cnt2, cnt2, wordSize * 2); 2360 __ bltz(cnt2, TAIL); 2361 __ bind(SMALL_LOOP); // smaller loop 2362 __ sub(cnt2, cnt2, wordSize * 2); 2363 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE); 2364 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE); 2365 __ bgez(cnt2, SMALL_LOOP); 2366 __ addi(t0, cnt2, wordSize * 2); 2367 __ beqz(t0, DONE); 2368 __ bind(TAIL); // 1..15 characters left 2369 // Aligned access. Load bytes in portions - 4, 2, 1. 
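// (Illustrative note: the last 1..7 characters are handled by re-reading
// the final 8 Latin-1 bytes and 16 UTF-16 bytes of the strings, which may
// overlap data already compared; the overlapping prefix is known equal, so
// only the genuinely new characters can produce a difference.)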
2370
2371 __ addi(t0, cnt2, wordSize);
2372 __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
2373 __ bltz(t0, LOAD_LAST);
2374 // at least 8 characters remain, so we can do one compare_string_8_x_LU
2375 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2376 __ addi(cnt2, cnt2, -wordSize);
2377 __ beqz(cnt2, DONE); // no character left
2378 __ bind(LOAD_LAST); // cnt2 = 1..7 characters left
2379
2380 __ addi(cnt2, cnt2, -wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
2381 __ slli(t0, cnt2, 1); // t0 is now an offset in strU which points to last 16 bytes
2382 __ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string
2383 __ add(strU, strU, t0); // Address of last 16 bytes in UTF-16 string
2384 __ load_int_misaligned(tmpL, Address(strL), t0, false);
2385 __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2386 __ inflate_lo32(tmp3, tmpL);
2387 __ mv(tmpL, tmp3);
2388 __ xorr(tmp3, tmpU, tmpL);
2389 __ bnez(tmp3, CALCULATE_DIFFERENCE);
2390
2391 __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
2392 __ addi(strU, strU, wordSize); // Address of last 8 bytes in UTF-16 string
2393 __ load_int_misaligned(tmpL, Address(strL), t0, false);
2394 __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2395 __ inflate_lo32(tmp3, tmpL);
2396 __ mv(tmpL, tmp3);
2397 __ xorr(tmp3, tmpU, tmpL);
2398 __ bnez(tmp3, CALCULATE_DIFFERENCE);
2399 __ j(DONE); // no character left
2400
2401 // Find the first different characters in the longwords and
2402 // compute their difference.
2403 __ bind(CALCULATE_DIFFERENCE);
2404 __ ctzc_bit(tmp4, tmp3);
2405 __ srl(tmp1, tmp1, tmp4);
2406 __ srl(tmp2, tmp2, tmp4);
2407 __ andi(tmp1, tmp1, 0xFFFF);
2408 __ andi(tmp2, tmp2, 0xFFFF);
2409 __ sub(result, tmp1, tmp2);
2410 __ bind(DONE);
2411 __ ret();
2412 return entry;
2413 }
2414
2415 address generate_method_entry_barrier() {
2416 __ align(CodeEntryAlignment);
2417 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
2418
2419 Label deoptimize_label;
2420
2421 address start = __ pc();
2422
2423 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
2424
2425 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
2426 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
2427 Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
2428 __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
2429 __ lwu(t1, t1);
2430 __ sw(t1, thread_epoch_addr);
2431 __ membar(__ LoadLoad);
2432 }
2433
2434 __ set_last_Java_frame(sp, fp, ra);
2435
2436 __ enter();
2437 __ add(t1, sp, wordSize);
2438
2439 __ sub(sp, sp, 4 * wordSize);
2440
2441 __ push_call_clobbered_registers();
2442
2443 __ mv(c_rarg0, t1);
2444 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
2445
2446 __ reset_last_Java_frame(true);
2447
2448 __ mv(t0, x10);
2449
2450 __ pop_call_clobbered_registers();
2451
2452 __ bnez(t0, deoptimize_label);
2453
2454 __ leave();
2455 __ ret();
2456
2457 __ BIND(deoptimize_label);
2458
2459 __ ld(t0, Address(sp, 0));
2460 __ ld(fp, Address(sp, wordSize));
2461 __ ld(ra, Address(sp, wordSize * 2));
2462 __ ld(t1, Address(sp, wordSize * 3));
2463
2464 __ mv(sp, t0);
2465 __ jr(t1);
2466
2467 return start;
2468 }
2469
2470 // x10 = result
2471 // x11 = str1
2472 // x12 = cnt1
2473 // x13 = str2
2474 // x14 = cnt2
2475 //
x28 = tmp1 2476 // x29 = tmp2 2477 // x30 = tmp3 2478 // x31 = tmp4 2479 address generate_compare_long_string_same_encoding(bool isLL) { 2480 __ align(CodeEntryAlignment); 2481 StubCodeMark mark(this, "StubRoutines", isLL ? 2482 "compare_long_string_same_encoding LL" : "compare_long_string_same_encoding UU"); 2483 address entry = __ pc(); 2484 Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL, 2485 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF; 2486 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14, 2487 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31; 2488 RegSet spilled_regs = RegSet::of(tmp4, tmp5); 2489 2490 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 2491 // update cnt2 counter with already loaded 8 bytes 2492 __ sub(cnt2, cnt2, wordSize / (isLL ? 1 : 2)); 2493 // update pointers, because of previous read 2494 __ add(str1, str1, wordSize); 2495 __ add(str2, str2, wordSize); 2496 // less than 16 bytes left? 2497 __ sub(cnt2, cnt2, isLL ? 16 : 8); 2498 __ push_reg(spilled_regs, sp); 2499 __ bltz(cnt2, TAIL); 2500 __ bind(SMALL_LOOP); 2501 compare_string_16_bytes_same(DIFF, DIFF2); 2502 __ sub(cnt2, cnt2, isLL ? 16 : 8); 2503 __ bgez(cnt2, SMALL_LOOP); 2504 __ bind(TAIL); 2505 __ addi(cnt2, cnt2, isLL ? 16 : 8); 2506 __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); 2507 __ sub(cnt2, cnt2, isLL ? 8 : 4); 2508 __ blez(cnt2, CHECK_LAST); 2509 __ xorr(tmp4, tmp1, tmp2); 2510 __ bnez(tmp4, DIFF); 2511 __ ld(tmp1, Address(str1)); 2512 __ addi(str1, str1, 8); 2513 __ ld(tmp2, Address(str2)); 2514 __ addi(str2, str2, 8); 2515 __ sub(cnt2, cnt2, isLL ? 8 : 4); 2516 __ bind(CHECK_LAST); 2517 if (!isLL) { 2518 __ add(cnt2, cnt2, cnt2); // now in bytes 2519 } 2520 __ xorr(tmp4, tmp1, tmp2); 2521 __ bnez(tmp4, DIFF); 2522 __ add(str1, str1, cnt2); 2523 __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2); 2524 __ add(str2, str2, cnt2); 2525 __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2); 2526 __ xorr(tmp4, tmp5, cnt1); 2527 __ beqz(tmp4, LENGTH_DIFF); 2528 // Find the first different characters in the longwords and 2529 // compute their difference. 
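// In C terms, for longwords a and b this amounts to (an illustrative
// sketch; char_bits is 8 for LL, 16 for UU):
//
//   uint64_t x = a ^ b;                 // nonzero: a difference exists
//   int k = ctz(x) & ~(char_bits - 1);  // round down to a char boundary
//   result = (int)((a >> k) & char_mask) - (int)((b >> k) & char_mask);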
2530 __ bind(DIFF2); 2531 __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb 2532 __ srl(tmp5, tmp5, tmp3); 2533 __ srl(cnt1, cnt1, tmp3); 2534 if (isLL) { 2535 __ andi(tmp5, tmp5, 0xFF); 2536 __ andi(cnt1, cnt1, 0xFF); 2537 } else { 2538 __ andi(tmp5, tmp5, 0xFFFF); 2539 __ andi(cnt1, cnt1, 0xFFFF); 2540 } 2541 __ sub(result, tmp5, cnt1); 2542 __ j(LENGTH_DIFF); 2543 __ bind(DIFF); 2544 __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb 2545 __ srl(tmp1, tmp1, tmp3); 2546 __ srl(tmp2, tmp2, tmp3); 2547 if (isLL) { 2548 __ andi(tmp1, tmp1, 0xFF); 2549 __ andi(tmp2, tmp2, 0xFF); 2550 } else { 2551 __ andi(tmp1, tmp1, 0xFFFF); 2552 __ andi(tmp2, tmp2, 0xFFFF); 2553 } 2554 __ sub(result, tmp1, tmp2); 2555 __ j(LENGTH_DIFF); 2556 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 2557 __ xorr(tmp4, tmp1, tmp2); 2558 __ bnez(tmp4, DIFF); 2559 __ bind(LENGTH_DIFF); 2560 __ pop_reg(spilled_regs, sp); 2561 __ ret(); 2562 return entry; 2563 } 2564 2565 void generate_compare_long_strings() { 2566 StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(true); 2567 StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(false); 2568 StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(true); 2569 StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(false); 2570 } 2571 2572 // x10 result 2573 // x11 src 2574 // x12 src count 2575 // x13 pattern 2576 // x14 pattern count 2577 address generate_string_indexof_linear(bool needle_isL, bool haystack_isL) 2578 { 2579 const char* stubName = needle_isL 2580 ? (haystack_isL ? "indexof_linear_ll" : "indexof_linear_ul") 2581 : "indexof_linear_uu"; 2582 __ align(CodeEntryAlignment); 2583 StubCodeMark mark(this, "StubRoutines", stubName); 2584 address entry = __ pc(); 2585 2586 int needle_chr_size = needle_isL ? 1 : 2; 2587 int haystack_chr_size = haystack_isL ? 1 : 2; 2588 int needle_chr_shift = needle_isL ? 0 : 1; 2589 int haystack_chr_shift = haystack_isL ? 0 : 1; 2590 bool isL = needle_isL && haystack_isL; 2591 // parameters 2592 Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14; 2593 // temporary registers 2594 Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25; 2595 // redefinitions 2596 Register ch1 = x28, ch2 = x29; 2597 RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29); 2598 2599 __ push_reg(spilled_regs, sp); 2600 2601 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 2602 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 2603 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 2604 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 2605 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 2606 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 2607 2608 __ ld(ch1, Address(needle)); 2609 __ ld(ch2, Address(haystack)); 2610 // src.length - pattern.length 2611 __ sub(haystack_len, haystack_len, needle_len); 2612 2613 // first is needle[0] 2614 __ andi(first, ch1, needle_isL ? 0xFF : 0xFFFF, first); 2615 uint64_t mask0101 = UCONST64(0x0101010101010101); 2616 uint64_t mask0001 = UCONST64(0x0001000100010001); 2617 __ mv(mask1, haystack_isL ? mask0101 : mask0001); 2618 __ mul(first, first, mask1); 2619 uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f); 2620 uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff); 2621 __ mv(mask2, haystack_isL ? 
mask7f7f : mask7fff); 2622 if (needle_isL != haystack_isL) { 2623 __ mv(tmp, ch1); 2624 } 2625 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size - 1); 2626 __ blez(haystack_len, L_SMALL); 2627 2628 if (needle_isL != haystack_isL) { 2629 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros); 2630 } 2631 // xorr, sub, orr, notr, andr 2632 // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i] 2633 // eg: 2634 // first: aa aa aa aa aa aa aa aa 2635 // ch2: aa aa li nx jd ka aa aa 2636 // match_mask: 80 80 00 00 00 00 80 80 2637 __ compute_match_mask(ch2, first, match_mask, mask1, mask2); 2638 2639 // search first char of needle, if success, goto L_HAS_ZERO; 2640 __ bnez(match_mask, L_HAS_ZERO); 2641 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size); 2642 __ add(result, result, wordSize / haystack_chr_size); 2643 __ add(haystack, haystack, wordSize); 2644 __ bltz(haystack_len, L_POST_LOOP); 2645 2646 __ bind(L_LOOP); 2647 __ ld(ch2, Address(haystack)); 2648 __ compute_match_mask(ch2, first, match_mask, mask1, mask2); 2649 __ bnez(match_mask, L_HAS_ZERO); 2650 2651 __ bind(L_LOOP_PROCEED); 2652 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size); 2653 __ add(haystack, haystack, wordSize); 2654 __ add(result, result, wordSize / haystack_chr_size); 2655 __ bgez(haystack_len, L_LOOP); 2656 2657 __ bind(L_POST_LOOP); 2658 __ mv(ch2, -wordSize / haystack_chr_size); 2659 __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check 2660 __ ld(ch2, Address(haystack)); 2661 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift); 2662 __ neg(haystack_len, haystack_len); 2663 __ xorr(ch2, first, ch2); 2664 __ sub(match_mask, ch2, mask1); 2665 __ orr(ch2, ch2, mask2); 2666 __ mv(trailing_zeros, -1); // all bits set 2667 __ j(L_SMALL_PROCEED); 2668 2669 __ align(OptoLoopAlignment); 2670 __ bind(L_SMALL); 2671 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift); 2672 __ neg(haystack_len, haystack_len); 2673 if (needle_isL != haystack_isL) { 2674 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros); 2675 } 2676 __ xorr(ch2, first, ch2); 2677 __ sub(match_mask, ch2, mask1); 2678 __ orr(ch2, ch2, mask2); 2679 __ mv(trailing_zeros, -1); // all bits set 2680 2681 __ bind(L_SMALL_PROCEED); 2682 __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits. 2683 __ notr(ch2, ch2); 2684 __ andr(match_mask, match_mask, ch2); 2685 __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check 2686 __ beqz(match_mask, NOMATCH); 2687 2688 __ bind(L_SMALL_HAS_ZERO_LOOP); 2689 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, ch2, tmp); // count trailing zeros 2690 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2691 __ mv(ch2, wordSize / haystack_chr_size); 2692 __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2); 2693 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2694 __ mv(trailing_zeros, wordSize / haystack_chr_size); 2695 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2696 2697 __ bind(L_SMALL_CMP_LOOP); 2698 __ shadd(first, trailing_zeros, needle, first, needle_chr_shift); 2699 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift); 2700 needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first)); 2701 haystack_isL ? 
__ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2)); 2702 __ add(trailing_zeros, trailing_zeros, 1); 2703 __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP); 2704 __ beq(first, ch2, L_SMALL_CMP_LOOP); 2705 2706 __ bind(L_SMALL_CMP_LOOP_NOMATCH); 2707 __ beqz(match_mask, NOMATCH); 2708 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2); 2709 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2710 __ add(result, result, 1); 2711 __ add(haystack, haystack, haystack_chr_size); 2712 __ j(L_SMALL_HAS_ZERO_LOOP); 2713 2714 __ align(OptoLoopAlignment); 2715 __ bind(L_SMALL_CMP_LOOP_LAST_CMP); 2716 __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2717 __ j(DONE); 2718 2719 __ align(OptoLoopAlignment); 2720 __ bind(L_SMALL_CMP_LOOP_LAST_CMP2); 2721 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2722 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2723 __ j(DONE); 2724 2725 __ align(OptoLoopAlignment); 2726 __ bind(L_HAS_ZERO); 2727 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2); 2728 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2729 __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2); 2730 __ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits) 2731 __ sub(result, result, 1); // array index from 0, so result -= 1 2732 2733 __ bind(L_HAS_ZERO_LOOP); 2734 __ mv(needle_len, wordSize / haystack_chr_size); 2735 __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2); 2736 __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2); 2737 // load next 8 bytes from haystack, and increase result index 2738 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2739 __ add(result, result, 1); 2740 __ mv(trailing_zeros, wordSize / haystack_chr_size); 2741 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH); 2742 2743 // compare one char 2744 __ bind(L_CMP_LOOP); 2745 __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift); 2746 needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len)); 2747 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift); 2748 haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2)); 2749 __ add(trailing_zeros, trailing_zeros, 1); // next char index 2750 __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2); 2751 __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP); 2752 __ beq(needle_len, ch2, L_CMP_LOOP); 2753 2754 __ bind(L_CMP_LOOP_NOMATCH); 2755 __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH); 2756 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, needle_len, ch2); // find next "first" char index 2757 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2758 __ add(haystack, haystack, haystack_chr_size); 2759 __ j(L_HAS_ZERO_LOOP); 2760 2761 __ align(OptoLoopAlignment); 2762 __ bind(L_CMP_LOOP_LAST_CMP); 2763 __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH); 2764 __ j(DONE); 2765 2766 __ align(OptoLoopAlignment); 2767 __ bind(L_CMP_LOOP_LAST_CMP2); 2768 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2769 __ add(result, result, 1); 2770 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH); 2771 __ j(DONE); 2772 2773 __ align(OptoLoopAlignment); 2774 __ bind(L_HAS_ZERO_LOOP_NOMATCH); 2775 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 2776 // L_HAS_ZERO block. 
The byte octet was analyzed in L_HAS_ZERO_LOOP,
2777 // so result was increased by at most wordSize/str2_chr_size - 1 and the
2778 // respective high bits were not changed. L_LOOP_PROCEED will increase
2779 // result by the number of analyzed characters, so we can simply reset
2780 // the lower bits of result here: the 2 lower bits for UU/UL, 3 for LL.
2781 // 2) Restore needle_len and haystack_len from the "compressed" haystack_len.
2782 // 3) Advance haystack to the next haystack octet. result & 7 (LL) or 3 (UU/UL)
2783 // is the index of the last analyzed substring within the current octet,
2784 // so haystack points at the respective start address and must be advanced.
2785 __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
2786 __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
2787 __ andi(result, result, haystack_isL ? -8 : -4);
2788 __ slli(tmp, match_mask, haystack_chr_shift);
2789 __ sub(haystack, haystack, tmp);
2790 __ sign_extend(haystack_len, haystack_len, 32);
2791 __ j(L_LOOP_PROCEED);
2792
2793 __ align(OptoLoopAlignment);
2794 __ bind(NOMATCH);
2795 __ mv(result, -1);
2796
2797 __ bind(DONE);
2798 __ pop_reg(spilled_regs, sp);
2799 __ ret();
2800 return entry;
2801 }
2802
2803 void generate_string_indexof_stubs()
2804 {
2805 StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
2806 StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
2807 StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
2808 }
2809
2810 #ifdef COMPILER2
2811 address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
2812 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table");
2813
2814 address start = __ pc();
2815 const Register
2816 r_super_klass = x10,
2817 r_array_base = x11,
2818 r_array_length = x12,
2819 r_array_index = x13,
2820 r_sub_klass = x14,
2821 result = x15,
2822 r_bitmap = x16;
2823
2824 Label L_success;
2825 __ enter();
2826 __ lookup_secondary_supers_table(r_sub_klass, r_super_klass, result,
2827 r_array_base, r_array_length, r_array_index,
2828 r_bitmap, super_klass_index, /*stub_is_near*/true);
2829 __ leave();
2830 __ ret();
2831
2832 return start;
2833 }
2834
2835 // Slow path implementation for UseSecondarySupersTable.
2836 address generate_lookup_secondary_supers_table_slow_path_stub() { 2837 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path"); 2838 2839 address start = __ pc(); 2840 const Register 2841 r_super_klass = x10, // argument 2842 r_array_base = x11, // argument 2843 temp1 = x12, // tmp 2844 r_array_index = x13, // argument 2845 result = x15, // argument 2846 r_bitmap = x16; // argument 2847 2848 2849 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1); 2850 __ ret(); 2851 2852 return start; 2853 } 2854 2855 address generate_mulAdd() 2856 { 2857 __ align(CodeEntryAlignment); 2858 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 2859 2860 address entry = __ pc(); 2861 2862 const Register out = x10; 2863 const Register in = x11; 2864 const Register offset = x12; 2865 const Register len = x13; 2866 const Register k = x14; 2867 const Register tmp = x28; 2868 2869 BLOCK_COMMENT("Entry:"); 2870 __ enter(); 2871 __ mul_add(out, in, offset, len, k, tmp); 2872 __ leave(); 2873 __ ret(); 2874 2875 return entry; 2876 } 2877 2878 /** 2879 * Arguments: 2880 * 2881 * Input: 2882 * c_rarg0 - x address 2883 * c_rarg1 - x length 2884 * c_rarg2 - y address 2885 * c_rarg3 - y length 2886 * c_rarg4 - z address 2887 */ 2888 address generate_multiplyToLen() 2889 { 2890 __ align(CodeEntryAlignment); 2891 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 2892 address entry = __ pc(); 2893 2894 const Register x = x10; 2895 const Register xlen = x11; 2896 const Register y = x12; 2897 const Register ylen = x13; 2898 const Register z = x14; 2899 2900 const Register tmp0 = x15; 2901 const Register tmp1 = x16; 2902 const Register tmp2 = x17; 2903 const Register tmp3 = x7; 2904 const Register tmp4 = x28; 2905 const Register tmp5 = x29; 2906 const Register tmp6 = x30; 2907 const Register tmp7 = x31; 2908 2909 BLOCK_COMMENT("Entry:"); 2910 __ enter(); // required for proper stackwalking of RuntimeStub frame 2911 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 2912 __ leave(); // required for proper stackwalking of RuntimeStub frame 2913 __ ret(); 2914 2915 return entry; 2916 } 2917 2918 address generate_squareToLen() 2919 { 2920 __ align(CodeEntryAlignment); 2921 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 2922 address entry = __ pc(); 2923 2924 const Register x = x10; 2925 const Register xlen = x11; 2926 const Register z = x12; 2927 const Register y = x14; // == x 2928 const Register ylen = x15; // == xlen 2929 2930 const Register tmp0 = x13; // zlen, unused 2931 const Register tmp1 = x16; 2932 const Register tmp2 = x17; 2933 const Register tmp3 = x7; 2934 const Register tmp4 = x28; 2935 const Register tmp5 = x29; 2936 const Register tmp6 = x30; 2937 const Register tmp7 = x31; 2938 2939 BLOCK_COMMENT("Entry:"); 2940 __ enter(); 2941 __ mv(y, x); 2942 __ mv(ylen, xlen); 2943 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 2944 __ leave(); 2945 __ ret(); 2946 2947 return entry; 2948 } 2949 2950 // Arguments: 2951 // 2952 // Input: 2953 // c_rarg0 - newArr address 2954 // c_rarg1 - oldArr address 2955 // c_rarg2 - newIdx 2956 // c_rarg3 - shiftCount 2957 // c_rarg4 - numIter 2958 // 2959 address generate_bigIntegerLeftShift() { 2960 __ align(CodeEntryAlignment); 2961 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 2962 address entry = __ pc(); 2963 2964 Label loop, exit; 2965 2966 Register newArr = c_rarg0; 2967 
Register oldArr = c_rarg1;
2968 Register newIdx = c_rarg2;
2969 Register shiftCount = c_rarg3;
2970 Register numIter = c_rarg4;
2971
2972 Register shiftRevCount = c_rarg5;
2973 Register oldArrNext = t1;
2974
2975 __ beqz(numIter, exit);
2976 __ shadd(newArr, newIdx, newArr, t0, 2);
2977
2978 __ mv(shiftRevCount, 32);
2979 __ sub(shiftRevCount, shiftRevCount, shiftCount);
2980
2981 __ bind(loop);
2982 __ addi(oldArrNext, oldArr, 4);
2983 __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
2984 __ vle32_v(v0, oldArr);
2985 __ vle32_v(v4, oldArrNext);
2986 __ vsll_vx(v0, v0, shiftCount);
2987 __ vsrl_vx(v4, v4, shiftRevCount);
2988 __ vor_vv(v0, v0, v4);
2989 __ vse32_v(v0, newArr);
2990 __ sub(numIter, numIter, t0);
2991 __ shadd(oldArr, t0, oldArr, t1, 2);
2992 __ shadd(newArr, t0, newArr, t1, 2);
2993 __ bnez(numIter, loop);
2994
2995 __ bind(exit);
2996 __ ret();
2997
2998 return entry;
2999 }
3000
3001 // Arguments:
3002 //
3003 // Input:
3004 // c_rarg0 - newArr address
3005 // c_rarg1 - oldArr address
3006 // c_rarg2 - newIdx
3007 // c_rarg3 - shiftCount
3008 // c_rarg4 - numIter
3009 //
3010 address generate_bigIntegerRightShift() {
3011 __ align(CodeEntryAlignment);
3012 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
3013 address entry = __ pc();
3014
3015 Label loop, exit;
3016
3017 Register newArr = c_rarg0;
3018 Register oldArr = c_rarg1;
3019 Register newIdx = c_rarg2;
3020 Register shiftCount = c_rarg3;
3021 Register numIter = c_rarg4;
3022 Register idx = numIter;
3023
3024 Register shiftRevCount = c_rarg5;
3025 Register oldArrNext = c_rarg6;
3026 Register newArrCur = t0;
3027 Register oldArrCur = t1;
3028
3029 __ beqz(idx, exit);
3030 __ shadd(newArr, newIdx, newArr, t0, 2);
3031
3032 __ mv(shiftRevCount, 32);
3033 __ sub(shiftRevCount, shiftRevCount, shiftCount);
3034
3035 __ bind(loop);
3036 __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
3037 __ sub(idx, idx, t0);
3038 __ shadd(oldArrNext, idx, oldArr, t1, 2);
3039 __ shadd(newArrCur, idx, newArr, t1, 2);
3040 __ addi(oldArrCur, oldArrNext, 4);
3041 __ vle32_v(v0, oldArrCur);
3042 __ vle32_v(v4, oldArrNext);
3043 __ vsrl_vx(v0, v0, shiftCount);
3044 __ vsll_vx(v4, v4, shiftRevCount);
3045 __ vor_vv(v0, v0, v4);
3046 __ vse32_v(v0, newArrCur);
3047 __ bnez(idx, loop);
3048
3049 __ bind(exit);
3050 __ ret();
3051
3052 return entry;
3053 }
3054 #endif
3055
3056 #ifdef COMPILER2
3057 class MontgomeryMultiplyGenerator : public MacroAssembler {
3058
3059 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3060 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;
3061
3062 RegSet _toSave;
3063 bool _squaring;
3064
3065 public:
3066 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3067 : MacroAssembler(as->code()), _squaring(squaring) {
3068
3069 // Register allocation
3070
3071 RegSetIterator<Register> regs = RegSet::range(x10, x26).begin();
3072 Pa_base = *regs; // Argument registers
3073 if (squaring) {
3074 Pb_base = Pa_base;
3075 } else {
3076 Pb_base = *++regs;
3077 }
3078 Pn_base = *++regs;
3079 Rlen = *++regs;
3080 inv = *++regs;
3081 Pm_base = *++regs;
3082
3083 // Working registers:
3084 Ra = *++regs; // The current digit of a, b, n, and m.
3085 Rb = *++regs;
3086 Rm = *++regs;
3087 Rn = *++regs;
3088
3089 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
3090 Pb = *++regs;
3091 Pm = *++regs;
3092 Pn = *++regs;
3093
3094 tmp0 = *++regs; // Three registers which form a
3095 tmp1 = *++regs; // triple-precision accumulator.
3096 tmp2 = *++regs; 3097 3098 Ri = x6; // Inner and outer loop indexes. 3099 Rj = x7; 3100 3101 Rhi_ab = x28; // Product registers: low and high parts 3102 Rlo_ab = x29; // of a*b and m*n. 3103 Rhi_mn = x30; 3104 Rlo_mn = x31; 3105 3106 // x18 and up are callee-saved. 3107 _toSave = RegSet::range(x18, *regs) + Pm_base; 3108 } 3109 3110 private: 3111 void save_regs() { 3112 push_reg(_toSave, sp); 3113 } 3114 3115 void restore_regs() { 3116 pop_reg(_toSave, sp); 3117 } 3118 3119 template <typename T> 3120 void unroll_2(Register count, T block) { 3121 Label loop, end, odd; 3122 beqz(count, end); 3123 test_bit(t0, count, 0); 3124 bnez(t0, odd); 3125 align(16); 3126 bind(loop); 3127 (this->*block)(); 3128 bind(odd); 3129 (this->*block)(); 3130 addi(count, count, -2); 3131 bgtz(count, loop); 3132 bind(end); 3133 } 3134 3135 template <typename T> 3136 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 3137 Label loop, end, odd; 3138 beqz(count, end); 3139 test_bit(tmp, count, 0); 3140 bnez(tmp, odd); 3141 align(16); 3142 bind(loop); 3143 (this->*block)(d, s, tmp); 3144 bind(odd); 3145 (this->*block)(d, s, tmp); 3146 addi(count, count, -2); 3147 bgtz(count, loop); 3148 bind(end); 3149 } 3150 3151 void pre1(RegisterOrConstant i) { 3152 block_comment("pre1"); 3153 // Pa = Pa_base; 3154 // Pb = Pb_base + i; 3155 // Pm = Pm_base; 3156 // Pn = Pn_base + i; 3157 // Ra = *Pa; 3158 // Rb = *Pb; 3159 // Rm = *Pm; 3160 // Rn = *Pn; 3161 if (i.is_register()) { 3162 slli(t0, i.as_register(), LogBytesPerWord); 3163 } else { 3164 mv(t0, i.as_constant()); 3165 slli(t0, t0, LogBytesPerWord); 3166 } 3167 3168 mv(Pa, Pa_base); 3169 add(Pb, Pb_base, t0); 3170 mv(Pm, Pm_base); 3171 add(Pn, Pn_base, t0); 3172 3173 ld(Ra, Address(Pa)); 3174 ld(Rb, Address(Pb)); 3175 ld(Rm, Address(Pm)); 3176 ld(Rn, Address(Pn)); 3177 3178 // Zero the m*n result. 3179 mv(Rhi_mn, zr); 3180 mv(Rlo_mn, zr); 3181 } 3182 3183 // The core multiply-accumulate step of a Montgomery 3184 // multiplication. The idea is to schedule operations as a 3185 // pipeline so that instructions with long latencies (loads and 3186 // multiplies) have time to complete before their results are 3187 // used. This most benefits in-order implementations of the 3188 // architecture but out-of-order ones also benefit. 3189 void step() { 3190 block_comment("step"); 3191 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3192 // Ra = *++Pa; 3193 // Rb = *--Pb; 3194 mulhu(Rhi_ab, Ra, Rb); 3195 mul(Rlo_ab, Ra, Rb); 3196 addi(Pa, Pa, wordSize); 3197 ld(Ra, Address(Pa)); 3198 addi(Pb, Pb, -wordSize); 3199 ld(Rb, Address(Pb)); 3200 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the 3201 // previous iteration. 
3202 // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3203 // Rm = *++Pm;
3204 // Rn = *--Pn;
3205 mulhu(Rhi_mn, Rm, Rn);
3206 mul(Rlo_mn, Rm, Rn);
3207 addi(Pm, Pm, wordSize);
3208 ld(Rm, Address(Pm));
3209 addi(Pn, Pn, -wordSize);
3210 ld(Rn, Address(Pn));
3211 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3212 }
3213
3214 void post1() {
3215 block_comment("post1");
3216
3217 // MACC(Ra, Rb, tmp0, tmp1, tmp2);
3218 // Ra = *++Pa;
3219 // Rb = *--Pb;
3220 mulhu(Rhi_ab, Ra, Rb);
3221 mul(Rlo_ab, Ra, Rb);
3222 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n
3223 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
3224
3225 // *Pm = Rm = tmp0 * inv;
3226 mul(Rm, tmp0, inv);
3227 sd(Rm, Address(Pm));
3228
3229 // MACC(Rm, Rn, tmp0, tmp1, tmp2);
3230 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3231 mulhu(Rhi_mn, Rm, Rn);
3232
3233 #ifndef PRODUCT
3234 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
3235 {
3236 mul(Rlo_mn, Rm, Rn);
3237 add(Rlo_mn, tmp0, Rlo_mn);
3238 Label ok;
3239 beqz(Rlo_mn, ok);
3240 stop("broken Montgomery multiply");
3241 bind(ok);
3242 }
3243 #endif
3244 // We have very carefully set things up so that
3245 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
3246 // the lower half of Rm * Rn because we know the result already:
3247 // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff
3248 // tmp0 != 0. So, rather than do a mul and a cad we just set
3249 // the carry flag iff tmp0 is nonzero.
3250 //
3251 // mul(Rlo_mn, Rm, Rn);
3252 // cad(zr, tmp0, Rlo_mn);
3253 addi(t0, tmp0, -1);
3254 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
3255 cadc(tmp0, tmp1, Rhi_mn, t0);
3256 adc(tmp1, tmp2, zr, t0);
3257 mv(tmp2, zr);
3258 }
3259
3260 void pre2(Register i, Register len) {
3261 block_comment("pre2");
3262 // Pa = Pa_base + i-len;
3263 // Pb = Pb_base + len;
3264 // Pm = Pm_base + i-len;
3265 // Pn = Pn_base + len;
3266
3267 sub(Rj, i, len);
3268 // Rj == i-len
3269
3270 // Ra as temp register
3271 slli(Ra, Rj, LogBytesPerWord);
3272 add(Pa, Pa_base, Ra);
3273 add(Pm, Pm_base, Ra);
3274 slli(Ra, len, LogBytesPerWord);
3275 add(Pb, Pb_base, Ra);
3276 add(Pn, Pn_base, Ra);
3277
3278 // Ra = *++Pa;
3279 // Rb = *--Pb;
3280 // Rm = *++Pm;
3281 // Rn = *--Pn;
3282 add(Pa, Pa, wordSize);
3283 ld(Ra, Address(Pa));
3284 add(Pb, Pb, -wordSize);
3285 ld(Rb, Address(Pb));
3286 add(Pm, Pm, wordSize);
3287 ld(Rm, Address(Pm));
3288 add(Pn, Pn, -wordSize);
3289 ld(Rn, Address(Pn));
3290
3291 mv(Rhi_mn, zr);
3292 mv(Rlo_mn, zr);
3293 }
3294
3295 void post2(Register i, Register len) {
3296 block_comment("post2");
3297 sub(Rj, i, len);
3298
3299 cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part
3300
3301 // As soon as we know the least significant digit of our result,
3302 // store it.
3303 // Pm_base[i-len] = tmp0;
3304 // Rj as temp register
3305 slli(Rj, Rj, LogBytesPerWord);
3306 add(Rj, Pm_base, Rj);
3307 sd(tmp0, Address(Rj));
3308
3309 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
3310 cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
3311 adc(tmp1, tmp2, zr, t0);
3312 mv(tmp2, zr);
3313 }
3314
3315 // A carry in tmp0 after Montgomery multiplication means that we
3316 // should subtract multiples of n from our result in m. We'll
3317 // keep doing that until there is no carry.
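// In C terms, the normalization is roughly (an illustrative sketch):
//
//   while (tmp0 != 0) {                    // pending carries to resolve
//     uint64_t carry = 1;                  // "no borrow" seed for m -= n
//     for (size_t i = 0; i < len; i++) {   // subtract word by word
//       unsigned __int128 s = (unsigned __int128)m[i] + ~n[i] + carry;
//       m[i] = (uint64_t)s;
//       carry = (uint64_t)(s >> 64);
//     }
//     tmp0 += carry - 1;                   // subtract 1, add back final carry
//   }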
3318 void normalize(Register len) { 3319 block_comment("normalize"); 3320 // while (tmp0) 3321 // tmp0 = sub(Pm_base, Pn_base, tmp0, len); 3322 Label loop, post, again; 3323 Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now 3324 beqz(tmp0, post); { 3325 bind(again); { 3326 mv(i, zr); 3327 mv(cnt, len); 3328 slli(Rn, i, LogBytesPerWord); 3329 add(Rm, Pm_base, Rn); 3330 ld(Rm, Address(Rm)); 3331 add(Rn, Pn_base, Rn); 3332 ld(Rn, Address(Rn)); 3333 mv(t0, 1); // set carry flag, i.e. no borrow 3334 align(16); 3335 bind(loop); { 3336 notr(Rn, Rn); 3337 add(Rm, Rm, t0); 3338 add(Rm, Rm, Rn); 3339 sltu(t0, Rm, Rn); 3340 slli(Rn, i, LogBytesPerWord); // Rn as temp register 3341 add(Rn, Pm_base, Rn); 3342 sd(Rm, Address(Rn)); 3343 add(i, i, 1); 3344 slli(Rn, i, LogBytesPerWord); 3345 add(Rm, Pm_base, Rn); 3346 ld(Rm, Address(Rm)); 3347 add(Rn, Pn_base, Rn); 3348 ld(Rn, Address(Rn)); 3349 sub(cnt, cnt, 1); 3350 } bnez(cnt, loop); 3351 addi(tmp0, tmp0, -1); 3352 add(tmp0, tmp0, t0); 3353 } bnez(tmp0, again); 3354 } bind(post); 3355 } 3356 3357 // Move memory at s to d, reversing words. 3358 // Increments d to end of copied memory 3359 // Destroys tmp1, tmp2 3360 // Preserves len 3361 // Leaves s pointing to the address which was in d at start 3362 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 3363 assert(tmp1->encoding() < x28->encoding(), "register corruption"); 3364 assert(tmp2->encoding() < x28->encoding(), "register corruption"); 3365 3366 shadd(s, len, s, tmp1, LogBytesPerWord); 3367 mv(tmp1, len); 3368 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 3369 slli(tmp1, len, LogBytesPerWord); 3370 sub(s, d, tmp1); 3371 } 3372 // [63...0] -> [31...0][63...32] 3373 void reverse1(Register d, Register s, Register tmp) { 3374 addi(s, s, -wordSize); 3375 ld(tmp, Address(s)); 3376 ror_imm(tmp, tmp, 32, t0); 3377 sd(tmp, Address(d)); 3378 addi(d, d, wordSize); 3379 } 3380 3381 void step_squaring() { 3382 // An extra ACC 3383 step(); 3384 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3385 } 3386 3387 void last_squaring(Register i) { 3388 Label dont; 3389 // if ((i & 1) == 0) { 3390 test_bit(t0, i, 0); 3391 bnez(t0, dont); { 3392 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3393 // Ra = *++Pa; 3394 // Rb = *--Pb; 3395 mulhu(Rhi_ab, Ra, Rb); 3396 mul(Rlo_ab, Ra, Rb); 3397 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3398 } bind(dont); 3399 } 3400 3401 void extra_step_squaring() { 3402 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3403 3404 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3405 // Rm = *++Pm; 3406 // Rn = *--Pn; 3407 mulhu(Rhi_mn, Rm, Rn); 3408 mul(Rlo_mn, Rm, Rn); 3409 addi(Pm, Pm, wordSize); 3410 ld(Rm, Address(Pm)); 3411 addi(Pn, Pn, -wordSize); 3412 ld(Rn, Address(Pn)); 3413 } 3414 3415 void post1_squaring() { 3416 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3417 3418 // *Pm = Rm = tmp0 * inv; 3419 mul(Rm, tmp0, inv); 3420 sd(Rm, Address(Pm)); 3421 3422 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3423 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3424 mulhu(Rhi_mn, Rm, Rn); 3425 3426 #ifndef PRODUCT 3427 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply"); 3428 { 3429 mul(Rlo_mn, Rm, Rn); 3430 add(Rlo_mn, tmp0, Rlo_mn); 3431 Label ok; 3432 beqz(Rlo_mn, ok); { 3433 stop("broken Montgomery multiply"); 3434 } bind(ok); 3435 } 3436 #endif 3437 // We have very carefully set things up so that 3438 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate 3439 // the lower half of Rm * Rn because we know the result already: 
3440 // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff 3441 // tmp0 != 0. So, rather than do a mul and a cad we just set 3442 // the carry flag iff tmp0 is nonzero. 3443 // 3444 // mul(Rlo_mn, Rm, Rn); 3445 // cad(zr, tmp, Rlo_mn); 3446 addi(t0, tmp0, -1); 3447 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero 3448 cadc(tmp0, tmp1, Rhi_mn, t0); 3449 adc(tmp1, tmp2, zr, t0); 3450 mv(tmp2, zr); 3451 } 3452 3453 // use t0 as carry 3454 void acc(Register Rhi, Register Rlo, 3455 Register tmp0, Register tmp1, Register tmp2) { 3456 cad(tmp0, tmp0, Rlo, t0); 3457 cadc(tmp1, tmp1, Rhi, t0); 3458 adc(tmp2, tmp2, zr, t0); 3459 } 3460 3461 public: 3462 /** 3463 * Fast Montgomery multiplication. The derivation of the 3464 * algorithm is in A Cryptographic Library for the Motorola 3465 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3466 * 3467 * Arguments: 3468 * 3469 * Inputs for multiplication: 3470 * c_rarg0 - int array elements a 3471 * c_rarg1 - int array elements b 3472 * c_rarg2 - int array elements n (the modulus) 3473 * c_rarg3 - int length 3474 * c_rarg4 - int inv 3475 * c_rarg5 - int array elements m (the result) 3476 * 3477 * Inputs for squaring: 3478 * c_rarg0 - int array elements a 3479 * c_rarg1 - int array elements n (the modulus) 3480 * c_rarg2 - int length 3481 * c_rarg3 - int inv 3482 * c_rarg4 - int array elements m (the result) 3483 * 3484 */ 3485 address generate_multiply() { 3486 Label argh, nothing; 3487 bind(argh); 3488 stop("MontgomeryMultiply total_allocation must be <= 8192"); 3489 3490 align(CodeEntryAlignment); 3491 address entry = pc(); 3492 3493 beqz(Rlen, nothing); 3494 3495 enter(); 3496 3497 // Make room. 3498 mv(Ra, 512); 3499 bgt(Rlen, Ra, argh); 3500 slli(Ra, Rlen, exact_log2(4 * sizeof(jint))); 3501 sub(Ra, sp, Ra); 3502 andi(sp, Ra, -2 * wordSize); 3503 3504 srliw(Rlen, Rlen, 1); // length in longwords = len/2 3505 3506 { 3507 // Copy input args, reversing as we go. We use Ra as a 3508 // temporary variable. 3509 reverse(Ra, Pa_base, Rlen, Ri, Rj); 3510 if (!_squaring) 3511 reverse(Ra, Pb_base, Rlen, Ri, Rj); 3512 reverse(Ra, Pn_base, Rlen, Ri, Rj); 3513 } 3514 3515 // Push all call-saved registers and also Pm_base which we'll need 3516 // at the end. 
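  // (restore_regs() near the end brings the caller's Pm_base back, after
  //  which the result is reversed into the caller's buffer.)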
3517 save_regs(); 3518 3519 #ifndef PRODUCT 3520 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 3521 { 3522 ld(Rn, Address(Pn_base)); 3523 mul(Rlo_mn, Rn, inv); 3524 mv(t0, -1); 3525 Label ok; 3526 beq(Rlo_mn, t0, ok); 3527 stop("broken inverse in Montgomery multiply"); 3528 bind(ok); 3529 } 3530 #endif 3531 3532 mv(Pm_base, Ra); 3533 3534 mv(tmp0, zr); 3535 mv(tmp1, zr); 3536 mv(tmp2, zr); 3537 3538 block_comment("for (int i = 0; i < len; i++) {"); 3539 mv(Ri, zr); { 3540 Label loop, end; 3541 bge(Ri, Rlen, end); 3542 3543 bind(loop); 3544 pre1(Ri); 3545 3546 block_comment(" for (j = i; j; j--) {"); { 3547 mv(Rj, Ri); 3548 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3549 } block_comment(" } // j"); 3550 3551 post1(); 3552 addw(Ri, Ri, 1); 3553 blt(Ri, Rlen, loop); 3554 bind(end); 3555 block_comment("} // i"); 3556 } 3557 3558 block_comment("for (int i = len; i < 2*len; i++) {"); 3559 mv(Ri, Rlen); { 3560 Label loop, end; 3561 slli(t0, Rlen, 1); 3562 bge(Ri, t0, end); 3563 3564 bind(loop); 3565 pre2(Ri, Rlen); 3566 3567 block_comment(" for (j = len*2-i-1; j; j--) {"); { 3568 slliw(Rj, Rlen, 1); 3569 subw(Rj, Rj, Ri); 3570 subw(Rj, Rj, 1); 3571 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3572 } block_comment(" } // j"); 3573 3574 post2(Ri, Rlen); 3575 addw(Ri, Ri, 1); 3576 slli(t0, Rlen, 1); 3577 blt(Ri, t0, loop); 3578 bind(end); 3579 } 3580 block_comment("} // i"); 3581 3582 normalize(Rlen); 3583 3584 mv(Ra, Pm_base); // Save Pm_base in Ra 3585 restore_regs(); // Restore caller's Pm_base 3586 3587 // Copy our result into caller's Pm_base 3588 reverse(Pm_base, Ra, Rlen, Ri, Rj); 3589 3590 leave(); 3591 bind(nothing); 3592 ret(); 3593 3594 return entry; 3595 } 3596 3597 /** 3598 * 3599 * Arguments: 3600 * 3601 * Inputs: 3602 * c_rarg0 - int array elements a 3603 * c_rarg1 - int array elements n (the modulus) 3604 * c_rarg2 - int length 3605 * c_rarg3 - int inv 3606 * c_rarg4 - int array elements m (the result) 3607 * 3608 */ 3609 address generate_square() { 3610 Label argh; 3611 bind(argh); 3612 stop("MontgomeryMultiply total_allocation must be <= 8192"); 3613 3614 align(CodeEntryAlignment); 3615 address entry = pc(); 3616 3617 enter(); 3618 3619 // Make room. 3620 mv(Ra, 512); 3621 bgt(Rlen, Ra, argh); 3622 slli(Ra, Rlen, exact_log2(4 * sizeof(jint))); 3623 sub(Ra, sp, Ra); 3624 andi(sp, Ra, -2 * wordSize); 3625 3626 srliw(Rlen, Rlen, 1); // length in longwords = len/2 3627 3628 { 3629 // Copy input args, reversing as we go. We use Ra as a 3630 // temporary variable. 3631 reverse(Ra, Pa_base, Rlen, Ri, Rj); 3632 reverse(Ra, Pn_base, Rlen, Ri, Rj); 3633 } 3634 3635 // Push all call-saved registers and also Pm_base which we'll need 3636 // at the end. 
3637 save_regs(); 3638 3639 mv(Pm_base, Ra); 3640 3641 mv(tmp0, zr); 3642 mv(tmp1, zr); 3643 mv(tmp2, zr); 3644 3645 block_comment("for (int i = 0; i < len; i++) {"); 3646 mv(Ri, zr); { 3647 Label loop, end; 3648 bind(loop); 3649 bge(Ri, Rlen, end); 3650 3651 pre1(Ri); 3652 3653 block_comment("for (j = (i+1)/2; j; j--) {"); { 3654 addi(Rj, Ri, 1); 3655 srliw(Rj, Rj, 1); 3656 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 3657 } block_comment(" } // j"); 3658 3659 last_squaring(Ri); 3660 3661 block_comment(" for (j = i/2; j; j--) {"); { 3662 srliw(Rj, Ri, 1); 3663 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 3664 } block_comment(" } // j"); 3665 3666 post1_squaring(); 3667 addi(Ri, Ri, 1); 3668 blt(Ri, Rlen, loop); 3669 3670 bind(end); 3671 block_comment("} // i"); 3672 } 3673 3674 block_comment("for (int i = len; i < 2*len; i++) {"); 3675 mv(Ri, Rlen); { 3676 Label loop, end; 3677 bind(loop); 3678 slli(t0, Rlen, 1); 3679 bge(Ri, t0, end); 3680 3681 pre2(Ri, Rlen); 3682 3683 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 3684 slli(Rj, Rlen, 1); 3685 sub(Rj, Rj, Ri); 3686 sub(Rj, Rj, 1); 3687 srliw(Rj, Rj, 1); 3688 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 3689 } block_comment(" } // j"); 3690 3691 last_squaring(Ri); 3692 3693 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 3694 slli(Rj, Rlen, 1); 3695 sub(Rj, Rj, Ri); 3696 srliw(Rj, Rj, 1); 3697 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 3698 } block_comment(" } // j"); 3699 3700 post2(Ri, Rlen); 3701 addi(Ri, Ri, 1); 3702 slli(t0, Rlen, 1); 3703 blt(Ri, t0, loop); 3704 3705 bind(end); 3706 block_comment("} // i"); 3707 } 3708 3709 normalize(Rlen); 3710 3711 mv(Ra, Pm_base); // Save Pm_base in Ra 3712 restore_regs(); // Restore caller's Pm_base 3713 3714 // Copy our result into caller's Pm_base 3715 reverse(Pm_base, Ra, Rlen, Ri, Rj); 3716 3717 leave(); 3718 ret(); 3719 3720 return entry; 3721 } 3722 }; 3723 3724 #endif // COMPILER2 3725 3726 address generate_cont_thaw(Continuation::thaw_kind kind) { 3727 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 3728 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 3729 3730 address start = __ pc(); 3731 3732 if (return_barrier) { 3733 __ ld(sp, Address(xthread, JavaThread::cont_entry_offset())); 3734 } 3735 3736 #ifndef PRODUCT 3737 { 3738 Label OK; 3739 __ ld(t0, Address(xthread, JavaThread::cont_entry_offset())); 3740 __ beq(sp, t0, OK); 3741 __ stop("incorrect sp"); 3742 __ bind(OK); 3743 } 3744 #endif 3745 3746 if (return_barrier) { 3747 // preserve possible return value from a method returning to the return barrier 3748 __ sub(sp, sp, 2 * wordSize); 3749 __ fsd(f10, Address(sp, 0 * wordSize)); 3750 __ sd(x10, Address(sp, 1 * wordSize)); 3751 } 3752 3753 __ mv(c_rarg1, (return_barrier ? 
1 : 0));
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
    __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ld(x10, Address(sp, 1 * wordSize));
      __ fld(f10, Address(sp, 0 * wordSize));
      __ add(sp, sp, 2 * wordSize);
    }

#ifndef PRODUCT
    {
      Label OK;
      __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
      __ beq(sp, t0, OK);
      __ stop("incorrect sp");
      __ bind(OK);
    }
#endif

    Label thaw_success;
    // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
    __ bnez(t1, thaw_success);
    __ la(t0, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
    __ jr(t0);
    __ bind(thaw_success);

    // make room for the thawed frames
    __ sub(t0, sp, t1);
    __ andi(sp, t0, -16); // align

    if (return_barrier) {
      // save original return value -- again
      __ sub(sp, sp, 2 * wordSize);
      __ fsd(f10, Address(sp, 0 * wordSize));
      __ sd(x10, Address(sp, 1 * wordSize));
    }

    // If we want, we can templatize thaw by kind, and have three different entries
    __ mv(c_rarg1, kind);

    __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
    __ mv(t1, x10); // x10 is the sp of the yielding frame

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ld(x10, Address(sp, 1 * wordSize));
      __ fld(f10, Address(sp, 0 * wordSize));
      __ add(sp, sp, 2 * wordSize);
    } else {
      __ mv(x10, zr); // return 0 (success) from doYield
    }

    // we're now on the yield frame (which is at an address above us because sp has been pushed down)
    __ mv(fp, t1);
    __ sub(sp, t1, 2 * wordSize); // now pointing to fp spill

    if (return_barrier_exception) {
      __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
      __ verify_oop(x10);
      __ mv(x9, x10); // save return value containing the exception oop in callee-saved x9

      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);

      // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc

      __ mv(x11, x10); // the exception handler
      __ mv(x10, x9);  // restore return value containing the exception oop
      __ verify_oop(x10);

      __ leave();
      __ mv(x13, ra);
      __ jr(x11); // the exception handler
    } else {
      // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
      __ leave();
      __ ret();
    }

    return start;
  }

  address generate_cont_thaw() {
    if (!Continuations::enabled()) return nullptr;

    StubCodeMark mark(this, "StubRoutines", "Cont thaw");
    address start = __ pc();
    generate_cont_thaw(Continuation::thaw_top);
    return start;
  }

  address generate_cont_returnBarrier() {
    if (!Continuations::enabled()) return nullptr;

    // TODO: will probably need multiple return barriers depending on return type
    StubCodeMark mark(this, "StubRoutines", "cont return barrier");
    address start = __ pc();

    generate_cont_thaw(Continuation::thaw_return_barrier);

    return start;
  }

  address
generate_cont_returnBarrier_exception() {
    if (!Continuations::enabled()) return nullptr;

    StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
    address start = __ pc();

    generate_cont_thaw(Continuation::thaw_return_barrier_exception);

    return start;
  }

  address generate_cont_preempt_stub() {
    if (!Continuations::enabled()) return nullptr;
    StubCodeMark mark(this, "StubRoutines", "Continuation preempt stub");
    address start = __ pc();

    __ reset_last_Java_frame(true);

    // reset the flag
    __ sb(zr, Address(xthread, JavaThread::preempting_offset()));

    // Set sp to enterSpecial frame and then remove it from the stack
    __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));

    Label preemption_cancelled;
    __ lbu(t0, Address(xthread, JavaThread::preemption_cancelled_offset()));
    __ bnez(t0, preemption_cancelled);

    // Remove enterSpecial frame from the stack and return to Continuation.run()
    SharedRuntime::continuation_enter_cleanup(_masm);
    __ leave();
    __ ret();

    __ bind(preemption_cancelled);
    __ sb(zr, Address(xthread, JavaThread::preemption_cancelled_offset()));
    __ la(fp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size() + 2 * wordSize)));
    __ la(t0, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
    __ ld(t0, Address(t0));
    __ jr(t0);

    return start;
  }

#if COMPILER2_OR_JVMCI

#undef __
#define __ this->

class Sha2Generator : public MacroAssembler {
  StubCodeGenerator* _cgen;
 public:
  Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
  address generate_sha256_implCompress(bool multi_block) {
    return generate_sha2_implCompress(Assembler::e32, multi_block);
  }
  address generate_sha512_implCompress(bool multi_block) {
    return generate_sha2_implCompress(Assembler::e64, multi_block);
  }
 private:

  void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
    if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
    else                            __ vle64_v(vr, sr);
  }

  void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
    if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
    else                            __ vse64_v(vr, sr);
  }

  // Overview of the logic in each "quad round".
  //
  // The code below repeats 16/20 times the logic implementing four rounds
  // of the SHA-256/512 core loop as documented by NIST. The 16/20 "quad rounds"
  // implement the 64/80 single rounds.
  //
  //  // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
  //  // Output:
  //  //   vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
  //  vl1reXX.v vTmp1, ofs
  //
  //  // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
  //  addi ofs, ofs, 16/32
  //
  //  // Add constants to message schedule words:
  //  //  Input
  //  //    vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
  //  //    vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
  //  //  Output
  //  //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
  //  vadd.vv vTmp0, vTmp1, vW0
  //
  //  //  2 rounds of working variables updates.
  //  //     vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
  //  //  Input:
  //  //    vState1 = {c[t],d[t],g[t],h[t]}   " = vState1[t] "
  //  //    vState0 = {a[t],b[t],e[t],f[t]}
  //  //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
  //  //  Output:
  //  //    vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}  " = vState0[t+2] "
  //  //            = {h[t+4],g[t+4],d[t+4],c[t+4]}  " = vState1[t+4] "
  //  vsha2cl.vv vState1, vState0, vTmp0
  //
  //  //  2 rounds of working variables updates.
  //  //     vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
  //  //  Input
  //  //    vState0 = {a[t],b[t],e[t],f[t]}          " = vState0[t] "
  //  //            = {h[t+2],g[t+2],d[t+2],c[t+2]}  " = vState1[t+2] "
  //  //    vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}  " = vState0[t+2] "
  //  //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
  //  //  Output:
  //  //    vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]}  " = vState0[t+4] "
  //  vsha2ch.vv vState0, vState1, vTmp0
  //
  //  // Combine 2QW into 1QW
  //  //
  //  // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
  //  //     vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
  //  // and it can only take 3 vectors as inputs. Hence we need to combine
  //  // vW1[0] and vW2[1..3] in a single vector.
  //  //
  //  //  vmerge Vt4, Vt1, Vt2, V0
  //  //  Input
  //  //   V0 = mask // first word from vW2, 1..3 words from vW1
  //  //   vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
  //  //   vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
  //  //  Output
  //  //   Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5}
  //  vmerge.vvm vTmp0, vW2, vW1, v0
  //
  //  // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
  //  //  Input
  //  //   vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]}   W[ 3: 0]
  //  //   vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]}   W[15:12]
  //  //   vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]} W[11: 9,4]
  //  //  Output (next four message schedule words)
  //  //   vW0 = {W[t+19], W[t+18], W[t+17], W[t+16]}   W[19:16]
  //  vsha2ms.vv vW0, vTmp0, vW3
  //
  // BEFORE
  //  vW0 - vW3 hold the message schedule words (initially the block words)
  //    vW0 = W[ 3: 0]   "oldest"
  //    vW1 = W[ 7: 4]
  //    vW2 = W[11: 8]
  //    vW3 = W[15:12]   "newest"
  //
  //  vState0 - vState1 hold the working state variables
  //    vState0 = {a[t],b[t],e[t],f[t]}   // initially {H5,H4,H1,H0}
  //    vState1 = {c[t],d[t],g[t],h[t]}   // initially {H7,H6,H3,H2}
  //
  // AFTER
  //  vW0 - vW3 hold the message schedule words (initially the block words)
  //    vW1 = W[ 7: 4]   "oldest"
  //    vW2 = W[11: 8]
  //    vW3 = W[15:12]
  //    vW0 = W[19:16]   "newest"
  //
  //  vState0 and vState1 hold the working state variables
  //    vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
  //    vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
  //
  // The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
  // hence the uses of those vectors rotate in each round, and we get back to the
  // initial configuration every 4 quad-rounds. We could avoid those changes at
  // the cost of moving those vectors at the end of each quad-round.
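  //
  // Concretely (illustrative), the rot1..rot4 arguments of sha2_quad_round
  // below cycle through the schedule registers like this:
  //   quad-round 4k+0: rot1=vW0, rot2=vW1, rot3=vW2, rot4=vW3
  //   quad-round 4k+1: rot1=vW1, rot2=vW2, rot3=vW3, rot4=vW0
  //   quad-round 4k+2: rot1=vW2, rot2=vW3, rot3=vW0, rot4=vW1
  //   quad-round 4k+3: rot1=vW3, rot2=vW0, rot3=vW1, rot4=vW2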
4022 void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4, 4023 Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh, 4024 bool gen_words = true, bool step_const = true) { 4025 __ vleXX_v(vset_sew, vtemp, scalarconst); 4026 if (step_const) { 4027 __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32); 4028 } 4029 __ vadd_vv(vtemp2, vtemp, rot1); 4030 __ vsha2cl_vv(v_cdgh, v_abef, vtemp2); 4031 __ vsha2ch_vv(v_abef, v_cdgh, vtemp2); 4032 if (gen_words) { 4033 __ vmerge_vvm(vtemp2, rot3, rot2); 4034 __ vsha2ms_vv(rot1, vtemp2, rot4); 4035 } 4036 } 4037 4038 const char* stub_name(Assembler::SEW vset_sew, bool multi_block) { 4039 if (vset_sew == Assembler::e32 && !multi_block) return "sha256_implCompress"; 4040 if (vset_sew == Assembler::e32 && multi_block) return "sha256_implCompressMB"; 4041 if (vset_sew == Assembler::e64 && !multi_block) return "sha512_implCompress"; 4042 if (vset_sew == Assembler::e64 && multi_block) return "sha512_implCompressMB"; 4043 ShouldNotReachHere(); 4044 return "bad name lookup"; 4045 } 4046 4047 // Arguments: 4048 // 4049 // Inputs: 4050 // c_rarg0 - byte[] source+offset 4051 // c_rarg1 - int[] SHA.state 4052 // c_rarg2 - int offset 4053 // c_rarg3 - int limit 4054 // 4055 address generate_sha2_implCompress(Assembler::SEW vset_sew, bool multi_block) { 4056 alignas(64) static const uint32_t round_consts_256[64] = { 4057 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 4058 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 4059 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 4060 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 4061 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 4062 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 4063 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 4064 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 4065 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 4066 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 4067 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 4068 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 4069 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 4070 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 4071 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 4072 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 4073 }; 4074 alignas(64) static const uint64_t round_consts_512[80] = { 4075 0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl, 4076 0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l, 4077 0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l, 4078 0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l, 4079 0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l, 4080 0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l, 4081 0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l, 4082 0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l, 4083 0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl, 4084 0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l, 4085 0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl, 4086 0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl, 4087 0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l, 4088 0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l, 4089 0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l, 4090 0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l, 4091 0x19a4c116b8d2d0c8l, 
0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
      0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
      0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
      0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
      0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
      0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
      0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
      0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
      0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
      0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
      0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
    };
    const int const_add = vset_sew == Assembler::e32 ? 16 : 32;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(_cgen, "StubRoutines", stub_name(vset_sew, multi_block));
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;
    Register consts  = t2;  // caller saved
    Register state_c = x28; // caller saved
    VectorRegister vindex  = v2;
    VectorRegister vW0     = v4;
    VectorRegister vW1     = v6;
    VectorRegister vW2     = v8;
    VectorRegister vW3     = v10;
    VectorRegister vState0 = v12;
    VectorRegister vState1 = v14;
    VectorRegister vHash0  = v16;
    VectorRegister vHash1  = v18;
    VectorRegister vTmp0   = v20;
    VectorRegister vTmp1   = v22;

    Label multi_block_loop;

    __ enter();

    address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
    la(consts, ExternalAddress(constant_table));

    // Register use in this function:
    //
    // VECTORS
    //  vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/64 bits), hold the message
    //             schedule words (Wt). They start with the message block
    //             content (W0 to W15), then further words in the message
    //             schedule generated via vsha2ms from previous Wt.
    //   Initially:
    //     vW0 = W[  3:0] = { W3,  W2,  W1,  W0}
    //     vW1 = W[  7:4] = { W7,  W6,  W5,  W4}
    //     vW2 = W[ 11:8] = {W11, W10,  W9,  W8}
    //     vW3 = W[15:12] = {W15, W14, W13, W12}
    //
    //  vState0 - vState1 hold the working state variables (a, b, ..., h)
    //    vState0 = {f[t],e[t],b[t],a[t]}
    //    vState1 = {h[t],g[t],d[t],c[t]}
    //   Initially:
    //    vState0 = {H5i-1, H4i-1, H1i-1, H0i-1}
    //    vState1 = {H7i-1, H6i-1, H3i-1, H2i-1}
    //
    //  v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
    //
    //  vTmp0 = temporary, Wt+Kt
    //  vTmp1 = temporary, Kt
    //
    //  vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
    //
    // During most of the function the vector state is configured so that each
    // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).

    // vsha2ch/vsha2cl uses EGW of 4*SEW.
    //   SHA256 SEW = e32, EGW = 128-bits
    //   SHA512 SEW = e64, EGW = 256-bits
    //
    //   VLEN is required to be at least 128.
    //   For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
    //
    // m1/m2: LMUL = 1 or 2
    // ta:    tail agnostic (don't care about those lanes)
    // ma:    mask agnostic (don't care about those lanes)
    // x0 is not written; we know the number of vector elements.
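    //
    // Worked example (illustrative): for SHA-512 with VLEN=128, one element
    // group is 4 * e64 = 256 bits, which spans two 128-bit registers, hence
    // m2 below; with a larger VLEN (or for SHA-256) a single register holds
    // the group and m1 suffices.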
    if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
      __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
    } else {
      __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
    }

    int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
    __ li(t0, indexes);
    __ vmv_v_x(vindex, t0);

    // Step over a and b, so we are pointing to c.
    // const_add is 4x the size of a state variable, so const_add/2 skips
    // the two variables a and b.
    __ addi(state_c, state, const_add/2);

    // Use index-load to get {f,e,b,a},{h,g,d,c}
    __ vluxei8_v(vState0, state, vindex);
    __ vluxei8_v(vState1, state_c, vindex);

    __ bind(multi_block_loop);

    // Capture the initial H values in vHash0 and vHash1 to allow for computing
    // the resulting H', since H' = H+{a',b',c',...,h'}.
    __ vmv_v_v(vHash0, vState0);
    __ vmv_v_v(vHash1, vState1);

    // Load the 512/1024-bits of the message block in vW0-vW3 and perform
    // an endian swap on each 4/8 bytes element.
    //
    // If Zvkb is not implemented one can use vrgather
    // with an index sequence to byte-swap.
    //  sequence = [3 2 1 0   7 6 5 4  11 10 9 8   15 14 13 12]
    //   <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
    //  this sequence. 'vid' gives us the N.
    __ vleXX_v(vset_sew, vW0, buf);
    __ vrev8_v(vW0, vW0);
    __ addi(buf, buf, const_add);
    __ vleXX_v(vset_sew, vW1, buf);
    __ vrev8_v(vW1, vW1);
    __ addi(buf, buf, const_add);
    __ vleXX_v(vset_sew, vW2, buf);
    __ vrev8_v(vW2, vW2);
    __ addi(buf, buf, const_add);
    __ vleXX_v(vset_sew, vW3, buf);
    __ vrev8_v(vW3, vW3);
    __ addi(buf, buf, const_add);

    // Set v0 up for the vmerge that replaces the first word (idx==0)
    __ vid_v(v0);
    __ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0)

    VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
    int rot_pos = 0;
    // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2)
    const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
    for (int i = 0; i < qr_end; i++) {
      sha2_quad_round(vset_sew,
                      rotation_regs[(rot_pos + 0) & 0x3],
                      rotation_regs[(rot_pos + 1) & 0x3],
                      rotation_regs[(rot_pos + 2) & 0x3],
                      rotation_regs[(rot_pos + 3) & 0x3],
                      consts,
                      vTmp1, vTmp0, vState0, vState1);
      ++rot_pos;
    }
    // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2)
    // Note that we stop generating new message schedule words (Wt, vW0-vW3)
    // as we already generated all the words we end up consuming (i.e., W[63:60]).
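    // (For SHA-512 the loop bounds above are 4 larger, and the last words
    // consumed are correspondingly W[79:76].)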
    const int qr_c_end = qr_end + 4;
    for (int i = qr_end; i < qr_c_end; i++) {
      sha2_quad_round(vset_sew,
                      rotation_regs[(rot_pos + 0) & 0x3],
                      rotation_regs[(rot_pos + 1) & 0x3],
                      rotation_regs[(rot_pos + 2) & 0x3],
                      rotation_regs[(rot_pos + 3) & 0x3],
                      consts,
                      vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end - 1));
      ++rot_pos;
    }

    //--------------------------------------------------------------------------------
    // Compute the updated hash value H'
    //   H' = H + {h',g',...,b',a'}
    //      = {h,g,...,b,a} + {h',g',...,b',a'}
    //      = {h+h',g+g',...,b+b',a+a'}

    // H' = H+{a',b',c',...,h'}
    __ vadd_vv(vState0, vHash0, vState0);
    __ vadd_vv(vState1, vHash1, vState1);

    if (multi_block) {
      int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
      __ addi(consts, consts, -total_adds);
      __ add(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
      __ ble(ofs, limit, multi_block_loop);
      __ mv(c_rarg0, ofs); // return ofs
    }

    // Store H[0..7] = {a,b,c,d,e,f,g,h} from
    //  vState0 = {f,e,b,a}
    //  vState1 = {h,g,d,c}
    __ vsuxei8_v(vState0, state,   vindex);
    __ vsuxei8_v(vState1, state_c, vindex);

    __ leave();
    __ ret();

    return start;
  }
};

#undef __
#define __ _masm->

  // Set of L registers that correspond to a contiguous memory area.
  // Each 64-bit register typically corresponds to 2 32-bit integers.
  template <uint L>
  class RegCache {
  private:
    MacroAssembler *_masm;
    Register _regs[L];

  public:
    RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L);
      auto it = rs.begin();
      for (auto &r: _regs) {
        r = *it;
        ++it;
      }
    }

    // generate load for the i'th register
    void gen_load(uint i, Register base) {
      assert(i < L, "invalid i: %u", i);
      __ ld(_regs[i], Address(base, 8 * i));
    }

    // add i'th 32-bit integer to dest
    void add_u32(const Register dest, uint i, const Register rtmp = t0) {
      assert(i < 2 * L, "invalid i: %u", i);

      if (is_even(i)) {
        // Use the bottom 32 bits. No need to mask off the top 32 bits
        // as addw will do the right thing.
        __ addw(dest, dest, _regs[i / 2]);
      } else {
        // Use the top 32 bits by right-shifting them.
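        // (For example, i == 3 reads bits 63:32 of _regs[1].)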
        __ srli(rtmp, _regs[i / 2], 32);
        __ addw(dest, dest, rtmp);
      }
    }
  };

  typedef RegCache<8> BufRegCache;

  // a += value + x + ac;
  // a = Integer.rotateLeft(a, s) + b;
  void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache,
                               Register a, Register b, Register c, Register d,
                               int k, int s, int t,
                               Register value) {
    // a += ac
    __ addw(a, a, t, t1);

    // a += x;
    reg_cache.add_u32(a, k);
    // a += value;
    __ addw(a, a, value);

    // a = Integer.rotateLeft(a, s) + b;
    __ rolw_imm(a, a, s);
    __ addw(a, a, b);
  }

  // a += ((b & c) | ((~b) & d)) + x + ac;
  // a = Integer.rotateLeft(a, s) + b;
  void md5_FF(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = b & c
    __ andr(rtmp1, b, c);

    // rtmp2 = (~b) & d
    __ andn(rtmp2, d, b);

    // rtmp1 = (b & c) | ((~b) & d)
    __ orr(rtmp1, rtmp1, rtmp2);

    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }

  // a += ((b & d) | (c & (~d))) + x + ac;
  // a = Integer.rotateLeft(a, s) + b;
  void md5_GG(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = b & d
    __ andr(rtmp1, b, d);

    // rtmp2 = c & (~d)
    __ andn(rtmp2, c, d);

    // rtmp1 = (b & d) | (c & (~d))
    __ orr(rtmp1, rtmp1, rtmp2);

    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }

  // a += ((b ^ c) ^ d) + x + ac;
  // a = Integer.rotateLeft(a, s) + b;
  void md5_HH(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = (b ^ c) ^ d
    __ xorr(rtmp2, b, c);
    __ xorr(rtmp1, rtmp2, d);

    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }

  // a += (c ^ (b | (~d))) + x + ac;
  // a = Integer.rotateLeft(a, s) + b;
  void md5_II(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = c ^ (b | (~d))
    __ orn(rtmp2, b, d);
    __ xorr(rtmp1, c, rtmp2);

    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - byte[] source+offset
  //   c_rarg1 - int[]  SHA.state
  //   c_rarg2 - int    offset  (multi_block == True)
  //   c_rarg3 - int    limit   (multi_block == True)
  //
  // Registers:
  //  x0   zero  (zero)
  //  x1     ra  (return address)
  //  x2     sp  (stack pointer)
  //  x3     gp  (global pointer)
  //  x4     tp  (thread pointer)
  //  x5     t0  (tmp register)
  //  x6     t1  (tmp register)
  //  x7     t2  state0
  //  x8  fp/s0  (frame pointer)
  //  x9     s1
  //  x10    a0  rtmp1 / c_rarg0
  //  x11    a1  rtmp2 / c_rarg1
  //  x12    a2  a / c_rarg2
  //  x13    a3  b / c_rarg3
  //  x14    a4  c
  //  x15    a5  d
  //  x16    a6  buf
  //  x17    a7  state
  //  x18    s2  ofs     [saved-reg]  (multi_block == True)
  //  x19    s3  limit   [saved-reg]  (multi_block == True)
  //  x20    s4  state1  [saved-reg]
  //  x21    s5  state2  [saved-reg]
  //  x22    s6  state3  [saved-reg]
  //  x23    s7
  //  x24    s8  buf0    [saved-reg]
  //  x25    s9  buf1    [saved-reg]
  //  x26   s10  buf2    [saved-reg]
  //  x27   s11  buf3    [saved-reg]
  //  x28    t3  buf4
  //  x29    t4  buf5
  //  x30    t5  buf6
  //  x31    t6  buf7
  address generate_md5_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    // rotation constants
    const int S11 = 7;
    const int S12 = 12;
    const int S13 = 17;
    const int S14 = 22;
    const int S21 = 5;
    const int S22 = 9;
    const int S23 = 14;
    const int S24 = 20;
    const int S31 = 4;
    const int S32 = 11;
    const int S33 = 16;
    const int S34 = 23;
    const int S41 = 6;
    const int S42 = 10;
    const int S43 = 15;
    const int S44 = 21;

    const int64_t mask32 = 0xffffffff;

    Register buf_arg   = c_rarg0; // a0
    Register state_arg = c_rarg1; // a1
    Register ofs_arg   = c_rarg2; // a2
    Register limit_arg = c_rarg3; // a3

    // we'll copy the args to these registers to free up a0-a3
    // to use for other values manipulated by instructions
    // that can be compressed
    Register buf   = x16; // a6
    Register state = x17; // a7
    Register ofs   = x18; // s2
    Register limit = x19; // s3

    // using x12->15 to allow compressed instructions
    Register a = x12; // a2
    Register b = x13; // a3
    Register c = x14; // a4
    Register d = x15; // a5

    Register state0 = x7;  // t2
    Register state1 = x20; // s4
    Register state2 = x21; // s5
    Register state3 = x22; // s6

    // using x10->x11 to allow compressed instructions
    Register rtmp1 = x10; // a0
    Register rtmp2 = x11; // a1

    RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11
    RegSet reg_cache_regs;
    reg_cache_regs += reg_cache_saved_regs;
    reg_cache_regs += RegSet::of(x28, x29, x30, x31); // t3, t4, t5, t6
    BufRegCache reg_cache(_masm, reg_cache_regs);

    RegSet saved_regs;
    if (multi_block) {
      saved_regs += RegSet::of(ofs, limit);
    }
    saved_regs += RegSet::of(state1, state2, state3);
    saved_regs += reg_cache_saved_regs;

    __ push_reg(saved_regs, sp);

    __ mv(buf, buf_arg);
    __ mv(state, state_arg);
    if (multi_block) {
      __ mv(ofs, ofs_arg);
      __ mv(limit, limit_arg);
    }

    // To minimize the number of memory operations, we read the 4 state
    // 4-byte values in pairs, with a single ld, and split each pair into
    // 2 registers.
    //
    // Since the core MD5 algorithm works on 32-bit words, the code below
    // does not care about the content of the upper 32 bits in state[x].
    // Based on this observation, we simply leave the upper 32 bits of
    // state0/state2 untouched, rather than zeroing them explicitly with
    // extra instructions.
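    // Illustrative little-endian view of the paired load below (with
    // hypothetical word values A, B, C, D in state[0..3]):
    //   ld   state0, 0(state)     => state0 = (B << 32) | A
    //   srli state1, state0, 32   => state1 = B (the stale upper half of
    //                                state0 is simply never consumed)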
4538 __ ld(state0, Address(state)); 4539 __ srli(state1, state0, 32); 4540 __ ld(state2, Address(state, 8)); 4541 __ srli(state3, state2, 32); 4542 4543 Label md5_loop; 4544 __ BIND(md5_loop); 4545 4546 __ mv(a, state0); 4547 __ mv(b, state1); 4548 __ mv(c, state2); 4549 __ mv(d, state3); 4550 4551 // Round 1 4552 reg_cache.gen_load(0, buf); 4553 md5_FF(reg_cache, a, b, c, d, 0, S11, 0xd76aa478, rtmp1, rtmp2); 4554 md5_FF(reg_cache, d, a, b, c, 1, S12, 0xe8c7b756, rtmp1, rtmp2); 4555 reg_cache.gen_load(1, buf); 4556 md5_FF(reg_cache, c, d, a, b, 2, S13, 0x242070db, rtmp1, rtmp2); 4557 md5_FF(reg_cache, b, c, d, a, 3, S14, 0xc1bdceee, rtmp1, rtmp2); 4558 reg_cache.gen_load(2, buf); 4559 md5_FF(reg_cache, a, b, c, d, 4, S11, 0xf57c0faf, rtmp1, rtmp2); 4560 md5_FF(reg_cache, d, a, b, c, 5, S12, 0x4787c62a, rtmp1, rtmp2); 4561 reg_cache.gen_load(3, buf); 4562 md5_FF(reg_cache, c, d, a, b, 6, S13, 0xa8304613, rtmp1, rtmp2); 4563 md5_FF(reg_cache, b, c, d, a, 7, S14, 0xfd469501, rtmp1, rtmp2); 4564 reg_cache.gen_load(4, buf); 4565 md5_FF(reg_cache, a, b, c, d, 8, S11, 0x698098d8, rtmp1, rtmp2); 4566 md5_FF(reg_cache, d, a, b, c, 9, S12, 0x8b44f7af, rtmp1, rtmp2); 4567 reg_cache.gen_load(5, buf); 4568 md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2); 4569 md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2); 4570 reg_cache.gen_load(6, buf); 4571 md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2); 4572 md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2); 4573 reg_cache.gen_load(7, buf); 4574 md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2); 4575 md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2); 4576 4577 // Round 2 4578 md5_GG(reg_cache, a, b, c, d, 1, S21, 0xf61e2562, rtmp1, rtmp2); 4579 md5_GG(reg_cache, d, a, b, c, 6, S22, 0xc040b340, rtmp1, rtmp2); 4580 md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2); 4581 md5_GG(reg_cache, b, c, d, a, 0, S24, 0xe9b6c7aa, rtmp1, rtmp2); 4582 md5_GG(reg_cache, a, b, c, d, 5, S21, 0xd62f105d, rtmp1, rtmp2); 4583 md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2); 4584 md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2); 4585 md5_GG(reg_cache, b, c, d, a, 4, S24, 0xe7d3fbc8, rtmp1, rtmp2); 4586 md5_GG(reg_cache, a, b, c, d, 9, S21, 0x21e1cde6, rtmp1, rtmp2); 4587 md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2); 4588 md5_GG(reg_cache, c, d, a, b, 3, S23, 0xf4d50d87, rtmp1, rtmp2); 4589 md5_GG(reg_cache, b, c, d, a, 8, S24, 0x455a14ed, rtmp1, rtmp2); 4590 md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2); 4591 md5_GG(reg_cache, d, a, b, c, 2, S22, 0xfcefa3f8, rtmp1, rtmp2); 4592 md5_GG(reg_cache, c, d, a, b, 7, S23, 0x676f02d9, rtmp1, rtmp2); 4593 md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2); 4594 4595 // Round 3 4596 md5_HH(reg_cache, a, b, c, d, 5, S31, 0xfffa3942, rtmp1, rtmp2); 4597 md5_HH(reg_cache, d, a, b, c, 8, S32, 0x8771f681, rtmp1, rtmp2); 4598 md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2); 4599 md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2); 4600 md5_HH(reg_cache, a, b, c, d, 1, S31, 0xa4beea44, rtmp1, rtmp2); 4601 md5_HH(reg_cache, d, a, b, c, 4, S32, 0x4bdecfa9, rtmp1, rtmp2); 4602 md5_HH(reg_cache, c, d, a, b, 7, S33, 0xf6bb4b60, rtmp1, rtmp2); 4603 md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2); 4604 md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2); 4605 md5_HH(reg_cache, d, a, b, c, 0, S32, 
0xeaa127fa, rtmp1, rtmp2);
    md5_HH(reg_cache, c, d, a, b,  3, S33, 0xd4ef3085, rtmp1, rtmp2);
    md5_HH(reg_cache, b, c, d, a,  6, S34, 0x04881d05, rtmp1, rtmp2);
    md5_HH(reg_cache, a, b, c, d,  9, S31, 0xd9d4d039, rtmp1, rtmp2);
    md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
    md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
    md5_HH(reg_cache, b, c, d, a,  2, S34, 0xc4ac5665, rtmp1, rtmp2);

    // Round 4
    md5_II(reg_cache, a, b, c, d,  0, S41, 0xf4292244, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c,  7, S42, 0x432aff97, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a,  5, S44, 0xfc93a039, rtmp1, rtmp2);
    md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c,  3, S42, 0x8f0ccc92, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a,  1, S44, 0x85845dd1, rtmp1, rtmp2);
    md5_II(reg_cache, a, b, c, d,  8, S41, 0x6fa87e4f, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b,  6, S43, 0xa3014314, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
    md5_II(reg_cache, a, b, c, d,  4, S41, 0xf7537e82, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b,  2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a,  9, S44, 0xeb86d391, rtmp1, rtmp2);

    __ addw(state0, state0, a);
    __ addw(state1, state1, b);
    __ addw(state2, state2, c);
    __ addw(state3, state3, d);

    if (multi_block) {
      __ addi(buf, buf, 64);
      __ addi(ofs, ofs, 64);
      // if (ofs <= limit) goto md5_loop
      __ bge(limit, ofs, md5_loop);
      __ mv(c_rarg0, ofs); // return ofs
    }

    // To minimize the number of memory operations, write back the
    // 4 state 4-byte values in pairs, with a single sd
    __ mv(t0, mask32);
    __ andr(state0, state0, t0);
    __ slli(state1, state1, 32);
    __ orr(state0, state0, state1);
    __ sd(state0, Address(state));
    __ andr(state2, state2, t0);
    __ slli(state3, state3, 32);
    __ orr(state2, state2, state3);
    __ sd(state2, Address(state, 8));

    __ pop_reg(saved_regs, sp);
    __ ret();

    return (address) start;
  }

  /**
   * Perform the quarter round calculations on values contained within four vector registers.
   *
   * @param aVec   the SIMD register containing only the "a" values
   * @param bVec   the SIMD register containing only the "b" values
   * @param cVec   the SIMD register containing only the "c" values
   * @param dVec   the SIMD register containing only the "d" values
   * @param tmp_vr temporary vector register that holds intermediate values.
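   *
   * For reference, the scalar ChaCha20 quarter round (RFC 8439) that each
   * vector lane implements is:
   *   a += b; d ^= a; d <<<= 16;
   *   c += d; b ^= c; b <<<= 12;
   *   a += b; d ^= a; d <<<=  8;
   *   c += d; b ^= c; b <<<=  7;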
   */
  void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
                              VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
    // a += b, d ^= a, d <<<= 16
    __ vadd_vv(aVec, aVec, bVec);
    __ vxor_vv(dVec, dVec, aVec);
    __ vrole32_vi(dVec, 16, tmp_vr);

    // c += d, b ^= c, b <<<= 12
    __ vadd_vv(cVec, cVec, dVec);
    __ vxor_vv(bVec, bVec, cVec);
    __ vrole32_vi(bVec, 12, tmp_vr);

    // a += b, d ^= a, d <<<= 8
    __ vadd_vv(aVec, aVec, bVec);
    __ vxor_vv(dVec, dVec, aVec);
    __ vrole32_vi(dVec, 8, tmp_vr);

    // c += d, b ^= c, b <<<= 7
    __ vadd_vv(cVec, cVec, dVec);
    __ vxor_vv(bVec, bVec, cVec);
    __ vrole32_vi(bVec, 7, tmp_vr);
  }

  /**
   * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
   *
   *  Input arguments:
   *  c_rarg0   - state, the starting state
   *  c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function
   *
   *  Implementation Note:
   *   Parallelization is achieved by loading individual state elements into vectors for N blocks.
   *   N depends on single vector register length.
   */
  address generate_chacha20Block() {
    Label L_Rounds;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "chacha20Block");
    address start = __ pc();
    __ enter();

    const int states_len = 16;
    const int step = 4;
    const Register state = c_rarg0;
    const Register key_stream = c_rarg1;
    const Register tmp_addr = t0;
    const Register length = t1;

    // Organize vector registers in an array that facilitates
    // putting repetitive opcodes into loop structures below.
    const VectorRegister work_vrs[16] = {
      v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    };
    const VectorRegister tmp_vr = v16;
    const VectorRegister counter_vr = v17;

    {
      // Put 16 here, as com.sun.crypto.provider.ChaCha20Cipher.KS_MAX_LEN is 1024
      // at the Java level.
      __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
    }

    // Load from source state.
    // Every element in source state is duplicated to all elements in the corresponding vector.
    __ mv(tmp_addr, state);
    for (int i = 0; i < states_len; i += 1) {
      __ vlse32_v(work_vrs[i], tmp_addr, zr);
      __ addi(tmp_addr, tmp_addr, step);
    }
    // Adjust counter for every individual block.
    __ vid_v(counter_vr);
    __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);

    // Perform 10 iterations of the 8 quarter round set
    {
      const Register loop = t2; // share t2 with other non-overlapping usages.
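      // (ChaCha20 has 20 rounds in total: each iteration below performs one
      // column round and one diagonal round, i.e. 8 quarter rounds, hence
      // the 10 iterations.)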
4749 __ mv(loop, 10); 4750 __ BIND(L_Rounds); 4751 4752 chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8], work_vrs[12], tmp_vr); 4753 chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9], work_vrs[13], tmp_vr); 4754 chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr); 4755 chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr); 4756 4757 chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr); 4758 chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr); 4759 chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8], work_vrs[13], tmp_vr); 4760 chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9], work_vrs[14], tmp_vr); 4761 4762 __ sub(loop, loop, 1); 4763 __ bnez(loop, L_Rounds); 4764 } 4765 4766 // Add the original state into the end working state. 4767 // We do this by first duplicating every element in source state array to the corresponding 4768 // vector, then adding it to the post-loop working state. 4769 __ mv(tmp_addr, state); 4770 for (int i = 0; i < states_len; i += 1) { 4771 __ vlse32_v(tmp_vr, tmp_addr, zr); 4772 __ addi(tmp_addr, tmp_addr, step); 4773 __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr); 4774 } 4775 // Add the counter overlay onto work_vrs[12] at the end. 4776 __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr); 4777 4778 // Store result to key stream. 4779 { 4780 const Register stride = t2; // share t2 with other non-overlapping usages. 4781 // Every block occupies 64 bytes, so we use 64 as stride of the vector store. 4782 __ mv(stride, 64); 4783 for (int i = 0; i < states_len; i += 1) { 4784 __ vsse32_v(work_vrs[i], key_stream, stride); 4785 __ addi(key_stream, key_stream, step); 4786 } 4787 } 4788 4789 // Return length of output key_stream 4790 __ slli(c_rarg0, length, 6); 4791 4792 __ leave(); 4793 __ ret(); 4794 4795 return (address) start; 4796 } 4797 4798 4799 // ------------------------ SHA-1 intrinsic ------------------------ 4800 4801 // K't = 4802 // 5a827999, 0 <= t <= 19 4803 // 6ed9eba1, 20 <= t <= 39 4804 // 8f1bbcdc, 40 <= t <= 59 4805 // ca62c1d6, 60 <= t <= 79 4806 void sha1_prepare_k(Register cur_k, int round) { 4807 assert(round >= 0 && round < 80, "must be"); 4808 4809 static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6}; 4810 if ((round % 20) == 0) { 4811 __ mv(cur_k, ks[round/20]); 4812 } 4813 } 4814 4815 // W't = 4816 // M't, 0 <= t <= 15 4817 // ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79 4818 void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) { 4819 assert(round >= 0 && round < 80, "must be"); 4820 4821 if (round < 16) { 4822 // in the first 16 rounds, in ws[], every register contains 2 W't, e.g. 4823 // in ws[0], high part contains W't-0, low part contains W't-1, 4824 // in ws[1], high part contains W't-2, low part contains W't-3, 4825 // ... 4826 // in ws[7], high part contains W't-14, low part contains W't-15. 4827 4828 if ((round % 2) == 0) { 4829 __ ld(ws[round/2], Address(buf, (round/2) * 8)); 4830 // reverse bytes, as SHA-1 is defined in big-endian. 
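        // (A single revb on the 64-bit pair both byte-swaps each word and
        // leaves W't in the upper half and W't+1 in the lower half, matching
        // the ws[] layout described above.)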
4831 __ revb(ws[round/2], ws[round/2]); 4832 __ srli(cur_w, ws[round/2], 32); 4833 } else { 4834 __ mv(cur_w, ws[round/2]); 4835 } 4836 4837 return; 4838 } 4839 4840 if ((round % 2) == 0) { 4841 int idx = 16; 4842 // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79 4843 __ srli(t1, ws[(idx-8)/2], 32); 4844 __ xorr(t0, ws[(idx-3)/2], t1); 4845 4846 __ srli(t1, ws[(idx-14)/2], 32); 4847 __ srli(cur_w, ws[(idx-16)/2], 32); 4848 __ xorr(cur_w, cur_w, t1); 4849 4850 __ xorr(cur_w, cur_w, t0); 4851 __ rolw_imm(cur_w, cur_w, 1, t0); 4852 4853 // copy the cur_w value to ws[8]. 4854 // now, valid w't values are at: 4855 // w0: ws[0]'s lower 32 bits 4856 // w1 ~ w14: ws[1] ~ ws[7] 4857 // w15: ws[8]'s higher 32 bits 4858 __ slli(ws[idx/2], cur_w, 32); 4859 4860 return; 4861 } 4862 4863 int idx = 17; 4864 // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79 4865 __ srli(t1, ws[(idx-3)/2], 32); 4866 __ xorr(t0, t1, ws[(idx-8)/2]); 4867 4868 __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]); 4869 4870 __ xorr(cur_w, cur_w, t0); 4871 __ rolw_imm(cur_w, cur_w, 1, t0); 4872 4873 // copy the cur_w value to ws[8] 4874 __ zero_extend(cur_w, cur_w, 32); 4875 __ orr(ws[idx/2], ws[idx/2], cur_w); 4876 4877 // shift the w't registers, so they start from ws[0] again. 4878 // now, valid w't values are at: 4879 // w0 ~ w15: ws[0] ~ ws[7] 4880 Register ws_0 = ws[0]; 4881 for (int i = 0; i < 16/2; i++) { 4882 ws[i] = ws[i+1]; 4883 } 4884 ws[8] = ws_0; 4885 } 4886 4887 // f't(x, y, z) = 4888 // Ch(x, y, z) = (x & y) ^ (~x & z) , 0 <= t <= 19 4889 // Parity(x, y, z) = x ^ y ^ z , 20 <= t <= 39 4890 // Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) , 40 <= t <= 59 4891 // Parity(x, y, z) = x ^ y ^ z , 60 <= t <= 79 4892 void sha1_f(Register dst, Register x, Register y, Register z, int round) { 4893 assert(round >= 0 && round < 80, "must be"); 4894 assert_different_registers(dst, x, y, z, t0, t1); 4895 4896 if (round < 20) { 4897 // (x & y) ^ (~x & z) 4898 __ andr(t0, x, y); 4899 __ andn(dst, z, x); 4900 __ xorr(dst, dst, t0); 4901 } else if (round >= 40 && round < 60) { 4902 // (x & y) ^ (x & z) ^ (y & z) 4903 __ andr(t0, x, y); 4904 __ andr(t1, x, z); 4905 __ andr(dst, y, z); 4906 __ xorr(dst, dst, t0); 4907 __ xorr(dst, dst, t1); 4908 } else { 4909 // x ^ y ^ z 4910 __ xorr(dst, x, y); 4911 __ xorr(dst, dst, z); 4912 } 4913 } 4914 4915 // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't 4916 // e = d 4917 // d = c 4918 // c = ROTL'30(b) 4919 // b = a 4920 // a = T 4921 void sha1_process_round(Register a, Register b, Register c, Register d, Register e, 4922 Register cur_k, Register cur_w, Register tmp, int round) { 4923 assert(round >= 0 && round < 80, "must be"); 4924 assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0); 4925 4926 // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't 4927 4928 // cur_w will be recalculated at the beginning of each round, 4929 // so, we can reuse it as a temp register here. 
    Register tmp2 = cur_w;

    // reuse e as a temporary register, as we will mv a new value into it later
    Register tmp3 = e;
    __ add(tmp2, cur_k, tmp2);
    __ add(tmp3, tmp3, tmp2);
    __ rolw_imm(tmp2, a, 5, t0);

    sha1_f(tmp, b, c, d, round);

    __ add(tmp2, tmp2, tmp);
    __ add(tmp2, tmp2, tmp3);

    // e = d
    // d = c
    // c = ROTL'30(b)
    // b = a
    // a = T
    __ mv(e, d);
    __ mv(d, c);

    __ rolw_imm(c, b, 30);
    __ mv(b, a);
    __ mv(a, tmp2);
  }

  // H(i)0 = a + H(i-1)0
  // H(i)1 = b + H(i-1)1
  // H(i)2 = c + H(i-1)2
  // H(i)3 = d + H(i-1)3
  // H(i)4 = e + H(i-1)4
  void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
                              Register prev_ab, Register prev_cd, Register prev_e) {
    assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);

    __ add(a, a, prev_ab);
    __ srli(prev_ab, prev_ab, 32);
    __ add(b, b, prev_ab);

    __ add(c, c, prev_cd);
    __ srli(prev_cd, prev_cd, 32);
    __ add(d, d, prev_cd);

    __ add(e, e, prev_e);
  }

  void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
                                Register prev_ab, Register prev_cd, Register prev_e) {
    assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);

    __ slli(t0, b, 32);
    __ zero_extend(prev_ab, a, 32);
    __ orr(prev_ab, prev_ab, t0);

    __ slli(t0, d, 32);
    __ zero_extend(prev_cd, c, 32);
    __ orr(prev_cd, prev_cd, t0);

    __ mv(prev_e, e);
  }

  // Intrinsic for:
  //   void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
  //   void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0: byte[]  src array + offset
  //   c_rarg1: int[]   SHA.state
  //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
  //   c_rarg2: int     offset
  //   c_rarg3: int     limit
  //
  // Outputs:
  //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
  //   c_rarg0: int offset, when (multi_block == true)
  //
  address generate_sha1_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);

    address start = __ pc();
    __ enter();

    RegSet saved_regs = RegSet::range(x18, x27);
    if (multi_block) {
      // use x9 as src below.
      saved_regs += RegSet::of(x9);
    }
    __ push_reg(saved_regs, sp);

    // c_rarg0 - c_rarg3: x10 - x13
    Register buf    = c_rarg0;
    Register state  = c_rarg1;
    Register offset = c_rarg2;
    Register limit  = c_rarg3;
    // use src to contain the original start point of the array.
    Register src    = x9;

    if (multi_block) {
      __ sub(limit, limit, offset);
      __ add(limit, limit, buf);
      __ sub(src, buf, offset);
    }

    // [args-reg]:  x14 - x17
    // [temp-reg]:  x28 - x31
    // [saved-reg]: x18 - x27

    // h0/1/2/3/4
    const Register a = x14, b = x15, c = x16, d = x17, e = x28;
    // w0, w1, ... w15
    // put two adjacent w's in one register:
    //   one at the high 32 bits, another at the low 32 bits
    // in different rounds (even or odd), the W't values reside in different slots of ws[]:
    // w0 ~ w15, either reside in
    //   ws[0] ~ ws[7], where
    //     w0 at higher 32 bits of ws[0],
    //     w1 at lower 32 bits of ws[0],
    //     ...
    //     w14 at higher 32 bits of ws[7],
    //     w15 at lower 32 bits of ws[7].
    // or, reside in
    //   w0:        ws[0]'s lower 32 bits
    //   w1 ~ w14:  ws[1] ~ ws[7]
    //   w15:       ws[8]'s higher 32 bits
    Register ws[9] = {x29, x30, x31, x18,
                      x19, x20, x21, x22,
                      x23}; // auxiliary register for calculating w's value
    // current k't's value
    const Register cur_k = x24;
    // current w't's value
    const Register cur_w = x25;
    // values of a, b, c, d, e in the previous round
    const Register prev_ab = x26, prev_cd = x27;
    const Register prev_e = offset; // reuse offset/c_rarg2

    // load 5 words state into a, b, c, d, e.
    //
    // To minimize the number of memory operations, we apply the following
    // optimization: read the states (a/b/c/d) of 4-byte values in pairs,
    // with a single ld, and split each pair into 2 registers.
    //
    // Since the core SHA-1 algorithm works on 32-bit words, the code below
    // does not care about the content of the upper 32 bits in a/b/c/d/e.
    // Based on this observation, we simply leave the upper 32 bits of
    // a/c/e untouched, rather than zeroing them explicitly with extra
    // instructions.
    __ ld(a, Address(state, 0));
    __ srli(b, a, 32);
    __ ld(c, Address(state, 8));
    __ srli(d, c, 32);
    __ lw(e, Address(state, 16));

    Label L_sha1_loop;
    if (multi_block) {
      __ BIND(L_sha1_loop);
    }

    sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);

    for (int round = 0; round < 80; round++) {
      // prepare K't value
      sha1_prepare_k(cur_k, round);

      // prepare W't value
      sha1_prepare_w(cur_w, ws, buf, round);

      // one round process
      sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
    }

    // compute the intermediate hash value
    sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);

    if (multi_block) {
      int64_t block_bytes = 16 * 4;
      __ addi(buf, buf, block_bytes);

      __ bge(limit, buf, L_sha1_loop, true);
    }

    // store back the state.
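    // (a/b and c/d are re-packed into 64-bit pairs so that each sd writes
    // two 32-bit state words at once, mirroring the paired ld at entry.)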
    // store back the state.
    __ zero_extend(a, a, 32);
    __ slli(b, b, 32);
    __ orr(a, a, b);
    __ sd(a, Address(state, 0));
    __ zero_extend(c, c, 32);
    __ slli(d, d, 32);
    __ orr(c, c, d);
    __ sd(c, Address(state, 8));
    __ sw(e, Address(state, 16));

    // return offset
    if (multi_block) {
      __ sub(c_rarg0, buf, src);
    }

    __ pop_reg(saved_regs, sp);

    __ leave();
    __ ret();

    return (address) start;
  }

  /**
   * vector registers:
   *   input VectorRegisters: inputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
   *   index VectorRegisters: idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
   *   output VectorRegisters: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
   *
   * NOTE: each field will occupy a vector register group
   */
  void base64_vector_encode_round(Register src, Register dst, Register codec,
                                  Register size, Register stepSrc, Register stepDst,
                                  VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
                                  VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
                                  VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
                                  Assembler::LMUL lmul) {
    // set vector register type/len
    __ vsetvli(x0, size, Assembler::e8, lmul);

    // segmented load src into v registers: mem(src) => vr(3)
    __ vlseg3e8_v(inputV1, src);

    // src = src + register_group_len_bytes * 3
    __ add(src, src, stepSrc);

    // encoding
    //   1. compute index into lookup table: vr(3) => vr(4)
    __ vsrl_vi(idxV1, inputV1, 2);

    __ vsrl_vi(idxV2, inputV2, 2);
    __ vsll_vi(inputV1, inputV1, 6);
    __ vor_vv(idxV2, idxV2, inputV1);
    __ vsrl_vi(idxV2, idxV2, 2);

    __ vsrl_vi(idxV3, inputV3, 4);
    __ vsll_vi(inputV2, inputV2, 4);
    __ vor_vv(idxV3, inputV2, idxV3);
    __ vsrl_vi(idxV3, idxV3, 2);

    __ vsll_vi(idxV4, inputV3, 2);
    __ vsrl_vi(idxV4, idxV4, 2);
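
    // With 8-bit elements every shift above implicitly truncates to a byte,
    // so per element group the four 6-bit indexes come out as (a rough
    // sketch of the bit math, illustrative only; b0..b2 are the three
    // plain bytes):
    //
    //   idx1 = b0 >> 2;                        // b0[7:2]
    //   idx2 = ((b0 & 0x3) << 4) | (b1 >> 4);  // b0[1:0] : b1[7:4]
    //   idx3 = ((b1 & 0xf) << 2) | (b2 >> 6);  // b1[3:0] : b2[7:6]
    //   idx4 = b2 & 0x3f;                      // b2[5:0]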
    //   2. indexed load: vr(4) => vr(4)
    __ vluxei8_v(outputV1, codec, idxV1);
    __ vluxei8_v(outputV2, codec, idxV2);
    __ vluxei8_v(outputV3, codec, idxV3);
    __ vluxei8_v(outputV4, codec, idxV4);

    // segmented store encoded data in v registers back to dst: vr(4) => mem(dst)
    __ vsseg4e8_v(outputV1, dst);

    // dst = dst + register_group_len_bytes * 4
    __ add(dst, dst, stepDst);
  }

  /**
   * void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
   *
   * Input arguments:
   *   c_rarg0 - src, source array
   *   c_rarg1 - sp, src start offset
   *   c_rarg2 - sl, src end offset
   *   c_rarg3 - dst, dest array
   *   c_rarg4 - dp, dst start offset
   *   c_rarg5 - isURL, Base64 or URL character set
   */
  address generate_base64_encodeBlock() {
    alignas(64) static const char toBase64[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
    };

    alignas(64) static const char toBase64URL[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "encodeBlock");
    address start = __ pc();
    __ enter();

    Register src   = c_rarg0;
    Register soff  = c_rarg1;
    Register send  = c_rarg2;
    Register dst   = c_rarg3;
    Register doff  = c_rarg4;
    Register isURL = c_rarg5;

    Register codec  = c_rarg6;
    Register length = c_rarg7; // total length of src data in bytes

    Label ProcessData, Exit;

    // the length (send - soff) is a multiple of 3
    __ sub(length, send, soff);
    // real src/dst to process data
    __ add(src, src, soff);
    __ add(dst, dst, doff);

    // load the codec base address
    __ la(codec, ExternalAddress((address) toBase64));
    __ beqz(isURL, ProcessData);
    __ la(codec, ExternalAddress((address) toBase64URL));
    __ BIND(ProcessData);

    // vector version
    if (UseRVV) {
      Label ProcessM2, ProcessM1, ProcessScalar;

      Register size      = soff;
      Register stepSrcM1 = send;
      Register stepSrcM2 = doff;
      Register stepDst   = isURL;

      __ mv(size, MaxVectorSize * 2);
      __ mv(stepSrcM1, MaxVectorSize * 3);
      __ slli(stepSrcM2, stepSrcM1, 1);
      __ mv(stepDst, MaxVectorSize * 2 * 4);

      __ blt(length, stepSrcM2, ProcessM1);

      __ BIND(ProcessM2);
      base64_vector_encode_round(src, dst, codec,
                                 size, stepSrcM2, stepDst,
                                 v2, v4, v6,         // inputs
                                 v8, v10, v12, v14,  // indexes
                                 v16, v18, v20, v22, // outputs
                                 Assembler::m2);

      __ sub(length, length, stepSrcM2);
      __ bge(length, stepSrcM2, ProcessM2);
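
      // The step sizes follow from the element counts: a pass over `size`
      // elements consumes 3 * size source bytes and produces 4 * size
      // encoded bytes. For example, assuming MaxVectorSize == 16, the m2
      // pass above handles size = 32 element groups, i.e. 96 source bytes
      // in and 128 encoded bytes out per iteration.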
      __ BIND(ProcessM1);
      __ blt(length, stepSrcM1, ProcessScalar);

      __ srli(size, size, 1);
      __ srli(stepDst, stepDst, 1);
      base64_vector_encode_round(src, dst, codec,
                                 size, stepSrcM1, stepDst,
                                 v1, v2, v3,       // inputs
                                 v4, v5, v6, v7,   // indexes
                                 v8, v9, v10, v11, // outputs
                                 Assembler::m1);
      __ sub(length, length, stepSrcM1);

      __ BIND(ProcessScalar);
    }

    // scalar version
    {
      Register byte1 = soff, byte0 = send, byte2 = doff;
      Register combined24Bits = isURL;

      __ beqz(length, Exit);

      Label ScalarLoop;
      __ BIND(ScalarLoop);
      {
        // plain:   [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
        // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]

        // load 3 bytes src data
        __ lbu(byte0, Address(src, 0));
        __ lbu(byte1, Address(src, 1));
        __ lbu(byte2, Address(src, 2));
        __ addi(src, src, 3);

        // construct 24 bits from 3 bytes
        __ slliw(byte0, byte0, 16);
        __ slliw(byte1, byte1, 8);
        __ orr(combined24Bits, byte0, byte1);
        __ orr(combined24Bits, combined24Bits, byte2);

        // get codec index and encode (i.e. load from codec by index)
        __ slliw(byte0, combined24Bits, 8);
        __ srliw(byte0, byte0, 26);
        __ add(byte0, codec, byte0);
        __ lbu(byte0, Address(byte0, 0));

        __ slliw(byte1, combined24Bits, 14);
        __ srliw(byte1, byte1, 26);
        __ add(byte1, codec, byte1);
        __ lbu(byte1, Address(byte1, 0));

        __ slliw(byte2, combined24Bits, 20);
        __ srliw(byte2, byte2, 26);
        __ add(byte2, codec, byte2);
        __ lbu(byte2, Address(byte2, 0));

        __ andi(combined24Bits, combined24Bits, 0x3f);
        __ add(combined24Bits, codec, combined24Bits);
        __ lbu(combined24Bits, Address(combined24Bits, 0));

        // store 4 bytes encoded data
        __ sb(byte0, Address(dst, 0));
        __ sb(byte1, Address(dst, 1));
        __ sb(byte2, Address(dst, 2));
        __ sb(combined24Bits, Address(dst, 3));

        __ sub(length, length, 3);
        __ addi(dst, dst, 4);
        // loop back
        __ bnez(length, ScalarLoop);
      }
    }

    __ BIND(Exit);

    __ leave();
    __ ret();

    return (address) start;
  }

  /**
   * vector registers:
   *   input VectorRegisters: inputV1-V4, for m2 they could be v2, v4, v6, v8, for m1 they could be v1, v2, v3, v4
   *   index VectorRegisters: idxV1-V4, for m2 they could be v10, v12, v14, v16, for m1 they could be v5, v6, v7, v8
   *   output VectorRegisters: outputV1-V3, for m2 they could be v18, v20, v22, for m1 they could be v9, v10, v11
   *
   * NOTE: each field will occupy a single vector register group
   */
  void base64_vector_decode_round(Register src, Register dst, Register codec,
                                  Register size, Register stepSrc, Register stepDst, Register failedIdx, Register minusOne,
                                  VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
                                  VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
                                  VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
                                  Assembler::LMUL lmul) {
    // set vector register type/len
    __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);

    // segmented load src into v registers: mem(src) => vr(4)
    __ vlseg4e8_v(inputV1, src);

    // src = src + register_group_len_bytes * 4
    __ add(src, src, stepSrc);

    // decoding
    //   1. indexed load: vr(4) => vr(4)
    __ vluxei8_v(idxV1, codec, inputV1);
    __ vluxei8_v(idxV2, codec, inputV2);
    __ vluxei8_v(idxV3, codec, inputV3);
    __ vluxei8_v(idxV4, codec, inputV4);
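
    // Both decoding tables map every byte that is not part of the Base64
    // alphabet to 255 (0xff, i.e. -1 as a signed byte). A single compare
    // of the OR of all four lookup results against -1 is therefore enough
    // to detect any invalid input byte, which is what the check below does.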
    //   2. check for invalid data
    __ vor_vv(outputV1, idxV1, idxV2);
    __ vor_vv(outputV2, idxV3, idxV4);
    __ vor_vv(outputV1, outputV1, outputV2);
    __ vmseq_vi(v0, outputV1, -1);
    __ vfirst_m(failedIdx, v0);
    Label NoFailure;
    __ beq(failedIdx, minusOne, NoFailure);
    __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
    __ slli(stepDst, failedIdx, 1);
    __ add(stepDst, failedIdx, stepDst);
    __ BIND(NoFailure);

    //   3. compute the decoded data: vr(4) => vr(3)
    __ vsll_vi(idxV1, idxV1, 2);
    __ vsrl_vi(outputV1, idxV2, 4);
    __ vor_vv(outputV1, outputV1, idxV1);

    __ vsll_vi(idxV2, idxV2, 4);
    __ vsrl_vi(outputV2, idxV3, 2);
    __ vor_vv(outputV2, outputV2, idxV2);

    __ vsll_vi(idxV3, idxV3, 6);
    __ vor_vv(outputV3, idxV4, idxV3);

    // segmented store decoded data in v registers back to dst: vr(3) => mem(dst)
    __ vsseg3e8_v(outputV1, dst);

    // dst = dst + register_group_len_bytes * 3
    __ add(dst, dst, stepDst);
  }
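
  // On a failure, vfirst_m above yields the element index of the first
  // invalid character, the effective vector length is shrunk to that index,
  // and stepDst becomes failedIdx * 3 (computed as failedIdx +
  // (failedIdx << 1)). For example (illustrative only), a failure at element
  // index 10 stores the 30 bytes decoded before it and advances dst by 30,
  // so the partial progress is reflected in the value returned to Java.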
  /**
   * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
   *
   * Input arguments:
   *   c_rarg0 - src, source array
   *   c_rarg1 - sp, src start offset
   *   c_rarg2 - sl, src end offset
   *   c_rarg3 - dst, dest array
   *   c_rarg4 - dp, dst start offset
   *   c_rarg5 - isURL, Base64 or URL character set
   *   c_rarg6 - isMIME, Decoding MIME block
   */
  address generate_base64_decodeBlock() {

    static const uint8_t fromBase64[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
       52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
       15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
      255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
       41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    static const uint8_t fromBase64URL[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
       52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
       15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
      255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
       41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "decodeBlock");
    address start = __ pc();
    __ enter();

    Register src    = c_rarg0;
    Register soff   = c_rarg1;
    Register send   = c_rarg2;
    Register dst    = c_rarg3;
    Register doff   = c_rarg4;
    Register isURL  = c_rarg5;
    Register isMIME = c_rarg6;

    Register codec     = c_rarg7;
    Register dstBackup = x31;
    Register length    = x28; // t3, total length of src data in bytes

    Label ProcessData, Exit;
    Label ProcessScalar, ScalarLoop;

    // The passed-in length (send - soff) is guaranteed to be > 4, and this
    // intrinsic only processes data whose length is a multiple of 4. The
    // Java level does not guarantee that, so truncate the length explicitly.
    __ sub(length, send, soff);
    __ andi(length, length, -4);
    // real src/dst to process data
    __ add(src, src, soff);
    __ add(dst, dst, doff);
    // backup of dst, used to calculate the return value at exit
    __ mv(dstBackup, dst);

    // load the codec base address
    __ la(codec, ExternalAddress((address) fromBase64));
    __ beqz(isURL, ProcessData);
    __ la(codec, ExternalAddress((address) fromBase64URL));
    __ BIND(ProcessData);
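
    // A note on the truncation above: andi with -4 clears the two lowest
    // bits, e.g. (illustrative only) a length of 23 becomes 23 & ~3 == 20,
    // so the loops below always consume whole 4-byte groups.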
    // vector version
    if (UseRVV) {
      // The MIME case has a default line-length limit of 76, which can be
      // different from (i.e. smaller than) (send - soff), so for MIME we
      // go through the scalar code path directly.
      __ bnez(isMIME, ScalarLoop);

      Label ProcessM1, ProcessM2;

      Register failedIdx = soff;
      Register stepSrcM1 = send;
      Register stepSrcM2 = doff;
      Register stepDst   = isURL;
      Register size      = x29; // t4
      Register minusOne  = x30; // t5

      __ mv(minusOne, -1);
      __ mv(size, MaxVectorSize * 2);
      __ mv(stepSrcM1, MaxVectorSize * 4);
      __ slli(stepSrcM2, stepSrcM1, 1);
      __ mv(stepDst, MaxVectorSize * 2 * 3);

      __ blt(length, stepSrcM2, ProcessM1);

      // Assembler::m2
      __ BIND(ProcessM2);
      base64_vector_decode_round(src, dst, codec,
                                 size, stepSrcM2, stepDst, failedIdx, minusOne,
                                 v2, v4, v6, v8,     // inputs
                                 v10, v12, v14, v16, // indexes
                                 v18, v20, v22,      // outputs
                                 Assembler::m2);
      __ sub(length, length, stepSrcM2);

      // error check
      __ bne(failedIdx, minusOne, Exit);

      __ bge(length, stepSrcM2, ProcessM2);

      // Assembler::m1
      __ BIND(ProcessM1);
      __ blt(length, stepSrcM1, ProcessScalar);

      __ srli(size, size, 1);
      __ srli(stepDst, stepDst, 1);
      base64_vector_decode_round(src, dst, codec,
                                 size, stepSrcM1, stepDst, failedIdx, minusOne,
                                 v1, v2, v3, v4, // inputs
                                 v5, v6, v7, v8, // indexes
                                 v9, v10, v11,   // outputs
                                 Assembler::m1);
      __ sub(length, length, stepSrcM1);

      // error check
      __ bne(failedIdx, minusOne, Exit);

      __ BIND(ProcessScalar);
      __ beqz(length, Exit);
    }

    // scalar version
    {
      Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
      Register combined32Bits = x29; // t4

      // encoded: [byte0[5:0] : byte1[5:0] : byte2[5:0] : byte3[5:0]] =>
      // plain:   [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
      __ BIND(ScalarLoop);

      // load 4 bytes encoded src data
      __ lbu(byte0, Address(src, 0));
      __ lbu(byte1, Address(src, 1));
      __ lbu(byte2, Address(src, 2));
      __ lbu(byte3, Address(src, 3));
      __ addi(src, src, 4);

      // get codec index and decode (i.e. load from codec by index)
      __ add(byte0, codec, byte0);
      __ add(byte1, codec, byte1);
      __ lb(byte0, Address(byte0, 0));
      __ lb(byte1, Address(byte1, 0));
      __ add(byte2, codec, byte2);
      __ add(byte3, codec, byte3);
      __ lb(byte2, Address(byte2, 0));
      __ lb(byte3, Address(byte3, 0));
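
      // The loads above use lb (sign-extending), so an invalid character,
      // mapped to 255 by the table, arrives here as -1. A rough sketch of
      // why the single bltz check below catches it (illustrative only):
      // valid 6-bit values (0..63) are shifted left by at most 18 bits, so
      // they can never reach the sign bit (18 + 6 == 24 < 32), while a -1
      // value keeps the sign bit set through slliw and orr, making
      // combined32Bits negative.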
      __ slliw(byte0, byte0, 18);
      __ slliw(byte1, byte1, 12);
      __ orr(byte0, byte0, byte1);
      __ orr(byte0, byte0, byte3);
      __ slliw(byte2, byte2, 6);
      // For performance reasons, `combined32Bits` is constructed for two
      // purposes at the same time:
      //   1. the error check below
      //   2. the decode below
      __ orr(combined32Bits, byte0, byte2);

      // error check
      __ bltz(combined32Bits, Exit);

      // store 3 bytes decoded data
      __ sraiw(byte0, combined32Bits, 16);
      __ sraiw(byte1, combined32Bits, 8);
      __ sb(byte0, Address(dst, 0));
      __ sb(byte1, Address(dst, 1));
      __ sb(combined32Bits, Address(dst, 2));

      __ sub(length, length, 4);
      __ addi(dst, dst, 3);
      // loop back
      __ bnez(length, ScalarLoop);
    }

    __ BIND(Exit);
    __ sub(c_rarg0, dst, dstBackup);

    __ leave();
    __ ret();

    return (address) start;
  }

  void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
                             VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
                             Register temp0, Register temp1, Register temp2, Register temp3,
                             VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {

    assert((lmul == Assembler::m4 && step == 64) ||
           (lmul == Assembler::m2 && step == 32) ||
           (lmul == Assembler::m1 && step == 16),
           "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
    // The function below calculates the Adler32 checksum with a 64-, 32- or
    // 16-byte step; LMUL = m4, m2 or m1 is used accordingly.
    // The results are in v12, v13, ..., v22, v23. The example below is for
    // the 64-byte step case. We use b1, b2, ..., b64 to denote the 64 bytes
    // loaded in each iteration.
    // In non-vectorized code, we update s1 and s2 as:
    //   s1 <- s1 + b1
    //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + s1
    //   ...
    //   s1 <- s1 + b64
    //   s2 <- s2 + s1
    // Putting the above assignments together, we have:
    //   s1_new = s1 + b1 + b2 + ... + b64
    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
    //          = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
    //          = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)

    __ mv(temp3, step);
    // Load data
    __ vsetvli(temp0, temp3, Assembler::e8, lmul);
    __ vle8_v(vbytes, buff);
    __ addi(buff, buff, step);

    // Upper bound of the reduction sum for s1_new:
    // 0xFF * 64 = 0x3FC0, so:
    // 1. Need to do a vector-widening reduction sum
    // 2. It is safe to perform sign-extension during vmv.x.s with 16-bit elements
    __ vwredsumu_vs(vs1acc, vbytes, vzero);
    // Multiplication for s2_new
    __ vwmulu_vv(vs2acc, vtable, vbytes);

    // s2 = s2 + s1 * step (via a left shift by exact_log2(step))
    __ slli(temp1, s1, exact_log2(step));
    __ add(s2, s2, temp1);

    // Summing up the calculated results for s2_new
    if (MaxVectorSize > 16) {
      __ vsetvli(temp0, temp3, Assembler::e16, lmul);
    } else {
      // Half of the vector-widening multiplication result is in the successor
      // of the vs2acc group for vlen == 16, in which case we need to double
      // the vector register group width in order to reduction-sum all of it
      Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
                               (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
      __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
    }
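
    // As a small concrete instance of the identity derived at the top of
    // this function (illustrative only), for step == 4 and bytes b1..b4:
    //   s1_new = s1 + b1 + b2 + b3 + b4
    //   s2_new = s2 + 4 * s1 + (4*b1 + 3*b2 + 2*b3 + 1*b4)
    // which is exactly what the vtable coefficients { step, step-1, ..., 1 }
    // compute via the widening multiply, summed by the reduction below.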
    // Upper bound of the reduction sum:
    // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for the whole register group, so:
    // 1. Need to do a vector-widening reduction sum
    // 2. It is safe to perform sign-extension during vmv.x.s with 32-bit elements
    __ vwredsumu_vs(vtemp1, vs2acc, vzero);

    // Extracting results for:
    // s1_new
    __ vmv_x_s(temp0, vs1acc);
    __ add(s1, s1, temp0);
    // s2_new
    __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
    __ vmv_x_s(temp1, vtemp1);
    __ add(s2, s2, temp1);
  }

  /**
   * int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
   *
   * Arguments:
   *
   * Inputs:
   *   c_rarg0 - int adler
   *   c_rarg1 - byte* buff (b + off)
   *   c_rarg2 - int len
   *
   * Output:
   *   c_rarg0 - int adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
          L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    Register adler = c_rarg0;
    Register s1    = c_rarg0;
    Register s2    = c_rarg3;
    Register buff  = c_rarg1;
    Register len   = c_rarg2;
    Register nmax  = c_rarg4;
    Register base  = c_rarg5;
    Register count = c_rarg6;
    Register temp0 = x28; // t3
    Register temp1 = x29; // t4
    Register temp2 = x30; // t5
    Register temp3 = x31; // t6

    VectorRegister vzero     = v31;
    VectorRegister vbytes    = v8;  // group: v8, v9, v10, v11
    VectorRegister vs1acc    = v12; // group: v12, v13, v14, v15
    VectorRegister vs2acc    = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
    VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
    VectorRegister vtable_32 = v4;  // group: v4, v5
    VectorRegister vtable_16 = v30;
    VectorRegister vtemp1    = v28;
    VectorRegister vtemp2    = v29;

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that
    // 255 * n * (n + 1) / 2 + (n + 1) * (BASE - 1) <= 2^32 - 1
    const uint64_t BASE = 0xfff1;
    const uint64_t NMAX = 0x15B0;

    // Loop steps
    int step_64 = 64;
    int step_32 = 32;
    int step_16 = 16;
    int step_1  = 1;

    __ enter(); // Required for proper stackwalking of RuntimeStub frame
    __ mv(temp1, 64);
    __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);

    // Generating accumulation coefficients for further calculations
    // vtable_64:
    __ vid_v(vtemp1);
    __ vrsub_vx(vtable_64, vtemp1, temp1);
    // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }

    // vtable_32:
    __ mv(temp1, 32);
    __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
    __ vid_v(vtemp1);
    __ vrsub_vx(vtable_32, vtemp1, temp1);
    // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }

    __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
    // vtable_16:
    __ mv(temp1, 16);
    __ vid_v(vtemp1);
    __ vrsub_vx(vtable_16, vtemp1, temp1);
    // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }

    __ vmv_v_i(vzero, 0);

    __ mv(base, BASE);
    __ mv(nmax, NMAX);

    // s1 is initialized to the lower 16 bits of adler
    // s2 is initialized to the upper 16 bits of adler
    __ srliw(s2, adler, 16);       // s2 = ((adler >> 16) & 0xffff)
    __ zero_extend(s1, adler, 16); // s1 = (adler & 0xffff)
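
    // A worked check of the NMAX bound (illustrative only): for n == 5552,
    //   255 * n * (n + 1) / 2 + (n + 1) * (BASE - 1)
    //     == 3930857640 + 363832560 == 4294690200 <= 2^32 - 1,
    // while n == 5553 already exceeds 2^32 - 1, so up to 5552 bytes of 0xFF
    // can be accumulated before the modulo operation becomes necessary.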
    // The pipelined loop needs at least 16 elements for one iteration; it
    // would check this itself, but it is more efficient to skip to the
    // cleanup loop up front
    __ mv(temp0, step_16);
    __ bgeu(len, temp0, L_nmax);
    __ beqz(len, L_combine);

    // Jumping to L_by1_loop
    __ sub(len, len, step_1);
    __ j(L_by1_loop);

    __ bind(L_nmax);
    __ sub(len, len, nmax);
    __ sub(count, nmax, 16);
    __ bltz(len, L_by16);

    // Align L_nmax loop by 64
    __ bind(L_nmax_loop_entry);
    __ sub(count, count, 32);

    __ bind(L_nmax_loop);
    adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
                          vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
                          vtemp1, vtemp2, step_64, Assembler::m4);
    __ sub(count, count, step_64);
    __ bgtz(count, L_nmax_loop);

    // 48 bytes remain: process them as one 32-byte and one 16-byte step
    adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
                          vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
                          vtemp1, vtemp2, step_32, Assembler::m2);
    adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
                          vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
                          vtemp1, vtemp2, step_16, Assembler::m1);

    // s1 = s1 % BASE
    __ remuw(s1, s1, base);
    // s2 = s2 % BASE
    __ remuw(s2, s2, base);

    __ sub(len, len, nmax);
    __ sub(count, nmax, 16);
    __ bgez(len, L_nmax_loop_entry);

    __ bind(L_by16);
    __ add(len, len, count);
    __ bltz(len, L_by1);
    // Trying to unroll
    __ mv(temp3, step_64);
    __ blt(len, temp3, L_by16_loop);

    __ bind(L_by16_loop_unroll);
    adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
                          vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
                          vtemp1, vtemp2, step_64, Assembler::m4);
    __ sub(len, len, step_64);
    // temp3 still holds 64 here
    __ bge(len, temp3, L_by16_loop_unroll);

    __ bind(L_by16_loop);
    adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
                          vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
                          vtemp1, vtemp2, step_16, Assembler::m1);
    __ sub(len, len, step_16);
    __ bgez(len, L_by16_loop);

    __ bind(L_by1);
    __ add(len, len, 15);
    __ bltz(len, L_do_mod);

    __ bind(L_by1_loop);
    __ lbu(temp0, Address(buff, 0));
    __ addi(buff, buff, step_1);
    __ add(s1, temp0, s1);
    __ add(s2, s2, s1);
    __ sub(len, len, step_1);
    __ bgez(len, L_by1_loop);

    __ bind(L_do_mod);
    // s1 = s1 % BASE
    __ remuw(s1, s1, base);
    // s2 = s2 % BASE
    __ remuw(s2, s2, base);

    // Combine lower bits and higher bits
    // adler = s1 | (s2 << 16)
    __ bind(L_combine);
    __ slli(s2, s2, 16);
    __ orr(s1, s1, s2);

    __ leave(); // Required for proper stackwalking of RuntimeStub frame
    __ ret();

    return start;
  }

#endif // COMPILER2_OR_JVMCI

#ifdef COMPILER2

  static const int64_t right_2_bits = right_n_bits(2);
  static const int64_t right_3_bits = right_n_bits(3);
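
  // In the representation used below, a 130-bit integer x is carried as
  //   x = limb0 + limb1 * 2^26 + limb2 * 2^52 + limb3 * 2^78 + limb4 * 2^104
  // with each long holding 26 significant bits. As a small example
  // (illustrative only), x == 1 << 27 is stored as { 0, 2, 0, 0, 0 },
  // since 2 * 2^26 == 2^27.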
  // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
  // are represented as long[5], with BITS_PER_LIMB = 26.
  // Pack five 26-bit limbs into three 64-bit registers.
  void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) {
    assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2);

    // The goal is to have the 130-bit value in dest2:dest1:dest0
    __ ld(dest0, Address(src, 0)); // 26 bits in dest0

    __ ld(tmp1, Address(src, sizeof(jlong)));
    __ slli(tmp1, tmp1, 26);
    __ add(dest0, dest0, tmp1); // 52 bits in dest0

    __ ld(tmp2, Address(src, 2 * sizeof(jlong)));
    __ slli(tmp1, tmp2, 52);
    __ add(dest0, dest0, tmp1); // dest0 is full

    __ srli(dest1, tmp2, 12); // 14 bits in dest1

    __ ld(tmp1, Address(src, 3 * sizeof(jlong)));
    __ slli(tmp1, tmp1, 14);
    __ add(dest1, dest1, tmp1); // 40 bits in dest1

    __ ld(tmp1, Address(src, 4 * sizeof(jlong)));
    __ slli(tmp2, tmp1, 40);
    __ add(dest1, dest1, tmp2); // dest1 is full

    if (dest2->is_valid()) {
      __ srli(tmp1, tmp1, 24);
      __ mv(dest2, tmp1); // 2 bits in dest2
    } else {
#ifdef ASSERT
      Label OK;
      __ srli(tmp1, tmp1, 24);
      __ beq(zr, tmp1, OK); // 2 bits
      __ stop("high bits of Poly1305 integer should be zero");
      __ should_not_reach_here();
      __ bind(OK);
#endif
    }
  }

  // As above, but return only a 128-bit integer, packed into two
  // 64-bit registers.
  void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) {
    poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2);
  }

  // U_2:U_1:U_0 += (U_2 >> 2) * 5
  void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) {
    assert_different_registers(U_2, U_1, U_0, tmp1, tmp2);

    // First, U_2:U_1:U_0 += (U_2 >> 2)
    __ srli(tmp1, U_2, 2);
    __ cad(U_0, U_0, tmp1, tmp2);    // Add tmp1 to U_0 with carry output to tmp2
    __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits
    __ cad(U_1, U_1, tmp2, tmp2);    // Add carry to U_1 with carry output to tmp2
    __ add(U_2, U_2, tmp2);

    // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
    __ slli(tmp1, tmp1, 2);
    __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
    __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
    __ add(U_2, U_2, tmp2);
  }

  // Poly1305, RFC 7539
  // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)

  // Arguments:
  //    c_rarg0:   input_start -- where the input is stored
  //    c_rarg1:   length
  //    c_rarg2:   acc_start -- where the output will be stored
  //    c_rarg3:   r_start -- where the randomly generated 128-bit key is stored

  // See https://loup-vaillant.fr/tutorials/poly1305-design for a
  // description of the tricks used to simplify and accelerate this
  // computation.
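
  // In outline (see the link above for the full derivation): all arithmetic
  // is mod p = 2^130 - 5, so 2^130 == 5 (mod p) and any term that spills
  // past bit 130 can be folded back by multiplying by 5. With the block sum
  // S = S_2:S_1:S_0 and the key R = R_1:R_0, the product decomposes mod p as
  //   U_0 = S_0 * R_0 + S_1 * RR_1 + S_2 * RR_0
  //   U_1 = S_0 * R_1 + S_1 * R_0  + S_2 * RR_1  (at the 2^64 position)
  //   U_2 = S_2 * (R_0 & 3)                      (at the 2^128 position)
  // where RR_n == (R_n >> 2) * 5 stands in for R_n * 2^128 (mod p), with
  // the low two bits of R_0 carried separately in U_2. This matches the
  // wide_mul/wide_madd sequence in the loop below.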

  address generate_poly1305_processBlocks() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
    address start = __ pc();
    __ enter();
    Label here;

    RegSet saved_regs = RegSet::range(x18, x21);
    RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin();
    __ push_reg(saved_regs, sp);

    // Arguments
    const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;

    // R_n is the 128-bit randomly-generated key, packed into two
    // registers. The caller passes this key to us as long[5], with
    // BITS_PER_LIMB = 26.
    const Register R_0 = *regs, R_1 = *++regs;
    poly1305_pack_26(R_0, R_1, r_start, t1, t2);

    // RR_n is (R_n >> 2) * 5
    const Register RR_0 = *++regs, RR_1 = *++regs;
    __ srli(t1, R_0, 2);
    __ shadd(RR_0, t1, t1, t2, 2);
    __ srli(t1, R_1, 2);
    __ shadd(RR_1, t1, t1, t2, 2);

    // U_n is the current checksum
    const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
    poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);

    static constexpr int BLOCK_LENGTH = 16;
    Label DONE, LOOP;

    __ mv(t1, BLOCK_LENGTH);
    __ blt(length, t1, DONE); {
      __ bind(LOOP);

      // S_n is to be the sum of U_n and the next block of data
      const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
      __ ld(S_0, Address(input_start, 0));
      __ ld(S_1, Address(input_start, wordSize));

      __ cad(S_0, S_0, U_0, t1);  // Add U_0 to S_0 with carry output to t1
      __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1
      __ add(S_2, U_2, t1);

      __ addi(S_2, S_2, 1);

      const Register U_0HI = *++regs, U_1HI = *++regs;

      // NB: this logic depends on some of the special properties of
      // Poly1305 keys. In particular, because we know that the top
      // four bits of R_0 and R_1 are zero, we can add together
      // partial products without any risk of needing to propagate a
      // carry out.
      __ wide_mul(U_0, U_0HI, S_0, R_0);
      __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
      __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);

      __ wide_mul(U_1, U_1HI, S_0, R_1);
      __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
      __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);

      __ andi(U_2, R_0, right_2_bits);
      __ mul(U_2, S_2, U_2);

      // Partial reduction mod 2**130 - 5
      __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
      __ adc(U_2, U_2, U_1HI, t1);
      // Sum is now in U_2:U_1:U_0.

      // U_2:U_1:U_0 += (U_2 >> 2) * 5
      poly1305_reduce(U_2, U_1, U_0, t1, t2);

      __ sub(length, length, BLOCK_LENGTH);
      __ addi(input_start, input_start, BLOCK_LENGTH);
      __ mv(t1, BLOCK_LENGTH);
      __ bge(length, t1, LOOP);
    }

    // Further reduce modulo 2^130 - 5
    poly1305_reduce(U_2, U_1, U_0, t1, t2);
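
    // Unpacking sketch (illustrative only): limb i of the 130-bit sum is
    //   limb_i = (U >> (26 * i)) & ((1 << 26) - 1)
    // Limb 2 straddles the U_0/U_1 register boundary: its low 12 bits are
    // U_0[63:52] and its high 14 bits are U_1[13:0], which is why the third
    // store below combines a 52-bit right shift of U_0 with the masked low
    // bits of U_1.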
    // Unpack the sum into five 26-bit limbs and write to memory.
    // The lowest 26 bits form the first limb
    __ slli(t1, U_0, 38); // Take the lowest 26 bits
    __ srli(t1, t1, 38);
    __ sd(t1, Address(acc_start)); // First 26-bit limb

    // Bits 27-52 of U_0 form the second limb
    __ slli(t1, U_0, 12); // Take bits 27-52
    __ srli(t1, t1, 38);
    __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb

    // Combine bits 53-64 of U_0 with bits 1-14 of U_1 in one register
    __ srli(t1, U_0, 52);
    __ slli(t2, U_1, 50);
    __ srli(t2, t2, 38);
    __ add(t1, t1, t2);
    __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb

    // Storing bits 15-40 of U_1
    __ slli(t1, U_1, 24); // 14 bits are already used up
    __ srli(t1, t1, 38);  // Clear all other bits from t1
    __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb

    // Storing bits 41-64 of U_1 and the first three bits of U_2 in one register
    __ srli(t1, U_1, 40);
    __ andi(t2, U_2, right_3_bits);
    __ slli(t2, t2, 24);
    __ add(t1, t1, t2);
    __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb

    __ bind(DONE);
    __ pop_reg(saved_regs, sp);
    __ leave(); // Required for proper stackwalking
    __ ret();

    return start;
  }

#endif // COMPILER2

  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0 - int crc
   *   c_rarg1 - byte* buf
   *   c_rarg2 - int length
   *
   * Output:
   *   c_rarg0 - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc    = c_rarg0; // crc
    const Register buf    = c_rarg1; // source java byte array address
    const Register len    = c_rarg2; // length
    const Register table0 = c_rarg3; // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;

    const Register tmp1 = c_rarg7;
    const Register tmp2 = t2;
    const Register tmp3 = x28; // t3
    const Register tmp4 = x29; // t4
    const Register tmp5 = x30; // t5
    const Register tmp6 = x31; // t6

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32(crc, buf, len, table0, table1, table2,
                    table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret();

    return start;
  }
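
  // For reference, the byte-at-a-time form that a table-driven CRC32
  // kernel accelerates looks like this (a rough sketch, illustrative only;
  // kernel_crc32 itself consumes several bytes per iteration, which is why
  // it takes four table addresses):
  //
  //   crc = ~crc;
  //   while (len--) {
  //     crc = table0[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
  //   }
  //   crc = ~crc;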
  // exception handler for upcall stubs
  address generate_upcall_stub_exception_handler() {
    StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
    address start = __ pc();

    // Native caller has no idea how to handle exceptions,
    // so we just crash here. Up to the callee to catch exceptions.
    __ verify_oop(x10); // an exception oop is returned in a0 (x10)
    __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception));
    __ should_not_reach_here();

    return start;
  }

#undef __

  // Initialization
  void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist in all platforms. Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by a megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    if (UseCRC32Intrinsics) {
      // set the table address before stub generation, which uses it
      StubRoutines::_crc_table_adr = (address)StubRoutines::riscv::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }
  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw             = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub     = generate_cont_preempt_stub();
  }

  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != nullptr) {
      StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
    }

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (!InlineSecondarySupersTest) {
        for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
          StubRoutines::_lookup_secondary_supers_table_stubs[slot]
            = generate_lookup_secondary_supers_table_stub(slot);
        }
      }
    }
#endif // COMPILER2

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();

    StubRoutines::riscv::set_completed();
  }

  void generate_compiler_stubs() {
#ifdef COMPILER2
    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }
"StubRoutines", "montgomerySquare"); 6248 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 6249 StubRoutines::_montgomerySquare = g.generate_square(); 6250 } 6251 6252 if (UsePoly1305Intrinsics) { 6253 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 6254 } 6255 6256 if (UseRVVForBigIntegerShiftIntrinsics) { 6257 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 6258 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 6259 } 6260 6261 if (UseSHA256Intrinsics) { 6262 Sha2Generator sha2(_masm, this); 6263 StubRoutines::_sha256_implCompress = sha2.generate_sha256_implCompress(false); 6264 StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(true); 6265 } 6266 6267 if (UseSHA512Intrinsics) { 6268 Sha2Generator sha2(_masm, this); 6269 StubRoutines::_sha512_implCompress = sha2.generate_sha512_implCompress(false); 6270 StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(true); 6271 } 6272 6273 if (UseMD5Intrinsics) { 6274 StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress"); 6275 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB"); 6276 } 6277 6278 if (UseChaCha20Intrinsics) { 6279 StubRoutines::_chacha20Block = generate_chacha20Block(); 6280 } 6281 6282 if (UseSHA1Intrinsics) { 6283 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 6284 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 6285 } 6286 6287 if (UseBASE64Intrinsics) { 6288 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 6289 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 6290 } 6291 6292 if (UseAdler32Intrinsics) { 6293 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 6294 } 6295 6296 generate_compare_long_strings(); 6297 6298 generate_string_indexof_stubs(); 6299 6300 #endif // COMPILER2 6301 } 6302 6303 public: 6304 StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) { 6305 switch(kind) { 6306 case Initial_stubs: 6307 generate_initial_stubs(); 6308 break; 6309 case Continuation_stubs: 6310 generate_continuation_stubs(); 6311 break; 6312 case Compiler_stubs: 6313 generate_compiler_stubs(); 6314 break; 6315 case Final_stubs: 6316 generate_final_stubs(); 6317 break; 6318 default: 6319 fatal("unexpected stubs kind: %d", kind); 6320 break; 6321 }; 6322 } 6323 }; // end class declaration 6324 6325 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) { 6326 StubGenerator g(code, kind); 6327 }