1 /* 2 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved. 4 * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved. 5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 6 * 7 * This code is free software; you can redistribute it and/or modify it 8 * under the terms of the GNU General Public License version 2 only, as 9 * published by the Free Software Foundation. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 * 25 */ 26 27 #include "precompiled.hpp" 28 #include "asm/macroAssembler.hpp" 29 #include "asm/macroAssembler.inline.hpp" 30 #include "compiler/oopMap.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "interpreter/interpreter.hpp" 34 #include "memory/universe.hpp" 35 #include "nativeInst_riscv.hpp" 36 #include "oops/instanceOop.hpp" 37 #include "oops/method.hpp" 38 #include "oops/objArrayKlass.hpp" 39 #include "oops/oop.inline.hpp" 40 #include "prims/methodHandles.hpp" 41 #include "prims/upcallLinker.hpp" 42 #include "runtime/continuation.hpp" 43 #include "runtime/continuationEntry.inline.hpp" 44 #include "runtime/frame.inline.hpp" 45 #include "runtime/handles.inline.hpp" 46 #include "runtime/javaThread.hpp" 47 #include "runtime/sharedRuntime.hpp" 48 #include "runtime/stubCodeGenerator.hpp" 49 #include "runtime/stubRoutines.hpp" 50 #include "utilities/align.hpp" 51 #include "utilities/powerOfTwo.hpp" 52 #ifdef COMPILER2 53 #include "opto/runtime.hpp" 54 #endif 55 56 // Declaration and definition of StubGenerator (no .hpp file). 
57 // For a more detailed description of the stub routine structure 58 // see the comment in stubRoutines.hpp 59 60 #undef __ 61 #define __ _masm-> 62 63 #ifdef PRODUCT 64 #define BLOCK_COMMENT(str) /* nothing */ 65 #else 66 #define BLOCK_COMMENT(str) __ block_comment(str) 67 #endif 68 69 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 70 71 // Stub Code definitions 72 73 class StubGenerator: public StubCodeGenerator { 74 private: 75 76 #ifdef PRODUCT 77 #define inc_counter_np(counter) ((void)0) 78 #else 79 void inc_counter_np_(uint& counter) { 80 __ incrementw(ExternalAddress((address)&counter)); 81 } 82 #define inc_counter_np(counter) \ 83 BLOCK_COMMENT("inc_counter " #counter); \ 84 inc_counter_np_(counter); 85 #endif 86 87 // Call stubs are used to call Java from C 88 // 89 // Arguments: 90 // c_rarg0: call wrapper address address 91 // c_rarg1: result address 92 // c_rarg2: result type BasicType 93 // c_rarg3: method Method* 94 // c_rarg4: (interpreter) entry point address 95 // c_rarg5: parameters intptr_t* 96 // c_rarg6: parameter size (in words) int 97 // c_rarg7: thread Thread* 98 // 99 // There is no return from the stub itself as any Java result 100 // is written to result 101 // 102 // we save x1 (ra) as the return PC at the base of the frame and 103 // link x8 (fp) below it as the frame pointer installing sp (x2) 104 // into fp. 105 // 106 // we save x10-x17, which accounts for all the c arguments. 107 // 108 // TODO: strictly do we need to save them all? they are treated as 109 // volatile by C so could we omit saving the ones we are going to 110 // place in global registers (thread? method?) or those we only use 111 // during setup of the Java call? 112 // 113 // we don't need to save x5 which C uses as an indirect result location 114 // return register. 115 // 116 // we don't need to save x6-x7 and x28-x31 which both C and Java treat as 117 // volatile 118 // 119 // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary 120 // registers and C expects to be callee-save 121 // 122 // so the stub frame looks like this when we enter Java code 123 // 124 // [ return_from_Java ] <--- sp 125 // [ argument word n ] 126 // ... 
127 // -35 [ argument word 1 ] 128 // -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call 129 // -33 [ saved f27 ] 130 // -32 [ saved f26 ] 131 // -31 [ saved f25 ] 132 // -30 [ saved f24 ] 133 // -29 [ saved f23 ] 134 // -28 [ saved f22 ] 135 // -27 [ saved f21 ] 136 // -26 [ saved f20 ] 137 // -25 [ saved f19 ] 138 // -24 [ saved f18 ] 139 // -23 [ saved f9 ] 140 // -22 [ saved f8 ] 141 // -21 [ saved x27 ] 142 // -20 [ saved x26 ] 143 // -19 [ saved x25 ] 144 // -18 [ saved x24 ] 145 // -17 [ saved x23 ] 146 // -16 [ saved x22 ] 147 // -15 [ saved x21 ] 148 // -14 [ saved x20 ] 149 // -13 [ saved x19 ] 150 // -12 [ saved x18 ] 151 // -11 [ saved x9 ] 152 // -10 [ call wrapper (x10) ] 153 // -9 [ result (x11) ] 154 // -8 [ result type (x12) ] 155 // -7 [ method (x13) ] 156 // -6 [ entry point (x14) ] 157 // -5 [ parameters (x15) ] 158 // -4 [ parameter size (x16) ] 159 // -3 [ thread (x17) ] 160 // -2 [ saved fp (x8) ] 161 // -1 [ saved ra (x1) ] 162 // 0 [ ] <--- fp == saved sp (x2) 163 164 // Call stub stack layout word offsets from fp 165 enum call_stub_layout { 166 sp_after_call_off = -34, 167 168 frm_off = sp_after_call_off, 169 f27_off = -33, 170 f26_off = -32, 171 f25_off = -31, 172 f24_off = -30, 173 f23_off = -29, 174 f22_off = -28, 175 f21_off = -27, 176 f20_off = -26, 177 f19_off = -25, 178 f18_off = -24, 179 f9_off = -23, 180 f8_off = -22, 181 182 x27_off = -21, 183 x26_off = -20, 184 x25_off = -19, 185 x24_off = -18, 186 x23_off = -17, 187 x22_off = -16, 188 x21_off = -15, 189 x20_off = -14, 190 x19_off = -13, 191 x18_off = -12, 192 x9_off = -11, 193 194 call_wrapper_off = -10, 195 result_off = -9, 196 result_type_off = -8, 197 method_off = -7, 198 entry_point_off = -6, 199 parameters_off = -5, 200 parameter_size_off = -4, 201 thread_off = -3, 202 fp_f = -2, 203 retaddr_off = -1, 204 }; 205 206 address generate_call_stub(address& return_address) { 207 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 208 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 209 "adjust this code"); 210 211 StubCodeMark mark(this, "StubRoutines", "call_stub"); 212 address start = __ pc(); 213 214 const Address sp_after_call (fp, sp_after_call_off * wordSize); 215 216 const Address frm_save (fp, frm_off * wordSize); 217 const Address call_wrapper (fp, call_wrapper_off * wordSize); 218 const Address result (fp, result_off * wordSize); 219 const Address result_type (fp, result_type_off * wordSize); 220 const Address method (fp, method_off * wordSize); 221 const Address entry_point (fp, entry_point_off * wordSize); 222 const Address parameters (fp, parameters_off * wordSize); 223 const Address parameter_size(fp, parameter_size_off * wordSize); 224 225 const Address thread (fp, thread_off * wordSize); 226 227 const Address f27_save (fp, f27_off * wordSize); 228 const Address f26_save (fp, f26_off * wordSize); 229 const Address f25_save (fp, f25_off * wordSize); 230 const Address f24_save (fp, f24_off * wordSize); 231 const Address f23_save (fp, f23_off * wordSize); 232 const Address f22_save (fp, f22_off * wordSize); 233 const Address f21_save (fp, f21_off * wordSize); 234 const Address f20_save (fp, f20_off * wordSize); 235 const Address f19_save (fp, f19_off * wordSize); 236 const Address f18_save (fp, f18_off * wordSize); 237 const Address f9_save (fp, f9_off * wordSize); 238 const Address f8_save (fp, f8_off * wordSize); 239 240 const Address x27_save (fp, x27_off * wordSize); 241 const Address x26_save (fp, x26_off 
* wordSize); 242 const Address x25_save (fp, x25_off * wordSize); 243 const Address x24_save (fp, x24_off * wordSize); 244 const Address x23_save (fp, x23_off * wordSize); 245 const Address x22_save (fp, x22_off * wordSize); 246 const Address x21_save (fp, x21_off * wordSize); 247 const Address x20_save (fp, x20_off * wordSize); 248 const Address x19_save (fp, x19_off * wordSize); 249 const Address x18_save (fp, x18_off * wordSize); 250 251 const Address x9_save (fp, x9_off * wordSize); 252 253 // stub code 254 255 address riscv_entry = __ pc(); 256 257 // set up frame and move sp to end of save area 258 __ enter(); 259 __ addi(sp, fp, sp_after_call_off * wordSize); 260 261 // save register parameters and Java temporary/global registers 262 // n.b. we save thread even though it gets installed in 263 // xthread because we want to sanity check tp later 264 __ sd(c_rarg7, thread); 265 __ sw(c_rarg6, parameter_size); 266 __ sd(c_rarg5, parameters); 267 __ sd(c_rarg4, entry_point); 268 __ sd(c_rarg3, method); 269 __ sd(c_rarg2, result_type); 270 __ sd(c_rarg1, result); 271 __ sd(c_rarg0, call_wrapper); 272 273 __ sd(x9, x9_save); 274 275 __ sd(x18, x18_save); 276 __ sd(x19, x19_save); 277 __ sd(x20, x20_save); 278 __ sd(x21, x21_save); 279 __ sd(x22, x22_save); 280 __ sd(x23, x23_save); 281 __ sd(x24, x24_save); 282 __ sd(x25, x25_save); 283 __ sd(x26, x26_save); 284 __ sd(x27, x27_save); 285 286 __ fsd(f8, f8_save); 287 __ fsd(f9, f9_save); 288 __ fsd(f18, f18_save); 289 __ fsd(f19, f19_save); 290 __ fsd(f20, f20_save); 291 __ fsd(f21, f21_save); 292 __ fsd(f22, f22_save); 293 __ fsd(f23, f23_save); 294 __ fsd(f24, f24_save); 295 __ fsd(f25, f25_save); 296 __ fsd(f26, f26_save); 297 __ fsd(f27, f27_save); 298 299 __ frrm(t0); 300 __ sd(t0, frm_save); 301 // Set frm to the state we need. We do want Round to Nearest. We 302 // don't want non-IEEE rounding modes. 
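// Note: Java arithmetic requires round-to-nearest-even. frm encoding 0 is RNE (guaranteed below),
// so the fsrmi write is skipped whenever the saved rounding mode is already 0, keeping the common
// path free of a redundant CSR update.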
303 Label skip_fsrmi; 304 guarantee(__ RoundingMode::rne == 0, "must be"); 305 __ beqz(t0, skip_fsrmi); 306 __ fsrmi(__ RoundingMode::rne); 307 __ bind(skip_fsrmi); 308
309 // install Java thread in global register now we have saved 310 // whatever value it held 311 __ mv(xthread, c_rarg7); 312
313 // And method 314 __ mv(xmethod, c_rarg3); 315
316 // set up the heapbase register 317 __ reinit_heapbase(); 318
319 #ifdef ASSERT 320 // make sure we have no pending exceptions 321 { 322 Label L; 323 __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset()))); 324 __ beqz(t0, L); 325 __ stop("StubRoutines::call_stub: entered with pending exception"); 326 __ BIND(L); 327 } 328 #endif
329 // pass parameters if any 330 __ mv(esp, sp); 331 __ slli(t0, c_rarg6, LogBytesPerWord); 332 __ sub(t0, sp, t0); // Move SP out of the way 333 __ andi(sp, t0, -2 * wordSize); 334
335 BLOCK_COMMENT("pass parameters if any"); 336 Label parameters_done; 337 // parameter count is still in c_rarg6 338 // and parameter pointer identifying param 1 is in c_rarg5 339 __ beqz(c_rarg6, parameters_done); 340
341 address loop = __ pc(); 342 __ ld(t0, Address(c_rarg5, 0)); 343 __ addi(c_rarg5, c_rarg5, wordSize); 344 __ addi(c_rarg6, c_rarg6, -1); 345 __ push_reg(t0); 346 __ bgtz(c_rarg6, loop); 347
348 __ BIND(parameters_done); 349
350 // call Java entry -- passing Method* and current sp 351 // xmethod: Method* 352 // x19_sender_sp: sender sp 353 BLOCK_COMMENT("call Java function"); 354 __ mv(x19_sender_sp, sp); 355 __ jalr(c_rarg4); 356
357 // save current address for use by exception handling code 358
359 return_address = __ pc(); 360
361 // store result depending on type (everything that is not 362 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 363 // n.b.
this assumes Java returns an integral result in x10 364 // and a floating result in j_farg0 365 __ ld(j_rarg2, result); 366 Label is_long, is_float, is_double, exit; 367 __ ld(j_rarg1, result_type); 368 __ mv(t0, (u1)T_OBJECT); 369 __ beq(j_rarg1, t0, is_long); 370 __ mv(t0, (u1)T_LONG); 371 __ beq(j_rarg1, t0, is_long); 372 __ mv(t0, (u1)T_FLOAT); 373 __ beq(j_rarg1, t0, is_float); 374 __ mv(t0, (u1)T_DOUBLE); 375 __ beq(j_rarg1, t0, is_double); 376 377 // handle T_INT case 378 __ sw(x10, Address(j_rarg2)); 379 380 __ BIND(exit); 381 382 // pop parameters 383 __ addi(esp, fp, sp_after_call_off * wordSize); 384 385 #ifdef ASSERT 386 // verify that threads correspond 387 { 388 Label L, S; 389 __ ld(t0, thread); 390 __ bne(xthread, t0, S); 391 __ get_thread(t0); 392 __ beq(xthread, t0, L); 393 __ BIND(S); 394 __ stop("StubRoutines::call_stub: threads must correspond"); 395 __ BIND(L); 396 } 397 #endif 398 399 __ pop_cont_fastpath(xthread); 400 401 // restore callee-save registers 402 __ fld(f27, f27_save); 403 __ fld(f26, f26_save); 404 __ fld(f25, f25_save); 405 __ fld(f24, f24_save); 406 __ fld(f23, f23_save); 407 __ fld(f22, f22_save); 408 __ fld(f21, f21_save); 409 __ fld(f20, f20_save); 410 __ fld(f19, f19_save); 411 __ fld(f18, f18_save); 412 __ fld(f9, f9_save); 413 __ fld(f8, f8_save); 414 415 __ ld(x27, x27_save); 416 __ ld(x26, x26_save); 417 __ ld(x25, x25_save); 418 __ ld(x24, x24_save); 419 __ ld(x23, x23_save); 420 __ ld(x22, x22_save); 421 __ ld(x21, x21_save); 422 __ ld(x20, x20_save); 423 __ ld(x19, x19_save); 424 __ ld(x18, x18_save); 425 426 __ ld(x9, x9_save); 427 428 // restore frm 429 Label skip_fsrm; 430 __ ld(t0, frm_save); 431 __ frrm(t1); 432 __ beq(t0, t1, skip_fsrm); 433 __ fsrm(t0); 434 __ bind(skip_fsrm); 435 436 __ ld(c_rarg0, call_wrapper); 437 __ ld(c_rarg1, result); 438 __ ld(c_rarg2, result_type); 439 __ ld(c_rarg3, method); 440 __ ld(c_rarg4, entry_point); 441 __ ld(c_rarg5, parameters); 442 __ ld(c_rarg6, parameter_size); 443 __ ld(c_rarg7, thread); 444 445 // leave frame and return to caller 446 __ leave(); 447 __ ret(); 448 449 // handle return types different from T_INT 450 451 __ BIND(is_long); 452 __ sd(x10, Address(j_rarg2, 0)); 453 __ j(exit); 454 455 __ BIND(is_float); 456 __ fsw(j_farg0, Address(j_rarg2, 0), t0); 457 __ j(exit); 458 459 __ BIND(is_double); 460 __ fsd(j_farg0, Address(j_rarg2, 0), t0); 461 __ j(exit); 462 463 return start; 464 } 465 466 // Return point for a Java call if there's an exception thrown in 467 // Java code. The exception is caught and transformed into a 468 // pending exception stored in JavaThread that can be tested from 469 // within the VM. 470 // 471 // Note: Usually the parameters are removed by the callee. In case 472 // of an exception crossing an activation frame boundary, that is 473 // not the case if the callee is compiled code => need to setup the 474 // sp. 
475 // 476 // x10: exception oop 477 478 address generate_catch_exception() { 479 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 480 address start = __ pc(); 481 482 // same as in generate_call_stub(): 483 const Address thread(fp, thread_off * wordSize); 484 485 #ifdef ASSERT 486 // verify that threads correspond 487 { 488 Label L, S; 489 __ ld(t0, thread); 490 __ bne(xthread, t0, S); 491 __ get_thread(t0); 492 __ beq(xthread, t0, L); 493 __ bind(S); 494 __ stop("StubRoutines::catch_exception: threads must correspond"); 495 __ bind(L); 496 } 497 #endif 498 499 // set pending exception 500 __ verify_oop(x10); 501 502 __ sd(x10, Address(xthread, Thread::pending_exception_offset())); 503 __ mv(t0, (address)__FILE__); 504 __ sd(t0, Address(xthread, Thread::exception_file_offset())); 505 __ mv(t0, (int)__LINE__); 506 __ sw(t0, Address(xthread, Thread::exception_line_offset())); 507 508 // complete return to VM 509 assert(StubRoutines::_call_stub_return_address != nullptr, 510 "_call_stub_return_address must have been generated before"); 511 __ j(RuntimeAddress(StubRoutines::_call_stub_return_address)); 512 513 return start; 514 } 515 516 // Continuation point for runtime calls returning with a pending 517 // exception. The pending exception check happened in the runtime 518 // or native call stub. The pending exception in Thread is 519 // converted into a Java-level exception. 520 // 521 // Contract with Java-level exception handlers: 522 // x10: exception 523 // x13: throwing pc 524 // 525 // NOTE: At entry of this stub, exception-pc must be in RA !! 526 527 // NOTE: this is always used as a jump target within generated code 528 // so it just needs to be generated code with no x86 prolog 529 530 address generate_forward_exception() { 531 StubCodeMark mark(this, "StubRoutines", "forward exception"); 532 address start = __ pc(); 533 534 // Upon entry, RA points to the return address returning into 535 // Java (interpreted or compiled) code; i.e., the return address 536 // becomes the throwing pc. 537 // 538 // Arguments pushed before the runtime call are still on the stack 539 // but the exception handler will reset the stack pointer -> 540 // ignore them. A potential result in registers can be ignored as 541 // well. 542 543 #ifdef ASSERT 544 // make sure this code is only executed if there is a pending exception 545 { 546 Label L; 547 __ ld(t0, Address(xthread, Thread::pending_exception_offset())); 548 __ bnez(t0, L); 549 __ stop("StubRoutines::forward exception: no pending exception (1)"); 550 __ bind(L); 551 } 552 #endif 553 554 // compute exception handler into x9 555 556 // call the VM to find the handler address associated with the 557 // caller address. pass thread in x10 and caller pc (ret address) 558 // in x11. n.b. the caller pc is in ra, unlike x86 where it is on 559 // the stack. 560 __ mv(c_rarg1, ra); 561 // ra will be trashed by the VM call so we move it to x9 562 // (callee-saved) because we also need to pass it to the handler 563 // returned by this call. 564 __ mv(x9, ra); 565 BLOCK_COMMENT("call exception_handler_for_return_address"); 566 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 567 SharedRuntime::exception_handler_for_return_address), 568 xthread, c_rarg1); 569 // we should not really care that ra is no longer the callee 570 // address. we saved the value the handler needs in x9 so we can 571 // just copy it to x13. 
however, the C2 handler will push its own 572 // frame and then calls into the VM and the VM code asserts that 573 // the PC for the frame above the handler belongs to a compiled 574 // Java method. So, we restore ra here to satisfy that assert. 575 __ mv(ra, x9); 576 // setup x10 & x13 & clear pending exception 577 __ mv(x13, x9); 578 __ mv(x9, x10); 579 __ ld(x10, Address(xthread, Thread::pending_exception_offset())); 580 __ sd(zr, Address(xthread, Thread::pending_exception_offset())); 581 582 #ifdef ASSERT 583 // make sure exception is set 584 { 585 Label L; 586 __ bnez(x10, L); 587 __ stop("StubRoutines::forward exception: no pending exception (2)"); 588 __ bind(L); 589 } 590 #endif 591 592 // continue at exception handler 593 // x10: exception 594 // x13: throwing pc 595 // x9: exception handler 596 __ verify_oop(x10); 597 __ jr(x9); 598 599 return start; 600 } 601 602 // Non-destructive plausibility checks for oops 603 // 604 // Arguments: 605 // x10: oop to verify 606 // t0: error message 607 // 608 // Stack after saving c_rarg3: 609 // [tos + 0]: saved c_rarg3 610 // [tos + 1]: saved c_rarg2 611 // [tos + 2]: saved ra 612 // [tos + 3]: saved t1 613 // [tos + 4]: saved x10 614 // [tos + 5]: saved t0 615 address generate_verify_oop() { 616 617 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 618 address start = __ pc(); 619 620 Label exit, error; 621 622 __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3 623 624 __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 625 __ ld(c_rarg3, Address(c_rarg2)); 626 __ add(c_rarg3, c_rarg3, 1); 627 __ sd(c_rarg3, Address(c_rarg2)); 628 629 // object is in x10 630 // make sure object is 'reasonable' 631 __ beqz(x10, exit); // if obj is null it is OK 632 633 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 634 bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error); 635 636 // return if everything seems ok 637 __ bind(exit); 638 639 __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3 640 __ ret(); 641 642 // handle errors 643 __ bind(error); 644 __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3 645 646 __ push_reg(RegSet::range(x0, x31), sp); 647 // debug(char* msg, int64_t pc, int64_t regs[]) 648 __ mv(c_rarg0, t0); // pass address of error message 649 __ mv(c_rarg1, ra); // pass return address 650 __ mv(c_rarg2, sp); // pass address of regs on stack 651 #ifndef PRODUCT 652 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 653 #endif 654 BLOCK_COMMENT("call MacroAssembler::debug"); 655 __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 656 __ ebreak(); 657 658 return start; 659 } 660 661 // The inner part of zero_words(). 662 // 663 // Inputs: 664 // x28: the HeapWord-aligned base address of an array to zero. 665 // x29: the count in HeapWords, x29 > 0. 666 // 667 // Returns x28 and x29, adjusted for the caller to clear. 668 // x28: the base address of the tail of words left to clear. 669 // x29: the number of words in the tail. 670 // x29 < MacroAssembler::zero_words_block_size. 671 672 address generate_zero_blocks() { 673 Label done; 674 675 const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31; 676 677 __ align(CodeEntryAlignment); 678 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 679 address start = __ pc(); 680 681 if (UseBlockZeroing) { 682 // Ensure count >= 2*CacheLineSize so that it still deserves a cbo.zero 683 // after alignment. 
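// low_limit is a word count: the larger of 2 * CacheLineSize and BlockZeroingLowLimit (both byte
// values) scaled down by wordSize, so it can be compared directly against cnt, which counts HeapWords.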
684 Label small; 685 int low_limit = MAX2(2 * CacheLineSize, BlockZeroingLowLimit) / wordSize; 686 __ mv(tmp1, low_limit); 687 __ blt(cnt, tmp1, small); 688 __ zero_dcache_blocks(base, cnt, tmp1, tmp2); 689 __ bind(small); 690 } 691
692 { 693 // Clear the remaining blocks. 694 Label loop; 695 __ mv(tmp1, MacroAssembler::zero_words_block_size); 696 __ blt(cnt, tmp1, done); 697 __ bind(loop); 698 for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) { 699 __ sd(zr, Address(base, i * wordSize)); 700 } 701 __ add(base, base, MacroAssembler::zero_words_block_size * wordSize); 702 __ sub(cnt, cnt, MacroAssembler::zero_words_block_size); 703 __ bge(cnt, tmp1, loop); 704 __ bind(done); 705 } 706
707 __ ret(); 708
709 return start; 710 } 711
712 typedef enum { 713 copy_forwards = 1, 714 copy_backwards = -1 715 } copy_direction; 716
717 // Bulk copy of blocks of 8 words. 718 // 719 // count is a count of words. 720 // 721 // Precondition: count >= 8 722 // 723 // Postconditions: 724 // 725 // The least significant bit of count contains the remaining count 726 // of words to copy. The rest of count is trash. 727 // 728 // s and d are adjusted to point to the remaining words to copy 729 //
730 void generate_copy_longs(Label &start, Register s, Register d, Register count, 731 copy_direction direction) { 732 int unit = wordSize * direction; 733 int bias = wordSize; 734
735 const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16, 736 tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29; 737
738 const Register stride = x30; 739
740 assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3, 741 tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7); 742 assert_different_registers(s, d, count, t0); 743
744 Label again, drain; 745 const char* stub_name = nullptr; 746 if (direction == copy_forwards) { 747 stub_name = "forward_copy_longs"; 748 } else { 749 stub_name = "backward_copy_longs"; 750 } 751 StubCodeMark mark(this, "StubRoutines", stub_name); 752 __ align(CodeEntryAlignment); 753 __ bind(start); 754
755 if (direction == copy_forwards) { 756 __ sub(s, s, bias); 757 __ sub(d, d, bias); 758 } 759
760 #ifdef ASSERT 761 // Make sure we are never given < 8 words 762 { 763 Label L; 764
765 __ mv(t0, 8); 766 __ bge(count, t0, L); 767 __ stop("generate_copy_longs called with < 8 words"); 768 __ bind(L); 769 } 770 #endif 771
772 __ ld(tmp_reg0, Address(s, 1 * unit)); 773 __ ld(tmp_reg1, Address(s, 2 * unit)); 774 __ ld(tmp_reg2, Address(s, 3 * unit)); 775 __ ld(tmp_reg3, Address(s, 4 * unit)); 776 __ ld(tmp_reg4, Address(s, 5 * unit)); 777 __ ld(tmp_reg5, Address(s, 6 * unit)); 778 __ ld(tmp_reg6, Address(s, 7 * unit)); 779 __ ld(tmp_reg7, Address(s, 8 * unit)); 780 __ addi(s, s, 8 * unit); 781
782 __ sub(count, count, 16); 783 __ bltz(count, drain); 784
785 __ bind(again); 786
787 __ sd(tmp_reg0, Address(d, 1 * unit)); 788 __ sd(tmp_reg1, Address(d, 2 * unit)); 789 __ sd(tmp_reg2, Address(d, 3 * unit)); 790 __ sd(tmp_reg3, Address(d, 4 * unit)); 791 __ sd(tmp_reg4, Address(d, 5 * unit)); 792 __ sd(tmp_reg5, Address(d, 6 * unit)); 793 __ sd(tmp_reg6, Address(d, 7 * unit)); 794 __ sd(tmp_reg7, Address(d, 8 * unit)); 795
796 __ ld(tmp_reg0, Address(s, 1 * unit)); 797 __ ld(tmp_reg1, Address(s, 2 * unit)); 798 __ ld(tmp_reg2, Address(s, 3 * unit)); 799 __ ld(tmp_reg3, Address(s, 4 * unit)); 800 __ ld(tmp_reg4, Address(s, 5 * unit)); 801 __ ld(tmp_reg5, Address(s, 6 * unit)); 802 __ ld(tmp_reg6, Address(s, 7 * unit)); 803 __ ld(tmp_reg7, Address(s, 8 * unit)); 804
805 __ addi(s,
s, 8 * unit); 806 __ addi(d, d, 8 * unit); 807 808 __ sub(count, count, 8); 809 __ bgez(count, again); 810 811 // Drain 812 __ bind(drain); 813 814 __ sd(tmp_reg0, Address(d, 1 * unit)); 815 __ sd(tmp_reg1, Address(d, 2 * unit)); 816 __ sd(tmp_reg2, Address(d, 3 * unit)); 817 __ sd(tmp_reg3, Address(d, 4 * unit)); 818 __ sd(tmp_reg4, Address(d, 5 * unit)); 819 __ sd(tmp_reg5, Address(d, 6 * unit)); 820 __ sd(tmp_reg6, Address(d, 7 * unit)); 821 __ sd(tmp_reg7, Address(d, 8 * unit)); 822 __ addi(d, d, 8 * unit); 823 824 { 825 Label L1, L2; 826 __ test_bit(t0, count, 2); 827 __ beqz(t0, L1); 828 829 __ ld(tmp_reg0, Address(s, 1 * unit)); 830 __ ld(tmp_reg1, Address(s, 2 * unit)); 831 __ ld(tmp_reg2, Address(s, 3 * unit)); 832 __ ld(tmp_reg3, Address(s, 4 * unit)); 833 __ addi(s, s, 4 * unit); 834 835 __ sd(tmp_reg0, Address(d, 1 * unit)); 836 __ sd(tmp_reg1, Address(d, 2 * unit)); 837 __ sd(tmp_reg2, Address(d, 3 * unit)); 838 __ sd(tmp_reg3, Address(d, 4 * unit)); 839 __ addi(d, d, 4 * unit); 840 841 __ bind(L1); 842 843 if (direction == copy_forwards) { 844 __ addi(s, s, bias); 845 __ addi(d, d, bias); 846 } 847 848 __ test_bit(t0, count, 1); 849 __ beqz(t0, L2); 850 if (direction == copy_backwards) { 851 __ addi(s, s, 2 * unit); 852 __ ld(tmp_reg0, Address(s)); 853 __ ld(tmp_reg1, Address(s, wordSize)); 854 __ addi(d, d, 2 * unit); 855 __ sd(tmp_reg0, Address(d)); 856 __ sd(tmp_reg1, Address(d, wordSize)); 857 } else { 858 __ ld(tmp_reg0, Address(s)); 859 __ ld(tmp_reg1, Address(s, wordSize)); 860 __ addi(s, s, 2 * unit); 861 __ sd(tmp_reg0, Address(d)); 862 __ sd(tmp_reg1, Address(d, wordSize)); 863 __ addi(d, d, 2 * unit); 864 } 865 __ bind(L2); 866 } 867 868 __ ret(); 869 } 870 871 Label copy_f, copy_b; 872 873 typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp); 874 875 void copy_memory_v(Register s, Register d, Register count, int step) { 876 bool is_backward = step < 0; 877 int granularity = uabs(step); 878 879 const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17; 880 assert_different_registers(s, d, cnt, vl, tmp1, tmp2); 881 Assembler::SEW sew = Assembler::elembytes_to_sew(granularity); 882 Label loop_forward, loop_backward, done; 883 884 __ mv(dst, d); 885 __ mv(src, s); 886 __ mv(cnt, count); 887 888 __ bind(loop_forward); 889 __ vsetvli(vl, cnt, sew, Assembler::m8); 890 if (is_backward) { 891 __ bne(vl, cnt, loop_backward); 892 } 893 894 __ vlex_v(v0, src, sew); 895 __ sub(cnt, cnt, vl); 896 if (sew != Assembler::e8) { 897 // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary 898 __ slli(vl, vl, sew); 899 } 900 __ add(src, src, vl); 901 902 __ vsex_v(v0, dst, sew); 903 __ add(dst, dst, vl); 904 __ bnez(cnt, loop_forward); 905 906 if (is_backward) { 907 __ j(done); 908 909 __ bind(loop_backward); 910 __ sub(t0, cnt, vl); 911 if (sew != Assembler::e8) { 912 // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary 913 __ slli(t0, t0, sew); 914 } 915 __ add(tmp1, s, t0); 916 __ vlex_v(v0, tmp1, sew); 917 __ add(tmp2, d, t0); 918 __ vsex_v(v0, tmp2, sew); 919 __ sub(cnt, cnt, vl); 920 __ bnez(cnt, loop_forward); 921 __ bind(done); 922 } 923 } 924 925 // All-singing all-dancing memory copy. 926 // 927 // Copy count units of memory from s to d. The size of a unit is 928 // step, which can be positive or negative depending on the direction 929 // of copy. 
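// Overall structure of copy_memory (defined below): when RVV is enabled (and, for oop arrays, the
// barrier set supports vector arraycopy) the work is handed to copy_memory_v; otherwise the scalar
// path aligns src/dst where possible and copies in 32-byte blocks, then 8-byte words, then
// element-sized tail steps, routing every access through the BarrierSetAssembler
// copy_load_at/copy_store_at helpers.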
930 // 931 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 932 Register s, Register d, Register count, int step) { 933 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 934 if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) { 935 return copy_memory_v(s, d, count, step); 936 } 937 938 bool is_backwards = step < 0; 939 int granularity = uabs(step); 940 941 const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13; 942 const Register gct1 = x28, gct2 = x29, gct3 = t2; 943 944 Label same_aligned; 945 Label copy_big, copy32_loop, copy8_loop, copy_small, done; 946 947 // The size of copy32_loop body increases significantly with ZGC GC barriers. 948 // Need conditional far branches to reach a point beyond the loop in this case. 949 bool is_far = UseZGC; 950 951 __ beqz(count, done, is_far); 952 __ slli(cnt, count, exact_log2(granularity)); 953 if (is_backwards) { 954 __ add(src, s, cnt); 955 __ add(dst, d, cnt); 956 } else { 957 __ mv(src, s); 958 __ mv(dst, d); 959 } 960 961 if (is_aligned) { 962 __ addi(t0, cnt, -32); 963 __ bgez(t0, copy32_loop); 964 __ addi(t0, cnt, -8); 965 __ bgez(t0, copy8_loop, is_far); 966 __ j(copy_small); 967 } else { 968 __ mv(t0, 16); 969 __ blt(cnt, t0, copy_small, is_far); 970 971 __ xorr(t0, src, dst); 972 __ andi(t0, t0, 0b111); 973 __ bnez(t0, copy_small, is_far); 974 975 __ bind(same_aligned); 976 __ andi(t0, src, 0b111); 977 __ beqz(t0, copy_big); 978 if (is_backwards) { 979 __ addi(src, src, step); 980 __ addi(dst, dst, step); 981 } 982 bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1); 983 bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3); 984 if (!is_backwards) { 985 __ addi(src, src, step); 986 __ addi(dst, dst, step); 987 } 988 __ addi(cnt, cnt, -granularity); 989 __ beqz(cnt, done, is_far); 990 __ j(same_aligned); 991 992 __ bind(copy_big); 993 __ mv(t0, 32); 994 __ blt(cnt, t0, copy8_loop, is_far); 995 } 996 997 __ bind(copy32_loop); 998 if (is_backwards) { 999 __ addi(src, src, -wordSize * 4); 1000 __ addi(dst, dst, -wordSize * 4); 1001 } 1002 // we first load 32 bytes, then write it, so the direction here doesn't matter 1003 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1); 1004 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8), gct1); 1005 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1); 1006 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1); 1007 1008 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3); 1009 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8), tmp4, gct1, gct2, gct3); 1010 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3); 1011 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3); 1012 1013 if (!is_backwards) { 1014 __ addi(src, src, wordSize * 4); 1015 __ addi(dst, dst, wordSize * 4); 1016 } 1017 __ addi(t0, cnt, -(32 + wordSize * 4)); 1018 __ addi(cnt, cnt, -wordSize * 4); 1019 __ bgez(t0, copy32_loop); // cnt >= 32, do next loop 1020 1021 __ beqz(cnt, done); // if that's all - done 1022 1023 __ addi(t0, cnt, -8); // if not - copy the reminder 1024 __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop 1025 1026 __ bind(copy8_loop); 1027 if (is_backwards) { 1028 
__ addi(src, src, -wordSize); 1029 __ addi(dst, dst, -wordSize); 1030 } 1031 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1); 1032 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3); 1033 1034 if (!is_backwards) { 1035 __ addi(src, src, wordSize); 1036 __ addi(dst, dst, wordSize); 1037 } 1038 __ addi(t0, cnt, -(8 + wordSize)); 1039 __ addi(cnt, cnt, -wordSize); 1040 __ bgez(t0, copy8_loop); // cnt >= 8, do next loop 1041 1042 __ beqz(cnt, done); // if that's all - done 1043 1044 __ bind(copy_small); 1045 if (is_backwards) { 1046 __ addi(src, src, step); 1047 __ addi(dst, dst, step); 1048 } 1049 1050 bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1); 1051 bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3); 1052 1053 if (!is_backwards) { 1054 __ addi(src, src, step); 1055 __ addi(dst, dst, step); 1056 } 1057 __ addi(cnt, cnt, -granularity); 1058 __ bgtz(cnt, copy_small); 1059 1060 __ bind(done); 1061 } 1062 1063 // Scan over array at a for count oops, verifying each one. 1064 // Preserves a and count, clobbers t0 and t1. 1065 void verify_oop_array(size_t size, Register a, Register count, Register temp) { 1066 Label loop, end; 1067 __ mv(t1, zr); 1068 __ slli(t0, count, exact_log2(size)); 1069 __ bind(loop); 1070 __ bgeu(t1, t0, end); 1071 1072 __ add(temp, a, t1); 1073 if (size == (size_t)wordSize) { 1074 __ ld(temp, Address(temp, 0)); 1075 __ verify_oop(temp); 1076 } else { 1077 __ lwu(temp, Address(temp, 0)); 1078 __ decode_heap_oop(temp); // calls verify_oop 1079 } 1080 __ add(t1, t1, size); 1081 __ j(loop); 1082 __ bind(end); 1083 } 1084 1085 // Arguments: 1086 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1087 // ignored 1088 // is_oop - true => oop array, so generate store check code 1089 // name - stub name string 1090 // 1091 // Inputs: 1092 // c_rarg0 - source array address 1093 // c_rarg1 - destination array address 1094 // c_rarg2 - element count, treated as ssize_t, can be zero 1095 // 1096 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1097 // the hardware handle it. The two dwords within qwords that span 1098 // cache line boundaries will still be loaded and stored atomically. 1099 // 1100 // Side Effects: 1101 // disjoint_int_copy_entry is set to the no-overlap entry point 1102 // used by generate_conjoint_int_oop_copy(). 
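// Note: the generated stub returns 0 in x10. When `entry` is non-null it receives a secondary
// entry point just past the frame setup; conjoint stubs branch there as their no-overlap target
// with the argument registers already populated.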
1103 // 1104 address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address* entry, 1105 const char* name, bool dest_uninitialized = false) { 1106 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1107 RegSet saved_reg = RegSet::of(s, d, count); 1108 __ align(CodeEntryAlignment); 1109 StubCodeMark mark(this, "StubRoutines", name); 1110 address start = __ pc(); 1111 __ enter(); 1112 1113 if (entry != nullptr) { 1114 *entry = __ pc(); 1115 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1116 BLOCK_COMMENT("Entry:"); 1117 } 1118 1119 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1120 if (dest_uninitialized) { 1121 decorators |= IS_DEST_UNINITIALIZED; 1122 } 1123 if (aligned) { 1124 decorators |= ARRAYCOPY_ALIGNED; 1125 } 1126 1127 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1128 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1129 1130 if (is_oop) { 1131 // save regs before copy_memory 1132 __ push_reg(RegSet::of(d, count), sp); 1133 } 1134 1135 { 1136 // UnsafeMemoryAccess page error: continue after unsafe access 1137 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1138 UnsafeMemoryAccessMark umam(this, add_entry, true); 1139 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1140 } 1141 1142 if (is_oop) { 1143 __ pop_reg(RegSet::of(d, count), sp); 1144 if (VerifyOops) { 1145 verify_oop_array(size, d, count, t2); 1146 } 1147 } 1148 1149 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet()); 1150 1151 __ leave(); 1152 __ mv(x10, zr); // return 0 1153 __ ret(); 1154 return start; 1155 } 1156 1157 // Arguments: 1158 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1159 // ignored 1160 // is_oop - true => oop array, so generate store check code 1161 // name - stub name string 1162 // 1163 // Inputs: 1164 // c_rarg0 - source array address 1165 // c_rarg1 - destination array address 1166 // c_rarg2 - element count, treated as ssize_t, can be zero 1167 // 1168 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1169 // the hardware handle it. The two dwords within qwords that span 1170 // cache line boundaries will still be loaded and stored atomically. 
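// Overlap handling in generate_conjoint_copy below: (d - s) is compared, unsigned, against the
// byte count (count << log2(size)); if the destination starts at or beyond the end of the source
// region the stub tail-jumps to the disjoint (forward) entry, otherwise it falls through and
// copies backwards.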
1171 // 1172 address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target, 1173 address* entry, const char* name, 1174 bool dest_uninitialized = false) { 1175 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1176 RegSet saved_regs = RegSet::of(s, d, count); 1177 StubCodeMark mark(this, "StubRoutines", name); 1178 address start = __ pc(); 1179 __ enter(); 1180
1181 if (entry != nullptr) { 1182 *entry = __ pc(); 1183 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1184 BLOCK_COMMENT("Entry:"); 1185 } 1186
1187 // use fwd copy when (d-s) above_equal (count*size) 1188 __ sub(t0, d, s); 1189 __ slli(t1, count, exact_log2(size)); 1190 Label L_continue; 1191 __ bltu(t0, t1, L_continue); 1192 __ j(nooverlap_target); 1193 __ bind(L_continue); 1194
1195 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1196 if (dest_uninitialized) { 1197 decorators |= IS_DEST_UNINITIALIZED; 1198 } 1199 if (aligned) { 1200 decorators |= ARRAYCOPY_ALIGNED; 1201 } 1202
1203 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1204 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1205
1206 if (is_oop) { 1207 // save regs before copy_memory 1208 __ push_reg(RegSet::of(d, count), sp); 1209 } 1210
1211 { 1212 // UnsafeMemoryAccess page error: continue after unsafe access 1213 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1214 UnsafeMemoryAccessMark umam(this, add_entry, true); 1215 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1216 } 1217
1218 if (is_oop) { 1219 __ pop_reg(RegSet::of(d, count), sp); 1220 if (VerifyOops) { 1221 verify_oop_array(size, d, count, t2); 1222 } 1223 } 1224 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet()); 1225 __ leave(); 1226 __ mv(x10, zr); // return 0 1227 __ ret(); 1228 return start; 1229 } 1230
1231 // Arguments: 1232 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1233 // ignored 1234 // name - stub name string 1235 // 1236 // Inputs: 1237 // c_rarg0 - source array address 1238 // c_rarg1 - destination array address 1239 // c_rarg2 - element count, treated as ssize_t, can be zero 1240 // 1241 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1242 // we let the hardware handle it. The one to eight bytes within words, 1243 // dwords or qwords that span cache line boundaries will still be loaded 1244 // and stored atomically. 1245 // 1253 // Side Effects: 1254 // disjoint_byte_copy_entry is set to the no-overlap entry point 1255 // used by generate_conjoint_byte_copy().
1256 // 1257 address generate_disjoint_byte_copy(bool aligned, address* entry, const char* name) { 1258 const bool not_oop = false; 1259 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1260 } 1261 1262 // Arguments: 1263 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1264 // ignored 1265 // name - stub name string 1266 // 1267 // Inputs: 1268 // c_rarg0 - source array address 1269 // c_rarg1 - destination array address 1270 // c_rarg2 - element count, treated as ssize_t, can be zero 1271 // 1272 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1273 // we let the hardware handle it. The one to eight bytes within words, 1274 // dwords or qwords that span cache line boundaries will still be loaded 1275 // and stored atomically. 1276 // 1277 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1278 address* entry, const char* name) { 1279 const bool not_oop = false; 1280 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1281 } 1282 1283 // Arguments: 1284 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1285 // ignored 1286 // name - stub name string 1287 // 1288 // Inputs: 1289 // c_rarg0 - source array address 1290 // c_rarg1 - destination array address 1291 // c_rarg2 - element count, treated as ssize_t, can be zero 1292 // 1293 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1294 // let the hardware handle it. The two or four words within dwords 1295 // or qwords that span cache line boundaries will still be loaded 1296 // and stored atomically. 1297 // 1298 // Side Effects: 1299 // disjoint_short_copy_entry is set to the no-overlap entry point 1300 // used by generate_conjoint_short_copy(). 1301 // 1302 address generate_disjoint_short_copy(bool aligned, 1303 address* entry, const char* name) { 1304 const bool not_oop = false; 1305 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1306 } 1307 1308 // Arguments: 1309 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1310 // ignored 1311 // name - stub name string 1312 // 1313 // Inputs: 1314 // c_rarg0 - source array address 1315 // c_rarg1 - destination array address 1316 // c_rarg2 - element count, treated as ssize_t, can be zero 1317 // 1318 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1319 // let the hardware handle it. The two or four words within dwords 1320 // or qwords that span cache line boundaries will still be loaded 1321 // and stored atomically. 1322 // 1323 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1324 address* entry, const char* name) { 1325 const bool not_oop = false; 1326 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1327 } 1328 1329 // Arguments: 1330 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1331 // ignored 1332 // name - stub name string 1333 // 1334 // Inputs: 1335 // c_rarg0 - source array address 1336 // c_rarg1 - destination array address 1337 // c_rarg2 - element count, treated as ssize_t, can be zero 1338 // 1339 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1340 // the hardware handle it. The two dwords within qwords that span 1341 // cache line boundaries will still be loaded and stored atomically. 
1342 // 1343 // Side Effects: 1344 // disjoint_int_copy_entry is set to the no-overlap entry point 1345 // used by generate_conjoint_int_oop_copy(). 1346 // 1347 address generate_disjoint_int_copy(bool aligned, address* entry, 1348 const char* name, bool dest_uninitialized = false) { 1349 const bool not_oop = false; 1350 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1351 } 1352 1353 // Arguments: 1354 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1355 // ignored 1356 // name - stub name string 1357 // 1358 // Inputs: 1359 // c_rarg0 - source array address 1360 // c_rarg1 - destination array address 1361 // c_rarg2 - element count, treated as ssize_t, can be zero 1362 // 1363 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1364 // the hardware handle it. The two dwords within qwords that span 1365 // cache line boundaries will still be loaded and stored atomically. 1366 // 1367 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1368 address* entry, const char* name, 1369 bool dest_uninitialized = false) { 1370 const bool not_oop = false; 1371 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1372 } 1373 1374 1375 // Arguments: 1376 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1377 // ignored 1378 // name - stub name string 1379 // 1380 // Inputs: 1381 // c_rarg0 - source array address 1382 // c_rarg1 - destination array address 1383 // c_rarg2 - element count, treated as size_t, can be zero 1384 // 1385 // Side Effects: 1386 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1387 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1388 // 1389 address generate_disjoint_long_copy(bool aligned, address* entry, 1390 const char* name, bool dest_uninitialized = false) { 1391 const bool not_oop = false; 1392 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1393 } 1394 1395 // Arguments: 1396 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1397 // ignored 1398 // name - stub name string 1399 // 1400 // Inputs: 1401 // c_rarg0 - source array address 1402 // c_rarg1 - destination array address 1403 // c_rarg2 - element count, treated as size_t, can be zero 1404 // 1405 address generate_conjoint_long_copy(bool aligned, 1406 address nooverlap_target, address* entry, 1407 const char* name, bool dest_uninitialized = false) { 1408 const bool not_oop = false; 1409 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1410 } 1411 1412 // Arguments: 1413 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1414 // ignored 1415 // name - stub name string 1416 // 1417 // Inputs: 1418 // c_rarg0 - source array address 1419 // c_rarg1 - destination array address 1420 // c_rarg2 - element count, treated as size_t, can be zero 1421 // 1422 // Side Effects: 1423 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1424 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1425 // 1426 address generate_disjoint_oop_copy(bool aligned, address* entry, 1427 const char* name, bool dest_uninitialized) { 1428 const bool is_oop = true; 1429 const size_t size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1430 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1431 } 1432 1433 // Arguments: 1434 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1435 // ignored 1436 // name - stub name string 1437 // 1438 // Inputs: 1439 // c_rarg0 - source array address 1440 // c_rarg1 - destination array address 1441 // c_rarg2 - element count, treated as size_t, can be zero 1442 // 1443 address generate_conjoint_oop_copy(bool aligned, 1444 address nooverlap_target, address* entry, 1445 const char* name, bool dest_uninitialized) { 1446 const bool is_oop = true; 1447 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1448 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1449 name, dest_uninitialized); 1450 } 1451 1452 // Helper for generating a dynamic type check. 1453 // Smashes t0, t1. 1454 void generate_type_check(Register sub_klass, 1455 Register super_check_offset, 1456 Register super_klass, 1457 Label& L_success) { 1458 assert_different_registers(sub_klass, super_check_offset, super_klass); 1459 1460 BLOCK_COMMENT("type_check:"); 1461 1462 Label L_miss; 1463 1464 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset); 1465 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr); 1466 1467 // Fall through on failure! 1468 __ BIND(L_miss); 1469 } 1470 1471 // 1472 // Generate checkcasting array copy stub 1473 // 1474 // Input: 1475 // c_rarg0 - source array address 1476 // c_rarg1 - destination array address 1477 // c_rarg2 - element count, treated as ssize_t, can be zero 1478 // c_rarg3 - size_t ckoff (super_check_offset) 1479 // c_rarg4 - oop ckval (super_klass) 1480 // 1481 // Output: 1482 // x10 == 0 - success 1483 // x10 == -1^K - failure, where K is partial transfer count 1484 // 1485 address generate_checkcast_copy(const char* name, address* entry, 1486 bool dest_uninitialized = false) { 1487 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1488 1489 // Input registers (after setup_arg_regs) 1490 const Register from = c_rarg0; // source array address 1491 const Register to = c_rarg1; // destination array address 1492 const Register count = c_rarg2; // elementscount 1493 const Register ckoff = c_rarg3; // super_check_offset 1494 const Register ckval = c_rarg4; // super_klass 1495 1496 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1497 RegSet wb_post_saved_regs = RegSet::of(count); 1498 1499 // Registers used as temps (x7, x9, x18 are save-on-entry) 1500 const Register count_save = x19; // orig elementscount 1501 const Register start_to = x18; // destination array start address 1502 const Register copied_oop = x7; // actual oop copied 1503 const Register r9_klass = x9; // oop._klass 1504 1505 // Registers used as gc temps (x15, x16, x17 are save-on-call) 1506 const Register gct1 = x15, gct2 = x16, gct3 = x17; 1507 1508 //--------------------------------------------------------------- 1509 // Assembler stub will be used for this call to arraycopy 1510 // if the two arrays are subtypes of Object[] but the 1511 // destination array type is not equal to or a supertype 1512 // of the source type. Each element must be separately 1513 // checked. 
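// Copy loop protocol (see below): null elements are stored without a check; for non-null elements
// the klass is loaded and generate_type_check() verifies it against ckval using ckoff before the
// store is allowed to proceed.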
1514 1515 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1516 copied_oop, r9_klass, count_save); 1517 1518 __ align(CodeEntryAlignment); 1519 StubCodeMark mark(this, "StubRoutines", name); 1520 address start = __ pc(); 1521 1522 __ enter(); // required for proper stackwalking of RuntimeStub frame 1523 1524 // Caller of this entry point must set up the argument registers. 1525 if (entry != nullptr) { 1526 *entry = __ pc(); 1527 BLOCK_COMMENT("Entry:"); 1528 } 1529 1530 // Empty array: Nothing to do 1531 __ beqz(count, L_done); 1532 1533 __ push_reg(RegSet::of(x7, x9, x18, x19), sp); 1534 1535 #ifdef ASSERT 1536 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1537 // The ckoff and ckval must be mutually consistent, 1538 // even though caller generates both. 1539 { Label L; 1540 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1541 __ lwu(start_to, Address(ckval, sco_offset)); 1542 __ beq(ckoff, start_to, L); 1543 __ stop("super_check_offset inconsistent"); 1544 __ bind(L); 1545 } 1546 #endif //ASSERT 1547 1548 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1549 if (dest_uninitialized) { 1550 decorators |= IS_DEST_UNINITIALIZED; 1551 } 1552 1553 bool is_oop = true; 1554 int element_size = UseCompressedOops ? 4 : 8; 1555 1556 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1557 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1558 1559 // save the original count 1560 __ mv(count_save, count); 1561 1562 // Copy from low to high addresses 1563 __ mv(start_to, to); // Save destination array start address 1564 __ j(L_load_element); 1565 1566 // ======== begin loop ======== 1567 // (Loop is rotated; its entry is L_load_element.) 1568 // Loop control: 1569 // for count to 0 do 1570 // copied_oop = load_heap_oop(from++) 1571 // ... generate_type_check ... 1572 // store_heap_oop(to++, copied_oop) 1573 // end 1574 1575 __ align(OptoLoopAlignment); 1576 1577 __ BIND(L_store_element); 1578 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1579 Address(to, 0), copied_oop, 1580 gct1, gct2, gct3); 1581 __ add(to, to, UseCompressedOops ? 4 : 8); 1582 __ sub(count, count, 1); 1583 __ beqz(count, L_do_card_marks); 1584 1585 // ======== loop entry is here ======== 1586 __ BIND(L_load_element); 1587 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1588 copied_oop, Address(from, 0), 1589 gct1); 1590 __ add(from, from, UseCompressedOops ? 4 : 8); 1591 __ beqz(copied_oop, L_store_element); 1592 1593 __ load_klass(r9_klass, copied_oop);// query the object klass 1594 generate_type_check(r9_klass, ckoff, ckval, L_store_element); 1595 // ======== end loop ======== 1596 1597 // It was a real error; we must depend on the caller to finish the job. 1598 // Register count = remaining oops, count_orig = total oops. 1599 // Emit GC store barriers for the oops we have copied and report 1600 // their number to the caller. 
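// Failure protocol: count_save - count gives K, the number of elements already copied; the stub
// reports -1 ^ K (i.e. ~K) in x10, while a copy that runs to completion reaches L_done with
// count == 0 and returns 0.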
1601 1602 __ sub(count, count_save, count); // K = partially copied oop count 1603 __ xori(count, count, -1); // report (-1^K) to caller 1604 __ beqz(count, L_done_pop); 1605
1606 __ BIND(L_do_card_marks); 1607 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs); 1608
1609 __ bind(L_done_pop); 1610 __ pop_reg(RegSet::of(x7, x9, x18, x19), sp); 1611 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1612
1613 __ bind(L_done); 1614 __ mv(x10, count); 1615 __ leave(); 1616 __ ret(); 1617
1618 return start; 1619 } 1620
1621 // Perform range checks on the proposed arraycopy. 1622 // Kills temp, but nothing else. 1623 // Also, clean the sign bits of src_pos and dst_pos. 1624 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1625 Register src_pos, // source position (c_rarg1) 1626 Register dst, // destination array oop (c_rarg2) 1627 Register dst_pos, // destination position (c_rarg3) 1628 Register length, 1629 Register temp, 1630 Label& L_failed) { 1631 BLOCK_COMMENT("arraycopy_range_checks:"); 1632
1633 assert_different_registers(t0, temp); 1634
1635 // if [src_pos + length > arrayOop(src)->length()] then FAIL 1636 __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes())); 1637 __ addw(temp, length, src_pos); 1638 __ bgtu(temp, t0, L_failed); 1639
1640 // if [dst_pos + length > arrayOop(dst)->length()] then FAIL 1641 __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1642 __ addw(temp, length, dst_pos); 1643 __ bgtu(temp, t0, L_failed); 1644
1645 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1646 __ zero_extend(src_pos, src_pos, 32); 1647 __ zero_extend(dst_pos, dst_pos, 32); 1648
1649 BLOCK_COMMENT("arraycopy_range_checks done"); 1650 } 1651
1652 // 1653 // Generate 'unsafe' array copy stub 1654 // Though just as safe as the other stubs, it takes an unscaled 1655 // size_t argument instead of an element count. 1656 // 1657 // Input: 1658 // c_rarg0 - source array address 1659 // c_rarg1 - destination array address 1660 // c_rarg2 - byte count, treated as ssize_t, can be zero 1661 // 1662 // Examines the alignment of the operands and dispatches 1663 // to a long, int, short, or byte copy loop.
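// Dispatch (below): src, dst and the byte count are OR-ed together so the low bits of the result
// reflect the strictest alignment common to all three; the byte count is then scaled to an
// element count before tail-jumping to the matching copy stub.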
1664 // 1665 address generate_unsafe_copy(const char* name, 1666 address byte_copy_entry, 1667 address short_copy_entry, 1668 address int_copy_entry, 1669 address long_copy_entry) { 1670 assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr && 1671 int_copy_entry != nullptr && long_copy_entry != nullptr); 1672 Label L_long_aligned, L_int_aligned, L_short_aligned; 1673 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1674 1675 __ align(CodeEntryAlignment); 1676 StubCodeMark mark(this, "StubRoutines", name); 1677 address start = __ pc(); 1678 __ enter(); // required for proper stackwalking of RuntimeStub frame 1679 1680 // bump this on entry, not on exit: 1681 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1682 1683 __ orr(t0, s, d); 1684 __ orr(t0, t0, count); 1685 1686 __ andi(t0, t0, BytesPerLong - 1); 1687 __ beqz(t0, L_long_aligned); 1688 __ andi(t0, t0, BytesPerInt - 1); 1689 __ beqz(t0, L_int_aligned); 1690 __ test_bit(t0, t0, 0); 1691 __ beqz(t0, L_short_aligned); 1692 __ j(RuntimeAddress(byte_copy_entry)); 1693 1694 __ BIND(L_short_aligned); 1695 __ srli(count, count, LogBytesPerShort); // size => short_count 1696 __ j(RuntimeAddress(short_copy_entry)); 1697 __ BIND(L_int_aligned); 1698 __ srli(count, count, LogBytesPerInt); // size => int_count 1699 __ j(RuntimeAddress(int_copy_entry)); 1700 __ BIND(L_long_aligned); 1701 __ srli(count, count, LogBytesPerLong); // size => long_count 1702 __ j(RuntimeAddress(long_copy_entry)); 1703 1704 return start; 1705 } 1706 1707 // 1708 // Generate generic array copy stubs 1709 // 1710 // Input: 1711 // c_rarg0 - src oop 1712 // c_rarg1 - src_pos (32-bits) 1713 // c_rarg2 - dst oop 1714 // c_rarg3 - dst_pos (32-bits) 1715 // c_rarg4 - element count (32-bits) 1716 // 1717 // Output: 1718 // x10 == 0 - success 1719 // x10 == -1^K - failure, where K is partial transfer count 1720 // 1721 address generate_generic_copy(const char* name, 1722 address byte_copy_entry, address short_copy_entry, 1723 address int_copy_entry, address oop_copy_entry, 1724 address long_copy_entry, address checkcast_copy_entry) { 1725 assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr && 1726 int_copy_entry != nullptr && oop_copy_entry != nullptr && 1727 long_copy_entry != nullptr && checkcast_copy_entry != nullptr); 1728 Label L_failed, L_failed_0, L_objArray; 1729 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1730 1731 // Input registers 1732 const Register src = c_rarg0; // source array oop 1733 const Register src_pos = c_rarg1; // source position 1734 const Register dst = c_rarg2; // destination array oop 1735 const Register dst_pos = c_rarg3; // destination position 1736 const Register length = c_rarg4; 1737 1738 // Registers used as temps 1739 const Register dst_klass = c_rarg5; 1740 1741 __ align(CodeEntryAlignment); 1742 1743 StubCodeMark mark(this, "StubRoutines", name); 1744 1745 address start = __ pc(); 1746 1747 __ enter(); // required for proper stackwalking of RuntimeStub frame 1748 1749 // bump this on entry, not on exit: 1750 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1751 1752 //----------------------------------------------------------------------- 1753 // Assembler stub will be used for this call to arraycopy 1754 // if the following conditions are met: 1755 // 1756 // (1) src and dst must not be null. 1757 // (2) src_pos must not be negative. 1758 // (3) dst_pos must not be negative. 1759 // (4) length must not be negative. 
1760 // (5) src klass and dst klass should be the same and not null. 1761 // (6) src and dst should be arrays. 1762 // (7) src_pos + length must not exceed length of src. 1763 // (8) dst_pos + length must not exceed length of dst. 1764 // 1765 1766 // if src is null then return -1 1767 __ beqz(src, L_failed); 1768 1769 // if [src_pos < 0] then return -1 1770 __ sign_extend(t0, src_pos, 32); 1771 __ bltz(t0, L_failed); 1772 1773 // if dst is null then return -1 1774 __ beqz(dst, L_failed); 1775 1776 // if [dst_pos < 0] then return -1 1777 __ sign_extend(t0, dst_pos, 32); 1778 __ bltz(t0, L_failed); 1779 1780 // registers used as temp 1781 const Register scratch_length = x28; // elements count to copy 1782 const Register scratch_src_klass = x29; // array klass 1783 const Register lh = x30; // layout helper 1784 1785 // if [length < 0] then return -1 1786 __ sign_extend(scratch_length, length, 32); // length (elements count, 32-bits value) 1787 __ bltz(scratch_length, L_failed); 1788 1789 __ load_klass(scratch_src_klass, src); 1790 #ifdef ASSERT 1791 { 1792 BLOCK_COMMENT("assert klasses not null {"); 1793 Label L1, L2; 1794 __ bnez(scratch_src_klass, L2); // it is broken if klass is null 1795 __ bind(L1); 1796 __ stop("broken null klass"); 1797 __ bind(L2); 1798 __ load_klass(t0, dst, t1); 1799 __ beqz(t0, L1); // this would be broken also 1800 BLOCK_COMMENT("} assert klasses not null done"); 1801 } 1802 #endif 1803 1804 // Load layout helper (32-bits) 1805 // 1806 // |array_tag| | header_size | element_type | |log2_element_size| 1807 // 32 30 24 16 8 2 0 1808 // 1809 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 1810 // 1811 1812 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 1813 1814 // Handle objArrays completely differently... 1815 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 1816 __ lw(lh, Address(scratch_src_klass, lh_offset)); 1817 __ mv(t0, objArray_lh); 1818 __ beq(lh, t0, L_objArray); 1819 1820 // if [src->klass() != dst->klass()] then return -1 1821 __ load_klass(t1, dst); 1822 __ bne(t1, scratch_src_klass, L_failed); 1823 1824 // if !src->is_Array() then return -1 1825 // i.e. (lh >= 0) 1826 __ bgez(lh, L_failed); 1827 1828 // At this point, it is known to be a typeArray (array_tag 0x3).
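// In terms of the layout helper diagram above: the array tag occupies the two most
// significant bits, so every array klass (tag 0x2 or 0x3) has a negative layout
// helper while a non-array klass has lh >= 0. That is why the single bgez test above
// rejects non-arrays, and why, with the objArray case already branched off, any
// array klass reaching this point must be a typeArray.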
1829 #ifdef ASSERT 1830 { 1831 BLOCK_COMMENT("assert primitive array {"); 1832 Label L; 1833 __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift)); 1834 __ bge(lh, t1, L); 1835 __ stop("must be a primitive array"); 1836 __ bind(L); 1837 BLOCK_COMMENT("} assert primitive array done"); 1838 } 1839 #endif 1840 1841 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1842 t1, L_failed); 1843 1844 // TypeArrayKlass 1845 // 1846 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize) 1847 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize) 1848 // 1849 1850 const Register t0_offset = t0; // array offset 1851 const Register x30_elsize = lh; // element size 1852 1853 // Get array_header_in_bytes() 1854 int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1); 1855 int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width; 1856 __ slli(t0_offset, lh, XLEN - lh_header_size_msb); // left shift to remove 24 ~ 32; 1857 __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset 1858 1859 __ add(src, src, t0_offset); // src array offset 1860 __ add(dst, dst, t0_offset); // dst array offset 1861 BLOCK_COMMENT("choose copy loop based on element size"); 1862 1863 // next registers should be set before the jump to corresponding stub 1864 const Register from = c_rarg0; // source array address 1865 const Register to = c_rarg1; // destination array address 1866 const Register count = c_rarg2; // elements count 1867 1868 // 'from', 'to', 'count' registers should be set in such order 1869 // since they are the same as 'src', 'src_pos', 'dst'. 1870 1871 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 1872 1873 // The possible values of elsize are 0-3, i.e. exact_log2(element 1874 // size in bytes). We do a simple bitwise binary search. 
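// Concretely, bit 1 of elsize is tested first and bit 0 second:
// elsize = 0b00 -> byte copy, 0b01 -> short copy, 0b10 -> int copy, 0b11 -> long copy.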
1875 __ BIND(L_copy_bytes); 1876 __ test_bit(t0, x30_elsize, 1); 1877 __ bnez(t0, L_copy_ints); 1878 __ test_bit(t0, x30_elsize, 0); 1879 __ bnez(t0, L_copy_shorts); 1880 __ add(from, src, src_pos); // src_addr 1881 __ add(to, dst, dst_pos); // dst_addr 1882 __ sign_extend(count, scratch_length, 32); // length 1883 __ j(RuntimeAddress(byte_copy_entry)); 1884 1885 __ BIND(L_copy_shorts); 1886 __ shadd(from, src_pos, src, t0, 1); // src_addr 1887 __ shadd(to, dst_pos, dst, t0, 1); // dst_addr 1888 __ sign_extend(count, scratch_length, 32); // length 1889 __ j(RuntimeAddress(short_copy_entry)); 1890 1891 __ BIND(L_copy_ints); 1892 __ test_bit(t0, x30_elsize, 0); 1893 __ bnez(t0, L_copy_longs); 1894 __ shadd(from, src_pos, src, t0, 2); // src_addr 1895 __ shadd(to, dst_pos, dst, t0, 2); // dst_addr 1896 __ sign_extend(count, scratch_length, 32); // length 1897 __ j(RuntimeAddress(int_copy_entry)); 1898 1899 __ BIND(L_copy_longs); 1900 #ifdef ASSERT 1901 { 1902 BLOCK_COMMENT("assert long copy {"); 1903 Label L; 1904 __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize 1905 __ sign_extend(lh, lh, 32); 1906 __ mv(t0, LogBytesPerLong); 1907 __ beq(x30_elsize, t0, L); 1908 __ stop("must be long copy, but elsize is wrong"); 1909 __ bind(L); 1910 BLOCK_COMMENT("} assert long copy done"); 1911 } 1912 #endif 1913 __ shadd(from, src_pos, src, t0, 3); // src_addr 1914 __ shadd(to, dst_pos, dst, t0, 3); // dst_addr 1915 __ sign_extend(count, scratch_length, 32); // length 1916 __ j(RuntimeAddress(long_copy_entry)); 1917 1918 // ObjArrayKlass 1919 __ BIND(L_objArray); 1920 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 1921 1922 Label L_plain_copy, L_checkcast_copy; 1923 // test array classes for subtyping 1924 __ load_klass(t2, dst); 1925 __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality 1926 1927 // Identically typed arrays can be copied without element-wise checks. 1928 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1929 t1, L_failed); 1930 1931 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop); 1932 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1933 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop); 1934 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1935 __ sign_extend(count, scratch_length, 32); // length 1936 __ BIND(L_plain_copy); 1937 __ j(RuntimeAddress(oop_copy_entry)); 1938 1939 __ BIND(L_checkcast_copy); 1940 // live at this point: scratch_src_klass, scratch_length, t2 (dst_klass) 1941 { 1942 // Before looking at dst.length, make sure dst is also an objArray. 1943 __ lwu(t0, Address(t2, lh_offset)); 1944 __ mv(t1, objArray_lh); 1945 __ bne(t0, t1, L_failed); 1946 1947 // It is safe to examine both src.length and dst.length. 1948 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1949 t2, L_failed); 1950 1951 __ load_klass(dst_klass, dst); // reload 1952 1953 // Marshal the base address arguments now, freeing registers. 1954 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop); 1955 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1956 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop); 1957 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1958 __ sign_extend(count, length, 32); // length (reloaded) 1959 const Register sco_temp = c_rarg3; // this register is free now 1960 assert_different_registers(from, to, count, sco_temp, 1961 dst_klass, scratch_src_klass); 1962 1963 // Generate the type check. 
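// If the source array klass is a subtype of the destination array klass, every
// source element is assignable and generate_type_check branches straight to
// L_plain_copy (the unchecked oop copy). Otherwise we fall through and set up the
// element-wise checkcast copy, which takes the destination element klass and its
// super_check_offset as the two extra arguments mentioned below.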
1964 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1965 __ lwu(sco_temp, Address(dst_klass, sco_offset)); 1966 1967 // Smashes t0, t1 1968 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 1969 1970 // Fetch destination element klass from the ObjArrayKlass header. 1971 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 1972 __ ld(dst_klass, Address(dst_klass, ek_offset)); 1973 __ lwu(sco_temp, Address(dst_klass, sco_offset)); 1974 1975 // the checkcast_copy loop needs two extra arguments: 1976 assert(c_rarg3 == sco_temp, "#3 already in place"); 1977 // Set up arguments for checkcast_copy_entry. 1978 __ mv(c_rarg4, dst_klass); // dst.klass.element_klass 1979 __ j(RuntimeAddress(checkcast_copy_entry)); 1980 } 1981 1982 __ BIND(L_failed); 1983 __ mv(x10, -1); 1984 __ leave(); // required for proper stackwalking of RuntimeStub frame 1985 __ ret(); 1986 1987 return start; 1988 } 1989 1990 // 1991 // Generate stub for array fill. If "aligned" is true, the 1992 // "to" address is assumed to be heapword aligned. 1993 // 1994 // Arguments for generated stub: 1995 // to: c_rarg0 1996 // value: c_rarg1 1997 // count: c_rarg2 treated as signed 1998 // 1999 address generate_fill(BasicType t, bool aligned, const char* name) { 2000 __ align(CodeEntryAlignment); 2001 StubCodeMark mark(this, "StubRoutines", name); 2002 address start = __ pc(); 2003 2004 BLOCK_COMMENT("Entry:"); 2005 2006 const Register to = c_rarg0; // destination array address 2007 const Register value = c_rarg1; // value 2008 const Register count = c_rarg2; // elements count 2009 2010 const Register bz_base = x28; // base for block_zero routine 2011 const Register cnt_words = x29; // temp register 2012 const Register tmp_reg = t1; 2013 2014 __ enter(); 2015 2016 Label L_fill_elements, L_exit1; 2017 2018 int shift = -1; 2019 switch (t) { 2020 case T_BYTE: 2021 shift = 0; 2022 2023 // Zero extend value 2024 // 8 bit -> 16 bit 2025 __ andi(value, value, 0xff); 2026 __ mv(tmp_reg, value); 2027 __ slli(tmp_reg, tmp_reg, 8); 2028 __ orr(value, value, tmp_reg); 2029 2030 // 16 bit -> 32 bit 2031 __ mv(tmp_reg, value); 2032 __ slli(tmp_reg, tmp_reg, 16); 2033 __ orr(value, value, tmp_reg); 2034 2035 __ mv(tmp_reg, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2036 __ bltu(count, tmp_reg, L_fill_elements); 2037 break; 2038 case T_SHORT: 2039 shift = 1; 2040 // Zero extend value 2041 // 16 bit -> 32 bit 2042 __ andi(value, value, 0xffff); 2043 __ mv(tmp_reg, value); 2044 __ slli(tmp_reg, tmp_reg, 16); 2045 __ orr(value, value, tmp_reg); 2046 2047 // Short arrays (< 8 bytes) fill by element 2048 __ mv(tmp_reg, 8 >> shift); 2049 __ bltu(count, tmp_reg, L_fill_elements); 2050 break; 2051 case T_INT: 2052 shift = 2; 2053 2054 // Short arrays (< 8 bytes) fill by element 2055 __ mv(tmp_reg, 8 >> shift); 2056 __ bltu(count, tmp_reg, L_fill_elements); 2057 break; 2058 default: ShouldNotReachHere(); 2059 } 2060 2061 // Align destination address at 8 bytes address boundary. 2062 Label L_skip_align1, L_skip_align2, L_skip_align4; 2063 if (!aligned) { 2064 switch (t) { 2065 case T_BYTE: 2066 // One byte misalignment happens only for byte arrays. 2067 __ test_bit(t0, to, 0); 2068 __ beqz(t0, L_skip_align1); 2069 __ sb(value, Address(to, 0)); 2070 __ addi(to, to, 1); 2071 __ addiw(count, count, -1); 2072 __ bind(L_skip_align1); 2073 // Fallthrough 2074 case T_SHORT: 2075 // Two bytes misalignment happens only for byte and short (char) arrays.
2076 __ test_bit(t0, to, 1); 2077 __ beqz(t0, L_skip_align2); 2078 __ sh(value, Address(to, 0)); 2079 __ addi(to, to, 2); 2080 __ addiw(count, count, -(2 >> shift)); 2081 __ bind(L_skip_align2); 2082 // Fallthrough 2083 case T_INT: 2084 // Align to 8 bytes, we know we are 4 byte aligned to start. 2085 __ test_bit(t0, to, 2); 2086 __ beqz(t0, L_skip_align4); 2087 __ sw(value, Address(to, 0)); 2088 __ addi(to, to, 4); 2089 __ addiw(count, count, -(4 >> shift)); 2090 __ bind(L_skip_align4); 2091 break; 2092 default: ShouldNotReachHere(); 2093 } 2094 } 2095 2096 // 2097 // Fill large chunks 2098 // 2099 __ srliw(cnt_words, count, 3 - shift); // number of words 2100 2101 // 32 bit -> 64 bit 2102 __ andi(value, value, 0xffffffff); 2103 __ mv(tmp_reg, value); 2104 __ slli(tmp_reg, tmp_reg, 32); 2105 __ orr(value, value, tmp_reg); 2106 2107 __ slli(tmp_reg, cnt_words, 3 - shift); 2108 __ subw(count, count, tmp_reg); 2109 { 2110 __ fill_words(to, cnt_words, value); 2111 } 2112 2113 // Remaining count is less than 8 bytes. Fill it by a single store. 2114 // Note that the total length is no less than 8 bytes. 2115 if (t == T_BYTE || t == T_SHORT) { 2116 __ beqz(count, L_exit1); 2117 __ shadd(to, count, to, tmp_reg, shift); // points to the end 2118 __ sd(value, Address(to, -8)); // overwrite some elements 2119 __ bind(L_exit1); 2120 __ leave(); 2121 __ ret(); 2122 } 2123 2124 // Handle copies less than 8 bytes. 2125 Label L_fill_2, L_fill_4, L_exit2; 2126 __ bind(L_fill_elements); 2127 switch (t) { 2128 case T_BYTE: 2129 __ test_bit(t0, count, 0); 2130 __ beqz(t0, L_fill_2); 2131 __ sb(value, Address(to, 0)); 2132 __ addi(to, to, 1); 2133 __ bind(L_fill_2); 2134 __ test_bit(t0, count, 1); 2135 __ beqz(t0, L_fill_4); 2136 __ sh(value, Address(to, 0)); 2137 __ addi(to, to, 2); 2138 __ bind(L_fill_4); 2139 __ test_bit(t0, count, 2); 2140 __ beqz(t0, L_exit2); 2141 __ sw(value, Address(to, 0)); 2142 break; 2143 case T_SHORT: 2144 __ test_bit(t0, count, 0); 2145 __ beqz(t0, L_fill_4); 2146 __ sh(value, Address(to, 0)); 2147 __ addi(to, to, 2); 2148 __ bind(L_fill_4); 2149 __ test_bit(t0, count, 1); 2150 __ beqz(t0, L_exit2); 2151 __ sw(value, Address(to, 0)); 2152 break; 2153 case T_INT: 2154 __ beqz(count, L_exit2); 2155 __ sw(value, Address(to, 0)); 2156 break; 2157 default: ShouldNotReachHere(); 2158 } 2159 __ bind(L_exit2); 2160 __ leave(); 2161 __ ret(); 2162 return start; 2163 } 2164 2165 void generate_arraycopy_stubs() { 2166 address entry = nullptr; 2167 address entry_jbyte_arraycopy = nullptr; 2168 address entry_jshort_arraycopy = nullptr; 2169 address entry_jint_arraycopy = nullptr; 2170 address entry_oop_arraycopy = nullptr; 2171 address entry_jlong_arraycopy = nullptr; 2172 address entry_checkcast_arraycopy = nullptr; 2173 2174 generate_copy_longs(copy_f, c_rarg0, c_rarg1, t1, copy_forwards); 2175 generate_copy_longs(copy_b, c_rarg0, c_rarg1, t1, copy_backwards); 2176 2177 StubRoutines::riscv::_zero_blocks = generate_zero_blocks(); 2178 2179 //*** jbyte 2180 // Always need aligned and unaligned versions 2181 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2182 "jbyte_disjoint_arraycopy"); 2183 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2184 &entry_jbyte_arraycopy, 2185 "jbyte_arraycopy"); 2186 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2187 "arrayof_jbyte_disjoint_arraycopy"); 2188 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2189 
"arrayof_jbyte_arraycopy"); 2190 2191 //*** jshort 2192 // Always need aligned and unaligned versions 2193 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2194 "jshort_disjoint_arraycopy"); 2195 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2196 &entry_jshort_arraycopy, 2197 "jshort_arraycopy"); 2198 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2199 "arrayof_jshort_disjoint_arraycopy"); 2200 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2201 "arrayof_jshort_arraycopy"); 2202 2203 //*** jint 2204 // Aligned versions 2205 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2206 "arrayof_jint_disjoint_arraycopy"); 2207 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2208 "arrayof_jint_arraycopy"); 2209 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2210 // entry_jint_arraycopy always points to the unaligned version 2211 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2212 "jint_disjoint_arraycopy"); 2213 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2214 &entry_jint_arraycopy, 2215 "jint_arraycopy"); 2216 2217 //*** jlong 2218 // It is always aligned 2219 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2220 "arrayof_jlong_disjoint_arraycopy"); 2221 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2222 "arrayof_jlong_arraycopy"); 2223 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2224 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2225 2226 //*** oops 2227 { 2228 // With compressed oops we need unaligned versions; notice that 2229 // we overwrite entry_oop_arraycopy. 
2230 bool aligned = !UseCompressedOops; 2231 2232 StubRoutines::_arrayof_oop_disjoint_arraycopy 2233 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2234 /*dest_uninitialized*/false); 2235 StubRoutines::_arrayof_oop_arraycopy 2236 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2237 /*dest_uninitialized*/false); 2238 // Aligned versions without pre-barriers 2239 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2240 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2241 /*dest_uninitialized*/true); 2242 StubRoutines::_arrayof_oop_arraycopy_uninit 2243 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2244 /*dest_uninitialized*/true); 2245 } 2246 2247 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2248 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2249 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2250 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2251 2252 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2253 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2254 /*dest_uninitialized*/true); 2255 2256 2257 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2258 entry_jbyte_arraycopy, 2259 entry_jshort_arraycopy, 2260 entry_jint_arraycopy, 2261 entry_jlong_arraycopy); 2262 2263 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2264 entry_jbyte_arraycopy, 2265 entry_jshort_arraycopy, 2266 entry_jint_arraycopy, 2267 entry_oop_arraycopy, 2268 entry_jlong_arraycopy, 2269 entry_checkcast_arraycopy); 2270 2271 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2272 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2273 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2274 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2275 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2276 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2277 } 2278 2279 void generate_aes_loadkeys(const Register &key, VectorRegister *working_vregs, int rounds) { 2280 const int step = 16; 2281 for (int i = 0; i < rounds; i++) { 2282 __ vle32_v(working_vregs[i], key); 2283 // The keys are stored in little-endian array, while we need 2284 // to operate in big-endian. 
2285 // So performing an endian-swap here with vrev8.v instruction 2286 __ vrev8_v(working_vregs[i], working_vregs[i]); 2287 __ addi(key, key, step); 2288 } 2289 } 2290 2291 void generate_aes_encrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) { 2292 assert(rounds <= 15, "rounds should be less than or equal to working_vregs size"); 2293 2294 __ vxor_vv(res, res, working_vregs[0]); 2295 for (int i = 1; i < rounds - 1; i++) { 2296 __ vaesem_vv(res, working_vregs[i]); 2297 } 2298 __ vaesef_vv(res, working_vregs[rounds - 1]); 2299 } 2300 2301 // Arguments: 2302 // 2303 // Inputs: 2304 // c_rarg0 - source byte array address 2305 // c_rarg1 - destination byte array address 2306 // c_rarg2 - K (key) in little endian int array 2307 // 2308 address generate_aescrypt_encryptBlock() { 2309 assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support"); 2310 2311 __ align(CodeEntryAlignment); 2312 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2313 2314 Label L_aes128, L_aes192; 2315 2316 const Register from = c_rarg0; // source array address 2317 const Register to = c_rarg1; // destination array address 2318 const Register key = c_rarg2; // key array address 2319 const Register keylen = c_rarg3; 2320 2321 VectorRegister working_vregs[] = { 2322 v4, v5, v6, v7, v8, v9, v10, v11, 2323 v12, v13, v14, v15, v16, v17, v18 2324 }; 2325 const VectorRegister res = v19; 2326 2327 address start = __ pc(); 2328 __ enter(); 2329 2330 __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2331 2332 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1); 2333 __ vle32_v(res, from); 2334 2335 __ mv(t2, 52); 2336 __ blt(keylen, t2, L_aes128); 2337 __ beq(keylen, t2, L_aes192); 2338 // Else we fallthrough to the biggest case (256-bit key size) 2339 2340 // Note: the following function performs key += 15*16 2341 generate_aes_loadkeys(key, working_vregs, 15); 2342 generate_aes_encrypt(res, working_vregs, 15); 2343 __ vse32_v(res, to); 2344 __ mv(c_rarg0, 0); 2345 __ leave(); 2346 __ ret(); 2347 2348 __ bind(L_aes192); 2349 // Note: the following function performs key += 13*16 2350 generate_aes_loadkeys(key, working_vregs, 13); 2351 generate_aes_encrypt(res, working_vregs, 13); 2352 __ vse32_v(res, to); 2353 __ mv(c_rarg0, 0); 2354 __ leave(); 2355 __ ret(); 2356 2357 __ bind(L_aes128); 2358 // Note: the following function performs key += 11*16 2359 generate_aes_loadkeys(key, working_vregs, 11); 2360 generate_aes_encrypt(res, working_vregs, 11); 2361 __ vse32_v(res, to); 2362 __ mv(c_rarg0, 0); 2363 __ leave(); 2364 __ ret(); 2365 2366 return start; 2367 } 2368 2369 void generate_aes_decrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) { 2370 assert(rounds <= 15, "rounds should be less than or equal to working_vregs size"); 2371 2372 __ vxor_vv(res, res, working_vregs[rounds - 1]); 2373 for (int i = rounds - 2; i > 0; i--) { 2374 __ vaesdm_vv(res, working_vregs[i]); 2375 } 2376 __ vaesdf_vv(res, working_vregs[0]); 2377 } 2378 2379 // Arguments: 2380 // 2381 // Inputs: 2382 // c_rarg0 - source byte array address 2383 // c_rarg1 - destination byte array address 2384 // c_rarg2 - K (key) in little endian int array 2385 // 2386 address generate_aescrypt_decryptBlock() { 2387 assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support"); 2388 2389 __ align(CodeEntryAlignment); 2390 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2391 2392 Label L_aes128, 
L_aes192; 2393 2394 const Register from = c_rarg0; // source array address 2395 const Register to = c_rarg1; // destination array address 2396 const Register key = c_rarg2; // key array address 2397 const Register keylen = c_rarg3; 2398 2399 VectorRegister working_vregs[] = { 2400 v4, v5, v6, v7, v8, v9, v10, v11, 2401 v12, v13, v14, v15, v16, v17, v18 2402 }; 2403 const VectorRegister res = v19; 2404 2405 address start = __ pc(); 2406 __ enter(); // required for proper stackwalking of RuntimeStub frame 2407 2408 __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2409 2410 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1); 2411 __ vle32_v(res, from); 2412 2413 __ mv(t2, 52); 2414 __ blt(keylen, t2, L_aes128); 2415 __ beq(keylen, t2, L_aes192); 2416 // Else we fallthrough to the biggest case (256-bit key size) 2417 2418 // Note: the following function performs key += 15*16 2419 generate_aes_loadkeys(key, working_vregs, 15); 2420 generate_aes_decrypt(res, working_vregs, 15); 2421 __ vse32_v(res, to); 2422 __ mv(c_rarg0, 0); 2423 __ leave(); 2424 __ ret(); 2425 2426 __ bind(L_aes192); 2427 // Note: the following function performs key += 13*16 2428 generate_aes_loadkeys(key, working_vregs, 13); 2429 generate_aes_decrypt(res, working_vregs, 13); 2430 __ vse32_v(res, to); 2431 __ mv(c_rarg0, 0); 2432 __ leave(); 2433 __ ret(); 2434 2435 __ bind(L_aes128); 2436 // Note: the following function performs key += 11*16 2437 generate_aes_loadkeys(key, working_vregs, 11); 2438 generate_aes_decrypt(res, working_vregs, 11); 2439 __ vse32_v(res, to); 2440 __ mv(c_rarg0, 0); 2441 __ leave(); 2442 __ ret(); 2443 2444 return start; 2445 } 2446 2447 // code for comparing 16 bytes of strings with same encoding 2448 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 2449 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, tmp1 = x28, tmp2 = x29, tmp4 = x7, tmp5 = x31; 2450 __ ld(tmp5, Address(str1)); 2451 __ addi(str1, str1, 8); 2452 __ xorr(tmp4, tmp1, tmp2); 2453 __ ld(cnt1, Address(str2)); 2454 __ addi(str2, str2, 8); 2455 __ bnez(tmp4, DIFF1); 2456 __ ld(tmp1, Address(str1)); 2457 __ addi(str1, str1, 8); 2458 __ xorr(tmp4, tmp5, cnt1); 2459 __ ld(tmp2, Address(str2)); 2460 __ addi(str2, str2, 8); 2461 __ bnez(tmp4, DIFF2); 2462 } 2463 2464 // code for comparing 8 characters of strings with Latin1 and Utf16 encoding 2465 void compare_string_8_x_LU(Register tmpL, Register tmpU, Register strL, Register strU, Label& DIFF) { 2466 const Register tmp = x30, tmpLval = x12; 2467 __ ld(tmpLval, Address(strL)); 2468 __ addi(strL, strL, wordSize); 2469 __ ld(tmpU, Address(strU)); 2470 __ addi(strU, strU, wordSize); 2471 __ inflate_lo32(tmpL, tmpLval); 2472 __ xorr(tmp, tmpU, tmpL); 2473 __ bnez(tmp, DIFF); 2474 2475 __ ld(tmpU, Address(strU)); 2476 __ addi(strU, strU, wordSize); 2477 __ inflate_hi32(tmpL, tmpLval); 2478 __ xorr(tmp, tmpU, tmpL); 2479 __ bnez(tmp, DIFF); 2480 } 2481 2482 // x10 = result 2483 // x11 = str1 2484 // x12 = cnt1 2485 // x13 = str2 2486 // x14 = cnt2 2487 // x28 = tmp1 2488 // x29 = tmp2 2489 // x30 = tmp3 2490 address generate_compare_long_string_different_encoding(bool isLU) { 2491 __ align(CodeEntryAlignment); 2492 StubCodeMark mark(this, "StubRoutines", isLU ? 
"compare_long_string_different_encoding LU" : "compare_long_string_different_encoding UL"); 2493 address entry = __ pc(); 2494 Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE; 2495 const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14, 2496 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12; 2497 2498 // cnt2 == amount of characters left to compare 2499 // Check already loaded first 4 symbols 2500 __ inflate_lo32(tmp3, isLU ? tmp1 : tmp2); 2501 __ mv(isLU ? tmp1 : tmp2, tmp3); 2502 __ addi(str1, str1, isLU ? wordSize / 2 : wordSize); 2503 __ addi(str2, str2, isLU ? wordSize : wordSize / 2); 2504 __ sub(cnt2, cnt2, wordSize / 2); // Already loaded 4 symbols 2505 2506 __ xorr(tmp3, tmp1, tmp2); 2507 __ bnez(tmp3, CALCULATE_DIFFERENCE); 2508 2509 Register strU = isLU ? str2 : str1, 2510 strL = isLU ? str1 : str2, 2511 tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison 2512 tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison 2513 2514 // make sure main loop is 8 byte-aligned, we should load another 4 bytes from strL 2515 // cnt2 is >= 68 here, no need to check it for >= 0 2516 __ lwu(tmpL, Address(strL)); 2517 __ addi(strL, strL, wordSize / 2); 2518 __ ld(tmpU, Address(strU)); 2519 __ addi(strU, strU, wordSize); 2520 __ inflate_lo32(tmp3, tmpL); 2521 __ mv(tmpL, tmp3); 2522 __ xorr(tmp3, tmpU, tmpL); 2523 __ bnez(tmp3, CALCULATE_DIFFERENCE); 2524 __ addi(cnt2, cnt2, -wordSize / 2); 2525 2526 // we are now 8-bytes aligned on strL 2527 __ sub(cnt2, cnt2, wordSize * 2); 2528 __ bltz(cnt2, TAIL); 2529 __ bind(SMALL_LOOP); // smaller loop 2530 __ sub(cnt2, cnt2, wordSize * 2); 2531 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE); 2532 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE); 2533 __ bgez(cnt2, SMALL_LOOP); 2534 __ addi(t0, cnt2, wordSize * 2); 2535 __ beqz(t0, DONE); 2536 __ bind(TAIL); // 1..15 characters left 2537 // Aligned access. Load bytes in portions - 4, 2, 1. 
2538 2539 __ addi(t0, cnt2, wordSize); 2540 __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process 2541 __ bltz(t0, LOAD_LAST); 2542 // remaining characters are greater than or equal to 8, so we can do one compare_string_8_x_LU 2543 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE); 2544 __ addi(cnt2, cnt2, -wordSize); 2545 __ beqz(cnt2, DONE); // no character left 2546 __ bind(LOAD_LAST); // cnt2 = 1..7 characters left 2547 2548 __ addi(cnt2, cnt2, -wordSize); // cnt2 is now an offset in strL which points to last 8 bytes 2549 __ slli(t0, cnt2, 1); // t0 is now an offset in strU which points to last 16 bytes 2550 __ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string 2551 __ add(strU, strU, t0); // Address of last 16 bytes in UTF-16 string 2552 __ load_int_misaligned(tmpL, Address(strL), t0, false); 2553 __ load_long_misaligned(tmpU, Address(strU), t0, 2); 2554 __ inflate_lo32(tmp3, tmpL); 2555 __ mv(tmpL, tmp3); 2556 __ xorr(tmp3, tmpU, tmpL); 2557 __ bnez(tmp3, CALCULATE_DIFFERENCE); 2558 2559 __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string 2560 __ addi(strU, strU, wordSize); // Address of last 8 bytes in UTF-16 string 2561 __ load_int_misaligned(tmpL, Address(strL), t0, false); 2562 __ load_long_misaligned(tmpU, Address(strU), t0, 2); 2563 __ inflate_lo32(tmp3, tmpL); 2564 __ mv(tmpL, tmp3); 2565 __ xorr(tmp3, tmpU, tmpL); 2566 __ bnez(tmp3, CALCULATE_DIFFERENCE); 2567 __ j(DONE); // no character left 2568 2569 // Find the first different characters in the longwords and 2570 // compute their difference. 2571 __ bind(CALCULATE_DIFFERENCE); 2572 __ ctzc_bit(tmp4, tmp3); 2573 __ srl(tmp1, tmp1, tmp4); 2574 __ srl(tmp2, tmp2, tmp4); 2575 __ andi(tmp1, tmp1, 0xFFFF); 2576 __ andi(tmp2, tmp2, 0xFFFF); 2577 __ sub(result, tmp1, tmp2); 2578 __ bind(DONE); 2579 __ ret(); 2580 return entry; 2581 } 2582 2583 address generate_method_entry_barrier() { 2584 __ align(CodeEntryAlignment); 2585 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 2586 2587 Label deoptimize_label; 2588 2589 address start = __ pc(); 2590 2591 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 2592 2593 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 2594 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 2595 Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 2596 __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr())); 2597 __ lwu(t1, t1); 2598 __ sw(t1, thread_epoch_addr); 2599 // There are two ways this can work: 2600 // - The writer did a system icache shootdown after the instruction stream update. 2601 // Hence do nothing. 2602 // - The writer trusts us to make sure our icache is in sync before entering. 2603 // Hence use cmodx fence (fence.i, may change).
2604 if (UseCtxFencei) { 2605 __ cmodx_fence(); 2606 } 2607 __ membar(__ LoadLoad); 2608 } 2609 2610 __ set_last_Java_frame(sp, fp, ra); 2611 2612 __ enter(); 2613 __ add(t1, sp, wordSize); 2614 2615 __ sub(sp, sp, 4 * wordSize); 2616 2617 __ push_call_clobbered_registers(); 2618 2619 __ mv(c_rarg0, t1); 2620 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 2621 2622 __ reset_last_Java_frame(true); 2623 2624 __ mv(t0, x10); 2625 2626 __ pop_call_clobbered_registers(); 2627 2628 __ bnez(t0, deoptimize_label); 2629 2630 __ leave(); 2631 __ ret(); 2632 2633 __ BIND(deoptimize_label); 2634 2635 __ ld(t0, Address(sp, 0)); 2636 __ ld(fp, Address(sp, wordSize)); 2637 __ ld(ra, Address(sp, wordSize * 2)); 2638 __ ld(t1, Address(sp, wordSize * 3)); 2639 2640 __ mv(sp, t0); 2641 __ jr(t1); 2642 2643 return start; 2644 } 2645 2646 // x10 = result 2647 // x11 = str1 2648 // x12 = cnt1 2649 // x13 = str2 2650 // x14 = cnt2 2651 // x28 = tmp1 2652 // x29 = tmp2 2653 // x30 = tmp3 2654 // x31 = tmp4 2655 address generate_compare_long_string_same_encoding(bool isLL) { 2656 __ align(CodeEntryAlignment); 2657 StubCodeMark mark(this, "StubRoutines", isLL ? 2658 "compare_long_string_same_encoding LL" : "compare_long_string_same_encoding UU"); 2659 address entry = __ pc(); 2660 Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL, 2661 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF; 2662 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14, 2663 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31; 2664 RegSet spilled_regs = RegSet::of(tmp4, tmp5); 2665 2666 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 2667 // update cnt2 counter with already loaded 8 bytes 2668 __ sub(cnt2, cnt2, wordSize / (isLL ? 1 : 2)); 2669 // update pointers, because of previous read 2670 __ add(str1, str1, wordSize); 2671 __ add(str2, str2, wordSize); 2672 // less than 16 bytes left? 2673 __ sub(cnt2, cnt2, isLL ? 16 : 8); 2674 __ push_reg(spilled_regs, sp); 2675 __ bltz(cnt2, TAIL); 2676 __ bind(SMALL_LOOP); 2677 compare_string_16_bytes_same(DIFF, DIFF2); 2678 __ sub(cnt2, cnt2, isLL ? 16 : 8); 2679 __ bgez(cnt2, SMALL_LOOP); 2680 __ bind(TAIL); 2681 __ addi(cnt2, cnt2, isLL ? 16 : 8); 2682 __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); 2683 __ sub(cnt2, cnt2, isLL ? 8 : 4); 2684 __ blez(cnt2, CHECK_LAST); 2685 __ xorr(tmp4, tmp1, tmp2); 2686 __ bnez(tmp4, DIFF); 2687 __ ld(tmp1, Address(str1)); 2688 __ addi(str1, str1, 8); 2689 __ ld(tmp2, Address(str2)); 2690 __ addi(str2, str2, 8); 2691 __ sub(cnt2, cnt2, isLL ? 8 : 4); 2692 __ bind(CHECK_LAST); 2693 if (!isLL) { 2694 __ add(cnt2, cnt2, cnt2); // now in bytes 2695 } 2696 __ xorr(tmp4, tmp1, tmp2); 2697 __ bnez(tmp4, DIFF); 2698 __ add(str1, str1, cnt2); 2699 __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2); 2700 __ add(str2, str2, cnt2); 2701 __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2); 2702 __ xorr(tmp4, tmp5, cnt1); 2703 __ beqz(tmp4, LENGTH_DIFF); 2704 // Find the first different characters in the longwords and 2705 // compute their difference. 
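// The XOR of the two longwords is nonzero exactly at the first differing element:
// ctzc_bit counts its trailing zero bits (at element granularity), the shifts bring
// that element to the low bits of both operands, and the 0xFF/0xFFFF mask isolates
// it before the subtraction that produces the signed per-element difference.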
2706 __ bind(DIFF2); 2707 __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb 2708 __ srl(tmp5, tmp5, tmp3); 2709 __ srl(cnt1, cnt1, tmp3); 2710 if (isLL) { 2711 __ andi(tmp5, tmp5, 0xFF); 2712 __ andi(cnt1, cnt1, 0xFF); 2713 } else { 2714 __ andi(tmp5, tmp5, 0xFFFF); 2715 __ andi(cnt1, cnt1, 0xFFFF); 2716 } 2717 __ sub(result, tmp5, cnt1); 2718 __ j(LENGTH_DIFF); 2719 __ bind(DIFF); 2720 __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb 2721 __ srl(tmp1, tmp1, tmp3); 2722 __ srl(tmp2, tmp2, tmp3); 2723 if (isLL) { 2724 __ andi(tmp1, tmp1, 0xFF); 2725 __ andi(tmp2, tmp2, 0xFF); 2726 } else { 2727 __ andi(tmp1, tmp1, 0xFFFF); 2728 __ andi(tmp2, tmp2, 0xFFFF); 2729 } 2730 __ sub(result, tmp1, tmp2); 2731 __ j(LENGTH_DIFF); 2732 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 2733 __ xorr(tmp4, tmp1, tmp2); 2734 __ bnez(tmp4, DIFF); 2735 __ bind(LENGTH_DIFF); 2736 __ pop_reg(spilled_regs, sp); 2737 __ ret(); 2738 return entry; 2739 } 2740 2741 void generate_compare_long_strings() { 2742 StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(true); 2743 StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(false); 2744 StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(true); 2745 StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(false); 2746 } 2747 2748 // x10 result 2749 // x11 src 2750 // x12 src count 2751 // x13 pattern 2752 // x14 pattern count 2753 address generate_string_indexof_linear(bool needle_isL, bool haystack_isL) 2754 { 2755 const char* stubName = needle_isL 2756 ? (haystack_isL ? "indexof_linear_ll" : "indexof_linear_ul") 2757 : "indexof_linear_uu"; 2758 __ align(CodeEntryAlignment); 2759 StubCodeMark mark(this, "StubRoutines", stubName); 2760 address entry = __ pc(); 2761 2762 int needle_chr_size = needle_isL ? 1 : 2; 2763 int haystack_chr_size = haystack_isL ? 1 : 2; 2764 int needle_chr_shift = needle_isL ? 0 : 1; 2765 int haystack_chr_shift = haystack_isL ? 0 : 1; 2766 bool isL = needle_isL && haystack_isL; 2767 // parameters 2768 Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14; 2769 // temporary registers 2770 Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25; 2771 // redefinitions 2772 Register ch1 = x28, ch2 = x29; 2773 RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29); 2774 2775 __ push_reg(spilled_regs, sp); 2776 2777 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 2778 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 2779 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 2780 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 2781 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 2782 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 2783 2784 __ ld(ch1, Address(needle)); 2785 __ ld(ch2, Address(haystack)); 2786 // src.length - pattern.length 2787 __ sub(haystack_len, haystack_len, needle_len); 2788 2789 // first is needle[0] 2790 __ andi(first, ch1, needle_isL ? 0xFF : 0xFFFF, first); 2791 uint64_t mask0101 = UCONST64(0x0101010101010101); 2792 uint64_t mask0001 = UCONST64(0x0001000100010001); 2793 __ mv(mask1, haystack_isL ? mask0101 : mask0001); 2794 __ mul(first, first, mask1); 2795 uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f); 2796 uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff); 2797 __ mv(mask2, haystack_isL ? 
mask7f7f : mask7fff); 2798 if (needle_isL != haystack_isL) { 2799 __ mv(tmp, ch1); 2800 } 2801 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size - 1); 2802 __ blez(haystack_len, L_SMALL); 2803 2804 if (needle_isL != haystack_isL) { 2805 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros); 2806 } 2807 // xorr, sub, orr, notr, andr 2808 // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i] 2809 // eg: 2810 // first: aa aa aa aa aa aa aa aa 2811 // ch2: aa aa li nx jd ka aa aa 2812 // match_mask: 80 80 00 00 00 00 80 80 2813 __ compute_match_mask(ch2, first, match_mask, mask1, mask2); 2814 2815 // search first char of needle, if success, goto L_HAS_ZERO; 2816 __ bnez(match_mask, L_HAS_ZERO); 2817 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size); 2818 __ add(result, result, wordSize / haystack_chr_size); 2819 __ add(haystack, haystack, wordSize); 2820 __ bltz(haystack_len, L_POST_LOOP); 2821 2822 __ bind(L_LOOP); 2823 __ ld(ch2, Address(haystack)); 2824 __ compute_match_mask(ch2, first, match_mask, mask1, mask2); 2825 __ bnez(match_mask, L_HAS_ZERO); 2826 2827 __ bind(L_LOOP_PROCEED); 2828 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size); 2829 __ add(haystack, haystack, wordSize); 2830 __ add(result, result, wordSize / haystack_chr_size); 2831 __ bgez(haystack_len, L_LOOP); 2832 2833 __ bind(L_POST_LOOP); 2834 __ mv(ch2, -wordSize / haystack_chr_size); 2835 __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check 2836 __ ld(ch2, Address(haystack)); 2837 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift); 2838 __ neg(haystack_len, haystack_len); 2839 __ xorr(ch2, first, ch2); 2840 __ sub(match_mask, ch2, mask1); 2841 __ orr(ch2, ch2, mask2); 2842 __ mv(trailing_zeros, -1); // all bits set 2843 __ j(L_SMALL_PROCEED); 2844 2845 __ align(OptoLoopAlignment); 2846 __ bind(L_SMALL); 2847 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift); 2848 __ neg(haystack_len, haystack_len); 2849 if (needle_isL != haystack_isL) { 2850 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros); 2851 } 2852 __ xorr(ch2, first, ch2); 2853 __ sub(match_mask, ch2, mask1); 2854 __ orr(ch2, ch2, mask2); 2855 __ mv(trailing_zeros, -1); // all bits set 2856 2857 __ bind(L_SMALL_PROCEED); 2858 __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits. 2859 __ notr(ch2, ch2); 2860 __ andr(match_mask, match_mask, ch2); 2861 __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check 2862 __ beqz(match_mask, NOMATCH); 2863 2864 __ bind(L_SMALL_HAS_ZERO_LOOP); 2865 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, ch2, tmp); // count trailing zeros 2866 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2867 __ mv(ch2, wordSize / haystack_chr_size); 2868 __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2); 2869 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2870 __ mv(trailing_zeros, wordSize / haystack_chr_size); 2871 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2872 2873 __ bind(L_SMALL_CMP_LOOP); 2874 __ shadd(first, trailing_zeros, needle, first, needle_chr_shift); 2875 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift); 2876 needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first)); 2877 haystack_isL ? 
__ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2)); 2878 __ add(trailing_zeros, trailing_zeros, 1); 2879 __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP); 2880 __ beq(first, ch2, L_SMALL_CMP_LOOP); 2881 2882 __ bind(L_SMALL_CMP_LOOP_NOMATCH); 2883 __ beqz(match_mask, NOMATCH); 2884 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2); 2885 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2886 __ add(result, result, 1); 2887 __ add(haystack, haystack, haystack_chr_size); 2888 __ j(L_SMALL_HAS_ZERO_LOOP); 2889 2890 __ align(OptoLoopAlignment); 2891 __ bind(L_SMALL_CMP_LOOP_LAST_CMP); 2892 __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2893 __ j(DONE); 2894 2895 __ align(OptoLoopAlignment); 2896 __ bind(L_SMALL_CMP_LOOP_LAST_CMP2); 2897 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2898 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2899 __ j(DONE); 2900 2901 __ align(OptoLoopAlignment); 2902 __ bind(L_HAS_ZERO); 2903 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2); 2904 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2905 __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2); 2906 __ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits) 2907 __ sub(result, result, 1); // array index from 0, so result -= 1 2908 2909 __ bind(L_HAS_ZERO_LOOP); 2910 __ mv(needle_len, wordSize / haystack_chr_size); 2911 __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2); 2912 __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2); 2913 // load next 8 bytes from haystack, and increase result index 2914 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2915 __ add(result, result, 1); 2916 __ mv(trailing_zeros, wordSize / haystack_chr_size); 2917 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH); 2918 2919 // compare one char 2920 __ bind(L_CMP_LOOP); 2921 __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift); 2922 needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len)); 2923 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift); 2924 haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2)); 2925 __ add(trailing_zeros, trailing_zeros, 1); // next char index 2926 __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2); 2927 __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP); 2928 __ beq(needle_len, ch2, L_CMP_LOOP); 2929 2930 __ bind(L_CMP_LOOP_NOMATCH); 2931 __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH); 2932 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, needle_len, ch2); // find next "first" char index 2933 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2934 __ add(haystack, haystack, haystack_chr_size); 2935 __ j(L_HAS_ZERO_LOOP); 2936 2937 __ align(OptoLoopAlignment); 2938 __ bind(L_CMP_LOOP_LAST_CMP); 2939 __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH); 2940 __ j(DONE); 2941 2942 __ align(OptoLoopAlignment); 2943 __ bind(L_CMP_LOOP_LAST_CMP2); 2944 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2945 __ add(result, result, 1); 2946 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH); 2947 __ j(DONE); 2948 2949 __ align(OptoLoopAlignment); 2950 __ bind(L_HAS_ZERO_LOOP_NOMATCH); 2951 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 2952 // L_HAS_ZERO block. 
Byte octet was analyzed in L_HAS_ZERO_LOOP, 2953 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 2954 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 2955 // result by analyzed characters value, so, we can just reset lower bits 2956 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 2957 // 2) restore needle_len and haystack_len values from "compressed" haystack_len 2958 // 3) advance haystack value to represent next haystack octet. result & 7/3 is 2959 // index of last analyzed substring inside current octet. So, haystack in at 2960 // respective start address. We need to advance it to next octet 2961 __ andi(match_mask, result, wordSize / haystack_chr_size - 1); 2962 __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2); 2963 __ andi(result, result, haystack_isL ? -8 : -4); 2964 __ slli(tmp, match_mask, haystack_chr_shift); 2965 __ sub(haystack, haystack, tmp); 2966 __ sign_extend(haystack_len, haystack_len, 32); 2967 __ j(L_LOOP_PROCEED); 2968 2969 __ align(OptoLoopAlignment); 2970 __ bind(NOMATCH); 2971 __ mv(result, -1); 2972 2973 __ bind(DONE); 2974 __ pop_reg(spilled_regs, sp); 2975 __ ret(); 2976 return entry; 2977 } 2978 2979 void generate_string_indexof_stubs() 2980 { 2981 StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 2982 StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 2983 StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 2984 } 2985 2986 #ifdef COMPILER2 2987 address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) { 2988 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table"); 2989 2990 address start = __ pc(); 2991 const Register 2992 r_super_klass = x10, 2993 r_array_base = x11, 2994 r_array_length = x12, 2995 r_array_index = x13, 2996 r_sub_klass = x14, 2997 result = x15, 2998 r_bitmap = x16; 2999 3000 Label L_success; 3001 __ enter(); 3002 __ lookup_secondary_supers_table(r_sub_klass, r_super_klass, result, 3003 r_array_base, r_array_length, r_array_index, 3004 r_bitmap, super_klass_index, /*stub_is_near*/true); 3005 __ leave(); 3006 __ ret(); 3007 3008 return start; 3009 } 3010 3011 // Slow path implementation for UseSecondarySupersTable. 
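// (Roughly speaking: the fast path in lookup_secondary_supers_table consults a
// 64-bit occupancy bitmap and a single hashed probe of the secondary supers array;
// this stub is entered when that probe is inconclusive and finishes the job with a
// linear search.)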
3012 address generate_lookup_secondary_supers_table_slow_path_stub() { 3013 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path"); 3014 3015 address start = __ pc(); 3016 const Register 3017 r_super_klass = x10, // argument 3018 r_array_base = x11, // argument 3019 temp1 = x12, // tmp 3020 r_array_index = x13, // argument 3021 result = x15, // argument 3022 r_bitmap = x16; // argument 3023 3024 3025 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1); 3026 __ ret(); 3027 3028 return start; 3029 } 3030 3031 address generate_mulAdd() 3032 { 3033 __ align(CodeEntryAlignment); 3034 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3035 3036 address entry = __ pc(); 3037 3038 const Register out = x10; 3039 const Register in = x11; 3040 const Register offset = x12; 3041 const Register len = x13; 3042 const Register k = x14; 3043 const Register tmp = x28; 3044 3045 BLOCK_COMMENT("Entry:"); 3046 __ enter(); 3047 __ mul_add(out, in, offset, len, k, tmp); 3048 __ leave(); 3049 __ ret(); 3050 3051 return entry; 3052 } 3053 3054 /** 3055 * Arguments: 3056 * 3057 * Input: 3058 * c_rarg0 - x address 3059 * c_rarg1 - x length 3060 * c_rarg2 - y address 3061 * c_rarg3 - y length 3062 * c_rarg4 - z address 3063 */ 3064 address generate_multiplyToLen() 3065 { 3066 __ align(CodeEntryAlignment); 3067 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3068 address entry = __ pc(); 3069 3070 const Register x = x10; 3071 const Register xlen = x11; 3072 const Register y = x12; 3073 const Register ylen = x13; 3074 const Register z = x14; 3075 3076 const Register tmp0 = x15; 3077 const Register tmp1 = x16; 3078 const Register tmp2 = x17; 3079 const Register tmp3 = x7; 3080 const Register tmp4 = x28; 3081 const Register tmp5 = x29; 3082 const Register tmp6 = x30; 3083 const Register tmp7 = x31; 3084 3085 BLOCK_COMMENT("Entry:"); 3086 __ enter(); // required for proper stackwalking of RuntimeStub frame 3087 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3088 __ leave(); // required for proper stackwalking of RuntimeStub frame 3089 __ ret(); 3090 3091 return entry; 3092 } 3093 3094 address generate_squareToLen() 3095 { 3096 __ align(CodeEntryAlignment); 3097 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3098 address entry = __ pc(); 3099 3100 const Register x = x10; 3101 const Register xlen = x11; 3102 const Register z = x12; 3103 const Register y = x14; // == x 3104 const Register ylen = x15; // == xlen 3105 3106 const Register tmp0 = x13; // zlen, unused 3107 const Register tmp1 = x16; 3108 const Register tmp2 = x17; 3109 const Register tmp3 = x7; 3110 const Register tmp4 = x28; 3111 const Register tmp5 = x29; 3112 const Register tmp6 = x30; 3113 const Register tmp7 = x31; 3114 3115 BLOCK_COMMENT("Entry:"); 3116 __ enter(); 3117 __ mv(y, x); 3118 __ mv(ylen, xlen); 3119 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3120 __ leave(); 3121 __ ret(); 3122 3123 return entry; 3124 } 3125 3126 // Arguments: 3127 // 3128 // Input: 3129 // c_rarg0 - newArr address 3130 // c_rarg1 - oldArr address 3131 // c_rarg2 - newIdx 3132 // c_rarg3 - shiftCount 3133 // c_rarg4 - numIter 3134 // 3135 address generate_bigIntegerLeftShift() { 3136 __ align(CodeEntryAlignment); 3137 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 3138 address entry = __ pc(); 3139 3140 Label loop, exit; 3141 3142 Register newArr = c_rarg0; 3143 
Register oldArr = c_rarg1; 3144 Register newIdx = c_rarg2; 3145 Register shiftCount = c_rarg3; 3146 Register numIter = c_rarg4; 3147 3148 Register shiftRevCount = c_rarg5; 3149 Register oldArrNext = t1; 3150 3151 __ beqz(numIter, exit); 3152 __ shadd(newArr, newIdx, newArr, t0, 2); 3153 3154 __ mv(shiftRevCount, 32); 3155 __ sub(shiftRevCount, shiftRevCount, shiftCount); 3156 3157 __ bind(loop); 3158 __ addi(oldArrNext, oldArr, 4); 3159 __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4); 3160 __ vle32_v(v0, oldArr); 3161 __ vle32_v(v4, oldArrNext); 3162 __ vsll_vx(v0, v0, shiftCount); 3163 __ vsrl_vx(v4, v4, shiftRevCount); 3164 __ vor_vv(v0, v0, v4); 3165 __ vse32_v(v0, newArr); 3166 __ sub(numIter, numIter, t0); 3167 __ shadd(oldArr, t0, oldArr, t1, 2); 3168 __ shadd(newArr, t0, newArr, t1, 2); 3169 __ bnez(numIter, loop); 3170 3171 __ bind(exit); 3172 __ ret(); 3173 3174 return entry; 3175 } 3176 3177 // Arguments: 3178 // 3179 // Input: 3180 // c_rarg0 - newArr address 3181 // c_rarg1 - oldArr address 3182 // c_rarg2 - newIdx 3183 // c_rarg3 - shiftCount 3184 // c_rarg4 - numIter 3185 // 3186 address generate_bigIntegerRightShift() { 3187 __ align(CodeEntryAlignment); 3188 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 3189 address entry = __ pc(); 3190 3191 Label loop, exit; 3192 3193 Register newArr = c_rarg0; 3194 Register oldArr = c_rarg1; 3195 Register newIdx = c_rarg2; 3196 Register shiftCount = c_rarg3; 3197 Register numIter = c_rarg4; 3198 Register idx = numIter; 3199 3200 Register shiftRevCount = c_rarg5; 3201 Register oldArrNext = c_rarg6; 3202 Register newArrCur = t0; 3203 Register oldArrCur = t1; 3204 3205 __ beqz(idx, exit); 3206 __ shadd(newArr, newIdx, newArr, t0, 2); 3207 3208 __ mv(shiftRevCount, 32); 3209 __ sub(shiftRevCount, shiftRevCount, shiftCount); 3210 3211 __ bind(loop); 3212 __ vsetvli(t0, idx, Assembler::e32, Assembler::m4); 3213 __ sub(idx, idx, t0); 3214 __ shadd(oldArrNext, idx, oldArr, t1, 2); 3215 __ shadd(newArrCur, idx, newArr, t1, 2); 3216 __ addi(oldArrCur, oldArrNext, 4); 3217 __ vle32_v(v0, oldArrCur); 3218 __ vle32_v(v4, oldArrNext); 3219 __ vsrl_vx(v0, v0, shiftCount); 3220 __ vsll_vx(v4, v4, shiftRevCount); 3221 __ vor_vv(v0, v0, v4); 3222 __ vse32_v(v0, newArrCur); 3223 __ bnez(idx, loop); 3224 3225 __ bind(exit); 3226 __ ret(); 3227 3228 return entry; 3229 } 3230 #endif 3231 3232 #ifdef COMPILER2 3233 class MontgomeryMultiplyGenerator : public MacroAssembler { 3234 3235 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 3236 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj; 3237 3238 RegSet _toSave; 3239 bool _squaring; 3240 3241 public: 3242 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 3243 : MacroAssembler(as->code()), _squaring(squaring) { 3244 3245 // Register allocation 3246 3247 RegSetIterator<Register> regs = RegSet::range(x10, x26).begin(); 3248 Pa_base = *regs; // Argument registers 3249 if (squaring) { 3250 Pb_base = Pa_base; 3251 } else { 3252 Pb_base = *++regs; 3253 } 3254 Pn_base = *++regs; 3255 Rlen= *++regs; 3256 inv = *++regs; 3257 Pm_base = *++regs; 3258 3259 // Working registers: 3260 Ra = *++regs; // The current digit of a, b, n, and m. 3261 Rb = *++regs; 3262 Rm = *++regs; 3263 Rn = *++regs; 3264 3265 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 3266 Pb = *++regs; 3267 Pm = *++regs; 3268 Pn = *++regs; 3269 3270 tmp0 = *++regs; // Three registers which form a 3271 tmp1 = *++regs; // triple-precision accumuator. 
3272 tmp2 = *++regs; 3273 3274 Ri = x6; // Inner and outer loop indexes. 3275 Rj = x7; 3276 3277 Rhi_ab = x28; // Product registers: low and high parts 3278 Rlo_ab = x29; // of a*b and m*n. 3279 Rhi_mn = x30; 3280 Rlo_mn = x31; 3281 3282 // x18 and up are callee-saved. 3283 _toSave = RegSet::range(x18, *regs) + Pm_base; 3284 } 3285 3286 private: 3287 void save_regs() { 3288 push_reg(_toSave, sp); 3289 } 3290 3291 void restore_regs() { 3292 pop_reg(_toSave, sp); 3293 } 3294 3295 template <typename T> 3296 void unroll_2(Register count, T block) { 3297 Label loop, end, odd; 3298 beqz(count, end); 3299 test_bit(t0, count, 0); 3300 bnez(t0, odd); 3301 align(16); 3302 bind(loop); 3303 (this->*block)(); 3304 bind(odd); 3305 (this->*block)(); 3306 addi(count, count, -2); 3307 bgtz(count, loop); 3308 bind(end); 3309 } 3310 3311 template <typename T> 3312 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 3313 Label loop, end, odd; 3314 beqz(count, end); 3315 test_bit(tmp, count, 0); 3316 bnez(tmp, odd); 3317 align(16); 3318 bind(loop); 3319 (this->*block)(d, s, tmp); 3320 bind(odd); 3321 (this->*block)(d, s, tmp); 3322 addi(count, count, -2); 3323 bgtz(count, loop); 3324 bind(end); 3325 } 3326 3327 void pre1(RegisterOrConstant i) { 3328 block_comment("pre1"); 3329 // Pa = Pa_base; 3330 // Pb = Pb_base + i; 3331 // Pm = Pm_base; 3332 // Pn = Pn_base + i; 3333 // Ra = *Pa; 3334 // Rb = *Pb; 3335 // Rm = *Pm; 3336 // Rn = *Pn; 3337 if (i.is_register()) { 3338 slli(t0, i.as_register(), LogBytesPerWord); 3339 } else { 3340 mv(t0, i.as_constant()); 3341 slli(t0, t0, LogBytesPerWord); 3342 } 3343 3344 mv(Pa, Pa_base); 3345 add(Pb, Pb_base, t0); 3346 mv(Pm, Pm_base); 3347 add(Pn, Pn_base, t0); 3348 3349 ld(Ra, Address(Pa)); 3350 ld(Rb, Address(Pb)); 3351 ld(Rm, Address(Pm)); 3352 ld(Rn, Address(Pn)); 3353 3354 // Zero the m*n result. 3355 mv(Rhi_mn, zr); 3356 mv(Rlo_mn, zr); 3357 } 3358 3359 // The core multiply-accumulate step of a Montgomery 3360 // multiplication. The idea is to schedule operations as a 3361 // pipeline so that instructions with long latencies (loads and 3362 // multiplies) have time to complete before their results are 3363 // used. This most benefits in-order implementations of the 3364 // architecture but out-of-order ones also benefit. 3365 void step() { 3366 block_comment("step"); 3367 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3368 // Ra = *++Pa; 3369 // Rb = *--Pb; 3370 mulhu(Rhi_ab, Ra, Rb); 3371 mul(Rlo_ab, Ra, Rb); 3372 addi(Pa, Pa, wordSize); 3373 ld(Ra, Address(Pa)); 3374 addi(Pb, Pb, -wordSize); 3375 ld(Rb, Address(Pb)); 3376 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the 3377 // previous iteration. 
3378 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3379 // Rm = *++Pm; 3380 // Rn = *--Pn; 3381 mulhu(Rhi_mn, Rm, Rn); 3382 mul(Rlo_mn, Rm, Rn); 3383 addi(Pm, Pm, wordSize); 3384 ld(Rm, Address(Pm)); 3385 addi(Pn, Pn, -wordSize); 3386 ld(Rn, Address(Pn)); 3387 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3388 } 3389 3390 void post1() { 3391 block_comment("post1"); 3392 3393 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3394 // Ra = *++Pa; 3395 // Rb = *--Pb; 3396 mulhu(Rhi_ab, Ra, Rb); 3397 mul(Rlo_ab, Ra, Rb); 3398 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3399 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3400 3401 // *Pm = Rm = tmp0 * inv; 3402 mul(Rm, tmp0, inv); 3403 sd(Rm, Address(Pm)); 3404 3405 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3406 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3407 mulhu(Rhi_mn, Rm, Rn); 3408 3409 #ifndef PRODUCT 3410 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply"); 3411 { 3412 mul(Rlo_mn, Rm, Rn); 3413 add(Rlo_mn, tmp0, Rlo_mn); 3414 Label ok; 3415 beqz(Rlo_mn, ok); 3416 stop("broken Montgomery multiply"); 3417 bind(ok); 3418 } 3419 #endif 3420 // We have very carefully set things up so that 3421 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate 3422 // the lower half of Rm * Rn because we know the result already: 3423 // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff 3424 // tmp0 != 0. So, rather than do a mul and an cad we just set 3425 // the carry flag iff tmp0 is nonzero. 3426 // 3427 // mul(Rlo_mn, Rm, Rn); 3428 // cad(zr, tmp0, Rlo_mn); 3429 addi(t0, tmp0, -1); 3430 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero 3431 cadc(tmp0, tmp1, Rhi_mn, t0); 3432 adc(tmp1, tmp2, zr, t0); 3433 mv(tmp2, zr); 3434 } 3435 3436 void pre2(Register i, Register len) { 3437 block_comment("pre2"); 3438 // Pa = Pa_base + i-len; 3439 // Pb = Pb_base + len; 3440 // Pm = Pm_base + i-len; 3441 // Pn = Pn_base + len; 3442 3443 sub(Rj, i, len); 3444 // Rj == i-len 3445 3446 // Ra as temp register 3447 slli(Ra, Rj, LogBytesPerWord); 3448 add(Pa, Pa_base, Ra); 3449 add(Pm, Pm_base, Ra); 3450 slli(Ra, len, LogBytesPerWord); 3451 add(Pb, Pb_base, Ra); 3452 add(Pn, Pn_base, Ra); 3453 3454 // Ra = *++Pa; 3455 // Rb = *--Pb; 3456 // Rm = *++Pm; 3457 // Rn = *--Pn; 3458 add(Pa, Pa, wordSize); 3459 ld(Ra, Address(Pa)); 3460 add(Pb, Pb, -wordSize); 3461 ld(Rb, Address(Pb)); 3462 add(Pm, Pm, wordSize); 3463 ld(Rm, Address(Pm)); 3464 add(Pn, Pn, -wordSize); 3465 ld(Rn, Address(Pn)); 3466 3467 mv(Rhi_mn, zr); 3468 mv(Rlo_mn, zr); 3469 } 3470 3471 void post2(Register i, Register len) { 3472 block_comment("post2"); 3473 sub(Rj, i, len); 3474 3475 cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part 3476 3477 // As soon as we know the least significant digit of our result, 3478 // store it. 3479 // Pm_base[i-len] = tmp0; 3480 // Rj as temp register 3481 slli(Rj, Rj, LogBytesPerWord); 3482 add(Rj, Pm_base, Rj); 3483 sd(tmp0, Address(Rj)); 3484 3485 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3486 cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part 3487 adc(tmp1, tmp2, zr, t0); 3488 mv(tmp2, zr); 3489 } 3490 3491 // A carry in tmp0 after Montgomery multiplication means that we 3492 // should subtract multiples of n from our result in m. We'll 3493 // keep doing that until there is no carry. 
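// Each pass of the loop below performs a word-by-word m := m - n, implemented as
// m + ~n + carry with the borrow chained through t0; the outer loop then folds the
// final borrow into tmp0 and repeats while tmp0 is still nonzero, exactly as the
// comment above describes.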
3494 void normalize(Register len) { 3495 block_comment("normalize"); 3496 // while (tmp0) 3497 // tmp0 = sub(Pm_base, Pn_base, tmp0, len); 3498 Label loop, post, again; 3499 Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now 3500 beqz(tmp0, post); { 3501 bind(again); { 3502 mv(i, zr); 3503 mv(cnt, len); 3504 slli(Rn, i, LogBytesPerWord); 3505 add(Rm, Pm_base, Rn); 3506 ld(Rm, Address(Rm)); 3507 add(Rn, Pn_base, Rn); 3508 ld(Rn, Address(Rn)); 3509 mv(t0, 1); // set carry flag, i.e. no borrow 3510 align(16); 3511 bind(loop); { 3512 notr(Rn, Rn); 3513 add(Rm, Rm, t0); 3514 add(Rm, Rm, Rn); 3515 sltu(t0, Rm, Rn); 3516 slli(Rn, i, LogBytesPerWord); // Rn as temp register 3517 add(Rn, Pm_base, Rn); 3518 sd(Rm, Address(Rn)); 3519 add(i, i, 1); 3520 slli(Rn, i, LogBytesPerWord); 3521 add(Rm, Pm_base, Rn); 3522 ld(Rm, Address(Rm)); 3523 add(Rn, Pn_base, Rn); 3524 ld(Rn, Address(Rn)); 3525 sub(cnt, cnt, 1); 3526 } bnez(cnt, loop); 3527 addi(tmp0, tmp0, -1); 3528 add(tmp0, tmp0, t0); 3529 } bnez(tmp0, again); 3530 } bind(post); 3531 } 3532 3533 // Move memory at s to d, reversing words. 3534 // Increments d to end of copied memory 3535 // Destroys tmp1, tmp2 3536 // Preserves len 3537 // Leaves s pointing to the address which was in d at start 3538 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 3539 assert(tmp1->encoding() < x28->encoding(), "register corruption"); 3540 assert(tmp2->encoding() < x28->encoding(), "register corruption"); 3541 3542 shadd(s, len, s, tmp1, LogBytesPerWord); 3543 mv(tmp1, len); 3544 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 3545 slli(tmp1, len, LogBytesPerWord); 3546 sub(s, d, tmp1); 3547 } 3548 // [63...0] -> [31...0][63...32] 3549 void reverse1(Register d, Register s, Register tmp) { 3550 addi(s, s, -wordSize); 3551 ld(tmp, Address(s)); 3552 ror_imm(tmp, tmp, 32, t0); 3553 sd(tmp, Address(d)); 3554 addi(d, d, wordSize); 3555 } 3556 3557 void step_squaring() { 3558 // An extra ACC 3559 step(); 3560 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3561 } 3562 3563 void last_squaring(Register i) { 3564 Label dont; 3565 // if ((i & 1) == 0) { 3566 test_bit(t0, i, 0); 3567 bnez(t0, dont); { 3568 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3569 // Ra = *++Pa; 3570 // Rb = *--Pb; 3571 mulhu(Rhi_ab, Ra, Rb); 3572 mul(Rlo_ab, Ra, Rb); 3573 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3574 } bind(dont); 3575 } 3576 3577 void extra_step_squaring() { 3578 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3579 3580 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3581 // Rm = *++Pm; 3582 // Rn = *--Pn; 3583 mulhu(Rhi_mn, Rm, Rn); 3584 mul(Rlo_mn, Rm, Rn); 3585 addi(Pm, Pm, wordSize); 3586 ld(Rm, Address(Pm)); 3587 addi(Pn, Pn, -wordSize); 3588 ld(Rn, Address(Pn)); 3589 } 3590 3591 void post1_squaring() { 3592 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3593 3594 // *Pm = Rm = tmp0 * inv; 3595 mul(Rm, tmp0, inv); 3596 sd(Rm, Address(Pm)); 3597 3598 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3599 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3600 mulhu(Rhi_mn, Rm, Rn); 3601 3602 #ifndef PRODUCT 3603 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply"); 3604 { 3605 mul(Rlo_mn, Rm, Rn); 3606 add(Rlo_mn, tmp0, Rlo_mn); 3607 Label ok; 3608 beqz(Rlo_mn, ok); { 3609 stop("broken Montgomery multiply"); 3610 } bind(ok); 3611 } 3612 #endif 3613 // We have very carefully set things up so that 3614 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate 3615 // the lower half of Rm * Rn because we know the result already: 
3616 // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff 3617 // tmp0 != 0. So, rather than do a mul and a cad we just set 3618 // the carry flag iff tmp0 is nonzero. 3619 // 3620 // mul(Rlo_mn, Rm, Rn); 3621 // cad(zr, tmp, Rlo_mn); 3622 addi(t0, tmp0, -1); 3623 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero 3624 cadc(tmp0, tmp1, Rhi_mn, t0); 3625 adc(tmp1, tmp2, zr, t0); 3626 mv(tmp2, zr); 3627 } 3628 3629 // use t0 as carry 3630 void acc(Register Rhi, Register Rlo, 3631 Register tmp0, Register tmp1, Register tmp2) { 3632 cad(tmp0, tmp0, Rlo, t0); 3633 cadc(tmp1, tmp1, Rhi, t0); 3634 adc(tmp2, tmp2, zr, t0); 3635 } 3636 3637 public: 3638 /** 3639 * Fast Montgomery multiplication. The derivation of the 3640 * algorithm is in A Cryptographic Library for the Motorola 3641 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3642 * 3643 * Arguments: 3644 * 3645 * Inputs for multiplication: 3646 * c_rarg0 - int array elements a 3647 * c_rarg1 - int array elements b 3648 * c_rarg2 - int array elements n (the modulus) 3649 * c_rarg3 - int length 3650 * c_rarg4 - int inv 3651 * c_rarg5 - int array elements m (the result) 3652 * 3653 * Inputs for squaring: 3654 * c_rarg0 - int array elements a 3655 * c_rarg1 - int array elements n (the modulus) 3656 * c_rarg2 - int length 3657 * c_rarg3 - int inv 3658 * c_rarg4 - int array elements m (the result) 3659 * 3660 */ 3661 address generate_multiply() { 3662 Label argh, nothing; 3663 bind(argh); 3664 stop("MontgomeryMultiply total_allocation must be <= 8192"); 3665 3666 align(CodeEntryAlignment); 3667 address entry = pc(); 3668 3669 beqz(Rlen, nothing); 3670 3671 enter(); 3672 3673 // Make room. 3674 mv(Ra, 512); 3675 bgt(Rlen, Ra, argh); 3676 slli(Ra, Rlen, exact_log2(4 * sizeof(jint))); 3677 sub(Ra, sp, Ra); 3678 andi(sp, Ra, -2 * wordSize); 3679 3680 srliw(Rlen, Rlen, 1); // length in longwords = len/2 3681 3682 { 3683 // Copy input args, reversing as we go. We use Ra as a 3684 // temporary variable. 3685 reverse(Ra, Pa_base, Rlen, Ri, Rj); 3686 if (!_squaring) 3687 reverse(Ra, Pb_base, Rlen, Ri, Rj); 3688 reverse(Ra, Pn_base, Rlen, Ri, Rj); 3689 } 3690 3691 // Push all call-saved registers and also Pm_base which we'll need 3692 // at the end. 
3693 save_regs(); 3694 3695 #ifndef PRODUCT 3696 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 3697 { 3698 ld(Rn, Address(Pn_base)); 3699 mul(Rlo_mn, Rn, inv); 3700 mv(t0, -1); 3701 Label ok; 3702 beq(Rlo_mn, t0, ok); 3703 stop("broken inverse in Montgomery multiply"); 3704 bind(ok); 3705 } 3706 #endif 3707 3708 mv(Pm_base, Ra); 3709 3710 mv(tmp0, zr); 3711 mv(tmp1, zr); 3712 mv(tmp2, zr); 3713 3714 block_comment("for (int i = 0; i < len; i++) {"); 3715 mv(Ri, zr); { 3716 Label loop, end; 3717 bge(Ri, Rlen, end); 3718 3719 bind(loop); 3720 pre1(Ri); 3721 3722 block_comment(" for (j = i; j; j--) {"); { 3723 mv(Rj, Ri); 3724 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3725 } block_comment(" } // j"); 3726 3727 post1(); 3728 addw(Ri, Ri, 1); 3729 blt(Ri, Rlen, loop); 3730 bind(end); 3731 block_comment("} // i"); 3732 } 3733 3734 block_comment("for (int i = len; i < 2*len; i++) {"); 3735 mv(Ri, Rlen); { 3736 Label loop, end; 3737 slli(t0, Rlen, 1); 3738 bge(Ri, t0, end); 3739 3740 bind(loop); 3741 pre2(Ri, Rlen); 3742 3743 block_comment(" for (j = len*2-i-1; j; j--) {"); { 3744 slliw(Rj, Rlen, 1); 3745 subw(Rj, Rj, Ri); 3746 subw(Rj, Rj, 1); 3747 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3748 } block_comment(" } // j"); 3749 3750 post2(Ri, Rlen); 3751 addw(Ri, Ri, 1); 3752 slli(t0, Rlen, 1); 3753 blt(Ri, t0, loop); 3754 bind(end); 3755 } 3756 block_comment("} // i"); 3757 3758 normalize(Rlen); 3759 3760 mv(Ra, Pm_base); // Save Pm_base in Ra 3761 restore_regs(); // Restore caller's Pm_base 3762 3763 // Copy our result into caller's Pm_base 3764 reverse(Pm_base, Ra, Rlen, Ri, Rj); 3765 3766 leave(); 3767 bind(nothing); 3768 ret(); 3769 3770 return entry; 3771 } 3772 3773 /** 3774 * 3775 * Arguments: 3776 * 3777 * Inputs: 3778 * c_rarg0 - int array elements a 3779 * c_rarg1 - int array elements n (the modulus) 3780 * c_rarg2 - int length 3781 * c_rarg3 - int inv 3782 * c_rarg4 - int array elements m (the result) 3783 * 3784 */ 3785 address generate_square() { 3786 Label argh; 3787 bind(argh); 3788 stop("MontgomeryMultiply total_allocation must be <= 8192"); 3789 3790 align(CodeEntryAlignment); 3791 address entry = pc(); 3792 3793 enter(); 3794 3795 // Make room. 3796 mv(Ra, 512); 3797 bgt(Rlen, Ra, argh); 3798 slli(Ra, Rlen, exact_log2(4 * sizeof(jint))); 3799 sub(Ra, sp, Ra); 3800 andi(sp, Ra, -2 * wordSize); 3801 3802 srliw(Rlen, Rlen, 1); // length in longwords = len/2 3803 3804 { 3805 // Copy input args, reversing as we go. We use Ra as a 3806 // temporary variable. 3807 reverse(Ra, Pa_base, Rlen, Ri, Rj); 3808 reverse(Ra, Pn_base, Rlen, Ri, Rj); 3809 } 3810 3811 // Push all call-saved registers and also Pm_base which we'll need 3812 // at the end. 
3813 save_regs(); 3814 3815 mv(Pm_base, Ra); 3816 3817 mv(tmp0, zr); 3818 mv(tmp1, zr); 3819 mv(tmp2, zr); 3820 3821 block_comment("for (int i = 0; i < len; i++) {"); 3822 mv(Ri, zr); { 3823 Label loop, end; 3824 bind(loop); 3825 bge(Ri, Rlen, end); 3826 3827 pre1(Ri); 3828 3829 block_comment("for (j = (i+1)/2; j; j--) {"); { 3830 addi(Rj, Ri, 1); 3831 srliw(Rj, Rj, 1); 3832 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 3833 } block_comment(" } // j"); 3834 3835 last_squaring(Ri); 3836 3837 block_comment(" for (j = i/2; j; j--) {"); { 3838 srliw(Rj, Ri, 1); 3839 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 3840 } block_comment(" } // j"); 3841 3842 post1_squaring(); 3843 addi(Ri, Ri, 1); 3844 blt(Ri, Rlen, loop); 3845 3846 bind(end); 3847 block_comment("} // i"); 3848 } 3849 3850 block_comment("for (int i = len; i < 2*len; i++) {"); 3851 mv(Ri, Rlen); { 3852 Label loop, end; 3853 bind(loop); 3854 slli(t0, Rlen, 1); 3855 bge(Ri, t0, end); 3856 3857 pre2(Ri, Rlen); 3858 3859 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 3860 slli(Rj, Rlen, 1); 3861 sub(Rj, Rj, Ri); 3862 sub(Rj, Rj, 1); 3863 srliw(Rj, Rj, 1); 3864 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 3865 } block_comment(" } // j"); 3866 3867 last_squaring(Ri); 3868 3869 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 3870 slli(Rj, Rlen, 1); 3871 sub(Rj, Rj, Ri); 3872 srliw(Rj, Rj, 1); 3873 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 3874 } block_comment(" } // j"); 3875 3876 post2(Ri, Rlen); 3877 addi(Ri, Ri, 1); 3878 slli(t0, Rlen, 1); 3879 blt(Ri, t0, loop); 3880 3881 bind(end); 3882 block_comment("} // i"); 3883 } 3884 3885 normalize(Rlen); 3886 3887 mv(Ra, Pm_base); // Save Pm_base in Ra 3888 restore_regs(); // Restore caller's Pm_base 3889 3890 // Copy our result into caller's Pm_base 3891 reverse(Pm_base, Ra, Rlen, Ri, Rj); 3892 3893 leave(); 3894 ret(); 3895 3896 return entry; 3897 } 3898 }; 3899 3900 #endif // COMPILER2 3901 3902 address generate_cont_thaw(Continuation::thaw_kind kind) { 3903 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 3904 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 3905 3906 address start = __ pc(); 3907 3908 if (return_barrier) { 3909 __ ld(sp, Address(xthread, JavaThread::cont_entry_offset())); 3910 } 3911 3912 #ifndef PRODUCT 3913 { 3914 Label OK; 3915 __ ld(t0, Address(xthread, JavaThread::cont_entry_offset())); 3916 __ beq(sp, t0, OK); 3917 __ stop("incorrect sp"); 3918 __ bind(OK); 3919 } 3920 #endif 3921 3922 if (return_barrier) { 3923 // preserve possible return value from a method returning to the return barrier 3924 __ sub(sp, sp, 2 * wordSize); 3925 __ fsd(f10, Address(sp, 0 * wordSize)); 3926 __ sd(x10, Address(sp, 1 * wordSize)); 3927 } 3928 3929 __ mv(c_rarg1, (return_barrier ? 
1 : 0));
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
    __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ld(x10, Address(sp, 1 * wordSize));
      __ fld(f10, Address(sp, 0 * wordSize));
      __ add(sp, sp, 2 * wordSize);
    }

#ifndef PRODUCT
    {
      Label OK;
      __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
      __ beq(sp, t0, OK);
      __ stop("incorrect sp");
      __ bind(OK);
    }
#endif

    Label thaw_success;
    // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
    __ bnez(t1, thaw_success);
    __ j(RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
    __ bind(thaw_success);

    // make room for the thawed frames
    __ sub(t0, sp, t1);
    __ andi(sp, t0, -16); // align

    if (return_barrier) {
      // save original return value -- again
      __ sub(sp, sp, 2 * wordSize);
      __ fsd(f10, Address(sp, 0 * wordSize));
      __ sd(x10, Address(sp, 1 * wordSize));
    }

    // If we wanted, we could templatize thaw by kind and have three different entries
    __ mv(c_rarg1, kind);

    __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
    __ mv(t1, x10); // x10 is the sp of the yielding frame

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ld(x10, Address(sp, 1 * wordSize));
      __ fld(f10, Address(sp, 0 * wordSize));
      __ add(sp, sp, 2 * wordSize);
    } else {
      __ mv(x10, zr); // return 0 (success) from doYield
    }

    // we're now on the yield frame (which is at an address above us because sp has been pushed down)
    __ mv(fp, t1);
    __ sub(sp, t1, 2 * wordSize); // now pointing to fp spill

    if (return_barrier_exception) {
      __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
      __ verify_oop(x10);
      __ mv(x9, x10); // save return value containing the exception oop in callee-saved x9

      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);

      // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc

      __ mv(x11, x10); // the exception handler
      __ mv(x10, x9); // restore return value containing the exception oop
      __ verify_oop(x10);

      __ leave();
      __ mv(x13, ra);
      __ jr(x11); // the exception handler
    } else {
      // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
      __ leave();
      __ ret();
    }

    return start;
  }

  address generate_cont_thaw() {
    if (!Continuations::enabled()) return nullptr;

    StubCodeMark mark(this, "StubRoutines", "Cont thaw");
    address start = __ pc();
    generate_cont_thaw(Continuation::thaw_top);
    return start;
  }

  address generate_cont_returnBarrier() {
    if (!Continuations::enabled()) return nullptr;

    // TODO: will probably need multiple return barriers depending on return type
    StubCodeMark mark(this, "StubRoutines", "cont return barrier");
    address start = __ pc();

    generate_cont_thaw(Continuation::thaw_return_barrier);

    return start;
  }

  address
generate_cont_returnBarrier_exception() { 4033 if (!Continuations::enabled()) return nullptr; 4034 4035 StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler"); 4036 address start = __ pc(); 4037 4038 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 4039 4040 return start; 4041 } 4042 4043 #if COMPILER2_OR_JVMCI 4044 4045 #undef __ 4046 #define __ this-> 4047 4048 class Sha2Generator : public MacroAssembler { 4049 StubCodeGenerator* _cgen; 4050 public: 4051 Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {} 4052 address generate_sha256_implCompress(bool multi_block) { 4053 return generate_sha2_implCompress(Assembler::e32, multi_block); 4054 } 4055 address generate_sha512_implCompress(bool multi_block) { 4056 return generate_sha2_implCompress(Assembler::e64, multi_block); 4057 } 4058 private: 4059 4060 void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) { 4061 if (vset_sew == Assembler::e32) __ vle32_v(vr, sr); 4062 else __ vle64_v(vr, sr); 4063 } 4064 4065 void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) { 4066 if (vset_sew == Assembler::e32) __ vse32_v(vr, sr); 4067 else __ vse64_v(vr, sr); 4068 } 4069 4070 // Overview of the logic in each "quad round". 4071 // 4072 // The code below repeats 16/20 times the logic implementing four rounds 4073 // of the SHA-256/512 core loop as documented by NIST. 16/20 "quad rounds" 4074 // to implementing the 64/80 single rounds. 4075 // 4076 // // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0]) 4077 // // Output: 4078 // // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]} 4079 // vl1reXX.v vTmp1, ofs 4080 // 4081 // // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b) 4082 // addi ofs, ofs, 16/32 4083 // 4084 // // Add constants to message schedule words: 4085 // // Input 4086 // // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]} 4087 // // vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0]; 4088 // // Output 4089 // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} 4090 // vadd.vv vTmp0, vTmp1, vW0 4091 // 4092 // // 2 rounds of working variables updates. 4093 // // vState1[t+4] <- vState1[t], vState0[t], vTmp0[t] 4094 // // Input: 4095 // // vState1 = {c[t],d[t],g[t],h[t]} " = vState1[t] " 4096 // // vState0 = {a[t],b[t],e[t],f[t]} 4097 // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} 4098 // // Output: 4099 // // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] " 4100 // // = {h[t+4],g[t+4],d[t+4],c[t+4]} " = vState1[t+4] " 4101 // vsha2cl.vv vState1, vState0, vTmp0 4102 // 4103 // // 2 rounds of working variables updates. 4104 // // vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t] 4105 // // Input 4106 // // vState0 = {a[t],b[t],e[t],f[t]} " = vState0[t] " 4107 // // = {h[t+2],g[t+2],d[t+2],c[t+2]} " = vState1[t+2] " 4108 // // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] " 4109 // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} 4110 // // Output: 4111 // // vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]} " = vState0[t+4] " 4112 // vsha2ch.vv vState0, vState1, vTmp0 4113 // 4114 // // Combine 2QW into 1QW 4115 // // 4116 // // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs 4117 // // vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3] 4118 // // and it can only take 3 vectors as inputs. Hence we need to combine 4119 // // vW1[0] and vW2[1..3] in a single vector. 
4120 // // 4121 // // vmerge Vt4, Vt1, Vt2, V0 4122 // // Input 4123 // // V0 = mask // first word from vW2, 1..3 words from vW1 4124 // // vW2 = {Wt-8, Wt-7, Wt-6, Wt-5} 4125 // // vW1 = {Wt-12, Wt-11, Wt-10, Wt-9} 4126 // // Output 4127 // // Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5} 4128 // vmerge.vvm vTmp0, vW2, vW1, v0 4129 // 4130 // // Generate next Four Message Schedule Words (hence allowing for 4 more rounds) 4131 // // Input 4132 // // vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]} W[ 3: 0] 4133 // // vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]} W[15:12] 4134 // // vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]} W[11: 9,4] 4135 // // Output (next four message schedule words) 4136 // // vW0 = {W[t+19], W[t+18], W[t+17], W[t+16]} W[19:16] 4137 // vsha2ms.vv vW0, vTmp0, vW3 4138 // 4139 // BEFORE 4140 // vW0 - vW3 hold the message schedule words (initially the block words) 4141 // vW0 = W[ 3: 0] "oldest" 4142 // vW1 = W[ 7: 4] 4143 // vW2 = W[11: 8] 4144 // vW3 = W[15:12] "newest" 4145 // 4146 // vt6 - vt7 hold the working state variables 4147 // vState0 = {a[t],b[t],e[t],f[t]} // initially {H5,H4,H1,H0} 4148 // vState1 = {c[t],d[t],g[t],h[t]} // initially {H7,H6,H3,H2} 4149 // 4150 // AFTER 4151 // vW0 - vW3 hold the message schedule words (initially the block words) 4152 // vW1 = W[ 7: 4] "oldest" 4153 // vW2 = W[11: 8] 4154 // vW3 = W[15:12] 4155 // vW0 = W[19:16] "newest" 4156 // 4157 // vState0 and vState1 hold the working state variables 4158 // vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]} 4159 // vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]} 4160 // 4161 // The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round, 4162 // hence the uses of those vectors rotate in each round, and we get back to the 4163 // initial configuration every 4 quad-rounds. We could avoid those changes at 4164 // the cost of moving those vectors at the end of each quad-rounds. 4165 void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4, 4166 Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh, 4167 bool gen_words = true, bool step_const = true) { 4168 __ vleXX_v(vset_sew, vtemp, scalarconst); 4169 if (step_const) { 4170 __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 
16 : 32); 4171 } 4172 __ vadd_vv(vtemp2, vtemp, rot1); 4173 __ vsha2cl_vv(v_cdgh, v_abef, vtemp2); 4174 __ vsha2ch_vv(v_abef, v_cdgh, vtemp2); 4175 if (gen_words) { 4176 __ vmerge_vvm(vtemp2, rot3, rot2); 4177 __ vsha2ms_vv(rot1, vtemp2, rot4); 4178 } 4179 } 4180 4181 const char* stub_name(Assembler::SEW vset_sew, bool multi_block) { 4182 if (vset_sew == Assembler::e32 && !multi_block) return "sha256_implCompress"; 4183 if (vset_sew == Assembler::e32 && multi_block) return "sha256_implCompressMB"; 4184 if (vset_sew == Assembler::e64 && !multi_block) return "sha512_implCompress"; 4185 if (vset_sew == Assembler::e64 && multi_block) return "sha512_implCompressMB"; 4186 ShouldNotReachHere(); 4187 return "bad name lookup"; 4188 } 4189 4190 // Arguments: 4191 // 4192 // Inputs: 4193 // c_rarg0 - byte[] source+offset 4194 // c_rarg1 - int[] SHA.state 4195 // c_rarg2 - int offset 4196 // c_rarg3 - int limit 4197 // 4198 address generate_sha2_implCompress(Assembler::SEW vset_sew, bool multi_block) { 4199 alignas(64) static const uint32_t round_consts_256[64] = { 4200 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 4201 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 4202 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 4203 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 4204 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 4205 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 4206 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 4207 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 4208 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 4209 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 4210 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 4211 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 4212 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 4213 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 4214 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 4215 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 4216 }; 4217 alignas(64) static const uint64_t round_consts_512[80] = { 4218 0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl, 4219 0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l, 4220 0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l, 4221 0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l, 4222 0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l, 4223 0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l, 4224 0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l, 4225 0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l, 4226 0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl, 4227 0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l, 4228 0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl, 4229 0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl, 4230 0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l, 4231 0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l, 4232 0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l, 4233 0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l, 4234 0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l, 4235 0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl, 4236 0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl, 4237 0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl, 4238 0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l, 4239 0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l, 4240 0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal, 
4241 0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl, 4242 0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl, 4243 0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al, 4244 0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l 4245 }; 4246 const int const_add = vset_sew == Assembler::e32 ? 16 : 32; 4247 4248 __ align(CodeEntryAlignment); 4249 StubCodeMark mark(_cgen, "StubRoutines", stub_name(vset_sew, multi_block)); 4250 address start = __ pc(); 4251 4252 Register buf = c_rarg0; 4253 Register state = c_rarg1; 4254 Register ofs = c_rarg2; 4255 Register limit = c_rarg3; 4256 Register consts = t2; // caller saved 4257 Register state_c = x28; // caller saved 4258 VectorRegister vindex = v2; 4259 VectorRegister vW0 = v4; 4260 VectorRegister vW1 = v6; 4261 VectorRegister vW2 = v8; 4262 VectorRegister vW3 = v10; 4263 VectorRegister vState0 = v12; 4264 VectorRegister vState1 = v14; 4265 VectorRegister vHash0 = v16; 4266 VectorRegister vHash1 = v18; 4267 VectorRegister vTmp0 = v20; 4268 VectorRegister vTmp1 = v22; 4269 4270 Label multi_block_loop; 4271 4272 __ enter(); 4273 4274 address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512; 4275 la(consts, ExternalAddress(constant_table)); 4276 4277 // Register use in this function: 4278 // 4279 // VECTORS 4280 // vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/65 bits), hold the message 4281 // schedule words (Wt). They start with the message block 4282 // content (W0 to W15), then further words in the message 4283 // schedule generated via vsha2ms from previous Wt. 4284 // Initially: 4285 // vW0 = W[ 3:0] = { W3, W2, W1, W0} 4286 // vW1 = W[ 7:4] = { W7, W6, W5, W4} 4287 // vW2 = W[ 11:8] = {W11, W10, W9, W8} 4288 // vW3 = W[15:12] = {W15, W14, W13, W12} 4289 // 4290 // vState0 - vState1 hold the working state variables (a, b, ..., h) 4291 // vState0 = {f[t],e[t],b[t],a[t]} 4292 // vState1 = {h[t],g[t],d[t],c[t]} 4293 // Initially: 4294 // vState0 = {H5i-1, H4i-1, H1i-1 , H0i-1} 4295 // vState1 = {H7i-i, H6i-1, H3i-1 , H2i-1} 4296 // 4297 // v0 = masks for vrgather/vmerge. Single value during the 16 rounds. 4298 // 4299 // vTmp0 = temporary, Wt+Kt 4300 // vTmp1 = temporary, Kt 4301 // 4302 // vHash0/vHash1 = hold the initial values of the hash, byte-swapped. 4303 // 4304 // During most of the function the vector state is configured so that each 4305 // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits). 4306 4307 // vsha2ch/vsha2cl uses EGW of 4*SEW. 4308 // SHA256 SEW = e32, EGW = 128-bits 4309 // SHA512 SEW = e64, EGW = 256-bits 4310 // 4311 // VLEN is required to be at least 128. 4312 // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256) 4313 // 4314 // m1: LMUL=1/2 4315 // ta: tail agnostic (don't care about those lanes) 4316 // ma: mask agnostic (don't care about those lanes) 4317 // x0 is not written, we known the number of vector elements. 4318 4319 if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128 4320 __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta); 4321 } else { 4322 __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta); 4323 } 4324 4325 int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul; 4326 __ li(t0, indexes); 4327 __ vmv_v_x(vindex, t0); 4328 4329 // Step-over a,b, so we are pointing to c. 
4330 // const_add is equal to 4x state variable, div by 2 is thus 2, a,b 4331 __ addi(state_c, state, const_add/2); 4332 4333 // Use index-load to get {f,e,b,a},{h,g,d,c} 4334 __ vluxei8_v(vState0, state, vindex); 4335 __ vluxei8_v(vState1, state_c, vindex); 4336 4337 __ bind(multi_block_loop); 4338 4339 // Capture the initial H values in vHash0 and vHash1 to allow for computing 4340 // the resulting H', since H' = H+{a',b',c',...,h'}. 4341 __ vmv_v_v(vHash0, vState0); 4342 __ vmv_v_v(vHash1, vState1); 4343 4344 // Load the 512/1024-bits of the message block in vW0-vW3 and perform 4345 // an endian swap on each 4/8 bytes element. 4346 // 4347 // If Zvkb is not implemented one can use vrgather 4348 // with an index sequence to byte-swap. 4349 // sequence = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12] 4350 // <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate 4351 // this sequence. 'vid' gives us the N. 4352 __ vleXX_v(vset_sew, vW0, buf); 4353 __ vrev8_v(vW0, vW0); 4354 __ addi(buf, buf, const_add); 4355 __ vleXX_v(vset_sew, vW1, buf); 4356 __ vrev8_v(vW1, vW1); 4357 __ addi(buf, buf, const_add); 4358 __ vleXX_v(vset_sew, vW2, buf); 4359 __ vrev8_v(vW2, vW2); 4360 __ addi(buf, buf, const_add); 4361 __ vleXX_v(vset_sew, vW3, buf); 4362 __ vrev8_v(vW3, vW3); 4363 __ addi(buf, buf, const_add); 4364 4365 // Set v0 up for the vmerge that replaces the first word (idx==0) 4366 __ vid_v(v0); 4367 __ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0) 4368 4369 VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3}; 4370 int rot_pos = 0; 4371 // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2) 4372 const int qr_end = vset_sew == Assembler::e32 ? 12 : 16; 4373 for (int i = 0; i < qr_end; i++) { 4374 sha2_quad_round(vset_sew, 4375 rotation_regs[(rot_pos + 0) & 0x3], 4376 rotation_regs[(rot_pos + 1) & 0x3], 4377 rotation_regs[(rot_pos + 2) & 0x3], 4378 rotation_regs[(rot_pos + 3) & 0x3], 4379 consts, 4380 vTmp1, vTmp0, vState0, vState1); 4381 ++rot_pos; 4382 } 4383 // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2) 4384 // Note that we stop generating new message schedule words (Wt, vW0-13) 4385 // as we already generated all the words we end up consuming (i.e., W[63:60]). 4386 const int qr_c_end = qr_end + 4; 4387 for (int i = qr_end; i < qr_c_end; i++) { 4388 sha2_quad_round(vset_sew, 4389 rotation_regs[(rot_pos + 0) & 0x3], 4390 rotation_regs[(rot_pos + 1) & 0x3], 4391 rotation_regs[(rot_pos + 2) & 0x3], 4392 rotation_regs[(rot_pos + 3) & 0x3], 4393 consts, 4394 vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1)); 4395 ++rot_pos; 4396 } 4397 4398 //-------------------------------------------------------------------------------- 4399 // Compute the updated hash value H' 4400 // H' = H + {h',g',...,b',a'} 4401 // = {h,g,...,b,a} + {h',g',...,b',a'} 4402 // = {h+h',g+g',...,b+b',a+a'} 4403 4404 // H' = H+{a',b',c',...,h'} 4405 __ vadd_vv(vState0, vHash0, vState0); 4406 __ vadd_vv(vState1, vHash1, vState1); 4407 4408 if (multi_block) { 4409 int total_adds = vset_sew == Assembler::e32 ? 240 : 608; 4410 __ addi(consts, consts, -total_adds); 4411 __ add(ofs, ofs, vset_sew == Assembler::e32 ? 
64 : 128); 4412 __ ble(ofs, limit, multi_block_loop); 4413 __ mv(c_rarg0, ofs); // return ofs 4414 } 4415 4416 // Store H[0..8] = {a,b,c,d,e,f,g,h} from 4417 // vState0 = {f,e,b,a} 4418 // vState1 = {h,g,d,c} 4419 __ vsuxei8_v(vState0, state, vindex); 4420 __ vsuxei8_v(vState1, state_c, vindex); 4421 4422 __ leave(); 4423 __ ret(); 4424 4425 return start; 4426 } 4427 }; 4428 4429 #undef __ 4430 #define __ _masm-> 4431 4432 // Set of L registers that correspond to a contiguous memory area. 4433 // Each 64-bit register typically corresponds to 2 32-bit integers. 4434 template <uint L> 4435 class RegCache { 4436 private: 4437 MacroAssembler *_masm; 4438 Register _regs[L]; 4439 4440 public: 4441 RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) { 4442 assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L); 4443 auto it = rs.begin(); 4444 for (auto &r: _regs) { 4445 r = *it; 4446 ++it; 4447 } 4448 } 4449 4450 // generate load for the i'th register 4451 void gen_load(uint i, Register base) { 4452 assert(i < L, "invalid i: %u", i); 4453 __ ld(_regs[i], Address(base, 8 * i)); 4454 } 4455 4456 // add i'th 32-bit integer to dest 4457 void add_u32(const Register dest, uint i, const Register rtmp = t0) { 4458 assert(i < 2 * L, "invalid i: %u", i); 4459 4460 if (is_even(i)) { 4461 // Use the bottom 32 bits. No need to mask off the top 32 bits 4462 // as addw will do the right thing. 4463 __ addw(dest, dest, _regs[i / 2]); 4464 } else { 4465 // Use the top 32 bits by right-shifting them. 4466 __ srli(rtmp, _regs[i / 2], 32); 4467 __ addw(dest, dest, rtmp); 4468 } 4469 } 4470 }; 4471 4472 typedef RegCache<8> BufRegCache; 4473 4474 // a += value + x + ac; 4475 // a = Integer.rotateLeft(a, s) + b; 4476 void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache, 4477 Register a, Register b, Register c, Register d, 4478 int k, int s, int t, 4479 Register value) { 4480 // a += ac 4481 __ addw(a, a, t, t1); 4482 4483 // a += x; 4484 reg_cache.add_u32(a, k); 4485 // a += value; 4486 __ addw(a, a, value); 4487 4488 // a = Integer.rotateLeft(a, s) + b; 4489 __ rolw_imm(a, a, s); 4490 __ addw(a, a, b); 4491 } 4492 4493 // a += ((b & c) | ((~b) & d)) + x + ac; 4494 // a = Integer.rotateLeft(a, s) + b; 4495 void md5_FF(BufRegCache& reg_cache, 4496 Register a, Register b, Register c, Register d, 4497 int k, int s, int t, 4498 Register rtmp1, Register rtmp2) { 4499 // rtmp1 = b & c 4500 __ andr(rtmp1, b, c); 4501 4502 // rtmp2 = (~b) & d 4503 __ andn(rtmp2, d, b); 4504 4505 // rtmp1 = (b & c) | ((~b) & d) 4506 __ orr(rtmp1, rtmp1, rtmp2); 4507 4508 m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); 4509 } 4510 4511 // a += ((b & d) | (c & (~d))) + x + ac; 4512 // a = Integer.rotateLeft(a, s) + b; 4513 void md5_GG(BufRegCache& reg_cache, 4514 Register a, Register b, Register c, Register d, 4515 int k, int s, int t, 4516 Register rtmp1, Register rtmp2) { 4517 // rtmp1 = b & d 4518 __ andr(rtmp1, b, d); 4519 4520 // rtmp2 = c & (~d) 4521 __ andn(rtmp2, c, d); 4522 4523 // rtmp1 = (b & d) | (c & (~d)) 4524 __ orr(rtmp1, rtmp1, rtmp2); 4525 4526 m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); 4527 } 4528 4529 // a += ((b ^ c) ^ d) + x + ac; 4530 // a = Integer.rotateLeft(a, s) + b; 4531 void md5_HH(BufRegCache& reg_cache, 4532 Register a, Register b, Register c, Register d, 4533 int k, int s, int t, 4534 Register rtmp1, Register rtmp2) { 4535 // rtmp1 = (b ^ c) ^ d 4536 __ xorr(rtmp2, b, c); 4537 __ xorr(rtmp1, rtmp2, d); 4538 4539 
m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); 4540 } 4541 4542 // a += (c ^ (b | (~d))) + x + ac; 4543 // a = Integer.rotateLeft(a, s) + b; 4544 void md5_II(BufRegCache& reg_cache, 4545 Register a, Register b, Register c, Register d, 4546 int k, int s, int t, 4547 Register rtmp1, Register rtmp2) { 4548 // rtmp1 = c ^ (b | (~d)) 4549 __ orn(rtmp2, b, d); 4550 __ xorr(rtmp1, c, rtmp2); 4551 4552 m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); 4553 } 4554 4555 // Arguments: 4556 // 4557 // Inputs: 4558 // c_rarg0 - byte[] source+offset 4559 // c_rarg1 - int[] SHA.state 4560 // c_rarg2 - int offset (multi_block == True) 4561 // c_rarg3 - int limit (multi_block == True) 4562 // 4563 // Registers: 4564 // x0 zero (zero) 4565 // x1 ra (return address) 4566 // x2 sp (stack pointer) 4567 // x3 gp (global pointer) 4568 // x4 tp (thread pointer) 4569 // x5 t0 (tmp register) 4570 // x6 t1 (tmp register) 4571 // x7 t2 state0 4572 // x8 f0/s0 (frame pointer) 4573 // x9 s1 4574 // x10 a0 rtmp1 / c_rarg0 4575 // x11 a1 rtmp2 / c_rarg1 4576 // x12 a2 a / c_rarg2 4577 // x13 a3 b / c_rarg3 4578 // x14 a4 c 4579 // x15 a5 d 4580 // x16 a6 buf 4581 // x17 a7 state 4582 // x18 s2 ofs [saved-reg] (multi_block == True) 4583 // x19 s3 limit [saved-reg] (multi_block == True) 4584 // x20 s4 state1 [saved-reg] 4585 // x21 s5 state2 [saved-reg] 4586 // x22 s6 state3 [saved-reg] 4587 // x23 s7 4588 // x24 s8 buf0 [saved-reg] 4589 // x25 s9 buf1 [saved-reg] 4590 // x26 s10 buf2 [saved-reg] 4591 // x27 s11 buf3 [saved-reg] 4592 // x28 t3 buf4 4593 // x29 t4 buf5 4594 // x30 t5 buf6 4595 // x31 t6 buf7 4596 address generate_md5_implCompress(bool multi_block, const char *name) { 4597 __ align(CodeEntryAlignment); 4598 StubCodeMark mark(this, "StubRoutines", name); 4599 address start = __ pc(); 4600 4601 // rotation constants 4602 const int S11 = 7; 4603 const int S12 = 12; 4604 const int S13 = 17; 4605 const int S14 = 22; 4606 const int S21 = 5; 4607 const int S22 = 9; 4608 const int S23 = 14; 4609 const int S24 = 20; 4610 const int S31 = 4; 4611 const int S32 = 11; 4612 const int S33 = 16; 4613 const int S34 = 23; 4614 const int S41 = 6; 4615 const int S42 = 10; 4616 const int S43 = 15; 4617 const int S44 = 21; 4618 4619 const int64_t mask32 = 0xffffffff; 4620 4621 Register buf_arg = c_rarg0; // a0 4622 Register state_arg = c_rarg1; // a1 4623 Register ofs_arg = c_rarg2; // a2 4624 Register limit_arg = c_rarg3; // a3 4625 4626 // we'll copy the args to these registers to free up a0-a3 4627 // to use for other values manipulated by instructions 4628 // that can be compressed 4629 Register buf = x16; // a6 4630 Register state = x17; // a7 4631 Register ofs = x18; // s2 4632 Register limit = x19; // s3 4633 4634 // using x12->15 to allow compressed instructions 4635 Register a = x12; // a2 4636 Register b = x13; // a3 4637 Register c = x14; // a4 4638 Register d = x15; // a5 4639 4640 Register state0 = x7; // t2 4641 Register state1 = x20; // s4 4642 Register state2 = x21; // s5 4643 Register state3 = x22; // s6 4644 4645 // using x10->x11 to allow compressed instructions 4646 Register rtmp1 = x10; // a0 4647 Register rtmp2 = x11; // a1 4648 4649 RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11 4650 RegSet reg_cache_regs; 4651 reg_cache_regs += reg_cache_saved_regs; 4652 reg_cache_regs += RegSet::of(t3, t4, t5, t6); 4653 BufRegCache reg_cache(_masm, reg_cache_regs); 4654 4655 RegSet saved_regs; 4656 if (multi_block) { 4657 saved_regs += RegSet::of(ofs, 
limit); 4658 } 4659 saved_regs += RegSet::of(state1, state2, state3); 4660 saved_regs += reg_cache_saved_regs; 4661 4662 __ push_reg(saved_regs, sp); 4663 4664 __ mv(buf, buf_arg); 4665 __ mv(state, state_arg); 4666 if (multi_block) { 4667 __ mv(ofs, ofs_arg); 4668 __ mv(limit, limit_arg); 4669 } 4670 4671 // to minimize the number of memory operations: 4672 // read the 4 state 4-byte values in pairs, with a single ld, 4673 // and split them into 2 registers. 4674 // 4675 // And, as the core algorithm of md5 works on 32-bits words, so 4676 // in the following code, it does not care about the content of 4677 // higher 32-bits in state[x]. Based on this observation, 4678 // we can apply further optimization, which is to just ignore the 4679 // higher 32-bits in state0/state2, rather than set the higher 4680 // 32-bits of state0/state2 to zero explicitly with extra instructions. 4681 __ ld(state0, Address(state)); 4682 __ srli(state1, state0, 32); 4683 __ ld(state2, Address(state, 8)); 4684 __ srli(state3, state2, 32); 4685 4686 Label md5_loop; 4687 __ BIND(md5_loop); 4688 4689 __ mv(a, state0); 4690 __ mv(b, state1); 4691 __ mv(c, state2); 4692 __ mv(d, state3); 4693 4694 // Round 1 4695 reg_cache.gen_load(0, buf); 4696 md5_FF(reg_cache, a, b, c, d, 0, S11, 0xd76aa478, rtmp1, rtmp2); 4697 md5_FF(reg_cache, d, a, b, c, 1, S12, 0xe8c7b756, rtmp1, rtmp2); 4698 reg_cache.gen_load(1, buf); 4699 md5_FF(reg_cache, c, d, a, b, 2, S13, 0x242070db, rtmp1, rtmp2); 4700 md5_FF(reg_cache, b, c, d, a, 3, S14, 0xc1bdceee, rtmp1, rtmp2); 4701 reg_cache.gen_load(2, buf); 4702 md5_FF(reg_cache, a, b, c, d, 4, S11, 0xf57c0faf, rtmp1, rtmp2); 4703 md5_FF(reg_cache, d, a, b, c, 5, S12, 0x4787c62a, rtmp1, rtmp2); 4704 reg_cache.gen_load(3, buf); 4705 md5_FF(reg_cache, c, d, a, b, 6, S13, 0xa8304613, rtmp1, rtmp2); 4706 md5_FF(reg_cache, b, c, d, a, 7, S14, 0xfd469501, rtmp1, rtmp2); 4707 reg_cache.gen_load(4, buf); 4708 md5_FF(reg_cache, a, b, c, d, 8, S11, 0x698098d8, rtmp1, rtmp2); 4709 md5_FF(reg_cache, d, a, b, c, 9, S12, 0x8b44f7af, rtmp1, rtmp2); 4710 reg_cache.gen_load(5, buf); 4711 md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2); 4712 md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2); 4713 reg_cache.gen_load(6, buf); 4714 md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2); 4715 md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2); 4716 reg_cache.gen_load(7, buf); 4717 md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2); 4718 md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2); 4719 4720 // Round 2 4721 md5_GG(reg_cache, a, b, c, d, 1, S21, 0xf61e2562, rtmp1, rtmp2); 4722 md5_GG(reg_cache, d, a, b, c, 6, S22, 0xc040b340, rtmp1, rtmp2); 4723 md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2); 4724 md5_GG(reg_cache, b, c, d, a, 0, S24, 0xe9b6c7aa, rtmp1, rtmp2); 4725 md5_GG(reg_cache, a, b, c, d, 5, S21, 0xd62f105d, rtmp1, rtmp2); 4726 md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2); 4727 md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2); 4728 md5_GG(reg_cache, b, c, d, a, 4, S24, 0xe7d3fbc8, rtmp1, rtmp2); 4729 md5_GG(reg_cache, a, b, c, d, 9, S21, 0x21e1cde6, rtmp1, rtmp2); 4730 md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2); 4731 md5_GG(reg_cache, c, d, a, b, 3, S23, 0xf4d50d87, rtmp1, rtmp2); 4732 md5_GG(reg_cache, b, c, d, a, 8, S24, 0x455a14ed, rtmp1, rtmp2); 4733 md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2); 4734 md5_GG(reg_cache, d, a, 
b, c, 2, S22, 0xfcefa3f8, rtmp1, rtmp2); 4735 md5_GG(reg_cache, c, d, a, b, 7, S23, 0x676f02d9, rtmp1, rtmp2); 4736 md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2); 4737 4738 // Round 3 4739 md5_HH(reg_cache, a, b, c, d, 5, S31, 0xfffa3942, rtmp1, rtmp2); 4740 md5_HH(reg_cache, d, a, b, c, 8, S32, 0x8771f681, rtmp1, rtmp2); 4741 md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2); 4742 md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2); 4743 md5_HH(reg_cache, a, b, c, d, 1, S31, 0xa4beea44, rtmp1, rtmp2); 4744 md5_HH(reg_cache, d, a, b, c, 4, S32, 0x4bdecfa9, rtmp1, rtmp2); 4745 md5_HH(reg_cache, c, d, a, b, 7, S33, 0xf6bb4b60, rtmp1, rtmp2); 4746 md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2); 4747 md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2); 4748 md5_HH(reg_cache, d, a, b, c, 0, S32, 0xeaa127fa, rtmp1, rtmp2); 4749 md5_HH(reg_cache, c, d, a, b, 3, S33, 0xd4ef3085, rtmp1, rtmp2); 4750 md5_HH(reg_cache, b, c, d, a, 6, S34, 0x04881d05, rtmp1, rtmp2); 4751 md5_HH(reg_cache, a, b, c, d, 9, S31, 0xd9d4d039, rtmp1, rtmp2); 4752 md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2); 4753 md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2); 4754 md5_HH(reg_cache, b, c, d, a, 2, S34, 0xc4ac5665, rtmp1, rtmp2); 4755 4756 // Round 4 4757 md5_II(reg_cache, a, b, c, d, 0, S41, 0xf4292244, rtmp1, rtmp2); 4758 md5_II(reg_cache, d, a, b, c, 7, S42, 0x432aff97, rtmp1, rtmp2); 4759 md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2); 4760 md5_II(reg_cache, b, c, d, a, 5, S44, 0xfc93a039, rtmp1, rtmp2); 4761 md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2); 4762 md5_II(reg_cache, d, a, b, c, 3, S42, 0x8f0ccc92, rtmp1, rtmp2); 4763 md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2); 4764 md5_II(reg_cache, b, c, d, a, 1, S44, 0x85845dd1, rtmp1, rtmp2); 4765 md5_II(reg_cache, a, b, c, d, 8, S41, 0x6fa87e4f, rtmp1, rtmp2); 4766 md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2); 4767 md5_II(reg_cache, c, d, a, b, 6, S43, 0xa3014314, rtmp1, rtmp2); 4768 md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2); 4769 md5_II(reg_cache, a, b, c, d, 4, S41, 0xf7537e82, rtmp1, rtmp2); 4770 md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2); 4771 md5_II(reg_cache, c, d, a, b, 2, S43, 0x2ad7d2bb, rtmp1, rtmp2); 4772 md5_II(reg_cache, b, c, d, a, 9, S44, 0xeb86d391, rtmp1, rtmp2); 4773 4774 __ addw(state0, state0, a); 4775 __ addw(state1, state1, b); 4776 __ addw(state2, state2, c); 4777 __ addw(state3, state3, d); 4778 4779 if (multi_block) { 4780 __ addi(buf, buf, 64); 4781 __ addi(ofs, ofs, 64); 4782 // if (ofs <= limit) goto m5_loop 4783 __ bge(limit, ofs, md5_loop); 4784 __ mv(c_rarg0, ofs); // return ofs 4785 } 4786 4787 // to minimize the number of memory operations: 4788 // write back the 4 state 4-byte values in pairs, with a single sd 4789 __ mv(t0, mask32); 4790 __ andr(state0, state0, t0); 4791 __ slli(state1, state1, 32); 4792 __ orr(state0, state0, state1); 4793 __ sd(state0, Address(state)); 4794 __ andr(state2, state2, t0); 4795 __ slli(state3, state3, 32); 4796 __ orr(state2, state2, state3); 4797 __ sd(state2, Address(state, 8)); 4798 4799 __ pop_reg(saved_regs, sp); 4800 __ ret(); 4801 4802 return (address) start; 4803 } 4804 4805 /** 4806 * Perform the quarter round calculations on values contained within four vector registers. 
 *
 * @param aVec the SIMD register containing only the "a" values
 * @param bVec the SIMD register containing only the "b" values
 * @param cVec the SIMD register containing only the "c" values
 * @param dVec the SIMD register containing only the "d" values
 * @param tmp_vr temporary vector register that holds intermediate values.
 */
  void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
                              VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
    // a += b, d ^= a, d <<<= 16
    __ vadd_vv(aVec, aVec, bVec);
    __ vxor_vv(dVec, dVec, aVec);
    __ vrole32_vi(dVec, 16, tmp_vr);

    // c += d, b ^= c, b <<<= 12
    __ vadd_vv(cVec, cVec, dVec);
    __ vxor_vv(bVec, bVec, cVec);
    __ vrole32_vi(bVec, 12, tmp_vr);

    // a += b, d ^= a, d <<<= 8
    __ vadd_vv(aVec, aVec, bVec);
    __ vxor_vv(dVec, dVec, aVec);
    __ vrole32_vi(dVec, 8, tmp_vr);

    // c += d, b ^= c, b <<<= 7
    __ vadd_vv(cVec, cVec, dVec);
    __ vxor_vv(bVec, bVec, cVec);
    __ vrole32_vi(bVec, 7, tmp_vr);
  }

  /**
   * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
   *
   * Input arguments:
   * c_rarg0   - state, the starting state
   * c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function
   *
   * Implementation Note:
   *  Parallelization is achieved by loading individual state elements into vectors for N blocks.
   *  N depends on the vector register length.
   */
  address generate_chacha20Block() {
    Label L_Rounds;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "chacha20Block");
    address start = __ pc();
    __ enter();

    const int states_len = 16;
    const int step = 4;
    const Register state = c_rarg0;
    const Register key_stream = c_rarg1;
    const Register tmp_addr = t0;
    const Register length = t1;

    // Organize vector registers in an array that facilitates
    // putting repetitive opcodes into loop structures below.
    const VectorRegister work_vrs[16] = {
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    };
    const VectorRegister tmp_vr = v16;
    const VectorRegister counter_vr = v17;

    {
      // Put 16 here, as com.sun.crypto.provider.ChaCha20Cipher.KS_MAX_LEN is 1024
      // at the Java level.
      __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
    }

    // Load from source state.
    // Every element in source state is duplicated to all elements in the corresponding vector.
    __ mv(tmp_addr, state);
    for (int i = 0; i < states_len; i += 1) {
      __ vlse32_v(work_vrs[i], tmp_addr, zr);
      __ addi(tmp_addr, tmp_addr, step);
    }
    // Adjust counter for every individual block.
    __ vid_v(counter_vr);
    __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);

    // Perform 10 iterations of the 8 quarter round set
    {
      const Register loop = t2; // share t2 with other non-overlapping usages.
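
      // Editorial note: each iteration of the loop below is one standard ChaCha20
      // "double round" -- the first four quarter-round calls mix the columns of the
      // 4x4 state matrix, the next four mix its diagonals.  Ten iterations give the
      // 20 rounds of the ChaCha20 block function, computed for several blocks in
      // parallel (one block per vector element).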
4892 __ mv(loop, 10); 4893 __ BIND(L_Rounds); 4894 4895 chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8], work_vrs[12], tmp_vr); 4896 chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9], work_vrs[13], tmp_vr); 4897 chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr); 4898 chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr); 4899 4900 chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr); 4901 chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr); 4902 chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8], work_vrs[13], tmp_vr); 4903 chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9], work_vrs[14], tmp_vr); 4904 4905 __ sub(loop, loop, 1); 4906 __ bnez(loop, L_Rounds); 4907 } 4908 4909 // Add the original state into the end working state. 4910 // We do this by first duplicating every element in source state array to the corresponding 4911 // vector, then adding it to the post-loop working state. 4912 __ mv(tmp_addr, state); 4913 for (int i = 0; i < states_len; i += 1) { 4914 __ vlse32_v(tmp_vr, tmp_addr, zr); 4915 __ addi(tmp_addr, tmp_addr, step); 4916 __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr); 4917 } 4918 // Add the counter overlay onto work_vrs[12] at the end. 4919 __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr); 4920 4921 // Store result to key stream. 4922 { 4923 const Register stride = t2; // share t2 with other non-overlapping usages. 4924 // Every block occupies 64 bytes, so we use 64 as stride of the vector store. 4925 __ mv(stride, 64); 4926 for (int i = 0; i < states_len; i += 1) { 4927 __ vsse32_v(work_vrs[i], key_stream, stride); 4928 __ addi(key_stream, key_stream, step); 4929 } 4930 } 4931 4932 // Return length of output key_stream 4933 __ slli(c_rarg0, length, 6); 4934 4935 __ leave(); 4936 __ ret(); 4937 4938 return (address) start; 4939 } 4940 4941 4942 // ------------------------ SHA-1 intrinsic ------------------------ 4943 4944 // K't = 4945 // 5a827999, 0 <= t <= 19 4946 // 6ed9eba1, 20 <= t <= 39 4947 // 8f1bbcdc, 40 <= t <= 59 4948 // ca62c1d6, 60 <= t <= 79 4949 void sha1_prepare_k(Register cur_k, int round) { 4950 assert(round >= 0 && round < 80, "must be"); 4951 4952 static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6}; 4953 if ((round % 20) == 0) { 4954 __ mv(cur_k, ks[round/20]); 4955 } 4956 } 4957 4958 // W't = 4959 // M't, 0 <= t <= 15 4960 // ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79 4961 void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) { 4962 assert(round >= 0 && round < 80, "must be"); 4963 4964 if (round < 16) { 4965 // in the first 16 rounds, in ws[], every register contains 2 W't, e.g. 4966 // in ws[0], high part contains W't-0, low part contains W't-1, 4967 // in ws[1], high part contains W't-2, low part contains W't-3, 4968 // ... 4969 // in ws[7], high part contains W't-14, low part contains W't-15. 4970 4971 if ((round % 2) == 0) { 4972 __ ld(ws[round/2], Address(buf, (round/2) * 8)); 4973 // reverse bytes, as SHA-1 is defined in big-endian. 
4974 __ revb(ws[round/2], ws[round/2]); 4975 __ srli(cur_w, ws[round/2], 32); 4976 } else { 4977 __ mv(cur_w, ws[round/2]); 4978 } 4979 4980 return; 4981 } 4982 4983 if ((round % 2) == 0) { 4984 int idx = 16; 4985 // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79 4986 __ srli(t1, ws[(idx-8)/2], 32); 4987 __ xorr(t0, ws[(idx-3)/2], t1); 4988 4989 __ srli(t1, ws[(idx-14)/2], 32); 4990 __ srli(cur_w, ws[(idx-16)/2], 32); 4991 __ xorr(cur_w, cur_w, t1); 4992 4993 __ xorr(cur_w, cur_w, t0); 4994 __ rolw_imm(cur_w, cur_w, 1, t0); 4995 4996 // copy the cur_w value to ws[8]. 4997 // now, valid w't values are at: 4998 // w0: ws[0]'s lower 32 bits 4999 // w1 ~ w14: ws[1] ~ ws[7] 5000 // w15: ws[8]'s higher 32 bits 5001 __ slli(ws[idx/2], cur_w, 32); 5002 5003 return; 5004 } 5005 5006 int idx = 17; 5007 // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79 5008 __ srli(t1, ws[(idx-3)/2], 32); 5009 __ xorr(t0, t1, ws[(idx-8)/2]); 5010 5011 __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]); 5012 5013 __ xorr(cur_w, cur_w, t0); 5014 __ rolw_imm(cur_w, cur_w, 1, t0); 5015 5016 // copy the cur_w value to ws[8] 5017 __ zero_extend(cur_w, cur_w, 32); 5018 __ orr(ws[idx/2], ws[idx/2], cur_w); 5019 5020 // shift the w't registers, so they start from ws[0] again. 5021 // now, valid w't values are at: 5022 // w0 ~ w15: ws[0] ~ ws[7] 5023 Register ws_0 = ws[0]; 5024 for (int i = 0; i < 16/2; i++) { 5025 ws[i] = ws[i+1]; 5026 } 5027 ws[8] = ws_0; 5028 } 5029 5030 // f't(x, y, z) = 5031 // Ch(x, y, z) = (x & y) ^ (~x & z) , 0 <= t <= 19 5032 // Parity(x, y, z) = x ^ y ^ z , 20 <= t <= 39 5033 // Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) , 40 <= t <= 59 5034 // Parity(x, y, z) = x ^ y ^ z , 60 <= t <= 79 5035 void sha1_f(Register dst, Register x, Register y, Register z, int round) { 5036 assert(round >= 0 && round < 80, "must be"); 5037 assert_different_registers(dst, x, y, z, t0, t1); 5038 5039 if (round < 20) { 5040 // (x & y) ^ (~x & z) 5041 __ andr(t0, x, y); 5042 __ andn(dst, z, x); 5043 __ xorr(dst, dst, t0); 5044 } else if (round >= 40 && round < 60) { 5045 // (x & y) ^ (x & z) ^ (y & z) 5046 __ andr(t0, x, y); 5047 __ andr(t1, x, z); 5048 __ andr(dst, y, z); 5049 __ xorr(dst, dst, t0); 5050 __ xorr(dst, dst, t1); 5051 } else { 5052 // x ^ y ^ z 5053 __ xorr(dst, x, y); 5054 __ xorr(dst, dst, z); 5055 } 5056 } 5057 5058 // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't 5059 // e = d 5060 // d = c 5061 // c = ROTL'30(b) 5062 // b = a 5063 // a = T 5064 void sha1_process_round(Register a, Register b, Register c, Register d, Register e, 5065 Register cur_k, Register cur_w, Register tmp, int round) { 5066 assert(round >= 0 && round < 80, "must be"); 5067 assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0); 5068 5069 // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't 5070 5071 // cur_w will be recalculated at the beginning of each round, 5072 // so, we can reuse it as a temp register here. 
5073 Register tmp2 = cur_w; 5074 5075 // reuse e as a temporary register, as we will mv new value into it later 5076 Register tmp3 = e; 5077 __ add(tmp2, cur_k, tmp2); 5078 __ add(tmp3, tmp3, tmp2); 5079 __ rolw_imm(tmp2, a, 5, t0); 5080 5081 sha1_f(tmp, b, c, d, round); 5082 5083 __ add(tmp2, tmp2, tmp); 5084 __ add(tmp2, tmp2, tmp3); 5085 5086 // e = d 5087 // d = c 5088 // c = ROTL'30(b) 5089 // b = a 5090 // a = T 5091 __ mv(e, d); 5092 __ mv(d, c); 5093 5094 __ rolw_imm(c, b, 30); 5095 __ mv(b, a); 5096 __ mv(a, tmp2); 5097 } 5098 5099 // H(i)0 = a + H(i-1)0 5100 // H(i)1 = b + H(i-1)1 5101 // H(i)2 = c + H(i-1)2 5102 // H(i)3 = d + H(i-1)3 5103 // H(i)4 = e + H(i-1)4 5104 void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e, 5105 Register prev_ab, Register prev_cd, Register prev_e) { 5106 assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e); 5107 5108 __ add(a, a, prev_ab); 5109 __ srli(prev_ab, prev_ab, 32); 5110 __ add(b, b, prev_ab); 5111 5112 __ add(c, c, prev_cd); 5113 __ srli(prev_cd, prev_cd, 32); 5114 __ add(d, d, prev_cd); 5115 5116 __ add(e, e, prev_e); 5117 } 5118 5119 void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e, 5120 Register prev_ab, Register prev_cd, Register prev_e) { 5121 assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0); 5122 5123 __ slli(t0, b, 32); 5124 __ zero_extend(prev_ab, a, 32); 5125 __ orr(prev_ab, prev_ab, t0); 5126 5127 __ slli(t0, d, 32); 5128 __ zero_extend(prev_cd, c, 32); 5129 __ orr(prev_cd, prev_cd, t0); 5130 5131 __ mv(prev_e, e); 5132 } 5133 5134 // Intrinsic for: 5135 // void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs) 5136 // void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit) 5137 // 5138 // Arguments: 5139 // 5140 // Inputs: 5141 // c_rarg0: byte[] src array + offset 5142 // c_rarg1: int[] SHA.state 5143 // - - - - - - below are only for implCompressMultiBlock0 - - - - - - 5144 // c_rarg2: int offset 5145 // c_rarg3: int limit 5146 // 5147 // Outputs: 5148 // - - - - - - below are only for implCompressMultiBlock0 - - - - - - 5149 // c_rarg0: int offset, when (multi_block == true) 5150 // 5151 address generate_sha1_implCompress(bool multi_block, const char *name) { 5152 __ align(CodeEntryAlignment); 5153 StubCodeMark mark(this, "StubRoutines", name); 5154 5155 address start = __ pc(); 5156 __ enter(); 5157 5158 RegSet saved_regs = RegSet::range(x18, x27); 5159 if (multi_block) { 5160 // use x9 as src below. 5161 saved_regs += RegSet::of(x9); 5162 } 5163 __ push_reg(saved_regs, sp); 5164 5165 // c_rarg0 - c_rarg3: x10 - x13 5166 Register buf = c_rarg0; 5167 Register state = c_rarg1; 5168 Register offset = c_rarg2; 5169 Register limit = c_rarg3; 5170 // use src to contain the original start point of the array. 5171 Register src = x9; 5172 5173 if (multi_block) { 5174 __ sub(limit, limit, offset); 5175 __ add(limit, limit, buf); 5176 __ sub(src, buf, offset); 5177 } 5178 5179 // [args-reg]: x14 - x17 5180 // [temp-reg]: x28 - x31 5181 // [saved-reg]: x18 - x27 5182 5183 // h0/1/2/3/4 5184 const Register a = x14, b = x15, c = x16, d = x17, e = x28; 5185 // w0, w1, ... w15 5186 // put two adjecent w's in one register: 5187 // one at high word part, another at low word part 5188 // at different round (even or odd), w't value reside in different items in ws[]. 
5189 // w0 ~ w15, either reside in 5190 // ws[0] ~ ws[7], where 5191 // w0 at higher 32 bits of ws[0], 5192 // w1 at lower 32 bits of ws[0], 5193 // ... 5194 // w14 at higher 32 bits of ws[7], 5195 // w15 at lower 32 bits of ws[7]. 5196 // or, reside in 5197 // w0: ws[0]'s lower 32 bits 5198 // w1 ~ w14: ws[1] ~ ws[7] 5199 // w15: ws[8]'s higher 32 bits 5200 Register ws[9] = {x29, x30, x31, x18, 5201 x19, x20, x21, x22, 5202 x23}; // auxiliary register for calculating w's value 5203 // current k't's value 5204 const Register cur_k = x24; 5205 // current w't's value 5206 const Register cur_w = x25; 5207 // values of a, b, c, d, e in the previous round 5208 const Register prev_ab = x26, prev_cd = x27; 5209 const Register prev_e = offset; // reuse offset/c_rarg2 5210 5211 // load 5 words state into a, b, c, d, e. 5212 // 5213 // To minimize the number of memory operations, we apply following 5214 // optimization: read the states (a/b/c/d) of 4-byte values in pairs, 5215 // with a single ld, and split them into 2 registers. 5216 // 5217 // And, as the core algorithm of SHA-1 works on 32-bits words, so 5218 // in the following code, it does not care about the content of 5219 // higher 32-bits in a/b/c/d/e. Based on this observation, 5220 // we can apply further optimization, which is to just ignore the 5221 // higher 32-bits in a/c/e, rather than set the higher 5222 // 32-bits of a/c/e to zero explicitly with extra instructions. 5223 __ ld(a, Address(state, 0)); 5224 __ srli(b, a, 32); 5225 __ ld(c, Address(state, 8)); 5226 __ srli(d, c, 32); 5227 __ lw(e, Address(state, 16)); 5228 5229 Label L_sha1_loop; 5230 if (multi_block) { 5231 __ BIND(L_sha1_loop); 5232 } 5233 5234 sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e); 5235 5236 for (int round = 0; round < 80; round++) { 5237 // prepare K't value 5238 sha1_prepare_k(cur_k, round); 5239 5240 // prepare W't value 5241 sha1_prepare_w(cur_w, ws, buf, round); 5242 5243 // one round process 5244 sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round); 5245 } 5246 5247 // compute the intermediate hash value 5248 sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e); 5249 5250 if (multi_block) { 5251 int64_t block_bytes = 16 * 4; 5252 __ addi(buf, buf, block_bytes); 5253 5254 __ bge(limit, buf, L_sha1_loop, true); 5255 } 5256 5257 // store back the state. 
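//
// Editor's note (illustrative sketch, not part of the generated stub): the
// packed 64-bit stores below are equivalent to writing the five 32-bit
// state words one by one; in plain C roughly:
//
//   static void sha1_store_state_ref(uint32_t state[5], uint32_t a, uint32_t b,
//                                    uint32_t c, uint32_t d, uint32_t e) {
//     state[0] = a; state[1] = b; state[2] = c; state[3] = d; state[4] = e;
//   }
//
// Since the target is little-endian, merging two adjacent words (low word
// first) lets a single sd replace two sw's.
//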
5258 __ zero_extend(a, a, 32); 5259 __ slli(b, b, 32); 5260 __ orr(a, a, b); 5261 __ sd(a, Address(state, 0)); 5262 __ zero_extend(c, c, 32); 5263 __ slli(d, d, 32); 5264 __ orr(c, c, d); 5265 __ sd(c, Address(state, 8)); 5266 __ sw(e, Address(state, 16)); 5267 5268 // return offset 5269 if (multi_block) { 5270 __ sub(c_rarg0, buf, src); 5271 } 5272 5273 __ pop_reg(saved_regs, sp); 5274 5275 __ leave(); 5276 __ ret(); 5277 5278 return (address) start; 5279 } 5280 5281 /** 5282 * vector registers: 5283 * input VectorRegister's: intputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3 5284 * index VectorRegister's: idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7 5285 * output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11 5286 * 5287 * NOTE: each field will occupy a vector register group 5288 */ 5289 void base64_vector_encode_round(Register src, Register dst, Register codec, 5290 Register size, Register stepSrc, Register stepDst, 5291 VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, 5292 VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4, 5293 VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4, 5294 Assembler::LMUL lmul) { 5295 // set vector register type/len 5296 __ vsetvli(x0, size, Assembler::e8, lmul); 5297 5298 // segmented load src into v registers: mem(src) => vr(3) 5299 __ vlseg3e8_v(inputV1, src); 5300 5301 // src = src + register_group_len_bytes * 3 5302 __ add(src, src, stepSrc); 5303 5304 // encoding 5305 // 1. compute index into lookup table: vr(3) => vr(4) 5306 __ vsrl_vi(idxV1, inputV1, 2); 5307 5308 __ vsrl_vi(idxV2, inputV2, 2); 5309 __ vsll_vi(inputV1, inputV1, 6); 5310 __ vor_vv(idxV2, idxV2, inputV1); 5311 __ vsrl_vi(idxV2, idxV2, 2); 5312 5313 __ vsrl_vi(idxV3, inputV3, 4); 5314 __ vsll_vi(inputV2, inputV2, 4); 5315 __ vor_vv(idxV3, inputV2, idxV3); 5316 __ vsrl_vi(idxV3, idxV3, 2); 5317 5318 __ vsll_vi(idxV4, inputV3, 2); 5319 __ vsrl_vi(idxV4, idxV4, 2); 5320 5321 // 2. 
indexed load: vr(4) => vr(4) 5322 __ vluxei8_v(outputV1, codec, idxV1); 5323 __ vluxei8_v(outputV2, codec, idxV2); 5324 __ vluxei8_v(outputV3, codec, idxV3); 5325 __ vluxei8_v(outputV4, codec, idxV4); 5326 5327 // segmented store encoded data in v registers back to dst: vr(4) => mem(dst) 5328 __ vsseg4e8_v(outputV1, dst); 5329 5330 // dst = dst + register_group_len_bytes * 4 5331 __ add(dst, dst, stepDst); 5332 } 5333 5334 /** 5335 * void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) 5336 * 5337 * Input arguments: 5338 * c_rarg0 - src, source array 5339 * c_rarg1 - sp, src start offset 5340 * c_rarg2 - sl, src end offset 5341 * c_rarg3 - dst, dest array 5342 * c_rarg4 - dp, dst start offset 5343 * c_rarg5 - isURL, Base64 or URL character set 5344 */ 5345 address generate_base64_encodeBlock() { 5346 alignas(64) static const char toBase64[64] = { 5347 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 5348 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 5349 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 5350 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 5351 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 5352 }; 5353 5354 alignas(64) static const char toBase64URL[64] = { 5355 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 5356 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 5357 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 5358 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 5359 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 5360 }; 5361 5362 __ align(CodeEntryAlignment); 5363 StubCodeMark mark(this, "StubRoutines", "encodeBlock"); 5364 address start = __ pc(); 5365 __ enter(); 5366 5367 Register src = c_rarg0; 5368 Register soff = c_rarg1; 5369 Register send = c_rarg2; 5370 Register dst = c_rarg3; 5371 Register doff = c_rarg4; 5372 Register isURL = c_rarg5; 5373 5374 Register codec = c_rarg6; 5375 Register length = c_rarg7; // total length of src data in bytes 5376 5377 Label ProcessData, Exit; 5378 5379 // length should be multiple of 3 5380 __ sub(length, send, soff); 5381 // real src/dst to process data 5382 __ add(src, src, soff); 5383 __ add(dst, dst, doff); 5384 5385 // load the codec base address 5386 __ la(codec, ExternalAddress((address) toBase64)); 5387 __ beqz(isURL, ProcessData); 5388 __ la(codec, ExternalAddress((address) toBase64URL)); 5389 __ BIND(ProcessData); 5390 5391 // vector version 5392 if (UseRVV) { 5393 Label ProcessM2, ProcessM1, ProcessScalar; 5394 5395 Register size = soff; 5396 Register stepSrcM1 = send; 5397 Register stepSrcM2 = doff; 5398 Register stepDst = isURL; 5399 5400 __ mv(size, MaxVectorSize * 2); 5401 __ mv(stepSrcM1, MaxVectorSize * 3); 5402 __ slli(stepSrcM2, stepSrcM1, 1); 5403 __ mv(stepDst, MaxVectorSize * 2 * 4); 5404 5405 __ blt(length, stepSrcM2, ProcessM1); 5406 5407 __ BIND(ProcessM2); 5408 base64_vector_encode_round(src, dst, codec, 5409 size, stepSrcM2, stepDst, 5410 v2, v4, v6, // inputs 5411 v8, v10, v12, v14, // indexes 5412 v16, v18, v20, v22, // outputs 5413 Assembler::m2); 5414 5415 __ sub(length, length, stepSrcM2); 5416 __ bge(length, stepSrcM2, ProcessM2); 5417 5418 __ BIND(ProcessM1); 5419 __ blt(length, stepSrcM1, ProcessScalar); 5420 5421 __ srli(size, size, 1); 5422 __ srli(stepDst, stepDst, 1); 5423 base64_vector_encode_round(src, dst, codec, 5424 size, stepSrcM1, stepDst, 5425 v1, v2, v3, // inputs 5426 v4, v5, v6, 
v7, // indexes
5427 v8, v9, v10, v11, // outputs
5428 Assembler::m1);
5429 __ sub(length, length, stepSrcM1);
5430
5431 __ BIND(ProcessScalar);
5432 }
5433
5434 // scalar version
5435 {
5436 Register byte1 = soff, byte0 = send, byte2 = doff;
5437 Register combined24Bits = isURL;
5438
5439 __ beqz(length, Exit);
5440
5441 Label ScalarLoop;
5442 __ BIND(ScalarLoop);
5443 {
5444 // plain: [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
5445 // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]
5446
5447 // load 3 bytes src data
5448 __ lbu(byte0, Address(src, 0));
5449 __ lbu(byte1, Address(src, 1));
5450 __ lbu(byte2, Address(src, 2));
5451 __ addi(src, src, 3);
5452
5453 // construct 24 bits from 3 bytes
5454 __ slliw(byte0, byte0, 16);
5455 __ slliw(byte1, byte1, 8);
5456 __ orr(combined24Bits, byte0, byte1);
5457 __ orr(combined24Bits, combined24Bits, byte2);
5458
5459 // get codec index and encode (i.e. load from codec by index)
5460 __ slliw(byte0, combined24Bits, 8);
5461 __ srliw(byte0, byte0, 26);
5462 __ add(byte0, codec, byte0);
5463 __ lbu(byte0, Address(byte0, 0));
5464
5465 __ slliw(byte1, combined24Bits, 14);
5466 __ srliw(byte1, byte1, 26);
5467 __ add(byte1, codec, byte1);
5468 __ lbu(byte1, Address(byte1, 0));
5469
5470 __ slliw(byte2, combined24Bits, 20);
5471 __ srliw(byte2, byte2, 26);
5472 __ add(byte2, codec, byte2);
5473 __ lbu(byte2, Address(byte2, 0));
5474
5475 __ andi(combined24Bits, combined24Bits, 0x3f);
5476 __ add(combined24Bits, codec, combined24Bits);
5477 __ lbu(combined24Bits, Address(combined24Bits, 0));
5478
5479 // store 4 bytes encoded data
5480 __ sb(byte0, Address(dst, 0));
5481 __ sb(byte1, Address(dst, 1));
5482 __ sb(byte2, Address(dst, 2));
5483 __ sb(combined24Bits, Address(dst, 3));
5484
5485 __ sub(length, length, 3);
5486 __ addi(dst, dst, 4);
5487 // loop back
5488 __ bnez(length, ScalarLoop);
5489 }
5490 }
5491
5492 __ BIND(Exit);
5493
5494 __ leave();
5495 __ ret();
5496
5497 return (address) start;
5498 }
5499
5500 /**
5501 * vector registers:
5502 * input VectorRegister's: inputV1-V4, for m2 they could be v2, v4, v6, v8, for m1 they could be v1, v2, v3, v4
5503 * index VectorRegister's: idxV1-V4, for m2 they could be v10, v12, v14, v16, for m1 they could be v5, v6, v7, v8
5504 * output VectorRegister's: outputV1-V3, for m2 they could be v18, v20, v22, for m1 they could be v9, v10, v11
5505 *
5506 * NOTE: each field will occupy a single vector register group
5507 */
5508 void base64_vector_decode_round(Register src, Register dst, Register codec,
5509 Register size, Register stepSrc, Register stepDst, Register failedIdx,
5510 VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
5511 VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
5512 VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
5513 Assembler::LMUL lmul) {
5514 // set vector register type/len
5515 __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);
5516
5517 // segmented load src into v registers: mem(src) => vr(4)
5518 __ vlseg4e8_v(inputV1, src);
5519
5520 // src = src + register_group_len_bytes * 4
5521 __ add(src, src, stepSrc);
5522
5523 // decoding
5524 // 1. indexed load: vr(4) => vr(4)
5525 __ vluxei8_v(idxV1, codec, inputV1);
5526 __ vluxei8_v(idxV2, codec, inputV2);
5527 __ vluxei8_v(idxV3, codec, inputV3);
5528 __ vluxei8_v(idxV4, codec, inputV4);
5529
5530 // 2.
check wrong data 5531 __ vor_vv(outputV1, idxV1, idxV2); 5532 __ vor_vv(outputV2, idxV3, idxV4); 5533 __ vor_vv(outputV1, outputV1, outputV2); 5534 __ vmseq_vi(v0, outputV1, -1); 5535 __ vfirst_m(failedIdx, v0); 5536 Label NoFailure, FailureAtIdx0; 5537 // valid value can only be -1 when < 0 5538 __ bltz(failedIdx, NoFailure); 5539 // when the first data (at index 0) fails, no need to process data anymore 5540 __ beqz(failedIdx, FailureAtIdx0); 5541 __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu); 5542 __ slli(stepDst, failedIdx, 1); 5543 __ add(stepDst, failedIdx, stepDst); 5544 __ BIND(NoFailure); 5545 5546 // 3. compute the decoded data: vr(4) => vr(3) 5547 __ vsll_vi(idxV1, idxV1, 2); 5548 __ vsrl_vi(outputV1, idxV2, 4); 5549 __ vor_vv(outputV1, outputV1, idxV1); 5550 5551 __ vsll_vi(idxV2, idxV2, 4); 5552 __ vsrl_vi(outputV2, idxV3, 2); 5553 __ vor_vv(outputV2, outputV2, idxV2); 5554 5555 __ vsll_vi(idxV3, idxV3, 6); 5556 __ vor_vv(outputV3, idxV4, idxV3); 5557 5558 // segmented store encoded data in v registers back to dst: vr(3) => mem(dst) 5559 __ vsseg3e8_v(outputV1, dst); 5560 5561 // dst = dst + register_group_len_bytes * 3 5562 __ add(dst, dst, stepDst); 5563 __ BIND(FailureAtIdx0); 5564 } 5565 5566 /** 5567 * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME) 5568 * 5569 * Input arguments: 5570 * c_rarg0 - src, source array 5571 * c_rarg1 - sp, src start offset 5572 * c_rarg2 - sl, src end offset 5573 * c_rarg3 - dst, dest array 5574 * c_rarg4 - dp, dst start offset 5575 * c_rarg5 - isURL, Base64 or URL character set 5576 * c_rarg6 - isMIME, Decoding MIME block 5577 */ 5578 address generate_base64_decodeBlock() { 5579 5580 static const uint8_t fromBase64[256] = { 5581 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5582 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5583 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 5584 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 5585 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 5586 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 5587 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 5588 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 5589 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5590 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5591 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5592 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5593 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5594 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5595 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5596 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5597 }; 5598 5599 static const uint8_t fromBase64URL[256] = { 5600 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5601 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5602 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 5603 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 5604 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 5605 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 5606 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 5607 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 5608 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5609 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5610 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5611 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5612 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5613 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5614 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5615 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5616 }; 5617 5618 __ align(CodeEntryAlignment); 5619 StubCodeMark mark(this, "StubRoutines", "decodeBlock"); 5620 address start = __ pc(); 5621 __ enter(); 5622 5623 Register src = c_rarg0; 5624 Register soff = c_rarg1; 5625 Register send = c_rarg2; 5626 Register dst = c_rarg3; 5627 Register doff = c_rarg4; 5628 Register isURL = c_rarg5; 5629 Register isMIME = c_rarg6; 5630 5631 Register codec = c_rarg7; 5632 Register dstBackup = t6; 5633 Register length = t3; // total length of src data in bytes 5634 5635 Label ProcessData, Exit; 5636 Label ProcessScalar, ScalarLoop; 5637 5638 // passed in length (send - soff) is guaranteed to be > 4, 5639 // and in this intrinsic we only process data of length in multiple of 4, 5640 // it's not guaranteed to be multiple of 4 by java level, so do it explicitly 5641 __ sub(length, send, soff); 5642 __ andi(length, length, -4); 5643 // real src/dst to process data 5644 __ add(src, src, soff); 5645 __ add(dst, dst, doff); 5646 // backup of dst, used to calculate the return value at exit 5647 __ mv(dstBackup, dst); 5648 5649 // load the codec base address 5650 __ la(codec, ExternalAddress((address) fromBase64)); 5651 __ beqz(isURL, ProcessData); 5652 __ la(codec, ExternalAddress((address) fromBase64URL)); 5653 __ BIND(ProcessData); 5654 5655 // vector version 5656 if (UseRVV) { 5657 // for MIME case, it has a default length limit of 76 which could be 5658 // different(smaller) from (send - soff), so in MIME case, we go through 5659 // the scalar code path directly. 
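//
// Editor's note (illustrative sketch, not part of the generated stub): both
// the vector path and the scalar path below implement the same mapping of
// four Base64 characters to three output bytes. In plain C, one quantum is
// decoded roughly as follows (codec[] maps a character to its 6-bit value,
// or 255 for an invalid character):
//
//   static int b64_decode4_ref(const uint8_t in[4], uint8_t out[3],
//                              const uint8_t codec[256]) {
//     uint32_t b0 = codec[in[0]], b1 = codec[in[1]],
//              b2 = codec[in[2]], b3 = codec[in[3]];
//     if ((b0 | b1 | b2 | b3) & 0x80) {
//       return -1;                       // at least one invalid character
//     }
//     uint32_t bits = (b0 << 18) | (b1 << 12) | (b2 << 6) | b3;
//     out[0] = (uint8_t)(bits >> 16);
//     out[1] = (uint8_t)(bits >> 8);
//     out[2] = (uint8_t)bits;
//     return 0;
//   }
//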
5660 __ bnez(isMIME, ScalarLoop); 5661 5662 Label ProcessM1, ProcessM2; 5663 5664 Register failedIdx = soff; 5665 Register stepSrcM1 = send; 5666 Register stepSrcM2 = doff; 5667 Register stepDst = isURL; 5668 Register size = t4; 5669 5670 __ mv(size, MaxVectorSize * 2); 5671 __ mv(stepSrcM1, MaxVectorSize * 4); 5672 __ slli(stepSrcM2, stepSrcM1, 1); 5673 __ mv(stepDst, MaxVectorSize * 2 * 3); 5674 5675 __ blt(length, stepSrcM2, ProcessM1); 5676 5677 5678 // Assembler::m2 5679 __ BIND(ProcessM2); 5680 base64_vector_decode_round(src, dst, codec, 5681 size, stepSrcM2, stepDst, failedIdx, 5682 v2, v4, v6, v8, // inputs 5683 v10, v12, v14, v16, // indexes 5684 v18, v20, v22, // outputs 5685 Assembler::m2); 5686 __ sub(length, length, stepSrcM2); 5687 5688 // error check 5689 // valid value of failedIdx can only be -1 when < 0 5690 __ bgez(failedIdx, Exit); 5691 5692 __ bge(length, stepSrcM2, ProcessM2); 5693 5694 5695 // Assembler::m1 5696 __ BIND(ProcessM1); 5697 __ blt(length, stepSrcM1, ProcessScalar); 5698 5699 __ srli(size, size, 1); 5700 __ srli(stepDst, stepDst, 1); 5701 base64_vector_decode_round(src, dst, codec, 5702 size, stepSrcM1, stepDst, failedIdx, 5703 v1, v2, v3, v4, // inputs 5704 v5, v6, v7, v8, // indexes 5705 v9, v10, v11, // outputs 5706 Assembler::m1); 5707 __ sub(length, length, stepSrcM1); 5708 5709 // error check 5710 // valid value of failedIdx can only be -1 when < 0 5711 __ bgez(failedIdx, Exit); 5712 5713 __ BIND(ProcessScalar); 5714 __ beqz(length, Exit); 5715 } 5716 5717 // scalar version 5718 { 5719 Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL; 5720 Register combined32Bits = t4; 5721 5722 // encoded: [byte0[5:0] : byte1[5:0] : byte2[5:0]] : byte3[5:0]] => 5723 // plain: [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]] 5724 __ BIND(ScalarLoop); 5725 5726 // load 4 bytes encoded src data 5727 __ lbu(byte0, Address(src, 0)); 5728 __ lbu(byte1, Address(src, 1)); 5729 __ lbu(byte2, Address(src, 2)); 5730 __ lbu(byte3, Address(src, 3)); 5731 __ addi(src, src, 4); 5732 5733 // get codec index and decode (ie. load from codec by index) 5734 __ add(byte0, codec, byte0); 5735 __ add(byte1, codec, byte1); 5736 __ lb(byte0, Address(byte0, 0)); 5737 __ lb(byte1, Address(byte1, 0)); 5738 __ add(byte2, codec, byte2); 5739 __ add(byte3, codec, byte3); 5740 __ lb(byte2, Address(byte2, 0)); 5741 __ lb(byte3, Address(byte3, 0)); 5742 __ slliw(byte0, byte0, 18); 5743 __ slliw(byte1, byte1, 12); 5744 __ orr(byte0, byte0, byte1); 5745 __ orr(byte0, byte0, byte3); 5746 __ slliw(byte2, byte2, 6); 5747 // For performance consideration, `combined32Bits` is constructed for 2 purposes at the same time, 5748 // 1. error check below 5749 // 2. 
decode below
5750 __ orr(combined32Bits, byte0, byte2);
5751
5752 // error check
5753 __ bltz(combined32Bits, Exit);
5754
5755 // store 3 bytes decoded data
5756 __ sraiw(byte0, combined32Bits, 16);
5757 __ sraiw(byte1, combined32Bits, 8);
5758 __ sb(byte0, Address(dst, 0));
5759 __ sb(byte1, Address(dst, 1));
5760 __ sb(combined32Bits, Address(dst, 2));
5761
5762 __ sub(length, length, 4);
5763 __ addi(dst, dst, 3);
5764 // loop back
5765 __ bnez(length, ScalarLoop);
5766 }
5767
5768 __ BIND(Exit);
5769 __ sub(c_rarg0, dst, dstBackup);
5770
5771 __ leave();
5772 __ ret();
5773
5774 return (address) start;
5775 }
5776
5777 void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
5778 VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
5779 Register temp0, Register temp1, Register temp2, Register temp3,
5780 VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {
5781
5782 assert((lmul == Assembler::m4 && step == 64) ||
5783 (lmul == Assembler::m2 && step == 32) ||
5784 (lmul == Assembler::m1 && step == 16),
5785 "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
5786 // Below is the function for calculating the Adler32 checksum with a 64-, 32- or 16-byte step. LMUL=m4, m2 or m1 is used accordingly.
5787 // The results are in v12, v13, ..., v22, v23. The example below is for the 64-byte step case.
5788 // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
5789 // In non-vectorized code, we update s1 and s2 as:
5790 // s1 <- s1 + b1
5791 // s2 <- s2 + s1
5792 // s1 <- s1 + b2
5793 // s2 <- s2 + s1
5794 // ...
5795 // s1 <- s1 + b64
5796 // s2 <- s2 + s1
5797 // Putting the above assignments together, we have:
5798 // s1_new = s1 + b1 + b2 + ... + b64
5799 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
5800 // = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
5801 // = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)
5802
5803 __ mv(temp3, step);
5804 // Load data
5805 __ vsetvli(temp0, temp3, Assembler::e8, lmul);
5806 __ vle8_v(vbytes, buff);
5807 __ addi(buff, buff, step);
5808
5809 // Upper bound reduction sum for s1_new:
5810 // 0xFF * 64 = 0x3FC0, so:
5811 // 1. Need to do vector-widening reduction sum
5812 // 2. It is safe to perform sign-extension during vmv.x.s with 16-bit elements
5813 __ vwredsumu_vs(vs1acc, vbytes, vzero);
5814 // Multiplication for s2_new
5815 __ vwmulu_vv(vs2acc, vtable, vbytes);
5816
5817 // s2 = s2 + s1 * step
5818 __ slli(temp1, s1, exact_log2(step));
5819 __ add(s2, s2, temp1);
5820
5821 // Summing up calculated results for s2_new
5822 if (MaxVectorSize > 16) {
5823 __ vsetvli(temp0, temp3, Assembler::e16, lmul);
5824 } else {
5825 // Half of the vector-widening multiplication result is in the successor of the vs2acc
5826 // group for vlen == 16, in which case we need to double the vector register
5827 // group width in order to reduction-sum all of them
5828 Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
5829 (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
5830 __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
5831 }
5832 // Upper bound for reduction sum:
5833 // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for the whole register group, so:
5834 // 1. Need to do vector-widening reduction sum
5835 // 2.
It is safe to perform sign-extension during vmv.x.s with 32-bits elements 5836 __ vwredsumu_vs(vtemp1, vs2acc, vzero); 5837 5838 // Extracting results for: 5839 // s1_new 5840 __ vmv_x_s(temp0, vs1acc); 5841 __ add(s1, s1, temp0); 5842 // s2_new 5843 __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1); 5844 __ vmv_x_s(temp1, vtemp1); 5845 __ add(s2, s2, temp1); 5846 } 5847 5848 /*** 5849 * int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len) 5850 * 5851 * Arguments: 5852 * 5853 * Inputs: 5854 * c_rarg0 - int adler 5855 * c_rarg1 - byte* buff (b + off) 5856 * c_rarg2 - int len 5857 * 5858 * Output: 5859 * c_rarg0 - int adler result 5860 */ 5861 address generate_updateBytesAdler32() { 5862 __ align(CodeEntryAlignment); 5863 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 5864 address start = __ pc(); 5865 5866 Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop, 5867 L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1; 5868 5869 // Aliases 5870 Register adler = c_rarg0; 5871 Register s1 = c_rarg0; 5872 Register s2 = c_rarg3; 5873 Register buff = c_rarg1; 5874 Register len = c_rarg2; 5875 Register nmax = c_rarg4; 5876 Register base = c_rarg5; 5877 Register count = c_rarg6; 5878 Register temp0 = t3; 5879 Register temp1 = t4; 5880 Register temp2 = t5; 5881 Register temp3 = t6; 5882 5883 VectorRegister vzero = v31; 5884 VectorRegister vbytes = v8; // group: v8, v9, v10, v11 5885 VectorRegister vs1acc = v12; // group: v12, v13, v14, v15 5886 VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23 5887 VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27 5888 VectorRegister vtable_32 = v4; // group: v4, v5 5889 VectorRegister vtable_16 = v30; 5890 VectorRegister vtemp1 = v28; 5891 VectorRegister vtemp2 = v29; 5892 5893 // Max number of bytes we can process before having to take the mod 5894 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 5895 const uint64_t BASE = 0xfff1; 5896 const uint64_t NMAX = 0x15B0; 5897 5898 // Loops steps 5899 int step_64 = 64; 5900 int step_32 = 32; 5901 int step_16 = 16; 5902 int step_1 = 1; 5903 5904 __ enter(); // Required for proper stackwalking of RuntimeStub frame 5905 __ mv(temp1, 64); 5906 __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4); 5907 5908 // Generating accumulation coefficients for further calculations 5909 // vtable_64: 5910 __ vid_v(vtemp1); 5911 __ vrsub_vx(vtable_64, vtemp1, temp1); 5912 // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 } 5913 5914 // vtable_32: 5915 __ mv(temp1, 32); 5916 __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2); 5917 __ vid_v(vtemp1); 5918 __ vrsub_vx(vtable_32, vtemp1, temp1); 5919 // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 } 5920 5921 __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1); 5922 // vtable_16: 5923 __ mv(temp1, 16); 5924 __ vid_v(vtemp1); 5925 __ vrsub_vx(vtable_16, vtemp1, temp1); 5926 // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 } 5927 5928 __ vmv_v_i(vzero, 0); 5929 5930 __ mv(base, BASE); 5931 __ mv(nmax, NMAX); 5932 5933 // s1 is initialized to the lower 16 bits of adler 5934 // s2 is initialized to the upper 16 bits of adler 5935 __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff) 5936 __ zero_extend(s1, adler, 16); // s1 = (adler & 0xffff) 5937 5938 // The pipelined loop needs at least 16 elements for 1 iteration 5939 // It does check this, but it is more effective to skip to the 
cleanup loop 5940 __ mv(temp0, step_16); 5941 __ bgeu(len, temp0, L_nmax); 5942 __ beqz(len, L_combine); 5943 5944 // Jumping to L_by1_loop 5945 __ sub(len, len, step_1); 5946 __ j(L_by1_loop); 5947 5948 __ bind(L_nmax); 5949 __ sub(len, len, nmax); 5950 __ sub(count, nmax, 16); 5951 __ bltz(len, L_by16); 5952 5953 // Align L_nmax loop by 64 5954 __ bind(L_nmax_loop_entry); 5955 __ sub(count, count, 32); 5956 5957 __ bind(L_nmax_loop); 5958 adler32_process_bytes(buff, s1, s2, vtable_64, vzero, 5959 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 5960 vtemp1, vtemp2, step_64, Assembler::m4); 5961 __ sub(count, count, step_64); 5962 __ bgtz(count, L_nmax_loop); 5963 5964 // There are three iterations left to do 5965 adler32_process_bytes(buff, s1, s2, vtable_32, vzero, 5966 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 5967 vtemp1, vtemp2, step_32, Assembler::m2); 5968 adler32_process_bytes(buff, s1, s2, vtable_16, vzero, 5969 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 5970 vtemp1, vtemp2, step_16, Assembler::m1); 5971 5972 // s1 = s1 % BASE 5973 __ remuw(s1, s1, base); 5974 // s2 = s2 % BASE 5975 __ remuw(s2, s2, base); 5976 5977 __ sub(len, len, nmax); 5978 __ sub(count, nmax, 16); 5979 __ bgez(len, L_nmax_loop_entry); 5980 5981 __ bind(L_by16); 5982 __ add(len, len, count); 5983 __ bltz(len, L_by1); 5984 // Trying to unroll 5985 __ mv(temp3, step_64); 5986 __ blt(len, temp3, L_by16_loop); 5987 5988 __ bind(L_by16_loop_unroll); 5989 adler32_process_bytes(buff, s1, s2, vtable_64, vzero, 5990 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 5991 vtemp1, vtemp2, step_64, Assembler::m4); 5992 __ sub(len, len, step_64); 5993 // By now the temp3 should still be 64 5994 __ bge(len, temp3, L_by16_loop_unroll); 5995 5996 __ bind(L_by16_loop); 5997 adler32_process_bytes(buff, s1, s2, vtable_16, vzero, 5998 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 5999 vtemp1, vtemp2, step_16, Assembler::m1); 6000 __ sub(len, len, step_16); 6001 __ bgez(len, L_by16_loop); 6002 6003 __ bind(L_by1); 6004 __ add(len, len, 15); 6005 __ bltz(len, L_do_mod); 6006 6007 __ bind(L_by1_loop); 6008 __ lbu(temp0, Address(buff, 0)); 6009 __ addi(buff, buff, step_1); 6010 __ add(s1, temp0, s1); 6011 __ add(s2, s2, s1); 6012 __ sub(len, len, step_1); 6013 __ bgez(len, L_by1_loop); 6014 6015 __ bind(L_do_mod); 6016 // s1 = s1 % BASE 6017 __ remuw(s1, s1, base); 6018 // s2 = s2 % BASE 6019 __ remuw(s2, s2, base); 6020 6021 // Combine lower bits and higher bits 6022 // adler = s1 | (s2 << 16) 6023 __ bind(L_combine); 6024 __ slli(s2, s2, 16); 6025 __ orr(s1, s1, s2); 6026 6027 __ leave(); // Required for proper stackwalking of RuntimeStub frame 6028 __ ret(); 6029 6030 return start; 6031 } 6032 6033 #endif // COMPILER2_OR_JVMCI 6034 6035 #ifdef COMPILER2 6036 6037 static const int64_t right_2_bits = right_n_bits(2); 6038 static const int64_t right_3_bits = right_n_bits(3); 6039 6040 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 6041 // are represented as long[5], with BITS_PER_LIMB = 26. 6042 // Pack five 26-bit limbs into three 64-bit registers. 
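//
// Editor's note (illustrative sketch, not part of the generated stub): in
// plain C, the packing performed by poly1305_pack_26 below is roughly
// (limbs[] holds the five 26-bit values, least significant limb first):
//
//   static void poly1305_pack_26_ref(uint64_t dest[3], const uint64_t limbs[5]) {
//     dest[0] = limbs[0] | (limbs[1] << 26) | (limbs[2] << 52);         // bits 0..63
//     dest[1] = (limbs[2] >> 12) | (limbs[3] << 14) | (limbs[4] << 40); // bits 64..127
//     dest[2] = limbs[4] >> 24;                                         // bits 128..129
//   }
//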
6043 void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) { 6044 assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2); 6045 6046 // The goal is to have 128-bit value in dest2:dest1:dest0 6047 __ ld(dest0, Address(src, 0)); // 26 bits in dest0 6048 6049 __ ld(tmp1, Address(src, sizeof(jlong))); 6050 __ slli(tmp1, tmp1, 26); 6051 __ add(dest0, dest0, tmp1); // 52 bits in dest0 6052 6053 __ ld(tmp2, Address(src, 2 * sizeof(jlong))); 6054 __ slli(tmp1, tmp2, 52); 6055 __ add(dest0, dest0, tmp1); // dest0 is full 6056 6057 __ srli(dest1, tmp2, 12); // 14-bit in dest1 6058 6059 __ ld(tmp1, Address(src, 3 * sizeof(jlong))); 6060 __ slli(tmp1, tmp1, 14); 6061 __ add(dest1, dest1, tmp1); // 40-bit in dest1 6062 6063 __ ld(tmp1, Address(src, 4 * sizeof(jlong))); 6064 __ slli(tmp2, tmp1, 40); 6065 __ add(dest1, dest1, tmp2); // dest1 is full 6066 6067 if (dest2->is_valid()) { 6068 __ srli(tmp1, tmp1, 24); 6069 __ mv(dest2, tmp1); // 2 bits in dest2 6070 } else { 6071 #ifdef ASSERT 6072 Label OK; 6073 __ srli(tmp1, tmp1, 24); 6074 __ beq(zr, tmp1, OK); // 2 bits 6075 __ stop("high bits of Poly1305 integer should be zero"); 6076 __ should_not_reach_here(); 6077 __ bind(OK); 6078 #endif 6079 } 6080 } 6081 6082 // As above, but return only a 128-bit integer, packed into two 6083 // 64-bit registers. 6084 void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) { 6085 poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2); 6086 } 6087 6088 // U_2:U_1:U_0: += (U_2 >> 2) * 5 6089 void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) { 6090 assert_different_registers(U_2, U_1, U_0, tmp1, tmp2); 6091 6092 // First, U_2:U_1:U_0 += (U_2 >> 2) 6093 __ srli(tmp1, U_2, 2); 6094 __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2 6095 __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits 6096 __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2 6097 __ add(U_2, U_2, tmp2); 6098 6099 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 6100 __ slli(tmp1, tmp1, 2); 6101 __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2 6102 __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2 6103 __ add(U_2, U_2, tmp2); 6104 } 6105 6106 // Poly1305, RFC 7539 6107 // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs) 6108 6109 // Arguments: 6110 // c_rarg0: input_start -- where the input is stored 6111 // c_rarg1: length 6112 // c_rarg2: acc_start -- where the output will be stored 6113 // c_rarg3: r_start -- where the randomly generated 128-bit key is stored 6114 6115 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 6116 // description of the tricks used to simplify and accelerate this 6117 // computation. 
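//
// Editor's note (illustrative, not part of the generated stub): in RFC 7539
// terms, each iteration of the loop below computes
//
//   acc = ((acc + block + 2^128) * r) mod (2^130 - 5)
//
// where "block" is the next 16 input bytes read as a little-endian integer.
// The stub keeps acc in U_2:U_1:U_0 and r packed in R_1:R_0; the "+ 2^128"
// is the addi(S_2, S_2, 1) below, and poly1305_reduce folds the bits at or
// above 2^130 back in using the identity 2^130 == 5 (mod 2^130 - 5).
//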
6118 6119 address generate_poly1305_processBlocks() { 6120 __ align(CodeEntryAlignment); 6121 StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); 6122 address start = __ pc(); 6123 __ enter(); 6124 Label here; 6125 6126 RegSet saved_regs = RegSet::range(x18, x21); 6127 RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin(); 6128 __ push_reg(saved_regs, sp); 6129 6130 // Arguments 6131 const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3; 6132 6133 // R_n is the 128-bit randomly-generated key, packed into two 6134 // registers. The caller passes this key to us as long[5], with 6135 // BITS_PER_LIMB = 26. 6136 const Register R_0 = *regs, R_1 = *++regs; 6137 poly1305_pack_26(R_0, R_1, r_start, t1, t2); 6138 6139 // RR_n is (R_n >> 2) * 5 6140 const Register RR_0 = *++regs, RR_1 = *++regs; 6141 __ srli(t1, R_0, 2); 6142 __ shadd(RR_0, t1, t1, t2, 2); 6143 __ srli(t1, R_1, 2); 6144 __ shadd(RR_1, t1, t1, t2, 2); 6145 6146 // U_n is the current checksum 6147 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 6148 poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2); 6149 6150 static constexpr int BLOCK_LENGTH = 16; 6151 Label DONE, LOOP; 6152 6153 __ mv(t1, BLOCK_LENGTH); 6154 __ blt(length, t1, DONE); { 6155 __ bind(LOOP); 6156 6157 // S_n is to be the sum of U_n and the next block of data 6158 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 6159 __ ld(S_0, Address(input_start, 0)); 6160 __ ld(S_1, Address(input_start, wordSize)); 6161 6162 __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1 6163 __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1 6164 __ add(S_2, U_2, t1); 6165 6166 __ addi(S_2, S_2, 1); 6167 6168 const Register U_0HI = *++regs, U_1HI = *++regs; 6169 6170 // NB: this logic depends on some of the special properties of 6171 // Poly1305 keys. In particular, because we know that the top 6172 // four bits of R_0 and R_1 are zero, we can add together 6173 // partial products without any risk of needing to propagate a 6174 // carry out. 6175 __ wide_mul(U_0, U_0HI, S_0, R_0); 6176 __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2); 6177 __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2); 6178 6179 __ wide_mul(U_1, U_1HI, S_0, R_1); 6180 __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2); 6181 __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2); 6182 6183 __ andi(U_2, R_0, right_2_bits); 6184 __ mul(U_2, S_2, U_2); 6185 6186 // Partial reduction mod 2**130 - 5 6187 __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1 6188 __ adc(U_2, U_2, U_1HI, t1); 6189 // Sum is now in U_2:U_1:U_0. 6190 6191 // U_2:U_1:U_0: += (U_2 >> 2) * 5 6192 poly1305_reduce(U_2, U_1, U_0, t1, t2); 6193 6194 __ sub(length, length, BLOCK_LENGTH); 6195 __ addi(input_start, input_start, BLOCK_LENGTH); 6196 __ mv(t1, BLOCK_LENGTH); 6197 __ bge(length, t1, LOOP); 6198 } 6199 6200 // Further reduce modulo 2^130 - 5 6201 poly1305_reduce(U_2, U_1, U_0, t1, t2); 6202 6203 // Unpack the sum into five 26-bit limbs and write to memory. 
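//
// Editor's note (illustrative sketch, not part of the generated stub): this
// is the inverse of poly1305_pack_26 above; in plain C the unpacking of
// U_2:U_1:U_0 (here u2:u1:u0) into the Java long[5] limbs is roughly:
//
//   static void poly1305_unpack_26_ref(uint64_t limbs[5],
//                                      uint64_t u0, uint64_t u1, uint64_t u2) {
//     limbs[0] = u0 & 0x3ffffff;                     // bits 0..25
//     limbs[1] = (u0 >> 26) & 0x3ffffff;             // bits 26..51
//     limbs[2] = (u0 >> 52) | ((u1 & 0x3fff) << 12); // bits 52..77
//     limbs[3] = (u1 >> 14) & 0x3ffffff;             // bits 78..103
//     limbs[4] = (u1 >> 40) | ((u2 & 0x7) << 24);    // bits 104 and up (partially reduced top limb)
//   }
//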
6204 // First 26 bits is the first limb 6205 __ slli(t1, U_0, 38); // Take lowest 26 bits 6206 __ srli(t1, t1, 38); 6207 __ sd(t1, Address(acc_start)); // First 26-bit limb 6208 6209 // 27-52 bits of U_0 is the second limb 6210 __ slli(t1, U_0, 12); // Take next 27-52 bits 6211 __ srli(t1, t1, 38); 6212 __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb 6213 6214 // Getting 53-64 bits of U_0 and 1-14 bits of U_1 in one register 6215 __ srli(t1, U_0, 52); 6216 __ slli(t2, U_1, 50); 6217 __ srli(t2, t2, 38); 6218 __ add(t1, t1, t2); 6219 __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb 6220 6221 // Storing 15-40 bits of U_1 6222 __ slli(t1, U_1, 24); // Already used up 14 bits 6223 __ srli(t1, t1, 38); // Clear all other bits from t1 6224 __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb 6225 6226 // Storing 41-64 bits of U_1 and first three bits from U_2 in one register 6227 __ srli(t1, U_1, 40); 6228 __ andi(t2, U_2, right_3_bits); 6229 __ slli(t2, t2, 24); 6230 __ add(t1, t1, t2); 6231 __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb 6232 6233 __ bind(DONE); 6234 __ pop_reg(saved_regs, sp); 6235 __ leave(); // Required for proper stackwalking 6236 __ ret(); 6237 6238 return start; 6239 } 6240 6241 void generate_vector_math_stubs() { 6242 if (!UseRVV) { 6243 log_info(library)("vector is not supported, skip loading vector math (sleef) library!"); 6244 return; 6245 } 6246 6247 // Get native vector math stub routine addresses 6248 void* libsleef = nullptr; 6249 char ebuf[1024]; 6250 char dll_name[JVM_MAXPATHLEN]; 6251 if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) { 6252 libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf); 6253 } 6254 if (libsleef == nullptr) { 6255 log_info(library)("Failed to load native vector math (sleef) library, %s!", ebuf); 6256 return; 6257 } 6258 6259 // Method naming convention 6260 // All the methods are named as <OP><T>_<U><suffix> 6261 // 6262 // Where: 6263 // <OP> is the operation name, e.g. sin, cos 6264 // <T> is to indicate float/double 6265 // "fx/dx" for vector float/double operation 6266 // <U> is the precision level 6267 // "u10/u05" represents 1.0/0.5 ULP error bounds 6268 // We use "u10" for all operations by default 6269 // But for those functions do not have u10 support, we use "u05" instead 6270 // <suffix> rvv, indicates riscv vector extension 6271 // 6272 // e.g. sinfx_u10rvv is the method for computing vector float sin using rvv instructions 6273 // 6274 log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef)); 6275 6276 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 6277 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 6278 if (vop == VectorSupport::VECTOR_OP_TANH) { // skip tanh because of performance regression 6279 continue; 6280 } 6281 6282 // The native library does not support u10 level of "hypot". 6283 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? 
"u05" : "u10"; 6284 6285 snprintf(ebuf, sizeof(ebuf), "%sfx_%srvv", VectorSupport::mathname[op], ulf); 6286 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 6287 6288 snprintf(ebuf, sizeof(ebuf), "%sdx_%srvv", VectorSupport::mathname[op], ulf); 6289 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 6290 } 6291 } 6292 6293 #endif // COMPILER2 6294 6295 /** 6296 * Arguments: 6297 * 6298 * Inputs: 6299 * c_rarg0 - int crc 6300 * c_rarg1 - byte* buf 6301 * c_rarg2 - int length 6302 * 6303 * Output: 6304 * c_rarg0 - int crc result 6305 */ 6306 address generate_updateBytesCRC32() { 6307 assert(UseCRC32Intrinsics, "what are we doing here?"); 6308 6309 __ align(CodeEntryAlignment); 6310 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 6311 6312 address start = __ pc(); 6313 6314 // input parameters 6315 const Register crc = c_rarg0; // crc 6316 const Register buf = c_rarg1; // source java byte array address 6317 const Register len = c_rarg2; // length 6318 6319 BLOCK_COMMENT("Entry:"); 6320 __ enter(); // required for proper stackwalking of RuntimeStub frame 6321 6322 __ kernel_crc32(crc, buf, len, 6323 c_rarg3, c_rarg4, c_rarg5, c_rarg6, // tmp's for tables 6324 c_rarg7, t2, t3, t4, t5, t6); // misc tmps 6325 6326 __ leave(); // required for proper stackwalking of RuntimeStub frame 6327 __ ret(); 6328 6329 return start; 6330 } 6331 6332 // exception handler for upcall stubs 6333 address generate_upcall_stub_exception_handler() { 6334 StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler"); 6335 address start = __ pc(); 6336 6337 // Native caller has no idea how to handle exceptions, 6338 // so we just crash here. Up to callee to catch exceptions. 6339 __ verify_oop(x10); // return a exception oop in a0 6340 __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception)); 6341 __ should_not_reach_here(); 6342 6343 return start; 6344 } 6345 6346 // load Method* target of MethodHandle 6347 // j_rarg0 = jobject receiver 6348 // xmethod = Method* result 6349 address generate_upcall_stub_load_target() { 6350 6351 StubCodeMark mark(this, "StubRoutines", "upcall_stub_load_target"); 6352 address start = __ pc(); 6353 6354 __ resolve_global_jobject(j_rarg0, t0, t1); 6355 // Load target method from receiver 6356 __ load_heap_oop(xmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), t0, t1); 6357 __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_LambdaForm::vmentry_offset()), t0, t1); 6358 __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_MemberName::method_offset()), t0, t1); 6359 __ access_load_at(T_ADDRESS, IN_HEAP, xmethod, 6360 Address(xmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 6361 noreg, noreg); 6362 __ sd(xmethod, Address(xthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 6363 6364 __ ret(); 6365 6366 return start; 6367 } 6368 6369 #undef __ 6370 6371 // Initialization 6372 void generate_initial_stubs() { 6373 // Generate initial stubs and initializes the entry points 6374 6375 // entry points that exist in all platforms Note: This is code 6376 // that could be shared among different platforms - however the 6377 // benefit seems to be smaller than the disadvantage of having a 6378 // much more complicated generator structure. See also comment in 6379 // stubRoutines.hpp. 
6380 6381 StubRoutines::_forward_exception_entry = generate_forward_exception(); 6382 6383 if (UnsafeMemoryAccess::_table == nullptr) { 6384 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory 6385 } 6386 6387 StubRoutines::_call_stub_entry = 6388 generate_call_stub(StubRoutines::_call_stub_return_address); 6389 6390 // is referenced by megamorphic call 6391 StubRoutines::_catch_exception_entry = generate_catch_exception(); 6392 6393 if (UseCRC32Intrinsics) { 6394 // set table address before stub generation which use it 6395 StubRoutines::_crc_table_adr = (address)StubRoutines::riscv::_crc_table; 6396 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 6397 } 6398 } 6399 6400 void generate_continuation_stubs() { 6401 // Continuation stubs: 6402 StubRoutines::_cont_thaw = generate_cont_thaw(); 6403 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 6404 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 6405 } 6406 6407 void generate_final_stubs() { 6408 // support for verify_oop (must happen after universe_init) 6409 if (VerifyOops) { 6410 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 6411 } 6412 6413 // arraycopy stubs used by compilers 6414 generate_arraycopy_stubs(); 6415 6416 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 6417 if (bs_nm != nullptr) { 6418 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 6419 } 6420 6421 #ifdef COMPILER2 6422 if (UseSecondarySupersTable) { 6423 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub(); 6424 if (!InlineSecondarySupersTest) { 6425 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 6426 StubRoutines::_lookup_secondary_supers_table_stubs[slot] 6427 = generate_lookup_secondary_supers_table_stub(slot); 6428 } 6429 } 6430 } 6431 #endif // COMPILER2 6432 6433 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 6434 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 6435 6436 StubRoutines::riscv::set_completed(); 6437 } 6438 6439 void generate_compiler_stubs() { 6440 #ifdef COMPILER2 6441 if (UseMulAddIntrinsic) { 6442 StubRoutines::_mulAdd = generate_mulAdd(); 6443 } 6444 6445 if (UseMultiplyToLenIntrinsic) { 6446 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 6447 } 6448 6449 if (UseSquareToLenIntrinsic) { 6450 StubRoutines::_squareToLen = generate_squareToLen(); 6451 } 6452 6453 if (UseMontgomeryMultiplyIntrinsic) { 6454 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 6455 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 6456 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 6457 } 6458 6459 if (UseMontgomerySquareIntrinsic) { 6460 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 6461 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 6462 StubRoutines::_montgomerySquare = g.generate_square(); 6463 } 6464 6465 if (UseAESIntrinsics) { 6466 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 6467 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 6468 } 6469 6470 if (UsePoly1305Intrinsics) { 6471 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 6472 } 6473 6474 if (UseRVVForBigIntegerShiftIntrinsics) { 6475 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 6476 
StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 6477 } 6478 6479 if (UseSHA256Intrinsics) { 6480 Sha2Generator sha2(_masm, this); 6481 StubRoutines::_sha256_implCompress = sha2.generate_sha256_implCompress(false); 6482 StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(true); 6483 } 6484 6485 if (UseSHA512Intrinsics) { 6486 Sha2Generator sha2(_masm, this); 6487 StubRoutines::_sha512_implCompress = sha2.generate_sha512_implCompress(false); 6488 StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(true); 6489 } 6490 6491 if (UseMD5Intrinsics) { 6492 StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress"); 6493 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB"); 6494 } 6495 6496 if (UseChaCha20Intrinsics) { 6497 StubRoutines::_chacha20Block = generate_chacha20Block(); 6498 } 6499 6500 if (UseSHA1Intrinsics) { 6501 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 6502 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 6503 } 6504 6505 if (UseBASE64Intrinsics) { 6506 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 6507 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 6508 } 6509 6510 if (UseAdler32Intrinsics) { 6511 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 6512 } 6513 6514 generate_compare_long_strings(); 6515 6516 generate_string_indexof_stubs(); 6517 6518 generate_vector_math_stubs(); 6519 6520 #endif // COMPILER2 6521 } 6522 6523 public: 6524 StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) { 6525 switch(kind) { 6526 case Initial_stubs: 6527 generate_initial_stubs(); 6528 break; 6529 case Continuation_stubs: 6530 generate_continuation_stubs(); 6531 break; 6532 case Compiler_stubs: 6533 generate_compiler_stubs(); 6534 break; 6535 case Final_stubs: 6536 generate_final_stubs(); 6537 break; 6538 default: 6539 fatal("unexpected stubs kind: %d", kind); 6540 break; 6541 }; 6542 } 6543 }; // end class declaration 6544 6545 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) { 6546 StubGenerator g(code, kind); 6547 } --- EOF ---