/*
 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
 * Copyright (c) 2020, 2025, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_riscv.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
56 // For a more detailed description of the stub routine structure 57 // see the comment in stubRoutines.hpp 58 59 #undef __ 60 #define __ _masm-> 61 62 #ifdef PRODUCT 63 #define BLOCK_COMMENT(str) /* nothing */ 64 #else 65 #define BLOCK_COMMENT(str) __ block_comment(str) 66 #endif 67 68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 69 70 // Stub Code definitions 71 72 class StubGenerator: public StubCodeGenerator { 73 private: 74 75 #ifdef PRODUCT 76 #define inc_counter_np(counter) ((void)0) 77 #else 78 void inc_counter_np_(uint& counter) { 79 __ incrementw(ExternalAddress((address)&counter)); 80 } 81 #define inc_counter_np(counter) \ 82 BLOCK_COMMENT("inc_counter " #counter); \ 83 inc_counter_np_(counter); 84 #endif 85 86 // Call stubs are used to call Java from C 87 // 88 // Arguments: 89 // c_rarg0: call wrapper address address 90 // c_rarg1: result address 91 // c_rarg2: result type BasicType 92 // c_rarg3: method Method* 93 // c_rarg4: (interpreter) entry point address 94 // c_rarg5: parameters intptr_t* 95 // c_rarg6: parameter size (in words) int 96 // c_rarg7: thread Thread* 97 // 98 // There is no return from the stub itself as any Java result 99 // is written to result 100 // 101 // we save x1 (ra) as the return PC at the base of the frame and 102 // link x8 (fp) below it as the frame pointer installing sp (x2) 103 // into fp. 104 // 105 // we save x10-x17, which accounts for all the c arguments. 106 // 107 // TODO: strictly do we need to save them all? they are treated as 108 // volatile by C so could we omit saving the ones we are going to 109 // place in global registers (thread? method?) or those we only use 110 // during setup of the Java call? 111 // 112 // we don't need to save x5 which C uses as an indirect result location 113 // return register. 114 // 115 // we don't need to save x6-x7 and x28-x31 which both C and Java treat as 116 // volatile 117 // 118 // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary 119 // registers and C expects to be callee-save 120 // 121 // so the stub frame looks like this when we enter Java code 122 // 123 // [ return_from_Java ] <--- sp 124 // [ argument word n ] 125 // ... 
126 // -35 [ argument word 1 ] 127 // -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call 128 // -33 [ saved f27 ] 129 // -32 [ saved f26 ] 130 // -31 [ saved f25 ] 131 // -30 [ saved f24 ] 132 // -29 [ saved f23 ] 133 // -28 [ saved f22 ] 134 // -27 [ saved f21 ] 135 // -26 [ saved f20 ] 136 // -25 [ saved f19 ] 137 // -24 [ saved f18 ] 138 // -23 [ saved f9 ] 139 // -22 [ saved f8 ] 140 // -21 [ saved x27 ] 141 // -20 [ saved x26 ] 142 // -19 [ saved x25 ] 143 // -18 [ saved x24 ] 144 // -17 [ saved x23 ] 145 // -16 [ saved x22 ] 146 // -15 [ saved x21 ] 147 // -14 [ saved x20 ] 148 // -13 [ saved x19 ] 149 // -12 [ saved x18 ] 150 // -11 [ saved x9 ] 151 // -10 [ call wrapper (x10) ] 152 // -9 [ result (x11) ] 153 // -8 [ result type (x12) ] 154 // -7 [ method (x13) ] 155 // -6 [ entry point (x14) ] 156 // -5 [ parameters (x15) ] 157 // -4 [ parameter size (x16) ] 158 // -3 [ thread (x17) ] 159 // -2 [ saved fp (x8) ] 160 // -1 [ saved ra (x1) ] 161 // 0 [ ] <--- fp == saved sp (x2) 162 163 // Call stub stack layout word offsets from fp 164 enum call_stub_layout { 165 sp_after_call_off = -34, 166 167 frm_off = sp_after_call_off, 168 f27_off = -33, 169 f26_off = -32, 170 f25_off = -31, 171 f24_off = -30, 172 f23_off = -29, 173 f22_off = -28, 174 f21_off = -27, 175 f20_off = -26, 176 f19_off = -25, 177 f18_off = -24, 178 f9_off = -23, 179 f8_off = -22, 180 181 x27_off = -21, 182 x26_off = -20, 183 x25_off = -19, 184 x24_off = -18, 185 x23_off = -17, 186 x22_off = -16, 187 x21_off = -15, 188 x20_off = -14, 189 x19_off = -13, 190 x18_off = -12, 191 x9_off = -11, 192 193 call_wrapper_off = -10, 194 result_off = -9, 195 result_type_off = -8, 196 method_off = -7, 197 entry_point_off = -6, 198 parameters_off = -5, 199 parameter_size_off = -4, 200 thread_off = -3, 201 fp_f = -2, 202 retaddr_off = -1, 203 }; 204 205 address generate_call_stub(address& return_address) { 206 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 207 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 208 "adjust this code"); 209 210 StubGenStubId stub_id = StubGenStubId::call_stub_id; 211 StubCodeMark mark(this, stub_id); 212 address start = __ pc(); 213 214 const Address sp_after_call (fp, sp_after_call_off * wordSize); 215 216 const Address frm_save (fp, frm_off * wordSize); 217 const Address call_wrapper (fp, call_wrapper_off * wordSize); 218 const Address result (fp, result_off * wordSize); 219 const Address result_type (fp, result_type_off * wordSize); 220 const Address method (fp, method_off * wordSize); 221 const Address entry_point (fp, entry_point_off * wordSize); 222 const Address parameters (fp, parameters_off * wordSize); 223 const Address parameter_size(fp, parameter_size_off * wordSize); 224 225 const Address thread (fp, thread_off * wordSize); 226 227 const Address f27_save (fp, f27_off * wordSize); 228 const Address f26_save (fp, f26_off * wordSize); 229 const Address f25_save (fp, f25_off * wordSize); 230 const Address f24_save (fp, f24_off * wordSize); 231 const Address f23_save (fp, f23_off * wordSize); 232 const Address f22_save (fp, f22_off * wordSize); 233 const Address f21_save (fp, f21_off * wordSize); 234 const Address f20_save (fp, f20_off * wordSize); 235 const Address f19_save (fp, f19_off * wordSize); 236 const Address f18_save (fp, f18_off * wordSize); 237 const Address f9_save (fp, f9_off * wordSize); 238 const Address f8_save (fp, f8_off * wordSize); 239 240 const Address x27_save (fp, x27_off * wordSize); 
241 const Address x26_save (fp, x26_off * wordSize); 242 const Address x25_save (fp, x25_off * wordSize); 243 const Address x24_save (fp, x24_off * wordSize); 244 const Address x23_save (fp, x23_off * wordSize); 245 const Address x22_save (fp, x22_off * wordSize); 246 const Address x21_save (fp, x21_off * wordSize); 247 const Address x20_save (fp, x20_off * wordSize); 248 const Address x19_save (fp, x19_off * wordSize); 249 const Address x18_save (fp, x18_off * wordSize); 250 251 const Address x9_save (fp, x9_off * wordSize); 252 253 // stub code 254 255 address riscv_entry = __ pc(); 256 257 // set up frame and move sp to end of save area 258 __ enter(); 259 __ addi(sp, fp, sp_after_call_off * wordSize); 260 261 // save register parameters and Java temporary/global registers 262 // n.b. we save thread even though it gets installed in 263 // xthread because we want to sanity check tp later 264 __ sd(c_rarg7, thread); 265 __ sw(c_rarg6, parameter_size); 266 __ sd(c_rarg5, parameters); 267 __ sd(c_rarg4, entry_point); 268 __ sd(c_rarg3, method); 269 __ sd(c_rarg2, result_type); 270 __ sd(c_rarg1, result); 271 __ sd(c_rarg0, call_wrapper); 272 273 __ sd(x9, x9_save); 274 275 __ sd(x18, x18_save); 276 __ sd(x19, x19_save); 277 __ sd(x20, x20_save); 278 __ sd(x21, x21_save); 279 __ sd(x22, x22_save); 280 __ sd(x23, x23_save); 281 __ sd(x24, x24_save); 282 __ sd(x25, x25_save); 283 __ sd(x26, x26_save); 284 __ sd(x27, x27_save); 285 286 __ fsd(f8, f8_save); 287 __ fsd(f9, f9_save); 288 __ fsd(f18, f18_save); 289 __ fsd(f19, f19_save); 290 __ fsd(f20, f20_save); 291 __ fsd(f21, f21_save); 292 __ fsd(f22, f22_save); 293 __ fsd(f23, f23_save); 294 __ fsd(f24, f24_save); 295 __ fsd(f25, f25_save); 296 __ fsd(f26, f26_save); 297 __ fsd(f27, f27_save); 298 299 __ frrm(t0); 300 __ sd(t0, frm_save); 301 // Set frm to the state we need. We do want Round to Nearest. We 302 // don't want non-IEEE rounding modes. 
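    // The frm field encodings are: 0b000 RNE (round to nearest, ties to
    // even), 0b001 RTZ, 0b010 RDN, 0b011 RUP, 0b100 RMM. Since RNE is
    // encoding 0, the saved frm value (still in t0) is zero exactly when
    // the caller already runs with RNE, so the check below only executes
    // fsrmi when a change is actually needed.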
303 Label skip_fsrmi; 304 guarantee(__ RoundingMode::rne == 0, "must be"); 305 __ beqz(t0, skip_fsrmi); 306 __ fsrmi(__ RoundingMode::rne); 307 __ bind(skip_fsrmi); 308 309 // install Java thread in global register now we have saved 310 // whatever value it held 311 __ mv(xthread, c_rarg7); 312 313 // And method 314 __ mv(xmethod, c_rarg3); 315 316 // set up the heapbase register 317 __ reinit_heapbase(); 318 319 #ifdef ASSERT 320 // make sure we have no pending exceptions 321 { 322 Label L; 323 __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset()))); 324 __ beqz(t0, L); 325 __ stop("StubRoutines::call_stub: entered with pending exception"); 326 __ BIND(L); 327 } 328 #endif 329 // pass parameters if any 330 __ mv(esp, sp); 331 __ slli(t0, c_rarg6, LogBytesPerWord); 332 __ sub(t0, sp, t0); // Move SP out of the way 333 __ andi(sp, t0, -2 * wordSize); 334 335 BLOCK_COMMENT("pass parameters if any"); 336 Label parameters_done; 337 // parameter count is still in c_rarg6 338 // and parameter pointer identifying param 1 is in c_rarg5 339 __ beqz(c_rarg6, parameters_done); 340 341 address loop = __ pc(); 342 __ ld(t0, Address(c_rarg5, 0)); 343 __ addi(c_rarg5, c_rarg5, wordSize); 344 __ subi(c_rarg6, c_rarg6, 1); 345 __ push_reg(t0); 346 __ bgtz(c_rarg6, loop); 347 348 __ BIND(parameters_done); 349 350 // call Java entry -- passing methdoOop, and current sp 351 // xmethod: Method* 352 // x19_sender_sp: sender sp 353 BLOCK_COMMENT("call Java function"); 354 __ mv(x19_sender_sp, sp); 355 __ jalr(c_rarg4); 356 357 // save current address for use by exception handling code 358 359 return_address = __ pc(); 360 361 // store result depending on type (everything that is not 362 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 363 // n.b. 
this assumes Java returns an integral result in x10 364 // and a floating result in j_farg0 365 __ ld(j_rarg2, result); 366 Label is_long, is_float, is_double, exit; 367 __ ld(j_rarg1, result_type); 368 __ mv(t0, (u1)T_OBJECT); 369 __ beq(j_rarg1, t0, is_long); 370 __ mv(t0, (u1)T_LONG); 371 __ beq(j_rarg1, t0, is_long); 372 __ mv(t0, (u1)T_FLOAT); 373 __ beq(j_rarg1, t0, is_float); 374 __ mv(t0, (u1)T_DOUBLE); 375 __ beq(j_rarg1, t0, is_double); 376 377 // handle T_INT case 378 __ sw(x10, Address(j_rarg2)); 379 380 __ BIND(exit); 381 382 // pop parameters 383 __ addi(esp, fp, sp_after_call_off * wordSize); 384 385 #ifdef ASSERT 386 // verify that threads correspond 387 { 388 Label L, S; 389 __ ld(t0, thread); 390 __ bne(xthread, t0, S); 391 __ get_thread(t0); 392 __ beq(xthread, t0, L); 393 __ BIND(S); 394 __ stop("StubRoutines::call_stub: threads must correspond"); 395 __ BIND(L); 396 } 397 #endif 398 399 __ pop_cont_fastpath(xthread); 400 401 // restore callee-save registers 402 __ fld(f27, f27_save); 403 __ fld(f26, f26_save); 404 __ fld(f25, f25_save); 405 __ fld(f24, f24_save); 406 __ fld(f23, f23_save); 407 __ fld(f22, f22_save); 408 __ fld(f21, f21_save); 409 __ fld(f20, f20_save); 410 __ fld(f19, f19_save); 411 __ fld(f18, f18_save); 412 __ fld(f9, f9_save); 413 __ fld(f8, f8_save); 414 415 __ ld(x27, x27_save); 416 __ ld(x26, x26_save); 417 __ ld(x25, x25_save); 418 __ ld(x24, x24_save); 419 __ ld(x23, x23_save); 420 __ ld(x22, x22_save); 421 __ ld(x21, x21_save); 422 __ ld(x20, x20_save); 423 __ ld(x19, x19_save); 424 __ ld(x18, x18_save); 425 426 __ ld(x9, x9_save); 427 428 // restore frm 429 Label skip_fsrm; 430 __ ld(t0, frm_save); 431 __ frrm(t1); 432 __ beq(t0, t1, skip_fsrm); 433 __ fsrm(t0); 434 __ bind(skip_fsrm); 435 436 __ ld(c_rarg0, call_wrapper); 437 __ ld(c_rarg1, result); 438 __ ld(c_rarg2, result_type); 439 __ ld(c_rarg3, method); 440 __ ld(c_rarg4, entry_point); 441 __ ld(c_rarg5, parameters); 442 __ ld(c_rarg6, parameter_size); 443 __ ld(c_rarg7, thread); 444 445 // leave frame and return to caller 446 __ leave(); 447 __ ret(); 448 449 // handle return types different from T_INT 450 451 __ BIND(is_long); 452 __ sd(x10, Address(j_rarg2, 0)); 453 __ j(exit); 454 455 __ BIND(is_float); 456 __ fsw(j_farg0, Address(j_rarg2, 0), t0); 457 __ j(exit); 458 459 __ BIND(is_double); 460 __ fsd(j_farg0, Address(j_rarg2, 0), t0); 461 __ j(exit); 462 463 return start; 464 } 465 466 // Return point for a Java call if there's an exception thrown in 467 // Java code. The exception is caught and transformed into a 468 // pending exception stored in JavaThread that can be tested from 469 // within the VM. 470 // 471 // Note: Usually the parameters are removed by the callee. In case 472 // of an exception crossing an activation frame boundary, that is 473 // not the case if the callee is compiled code => need to setup the 474 // sp. 
475 // 476 // x10: exception oop 477 478 address generate_catch_exception() { 479 StubGenStubId stub_id = StubGenStubId::catch_exception_id; 480 StubCodeMark mark(this, stub_id); 481 address start = __ pc(); 482 483 // same as in generate_call_stub(): 484 const Address thread(fp, thread_off * wordSize); 485 486 #ifdef ASSERT 487 // verify that threads correspond 488 { 489 Label L, S; 490 __ ld(t0, thread); 491 __ bne(xthread, t0, S); 492 __ get_thread(t0); 493 __ beq(xthread, t0, L); 494 __ bind(S); 495 __ stop("StubRoutines::catch_exception: threads must correspond"); 496 __ bind(L); 497 } 498 #endif 499 500 // set pending exception 501 __ verify_oop(x10); 502 503 __ sd(x10, Address(xthread, Thread::pending_exception_offset())); 504 __ mv(t0, (address)__FILE__); 505 __ sd(t0, Address(xthread, Thread::exception_file_offset())); 506 __ mv(t0, (int)__LINE__); 507 __ sw(t0, Address(xthread, Thread::exception_line_offset())); 508 509 // complete return to VM 510 assert(StubRoutines::_call_stub_return_address != nullptr, 511 "_call_stub_return_address must have been generated before"); 512 __ j(RuntimeAddress(StubRoutines::_call_stub_return_address)); 513 514 return start; 515 } 516 517 // Continuation point for runtime calls returning with a pending 518 // exception. The pending exception check happened in the runtime 519 // or native call stub. The pending exception in Thread is 520 // converted into a Java-level exception. 521 // 522 // Contract with Java-level exception handlers: 523 // x10: exception 524 // x13: throwing pc 525 // 526 // NOTE: At entry of this stub, exception-pc must be in RA !! 527 528 // NOTE: this is always used as a jump target within generated code 529 // so it just needs to be generated code with no x86 prolog 530 531 address generate_forward_exception() { 532 StubGenStubId stub_id = StubGenStubId::forward_exception_id; 533 StubCodeMark mark(this, stub_id); 534 address start = __ pc(); 535 536 // Upon entry, RA points to the return address returning into 537 // Java (interpreted or compiled) code; i.e., the return address 538 // becomes the throwing pc. 539 // 540 // Arguments pushed before the runtime call are still on the stack 541 // but the exception handler will reset the stack pointer -> 542 // ignore them. A potential result in registers can be ignored as 543 // well. 544 545 #ifdef ASSERT 546 // make sure this code is only executed if there is a pending exception 547 { 548 Label L; 549 __ ld(t0, Address(xthread, Thread::pending_exception_offset())); 550 __ bnez(t0, L); 551 __ stop("StubRoutines::forward exception: no pending exception (1)"); 552 __ bind(L); 553 } 554 #endif 555 556 // compute exception handler into x9 557 558 // call the VM to find the handler address associated with the 559 // caller address. pass thread in x10 and caller pc (ret address) 560 // in x11. n.b. the caller pc is in ra, unlike x86 where it is on 561 // the stack. 562 __ mv(c_rarg1, ra); 563 // ra will be trashed by the VM call so we move it to x9 564 // (callee-saved) because we also need to pass it to the handler 565 // returned by this call. 566 __ mv(x9, ra); 567 BLOCK_COMMENT("call exception_handler_for_return_address"); 568 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 569 SharedRuntime::exception_handler_for_return_address), 570 xthread, c_rarg1); 571 // we should not really care that ra is no longer the callee 572 // address. we saved the value the handler needs in x9 so we can 573 // just copy it to x13. 
however, the C2 handler will push its own 574 // frame and then calls into the VM and the VM code asserts that 575 // the PC for the frame above the handler belongs to a compiled 576 // Java method. So, we restore ra here to satisfy that assert. 577 __ mv(ra, x9); 578 // setup x10 & x13 & clear pending exception 579 __ mv(x13, x9); 580 __ mv(x9, x10); 581 __ ld(x10, Address(xthread, Thread::pending_exception_offset())); 582 __ sd(zr, Address(xthread, Thread::pending_exception_offset())); 583 584 #ifdef ASSERT 585 // make sure exception is set 586 { 587 Label L; 588 __ bnez(x10, L); 589 __ stop("StubRoutines::forward exception: no pending exception (2)"); 590 __ bind(L); 591 } 592 #endif 593 594 // continue at exception handler 595 // x10: exception 596 // x13: throwing pc 597 // x9: exception handler 598 __ verify_oop(x10); 599 __ jr(x9); 600 601 return start; 602 } 603 604 // Non-destructive plausibility checks for oops 605 // 606 // Arguments: 607 // x10: oop to verify 608 // t0: error message 609 // 610 // Stack after saving c_rarg3: 611 // [tos + 0]: saved c_rarg3 612 // [tos + 1]: saved c_rarg2 613 // [tos + 2]: saved ra 614 // [tos + 3]: saved t1 615 // [tos + 4]: saved x10 616 // [tos + 5]: saved t0 617 address generate_verify_oop() { 618 619 StubGenStubId stub_id = StubGenStubId::verify_oop_id; 620 StubCodeMark mark(this, stub_id); 621 address start = __ pc(); 622 623 Label exit, error; 624 625 __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3 626 627 __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 628 __ ld(c_rarg3, Address(c_rarg2)); 629 __ addi(c_rarg3, c_rarg3, 1); 630 __ sd(c_rarg3, Address(c_rarg2)); 631 632 // object is in x10 633 // make sure object is 'reasonable' 634 __ beqz(x10, exit); // if obj is null it is OK 635 636 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 637 bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error); 638 639 // return if everything seems ok 640 __ bind(exit); 641 642 __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3 643 __ ret(); 644 645 // handle errors 646 __ bind(error); 647 __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3 648 649 __ push_reg(RegSet::range(x0, x31), sp); 650 // debug(char* msg, int64_t pc, int64_t regs[]) 651 __ mv(c_rarg0, t0); // pass address of error message 652 __ mv(c_rarg1, ra); // pass return address 653 __ mv(c_rarg2, sp); // pass address of regs on stack 654 #ifndef PRODUCT 655 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 656 #endif 657 BLOCK_COMMENT("call MacroAssembler::debug"); 658 __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 659 __ ebreak(); 660 661 return start; 662 } 663 664 // The inner part of zero_words(). 665 // 666 // Inputs: 667 // x28: the HeapWord-aligned base address of an array to zero. 668 // x29: the count in HeapWords, x29 > 0. 669 // 670 // Returns x28 and x29, adjusted for the caller to clear. 671 // x28: the base address of the tail of words left to clear. 672 // x29: the number of words in the tail. 673 // x29 < MacroAssembler::zero_words_block_size. 
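  //
  // A rough C sketch of the contract (illustration only, not the emitted
  // code; 'base' and 'cnt' stand for x28 and x29):
  //
  //   while (cnt >= MacroAssembler::zero_words_block_size) {
  //     for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
  //       base[i] = 0;
  //     }
  //     base += MacroAssembler::zero_words_block_size;
  //     cnt  -= MacroAssembler::zero_words_block_size;
  //   }
  //   // the caller clears the remaining cnt words
  //
  // When UseBlockZeroing is enabled and the count is large enough, the bulk
  // of the range is first cleared with cbo.zero (zero_dcache_blocks) before
  // the word-store loop runs.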
674 675 address generate_zero_blocks() { 676 Label done; 677 678 const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31; 679 680 __ align(CodeEntryAlignment); 681 StubGenStubId stub_id = StubGenStubId::zero_blocks_id; 682 StubCodeMark mark(this, stub_id); 683 address start = __ pc(); 684 685 if (UseBlockZeroing) { 686 // Ensure count >= 2*CacheLineSize so that it still deserves a cbo.zero 687 // after alignment. 688 Label small; 689 int low_limit = MAX2(2 * CacheLineSize, BlockZeroingLowLimit) / wordSize; 690 __ mv(tmp1, low_limit); 691 __ blt(cnt, tmp1, small); 692 __ zero_dcache_blocks(base, cnt, tmp1, tmp2); 693 __ bind(small); 694 } 695 696 { 697 // Clear the remaining blocks. 698 Label loop; 699 __ mv(tmp1, MacroAssembler::zero_words_block_size); 700 __ blt(cnt, tmp1, done); 701 __ bind(loop); 702 for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) { 703 __ sd(zr, Address(base, i * wordSize)); 704 } 705 __ addi(base, base, MacroAssembler::zero_words_block_size * wordSize); 706 __ subi(cnt, cnt, MacroAssembler::zero_words_block_size); 707 __ bge(cnt, tmp1, loop); 708 __ bind(done); 709 } 710 711 __ ret(); 712 713 return start; 714 } 715 716 typedef enum { 717 copy_forwards = 1, 718 copy_backwards = -1 719 } copy_direction; 720 721 // Bulk copy of blocks of 8 words. 722 // 723 // count is a count of words. 724 // 725 // Precondition: count >= 8 726 // 727 // Postconditions: 728 // 729 // The least significant bit of count contains the remaining count 730 // of words to copy. The rest of count is trash. 731 // 732 // s and d are adjusted to point to the remaining words to copy 733 // 734 void generate_copy_longs(StubGenStubId stub_id, Label &start, 735 Register s, Register d, Register count) { 736 BasicType type; 737 copy_direction direction; 738 switch (stub_id) { 739 case copy_byte_f_id: 740 direction = copy_forwards; 741 type = T_BYTE; 742 break; 743 case copy_byte_b_id: 744 direction = copy_backwards; 745 type = T_BYTE; 746 break; 747 default: 748 ShouldNotReachHere(); 749 } 750 int unit = wordSize * direction; 751 int bias = wordSize; 752 753 const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16, 754 tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29; 755 756 const Register stride = x30; 757 758 assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3, 759 tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7); 760 assert_different_registers(s, d, count, t0); 761 762 Label again, drain; 763 StubCodeMark mark(this, stub_id); 764 __ align(CodeEntryAlignment); 765 __ bind(start); 766 767 if (direction == copy_forwards) { 768 __ sub(s, s, bias); 769 __ sub(d, d, bias); 770 } 771 772 #ifdef ASSERT 773 // Make sure we are never given < 8 words 774 { 775 Label L; 776 777 __ mv(t0, 8); 778 __ bge(count, t0, L); 779 __ stop("genrate_copy_longs called with < 8 words"); 780 __ bind(L); 781 } 782 #endif 783 784 __ ld(tmp_reg0, Address(s, 1 * unit)); 785 __ ld(tmp_reg1, Address(s, 2 * unit)); 786 __ ld(tmp_reg2, Address(s, 3 * unit)); 787 __ ld(tmp_reg3, Address(s, 4 * unit)); 788 __ ld(tmp_reg4, Address(s, 5 * unit)); 789 __ ld(tmp_reg5, Address(s, 6 * unit)); 790 __ ld(tmp_reg6, Address(s, 7 * unit)); 791 __ ld(tmp_reg7, Address(s, 8 * unit)); 792 __ addi(s, s, 8 * unit); 793 794 __ subi(count, count, 16); 795 __ bltz(count, drain); 796 797 __ bind(again); 798 799 __ sd(tmp_reg0, Address(d, 1 * unit)); 800 __ sd(tmp_reg1, Address(d, 2 * unit)); 801 __ sd(tmp_reg2, Address(d, 3 * unit)); 802 __ sd(tmp_reg3, Address(d, 4 * unit)); 803 __ 
sd(tmp_reg4, Address(d, 5 * unit)); 804 __ sd(tmp_reg5, Address(d, 6 * unit)); 805 __ sd(tmp_reg6, Address(d, 7 * unit)); 806 __ sd(tmp_reg7, Address(d, 8 * unit)); 807 808 __ ld(tmp_reg0, Address(s, 1 * unit)); 809 __ ld(tmp_reg1, Address(s, 2 * unit)); 810 __ ld(tmp_reg2, Address(s, 3 * unit)); 811 __ ld(tmp_reg3, Address(s, 4 * unit)); 812 __ ld(tmp_reg4, Address(s, 5 * unit)); 813 __ ld(tmp_reg5, Address(s, 6 * unit)); 814 __ ld(tmp_reg6, Address(s, 7 * unit)); 815 __ ld(tmp_reg7, Address(s, 8 * unit)); 816 817 __ addi(s, s, 8 * unit); 818 __ addi(d, d, 8 * unit); 819 820 __ subi(count, count, 8); 821 __ bgez(count, again); 822 823 // Drain 824 __ bind(drain); 825 826 __ sd(tmp_reg0, Address(d, 1 * unit)); 827 __ sd(tmp_reg1, Address(d, 2 * unit)); 828 __ sd(tmp_reg2, Address(d, 3 * unit)); 829 __ sd(tmp_reg3, Address(d, 4 * unit)); 830 __ sd(tmp_reg4, Address(d, 5 * unit)); 831 __ sd(tmp_reg5, Address(d, 6 * unit)); 832 __ sd(tmp_reg6, Address(d, 7 * unit)); 833 __ sd(tmp_reg7, Address(d, 8 * unit)); 834 __ addi(d, d, 8 * unit); 835 836 { 837 Label L1, L2; 838 __ test_bit(t0, count, 2); 839 __ beqz(t0, L1); 840 841 __ ld(tmp_reg0, Address(s, 1 * unit)); 842 __ ld(tmp_reg1, Address(s, 2 * unit)); 843 __ ld(tmp_reg2, Address(s, 3 * unit)); 844 __ ld(tmp_reg3, Address(s, 4 * unit)); 845 __ addi(s, s, 4 * unit); 846 847 __ sd(tmp_reg0, Address(d, 1 * unit)); 848 __ sd(tmp_reg1, Address(d, 2 * unit)); 849 __ sd(tmp_reg2, Address(d, 3 * unit)); 850 __ sd(tmp_reg3, Address(d, 4 * unit)); 851 __ addi(d, d, 4 * unit); 852 853 __ bind(L1); 854 855 if (direction == copy_forwards) { 856 __ addi(s, s, bias); 857 __ addi(d, d, bias); 858 } 859 860 __ test_bit(t0, count, 1); 861 __ beqz(t0, L2); 862 if (direction == copy_backwards) { 863 __ addi(s, s, 2 * unit); 864 __ ld(tmp_reg0, Address(s)); 865 __ ld(tmp_reg1, Address(s, wordSize)); 866 __ addi(d, d, 2 * unit); 867 __ sd(tmp_reg0, Address(d)); 868 __ sd(tmp_reg1, Address(d, wordSize)); 869 } else { 870 __ ld(tmp_reg0, Address(s)); 871 __ ld(tmp_reg1, Address(s, wordSize)); 872 __ addi(s, s, 2 * unit); 873 __ sd(tmp_reg0, Address(d)); 874 __ sd(tmp_reg1, Address(d, wordSize)); 875 __ addi(d, d, 2 * unit); 876 } 877 __ bind(L2); 878 } 879 880 __ ret(); 881 } 882 883 Label copy_f, copy_b; 884 885 typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp); 886 887 void copy_memory_v(Register s, Register d, Register count, int step) { 888 bool is_backward = step < 0; 889 int granularity = g_uabs(step); 890 891 const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17; 892 assert_different_registers(s, d, cnt, vl, tmp1, tmp2); 893 Assembler::SEW sew = Assembler::elembytes_to_sew(granularity); 894 Label loop_forward, loop_backward, done; 895 896 __ mv(dst, d); 897 __ mv(src, s); 898 __ mv(cnt, count); 899 900 __ bind(loop_forward); 901 __ vsetvli(vl, cnt, sew, Assembler::m8); 902 if (is_backward) { 903 __ bne(vl, cnt, loop_backward); 904 } 905 906 __ vlex_v(v0, src, sew); 907 __ sub(cnt, cnt, vl); 908 if (sew != Assembler::e8) { 909 // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary 910 __ slli(vl, vl, sew); 911 } 912 __ add(src, src, vl); 913 914 __ vsex_v(v0, dst, sew); 915 __ add(dst, dst, vl); 916 __ bnez(cnt, loop_forward); 917 918 if (is_backward) { 919 __ j(done); 920 921 __ bind(loop_backward); 922 __ sub(t0, cnt, vl); 923 if (sew != Assembler::e8) { 924 // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary 925 __ slli(t0, t0, 
sew); 926 } 927 __ add(tmp1, s, t0); 928 __ vlex_v(v0, tmp1, sew); 929 __ add(tmp2, d, t0); 930 __ vsex_v(v0, tmp2, sew); 931 __ sub(cnt, cnt, vl); 932 __ bnez(cnt, loop_forward); 933 __ bind(done); 934 } 935 } 936 937 // All-singing all-dancing memory copy. 938 // 939 // Copy count units of memory from s to d. The size of a unit is 940 // step, which can be positive or negative depending on the direction 941 // of copy. 942 // 943 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 944 Register s, Register d, Register count, int step) { 945 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 946 if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) { 947 return copy_memory_v(s, d, count, step); 948 } 949 950 bool is_backwards = step < 0; 951 int granularity = g_uabs(step); 952 953 const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13; 954 const Register gct1 = x28, gct2 = x29, gct3 = t2; 955 956 Label same_aligned; 957 Label copy_big, copy32_loop, copy8_loop, copy_small, done; 958 959 // The size of copy32_loop body increases significantly with ZGC GC barriers. 960 // Need conditional far branches to reach a point beyond the loop in this case. 961 bool is_far = UseZGC; 962 963 __ beqz(count, done, is_far); 964 __ slli(cnt, count, exact_log2(granularity)); 965 if (is_backwards) { 966 __ add(src, s, cnt); 967 __ add(dst, d, cnt); 968 } else { 969 __ mv(src, s); 970 __ mv(dst, d); 971 } 972 973 if (is_aligned) { 974 __ subi(t0, cnt, 32); 975 __ bgez(t0, copy32_loop); 976 __ subi(t0, cnt, 8); 977 __ bgez(t0, copy8_loop, is_far); 978 __ j(copy_small); 979 } else { 980 __ mv(t0, 16); 981 __ blt(cnt, t0, copy_small, is_far); 982 983 __ xorr(t0, src, dst); 984 __ andi(t0, t0, 0b111); 985 __ bnez(t0, copy_small, is_far); 986 987 __ bind(same_aligned); 988 __ andi(t0, src, 0b111); 989 __ beqz(t0, copy_big); 990 if (is_backwards) { 991 __ addi(src, src, step); 992 __ addi(dst, dst, step); 993 } 994 bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1); 995 bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3); 996 if (!is_backwards) { 997 __ addi(src, src, step); 998 __ addi(dst, dst, step); 999 } 1000 __ subi(cnt, cnt, granularity); 1001 __ beqz(cnt, done, is_far); 1002 __ j(same_aligned); 1003 1004 __ bind(copy_big); 1005 __ mv(t0, 32); 1006 __ blt(cnt, t0, copy8_loop, is_far); 1007 } 1008 1009 __ bind(copy32_loop); 1010 if (is_backwards) { 1011 __ subi(src, src, wordSize * 4); 1012 __ subi(dst, dst, wordSize * 4); 1013 } 1014 // we first load 32 bytes, then write it, so the direction here doesn't matter 1015 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1); 1016 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8), gct1); 1017 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1); 1018 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1); 1019 1020 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3); 1021 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8), tmp4, gct1, gct2, gct3); 1022 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3); 1023 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3); 1024 1025 if (!is_backwards) { 1026 __ addi(src, src, wordSize * 4); 1027 __ addi(dst, dst, 
wordSize * 4); 1028 } 1029 __ subi(t0, cnt, 32 + wordSize * 4); 1030 __ subi(cnt, cnt, wordSize * 4); 1031 __ bgez(t0, copy32_loop); // cnt >= 32, do next loop 1032 1033 __ beqz(cnt, done); // if that's all - done 1034 1035 __ subi(t0, cnt, 8); // if not - copy the reminder 1036 __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop 1037 1038 __ bind(copy8_loop); 1039 if (is_backwards) { 1040 __ subi(src, src, wordSize); 1041 __ subi(dst, dst, wordSize); 1042 } 1043 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1); 1044 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3); 1045 1046 if (!is_backwards) { 1047 __ addi(src, src, wordSize); 1048 __ addi(dst, dst, wordSize); 1049 } 1050 __ subi(t0, cnt, 8 + wordSize); 1051 __ subi(cnt, cnt, wordSize); 1052 __ bgez(t0, copy8_loop); // cnt >= 8, do next loop 1053 1054 __ beqz(cnt, done); // if that's all - done 1055 1056 __ bind(copy_small); 1057 if (is_backwards) { 1058 __ addi(src, src, step); 1059 __ addi(dst, dst, step); 1060 } 1061 1062 bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1); 1063 bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3); 1064 1065 if (!is_backwards) { 1066 __ addi(src, src, step); 1067 __ addi(dst, dst, step); 1068 } 1069 __ subi(cnt, cnt, granularity); 1070 __ bgtz(cnt, copy_small); 1071 1072 __ bind(done); 1073 } 1074 1075 // Scan over array at a for count oops, verifying each one. 1076 // Preserves a and count, clobbers t0 and t1. 1077 void verify_oop_array(size_t size, Register a, Register count, Register temp) { 1078 Label loop, end; 1079 __ mv(t1, zr); 1080 __ slli(t0, count, exact_log2(size)); 1081 __ bind(loop); 1082 __ bgeu(t1, t0, end); 1083 1084 __ add(temp, a, t1); 1085 if (size == (size_t)wordSize) { 1086 __ ld(temp, Address(temp, 0)); 1087 __ verify_oop(temp); 1088 } else { 1089 __ lwu(temp, Address(temp, 0)); 1090 __ decode_heap_oop(temp); // calls verify_oop 1091 } 1092 __ add(t1, t1, size); 1093 __ j(loop); 1094 __ bind(end); 1095 } 1096 1097 // Arguments: 1098 // stub_id - is used to name the stub and identify all details of 1099 // how to perform the copy. 1100 // 1101 // entry - is assigned to the stub's post push entry point unless 1102 // it is null 1103 // 1104 // Inputs: 1105 // c_rarg0 - source array address 1106 // c_rarg1 - destination array address 1107 // c_rarg2 - element count, treated as ssize_t, can be zero 1108 // 1109 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1110 // the hardware handle it. The two dwords within qwords that span 1111 // cache line boundaries will still be loaded and stored atomically. 
1112 // 1113 // Side Effects: entry is set to the (post push) entry point so it 1114 // can be used by the corresponding conjoint copy 1115 // method 1116 // 1117 address generate_disjoint_copy(StubGenStubId stub_id, address* entry) { 1118 size_t size; 1119 bool aligned; 1120 bool is_oop; 1121 bool dest_uninitialized; 1122 switch (stub_id) { 1123 case jbyte_disjoint_arraycopy_id: 1124 size = sizeof(jbyte); 1125 aligned = false; 1126 is_oop = false; 1127 dest_uninitialized = false; 1128 break; 1129 case arrayof_jbyte_disjoint_arraycopy_id: 1130 size = sizeof(jbyte); 1131 aligned = true; 1132 is_oop = false; 1133 dest_uninitialized = false; 1134 break; 1135 case jshort_disjoint_arraycopy_id: 1136 size = sizeof(jshort); 1137 aligned = false; 1138 is_oop = false; 1139 dest_uninitialized = false; 1140 break; 1141 case arrayof_jshort_disjoint_arraycopy_id: 1142 size = sizeof(jshort); 1143 aligned = true; 1144 is_oop = false; 1145 dest_uninitialized = false; 1146 break; 1147 case jint_disjoint_arraycopy_id: 1148 size = sizeof(jint); 1149 aligned = false; 1150 is_oop = false; 1151 dest_uninitialized = false; 1152 break; 1153 case arrayof_jint_disjoint_arraycopy_id: 1154 size = sizeof(jint); 1155 aligned = true; 1156 is_oop = false; 1157 dest_uninitialized = false; 1158 break; 1159 case jlong_disjoint_arraycopy_id: 1160 // since this is always aligned we can (should!) use the same 1161 // stub as for case arrayof_jlong_disjoint_arraycopy 1162 ShouldNotReachHere(); 1163 break; 1164 case arrayof_jlong_disjoint_arraycopy_id: 1165 size = sizeof(jlong); 1166 aligned = true; 1167 is_oop = false; 1168 dest_uninitialized = false; 1169 break; 1170 case oop_disjoint_arraycopy_id: 1171 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1172 aligned = !UseCompressedOops; 1173 is_oop = true; 1174 dest_uninitialized = false; 1175 break; 1176 case arrayof_oop_disjoint_arraycopy_id: 1177 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1178 aligned = !UseCompressedOops; 1179 is_oop = true; 1180 dest_uninitialized = false; 1181 break; 1182 case oop_disjoint_arraycopy_uninit_id: 1183 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1184 aligned = !UseCompressedOops; 1185 is_oop = true; 1186 dest_uninitialized = true; 1187 break; 1188 case arrayof_oop_disjoint_arraycopy_uninit_id: 1189 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1190 aligned = !UseCompressedOops; 1191 is_oop = true; 1192 dest_uninitialized = true; 1193 break; 1194 default: 1195 ShouldNotReachHere(); 1196 break; 1197 } 1198 1199 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1200 RegSet saved_reg = RegSet::of(s, d, count); 1201 __ align(CodeEntryAlignment); 1202 StubCodeMark mark(this, stub_id); 1203 address start = __ pc(); 1204 __ enter(); 1205 1206 if (entry != nullptr) { 1207 *entry = __ pc(); 1208 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1209 BLOCK_COMMENT("Entry:"); 1210 } 1211 1212 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1213 if (dest_uninitialized) { 1214 decorators |= IS_DEST_UNINITIALIZED; 1215 } 1216 if (aligned) { 1217 decorators |= ARRAYCOPY_ALIGNED; 1218 } 1219 1220 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1221 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1222 1223 if (is_oop) { 1224 // save regs before copy_memory 1225 __ push_reg(RegSet::of(d, count), sp); 1226 } 1227 1228 { 1229 // UnsafeMemoryAccess page error: continue after unsafe access 1230 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1231 UnsafeMemoryAccessMark umam(this, add_entry, true); 1232 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1233 } 1234 1235 if (is_oop) { 1236 __ pop_reg(RegSet::of(d, count), sp); 1237 if (VerifyOops) { 1238 verify_oop_array(size, d, count, t2); 1239 } 1240 } 1241 1242 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet()); 1243 1244 __ leave(); 1245 __ mv(x10, zr); // return 0 1246 __ ret(); 1247 return start; 1248 } 1249 1250 // Arguments: 1251 // stub_id - is used to name the stub and identify all details of 1252 // how to perform the copy. 1253 // 1254 // nooverlap_target - identifes the (post push) entry for the 1255 // corresponding disjoint copy routine which can be 1256 // jumped to if the ranges do not actually overlap 1257 // 1258 // entry - is assigned to the stub's post push entry point unless 1259 // it is null 1260 // 1261 // Inputs: 1262 // c_rarg0 - source array address 1263 // c_rarg1 - destination array address 1264 // c_rarg2 - element count, treated as ssize_t, can be zero 1265 // 1266 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1267 // the hardware handle it. The two dwords within qwords that span 1268 // cache line boundaries will still be loaded and stored atomically. 
1269 // 1270 // Side Effects: 1271 // entry is set to the no-overlap entry point so it can be used by 1272 // some other conjoint copy method 1273 // 1274 address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 1275 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1276 RegSet saved_regs = RegSet::of(s, d, count); 1277 int size; 1278 bool aligned; 1279 bool is_oop; 1280 bool dest_uninitialized; 1281 switch (stub_id) { 1282 case jbyte_arraycopy_id: 1283 size = sizeof(jbyte); 1284 aligned = false; 1285 is_oop = false; 1286 dest_uninitialized = false; 1287 break; 1288 case arrayof_jbyte_arraycopy_id: 1289 size = sizeof(jbyte); 1290 aligned = true; 1291 is_oop = false; 1292 dest_uninitialized = false; 1293 break; 1294 case jshort_arraycopy_id: 1295 size = sizeof(jshort); 1296 aligned = false; 1297 is_oop = false; 1298 dest_uninitialized = false; 1299 break; 1300 case arrayof_jshort_arraycopy_id: 1301 size = sizeof(jshort); 1302 aligned = true; 1303 is_oop = false; 1304 dest_uninitialized = false; 1305 break; 1306 case jint_arraycopy_id: 1307 size = sizeof(jint); 1308 aligned = false; 1309 is_oop = false; 1310 dest_uninitialized = false; 1311 break; 1312 case arrayof_jint_arraycopy_id: 1313 size = sizeof(jint); 1314 aligned = true; 1315 is_oop = false; 1316 dest_uninitialized = false; 1317 break; 1318 case jlong_arraycopy_id: 1319 // since this is always aligned we can (should!) use the same 1320 // stub as for case arrayof_jlong_disjoint_arraycopy 1321 ShouldNotReachHere(); 1322 break; 1323 case arrayof_jlong_arraycopy_id: 1324 size = sizeof(jlong); 1325 aligned = true; 1326 is_oop = false; 1327 dest_uninitialized = false; 1328 break; 1329 case oop_arraycopy_id: 1330 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1331 aligned = !UseCompressedOops; 1332 is_oop = true; 1333 dest_uninitialized = false; 1334 break; 1335 case arrayof_oop_arraycopy_id: 1336 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1337 aligned = !UseCompressedOops; 1338 is_oop = true; 1339 dest_uninitialized = false; 1340 break; 1341 case oop_arraycopy_uninit_id: 1342 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1343 aligned = !UseCompressedOops; 1344 is_oop = true; 1345 dest_uninitialized = true; 1346 break; 1347 case arrayof_oop_arraycopy_uninit_id: 1348 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1349 aligned = !UseCompressedOops; 1350 is_oop = true; 1351 dest_uninitialized = true; 1352 break; 1353 default: 1354 ShouldNotReachHere(); 1355 } 1356 1357 StubCodeMark mark(this, stub_id); 1358 address start = __ pc(); 1359 __ enter(); 1360 1361 if (entry != nullptr) { 1362 *entry = __ pc(); 1363 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1364 BLOCK_COMMENT("Entry:"); 1365 } 1366 1367 // use fwd copy when (d-s) above_equal (count*size) 1368 __ sub(t0, d, s); 1369 __ slli(t1, count, exact_log2(size)); 1370 Label L_continue; 1371 __ bltu(t0, t1, L_continue); 1372 __ j(nooverlap_target); 1373 __ bind(L_continue); 1374 1375 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1376 if (dest_uninitialized) { 1377 decorators |= IS_DEST_UNINITIALIZED; 1378 } 1379 if (aligned) { 1380 decorators |= ARRAYCOPY_ALIGNED; 1381 } 1382 1383 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1384 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1385 1386 if (is_oop) { 1387 // save regs before copy_memory 1388 __ push_reg(RegSet::of(d, count), sp); 1389 } 1390 1391 { 1392 // UnsafeMemoryAccess page error: continue after unsafe access 1393 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1394 UnsafeMemoryAccessMark umam(this, add_entry, true); 1395 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1396 } 1397 1398 if (is_oop) { 1399 __ pop_reg(RegSet::of(d, count), sp); 1400 if (VerifyOops) { 1401 verify_oop_array(size, d, count, t2); 1402 } 1403 } 1404 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet()); 1405 __ leave(); 1406 __ mv(x10, zr); // return 0 1407 __ ret(); 1408 return start; 1409 } 1410 1411 // Helper for generating a dynamic type check. 1412 // Smashes t0, t1. 1413 void generate_type_check(Register sub_klass, 1414 Register super_check_offset, 1415 Register super_klass, 1416 Register result, 1417 Register tmp1, 1418 Register tmp2, 1419 Label& L_success) { 1420 assert_different_registers(sub_klass, super_check_offset, super_klass); 1421 1422 BLOCK_COMMENT("type_check:"); 1423 1424 Label L_miss; 1425 1426 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset); 1427 __ check_klass_subtype_slow_path(sub_klass, super_klass, tmp1, tmp2, &L_success, nullptr); 1428 1429 // Fall through on failure! 
1430 __ BIND(L_miss); 1431 } 1432 1433 // 1434 // Generate checkcasting array copy stub 1435 // 1436 // Input: 1437 // c_rarg0 - source array address 1438 // c_rarg1 - destination array address 1439 // c_rarg2 - element count, treated as ssize_t, can be zero 1440 // c_rarg3 - size_t ckoff (super_check_offset) 1441 // c_rarg4 - oop ckval (super_klass) 1442 // 1443 // Output: 1444 // x10 == 0 - success 1445 // x10 == -1^K - failure, where K is partial transfer count 1446 // 1447 address generate_checkcast_copy(StubGenStubId stub_id, address* entry) { 1448 bool dest_uninitialized; 1449 switch (stub_id) { 1450 case checkcast_arraycopy_id: 1451 dest_uninitialized = false; 1452 break; 1453 case checkcast_arraycopy_uninit_id: 1454 dest_uninitialized = true; 1455 break; 1456 default: 1457 ShouldNotReachHere(); 1458 } 1459 1460 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1461 1462 // Input registers (after setup_arg_regs) 1463 const Register from = c_rarg0; // source array address 1464 const Register to = c_rarg1; // destination array address 1465 const Register count = c_rarg2; // elementscount 1466 const Register ckoff = c_rarg3; // super_check_offset 1467 const Register ckval = c_rarg4; // super_klass 1468 1469 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1470 RegSet wb_post_saved_regs = RegSet::of(count); 1471 1472 // Registers used as temps (x7, x9, x18 are save-on-entry) 1473 const Register count_save = x19; // orig elementscount 1474 const Register start_to = x18; // destination array start address 1475 const Register copied_oop = x7; // actual oop copied 1476 const Register r9_klass = x9; // oop._klass 1477 1478 // Registers used as gc temps (x15, x16, x17 are save-on-call) 1479 const Register gct1 = x15, gct2 = x16, gct3 = x17; 1480 1481 //--------------------------------------------------------------- 1482 // Assembler stub will be used for this call to arraycopy 1483 // if the two arrays are subtypes of Object[] but the 1484 // destination array type is not equal to or a supertype 1485 // of the source type. Each element must be separately 1486 // checked. 1487 1488 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1489 copied_oop, r9_klass, count_save); 1490 1491 __ align(CodeEntryAlignment); 1492 StubCodeMark mark(this, stub_id); 1493 address start = __ pc(); 1494 1495 __ enter(); // required for proper stackwalking of RuntimeStub frame 1496 1497 // Caller of this entry point must set up the argument registers. 1498 if (entry != nullptr) { 1499 *entry = __ pc(); 1500 BLOCK_COMMENT("Entry:"); 1501 } 1502 1503 // Empty array: Nothing to do 1504 __ beqz(count, L_done); 1505 1506 __ push_reg(RegSet::of(x7, x9, x18, x19), sp); 1507 1508 #ifdef ASSERT 1509 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1510 // The ckoff and ckval must be mutually consistent, 1511 // even though caller generates both. 1512 { Label L; 1513 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1514 __ lwu(start_to, Address(ckval, sco_offset)); 1515 __ beq(ckoff, start_to, L); 1516 __ stop("super_check_offset inconsistent"); 1517 __ bind(L); 1518 } 1519 #endif //ASSERT 1520 1521 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1522 if (dest_uninitialized) { 1523 decorators |= IS_DEST_UNINITIALIZED; 1524 } 1525 1526 bool is_oop = true; 1527 int element_size = UseCompressedOops ? 
4 : 8; 1528 1529 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1530 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1531 1532 // save the original count 1533 __ mv(count_save, count); 1534 1535 // Copy from low to high addresses 1536 __ mv(start_to, to); // Save destination array start address 1537 __ j(L_load_element); 1538 1539 // ======== begin loop ======== 1540 // (Loop is rotated; its entry is L_load_element.) 1541 // Loop control: 1542 // for count to 0 do 1543 // copied_oop = load_heap_oop(from++) 1544 // ... generate_type_check ... 1545 // store_heap_oop(to++, copied_oop) 1546 // end 1547 1548 __ align(OptoLoopAlignment); 1549 1550 __ BIND(L_store_element); 1551 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1552 Address(to, 0), copied_oop, 1553 gct1, gct2, gct3); 1554 __ addi(to, to, UseCompressedOops ? 4 : 8); 1555 __ subi(count, count, 1); 1556 __ beqz(count, L_do_card_marks); 1557 1558 // ======== loop entry is here ======== 1559 __ BIND(L_load_element); 1560 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1561 copied_oop, Address(from, 0), 1562 gct1); 1563 __ addi(from, from, UseCompressedOops ? 4 : 8); 1564 __ beqz(copied_oop, L_store_element); 1565 1566 __ load_klass(r9_klass, copied_oop);// query the object klass 1567 1568 BLOCK_COMMENT("type_check:"); 1569 generate_type_check(r9_klass, /*sub_klass*/ 1570 ckoff, /*super_check_offset*/ 1571 ckval, /*super_klass*/ 1572 x10, /*result*/ 1573 gct1, /*tmp1*/ 1574 gct2, /*tmp2*/ 1575 L_store_element); 1576 1577 // Fall through on failure! 1578 1579 // ======== end loop ======== 1580 1581 // It was a real error; we must depend on the caller to finish the job. 1582 // Register count = remaining oops, count_orig = total oops. 1583 // Emit GC store barriers for the oops we have copied and report 1584 // their number to the caller. 1585 1586 __ sub(count, count_save, count); // K = partially copied oop count 1587 __ xori(count, count, -1); // report (-1^K) to caller 1588 __ beqz(count, L_done_pop); 1589 1590 __ BIND(L_do_card_marks); 1591 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs); 1592 1593 __ bind(L_done_pop); 1594 __ pop_reg(RegSet::of(x7, x9, x18, x19), sp); 1595 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1596 1597 __ bind(L_done); 1598 __ mv(x10, count); 1599 __ leave(); 1600 __ ret(); 1601 1602 return start; 1603 } 1604 1605 // Perform range checks on the proposed arraycopy. 1606 // Kills temp, but nothing else. 1607 // Also, clean the sign bits of src_pos and dst_pos. 1608 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1609 Register src_pos, // source position (c_rarg1) 1610 Register dst, // destination array oo (c_rarg2) 1611 Register dst_pos, // destination position (c_rarg3) 1612 Register length, 1613 Register temp, 1614 Label& L_failed) { 1615 BLOCK_COMMENT("arraycopy_range_checks:"); 1616 1617 assert_different_registers(t0, temp); 1618 1619 // if [src_pos + length > arrayOop(src)->length()] then FAIL 1620 __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes())); 1621 __ addw(temp, length, src_pos); 1622 __ bgtu(temp, t0, L_failed); 1623 1624 // if [dst_pos + length > arrayOop(dst)->length()] then FAIL 1625 __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1626 __ addw(temp, length, dst_pos); 1627 __ bgtu(temp, t0, L_failed); 1628 1629 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 
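    // src_pos and dst_pos arrive as 32-bit jints held sign-extended in
    // 64-bit registers (RISC-V calling convention), while the callers go on
    // to use them in 64-bit address arithmetic, roughly
    //   src_addr = src + header_size + ((uint64_t)(uint32_t)src_pos << log2elemsize)
    // so the upper 32 bits are cleared here; the checks above (and the
    // callers' sign checks) have already validated the values.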
1630 __ zext(src_pos, src_pos, 32); 1631 __ zext(dst_pos, dst_pos, 32); 1632 1633 BLOCK_COMMENT("arraycopy_range_checks done"); 1634 } 1635 1636 address generate_unsafecopy_common_error_exit() { 1637 address start = __ pc(); 1638 __ mv(x10, 0); 1639 __ leave(); 1640 __ ret(); 1641 return start; 1642 } 1643 1644 // 1645 // Generate 'unsafe' set memory stub 1646 // Though just as safe as the other stubs, it takes an unscaled 1647 // size_t (# bytes) argument instead of an element count. 1648 // 1649 // Input: 1650 // c_rarg0 - destination array address 1651 // c_rarg1 - byte count (size_t) 1652 // c_rarg2 - byte value 1653 // 1654 address generate_unsafe_setmemory() { 1655 __ align(CodeEntryAlignment); 1656 StubGenStubId stub_id = StubGenStubId::unsafe_setmemory_id; 1657 StubCodeMark mark(this, stub_id); 1658 address start = __ pc(); 1659 1660 // bump this on entry, not on exit: 1661 // inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr); 1662 1663 Label L_fill_elements; 1664 1665 const Register dest = c_rarg0; 1666 const Register count = c_rarg1; 1667 const Register value = c_rarg2; 1668 const Register cnt_words = x28; // temp register 1669 const Register tmp_reg = x29; // temp register 1670 1671 // Mark remaining code as such which performs Unsafe accesses. 1672 UnsafeMemoryAccessMark umam(this, true, false); 1673 1674 __ enter(); // required for proper stackwalking of RuntimeStub frame 1675 1676 // if count < 8, jump to L_fill_elements 1677 __ mv(tmp_reg, 8); // 8 bytes fill by element 1678 __ bltu(count, tmp_reg, L_fill_elements); 1679 1680 // Propagate byte to 64-bit width 1681 // 8 bit -> 16 bit 1682 __ zext(value, value, 8); 1683 __ slli(tmp_reg, value, 8); 1684 __ orr(value, value, tmp_reg); 1685 // 16 bit -> 32 bit 1686 __ slli(tmp_reg, value, 16); 1687 __ orr(value, value, tmp_reg); 1688 // 32 bit -> 64 bit 1689 __ slli(tmp_reg, value, 32); 1690 __ orr(value, value, tmp_reg); 1691 1692 // Align source address at 8 bytes address boundary. 1693 Label L_skip_align1, L_skip_align2, L_skip_align4; 1694 // One byte misalignment happens. 1695 __ test_bit(tmp_reg, dest, 0); 1696 __ beqz(tmp_reg, L_skip_align1); 1697 __ sb(value, Address(dest, 0)); 1698 __ addi(dest, dest, 1); 1699 __ subi(count, count, 1); 1700 1701 __ bind(L_skip_align1); 1702 // Two bytes misalignment happens. 1703 __ test_bit(tmp_reg, dest, 1); 1704 __ beqz(tmp_reg, L_skip_align2); 1705 __ sh(value, Address(dest, 0)); 1706 __ addi(dest, dest, 2); 1707 __ subi(count, count, 2); 1708 1709 __ bind(L_skip_align2); 1710 // Four bytes misalignment happens. 
1711 __ test_bit(tmp_reg, dest, 2); 1712 __ beqz(tmp_reg, L_skip_align4); 1713 __ sw(value, Address(dest, 0)); 1714 __ addi(dest, dest, 4); 1715 __ subi(count, count, 4); 1716 __ bind(L_skip_align4); 1717 1718 // Fill large chunks 1719 __ srli(cnt_words, count, 3); // number of words 1720 __ slli(tmp_reg, cnt_words, 3); 1721 __ sub(count, count, tmp_reg); 1722 { 1723 __ fill_words(dest, cnt_words, value); 1724 } 1725 1726 // Handle copies less than 8 bytes 1727 __ bind(L_fill_elements); 1728 Label L_fill_2, L_fill_1, L_exit; 1729 __ test_bit(tmp_reg, count, 2); 1730 __ beqz(tmp_reg, L_fill_2); 1731 __ sb(value, Address(dest, 0)); 1732 __ sb(value, Address(dest, 1)); 1733 __ sb(value, Address(dest, 2)); 1734 __ sb(value, Address(dest, 3)); 1735 __ addi(dest, dest, 4); 1736 1737 __ bind(L_fill_2); 1738 __ test_bit(tmp_reg, count, 1); 1739 __ beqz(tmp_reg, L_fill_1); 1740 __ sb(value, Address(dest, 0)); 1741 __ sb(value, Address(dest, 1)); 1742 __ addi(dest, dest, 2); 1743 1744 __ bind(L_fill_1); 1745 __ test_bit(tmp_reg, count, 0); 1746 __ beqz(tmp_reg, L_exit); 1747 __ sb(value, Address(dest, 0)); 1748 1749 __ bind(L_exit); 1750 __ leave(); 1751 __ ret(); 1752 1753 return start; 1754 } 1755 1756 // 1757 // Generate 'unsafe' array copy stub 1758 // Though just as safe as the other stubs, it takes an unscaled 1759 // size_t argument instead of an element count. 1760 // 1761 // Input: 1762 // c_rarg0 - source array address 1763 // c_rarg1 - destination array address 1764 // c_rarg2 - byte count, treated as ssize_t, can be zero 1765 // 1766 // Examines the alignment of the operands and dispatches 1767 // to a long, int, short, or byte copy loop. 1768 // 1769 address generate_unsafe_copy(address byte_copy_entry, 1770 address short_copy_entry, 1771 address int_copy_entry, 1772 address long_copy_entry) { 1773 assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr && 1774 int_copy_entry != nullptr && long_copy_entry != nullptr); 1775 Label L_long_aligned, L_int_aligned, L_short_aligned; 1776 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1777 1778 __ align(CodeEntryAlignment); 1779 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 1780 StubCodeMark mark(this, stub_id); 1781 address start = __ pc(); 1782 __ enter(); // required for proper stackwalking of RuntimeStub frame 1783 1784 // bump this on entry, not on exit: 1785 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1786 1787 __ orr(t0, s, d); 1788 __ orr(t0, t0, count); 1789 1790 __ andi(t0, t0, BytesPerLong - 1); 1791 __ beqz(t0, L_long_aligned); 1792 __ andi(t0, t0, BytesPerInt - 1); 1793 __ beqz(t0, L_int_aligned); 1794 __ test_bit(t0, t0, 0); 1795 __ beqz(t0, L_short_aligned); 1796 __ j(RuntimeAddress(byte_copy_entry)); 1797 1798 __ BIND(L_short_aligned); 1799 __ srli(count, count, LogBytesPerShort); // size => short_count 1800 __ j(RuntimeAddress(short_copy_entry)); 1801 __ BIND(L_int_aligned); 1802 __ srli(count, count, LogBytesPerInt); // size => int_count 1803 __ j(RuntimeAddress(int_copy_entry)); 1804 __ BIND(L_long_aligned); 1805 __ srli(count, count, LogBytesPerLong); // size => long_count 1806 __ j(RuntimeAddress(long_copy_entry)); 1807 1808 return start; 1809 } 1810 1811 // 1812 // Generate generic array copy stubs 1813 // 1814 // Input: 1815 // c_rarg0 - src oop 1816 // c_rarg1 - src_pos (32-bits) 1817 // c_rarg2 - dst oop 1818 // c_rarg3 - dst_pos (32-bits) 1819 // c_rarg4 - element count (32-bits) 1820 // 1821 // Output: 1822 // x10 == 0 - success 1823 // x10 == -1^K - failure, where K 
is partial transfer count 1824 // 1825 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 1826 address int_copy_entry, address oop_copy_entry, 1827 address long_copy_entry, address checkcast_copy_entry) { 1828 assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr && 1829 int_copy_entry != nullptr && oop_copy_entry != nullptr && 1830 long_copy_entry != nullptr && checkcast_copy_entry != nullptr); 1831 Label L_failed, L_failed_0, L_objArray; 1832 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1833 1834 // Input registers 1835 const Register src = c_rarg0; // source array oop 1836 const Register src_pos = c_rarg1; // source position 1837 const Register dst = c_rarg2; // destination array oop 1838 const Register dst_pos = c_rarg3; // destination position 1839 const Register length = c_rarg4; 1840 1841 // Registers used as temps 1842 const Register dst_klass = c_rarg5; 1843 1844 __ align(CodeEntryAlignment); 1845 1846 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 1847 StubCodeMark mark(this, stub_id); 1848 1849 address start = __ pc(); 1850 1851 __ enter(); // required for proper stackwalking of RuntimeStub frame 1852 1853 // bump this on entry, not on exit: 1854 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1855 1856 //----------------------------------------------------------------------- 1857 // Assembler stub will be used for this call to arraycopy 1858 // if the following conditions are met: 1859 // 1860 // (1) src and dst must not be null. 1861 // (2) src_pos must not be negative. 1862 // (3) dst_pos must not be negative. 1863 // (4) length must not be negative. 1864 // (5) src klass and dst klass should be the same and not null. 1865 // (6) src and dst should be arrays. 1866 // (7) src_pos + length must not exceed length of src. 1867 // (8) dst_pos + length must not exceed length of dst. 1868 // 1869 1870 // if src is null then return -1 1871 __ beqz(src, L_failed); 1872 1873 // if [src_pos < 0] then return -1 1874 __ sext(t0, src_pos, 32); 1875 __ bltz(t0, L_failed); 1876 1877 // if dst is null then return -1 1878 __ beqz(dst, L_failed); 1879 1880 // if [dst_pos < 0] then return -1 1881 __ sext(t0, dst_pos, 32); 1882 __ bltz(t0, L_failed); 1883 1884 // registers used as temp 1885 const Register scratch_length = x28; // elements count to copy 1886 const Register scratch_src_klass = x29; // array klass 1887 const Register lh = x30; // layout helper 1888 1889 // if [length < 0] then return -1 1890 __ sext(scratch_length, length, 32); // length (elements count, 32-bits value) 1891 __ bltz(scratch_length, L_failed); 1892 1893 __ load_klass(scratch_src_klass, src); 1894 #ifdef ASSERT 1895 { 1896 BLOCK_COMMENT("assert klasses not null {"); 1897 Label L1, L2; 1898 __ bnez(scratch_src_klass, L2); // it is broken if klass is null 1899 __ bind(L1); 1900 __ stop("broken null klass"); 1901 __ bind(L2); 1902 __ load_klass(t0, dst, t1); 1903 __ beqz(t0, L1); // this would be broken also 1904 BLOCK_COMMENT("} assert klasses not null done"); 1905 } 1906 #endif 1907 1908 // Load layout helper (32-bits) 1909 // 1910 // |array_tag| | header_size | element_type | |log2_element_size| 1911 // 32 30 24 16 8 2 0 1912 // 1913 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 1914 // 1915 1916 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 1917 1918 // Handle objArrays completely differently... 
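    // For reference: array_layout_helper(T_OBJECT) packs array_tag 0x2 into bits 30..31,
    // the objArray header size into bits 16..23, T_OBJECT into bits 8..15 and the log2
    // element size into the low bits (see the diagram above), so the single 32-bit
    // compare below is enough to recognize an object array. The concrete constant depends
    // on the compressed oops/headers configuration, which is why it is computed here at
    // stub-generation time rather than hard-coded.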
1919     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1920     __ lw(lh, Address(scratch_src_klass, lh_offset));
1921     __ mv(t0, objArray_lh);
1922     __ beq(lh, t0, L_objArray);
1923
1924     // if [src->klass() != dst->klass()] then return -1
1925     __ load_klass(t1, dst);
1926     __ bne(t1, scratch_src_klass, L_failed);
1927
1928     // if (!src->is_Array()) then return -1,
1929     // i.e. if (lh >= 0)
1930     __ bgez(lh, L_failed);
1931
1932     // At this point, it is known to be a typeArray (array_tag 0x3).
1933 #ifdef ASSERT
1934     {
1935       BLOCK_COMMENT("assert primitive array {");
1936       Label L;
1937       __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1938       __ bge(lh, t1, L);
1939       __ stop("must be a primitive array");
1940       __ bind(L);
1941       BLOCK_COMMENT("} assert primitive array done");
1942     }
1943 #endif
1944
1945     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1946                            t1, L_failed);
1947
1948     // TypeArrayKlass
1949     //
1950     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1951     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1952     //
1953
1954     const Register t0_offset = t0;    // array offset
1955     const Register x30_elsize = lh;   // element size
1956
1957     // Get array_header_in_bytes()
1958     int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1959     int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
1960     __ slli(t0_offset, lh, XLEN - lh_header_size_msb);          // left shift to drop bits 24 and above
1961     __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset
1962
1963     __ add(src, src, t0_offset);           // src array offset
1964     __ add(dst, dst, t0_offset);           // dst array offset
1965     BLOCK_COMMENT("choose copy loop based on element size");
1966
1967     // next registers should be set before the jump to corresponding stub
1968     const Register from  = c_rarg0; // source array address
1969     const Register to    = c_rarg1; // destination array address
1970     const Register count = c_rarg2; // elements count
1971
1972     // 'from', 'to' and 'count' must be set in this order,
1973     // since they alias 'src', 'src_pos' and 'dst'.
1974
1975     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1976
1977     // The possible values of elsize are 0-3, i.e. exact_log2(element
1978     // size in bytes). We do a simple bitwise binary search.
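    // For example, for an int[] the log2_element_size field (the low bits of the layout
    // helper) is 2 == 0b10: bit 1 is set, so we branch to L_copy_ints, where bit 0 is
    // found clear and we dispatch to the int copy loop rather than branching on to
    // L_copy_longs.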
1979 __ BIND(L_copy_bytes); 1980 __ test_bit(t0, x30_elsize, 1); 1981 __ bnez(t0, L_copy_ints); 1982 __ test_bit(t0, x30_elsize, 0); 1983 __ bnez(t0, L_copy_shorts); 1984 __ add(from, src, src_pos); // src_addr 1985 __ add(to, dst, dst_pos); // dst_addr 1986 __ sext(count, scratch_length, 32); // length 1987 __ j(RuntimeAddress(byte_copy_entry)); 1988 1989 __ BIND(L_copy_shorts); 1990 __ shadd(from, src_pos, src, t0, 1); // src_addr 1991 __ shadd(to, dst_pos, dst, t0, 1); // dst_addr 1992 __ sext(count, scratch_length, 32); // length 1993 __ j(RuntimeAddress(short_copy_entry)); 1994 1995 __ BIND(L_copy_ints); 1996 __ test_bit(t0, x30_elsize, 0); 1997 __ bnez(t0, L_copy_longs); 1998 __ shadd(from, src_pos, src, t0, 2); // src_addr 1999 __ shadd(to, dst_pos, dst, t0, 2); // dst_addr 2000 __ sext(count, scratch_length, 32); // length 2001 __ j(RuntimeAddress(int_copy_entry)); 2002 2003 __ BIND(L_copy_longs); 2004 #ifdef ASSERT 2005 { 2006 BLOCK_COMMENT("assert long copy {"); 2007 Label L; 2008 __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize 2009 __ sext(lh, lh, 32); 2010 __ mv(t0, LogBytesPerLong); 2011 __ beq(x30_elsize, t0, L); 2012 __ stop("must be long copy, but elsize is wrong"); 2013 __ bind(L); 2014 BLOCK_COMMENT("} assert long copy done"); 2015 } 2016 #endif 2017 __ shadd(from, src_pos, src, t0, 3); // src_addr 2018 __ shadd(to, dst_pos, dst, t0, 3); // dst_addr 2019 __ sext(count, scratch_length, 32); // length 2020 __ j(RuntimeAddress(long_copy_entry)); 2021 2022 // ObjArrayKlass 2023 __ BIND(L_objArray); 2024 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2025 2026 Label L_plain_copy, L_checkcast_copy; 2027 // test array classes for subtyping 2028 __ load_klass(t2, dst); 2029 __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality 2030 2031 // Identically typed arrays can be copied without element-wise checks. 2032 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2033 t1, L_failed); 2034 2035 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop); 2036 __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2037 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop); 2038 __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2039 __ sext(count, scratch_length, 32); // length 2040 __ BIND(L_plain_copy); 2041 __ j(RuntimeAddress(oop_copy_entry)); 2042 2043 __ BIND(L_checkcast_copy); 2044 // live at this point: scratch_src_klass, scratch_length, t2 (dst_klass) 2045 { 2046 // Before looking at dst.length, make sure dst is also an objArray. 2047 __ lwu(t0, Address(t2, lh_offset)); 2048 __ mv(t1, objArray_lh); 2049 __ bne(t0, t1, L_failed); 2050 2051 // It is safe to examine both src.length and dst.length. 2052 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2053 t2, L_failed); 2054 2055 __ load_klass(dst_klass, dst); // reload 2056 2057 // Marshal the base address arguments now, freeing registers. 2058 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop); 2059 __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2060 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop); 2061 __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2062 __ sext(count, length, 32); // length (reloaded) 2063 const Register sco_temp = c_rarg3; // this register is free now 2064 assert_different_registers(from, to, count, sco_temp, 2065 dst_klass, scratch_src_klass); 2066 2067 // Generate the type check. 
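    // If the source array klass is a subtype of the destination array klass (for
    // example, copying a String[] into an Object[]), every element is storable without
    // per-element checks and generate_type_check below branches back to L_plain_copy;
    // otherwise we fall through and hand off to the checkcast copy stub with the
    // destination's element klass (and its super_check_offset) as extra arguments.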
2068 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2069 __ lwu(sco_temp, Address(dst_klass, sco_offset)); 2070 2071 // Smashes t0, t1 2072 generate_type_check(scratch_src_klass, sco_temp, dst_klass, noreg, noreg, noreg, L_plain_copy); 2073 2074 // Fetch destination element klass from the ObjArrayKlass header. 2075 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2076 __ ld(dst_klass, Address(dst_klass, ek_offset)); 2077 __ lwu(sco_temp, Address(dst_klass, sco_offset)); 2078 2079 // the checkcast_copy loop needs two extra arguments: 2080 assert(c_rarg3 == sco_temp, "#3 already in place"); 2081 // Set up arguments for checkcast_copy_entry. 2082 __ mv(c_rarg4, dst_klass); // dst.klass.element_klass 2083 __ j(RuntimeAddress(checkcast_copy_entry)); 2084 } 2085 2086 __ BIND(L_failed); 2087 __ mv(x10, -1); 2088 __ leave(); // required for proper stackwalking of RuntimeStub frame 2089 __ ret(); 2090 2091 return start; 2092 } 2093 2094 // 2095 // Generate stub for array fill. If "aligned" is true, the 2096 // "to" address is assumed to be heapword aligned. 2097 // 2098 // Arguments for generated stub: 2099 // to: c_rarg0 2100 // value: c_rarg1 2101 // count: c_rarg2 treated as signed 2102 // 2103 address generate_fill(StubGenStubId stub_id) { 2104 BasicType t; 2105 bool aligned; 2106 2107 switch (stub_id) { 2108 case jbyte_fill_id: 2109 t = T_BYTE; 2110 aligned = false; 2111 break; 2112 case jshort_fill_id: 2113 t = T_SHORT; 2114 aligned = false; 2115 break; 2116 case jint_fill_id: 2117 t = T_INT; 2118 aligned = false; 2119 break; 2120 case arrayof_jbyte_fill_id: 2121 t = T_BYTE; 2122 aligned = true; 2123 break; 2124 case arrayof_jshort_fill_id: 2125 t = T_SHORT; 2126 aligned = true; 2127 break; 2128 case arrayof_jint_fill_id: 2129 t = T_INT; 2130 aligned = true; 2131 break; 2132 default: 2133 ShouldNotReachHere(); 2134 }; 2135 2136 __ align(CodeEntryAlignment); 2137 StubCodeMark mark(this, stub_id); 2138 address start = __ pc(); 2139 2140 BLOCK_COMMENT("Entry:"); 2141 2142 const Register to = c_rarg0; // source array address 2143 const Register value = c_rarg1; // value 2144 const Register count = c_rarg2; // elements count 2145 2146 const Register bz_base = x28; // base for block_zero routine 2147 const Register cnt_words = x29; // temp register 2148 const Register tmp_reg = t1; 2149 2150 __ enter(); 2151 2152 Label L_fill_elements; 2153 2154 int shift = -1; 2155 switch (t) { 2156 case T_BYTE: 2157 shift = 0; 2158 // Short arrays (< 8 bytes) fill by element 2159 __ mv(tmp_reg, 8 >> shift); 2160 __ bltu(count, tmp_reg, L_fill_elements); 2161 2162 // Zero extend value 2163 // 8 bit -> 16 bit 2164 __ zext(value, value, 8); 2165 __ slli(tmp_reg, value, 8); 2166 __ orr(value, value, tmp_reg); 2167 2168 // 16 bit -> 32 bit 2169 __ slli(tmp_reg, value, 16); 2170 __ orr(value, value, tmp_reg); 2171 break; 2172 case T_SHORT: 2173 shift = 1; 2174 // Short arrays (< 8 bytes) fill by element 2175 __ mv(tmp_reg, 8 >> shift); 2176 __ bltu(count, tmp_reg, L_fill_elements); 2177 2178 // Zero extend value 2179 // 16 bit -> 32 bit 2180 __ zext(value, value, 16); 2181 __ slli(tmp_reg, value, 16); 2182 __ orr(value, value, tmp_reg); 2183 break; 2184 case T_INT: 2185 shift = 2; 2186 // Short arrays (< 8 bytes) fill by element 2187 __ mv(tmp_reg, 8 >> shift); 2188 __ bltu(count, tmp_reg, L_fill_elements); 2189 break; 2190 default: ShouldNotReachHere(); 2191 } 2192 2193 // Align source address at 8 bytes address boundary. 
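    // (The address being aligned here is the fill destination 'to'.) Note that the
    // count adjustments below are scaled by the element size: for a jshort fill
    // (shift == 1) the 2-byte store consumes one element (2 >> shift) and the 4-byte
    // store two, while a jint fill needs at most the single 4-byte step.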
2194 Label L_skip_align1, L_skip_align2, L_skip_align4; 2195 if (!aligned) { 2196 switch (t) { 2197 case T_BYTE: 2198 // One byte misalignment happens only for byte arrays. 2199 __ test_bit(tmp_reg, to, 0); 2200 __ beqz(tmp_reg, L_skip_align1); 2201 __ sb(value, Address(to, 0)); 2202 __ addi(to, to, 1); 2203 __ subiw(count, count, 1); 2204 __ bind(L_skip_align1); 2205 // Fallthrough 2206 case T_SHORT: 2207 // Two bytes misalignment happens only for byte and short (char) arrays. 2208 __ test_bit(tmp_reg, to, 1); 2209 __ beqz(tmp_reg, L_skip_align2); 2210 __ sh(value, Address(to, 0)); 2211 __ addi(to, to, 2); 2212 __ subiw(count, count, 2 >> shift); 2213 __ bind(L_skip_align2); 2214 // Fallthrough 2215 case T_INT: 2216 // Align to 8 bytes, we know we are 4 byte aligned to start. 2217 __ test_bit(tmp_reg, to, 2); 2218 __ beqz(tmp_reg, L_skip_align4); 2219 __ sw(value, Address(to, 0)); 2220 __ addi(to, to, 4); 2221 __ subiw(count, count, 4 >> shift); 2222 __ bind(L_skip_align4); 2223 break; 2224 default: ShouldNotReachHere(); 2225 } 2226 } 2227 2228 // 2229 // Fill large chunks 2230 // 2231 __ srliw(cnt_words, count, 3 - shift); // number of words 2232 2233 // 32 bit -> 64 bit 2234 __ zext(value, value, 32); 2235 __ slli(tmp_reg, value, 32); 2236 __ orr(value, value, tmp_reg); 2237 2238 __ slli(tmp_reg, cnt_words, 3 - shift); 2239 __ subw(count, count, tmp_reg); 2240 { 2241 __ fill_words(to, cnt_words, value); 2242 } 2243 2244 // Handle copies less than 8 bytes. 2245 // Address may not be heapword aligned. 2246 Label L_fill_1, L_fill_2, L_exit; 2247 __ bind(L_fill_elements); 2248 switch (t) { 2249 case T_BYTE: 2250 __ test_bit(tmp_reg, count, 2); 2251 __ beqz(tmp_reg, L_fill_2); 2252 __ sb(value, Address(to, 0)); 2253 __ sb(value, Address(to, 1)); 2254 __ sb(value, Address(to, 2)); 2255 __ sb(value, Address(to, 3)); 2256 __ addi(to, to, 4); 2257 2258 __ bind(L_fill_2); 2259 __ test_bit(tmp_reg, count, 1); 2260 __ beqz(tmp_reg, L_fill_1); 2261 __ sb(value, Address(to, 0)); 2262 __ sb(value, Address(to, 1)); 2263 __ addi(to, to, 2); 2264 2265 __ bind(L_fill_1); 2266 __ test_bit(tmp_reg, count, 0); 2267 __ beqz(tmp_reg, L_exit); 2268 __ sb(value, Address(to, 0)); 2269 break; 2270 case T_SHORT: 2271 __ test_bit(tmp_reg, count, 1); 2272 __ beqz(tmp_reg, L_fill_2); 2273 __ sh(value, Address(to, 0)); 2274 __ sh(value, Address(to, 2)); 2275 __ addi(to, to, 4); 2276 2277 __ bind(L_fill_2); 2278 __ test_bit(tmp_reg, count, 0); 2279 __ beqz(tmp_reg, L_exit); 2280 __ sh(value, Address(to, 0)); 2281 break; 2282 case T_INT: 2283 __ beqz(count, L_exit); 2284 __ sw(value, Address(to, 0)); 2285 break; 2286 default: ShouldNotReachHere(); 2287 } 2288 __ bind(L_exit); 2289 __ leave(); 2290 __ ret(); 2291 2292 return start; 2293 } 2294 2295 void generate_arraycopy_stubs() { 2296 address entry = nullptr; 2297 address entry_jbyte_arraycopy = nullptr; 2298 address entry_jshort_arraycopy = nullptr; 2299 address entry_jint_arraycopy = nullptr; 2300 address entry_oop_arraycopy = nullptr; 2301 address entry_jlong_arraycopy = nullptr; 2302 address entry_checkcast_arraycopy = nullptr; 2303 2304 generate_copy_longs(StubGenStubId::copy_byte_f_id, copy_f, c_rarg0, c_rarg1, t1); 2305 generate_copy_longs(StubGenStubId::copy_byte_b_id, copy_b, c_rarg0, c_rarg1, t1); 2306 2307 address ucm_common_error_exit = generate_unsafecopy_common_error_exit(); 2308 UnsafeMemoryAccess::set_common_exit_stub_pc(ucm_common_error_exit); 2309 2310 StubRoutines::riscv::_zero_blocks = generate_zero_blocks(); 2311 2312 //*** jbyte 2313 // Always need 
aligned and unaligned versions 2314 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry); 2315 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2316 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry); 2317 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr); 2318 2319 //*** jshort 2320 // Always need aligned and unaligned versions 2321 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry); 2322 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2323 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry); 2324 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr); 2325 2326 //*** jint 2327 // Aligned versions 2328 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry); 2329 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2330 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2331 // entry_jint_arraycopy always points to the unaligned version 2332 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry); 2333 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy); 2334 2335 //*** jlong 2336 // It is always aligned 2337 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry); 2338 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2339 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2340 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2341 2342 //*** oops 2343 StubRoutines::_arrayof_oop_disjoint_arraycopy 2344 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry); 2345 StubRoutines::_arrayof_oop_arraycopy 2346 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2347 // Aligned versions without pre-barriers 2348 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2349 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2350 StubRoutines::_arrayof_oop_arraycopy_uninit 2351 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2352 2353 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2354 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2355 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2356 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2357 2358 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2359 
StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr); 2360 2361 2362 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2363 entry_jshort_arraycopy, 2364 entry_jint_arraycopy, 2365 entry_jlong_arraycopy); 2366 2367 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2368 entry_jshort_arraycopy, 2369 entry_jint_arraycopy, 2370 entry_oop_arraycopy, 2371 entry_jlong_arraycopy, 2372 entry_checkcast_arraycopy); 2373 2374 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 2375 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 2376 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 2377 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 2378 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 2379 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 2380 2381 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory(); 2382 } 2383 2384 void generate_aes_loadkeys(const Register &key, VectorRegister *working_vregs, int rounds) { 2385 const int step = 16; 2386 for (int i = 0; i < rounds; i++) { 2387 __ vle32_v(working_vregs[i], key); 2388 // The keys are stored in little-endian array, while we need 2389 // to operate in big-endian. 2390 // So performing an endian-swap here with vrev8.v instruction 2391 __ vrev8_v(working_vregs[i], working_vregs[i]); 2392 __ addi(key, key, step); 2393 } 2394 } 2395 2396 void generate_aes_encrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) { 2397 assert(rounds <= 15, "rounds should be less than or equal to working_vregs size"); 2398 2399 __ vxor_vv(res, res, working_vregs[0]); 2400 for (int i = 1; i < rounds - 1; i++) { 2401 __ vaesem_vv(res, working_vregs[i]); 2402 } 2403 __ vaesef_vv(res, working_vregs[rounds - 1]); 2404 } 2405 2406 // Arguments: 2407 // 2408 // Inputs: 2409 // c_rarg0 - source byte array address 2410 // c_rarg1 - destination byte array address 2411 // c_rarg2 - K (key) in little endian int array 2412 // 2413 address generate_aescrypt_encryptBlock() { 2414 assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support"); 2415 2416 __ align(CodeEntryAlignment); 2417 StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2418 StubCodeMark mark(this, stub_id); 2419 2420 Label L_aes128, L_aes192; 2421 2422 const Register from = c_rarg0; // source array address 2423 const Register to = c_rarg1; // destination array address 2424 const Register key = c_rarg2; // key array address 2425 const Register keylen = c_rarg3; 2426 2427 VectorRegister working_vregs[] = { 2428 v4, v5, v6, v7, v8, v9, v10, v11, 2429 v12, v13, v14, v15, v16, v17, v18 2430 }; 2431 const VectorRegister res = v19; 2432 2433 address start = __ pc(); 2434 __ enter(); 2435 2436 __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2437 2438 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1); 2439 __ vle32_v(res, from); 2440 2441 __ mv(t2, 52); 2442 __ blt(keylen, t2, L_aes128); 2443 __ beq(keylen, t2, L_aes192); 2444 // Else we fallthrough to the biggest case (256-bit key size) 2445 2446 // Note: the following function performs key += 15*16 2447 generate_aes_loadkeys(key, working_vregs, 15); 2448 generate_aes_encrypt(res, working_vregs, 15); 2449 __ vse32_v(res, to); 2450 __ mv(c_rarg0, 
0); 2451 __ leave(); 2452 __ ret(); 2453 2454 __ bind(L_aes192); 2455 // Note: the following function performs key += 13*16 2456 generate_aes_loadkeys(key, working_vregs, 13); 2457 generate_aes_encrypt(res, working_vregs, 13); 2458 __ vse32_v(res, to); 2459 __ mv(c_rarg0, 0); 2460 __ leave(); 2461 __ ret(); 2462 2463 __ bind(L_aes128); 2464 // Note: the following function performs key += 11*16 2465 generate_aes_loadkeys(key, working_vregs, 11); 2466 generate_aes_encrypt(res, working_vregs, 11); 2467 __ vse32_v(res, to); 2468 __ mv(c_rarg0, 0); 2469 __ leave(); 2470 __ ret(); 2471 2472 return start; 2473 } 2474 2475 void generate_aes_decrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) { 2476 assert(rounds <= 15, "rounds should be less than or equal to working_vregs size"); 2477 2478 __ vxor_vv(res, res, working_vregs[rounds - 1]); 2479 for (int i = rounds - 2; i > 0; i--) { 2480 __ vaesdm_vv(res, working_vregs[i]); 2481 } 2482 __ vaesdf_vv(res, working_vregs[0]); 2483 } 2484 2485 // Arguments: 2486 // 2487 // Inputs: 2488 // c_rarg0 - source byte array address 2489 // c_rarg1 - destination byte array address 2490 // c_rarg2 - K (key) in little endian int array 2491 // 2492 address generate_aescrypt_decryptBlock() { 2493 assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support"); 2494 2495 __ align(CodeEntryAlignment); 2496 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2497 StubCodeMark mark(this, stub_id); 2498 2499 Label L_aes128, L_aes192; 2500 2501 const Register from = c_rarg0; // source array address 2502 const Register to = c_rarg1; // destination array address 2503 const Register key = c_rarg2; // key array address 2504 const Register keylen = c_rarg3; 2505 2506 VectorRegister working_vregs[] = { 2507 v4, v5, v6, v7, v8, v9, v10, v11, 2508 v12, v13, v14, v15, v16, v17, v18 2509 }; 2510 const VectorRegister res = v19; 2511 2512 address start = __ pc(); 2513 __ enter(); // required for proper stackwalking of RuntimeStub frame 2514 2515 __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2516 2517 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1); 2518 __ vle32_v(res, from); 2519 2520 __ mv(t2, 52); 2521 __ blt(keylen, t2, L_aes128); 2522 __ beq(keylen, t2, L_aes192); 2523 // Else we fallthrough to the biggest case (256-bit key size) 2524 2525 // Note: the following function performs key += 15*16 2526 generate_aes_loadkeys(key, working_vregs, 15); 2527 generate_aes_decrypt(res, working_vregs, 15); 2528 __ vse32_v(res, to); 2529 __ mv(c_rarg0, 0); 2530 __ leave(); 2531 __ ret(); 2532 2533 __ bind(L_aes192); 2534 // Note: the following function performs key += 13*16 2535 generate_aes_loadkeys(key, working_vregs, 13); 2536 generate_aes_decrypt(res, working_vregs, 13); 2537 __ vse32_v(res, to); 2538 __ mv(c_rarg0, 0); 2539 __ leave(); 2540 __ ret(); 2541 2542 __ bind(L_aes128); 2543 // Note: the following function performs key += 11*16 2544 generate_aes_loadkeys(key, working_vregs, 11); 2545 generate_aes_decrypt(res, working_vregs, 11); 2546 __ vse32_v(res, to); 2547 __ mv(c_rarg0, 0); 2548 __ leave(); 2549 __ ret(); 2550 2551 return start; 2552 } 2553 2554 // code for comparing 8 characters of strings with Latin1 and Utf16 encoding 2555 void compare_string_8_x_LU(Register tmpL, Register tmpU, 2556 Register strL, Register strU, Label& DIFF) { 2557 const Register tmp = x30, tmpLval = x12; 2558 2559 int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE); 2560 
assert((base_offset % (UseCompactObjectHeaders ? 4 : 2561 (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be"); 2562 2563 #ifdef ASSERT 2564 if (AvoidUnalignedAccesses) { 2565 Label align_ok; 2566 __ andi(t0, strL, 0x7); 2567 __ beqz(t0, align_ok); 2568 __ stop("bad alignment"); 2569 __ bind(align_ok); 2570 } 2571 #endif 2572 __ ld(tmpLval, Address(strL)); 2573 __ addi(strL, strL, wordSize); 2574 2575 // compare first 4 characters 2576 __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8); 2577 __ addi(strU, strU, wordSize); 2578 __ inflate_lo32(tmpL, tmpLval); 2579 __ xorr(tmp, tmpU, tmpL); 2580 __ bnez(tmp, DIFF); 2581 2582 // compare second 4 characters 2583 __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8); 2584 __ addi(strU, strU, wordSize); 2585 __ inflate_hi32(tmpL, tmpLval); 2586 __ xorr(tmp, tmpU, tmpL); 2587 __ bnez(tmp, DIFF); 2588 } 2589 2590 // x10 = result 2591 // x11 = str1 2592 // x12 = cnt1 2593 // x13 = str2 2594 // x14 = cnt2 2595 // x28 = tmp1 2596 // x29 = tmp2 2597 // x30 = tmp3 2598 address generate_compare_long_string_different_encoding(StubGenStubId stub_id) { 2599 bool isLU; 2600 switch (stub_id) { 2601 case compare_long_string_LU_id: 2602 isLU = true; 2603 break; 2604 case compare_long_string_UL_id: 2605 isLU = false; 2606 break; 2607 default: 2608 ShouldNotReachHere(); 2609 }; 2610 __ align(CodeEntryAlignment); 2611 StubCodeMark mark(this, stub_id); 2612 address entry = __ pc(); 2613 Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE; 2614 const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14, 2615 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12; 2616 2617 int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE); 2618 assert((base_offset % (UseCompactObjectHeaders ? 4 : 2619 (UseCompressedClassPointers ? 8 : 4))) == 0, "Must be"); 2620 2621 Register strU = isLU ? str2 : str1, 2622 strL = isLU ? str1 : str2, 2623 tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison 2624 tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison 2625 2626 if (AvoidUnalignedAccesses && (base_offset % 8) != 0) { 2627 // Load 4 bytes from strL to make sure main loop is 8-byte aligned 2628 // cnt2 is >= 68 here, no need to check it for >= 0 2629 __ lwu(tmpL, Address(strL)); 2630 __ addi(strL, strL, wordSize / 2); 2631 __ load_long_misaligned(tmpU, Address(strU), tmp4, (base_offset % 8) != 0 ? 4 : 8); 2632 __ addi(strU, strU, wordSize); 2633 __ inflate_lo32(tmp3, tmpL); 2634 __ mv(tmpL, tmp3); 2635 __ xorr(tmp3, tmpU, tmpL); 2636 __ bnez(tmp3, CALCULATE_DIFFERENCE); 2637 __ subi(cnt2, cnt2, wordSize / 2); 2638 } 2639 2640 // we are now 8-bytes aligned on strL when AvoidUnalignedAccesses is true 2641 __ subi(cnt2, cnt2, wordSize * 2); 2642 __ bltz(cnt2, TAIL); 2643 __ bind(SMALL_LOOP); // smaller loop 2644 __ subi(cnt2, cnt2, wordSize * 2); 2645 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE); 2646 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE); 2647 __ bgez(cnt2, SMALL_LOOP); 2648 __ addi(t0, cnt2, wordSize * 2); 2649 __ beqz(t0, DONE); 2650 __ bind(TAIL); // 1..15 characters left 2651 // Aligned access. Load bytes in portions - 4, 2, 1. 
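    // If at least 8 characters remain, one more compare_string_8_x_LU is done; the
    // final 1..7 characters are then handled at LOAD_LAST by re-reading the last 8
    // Latin1 bytes (16 UTF-16 bytes) of the strings, overlapping characters that are
    // already known to be equal.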
2652
2653     __ addi(t0, cnt2, wordSize);
2654     __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
2655     __ bltz(t0, LOAD_LAST);
2656     // at least 8 characters remain, so we can do one more compare_string_8_x_LU
2657     compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
2658     __ subi(cnt2, cnt2, wordSize);
2659     __ beqz(cnt2, DONE); // no characters left
2660     __ bind(LOAD_LAST);  // cnt2 = 1..7 characters left
2661
2662     __ subi(cnt2, cnt2, wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
2663     __ slli(t0, cnt2, 1);          // t0 is now an offset in strU which points to last 16 bytes
2664     __ add(strL, strL, cnt2);      // Address of last 8 bytes in Latin1 string
2665     __ add(strU, strU, t0);        // Address of last 16 bytes in UTF-16 string
2666     __ load_int_misaligned(tmpL, Address(strL), t0, false);
2667     __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2668     __ inflate_lo32(tmp3, tmpL);
2669     __ mv(tmpL, tmp3);
2670     __ xorr(tmp3, tmpU, tmpL);
2671     __ bnez(tmp3, CALCULATE_DIFFERENCE);
2672
2673     __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
2674     __ addi(strU, strU, wordSize);     // Address of last 8 bytes in UTF-16 string
2675     __ load_int_misaligned(tmpL, Address(strL), t0, false);
2676     __ load_long_misaligned(tmpU, Address(strU), t0, 2);
2677     __ inflate_lo32(tmp3, tmpL);
2678     __ mv(tmpL, tmp3);
2679     __ xorr(tmp3, tmpU, tmpL);
2680     __ bnez(tmp3, CALCULATE_DIFFERENCE);
2681     __ j(DONE); // no characters left
2682
2683     // Find the first different characters in the longwords and
2684     // compute their difference.
2685     __ bind(CALCULATE_DIFFERENCE);
2686     // count bits of trailing zero chars
2687     __ ctzc_bits(tmp4, tmp3);
2688     __ srl(tmp1, tmp1, tmp4);
2689     __ srl(tmp2, tmp2, tmp4);
2690     __ zext(tmp1, tmp1, 16);
2691     __ zext(tmp2, tmp2, 16);
2692     __ sub(result, tmp1, tmp2);
2693     __ bind(DONE);
2694     __ ret();
2695     return entry;
2696   }
2697
2698   address generate_method_entry_barrier() {
2699     __ align(CodeEntryAlignment);
2700     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
2701     StubCodeMark mark(this, stub_id);
2702
2703     Label deoptimize_label;
2704
2705     address start = __ pc();
2706
2707     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
2708
2709     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
2710       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
2711       Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
2712       __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
2713       __ lwu(t1, t1);
2714       __ sw(t1, thread_epoch_addr);
2715       // There are two ways this can work:
2716       // - The writer did system icache shootdown after the instruction stream update.
2717       //   Hence do nothing.
2718       // - The writer trusts us to make sure our icache is in sync before entering.
2719       //   Hence use cmodx fence (fence.i, may change).
2720 if (UseCtxFencei) { 2721 __ cmodx_fence(); 2722 } 2723 __ membar(__ LoadLoad); 2724 } 2725 2726 __ set_last_Java_frame(sp, fp, ra); 2727 2728 __ enter(); 2729 __ addi(t1, sp, wordSize); 2730 2731 __ subi(sp, sp, 4 * wordSize); 2732 2733 __ push_call_clobbered_registers(); 2734 2735 __ mv(c_rarg0, t1); 2736 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 2737 2738 __ reset_last_Java_frame(true); 2739 2740 __ mv(t0, x10); 2741 2742 __ pop_call_clobbered_registers(); 2743 2744 __ bnez(t0, deoptimize_label); 2745 2746 __ leave(); 2747 __ ret(); 2748 2749 __ BIND(deoptimize_label); 2750 2751 __ ld(t0, Address(sp, 0)); 2752 __ ld(fp, Address(sp, wordSize)); 2753 __ ld(ra, Address(sp, wordSize * 2)); 2754 __ ld(t1, Address(sp, wordSize * 3)); 2755 2756 __ mv(sp, t0); 2757 __ jr(t1); 2758 2759 return start; 2760 } 2761 2762 // x10 = result 2763 // x11 = str1 2764 // x12 = cnt1 2765 // x13 = str2 2766 // x14 = cnt2 2767 // x28 = tmp1 2768 // x29 = tmp2 2769 // x30 = tmp3 2770 // x31 = tmp4 2771 address generate_compare_long_string_same_encoding(StubGenStubId stub_id) { 2772 bool isLL; 2773 switch (stub_id) { 2774 case compare_long_string_LL_id: 2775 isLL = true; 2776 break; 2777 case compare_long_string_UU_id: 2778 isLL = false; 2779 break; 2780 default: 2781 ShouldNotReachHere(); 2782 }; 2783 __ align(CodeEntryAlignment); 2784 StubCodeMark mark(this, stub_id); 2785 address entry = __ pc(); 2786 Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL, 2787 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF; 2788 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14, 2789 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31; 2790 RegSet spilled_regs = RegSet::of(tmp4, tmp5); 2791 2792 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 2793 // update cnt2 counter with already loaded 8 bytes 2794 __ subi(cnt2, cnt2, wordSize / (isLL ? 1 : 2)); 2795 // update pointers, because of previous read 2796 __ addi(str1, str1, wordSize); 2797 __ addi(str2, str2, wordSize); 2798 // less than 16 bytes left? 2799 __ subi(cnt2, cnt2, isLL ? 16 : 8); 2800 __ push_reg(spilled_regs, sp); 2801 __ bltz(cnt2, TAIL); 2802 __ bind(SMALL_LOOP); 2803 // compare 16 bytes of strings with same encoding 2804 __ ld(tmp5, Address(str1)); 2805 __ addi(str1, str1, 8); 2806 __ xorr(tmp4, tmp1, tmp2); 2807 __ ld(cnt1, Address(str2)); 2808 __ addi(str2, str2, 8); 2809 __ bnez(tmp4, DIFF); 2810 __ ld(tmp1, Address(str1)); 2811 __ addi(str1, str1, 8); 2812 __ xorr(tmp4, tmp5, cnt1); 2813 __ ld(tmp2, Address(str2)); 2814 __ addi(str2, str2, 8); 2815 __ bnez(tmp4, DIFF2); 2816 2817 __ subi(cnt2, cnt2, isLL ? 16 : 8); 2818 __ bgez(cnt2, SMALL_LOOP); 2819 __ bind(TAIL); 2820 __ addi(cnt2, cnt2, isLL ? 16 : 8); 2821 __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); 2822 __ subi(cnt2, cnt2, isLL ? 8 : 4); 2823 __ blez(cnt2, CHECK_LAST); 2824 __ xorr(tmp4, tmp1, tmp2); 2825 __ bnez(tmp4, DIFF); 2826 __ ld(tmp1, Address(str1)); 2827 __ addi(str1, str1, 8); 2828 __ ld(tmp2, Address(str2)); 2829 __ addi(str2, str2, 8); 2830 __ subi(cnt2, cnt2, isLL ? 8 : 4); 2831 __ bind(CHECK_LAST); 2832 if (!isLL) { 2833 __ add(cnt2, cnt2, cnt2); // now in bytes 2834 } 2835 __ xorr(tmp4, tmp1, tmp2); 2836 __ bnez(tmp4, DIFF); 2837 __ add(str1, str1, cnt2); 2838 __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2); 2839 __ add(str2, str2, cnt2); 2840 __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 
1 : 2); 2841 __ xorr(tmp4, tmp5, cnt1); 2842 __ beqz(tmp4, LENGTH_DIFF); 2843 // Find the first different characters in the longwords and 2844 // compute their difference. 2845 __ bind(DIFF2); 2846 // count bits of trailing zero chars 2847 __ ctzc_bits(tmp3, tmp4, isLL); 2848 __ srl(tmp5, tmp5, tmp3); 2849 __ srl(cnt1, cnt1, tmp3); 2850 if (isLL) { 2851 __ zext(tmp5, tmp5, 8); 2852 __ zext(cnt1, cnt1, 8); 2853 } else { 2854 __ zext(tmp5, tmp5, 16); 2855 __ zext(cnt1, cnt1, 16); 2856 } 2857 __ sub(result, tmp5, cnt1); 2858 __ j(LENGTH_DIFF); 2859 __ bind(DIFF); 2860 // count bits of trailing zero chars 2861 __ ctzc_bits(tmp3, tmp4, isLL); 2862 __ srl(tmp1, tmp1, tmp3); 2863 __ srl(tmp2, tmp2, tmp3); 2864 if (isLL) { 2865 __ zext(tmp1, tmp1, 8); 2866 __ zext(tmp2, tmp2, 8); 2867 } else { 2868 __ zext(tmp1, tmp1, 16); 2869 __ zext(tmp2, tmp2, 16); 2870 } 2871 __ sub(result, tmp1, tmp2); 2872 __ j(LENGTH_DIFF); 2873 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 2874 __ xorr(tmp4, tmp1, tmp2); 2875 __ bnez(tmp4, DIFF); 2876 __ bind(LENGTH_DIFF); 2877 __ pop_reg(spilled_regs, sp); 2878 __ ret(); 2879 return entry; 2880 } 2881 2882 void generate_compare_long_strings() { 2883 StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(StubGenStubId::compare_long_string_LL_id); 2884 StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(StubGenStubId::compare_long_string_UU_id); 2885 StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(StubGenStubId::compare_long_string_LU_id); 2886 StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(StubGenStubId::compare_long_string_UL_id); 2887 } 2888 2889 // x10 result 2890 // x11 src 2891 // x12 src count 2892 // x13 pattern 2893 // x14 pattern count 2894 address generate_string_indexof_linear(StubGenStubId stub_id) 2895 { 2896 bool needle_isL; 2897 bool haystack_isL; 2898 switch (stub_id) { 2899 case string_indexof_linear_ll_id: 2900 needle_isL = true; 2901 haystack_isL = true; 2902 break; 2903 case string_indexof_linear_ul_id: 2904 needle_isL = true; 2905 haystack_isL = false; 2906 break; 2907 case string_indexof_linear_uu_id: 2908 needle_isL = false; 2909 haystack_isL = false; 2910 break; 2911 default: 2912 ShouldNotReachHere(); 2913 }; 2914 2915 __ align(CodeEntryAlignment); 2916 StubCodeMark mark(this, stub_id); 2917 address entry = __ pc(); 2918 2919 int needle_chr_size = needle_isL ? 1 : 2; 2920 int haystack_chr_size = haystack_isL ? 1 : 2; 2921 int needle_chr_shift = needle_isL ? 0 : 1; 2922 int haystack_chr_shift = haystack_isL ? 
0 : 1; 2923 bool isL = needle_isL && haystack_isL; 2924 // parameters 2925 Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14; 2926 // temporary registers 2927 Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25; 2928 // redefinitions 2929 Register ch1 = x28, ch2 = x29; 2930 RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29); 2931 2932 __ push_reg(spilled_regs, sp); 2933 2934 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 2935 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 2936 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 2937 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 2938 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 2939 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 2940 2941 __ ld(ch1, Address(needle)); 2942 __ ld(ch2, Address(haystack)); 2943 // src.length - pattern.length 2944 __ sub(haystack_len, haystack_len, needle_len); 2945 2946 // first is needle[0] 2947 __ zext(first, ch1, needle_isL ? 8 : 16); 2948 2949 uint64_t mask0101 = UCONST64(0x0101010101010101); 2950 uint64_t mask0001 = UCONST64(0x0001000100010001); 2951 __ mv(mask1, haystack_isL ? mask0101 : mask0001); 2952 __ mul(first, first, mask1); 2953 uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f); 2954 uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff); 2955 __ mv(mask2, haystack_isL ? mask7f7f : mask7fff); 2956 if (needle_isL != haystack_isL) { 2957 __ mv(tmp, ch1); 2958 } 2959 __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size - 1); 2960 __ blez(haystack_len, L_SMALL); 2961 2962 if (needle_isL != haystack_isL) { 2963 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros); 2964 } 2965 // xorr, sub, orr, notr, andr 2966 // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i] 2967 // eg: 2968 // first: aa aa aa aa aa aa aa aa 2969 // ch2: aa aa li nx jd ka aa aa 2970 // match_mask: 80 80 00 00 00 00 80 80 2971 __ compute_match_mask(ch2, first, match_mask, mask1, mask2); 2972 2973 // search first char of needle, if success, goto L_HAS_ZERO; 2974 __ bnez(match_mask, L_HAS_ZERO); 2975 __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size); 2976 __ addi(result, result, wordSize / haystack_chr_size); 2977 __ addi(haystack, haystack, wordSize); 2978 __ bltz(haystack_len, L_POST_LOOP); 2979 2980 __ bind(L_LOOP); 2981 __ ld(ch2, Address(haystack)); 2982 __ compute_match_mask(ch2, first, match_mask, mask1, mask2); 2983 __ bnez(match_mask, L_HAS_ZERO); 2984 2985 __ bind(L_LOOP_PROCEED); 2986 __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size); 2987 __ addi(haystack, haystack, wordSize); 2988 __ addi(result, result, wordSize / haystack_chr_size); 2989 __ bgez(haystack_len, L_LOOP); 2990 2991 __ bind(L_POST_LOOP); 2992 __ mv(ch2, -wordSize / haystack_chr_size); 2993 __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check 2994 __ ld(ch2, Address(haystack)); 2995 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift); 2996 __ neg(haystack_len, haystack_len); 2997 __ xorr(ch2, first, ch2); 2998 __ sub(match_mask, ch2, mask1); 2999 __ orr(ch2, ch2, mask2); 3000 __ mv(trailing_zeros, -1); // all bits set 3001 __ j(L_SMALL_PROCEED); 3002 3003 __ align(OptoLoopAlignment); 3004 __ bind(L_SMALL); 3005 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift); 3006 __ neg(haystack_len, haystack_len); 3007 if (needle_isL != haystack_isL) { 3008 __ inflate_lo32(ch1, tmp, 
match_mask, trailing_zeros); 3009 } 3010 __ xorr(ch2, first, ch2); 3011 __ sub(match_mask, ch2, mask1); 3012 __ orr(ch2, ch2, mask2); 3013 __ mv(trailing_zeros, -1); // all bits set 3014 3015 __ bind(L_SMALL_PROCEED); 3016 __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits. 3017 __ notr(ch2, ch2); 3018 __ andr(match_mask, match_mask, ch2); 3019 __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check 3020 __ beqz(match_mask, NOMATCH); 3021 3022 __ bind(L_SMALL_HAS_ZERO_LOOP); 3023 // count bits of trailing zero chars 3024 __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, ch2, tmp); 3025 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 3026 __ mv(ch2, wordSize / haystack_chr_size); 3027 __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2); 3028 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 3029 __ mv(trailing_zeros, wordSize / haystack_chr_size); 3030 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH); 3031 3032 __ bind(L_SMALL_CMP_LOOP); 3033 __ shadd(first, trailing_zeros, needle, first, needle_chr_shift); 3034 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift); 3035 needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first)); 3036 haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2)); 3037 __ addi(trailing_zeros, trailing_zeros, 1); 3038 __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP); 3039 __ beq(first, ch2, L_SMALL_CMP_LOOP); 3040 3041 __ bind(L_SMALL_CMP_LOOP_NOMATCH); 3042 __ beqz(match_mask, NOMATCH); 3043 // count bits of trailing zero chars 3044 __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2); 3045 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 3046 __ addi(result, result, 1); 3047 __ addi(haystack, haystack, haystack_chr_size); 3048 __ j(L_SMALL_HAS_ZERO_LOOP); 3049 3050 __ align(OptoLoopAlignment); 3051 __ bind(L_SMALL_CMP_LOOP_LAST_CMP); 3052 __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH); 3053 __ j(DONE); 3054 3055 __ align(OptoLoopAlignment); 3056 __ bind(L_SMALL_CMP_LOOP_LAST_CMP2); 3057 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 3058 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH); 3059 __ j(DONE); 3060 3061 __ align(OptoLoopAlignment); 3062 __ bind(L_HAS_ZERO); 3063 // count bits of trailing zero chars 3064 __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2); 3065 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 3066 __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2); 3067 __ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits) 3068 __ subi(result, result, 1); // array index from 0, so result -= 1 3069 3070 __ bind(L_HAS_ZERO_LOOP); 3071 __ mv(needle_len, wordSize / haystack_chr_size); 3072 __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2); 3073 __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2); 3074 // load next 8 bytes from haystack, and increase result index 3075 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 3076 __ addi(result, result, 1); 3077 __ mv(trailing_zeros, wordSize / haystack_chr_size); 3078 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH); 3079 3080 // compare one char 3081 __ bind(L_CMP_LOOP); 3082 __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift); 3083 needle_isL ? 
__ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len)); 3084 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift); 3085 haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2)); 3086 __ addi(trailing_zeros, trailing_zeros, 1); // next char index 3087 __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2); 3088 __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP); 3089 __ beq(needle_len, ch2, L_CMP_LOOP); 3090 3091 __ bind(L_CMP_LOOP_NOMATCH); 3092 __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH); 3093 // count bits of trailing zero chars 3094 __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, needle_len, ch2); 3095 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 3096 __ addi(haystack, haystack, haystack_chr_size); 3097 __ j(L_HAS_ZERO_LOOP); 3098 3099 __ align(OptoLoopAlignment); 3100 __ bind(L_CMP_LOOP_LAST_CMP); 3101 __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH); 3102 __ j(DONE); 3103 3104 __ align(OptoLoopAlignment); 3105 __ bind(L_CMP_LOOP_LAST_CMP2); 3106 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 3107 __ addi(result, result, 1); 3108 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH); 3109 __ j(DONE); 3110 3111 __ align(OptoLoopAlignment); 3112 __ bind(L_HAS_ZERO_LOOP_NOMATCH); 3113 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 3114 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 3115 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 3116 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 3117 // result by analyzed characters value, so, we can just reset lower bits 3118 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 3119 // 2) restore needle_len and haystack_len values from "compressed" haystack_len 3120 // 3) advance haystack value to represent next haystack octet. result & 7/3 is 3121 // index of last analyzed substring inside current octet. So, haystack in at 3122 // respective start address. We need to advance it to next octet 3123 __ andi(match_mask, result, wordSize / haystack_chr_size - 1); 3124 __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2); 3125 __ andi(result, result, haystack_isL ? 
-8 : -4); 3126 __ slli(tmp, match_mask, haystack_chr_shift); 3127 __ sub(haystack, haystack, tmp); 3128 __ sext(haystack_len, haystack_len, 32); 3129 __ j(L_LOOP_PROCEED); 3130 3131 __ align(OptoLoopAlignment); 3132 __ bind(NOMATCH); 3133 __ mv(result, -1); 3134 3135 __ bind(DONE); 3136 __ pop_reg(spilled_regs, sp); 3137 __ ret(); 3138 return entry; 3139 } 3140 3141 void generate_string_indexof_stubs() 3142 { 3143 StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(StubGenStubId::string_indexof_linear_ll_id); 3144 StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(StubGenStubId::string_indexof_linear_uu_id); 3145 StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(StubGenStubId::string_indexof_linear_ul_id); 3146 } 3147 3148 #ifdef COMPILER2 3149 void generate_lookup_secondary_supers_table_stub() { 3150 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 3151 StubCodeMark mark(this, stub_id); 3152 3153 const Register 3154 r_super_klass = x10, 3155 r_array_base = x11, 3156 r_array_length = x12, 3157 r_array_index = x13, 3158 r_sub_klass = x14, 3159 result = x15, 3160 r_bitmap = x16; 3161 3162 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 3163 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 3164 Label L_success; 3165 __ enter(); 3166 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, result, 3167 r_array_base, r_array_length, r_array_index, 3168 r_bitmap, slot, /*stub_is_near*/true); 3169 __ leave(); 3170 __ ret(); 3171 } 3172 } 3173 3174 // Slow path implementation for UseSecondarySupersTable. 3175 address generate_lookup_secondary_supers_table_slow_path_stub() { 3176 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 3177 StubCodeMark mark(this, stub_id); 3178 3179 address start = __ pc(); 3180 const Register 3181 r_super_klass = x10, // argument 3182 r_array_base = x11, // argument 3183 temp1 = x12, // tmp 3184 r_array_index = x13, // argument 3185 result = x15, // argument 3186 r_bitmap = x16; // argument 3187 3188 3189 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1); 3190 __ ret(); 3191 3192 return start; 3193 } 3194 3195 address generate_mulAdd() 3196 { 3197 __ align(CodeEntryAlignment); 3198 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 3199 StubCodeMark mark(this, stub_id); 3200 3201 address entry = __ pc(); 3202 3203 const Register out = x10; 3204 const Register in = x11; 3205 const Register offset = x12; 3206 const Register len = x13; 3207 const Register k = x14; 3208 const Register tmp = x28; 3209 3210 BLOCK_COMMENT("Entry:"); 3211 __ enter(); 3212 __ mul_add(out, in, offset, len, k, tmp); 3213 __ leave(); 3214 __ ret(); 3215 3216 return entry; 3217 } 3218 3219 /** 3220 * Arguments: 3221 * 3222 * Input: 3223 * c_rarg0 - x address 3224 * c_rarg1 - x length 3225 * c_rarg2 - y address 3226 * c_rarg3 - y length 3227 * c_rarg4 - z address 3228 */ 3229 address generate_multiplyToLen() 3230 { 3231 __ align(CodeEntryAlignment); 3232 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 3233 StubCodeMark mark(this, stub_id); 3234 address entry = __ pc(); 3235 3236 const Register x = x10; 3237 const Register xlen = x11; 3238 const Register y = x12; 3239 const Register ylen = x13; 3240 const Register z = x14; 3241 3242 const Register tmp0 = x15; 3243 const Register tmp1 = x16; 3244 const Register tmp2 = x17; 3245 const 
Register tmp3 = x7; 3246 const Register tmp4 = x28; 3247 const Register tmp5 = x29; 3248 const Register tmp6 = x30; 3249 const Register tmp7 = x31; 3250 3251 BLOCK_COMMENT("Entry:"); 3252 __ enter(); // required for proper stackwalking of RuntimeStub frame 3253 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3254 __ leave(); // required for proper stackwalking of RuntimeStub frame 3255 __ ret(); 3256 3257 return entry; 3258 } 3259 3260 address generate_squareToLen() 3261 { 3262 __ align(CodeEntryAlignment); 3263 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 3264 StubCodeMark mark(this, stub_id); 3265 address entry = __ pc(); 3266 3267 const Register x = x10; 3268 const Register xlen = x11; 3269 const Register z = x12; 3270 const Register y = x14; // == x 3271 const Register ylen = x15; // == xlen 3272 3273 const Register tmp0 = x13; // zlen, unused 3274 const Register tmp1 = x16; 3275 const Register tmp2 = x17; 3276 const Register tmp3 = x7; 3277 const Register tmp4 = x28; 3278 const Register tmp5 = x29; 3279 const Register tmp6 = x30; 3280 const Register tmp7 = x31; 3281 3282 BLOCK_COMMENT("Entry:"); 3283 __ enter(); 3284 __ mv(y, x); 3285 __ mv(ylen, xlen); 3286 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3287 __ leave(); 3288 __ ret(); 3289 3290 return entry; 3291 } 3292 3293 // Arguments: 3294 // 3295 // Input: 3296 // c_rarg0 - newArr address 3297 // c_rarg1 - oldArr address 3298 // c_rarg2 - newIdx 3299 // c_rarg3 - shiftCount 3300 // c_rarg4 - numIter 3301 // 3302 address generate_bigIntegerLeftShift() { 3303 __ align(CodeEntryAlignment); 3304 StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id; 3305 StubCodeMark mark(this, stub_id); 3306 address entry = __ pc(); 3307 3308 Label loop, exit; 3309 3310 Register newArr = c_rarg0; 3311 Register oldArr = c_rarg1; 3312 Register newIdx = c_rarg2; 3313 Register shiftCount = c_rarg3; 3314 Register numIter = c_rarg4; 3315 3316 Register shiftRevCount = c_rarg5; 3317 Register oldArrNext = t1; 3318 3319 __ beqz(numIter, exit); 3320 __ shadd(newArr, newIdx, newArr, t0, 2); 3321 3322 __ mv(shiftRevCount, 32); 3323 __ sub(shiftRevCount, shiftRevCount, shiftCount); 3324 3325 __ bind(loop); 3326 __ addi(oldArrNext, oldArr, 4); 3327 __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4); 3328 __ vle32_v(v0, oldArr); 3329 __ vle32_v(v4, oldArrNext); 3330 __ vsll_vx(v0, v0, shiftCount); 3331 __ vsrl_vx(v4, v4, shiftRevCount); 3332 __ vor_vv(v0, v0, v4); 3333 __ vse32_v(v0, newArr); 3334 __ sub(numIter, numIter, t0); 3335 __ shadd(oldArr, t0, oldArr, t1, 2); 3336 __ shadd(newArr, t0, newArr, t1, 2); 3337 __ bnez(numIter, loop); 3338 3339 __ bind(exit); 3340 __ ret(); 3341 3342 return entry; 3343 } 3344 3345 // Arguments: 3346 // 3347 // Input: 3348 // c_rarg0 - newArr address 3349 // c_rarg1 - oldArr address 3350 // c_rarg2 - newIdx 3351 // c_rarg3 - shiftCount 3352 // c_rarg4 - numIter 3353 // 3354 address generate_bigIntegerRightShift() { 3355 __ align(CodeEntryAlignment); 3356 StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id; 3357 StubCodeMark mark(this, stub_id); 3358 address entry = __ pc(); 3359 3360 Label loop, exit; 3361 3362 Register newArr = c_rarg0; 3363 Register oldArr = c_rarg1; 3364 Register newIdx = c_rarg2; 3365 Register shiftCount = c_rarg3; 3366 Register numIter = c_rarg4; 3367 Register idx = numIter; 3368 3369 Register shiftRevCount = c_rarg5; 3370 Register oldArrNext = c_rarg6; 3371 Register newArrCur = 
t0; 3372 Register oldArrCur = t1; 3373 3374 __ beqz(idx, exit); 3375 __ shadd(newArr, newIdx, newArr, t0, 2); 3376 3377 __ mv(shiftRevCount, 32); 3378 __ sub(shiftRevCount, shiftRevCount, shiftCount); 3379 3380 __ bind(loop); 3381 __ vsetvli(t0, idx, Assembler::e32, Assembler::m4); 3382 __ sub(idx, idx, t0); 3383 __ shadd(oldArrNext, idx, oldArr, t1, 2); 3384 __ shadd(newArrCur, idx, newArr, t1, 2); 3385 __ addi(oldArrCur, oldArrNext, 4); 3386 __ vle32_v(v0, oldArrCur); 3387 __ vle32_v(v4, oldArrNext); 3388 __ vsrl_vx(v0, v0, shiftCount); 3389 __ vsll_vx(v4, v4, shiftRevCount); 3390 __ vor_vv(v0, v0, v4); 3391 __ vse32_v(v0, newArrCur); 3392 __ bnez(idx, loop); 3393 3394 __ bind(exit); 3395 __ ret(); 3396 3397 return entry; 3398 } 3399 #endif 3400 3401 #ifdef COMPILER2 3402 class MontgomeryMultiplyGenerator : public MacroAssembler { 3403 3404 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 3405 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj; 3406 3407 RegSet _toSave; 3408 bool _squaring; 3409 3410 public: 3411 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 3412 : MacroAssembler(as->code()), _squaring(squaring) { 3413 3414 // Register allocation 3415 3416 RegSetIterator<Register> regs = RegSet::range(x10, x26).begin(); 3417 Pa_base = *regs; // Argument registers 3418 if (squaring) { 3419 Pb_base = Pa_base; 3420 } else { 3421 Pb_base = *++regs; 3422 } 3423 Pn_base = *++regs; 3424 Rlen= *++regs; 3425 inv = *++regs; 3426 Pm_base = *++regs; 3427 3428 // Working registers: 3429 Ra = *++regs; // The current digit of a, b, n, and m. 3430 Rb = *++regs; 3431 Rm = *++regs; 3432 Rn = *++regs; 3433 3434 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 3435 Pb = *++regs; 3436 Pm = *++regs; 3437 Pn = *++regs; 3438 3439 tmp0 = *++regs; // Three registers which form a 3440 tmp1 = *++regs; // triple-precision accumuator. 3441 tmp2 = *++regs; 3442 3443 Ri = x6; // Inner and outer loop indexes. 3444 Rj = x7; 3445 3446 Rhi_ab = x28; // Product registers: low and high parts 3447 Rlo_ab = x29; // of a*b and m*n. 3448 Rhi_mn = x30; 3449 Rlo_mn = x31; 3450 3451 // x18 and up are callee-saved. 
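    // For the non-squaring case the iterator above hands out x10..x26 in order, so the
    // set saved below is x18..x26 plus Pm_base (x15); when squaring, Pb_base aliases
    // Pa_base and every later register shifts down by one.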
3452 _toSave = RegSet::range(x18, *regs) + Pm_base; 3453 } 3454 3455 private: 3456 void save_regs() { 3457 push_reg(_toSave, sp); 3458 } 3459 3460 void restore_regs() { 3461 pop_reg(_toSave, sp); 3462 } 3463 3464 template <typename T> 3465 void unroll_2(Register count, T block) { 3466 Label loop, end, odd; 3467 beqz(count, end); 3468 test_bit(t0, count, 0); 3469 bnez(t0, odd); 3470 align(16); 3471 bind(loop); 3472 (this->*block)(); 3473 bind(odd); 3474 (this->*block)(); 3475 subi(count, count, 2); 3476 bgtz(count, loop); 3477 bind(end); 3478 } 3479 3480 template <typename T> 3481 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 3482 Label loop, end, odd; 3483 beqz(count, end); 3484 test_bit(tmp, count, 0); 3485 bnez(tmp, odd); 3486 align(16); 3487 bind(loop); 3488 (this->*block)(d, s, tmp); 3489 bind(odd); 3490 (this->*block)(d, s, tmp); 3491 subi(count, count, 2); 3492 bgtz(count, loop); 3493 bind(end); 3494 } 3495 3496 void pre1(RegisterOrConstant i) { 3497 block_comment("pre1"); 3498 // Pa = Pa_base; 3499 // Pb = Pb_base + i; 3500 // Pm = Pm_base; 3501 // Pn = Pn_base + i; 3502 // Ra = *Pa; 3503 // Rb = *Pb; 3504 // Rm = *Pm; 3505 // Rn = *Pn; 3506 if (i.is_register()) { 3507 slli(t0, i.as_register(), LogBytesPerWord); 3508 } else { 3509 mv(t0, i.as_constant()); 3510 slli(t0, t0, LogBytesPerWord); 3511 } 3512 3513 mv(Pa, Pa_base); 3514 add(Pb, Pb_base, t0); 3515 mv(Pm, Pm_base); 3516 add(Pn, Pn_base, t0); 3517 3518 ld(Ra, Address(Pa)); 3519 ld(Rb, Address(Pb)); 3520 ld(Rm, Address(Pm)); 3521 ld(Rn, Address(Pn)); 3522 3523 // Zero the m*n result. 3524 mv(Rhi_mn, zr); 3525 mv(Rlo_mn, zr); 3526 } 3527 3528 // The core multiply-accumulate step of a Montgomery 3529 // multiplication. The idea is to schedule operations as a 3530 // pipeline so that instructions with long latencies (loads and 3531 // multiplies) have time to complete before their results are 3532 // used. This most benefits in-order implementations of the 3533 // architecture but out-of-order ones also benefit. 3534 void step() { 3535 block_comment("step"); 3536 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3537 // Ra = *++Pa; 3538 // Rb = *--Pb; 3539 mulhu(Rhi_ab, Ra, Rb); 3540 mul(Rlo_ab, Ra, Rb); 3541 addi(Pa, Pa, wordSize); 3542 ld(Ra, Address(Pa)); 3543 subi(Pb, Pb, wordSize); 3544 ld(Rb, Address(Pb)); 3545 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the 3546 // previous iteration. 
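      // (Editor's note, illustrative only; not part of the generated stub.)
      // In the pseudo-code comments, MACC(x, y, tmp0, tmp1, tmp2) adds the
      // 128-bit product x*y into the triple-precision accumulator
      // (tmp0, tmp1, tmp2).  A portable C sketch of one such accumulate,
      // which acc() below performs with explicit carries in t0, is:
      //
      //   unsigned __int128 p = (unsigned __int128)x * y;
      //   unsigned __int128 s = (unsigned __int128)tmp0 + (uint64_t)p;
      //   tmp0  = (uint64_t)s;
      //   s     = (unsigned __int128)tmp1 + (uint64_t)(p >> 64) + (uint64_t)(s >> 64);
      //   tmp1  = (uint64_t)s;
      //   tmp2 += (uint64_t)(s >> 64);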
3547 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3548 // Rm = *++Pm; 3549 // Rn = *--Pn; 3550 mulhu(Rhi_mn, Rm, Rn); 3551 mul(Rlo_mn, Rm, Rn); 3552 addi(Pm, Pm, wordSize); 3553 ld(Rm, Address(Pm)); 3554 subi(Pn, Pn, wordSize); 3555 ld(Rn, Address(Pn)); 3556 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3557 } 3558 3559 void post1() { 3560 block_comment("post1"); 3561 3562 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3563 // Ra = *++Pa; 3564 // Rb = *--Pb; 3565 mulhu(Rhi_ab, Ra, Rb); 3566 mul(Rlo_ab, Ra, Rb); 3567 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3568 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3569 3570 // *Pm = Rm = tmp0 * inv; 3571 mul(Rm, tmp0, inv); 3572 sd(Rm, Address(Pm)); 3573 3574 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3575 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3576 mulhu(Rhi_mn, Rm, Rn); 3577 3578 #ifndef PRODUCT 3579 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply"); 3580 { 3581 mul(Rlo_mn, Rm, Rn); 3582 add(Rlo_mn, tmp0, Rlo_mn); 3583 Label ok; 3584 beqz(Rlo_mn, ok); 3585 stop("broken Montgomery multiply"); 3586 bind(ok); 3587 } 3588 #endif 3589 // We have very carefully set things up so that 3590 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate 3591 // the lower half of Rm * Rn because we know the result already: 3592 // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff 3593 // tmp0 != 0. So, rather than do a mul and an cad we just set 3594 // the carry flag iff tmp0 is nonzero. 3595 // 3596 // mul(Rlo_mn, Rm, Rn); 3597 // cad(zr, tmp0, Rlo_mn); 3598 subi(t0, tmp0, 1); 3599 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero 3600 cadc(tmp0, tmp1, Rhi_mn, t0); 3601 adc(tmp1, tmp2, zr, t0); 3602 mv(tmp2, zr); 3603 } 3604 3605 void pre2(Register i, Register len) { 3606 block_comment("pre2"); 3607 // Pa = Pa_base + i-len; 3608 // Pb = Pb_base + len; 3609 // Pm = Pm_base + i-len; 3610 // Pn = Pn_base + len; 3611 3612 sub(Rj, i, len); 3613 // Rj == i-len 3614 3615 // Ra as temp register 3616 slli(Ra, Rj, LogBytesPerWord); 3617 add(Pa, Pa_base, Ra); 3618 add(Pm, Pm_base, Ra); 3619 slli(Ra, len, LogBytesPerWord); 3620 add(Pb, Pb_base, Ra); 3621 add(Pn, Pn_base, Ra); 3622 3623 // Ra = *++Pa; 3624 // Rb = *--Pb; 3625 // Rm = *++Pm; 3626 // Rn = *--Pn; 3627 addi(Pa, Pa, wordSize); 3628 ld(Ra, Address(Pa)); 3629 subi(Pb, Pb, wordSize); 3630 ld(Rb, Address(Pb)); 3631 addi(Pm, Pm, wordSize); 3632 ld(Rm, Address(Pm)); 3633 subi(Pn, Pn, wordSize); 3634 ld(Rn, Address(Pn)); 3635 3636 mv(Rhi_mn, zr); 3637 mv(Rlo_mn, zr); 3638 } 3639 3640 void post2(Register i, Register len) { 3641 block_comment("post2"); 3642 sub(Rj, i, len); 3643 3644 cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part 3645 3646 // As soon as we know the least significant digit of our result, 3647 // store it. 3648 // Pm_base[i-len] = tmp0; 3649 // Rj as temp register 3650 slli(Rj, Rj, LogBytesPerWord); 3651 add(Rj, Pm_base, Rj); 3652 sd(tmp0, Address(Rj)); 3653 3654 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3655 cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part 3656 adc(tmp1, tmp2, zr, t0); 3657 mv(tmp2, zr); 3658 } 3659 3660 // A carry in tmp0 after Montgomery multiplication means that we 3661 // should subtract multiples of n from our result in m. We'll 3662 // keep doing that until there is no carry. 
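  // (Editor's sketch, illustrative only, assuming 64-bit digits.)  In
  // portable C the loop generated below corresponds roughly to:
  //
  //   while (carry) {                      // carry lives in tmp0
  //     uint64_t borrow = 1;               // 1 == "no borrow", as in the asm
  //     for (int i = 0; i < len; i++) {    // m -= n, via ~n[i] + carry-in
  //       unsigned __int128 d = (unsigned __int128)m[i] + ~n[i] + borrow;
  //       m[i]   = (uint64_t)d;
  //       borrow = (uint64_t)(d >> 64);
  //     }
  //     carry = carry - 1 + borrow;        // propagate the borrow of m -= n
  //   }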
3663 void normalize(Register len) { 3664 block_comment("normalize"); 3665 // while (tmp0) 3666 // tmp0 = sub(Pm_base, Pn_base, tmp0, len); 3667 Label loop, post, again; 3668 Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now 3669 beqz(tmp0, post); { 3670 bind(again); { 3671 mv(i, zr); 3672 mv(cnt, len); 3673 slli(Rn, i, LogBytesPerWord); 3674 add(Rm, Pm_base, Rn); 3675 ld(Rm, Address(Rm)); 3676 add(Rn, Pn_base, Rn); 3677 ld(Rn, Address(Rn)); 3678 mv(t0, 1); // set carry flag, i.e. no borrow 3679 align(16); 3680 bind(loop); { 3681 notr(Rn, Rn); 3682 add(Rm, Rm, t0); 3683 add(Rm, Rm, Rn); 3684 sltu(t0, Rm, Rn); 3685 slli(Rn, i, LogBytesPerWord); // Rn as temp register 3686 add(Rn, Pm_base, Rn); 3687 sd(Rm, Address(Rn)); 3688 addi(i, i, 1); 3689 slli(Rn, i, LogBytesPerWord); 3690 add(Rm, Pm_base, Rn); 3691 ld(Rm, Address(Rm)); 3692 add(Rn, Pn_base, Rn); 3693 ld(Rn, Address(Rn)); 3694 subi(cnt, cnt, 1); 3695 } bnez(cnt, loop); 3696 subi(tmp0, tmp0, 1); 3697 add(tmp0, tmp0, t0); 3698 } bnez(tmp0, again); 3699 } bind(post); 3700 } 3701 3702 // Move memory at s to d, reversing words. 3703 // Increments d to end of copied memory 3704 // Destroys tmp1, tmp2 3705 // Preserves len 3706 // Leaves s pointing to the address which was in d at start 3707 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 3708 assert(tmp1->encoding() < x28->encoding(), "register corruption"); 3709 assert(tmp2->encoding() < x28->encoding(), "register corruption"); 3710 3711 shadd(s, len, s, tmp1, LogBytesPerWord); 3712 mv(tmp1, len); 3713 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 3714 slli(tmp1, len, LogBytesPerWord); 3715 sub(s, d, tmp1); 3716 } 3717 // [63...0] -> [31...0][63...32] 3718 void reverse1(Register d, Register s, Register tmp) { 3719 subi(s, s, wordSize); 3720 ld(tmp, Address(s)); 3721 ror(tmp, tmp, 32, t0); 3722 sd(tmp, Address(d)); 3723 addi(d, d, wordSize); 3724 } 3725 3726 void step_squaring() { 3727 // An extra ACC 3728 step(); 3729 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3730 } 3731 3732 void last_squaring(Register i) { 3733 Label dont; 3734 // if ((i & 1) == 0) { 3735 test_bit(t0, i, 0); 3736 bnez(t0, dont); { 3737 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3738 // Ra = *++Pa; 3739 // Rb = *--Pb; 3740 mulhu(Rhi_ab, Ra, Rb); 3741 mul(Rlo_ab, Ra, Rb); 3742 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3743 } bind(dont); 3744 } 3745 3746 void extra_step_squaring() { 3747 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3748 3749 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3750 // Rm = *++Pm; 3751 // Rn = *--Pn; 3752 mulhu(Rhi_mn, Rm, Rn); 3753 mul(Rlo_mn, Rm, Rn); 3754 addi(Pm, Pm, wordSize); 3755 ld(Rm, Address(Pm)); 3756 subi(Pn, Pn, wordSize); 3757 ld(Rn, Address(Pn)); 3758 } 3759 3760 void post1_squaring() { 3761 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3762 3763 // *Pm = Rm = tmp0 * inv; 3764 mul(Rm, tmp0, inv); 3765 sd(Rm, Address(Pm)); 3766 3767 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3768 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3769 mulhu(Rhi_mn, Rm, Rn); 3770 3771 #ifndef PRODUCT 3772 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply"); 3773 { 3774 mul(Rlo_mn, Rm, Rn); 3775 add(Rlo_mn, tmp0, Rlo_mn); 3776 Label ok; 3777 beqz(Rlo_mn, ok); { 3778 stop("broken Montgomery multiply"); 3779 } bind(ok); 3780 } 3781 #endif 3782 // We have very carefully set things up so that 3783 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate 3784 // the lower half of Rm * Rn because we know the result already: 3785 
// it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff 3786 // tmp0 != 0. So, rather than do a mul and a cad we just set 3787 // the carry flag iff tmp0 is nonzero. 3788 // 3789 // mul(Rlo_mn, Rm, Rn); 3790 // cad(zr, tmp, Rlo_mn); 3791 subi(t0, tmp0, 1); 3792 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero 3793 cadc(tmp0, tmp1, Rhi_mn, t0); 3794 adc(tmp1, tmp2, zr, t0); 3795 mv(tmp2, zr); 3796 } 3797 3798 // use t0 as carry 3799 void acc(Register Rhi, Register Rlo, 3800 Register tmp0, Register tmp1, Register tmp2) { 3801 cad(tmp0, tmp0, Rlo, t0); 3802 cadc(tmp1, tmp1, Rhi, t0); 3803 adc(tmp2, tmp2, zr, t0); 3804 } 3805 3806 public: 3807 /** 3808 * Fast Montgomery multiplication. The derivation of the 3809 * algorithm is in A Cryptographic Library for the Motorola 3810 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3811 * 3812 * Arguments: 3813 * 3814 * Inputs for multiplication: 3815 * c_rarg0 - int array elements a 3816 * c_rarg1 - int array elements b 3817 * c_rarg2 - int array elements n (the modulus) 3818 * c_rarg3 - int length 3819 * c_rarg4 - int inv 3820 * c_rarg5 - int array elements m (the result) 3821 * 3822 * Inputs for squaring: 3823 * c_rarg0 - int array elements a 3824 * c_rarg1 - int array elements n (the modulus) 3825 * c_rarg2 - int length 3826 * c_rarg3 - int inv 3827 * c_rarg4 - int array elements m (the result) 3828 * 3829 */ 3830 address generate_multiply() { 3831 Label argh, nothing; 3832 bind(argh); 3833 stop("MontgomeryMultiply total_allocation must be <= 8192"); 3834 3835 align(CodeEntryAlignment); 3836 address entry = pc(); 3837 3838 beqz(Rlen, nothing); 3839 3840 enter(); 3841 3842 // Make room. 3843 mv(Ra, 512); 3844 bgt(Rlen, Ra, argh); 3845 slli(Ra, Rlen, exact_log2(4 * sizeof(jint))); 3846 sub(Ra, sp, Ra); 3847 andi(sp, Ra, -2 * wordSize); 3848 3849 srliw(Rlen, Rlen, 1); // length in longwords = len/2 3850 3851 { 3852 // Copy input args, reversing as we go. We use Ra as a 3853 // temporary variable. 3854 reverse(Ra, Pa_base, Rlen, Ri, Rj); 3855 if (!_squaring) 3856 reverse(Ra, Pb_base, Rlen, Ri, Rj); 3857 reverse(Ra, Pn_base, Rlen, Ri, Rj); 3858 } 3859 3860 // Push all call-saved registers and also Pm_base which we'll need 3861 // at the end. 
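      // (Editor's orientation sketch -- illustrative C only, not the
      // authoritative reference.)  From here on, the generated code computes
      // roughly the following, with MACC as described in the comments above:
      //
      //   julong t0 = 0, t1 = 0, t2 = 0;             // triple-precision accumulator
      //   for (int i = 0; i < len; i++) {
      //     for (int j = 0; j < i; j++) {
      //       MACC(a[j], b[i-j], t0, t1, t2);
      //       MACC(m[j], n[i-j], t0, t1, t2);
      //     }
      //     MACC(a[i], b[0], t0, t1, t2);
      //     m[i] = t0 * inv;
      //     MACC(m[i], n[0], t0, t1, t2);             // forces t0 to 0 (mod 2^64)
      //     t0 = t1; t1 = t2; t2 = 0;
      //   }
      //   for (int i = len; i < 2*len; i++) {
      //     for (int j = i-len+1; j < len; j++) {
      //       MACC(a[j], b[i-j], t0, t1, t2);
      //       MACC(m[j], n[i-j], t0, t1, t2);
      //     }
      //     m[i-len] = t0;
      //     t0 = t1; t1 = t2; t2 = 0;
      //   }
      //   while (t0)                                  // see normalize()
      //     t0 = sub(m, n, t0, len);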
3862 save_regs(); 3863 3864 #ifndef PRODUCT 3865 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 3866 { 3867 ld(Rn, Address(Pn_base)); 3868 mul(Rlo_mn, Rn, inv); 3869 mv(t0, -1); 3870 Label ok; 3871 beq(Rlo_mn, t0, ok); 3872 stop("broken inverse in Montgomery multiply"); 3873 bind(ok); 3874 } 3875 #endif 3876 3877 mv(Pm_base, Ra); 3878 3879 mv(tmp0, zr); 3880 mv(tmp1, zr); 3881 mv(tmp2, zr); 3882 3883 block_comment("for (int i = 0; i < len; i++) {"); 3884 mv(Ri, zr); { 3885 Label loop, end; 3886 bge(Ri, Rlen, end); 3887 3888 bind(loop); 3889 pre1(Ri); 3890 3891 block_comment(" for (j = i; j; j--) {"); { 3892 mv(Rj, Ri); 3893 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3894 } block_comment(" } // j"); 3895 3896 post1(); 3897 addiw(Ri, Ri, 1); 3898 blt(Ri, Rlen, loop); 3899 bind(end); 3900 block_comment("} // i"); 3901 } 3902 3903 block_comment("for (int i = len; i < 2*len; i++) {"); 3904 mv(Ri, Rlen); { 3905 Label loop, end; 3906 slli(t0, Rlen, 1); 3907 bge(Ri, t0, end); 3908 3909 bind(loop); 3910 pre2(Ri, Rlen); 3911 3912 block_comment(" for (j = len*2-i-1; j; j--) {"); { 3913 slliw(Rj, Rlen, 1); 3914 subw(Rj, Rj, Ri); 3915 subiw(Rj, Rj, 1); 3916 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3917 } block_comment(" } // j"); 3918 3919 post2(Ri, Rlen); 3920 addiw(Ri, Ri, 1); 3921 slli(t0, Rlen, 1); 3922 blt(Ri, t0, loop); 3923 bind(end); 3924 } 3925 block_comment("} // i"); 3926 3927 normalize(Rlen); 3928 3929 mv(Ra, Pm_base); // Save Pm_base in Ra 3930 restore_regs(); // Restore caller's Pm_base 3931 3932 // Copy our result into caller's Pm_base 3933 reverse(Pm_base, Ra, Rlen, Ri, Rj); 3934 3935 leave(); 3936 bind(nothing); 3937 ret(); 3938 3939 return entry; 3940 } 3941 3942 /** 3943 * 3944 * Arguments: 3945 * 3946 * Inputs: 3947 * c_rarg0 - int array elements a 3948 * c_rarg1 - int array elements n (the modulus) 3949 * c_rarg2 - int length 3950 * c_rarg3 - int inv 3951 * c_rarg4 - int array elements m (the result) 3952 * 3953 */ 3954 address generate_square() { 3955 Label argh; 3956 bind(argh); 3957 stop("MontgomeryMultiply total_allocation must be <= 8192"); 3958 3959 align(CodeEntryAlignment); 3960 address entry = pc(); 3961 3962 enter(); 3963 3964 // Make room. 3965 mv(Ra, 512); 3966 bgt(Rlen, Ra, argh); 3967 slli(Ra, Rlen, exact_log2(4 * sizeof(jint))); 3968 sub(Ra, sp, Ra); 3969 andi(sp, Ra, -2 * wordSize); 3970 3971 srliw(Rlen, Rlen, 1); // length in longwords = len/2 3972 3973 { 3974 // Copy input args, reversing as we go. We use Ra as a 3975 // temporary variable. 3976 reverse(Ra, Pa_base, Rlen, Ri, Rj); 3977 reverse(Ra, Pn_base, Rlen, Ri, Rj); 3978 } 3979 3980 // Push all call-saved registers and also Pm_base which we'll need 3981 // at the end. 
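      // (Editor's note, illustrative only.)  Squaring roughly halves the
      // number of multiplies by exploiting a[j]*a[i-j] == a[i-j]*a[j].
      // In the first pass each outer iteration does, in effect:
      //
      //   for (j = (i+1)/2; j; j--)  MACC2(a[j], a[i-j], t0, t1, t2);  // each off-diagonal product counted twice (step_squaring)
      //   if ((i & 1) == 0)          MACC(a[i/2], a[i/2], t0, t1, t2); // the diagonal term (last_squaring)
      //   for (j = i/2; j; j--)      MACC(m[j], n[i-j], ...);          // catch up on the m*n products (extra_step_squaring)
      //
      // and the second pass is split the same way.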
3982 save_regs(); 3983 3984 mv(Pm_base, Ra); 3985 3986 mv(tmp0, zr); 3987 mv(tmp1, zr); 3988 mv(tmp2, zr); 3989 3990 block_comment("for (int i = 0; i < len; i++) {"); 3991 mv(Ri, zr); { 3992 Label loop, end; 3993 bind(loop); 3994 bge(Ri, Rlen, end); 3995 3996 pre1(Ri); 3997 3998 block_comment("for (j = (i+1)/2; j; j--) {"); { 3999 addi(Rj, Ri, 1); 4000 srliw(Rj, Rj, 1); 4001 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4002 } block_comment(" } // j"); 4003 4004 last_squaring(Ri); 4005 4006 block_comment(" for (j = i/2; j; j--) {"); { 4007 srliw(Rj, Ri, 1); 4008 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4009 } block_comment(" } // j"); 4010 4011 post1_squaring(); 4012 addi(Ri, Ri, 1); 4013 blt(Ri, Rlen, loop); 4014 4015 bind(end); 4016 block_comment("} // i"); 4017 } 4018 4019 block_comment("for (int i = len; i < 2*len; i++) {"); 4020 mv(Ri, Rlen); { 4021 Label loop, end; 4022 bind(loop); 4023 slli(t0, Rlen, 1); 4024 bge(Ri, t0, end); 4025 4026 pre2(Ri, Rlen); 4027 4028 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4029 slli(Rj, Rlen, 1); 4030 sub(Rj, Rj, Ri); 4031 subi(Rj, Rj, 1); 4032 srliw(Rj, Rj, 1); 4033 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4034 } block_comment(" } // j"); 4035 4036 last_squaring(Ri); 4037 4038 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4039 slli(Rj, Rlen, 1); 4040 sub(Rj, Rj, Ri); 4041 srliw(Rj, Rj, 1); 4042 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4043 } block_comment(" } // j"); 4044 4045 post2(Ri, Rlen); 4046 addi(Ri, Ri, 1); 4047 slli(t0, Rlen, 1); 4048 blt(Ri, t0, loop); 4049 4050 bind(end); 4051 block_comment("} // i"); 4052 } 4053 4054 normalize(Rlen); 4055 4056 mv(Ra, Pm_base); // Save Pm_base in Ra 4057 restore_regs(); // Restore caller's Pm_base 4058 4059 // Copy our result into caller's Pm_base 4060 reverse(Pm_base, Ra, Rlen, Ri, Rj); 4061 4062 leave(); 4063 ret(); 4064 4065 return entry; 4066 } 4067 }; 4068 4069 #endif // COMPILER2 4070 4071 address generate_cont_thaw(Continuation::thaw_kind kind) { 4072 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 4073 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 4074 4075 address start = __ pc(); 4076 4077 if (return_barrier) { 4078 __ ld(sp, Address(xthread, JavaThread::cont_entry_offset())); 4079 } 4080 4081 #ifndef PRODUCT 4082 { 4083 Label OK; 4084 __ ld(t0, Address(xthread, JavaThread::cont_entry_offset())); 4085 __ beq(sp, t0, OK); 4086 __ stop("incorrect sp"); 4087 __ bind(OK); 4088 } 4089 #endif 4090 4091 if (return_barrier) { 4092 // preserve possible return value from a method returning to the return barrier 4093 __ subi(sp, sp, 2 * wordSize); 4094 __ fsd(f10, Address(sp, 0 * wordSize)); 4095 __ sd(x10, Address(sp, 1 * wordSize)); 4096 } 4097 4098 __ mv(c_rarg1, (return_barrier ? 
1 : 0));
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
    __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ld(x10, Address(sp, 1 * wordSize));
      __ fld(f10, Address(sp, 0 * wordSize));
      __ addi(sp, sp, 2 * wordSize);
    }

#ifndef PRODUCT
    {
      Label OK;
      __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
      __ beq(sp, t0, OK);
      __ stop("incorrect sp");
      __ bind(OK);
    }
#endif

    Label thaw_success;
    // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
    __ bnez(t1, thaw_success);
    __ j(RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
    __ bind(thaw_success);

    // make room for the thawed frames
    __ sub(t0, sp, t1);
    __ andi(sp, t0, -16); // align

    if (return_barrier) {
      // save original return value -- again
      __ subi(sp, sp, 2 * wordSize);
      __ fsd(f10, Address(sp, 0 * wordSize));
      __ sd(x10, Address(sp, 1 * wordSize));
    }

    // If we want, we can templatize thaw by kind, and have three different entries
    __ mv(c_rarg1, kind);

    __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
    __ mv(t1, x10); // x10 is the sp of the yielding frame

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ld(x10, Address(sp, 1 * wordSize));
      __ fld(f10, Address(sp, 0 * wordSize));
      __ addi(sp, sp, 2 * wordSize);
    } else {
      __ mv(x10, zr); // return 0 (success) from doYield
    }

    // we're now on the yield frame (which is in an address above us b/c sp has been pushed down)
    __ mv(fp, t1);
    __ subi(sp, t1, 2 * wordSize); // now pointing to fp spill

    if (return_barrier_exception) {
      __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
      __ verify_oop(x10);
      __ mv(x9, x10); // save return value containing the exception oop in callee-saved x9

      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);

      // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc

      __ mv(x11, x10); // the exception handler
      __ mv(x10, x9);  // restore return value containing the exception oop
      __ verify_oop(x10);

      __ leave();
      __ mv(x13, ra);
      __ jr(x11); // the exception handler
    } else {
      // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
      __ leave();
      __ ret();
    }

    return start;
  }

  address generate_cont_thaw() {
    if (!Continuations::enabled()) return nullptr;

    StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    generate_cont_thaw(Continuation::thaw_top);
    return start;
  }

  address generate_cont_returnBarrier() {
    if (!Continuations::enabled()) return nullptr;

    // TODO: will probably need multiple return barriers depending on return type
    StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
generate_cont_thaw(Continuation::thaw_return_barrier); 4199 4200 return start; 4201 } 4202 4203 address generate_cont_returnBarrier_exception() { 4204 if (!Continuations::enabled()) return nullptr; 4205 4206 StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id; 4207 StubCodeMark mark(this, stub_id); 4208 address start = __ pc(); 4209 4210 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 4211 4212 return start; 4213 } 4214 4215 address generate_cont_preempt_stub() { 4216 if (!Continuations::enabled()) return nullptr; 4217 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 4218 StubCodeMark mark(this, stub_id); 4219 address start = __ pc(); 4220 4221 __ reset_last_Java_frame(true); 4222 4223 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 4224 __ ld(sp, Address(xthread, JavaThread::cont_entry_offset())); 4225 4226 Label preemption_cancelled; 4227 __ lbu(t0, Address(xthread, JavaThread::preemption_cancelled_offset())); 4228 __ bnez(t0, preemption_cancelled); 4229 4230 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 4231 SharedRuntime::continuation_enter_cleanup(_masm); 4232 __ leave(); 4233 __ ret(); 4234 4235 // We acquired the monitor after freezing the frames so call thaw to continue execution. 4236 __ bind(preemption_cancelled); 4237 __ sb(zr, Address(xthread, JavaThread::preemption_cancelled_offset())); 4238 __ la(fp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size() + 2 * wordSize))); 4239 __ la(t1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 4240 __ ld(t1, Address(t1)); 4241 __ jr(t1); 4242 4243 return start; 4244 } 4245 4246 #if COMPILER2_OR_JVMCI 4247 4248 #undef __ 4249 #define __ this-> 4250 4251 class Sha2Generator : public MacroAssembler { 4252 StubCodeGenerator* _cgen; 4253 public: 4254 Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {} 4255 address generate_sha256_implCompress(StubGenStubId stub_id) { 4256 return generate_sha2_implCompress(Assembler::e32, stub_id); 4257 } 4258 address generate_sha512_implCompress(StubGenStubId stub_id) { 4259 return generate_sha2_implCompress(Assembler::e64, stub_id); 4260 } 4261 private: 4262 4263 void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) { 4264 if (vset_sew == Assembler::e32) __ vle32_v(vr, sr); 4265 else __ vle64_v(vr, sr); 4266 } 4267 4268 void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) { 4269 if (vset_sew == Assembler::e32) __ vse32_v(vr, sr); 4270 else __ vse64_v(vr, sr); 4271 } 4272 4273 // Overview of the logic in each "quad round". 4274 // 4275 // The code below repeats 16/20 times the logic implementing four rounds 4276 // of the SHA-256/512 core loop as documented by NIST. 16/20 "quad rounds" 4277 // to implementing the 64/80 single rounds. 
4278 // 4279 // // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0]) 4280 // // Output: 4281 // // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]} 4282 // vl1reXX.v vTmp1, ofs 4283 // 4284 // // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b) 4285 // addi ofs, ofs, 16/32 4286 // 4287 // // Add constants to message schedule words: 4288 // // Input 4289 // // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]} 4290 // // vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0]; 4291 // // Output 4292 // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} 4293 // vadd.vv vTmp0, vTmp1, vW0 4294 // 4295 // // 2 rounds of working variables updates. 4296 // // vState1[t+4] <- vState1[t], vState0[t], vTmp0[t] 4297 // // Input: 4298 // // vState1 = {c[t],d[t],g[t],h[t]} " = vState1[t] " 4299 // // vState0 = {a[t],b[t],e[t],f[t]} 4300 // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} 4301 // // Output: 4302 // // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] " 4303 // // = {h[t+4],g[t+4],d[t+4],c[t+4]} " = vState1[t+4] " 4304 // vsha2cl.vv vState1, vState0, vTmp0 4305 // 4306 // // 2 rounds of working variables updates. 4307 // // vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t] 4308 // // Input 4309 // // vState0 = {a[t],b[t],e[t],f[t]} " = vState0[t] " 4310 // // = {h[t+2],g[t+2],d[t+2],c[t+2]} " = vState1[t+2] " 4311 // // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] " 4312 // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} 4313 // // Output: 4314 // // vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]} " = vState0[t+4] " 4315 // vsha2ch.vv vState0, vState1, vTmp0 4316 // 4317 // // Combine 2QW into 1QW 4318 // // 4319 // // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs 4320 // // vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3] 4321 // // and it can only take 3 vectors as inputs. Hence we need to combine 4322 // // vW1[0] and vW2[1..3] in a single vector. 
4323 // // 4324 // // vmerge Vt4, Vt1, Vt2, V0 4325 // // Input 4326 // // V0 = mask // first word from vW2, 1..3 words from vW1 4327 // // vW2 = {Wt-8, Wt-7, Wt-6, Wt-5} 4328 // // vW1 = {Wt-12, Wt-11, Wt-10, Wt-9} 4329 // // Output 4330 // // Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5} 4331 // vmerge.vvm vTmp0, vW2, vW1, v0 4332 // 4333 // // Generate next Four Message Schedule Words (hence allowing for 4 more rounds) 4334 // // Input 4335 // // vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]} W[ 3: 0] 4336 // // vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]} W[15:12] 4337 // // vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]} W[11: 9,4] 4338 // // Output (next four message schedule words) 4339 // // vW0 = {W[t+19], W[t+18], W[t+17], W[t+16]} W[19:16] 4340 // vsha2ms.vv vW0, vTmp0, vW3 4341 // 4342 // BEFORE 4343 // vW0 - vW3 hold the message schedule words (initially the block words) 4344 // vW0 = W[ 3: 0] "oldest" 4345 // vW1 = W[ 7: 4] 4346 // vW2 = W[11: 8] 4347 // vW3 = W[15:12] "newest" 4348 // 4349 // vt6 - vt7 hold the working state variables 4350 // vState0 = {a[t],b[t],e[t],f[t]} // initially {H5,H4,H1,H0} 4351 // vState1 = {c[t],d[t],g[t],h[t]} // initially {H7,H6,H3,H2} 4352 // 4353 // AFTER 4354 // vW0 - vW3 hold the message schedule words (initially the block words) 4355 // vW1 = W[ 7: 4] "oldest" 4356 // vW2 = W[11: 8] 4357 // vW3 = W[15:12] 4358 // vW0 = W[19:16] "newest" 4359 // 4360 // vState0 and vState1 hold the working state variables 4361 // vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]} 4362 // vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]} 4363 // 4364 // The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round, 4365 // hence the uses of those vectors rotate in each round, and we get back to the 4366 // initial configuration every 4 quad-rounds. We could avoid those changes at 4367 // the cost of moving those vectors at the end of each quad-rounds. 4368 void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4, 4369 Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh, 4370 bool gen_words = true, bool step_const = true) { 4371 __ vleXX_v(vset_sew, vtemp, scalarconst); 4372 if (step_const) { 4373 __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 
16 : 32); 4374 } 4375 __ vadd_vv(vtemp2, vtemp, rot1); 4376 __ vsha2cl_vv(v_cdgh, v_abef, vtemp2); 4377 __ vsha2ch_vv(v_abef, v_cdgh, vtemp2); 4378 if (gen_words) { 4379 __ vmerge_vvm(vtemp2, rot3, rot2); 4380 __ vsha2ms_vv(rot1, vtemp2, rot4); 4381 } 4382 } 4383 4384 // Arguments: 4385 // 4386 // Inputs: 4387 // c_rarg0 - byte[] source+offset 4388 // c_rarg1 - int[] SHA.state 4389 // c_rarg2 - int offset 4390 // c_rarg3 - int limit 4391 // 4392 address generate_sha2_implCompress(Assembler::SEW vset_sew, StubGenStubId stub_id) { 4393 alignas(64) static const uint32_t round_consts_256[64] = { 4394 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 4395 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 4396 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 4397 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 4398 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 4399 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 4400 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 4401 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 4402 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 4403 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 4404 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 4405 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 4406 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 4407 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 4408 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 4409 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 4410 }; 4411 alignas(64) static const uint64_t round_consts_512[80] = { 4412 0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl, 4413 0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l, 4414 0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l, 4415 0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l, 4416 0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l, 4417 0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l, 4418 0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l, 4419 0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l, 4420 0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl, 4421 0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l, 4422 0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl, 4423 0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl, 4424 0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l, 4425 0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l, 4426 0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l, 4427 0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l, 4428 0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l, 4429 0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl, 4430 0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl, 4431 0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl, 4432 0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l, 4433 0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l, 4434 0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal, 4435 0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl, 4436 0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl, 4437 0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al, 4438 0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l 4439 }; 4440 const int const_add = vset_sew == Assembler::e32 ? 
16 : 32;

    bool multi_block;
    switch (stub_id) {
    case sha256_implCompress_id:
      assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
      multi_block = false;
      break;
    case sha256_implCompressMB_id:
      assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
      multi_block = true;
      break;
    case sha512_implCompress_id:
      assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
      multi_block = false;
      break;
    case sha512_implCompressMB_id:
      assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    };
    __ align(CodeEntryAlignment);
    StubCodeMark mark(_cgen, stub_id);
    address start = __ pc();

    Register buf     = c_rarg0;
    Register state   = c_rarg1;
    Register ofs     = c_rarg2;
    Register limit   = c_rarg3;
    Register consts  = t2;  // caller saved
    Register state_c = x28; // caller saved
    VectorRegister vindex  = v2;
    VectorRegister vW0     = v4;
    VectorRegister vW1     = v6;
    VectorRegister vW2     = v8;
    VectorRegister vW3     = v10;
    VectorRegister vState0 = v12;
    VectorRegister vState1 = v14;
    VectorRegister vHash0  = v16;
    VectorRegister vHash1  = v18;
    VectorRegister vTmp0   = v20;
    VectorRegister vTmp1   = v22;

    Label multi_block_loop;

    __ enter();

    address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
    la(consts, ExternalAddress(constant_table));

    // Register use in this function:
    //
    // VECTORS
    //  vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/64 bits), hold the message
    //             schedule words (Wt). They start with the message block
    //             content (W0 to W15), then further words in the message
    //             schedule generated via vsha2ms from previous Wt.
    //   Initially:
    //     vW0 = W[  3:0] = { W3,  W2,  W1,  W0}
    //     vW1 = W[  7:4] = { W7,  W6,  W5,  W4}
    //     vW2 = W[ 11:8] = {W11, W10,  W9,  W8}
    //     vW3 = W[15:12] = {W15, W14, W13, W12}
    //
    //  vState0 - vState1 hold the working state variables (a, b, ..., h)
    //    vState0 = {f[t],e[t],b[t],a[t]}
    //    vState1 = {h[t],g[t],d[t],c[t]}
    //   Initially:
    //     vState0 = {H5i-1, H4i-1, H1i-1, H0i-1}
    //     vState1 = {H7i-1, H6i-1, H3i-1, H2i-1}
    //
    //  v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
    //
    //  vTmp0 = temporary, Wt+Kt
    //  vTmp1 = temporary, Kt
    //
    //  vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
    //
    // During most of the function the vector state is configured so that each
    // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).

    // vsha2ch/vsha2cl uses EGW of 4*SEW.
    //   SHA256 SEW = e32, EGW = 128-bits
    //   SHA512 SEW = e64, EGW = 256-bits
    //
    // VLEN is required to be at least 128.
    // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
    //
    // m1: LMUL=1/2
    // ta: tail agnostic (don't care about those lanes)
    // ma: mask agnostic (don't care about those lanes)
    // x0 is not written; we know the number of vector elements.
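    // (Editor's worked example; illustrative only.)  The constant splatted
    // into vindex below is consumed by vluxei8_v as four 8-bit byte offsets.
    // For SHA-256, 0x00041014 gives the offsets {0x14, 0x10, 0x04, 0x00}
    // = {20, 16, 4, 0}, so the gather from 'state' loads
    // {state[5], state[4], state[1], state[0]} = {f, e, b, a}, and the same
    // gather from 'state_c' (state + 8 bytes) loads
    // {state[7], state[6], state[3], state[2]} = {h, g, d, c}.
    // For SHA-512 the offsets double: 0x00082028 -> {40, 32, 8, 0}.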
4533 4534 if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128 4535 __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta); 4536 } else { 4537 __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta); 4538 } 4539 4540 int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul; 4541 __ li(t0, indexes); 4542 __ vmv_v_x(vindex, t0); 4543 4544 // Step-over a,b, so we are pointing to c. 4545 // const_add is equal to 4x state variable, div by 2 is thus 2, a,b 4546 __ addi(state_c, state, const_add/2); 4547 4548 // Use index-load to get {f,e,b,a},{h,g,d,c} 4549 __ vluxei8_v(vState0, state, vindex); 4550 __ vluxei8_v(vState1, state_c, vindex); 4551 4552 __ bind(multi_block_loop); 4553 4554 // Capture the initial H values in vHash0 and vHash1 to allow for computing 4555 // the resulting H', since H' = H+{a',b',c',...,h'}. 4556 __ vmv_v_v(vHash0, vState0); 4557 __ vmv_v_v(vHash1, vState1); 4558 4559 // Load the 512/1024-bits of the message block in vW0-vW3 and perform 4560 // an endian swap on each 4/8 bytes element. 4561 // 4562 // If Zvkb is not implemented one can use vrgather 4563 // with an index sequence to byte-swap. 4564 // sequence = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12] 4565 // <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate 4566 // this sequence. 'vid' gives us the N. 4567 __ vleXX_v(vset_sew, vW0, buf); 4568 __ vrev8_v(vW0, vW0); 4569 __ addi(buf, buf, const_add); 4570 __ vleXX_v(vset_sew, vW1, buf); 4571 __ vrev8_v(vW1, vW1); 4572 __ addi(buf, buf, const_add); 4573 __ vleXX_v(vset_sew, vW2, buf); 4574 __ vrev8_v(vW2, vW2); 4575 __ addi(buf, buf, const_add); 4576 __ vleXX_v(vset_sew, vW3, buf); 4577 __ vrev8_v(vW3, vW3); 4578 __ addi(buf, buf, const_add); 4579 4580 // Set v0 up for the vmerge that replaces the first word (idx==0) 4581 __ vid_v(v0); 4582 __ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0) 4583 4584 VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3}; 4585 int rot_pos = 0; 4586 // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2) 4587 const int qr_end = vset_sew == Assembler::e32 ? 12 : 16; 4588 for (int i = 0; i < qr_end; i++) { 4589 sha2_quad_round(vset_sew, 4590 rotation_regs[(rot_pos + 0) & 0x3], 4591 rotation_regs[(rot_pos + 1) & 0x3], 4592 rotation_regs[(rot_pos + 2) & 0x3], 4593 rotation_regs[(rot_pos + 3) & 0x3], 4594 consts, 4595 vTmp1, vTmp0, vState0, vState1); 4596 ++rot_pos; 4597 } 4598 // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2) 4599 // Note that we stop generating new message schedule words (Wt, vW0-13) 4600 // as we already generated all the words we end up consuming (i.e., W[63:60]). 
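    // (Editor's note.)  Quad-round accounting: SHA-256 runs 12 word-generating
    // quad rounds plus the 4 final ones below (16 * 4 = 64 rounds); SHA-512
    // runs 16 + 4 quad rounds (20 * 4 = 80 rounds).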
4601 const int qr_c_end = qr_end + 4; 4602 for (int i = qr_end; i < qr_c_end; i++) { 4603 sha2_quad_round(vset_sew, 4604 rotation_regs[(rot_pos + 0) & 0x3], 4605 rotation_regs[(rot_pos + 1) & 0x3], 4606 rotation_regs[(rot_pos + 2) & 0x3], 4607 rotation_regs[(rot_pos + 3) & 0x3], 4608 consts, 4609 vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1)); 4610 ++rot_pos; 4611 } 4612 4613 //-------------------------------------------------------------------------------- 4614 // Compute the updated hash value H' 4615 // H' = H + {h',g',...,b',a'} 4616 // = {h,g,...,b,a} + {h',g',...,b',a'} 4617 // = {h+h',g+g',...,b+b',a+a'} 4618 4619 // H' = H+{a',b',c',...,h'} 4620 __ vadd_vv(vState0, vHash0, vState0); 4621 __ vadd_vv(vState1, vHash1, vState1); 4622 4623 if (multi_block) { 4624 int total_adds = vset_sew == Assembler::e32 ? 240 : 608; 4625 __ subi(consts, consts, total_adds); 4626 __ addi(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128); 4627 __ ble(ofs, limit, multi_block_loop); 4628 __ mv(c_rarg0, ofs); // return ofs 4629 } 4630 4631 // Store H[0..8] = {a,b,c,d,e,f,g,h} from 4632 // vState0 = {f,e,b,a} 4633 // vState1 = {h,g,d,c} 4634 __ vsuxei8_v(vState0, state, vindex); 4635 __ vsuxei8_v(vState1, state_c, vindex); 4636 4637 __ leave(); 4638 __ ret(); 4639 4640 return start; 4641 } 4642 }; 4643 4644 #undef __ 4645 #define __ _masm-> 4646 4647 // Set of L registers that correspond to a contiguous memory area. 4648 // Each 64-bit register typically corresponds to 2 32-bit integers. 4649 template <uint L> 4650 class RegCache { 4651 private: 4652 MacroAssembler *_masm; 4653 Register _regs[L]; 4654 4655 public: 4656 RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) { 4657 assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L); 4658 auto it = rs.begin(); 4659 for (auto &r: _regs) { 4660 r = *it; 4661 ++it; 4662 } 4663 } 4664 4665 // generate load for the i'th register 4666 void gen_load(uint i, Register base) { 4667 assert(i < L, "invalid i: %u", i); 4668 __ ld(_regs[i], Address(base, 8 * i)); 4669 } 4670 4671 // add i'th 32-bit integer to dest 4672 void add_u32(const Register dest, uint i, const Register rtmp = t0) { 4673 assert(i < 2 * L, "invalid i: %u", i); 4674 4675 if (is_even(i)) { 4676 // Use the bottom 32 bits. No need to mask off the top 32 bits 4677 // as addw will do the right thing. 4678 __ addw(dest, dest, _regs[i / 2]); 4679 } else { 4680 // Use the top 32 bits by right-shifting them. 
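      // (Editor's example.)  Each _regs[k] caches buf[2k] in its low 32 bits
      // and buf[2k+1] in its high 32 bits, so e.g. i == 5 lives in the top
      // half of _regs[2] and is extracted by the shift below.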
4681 __ srli(rtmp, _regs[i / 2], 32); 4682 __ addw(dest, dest, rtmp); 4683 } 4684 } 4685 }; 4686 4687 typedef RegCache<8> BufRegCache; 4688 4689 // a += value + x + ac; 4690 // a = Integer.rotateLeft(a, s) + b; 4691 void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache, 4692 Register a, Register b, Register c, Register d, 4693 int k, int s, int t, 4694 Register value) { 4695 // a += ac 4696 __ addw(a, a, t, t1); 4697 4698 // a += x; 4699 reg_cache.add_u32(a, k); 4700 // a += value; 4701 __ addw(a, a, value); 4702 4703 // a = Integer.rotateLeft(a, s) + b; 4704 __ rolw(a, a, s); 4705 __ addw(a, a, b); 4706 } 4707 4708 // a += ((b & c) | ((~b) & d)) + x + ac; 4709 // a = Integer.rotateLeft(a, s) + b; 4710 void md5_FF(BufRegCache& reg_cache, 4711 Register a, Register b, Register c, Register d, 4712 int k, int s, int t, 4713 Register rtmp1, Register rtmp2) { 4714 // rtmp1 = b & c 4715 __ andr(rtmp1, b, c); 4716 4717 // rtmp2 = (~b) & d 4718 __ andn(rtmp2, d, b); 4719 4720 // rtmp1 = (b & c) | ((~b) & d) 4721 __ orr(rtmp1, rtmp1, rtmp2); 4722 4723 m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); 4724 } 4725 4726 // a += ((b & d) | (c & (~d))) + x + ac; 4727 // a = Integer.rotateLeft(a, s) + b; 4728 void md5_GG(BufRegCache& reg_cache, 4729 Register a, Register b, Register c, Register d, 4730 int k, int s, int t, 4731 Register rtmp1, Register rtmp2) { 4732 // rtmp1 = b & d 4733 __ andr(rtmp1, b, d); 4734 4735 // rtmp2 = c & (~d) 4736 __ andn(rtmp2, c, d); 4737 4738 // rtmp1 = (b & d) | (c & (~d)) 4739 __ orr(rtmp1, rtmp1, rtmp2); 4740 4741 m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); 4742 } 4743 4744 // a += ((b ^ c) ^ d) + x + ac; 4745 // a = Integer.rotateLeft(a, s) + b; 4746 void md5_HH(BufRegCache& reg_cache, 4747 Register a, Register b, Register c, Register d, 4748 int k, int s, int t, 4749 Register rtmp1, Register rtmp2) { 4750 // rtmp1 = (b ^ c) ^ d 4751 __ xorr(rtmp2, b, c); 4752 __ xorr(rtmp1, rtmp2, d); 4753 4754 m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); 4755 } 4756 4757 // a += (c ^ (b | (~d))) + x + ac; 4758 // a = Integer.rotateLeft(a, s) + b; 4759 void md5_II(BufRegCache& reg_cache, 4760 Register a, Register b, Register c, Register d, 4761 int k, int s, int t, 4762 Register rtmp1, Register rtmp2) { 4763 // rtmp1 = c ^ (b | (~d)) 4764 __ orn(rtmp2, b, d); 4765 __ xorr(rtmp1, c, rtmp2); 4766 4767 m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1); 4768 } 4769 4770 // Arguments: 4771 // 4772 // Inputs: 4773 // c_rarg0 - byte[] source+offset 4774 // c_rarg1 - int[] SHA.state 4775 // c_rarg2 - int offset (multi_block == True) 4776 // c_rarg3 - int limit (multi_block == True) 4777 // 4778 // Registers: 4779 // x0 zero (zero) 4780 // x1 ra (return address) 4781 // x2 sp (stack pointer) 4782 // x3 gp (global pointer) 4783 // x4 tp (thread pointer) 4784 // x5 t0 (tmp register) 4785 // x6 t1 (tmp register) 4786 // x7 t2 state0 4787 // x8 f0/s0 (frame pointer) 4788 // x9 s1 4789 // x10 a0 rtmp1 / c_rarg0 4790 // x11 a1 rtmp2 / c_rarg1 4791 // x12 a2 a / c_rarg2 4792 // x13 a3 b / c_rarg3 4793 // x14 a4 c 4794 // x15 a5 d 4795 // x16 a6 buf 4796 // x17 a7 state 4797 // x18 s2 ofs [saved-reg] (multi_block == True) 4798 // x19 s3 limit [saved-reg] (multi_block == True) 4799 // x20 s4 state1 [saved-reg] 4800 // x21 s5 state2 [saved-reg] 4801 // x22 s6 state3 [saved-reg] 4802 // x23 s7 4803 // x24 s8 buf0 [saved-reg] 4804 // x25 s9 buf1 [saved-reg] 4805 // x26 s10 buf2 [saved-reg] 4806 // x27 s11 buf3 [saved-reg] 4807 // x28 t3 buf4 
4808 // x29 t4 buf5 4809 // x30 t5 buf6 4810 // x31 t6 buf7 4811 address generate_md5_implCompress(StubGenStubId stub_id) { 4812 __ align(CodeEntryAlignment); 4813 bool multi_block; 4814 switch (stub_id) { 4815 case md5_implCompress_id: 4816 multi_block = false; 4817 break; 4818 case md5_implCompressMB_id: 4819 multi_block = true; 4820 break; 4821 default: 4822 ShouldNotReachHere(); 4823 }; 4824 StubCodeMark mark(this, stub_id); 4825 address start = __ pc(); 4826 4827 // rotation constants 4828 const int S11 = 7; 4829 const int S12 = 12; 4830 const int S13 = 17; 4831 const int S14 = 22; 4832 const int S21 = 5; 4833 const int S22 = 9; 4834 const int S23 = 14; 4835 const int S24 = 20; 4836 const int S31 = 4; 4837 const int S32 = 11; 4838 const int S33 = 16; 4839 const int S34 = 23; 4840 const int S41 = 6; 4841 const int S42 = 10; 4842 const int S43 = 15; 4843 const int S44 = 21; 4844 4845 const int64_t mask32 = 0xffffffff; 4846 4847 Register buf_arg = c_rarg0; // a0 4848 Register state_arg = c_rarg1; // a1 4849 Register ofs_arg = c_rarg2; // a2 4850 Register limit_arg = c_rarg3; // a3 4851 4852 // we'll copy the args to these registers to free up a0-a3 4853 // to use for other values manipulated by instructions 4854 // that can be compressed 4855 Register buf = x16; // a6 4856 Register state = x17; // a7 4857 Register ofs = x18; // s2 4858 Register limit = x19; // s3 4859 4860 // using x12->15 to allow compressed instructions 4861 Register a = x12; // a2 4862 Register b = x13; // a3 4863 Register c = x14; // a4 4864 Register d = x15; // a5 4865 4866 Register state0 = x7; // t2 4867 Register state1 = x20; // s4 4868 Register state2 = x21; // s5 4869 Register state3 = x22; // s6 4870 4871 // using x10->x11 to allow compressed instructions 4872 Register rtmp1 = x10; // a0 4873 Register rtmp2 = x11; // a1 4874 4875 RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11 4876 RegSet reg_cache_regs; 4877 reg_cache_regs += reg_cache_saved_regs; 4878 reg_cache_regs += RegSet::of(t3, t4, t5, t6); 4879 BufRegCache reg_cache(_masm, reg_cache_regs); 4880 4881 RegSet saved_regs; 4882 if (multi_block) { 4883 saved_regs += RegSet::of(ofs, limit); 4884 } 4885 saved_regs += RegSet::of(state1, state2, state3); 4886 saved_regs += reg_cache_saved_regs; 4887 4888 __ push_reg(saved_regs, sp); 4889 4890 __ mv(buf, buf_arg); 4891 __ mv(state, state_arg); 4892 if (multi_block) { 4893 __ mv(ofs, ofs_arg); 4894 __ mv(limit, limit_arg); 4895 } 4896 4897 // to minimize the number of memory operations: 4898 // read the 4 state 4-byte values in pairs, with a single ld, 4899 // and split them into 2 registers. 4900 // 4901 // And, as the core algorithm of md5 works on 32-bits words, so 4902 // in the following code, it does not care about the content of 4903 // higher 32-bits in state[x]. Based on this observation, 4904 // we can apply further optimization, which is to just ignore the 4905 // higher 32-bits in state0/state2, rather than set the higher 4906 // 32-bits of state0/state2 to zero explicitly with extra instructions. 
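    // (Editor's sketch of the packing; little-endian, illustrative only.)
    //   state0 <- 64-bit load  = (uint64_t)state[1] << 32 | state[0]
    //   state1 <- state0 >> 32 =           state[1]
    //   state2 <- 64-bit load  = (uint64_t)state[3] << 32 | state[2]
    //   state3 <- state2 >> 32 =           state[3]
    // Only the low 32 bits of state0/state2 are ever used (as "a" and "c").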
4907 __ ld(state0, Address(state)); 4908 __ srli(state1, state0, 32); 4909 __ ld(state2, Address(state, 8)); 4910 __ srli(state3, state2, 32); 4911 4912 Label md5_loop; 4913 __ BIND(md5_loop); 4914 4915 __ mv(a, state0); 4916 __ mv(b, state1); 4917 __ mv(c, state2); 4918 __ mv(d, state3); 4919 4920 // Round 1 4921 reg_cache.gen_load(0, buf); 4922 md5_FF(reg_cache, a, b, c, d, 0, S11, 0xd76aa478, rtmp1, rtmp2); 4923 md5_FF(reg_cache, d, a, b, c, 1, S12, 0xe8c7b756, rtmp1, rtmp2); 4924 reg_cache.gen_load(1, buf); 4925 md5_FF(reg_cache, c, d, a, b, 2, S13, 0x242070db, rtmp1, rtmp2); 4926 md5_FF(reg_cache, b, c, d, a, 3, S14, 0xc1bdceee, rtmp1, rtmp2); 4927 reg_cache.gen_load(2, buf); 4928 md5_FF(reg_cache, a, b, c, d, 4, S11, 0xf57c0faf, rtmp1, rtmp2); 4929 md5_FF(reg_cache, d, a, b, c, 5, S12, 0x4787c62a, rtmp1, rtmp2); 4930 reg_cache.gen_load(3, buf); 4931 md5_FF(reg_cache, c, d, a, b, 6, S13, 0xa8304613, rtmp1, rtmp2); 4932 md5_FF(reg_cache, b, c, d, a, 7, S14, 0xfd469501, rtmp1, rtmp2); 4933 reg_cache.gen_load(4, buf); 4934 md5_FF(reg_cache, a, b, c, d, 8, S11, 0x698098d8, rtmp1, rtmp2); 4935 md5_FF(reg_cache, d, a, b, c, 9, S12, 0x8b44f7af, rtmp1, rtmp2); 4936 reg_cache.gen_load(5, buf); 4937 md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2); 4938 md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2); 4939 reg_cache.gen_load(6, buf); 4940 md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2); 4941 md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2); 4942 reg_cache.gen_load(7, buf); 4943 md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2); 4944 md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2); 4945 4946 // Round 2 4947 md5_GG(reg_cache, a, b, c, d, 1, S21, 0xf61e2562, rtmp1, rtmp2); 4948 md5_GG(reg_cache, d, a, b, c, 6, S22, 0xc040b340, rtmp1, rtmp2); 4949 md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2); 4950 md5_GG(reg_cache, b, c, d, a, 0, S24, 0xe9b6c7aa, rtmp1, rtmp2); 4951 md5_GG(reg_cache, a, b, c, d, 5, S21, 0xd62f105d, rtmp1, rtmp2); 4952 md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2); 4953 md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2); 4954 md5_GG(reg_cache, b, c, d, a, 4, S24, 0xe7d3fbc8, rtmp1, rtmp2); 4955 md5_GG(reg_cache, a, b, c, d, 9, S21, 0x21e1cde6, rtmp1, rtmp2); 4956 md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2); 4957 md5_GG(reg_cache, c, d, a, b, 3, S23, 0xf4d50d87, rtmp1, rtmp2); 4958 md5_GG(reg_cache, b, c, d, a, 8, S24, 0x455a14ed, rtmp1, rtmp2); 4959 md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2); 4960 md5_GG(reg_cache, d, a, b, c, 2, S22, 0xfcefa3f8, rtmp1, rtmp2); 4961 md5_GG(reg_cache, c, d, a, b, 7, S23, 0x676f02d9, rtmp1, rtmp2); 4962 md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2); 4963 4964 // Round 3 4965 md5_HH(reg_cache, a, b, c, d, 5, S31, 0xfffa3942, rtmp1, rtmp2); 4966 md5_HH(reg_cache, d, a, b, c, 8, S32, 0x8771f681, rtmp1, rtmp2); 4967 md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2); 4968 md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2); 4969 md5_HH(reg_cache, a, b, c, d, 1, S31, 0xa4beea44, rtmp1, rtmp2); 4970 md5_HH(reg_cache, d, a, b, c, 4, S32, 0x4bdecfa9, rtmp1, rtmp2); 4971 md5_HH(reg_cache, c, d, a, b, 7, S33, 0xf6bb4b60, rtmp1, rtmp2); 4972 md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2); 4973 md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2); 4974 md5_HH(reg_cache, d, a, b, c, 0, S32, 
0xeaa127fa, rtmp1, rtmp2);
    md5_HH(reg_cache, c, d, a, b,  3, S33, 0xd4ef3085, rtmp1, rtmp2);
    md5_HH(reg_cache, b, c, d, a,  6, S34, 0x04881d05, rtmp1, rtmp2);
    md5_HH(reg_cache, a, b, c, d,  9, S31, 0xd9d4d039, rtmp1, rtmp2);
    md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
    md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
    md5_HH(reg_cache, b, c, d, a,  2, S34, 0xc4ac5665, rtmp1, rtmp2);

    // Round 4
    md5_II(reg_cache, a, b, c, d,  0, S41, 0xf4292244, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c,  7, S42, 0x432aff97, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a,  5, S44, 0xfc93a039, rtmp1, rtmp2);
    md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c,  3, S42, 0x8f0ccc92, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a,  1, S44, 0x85845dd1, rtmp1, rtmp2);
    md5_II(reg_cache, a, b, c, d,  8, S41, 0x6fa87e4f, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b,  6, S43, 0xa3014314, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
    md5_II(reg_cache, a, b, c, d,  4, S41, 0xf7537e82, rtmp1, rtmp2);
    md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
    md5_II(reg_cache, c, d, a, b,  2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
    md5_II(reg_cache, b, c, d, a,  9, S44, 0xeb86d391, rtmp1, rtmp2);

    __ addw(state0, state0, a);
    __ addw(state1, state1, b);
    __ addw(state2, state2, c);
    __ addw(state3, state3, d);

    if (multi_block) {
      __ addi(buf, buf, 64);
      __ addi(ofs, ofs, 64);
      // if (ofs <= limit) goto md5_loop
      __ bge(limit, ofs, md5_loop);
      __ mv(c_rarg0, ofs); // return ofs
    }

    // to minimize the number of memory operations:
    // write back the 4 state 4-byte values in pairs, with a single sd
    __ mv(t0, mask32);
    __ andr(state0, state0, t0);
    __ slli(state1, state1, 32);
    __ orr(state0, state0, state1);
    __ sd(state0, Address(state));
    __ andr(state2, state2, t0);
    __ slli(state3, state3, 32);
    __ orr(state2, state2, state3);
    __ sd(state2, Address(state, 8));

    __ pop_reg(saved_regs, sp);
    __ ret();

    return (address) start;
  }

  /**
   * Perform the quarter round calculations on values contained within four vector registers.
   *
   * @param aVec   the SIMD register containing only the "a" values
   * @param bVec   the SIMD register containing only the "b" values
   * @param cVec   the SIMD register containing only the "c" values
   * @param dVec   the SIMD register containing only the "d" values
   * @param tmp_vr temporary vector register that holds intermediate values.
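   *
   * (Editor's note: illustrative scalar model of one quarter round, per lane:)
   *   a += b;  d ^= a;  d = rotl32(d, 16);
   *   c += d;  b ^= c;  b = rotl32(b, 12);
   *   a += b;  d ^= a;  d = rotl32(d,  8);
   *   c += d;  b ^= c;  b = rotl32(b,  7);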
   */
  void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
                              VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
    // a += b, d ^= a, d <<<= 16
    __ vadd_vv(aVec, aVec, bVec);
    __ vxor_vv(dVec, dVec, aVec);
    __ vrole32_vi(dVec, 16, tmp_vr);

    // c += d, b ^= c, b <<<= 12
    __ vadd_vv(cVec, cVec, dVec);
    __ vxor_vv(bVec, bVec, cVec);
    __ vrole32_vi(bVec, 12, tmp_vr);

    // a += b, d ^= a, d <<<= 8
    __ vadd_vv(aVec, aVec, bVec);
    __ vxor_vv(dVec, dVec, aVec);
    __ vrole32_vi(dVec, 8, tmp_vr);

    // c += d, b ^= c, b <<<= 7
    __ vadd_vv(cVec, cVec, dVec);
    __ vxor_vv(bVec, bVec, cVec);
    __ vrole32_vi(bVec, 7, tmp_vr);
  }

  /**
   * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
   *
   * Input arguments:
   *  c_rarg0 - state, the starting state
   *  c_rarg1 - key_stream, the array that will hold the result of the ChaCha20 block function
   *
   * Implementation Note:
   *  Parallelization is achieved by loading individual state elements into vectors for N blocks.
   *  N depends on single vector register length.
   */
  address generate_chacha20Block() {
    Label L_Rounds;

    __ align(CodeEntryAlignment);
    StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const int states_len = 16;
    const int step = 4;
    const Register state = c_rarg0;
    const Register key_stream = c_rarg1;
    const Register tmp_addr = t0;
    const Register length = t1;

    // Organize vector registers in an array that facilitates
    // putting repetitive opcodes into loop structures below.
    const VectorRegister work_vrs[16] = {
      v0, v1, v2, v3, v4, v5, v6, v7,
      v8, v9, v10, v11, v12, v13, v14, v15
    };
    const VectorRegister tmp_vr = v16;
    const VectorRegister counter_vr = v17;

    {
      // Put 16 here, as com.sun.crypto.provider.ChaCha20Cipher.KS_MAX_LEN is 1024
      // at the Java level.
      __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
    }

    // Load from source state.
    // Every element in source state is duplicated to all elements in the corresponding vector.
    __ mv(tmp_addr, state);
    for (int i = 0; i < states_len; i += 1) {
      __ vlse32_v(work_vrs[i], tmp_addr, zr);
      __ addi(tmp_addr, tmp_addr, step);
    }
    // Adjust counter for every individual block.
    __ vid_v(counter_vr);
    __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);

    // Perform 10 iterations of the 8 quarter round set
    {
      const Register loop = t2; // share t2 with other non-overlapping usages.
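      // (Editor's note.)  Each iteration of the loop below is one ChaCha20
      // "double round": 4 column quarter rounds followed by 4 diagonal
      // quarter rounds, so 10 iterations give the full 20 rounds.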
5119 __ mv(loop, 10); 5120 __ BIND(L_Rounds); 5121 5122 chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8], work_vrs[12], tmp_vr); 5123 chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9], work_vrs[13], tmp_vr); 5124 chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr); 5125 chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr); 5126 5127 chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr); 5128 chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr); 5129 chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8], work_vrs[13], tmp_vr); 5130 chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9], work_vrs[14], tmp_vr); 5131 5132 __ subi(loop, loop, 1); 5133 __ bnez(loop, L_Rounds); 5134 } 5135 5136 // Add the original state into the end working state. 5137 // We do this by first duplicating every element in source state array to the corresponding 5138 // vector, then adding it to the post-loop working state. 5139 __ mv(tmp_addr, state); 5140 for (int i = 0; i < states_len; i += 1) { 5141 __ vlse32_v(tmp_vr, tmp_addr, zr); 5142 __ addi(tmp_addr, tmp_addr, step); 5143 __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr); 5144 } 5145 // Add the counter overlay onto work_vrs[12] at the end. 5146 __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr); 5147 5148 // Store result to key stream. 5149 { 5150 const Register stride = t2; // share t2 with other non-overlapping usages. 5151 // Every block occupies 64 bytes, so we use 64 as stride of the vector store. 5152 __ mv(stride, 64); 5153 for (int i = 0; i < states_len; i += 1) { 5154 __ vsse32_v(work_vrs[i], key_stream, stride); 5155 __ addi(key_stream, key_stream, step); 5156 } 5157 } 5158 5159 // Return length of output key_stream 5160 __ slli(c_rarg0, length, 6); 5161 5162 __ leave(); 5163 __ ret(); 5164 5165 return (address) start; 5166 } 5167 5168 5169 // ------------------------ SHA-1 intrinsic ------------------------ 5170 5171 // K't = 5172 // 5a827999, 0 <= t <= 19 5173 // 6ed9eba1, 20 <= t <= 39 5174 // 8f1bbcdc, 40 <= t <= 59 5175 // ca62c1d6, 60 <= t <= 79 5176 void sha1_prepare_k(Register cur_k, int round) { 5177 assert(round >= 0 && round < 80, "must be"); 5178 5179 static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6}; 5180 if ((round % 20) == 0) { 5181 __ mv(cur_k, ks[round/20]); 5182 } 5183 } 5184 5185 // W't = 5186 // M't, 0 <= t <= 15 5187 // ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79 5188 void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) { 5189 assert(round >= 0 && round < 80, "must be"); 5190 5191 if (round < 16) { 5192 // in the first 16 rounds, in ws[], every register contains 2 W't, e.g. 5193 // in ws[0], high part contains W't-0, low part contains W't-1, 5194 // in ws[1], high part contains W't-2, low part contains W't-3, 5195 // ... 5196 // in ws[7], high part contains W't-14, low part contains W't-15. 5197 5198 if ((round % 2) == 0) { 5199 __ ld(ws[round/2], Address(buf, (round/2) * 8)); 5200 // reverse bytes, as SHA-1 is defined in big-endian. 
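        // (Editor's worked example, assuming a little-endian load.)  For
        // message bytes b0..b7 the ld above yields b7b6b5b4_b3b2b1b0; after
        // revb the register holds b0b1b2b3_b4b5b6b7, i.e. the first
        // big-endian word in the high 32 bits and the second in the low 32.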
5201 __ revb(ws[round/2], ws[round/2]); 5202 __ srli(cur_w, ws[round/2], 32); 5203 } else { 5204 __ mv(cur_w, ws[round/2]); 5205 } 5206 5207 return; 5208 } 5209 5210 if ((round % 2) == 0) { 5211 int idx = 16; 5212 // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79 5213 __ srli(t1, ws[(idx-8)/2], 32); 5214 __ xorr(t0, ws[(idx-3)/2], t1); 5215 5216 __ srli(t1, ws[(idx-14)/2], 32); 5217 __ srli(cur_w, ws[(idx-16)/2], 32); 5218 __ xorr(cur_w, cur_w, t1); 5219 5220 __ xorr(cur_w, cur_w, t0); 5221 __ rolw(cur_w, cur_w, 1, t0); 5222 5223 // copy the cur_w value to ws[8]. 5224 // now, valid w't values are at: 5225 // w0: ws[0]'s lower 32 bits 5226 // w1 ~ w14: ws[1] ~ ws[7] 5227 // w15: ws[8]'s higher 32 bits 5228 __ slli(ws[idx/2], cur_w, 32); 5229 5230 return; 5231 } 5232 5233 int idx = 17; 5234 // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79 5235 __ srli(t1, ws[(idx-3)/2], 32); 5236 __ xorr(t0, t1, ws[(idx-8)/2]); 5237 5238 __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]); 5239 5240 __ xorr(cur_w, cur_w, t0); 5241 __ rolw(cur_w, cur_w, 1, t0); 5242 5243 // copy the cur_w value to ws[8] 5244 __ zext(cur_w, cur_w, 32); 5245 __ orr(ws[idx/2], ws[idx/2], cur_w); 5246 5247 // shift the w't registers, so they start from ws[0] again. 5248 // now, valid w't values are at: 5249 // w0 ~ w15: ws[0] ~ ws[7] 5250 Register ws_0 = ws[0]; 5251 for (int i = 0; i < 16/2; i++) { 5252 ws[i] = ws[i+1]; 5253 } 5254 ws[8] = ws_0; 5255 } 5256 5257 // f't(x, y, z) = 5258 // Ch(x, y, z) = (x & y) ^ (~x & z) , 0 <= t <= 19 5259 // Parity(x, y, z) = x ^ y ^ z , 20 <= t <= 39 5260 // Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) , 40 <= t <= 59 5261 // Parity(x, y, z) = x ^ y ^ z , 60 <= t <= 79 5262 void sha1_f(Register dst, Register x, Register y, Register z, int round) { 5263 assert(round >= 0 && round < 80, "must be"); 5264 assert_different_registers(dst, x, y, z, t0, t1); 5265 5266 if (round < 20) { 5267 // (x & y) ^ (~x & z) 5268 __ andr(t0, x, y); 5269 __ andn(dst, z, x); 5270 __ xorr(dst, dst, t0); 5271 } else if (round >= 40 && round < 60) { 5272 // (x & y) ^ (x & z) ^ (y & z) 5273 __ andr(t0, x, y); 5274 __ andr(t1, x, z); 5275 __ andr(dst, y, z); 5276 __ xorr(dst, dst, t0); 5277 __ xorr(dst, dst, t1); 5278 } else { 5279 // x ^ y ^ z 5280 __ xorr(dst, x, y); 5281 __ xorr(dst, dst, z); 5282 } 5283 } 5284 5285 // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't 5286 // e = d 5287 // d = c 5288 // c = ROTL'30(b) 5289 // b = a 5290 // a = T 5291 void sha1_process_round(Register a, Register b, Register c, Register d, Register e, 5292 Register cur_k, Register cur_w, Register tmp, int round) { 5293 assert(round >= 0 && round < 80, "must be"); 5294 assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0); 5295 5296 // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't 5297 5298 // cur_w will be recalculated at the beginning of each round, 5299 // so, we can reuse it as a temp register here. 
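// For reference, one SHA-1 round in plain C (illustrative sketch; rotl32 is a
// hypothetical helper, and f/K/W stand for the per-round function, constant and
// schedule word prepared above):
//
//   uint32_t T = rotl32(a, 5) + f(b, c, d) + e + K + W;
//   e = d; d = c; c = rotl32(b, 30); b = a; a = T;
//
// The code below computes T piecewise, folding (K + W) and e into temporaries
// before adding the rotated a and f(b, c, d).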
5300 Register tmp2 = cur_w; 5301 5302 // reuse e as a temporary register, as we will mv new value into it later 5303 Register tmp3 = e; 5304 __ add(tmp2, cur_k, tmp2); 5305 __ add(tmp3, tmp3, tmp2); 5306 __ rolw(tmp2, a, 5, t0); 5307 5308 sha1_f(tmp, b, c, d, round); 5309 5310 __ add(tmp2, tmp2, tmp); 5311 __ add(tmp2, tmp2, tmp3); 5312 5313 // e = d 5314 // d = c 5315 // c = ROTL'30(b) 5316 // b = a 5317 // a = T 5318 __ mv(e, d); 5319 __ mv(d, c); 5320 5321 __ rolw(c, b, 30); 5322 __ mv(b, a); 5323 __ mv(a, tmp2); 5324 } 5325 5326 // H(i)0 = a + H(i-1)0 5327 // H(i)1 = b + H(i-1)1 5328 // H(i)2 = c + H(i-1)2 5329 // H(i)3 = d + H(i-1)3 5330 // H(i)4 = e + H(i-1)4 5331 void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e, 5332 Register prev_ab, Register prev_cd, Register prev_e) { 5333 assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e); 5334 5335 __ add(a, a, prev_ab); 5336 __ srli(prev_ab, prev_ab, 32); 5337 __ add(b, b, prev_ab); 5338 5339 __ add(c, c, prev_cd); 5340 __ srli(prev_cd, prev_cd, 32); 5341 __ add(d, d, prev_cd); 5342 5343 __ add(e, e, prev_e); 5344 } 5345 5346 void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e, 5347 Register prev_ab, Register prev_cd, Register prev_e) { 5348 assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0); 5349 5350 __ slli(t0, b, 32); 5351 __ zext(prev_ab, a, 32); 5352 __ orr(prev_ab, prev_ab, t0); 5353 5354 __ slli(t0, d, 32); 5355 __ zext(prev_cd, c, 32); 5356 __ orr(prev_cd, prev_cd, t0); 5357 5358 __ mv(prev_e, e); 5359 } 5360 5361 // Intrinsic for: 5362 // void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs) 5363 // void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit) 5364 // 5365 // Arguments: 5366 // 5367 // Inputs: 5368 // c_rarg0: byte[] src array + offset 5369 // c_rarg1: int[] SHA.state 5370 // - - - - - - below are only for implCompressMultiBlock0 - - - - - - 5371 // c_rarg2: int offset 5372 // c_rarg3: int limit 5373 // 5374 // Outputs: 5375 // - - - - - - below are only for implCompressMultiBlock0 - - - - - - 5376 // c_rarg0: int offset, when (multi_block == true) 5377 // 5378 address generate_sha1_implCompress(StubGenStubId stub_id) { 5379 bool multi_block; 5380 switch (stub_id) { 5381 case sha1_implCompress_id: 5382 multi_block = false; 5383 break; 5384 case sha1_implCompressMB_id: 5385 multi_block = true; 5386 break; 5387 default: 5388 ShouldNotReachHere(); 5389 }; 5390 __ align(CodeEntryAlignment); 5391 StubCodeMark mark(this, stub_id); 5392 5393 address start = __ pc(); 5394 __ enter(); 5395 5396 RegSet saved_regs = RegSet::range(x18, x27); 5397 if (multi_block) { 5398 // use x9 as src below. 5399 saved_regs += RegSet::of(x9); 5400 } 5401 __ push_reg(saved_regs, sp); 5402 5403 // c_rarg0 - c_rarg3: x10 - x13 5404 Register buf = c_rarg0; 5405 Register state = c_rarg1; 5406 Register offset = c_rarg2; 5407 Register limit = c_rarg3; 5408 // use src to contain the original start point of the array. 5409 Register src = x9; 5410 5411 if (multi_block) { 5412 __ sub(limit, limit, offset); 5413 __ add(limit, limit, buf); 5414 __ sub(src, buf, offset); 5415 } 5416 5417 // [args-reg]: x14 - x17 5418 // [temp-reg]: x28 - x31 5419 // [saved-reg]: x18 - x27 5420 5421 // h0/1/2/3/4 5422 const Register a = x14, b = x15, c = x16, d = x17, e = x28; 5423 // w0, w1, ... 
w15
5424 // put two adjacent w's in one register:
5425 //    one in the high word, the other in the low word
5426 // depending on the round parity (even or odd), the w't values reside in different elements of ws[].
5427 // w0 ~ w15 either reside in
5428 //    ws[0] ~ ws[7], where
5429 //      w0 at higher 32 bits of ws[0],
5430 //      w1 at lower 32 bits of ws[0],
5431 //      ...
5432 //      w14 at higher 32 bits of ws[7],
5433 //      w15 at lower 32 bits of ws[7].
5434 // or reside in
5435 //    w0: ws[0]'s lower 32 bits
5436 //    w1 ~ w14: ws[1] ~ ws[7]
5437 //    w15: ws[8]'s higher 32 bits
5438 Register ws[9] = {x29, x30, x31, x18,
5439                   x19, x20, x21, x22,
5440                   x23}; // auxiliary register for calculating w's value
5441 // current k't's value
5442 const Register cur_k = x24;
5443 // current w't's value
5444 const Register cur_w = x25;
5445 // values of a, b, c, d, e in the previous round
5446 const Register prev_ab = x26, prev_cd = x27;
5447 const Register prev_e = offset; // reuse offset/c_rarg2
5448
5449 // load 5 words of state into a, b, c, d, e.
5450 //
5451 // To minimize the number of memory operations, we apply the following
5452 // optimization: read the state's (a/b/c/d) 4-byte values in pairs,
5453 // each with a single ld, and split them into 2 registers.
5454 //
5455 // As the core SHA-1 algorithm works on 32-bit words, the following
5456 // code does not care about the contents of the higher 32 bits of
5457 // a/b/c/d/e. Based on this observation, we can optimize further and
5458 // simply ignore the higher 32 bits of a/c/e, rather than setting
5459 // the higher 32 bits of a/c/e to zero explicitly with extra
5460 // instructions.
5461 __ ld(a, Address(state, 0));
5462 __ srli(b, a, 32);
5463 __ ld(c, Address(state, 8));
5464 __ srli(d, c, 32);
5465 __ lw(e, Address(state, 16));
5466
5467 Label L_sha1_loop;
5468 if (multi_block) {
5469   __ BIND(L_sha1_loop);
5470 }
5471
5472 sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5473
5474 for (int round = 0; round < 80; round++) {
5475   // prepare K't value
5476   sha1_prepare_k(cur_k, round);
5477
5478   // prepare W't value
5479   sha1_prepare_w(cur_w, ws, buf, round);
5480
5481   // process one round
5482   sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
5483 }
5484
5485 // compute the intermediate hash value
5486 sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5487
5488 if (multi_block) {
5489   int64_t block_bytes = 16 * 4;
5490   __ addi(buf, buf, block_bytes);
5491
5492   __ bge(limit, buf, L_sha1_loop, true);
5493 }
5494
5495 // store back the state.
5496 __ zext(a, a, 32); 5497 __ slli(b, b, 32); 5498 __ orr(a, a, b); 5499 __ sd(a, Address(state, 0)); 5500 __ zext(c, c, 32); 5501 __ slli(d, d, 32); 5502 __ orr(c, c, d); 5503 __ sd(c, Address(state, 8)); 5504 __ sw(e, Address(state, 16)); 5505 5506 // return offset 5507 if (multi_block) { 5508 __ sub(c_rarg0, buf, src); 5509 } 5510 5511 __ pop_reg(saved_regs, sp); 5512 5513 __ leave(); 5514 __ ret(); 5515 5516 return (address) start; 5517 } 5518 5519 /** 5520 * vector registers: 5521 * input VectorRegister's: intputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3 5522 * index VectorRegister's: idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7 5523 * output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11 5524 * 5525 * NOTE: each field will occupy a vector register group 5526 */ 5527 void base64_vector_encode_round(Register src, Register dst, Register codec, 5528 Register size, Register stepSrc, Register stepDst, 5529 VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, 5530 VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4, 5531 VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4, 5532 Assembler::LMUL lmul) { 5533 // set vector register type/len 5534 __ vsetvli(x0, size, Assembler::e8, lmul); 5535 5536 // segmented load src into v registers: mem(src) => vr(3) 5537 __ vlseg3e8_v(inputV1, src); 5538 5539 // src = src + register_group_len_bytes * 3 5540 __ add(src, src, stepSrc); 5541 5542 // encoding 5543 // 1. compute index into lookup table: vr(3) => vr(4) 5544 __ vsrl_vi(idxV1, inputV1, 2); 5545 5546 __ vsrl_vi(idxV2, inputV2, 2); 5547 __ vsll_vi(inputV1, inputV1, 6); 5548 __ vor_vv(idxV2, idxV2, inputV1); 5549 __ vsrl_vi(idxV2, idxV2, 2); 5550 5551 __ vsrl_vi(idxV3, inputV3, 4); 5552 __ vsll_vi(inputV2, inputV2, 4); 5553 __ vor_vv(idxV3, inputV2, idxV3); 5554 __ vsrl_vi(idxV3, idxV3, 2); 5555 5556 __ vsll_vi(idxV4, inputV3, 2); 5557 __ vsrl_vi(idxV4, idxV4, 2); 5558 5559 // 2. 
indexed load: vr(4) => vr(4) 5560 __ vluxei8_v(outputV1, codec, idxV1); 5561 __ vluxei8_v(outputV2, codec, idxV2); 5562 __ vluxei8_v(outputV3, codec, idxV3); 5563 __ vluxei8_v(outputV4, codec, idxV4); 5564 5565 // segmented store encoded data in v registers back to dst: vr(4) => mem(dst) 5566 __ vsseg4e8_v(outputV1, dst); 5567 5568 // dst = dst + register_group_len_bytes * 4 5569 __ add(dst, dst, stepDst); 5570 } 5571 5572 /** 5573 * void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) 5574 * 5575 * Input arguments: 5576 * c_rarg0 - src, source array 5577 * c_rarg1 - sp, src start offset 5578 * c_rarg2 - sl, src end offset 5579 * c_rarg3 - dst, dest array 5580 * c_rarg4 - dp, dst start offset 5581 * c_rarg5 - isURL, Base64 or URL character set 5582 */ 5583 address generate_base64_encodeBlock() { 5584 alignas(64) static const char toBase64[64] = { 5585 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 5586 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 5587 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 5588 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 5589 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 5590 }; 5591 5592 alignas(64) static const char toBase64URL[64] = { 5593 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 5594 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 5595 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 5596 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 5597 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 5598 }; 5599 5600 __ align(CodeEntryAlignment); 5601 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 5602 StubCodeMark mark(this, stub_id); 5603 address start = __ pc(); 5604 __ enter(); 5605 5606 Register src = c_rarg0; 5607 Register soff = c_rarg1; 5608 Register send = c_rarg2; 5609 Register dst = c_rarg3; 5610 Register doff = c_rarg4; 5611 Register isURL = c_rarg5; 5612 5613 Register codec = c_rarg6; 5614 Register length = c_rarg7; // total length of src data in bytes 5615 5616 Label ProcessData, Exit; 5617 5618 // length should be multiple of 3 5619 __ sub(length, send, soff); 5620 // real src/dst to process data 5621 __ add(src, src, soff); 5622 __ add(dst, dst, doff); 5623 5624 // load the codec base address 5625 __ la(codec, ExternalAddress((address) toBase64)); 5626 __ beqz(isURL, ProcessData); 5627 __ la(codec, ExternalAddress((address) toBase64URL)); 5628 __ BIND(ProcessData); 5629 5630 // vector version 5631 if (UseRVV) { 5632 Label ProcessM2, ProcessM1, ProcessScalar; 5633 5634 Register size = soff; 5635 Register stepSrcM1 = send; 5636 Register stepSrcM2 = doff; 5637 Register stepDst = isURL; 5638 5639 __ mv(size, MaxVectorSize * 2); 5640 __ mv(stepSrcM1, MaxVectorSize * 3); 5641 __ slli(stepSrcM2, stepSrcM1, 1); 5642 __ mv(stepDst, MaxVectorSize * 2 * 4); 5643 5644 __ blt(length, stepSrcM2, ProcessM1); 5645 5646 __ BIND(ProcessM2); 5647 base64_vector_encode_round(src, dst, codec, 5648 size, stepSrcM2, stepDst, 5649 v2, v4, v6, // inputs 5650 v8, v10, v12, v14, // indexes 5651 v16, v18, v20, v22, // outputs 5652 Assembler::m2); 5653 5654 __ sub(length, length, stepSrcM2); 5655 __ bge(length, stepSrcM2, ProcessM2); 5656 5657 __ BIND(ProcessM1); 5658 __ blt(length, stepSrcM1, ProcessScalar); 5659 5660 __ srli(size, size, 1); 5661 __ srli(stepDst, stepDst, 1); 5662 base64_vector_encode_round(src, dst, codec, 5663 size, stepSrcM1, stepDst, 
5664 v1, v2, v3, // inputs 5665 v4, v5, v6, v7, // indexes 5666 v8, v9, v10, v11, // outputs 5667 Assembler::m1); 5668 __ sub(length, length, stepSrcM1); 5669 5670 __ BIND(ProcessScalar); 5671 } 5672 5673 // scalar version 5674 { 5675 Register byte1 = soff, byte0 = send, byte2 = doff; 5676 Register combined24Bits = isURL; 5677 5678 __ beqz(length, Exit); 5679 5680 Label ScalarLoop; 5681 __ BIND(ScalarLoop); 5682 { 5683 // plain: [byte0[7:0] : byte1[7:0] : byte2[7:0]] => 5684 // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]] 5685 5686 // load 3 bytes src data 5687 __ lbu(byte0, Address(src, 0)); 5688 __ lbu(byte1, Address(src, 1)); 5689 __ lbu(byte2, Address(src, 2)); 5690 __ addi(src, src, 3); 5691 5692 // construct 24 bits from 3 bytes 5693 __ slliw(byte0, byte0, 16); 5694 __ slliw(byte1, byte1, 8); 5695 __ orr(combined24Bits, byte0, byte1); 5696 __ orr(combined24Bits, combined24Bits, byte2); 5697 5698 // get codec index and encode(ie. load from codec by index) 5699 __ slliw(byte0, combined24Bits, 8); 5700 __ srliw(byte0, byte0, 26); 5701 __ add(byte0, codec, byte0); 5702 __ lbu(byte0, byte0); 5703 5704 __ slliw(byte1, combined24Bits, 14); 5705 __ srliw(byte1, byte1, 26); 5706 __ add(byte1, codec, byte1); 5707 __ lbu(byte1, byte1); 5708 5709 __ slliw(byte2, combined24Bits, 20); 5710 __ srliw(byte2, byte2, 26); 5711 __ add(byte2, codec, byte2); 5712 __ lbu(byte2, byte2); 5713 5714 __ andi(combined24Bits, combined24Bits, 0x3f); 5715 __ add(combined24Bits, codec, combined24Bits); 5716 __ lbu(combined24Bits, combined24Bits); 5717 5718 // store 4 bytes encoded data 5719 __ sb(byte0, Address(dst, 0)); 5720 __ sb(byte1, Address(dst, 1)); 5721 __ sb(byte2, Address(dst, 2)); 5722 __ sb(combined24Bits, Address(dst, 3)); 5723 5724 __ subi(length, length, 3); 5725 __ addi(dst, dst, 4); 5726 // loop back 5727 __ bnez(length, ScalarLoop); 5728 } 5729 } 5730 5731 __ BIND(Exit); 5732 5733 __ leave(); 5734 __ ret(); 5735 5736 return (address) start; 5737 } 5738 5739 /** 5740 * vector registers: 5741 * input VectorRegister's: intputV1-V4, for m2 they could be v2, v4, v6, for m1 they could be v2, v4, v6, v8 5742 * index VectorRegister's: idxV1-V3, for m2 they could be v8, v10, v12, v14, for m1 they could be v10, v12, v14, v16 5743 * output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v18, v20, v22 5744 * 5745 * NOTE: each field will occupy a single vector register group 5746 */ 5747 void base64_vector_decode_round(Register src, Register dst, Register codec, 5748 Register size, Register stepSrc, Register stepDst, Register failedIdx, 5749 VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4, 5750 VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4, 5751 VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, 5752 Assembler::LMUL lmul) { 5753 // set vector register type/len 5754 __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta); 5755 5756 // segmented load src into v registers: mem(src) => vr(4) 5757 __ vlseg4e8_v(inputV1, src); 5758 5759 // src = src + register_group_len_bytes * 4 5760 __ add(src, src, stepSrc); 5761 5762 // decoding 5763 // 1. indexed load: vr(4) => vr(4) 5764 __ vluxei8_v(idxV1, codec, inputV1); 5765 __ vluxei8_v(idxV2, codec, inputV2); 5766 __ vluxei8_v(idxV3, codec, inputV3); 5767 __ vluxei8_v(idxV4, codec, inputV4); 5768 5769 // 2. 
check wrong data 5770 __ vor_vv(outputV1, idxV1, idxV2); 5771 __ vor_vv(outputV2, idxV3, idxV4); 5772 __ vor_vv(outputV1, outputV1, outputV2); 5773 __ vmseq_vi(v0, outputV1, -1); 5774 __ vfirst_m(failedIdx, v0); 5775 Label NoFailure, FailureAtIdx0; 5776 // valid value can only be -1 when < 0 5777 __ bltz(failedIdx, NoFailure); 5778 // when the first data (at index 0) fails, no need to process data anymore 5779 __ beqz(failedIdx, FailureAtIdx0); 5780 __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu); 5781 __ slli(stepDst, failedIdx, 1); 5782 __ add(stepDst, failedIdx, stepDst); 5783 __ BIND(NoFailure); 5784 5785 // 3. compute the decoded data: vr(4) => vr(3) 5786 __ vsll_vi(idxV1, idxV1, 2); 5787 __ vsrl_vi(outputV1, idxV2, 4); 5788 __ vor_vv(outputV1, outputV1, idxV1); 5789 5790 __ vsll_vi(idxV2, idxV2, 4); 5791 __ vsrl_vi(outputV2, idxV3, 2); 5792 __ vor_vv(outputV2, outputV2, idxV2); 5793 5794 __ vsll_vi(idxV3, idxV3, 6); 5795 __ vor_vv(outputV3, idxV4, idxV3); 5796 5797 // segmented store encoded data in v registers back to dst: vr(3) => mem(dst) 5798 __ vsseg3e8_v(outputV1, dst); 5799 5800 // dst = dst + register_group_len_bytes * 3 5801 __ add(dst, dst, stepDst); 5802 __ BIND(FailureAtIdx0); 5803 } 5804 5805 /** 5806 * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME) 5807 * 5808 * Input arguments: 5809 * c_rarg0 - src, source array 5810 * c_rarg1 - sp, src start offset 5811 * c_rarg2 - sl, src end offset 5812 * c_rarg3 - dst, dest array 5813 * c_rarg4 - dp, dst start offset 5814 * c_rarg5 - isURL, Base64 or URL character set 5815 * c_rarg6 - isMIME, Decoding MIME block 5816 */ 5817 address generate_base64_decodeBlock() { 5818 5819 static const uint8_t fromBase64[256] = { 5820 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5821 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5822 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 5823 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 5824 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 5825 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 5826 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 5827 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 5828 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5829 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5830 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5831 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5832 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5833 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5834 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5835 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5836 }; 5837 5838 static const uint8_t fromBase64URL[256] = { 5839 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5840 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5841 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 5842 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 5843 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 5844 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 5845 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 5846 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 5847 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5848 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5849 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5850 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5851 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5852 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5853 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5854 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 5855 }; 5856 5857 __ align(CodeEntryAlignment); 5858 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 5859 StubCodeMark mark(this, stub_id); 5860 address start = __ pc(); 5861 __ enter(); 5862 5863 Register src = c_rarg0; 5864 Register soff = c_rarg1; 5865 Register send = c_rarg2; 5866 Register dst = c_rarg3; 5867 Register doff = c_rarg4; 5868 Register isURL = c_rarg5; 5869 Register isMIME = c_rarg6; 5870 5871 Register codec = c_rarg7; 5872 Register dstBackup = t6; 5873 Register length = t3; // total length of src data in bytes 5874 5875 Label ProcessData, Exit; 5876 Label ProcessScalar, ScalarLoop; 5877 5878 // passed in length (send - soff) is guaranteed to be > 4, 5879 // and in this intrinsic we only process data of length in multiple of 4, 5880 // it's not guaranteed to be multiple of 4 by java level, so do it explicitly 5881 __ sub(length, send, soff); 5882 __ andi(length, length, -4); 5883 // real src/dst to process data 5884 __ add(src, src, soff); 5885 __ add(dst, dst, doff); 5886 // backup of dst, used to calculate the return value at exit 5887 __ mv(dstBackup, dst); 5888 5889 // load the codec base address 5890 __ la(codec, ExternalAddress((address) fromBase64)); 5891 __ beqz(isURL, ProcessData); 5892 __ la(codec, ExternalAddress((address) fromBase64URL)); 5893 __ BIND(ProcessData); 5894 5895 // vector version 5896 if (UseRVV) { 5897 // for MIME case, it has a default length limit of 76 which could be 5898 // different(smaller) from (send - soff), so in MIME case, we go through 5899 // the scalar code path directly. 
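// Note: whether entered here (MIME) or as the tail of the vector code below, the
// scalar loop decodes 4 symbols into 3 bytes per step. A plain C sketch
// (illustrative only; `table` stands for fromBase64/fromBase64URL read as signed
// bytes, so the invalid-symbol marker 255 reads as -1):
//
//   int32_t b0 = (int8_t) table[src[0]], b1 = (int8_t) table[src[1]];
//   int32_t b2 = (int8_t) table[src[2]], b3 = (int8_t) table[src[3]];
//   if ((b0 | b1 | b2 | b3) < 0) break;                 // invalid symbol
//   uint32_t bits = (b0 << 18) | (b1 << 12) | (b2 << 6) | b3;
//   dst[0] = (uint8_t)(bits >> 16);
//   dst[1] = (uint8_t)(bits >> 8);
//   dst[2] = (uint8_t) bits;
//
// The stub folds the validity check into the combined value itself: the table
// entries are loaded sign-extended, so any invalid symbol makes combined32Bits
// negative.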
5900 __ bnez(isMIME, ScalarLoop); 5901 5902 Label ProcessM1, ProcessM2; 5903 5904 Register failedIdx = soff; 5905 Register stepSrcM1 = send; 5906 Register stepSrcM2 = doff; 5907 Register stepDst = isURL; 5908 Register size = t4; 5909 5910 __ mv(size, MaxVectorSize * 2); 5911 __ mv(stepSrcM1, MaxVectorSize * 4); 5912 __ slli(stepSrcM2, stepSrcM1, 1); 5913 __ mv(stepDst, MaxVectorSize * 2 * 3); 5914 5915 __ blt(length, stepSrcM2, ProcessM1); 5916 5917 5918 // Assembler::m2 5919 __ BIND(ProcessM2); 5920 base64_vector_decode_round(src, dst, codec, 5921 size, stepSrcM2, stepDst, failedIdx, 5922 v2, v4, v6, v8, // inputs 5923 v10, v12, v14, v16, // indexes 5924 v18, v20, v22, // outputs 5925 Assembler::m2); 5926 __ sub(length, length, stepSrcM2); 5927 5928 // error check 5929 // valid value of failedIdx can only be -1 when < 0 5930 __ bgez(failedIdx, Exit); 5931 5932 __ bge(length, stepSrcM2, ProcessM2); 5933 5934 5935 // Assembler::m1 5936 __ BIND(ProcessM1); 5937 __ blt(length, stepSrcM1, ProcessScalar); 5938 5939 __ srli(size, size, 1); 5940 __ srli(stepDst, stepDst, 1); 5941 base64_vector_decode_round(src, dst, codec, 5942 size, stepSrcM1, stepDst, failedIdx, 5943 v1, v2, v3, v4, // inputs 5944 v5, v6, v7, v8, // indexes 5945 v9, v10, v11, // outputs 5946 Assembler::m1); 5947 __ sub(length, length, stepSrcM1); 5948 5949 // error check 5950 // valid value of failedIdx can only be -1 when < 0 5951 __ bgez(failedIdx, Exit); 5952 5953 __ BIND(ProcessScalar); 5954 __ beqz(length, Exit); 5955 } 5956 5957 // scalar version 5958 { 5959 Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL; 5960 Register combined32Bits = t4; 5961 5962 // encoded: [byte0[5:0] : byte1[5:0] : byte2[5:0]] : byte3[5:0]] => 5963 // plain: [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]] 5964 __ BIND(ScalarLoop); 5965 5966 // load 4 bytes encoded src data 5967 __ lbu(byte0, Address(src, 0)); 5968 __ lbu(byte1, Address(src, 1)); 5969 __ lbu(byte2, Address(src, 2)); 5970 __ lbu(byte3, Address(src, 3)); 5971 __ addi(src, src, 4); 5972 5973 // get codec index and decode (ie. load from codec by index) 5974 __ add(byte0, codec, byte0); 5975 __ add(byte1, codec, byte1); 5976 __ lb(byte0, Address(byte0, 0)); 5977 __ lb(byte1, Address(byte1, 0)); 5978 __ add(byte2, codec, byte2); 5979 __ add(byte3, codec, byte3); 5980 __ lb(byte2, Address(byte2, 0)); 5981 __ lb(byte3, Address(byte3, 0)); 5982 __ slliw(byte0, byte0, 18); 5983 __ slliw(byte1, byte1, 12); 5984 __ orr(byte0, byte0, byte1); 5985 __ orr(byte0, byte0, byte3); 5986 __ slliw(byte2, byte2, 6); 5987 // For performance consideration, `combined32Bits` is constructed for 2 purposes at the same time, 5988 // 1. error check below 5989 // 2. 
decode below 5990 __ orr(combined32Bits, byte0, byte2); 5991 5992 // error check 5993 __ bltz(combined32Bits, Exit); 5994 5995 // store 3 bytes decoded data 5996 __ sraiw(byte0, combined32Bits, 16); 5997 __ sraiw(byte1, combined32Bits, 8); 5998 __ sb(byte0, Address(dst, 0)); 5999 __ sb(byte1, Address(dst, 1)); 6000 __ sb(combined32Bits, Address(dst, 2)); 6001 6002 __ subi(length, length, 4); 6003 __ addi(dst, dst, 3); 6004 // loop back 6005 __ bnez(length, ScalarLoop); 6006 } 6007 6008 __ BIND(Exit); 6009 __ sub(c_rarg0, dst, dstBackup); 6010 6011 __ leave(); 6012 __ ret(); 6013 6014 return (address) start; 6015 } 6016 6017 void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable, 6018 VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc, 6019 Register temp0, Register temp1, Register temp2, Register temp3, 6020 VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) { 6021 6022 assert((lmul == Assembler::m4 && step == 64) || 6023 (lmul == Assembler::m2 && step == 32) || 6024 (lmul == Assembler::m1 && step == 16), 6025 "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16"); 6026 // Below is function for calculating Adler32 checksum with 64-, 32- or 16-byte step. LMUL=m4, m2 or m1 is used. 6027 // The results are in v12, v13, ..., v22, v23. Example below is for 64-byte step case. 6028 // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration. 6029 // In non-vectorized code, we update s1 and s2 as: 6030 // s1 <- s1 + b1 6031 // s2 <- s2 + s1 6032 // s1 <- s1 + b2 6033 // s2 <- s2 + b1 6034 // ... 6035 // s1 <- s1 + b64 6036 // s2 <- s2 + s1 6037 // Putting above assignments together, we have: 6038 // s1_new = s1 + b1 + b2 + ... + b64 6039 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) = 6040 // = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) = 6041 // = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1) 6042 6043 __ mv(temp3, step); 6044 // Load data 6045 __ vsetvli(temp0, temp3, Assembler::e8, lmul); 6046 __ vle8_v(vbytes, buff); 6047 __ addi(buff, buff, step); 6048 6049 // Upper bound reduction sum for s1_new: 6050 // 0xFF * 64 = 0x3FC0, so: 6051 // 1. Need to do vector-widening reduction sum 6052 // 2. It is safe to perform sign-extension during vmv.x.s with 16-bits elements 6053 __ vwredsumu_vs(vs1acc, vbytes, vzero); 6054 // Multiplication for s2_new 6055 __ vwmulu_vv(vs2acc, vtable, vbytes); 6056 6057 // s2 = s2 + s1 * log2(step) 6058 __ slli(temp1, s1, exact_log2(step)); 6059 __ add(s2, s2, temp1); 6060 6061 // Summing up calculated results for s2_new 6062 if (MaxVectorSize > 16) { 6063 __ vsetvli(temp0, temp3, Assembler::e16, lmul); 6064 } else { 6065 // Half of vector-widening multiplication result is in successor of vs2acc 6066 // group for vlen == 16, in which case we need to double vector register 6067 // group width in order to reduction sum all of them 6068 Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 : 6069 (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8; 6070 __ vsetvli(temp0, temp3, Assembler::e16, lmulx2); 6071 } 6072 // Upper bound for reduction sum: 6073 // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for whole register group, so: 6074 // 1. Need to do vector-widening reduction sum 6075 // 2. 
It is safe to perform sign-extension during vmv.x.s with 32-bits elements 6076 __ vwredsumu_vs(vtemp1, vs2acc, vzero); 6077 6078 // Extracting results for: 6079 // s1_new 6080 __ vmv_x_s(temp0, vs1acc); 6081 __ add(s1, s1, temp0); 6082 // s2_new 6083 __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1); 6084 __ vmv_x_s(temp1, vtemp1); 6085 __ add(s2, s2, temp1); 6086 } 6087 6088 /*** 6089 * int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len) 6090 * 6091 * Arguments: 6092 * 6093 * Inputs: 6094 * c_rarg0 - int adler 6095 * c_rarg1 - byte* buff (b + off) 6096 * c_rarg2 - int len 6097 * 6098 * Output: 6099 * c_rarg0 - int adler result 6100 */ 6101 address generate_updateBytesAdler32() { 6102 __ align(CodeEntryAlignment); 6103 StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id; 6104 StubCodeMark mark(this, stub_id); 6105 address start = __ pc(); 6106 6107 Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop, 6108 L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1; 6109 6110 // Aliases 6111 Register adler = c_rarg0; 6112 Register s1 = c_rarg0; 6113 Register s2 = c_rarg3; 6114 Register buff = c_rarg1; 6115 Register len = c_rarg2; 6116 Register nmax = c_rarg4; 6117 Register base = c_rarg5; 6118 Register count = c_rarg6; 6119 Register temp0 = t3; 6120 Register temp1 = t4; 6121 Register temp2 = t5; 6122 Register temp3 = t6; 6123 6124 VectorRegister vzero = v31; 6125 VectorRegister vbytes = v8; // group: v8, v9, v10, v11 6126 VectorRegister vs1acc = v12; // group: v12, v13, v14, v15 6127 VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23 6128 VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27 6129 VectorRegister vtable_32 = v4; // group: v4, v5 6130 VectorRegister vtable_16 = v30; 6131 VectorRegister vtemp1 = v28; 6132 VectorRegister vtemp2 = v29; 6133 6134 // Max number of bytes we can process before having to take the mod 6135 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 6136 const uint64_t BASE = 0xfff1; 6137 const uint64_t NMAX = 0x15B0; 6138 6139 // Loops steps 6140 int step_64 = 64; 6141 int step_32 = 32; 6142 int step_16 = 16; 6143 int step_1 = 1; 6144 6145 __ enter(); // Required for proper stackwalking of RuntimeStub frame 6146 __ mv(temp1, 64); 6147 __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4); 6148 6149 // Generating accumulation coefficients for further calculations 6150 // vtable_64: 6151 __ vid_v(vtemp1); 6152 __ vrsub_vx(vtable_64, vtemp1, temp1); 6153 // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 } 6154 6155 // vtable_32: 6156 __ mv(temp1, 32); 6157 __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2); 6158 __ vid_v(vtemp1); 6159 __ vrsub_vx(vtable_32, vtemp1, temp1); 6160 // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 } 6161 6162 __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1); 6163 // vtable_16: 6164 __ mv(temp1, 16); 6165 __ vid_v(vtemp1); 6166 __ vrsub_vx(vtable_16, vtemp1, temp1); 6167 // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 } 6168 6169 __ vmv_v_i(vzero, 0); 6170 6171 __ mv(base, BASE); 6172 __ mv(nmax, NMAX); 6173 6174 // s1 is initialized to the lower 16 bits of adler 6175 // s2 is initialized to the upper 16 bits of adler 6176 __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff) 6177 __ zext(s1, adler, 16); // s1 = (adler & 0xffff) 6178 6179 // The pipelined loop needs at least 16 elements for 1 iteration 6180 // It does check this, but it is 
more effective to skip to the cleanup loop 6181 __ mv(temp0, step_16); 6182 __ bgeu(len, temp0, L_nmax); 6183 __ beqz(len, L_combine); 6184 6185 // Jumping to L_by1_loop 6186 __ subi(len, len, step_1); 6187 __ j(L_by1_loop); 6188 6189 __ bind(L_nmax); 6190 __ sub(len, len, nmax); 6191 __ subi(count, nmax, 16); 6192 __ bltz(len, L_by16); 6193 6194 // Align L_nmax loop by 64 6195 __ bind(L_nmax_loop_entry); 6196 __ subi(count, count, 32); 6197 6198 __ bind(L_nmax_loop); 6199 adler32_process_bytes(buff, s1, s2, vtable_64, vzero, 6200 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 6201 vtemp1, vtemp2, step_64, Assembler::m4); 6202 __ subi(count, count, step_64); 6203 __ bgtz(count, L_nmax_loop); 6204 6205 // There are three iterations left to do 6206 adler32_process_bytes(buff, s1, s2, vtable_32, vzero, 6207 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 6208 vtemp1, vtemp2, step_32, Assembler::m2); 6209 adler32_process_bytes(buff, s1, s2, vtable_16, vzero, 6210 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 6211 vtemp1, vtemp2, step_16, Assembler::m1); 6212 6213 // s1 = s1 % BASE 6214 __ remuw(s1, s1, base); 6215 // s2 = s2 % BASE 6216 __ remuw(s2, s2, base); 6217 6218 __ sub(len, len, nmax); 6219 __ subi(count, nmax, 16); 6220 __ bgez(len, L_nmax_loop_entry); 6221 6222 __ bind(L_by16); 6223 __ add(len, len, count); 6224 __ bltz(len, L_by1); 6225 // Trying to unroll 6226 __ mv(temp3, step_64); 6227 __ blt(len, temp3, L_by16_loop); 6228 6229 __ bind(L_by16_loop_unroll); 6230 adler32_process_bytes(buff, s1, s2, vtable_64, vzero, 6231 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 6232 vtemp1, vtemp2, step_64, Assembler::m4); 6233 __ subi(len, len, step_64); 6234 // By now the temp3 should still be 64 6235 __ bge(len, temp3, L_by16_loop_unroll); 6236 6237 __ bind(L_by16_loop); 6238 adler32_process_bytes(buff, s1, s2, vtable_16, vzero, 6239 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3, 6240 vtemp1, vtemp2, step_16, Assembler::m1); 6241 __ subi(len, len, step_16); 6242 __ bgez(len, L_by16_loop); 6243 6244 __ bind(L_by1); 6245 __ addi(len, len, 15); 6246 __ bltz(len, L_do_mod); 6247 6248 __ bind(L_by1_loop); 6249 __ lbu(temp0, Address(buff, 0)); 6250 __ addi(buff, buff, step_1); 6251 __ add(s1, temp0, s1); 6252 __ add(s2, s2, s1); 6253 __ subi(len, len, step_1); 6254 __ bgez(len, L_by1_loop); 6255 6256 __ bind(L_do_mod); 6257 // s1 = s1 % BASE 6258 __ remuw(s1, s1, base); 6259 // s2 = s2 % BASE 6260 __ remuw(s2, s2, base); 6261 6262 // Combine lower bits and higher bits 6263 // adler = s1 | (s2 << 16) 6264 __ bind(L_combine); 6265 __ slli(s2, s2, 16); 6266 __ orr(s1, s1, s2); 6267 6268 __ leave(); // Required for proper stackwalking of RuntimeStub frame 6269 __ ret(); 6270 6271 return start; 6272 } 6273 6274 #endif // COMPILER2_OR_JVMCI 6275 6276 // x10 = input (float16) 6277 // f10 = result (float) 6278 // t1 = temporary register 6279 address generate_float16ToFloat() { 6280 __ align(CodeEntryAlignment); 6281 StubGenStubId stub_id = StubGenStubId::hf2f_id; 6282 StubCodeMark mark(this, stub_id); 6283 address entry = __ pc(); 6284 BLOCK_COMMENT("float16ToFloat:"); 6285 6286 FloatRegister dst = f10; 6287 Register src = x10; 6288 Label NaN_SLOW; 6289 6290 assert(VM_Version::supports_float16_float_conversion(), "must"); 6291 6292 // On riscv, NaN needs a special process as fcvt does not work in that case. 6293 // On riscv, Inf does not need a special process as fcvt can handle it correctly. 
6294 // but we consider to get the slow path to process NaN and Inf at the same time, 6295 // as both of them are rare cases, and if we try to get the slow path to handle 6296 // only NaN case it would sacrifise the performance for normal cases, 6297 // i.e. non-NaN and non-Inf cases. 6298 6299 // check whether it's a NaN or +/- Inf. 6300 __ mv(t0, 0x7c00); 6301 __ andr(t1, src, t0); 6302 // jump to stub processing NaN and Inf cases. 6303 __ beq(t0, t1, NaN_SLOW); 6304 6305 // non-NaN or non-Inf cases, just use built-in instructions. 6306 __ fmv_h_x(dst, src); 6307 __ fcvt_s_h(dst, dst); 6308 __ ret(); 6309 6310 __ bind(NaN_SLOW); 6311 // following instructions mainly focus on NaN, as riscv does not handle 6312 // NaN well with fcvt, but the code also works for Inf at the same time. 6313 6314 // construct a NaN in 32 bits from the NaN in 16 bits, 6315 // we need the payloads of non-canonical NaNs to be preserved. 6316 __ mv(t1, 0x7f800000); 6317 // sign-bit was already set via sign-extension if necessary. 6318 __ slli(t0, src, 13); 6319 __ orr(t1, t0, t1); 6320 __ fmv_w_x(dst, t1); 6321 6322 __ ret(); 6323 return entry; 6324 } 6325 6326 // f10 = input (float) 6327 // x10 = result (float16) 6328 // f11 = temporary float register 6329 // t1 = temporary register 6330 address generate_floatToFloat16() { 6331 __ align(CodeEntryAlignment); 6332 StubGenStubId stub_id = StubGenStubId::f2hf_id; 6333 StubCodeMark mark(this, stub_id); 6334 address entry = __ pc(); 6335 BLOCK_COMMENT("floatToFloat16:"); 6336 6337 Register dst = x10; 6338 FloatRegister src = f10, ftmp = f11; 6339 Label NaN_SLOW; 6340 6341 assert(VM_Version::supports_float16_float_conversion(), "must"); 6342 6343 // On riscv, NaN needs a special process as fcvt does not work in that case. 6344 6345 // check whether it's a NaN. 6346 // replace fclass with feq as performance optimization. 6347 __ feq_s(t0, src, src); 6348 // jump to stub processing NaN cases. 6349 __ beqz(t0, NaN_SLOW); 6350 6351 // non-NaN cases, just use built-in instructions. 6352 __ fcvt_h_s(ftmp, src); 6353 __ fmv_x_h(dst, ftmp); 6354 __ ret(); 6355 6356 __ bind(NaN_SLOW); 6357 __ fmv_x_w(dst, src); 6358 6359 // preserve the payloads of non-canonical NaNs. 6360 __ srai(dst, dst, 13); 6361 // preserve the sign bit. 6362 __ srai(t1, dst, 13); 6363 __ slli(t1, t1, 10); 6364 __ mv(t0, 0x3ff); 6365 __ orr(t1, t1, t0); 6366 6367 // get the result by merging sign bit and payloads of preserved non-canonical NaNs. 6368 __ andr(dst, dst, t1); 6369 6370 __ ret(); 6371 return entry; 6372 } 6373 6374 #ifdef COMPILER2 6375 6376 static const int64_t right_2_bits = right_n_bits(2); 6377 static const int64_t right_3_bits = right_n_bits(3); 6378 6379 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 6380 // are represented as long[5], with BITS_PER_LIMB = 26. 6381 // Pack five 26-bit limbs into three 64-bit registers. 
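// In plain C the packing amounts to (illustrative sketch; `limbs` is the long[5]
// passed in, each element holding 26 significant bits of a 130-bit value):
//
//   uint64_t d0 = limbs[0] | (limbs[1] << 26) | (limbs[2] << 52);          // bits   0..63
//   uint64_t d1 = (limbs[2] >> 12) | (limbs[3] << 14) | (limbs[4] << 40);  // bits  64..127
//   uint64_t d2 = limbs[4] >> 24;                                          // bits 128..129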
6382 void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) { 6383 assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2); 6384 6385 // The goal is to have 128-bit value in dest2:dest1:dest0 6386 __ ld(dest0, Address(src, 0)); // 26 bits in dest0 6387 6388 __ ld(tmp1, Address(src, sizeof(jlong))); 6389 __ slli(tmp1, tmp1, 26); 6390 __ add(dest0, dest0, tmp1); // 52 bits in dest0 6391 6392 __ ld(tmp2, Address(src, 2 * sizeof(jlong))); 6393 __ slli(tmp1, tmp2, 52); 6394 __ add(dest0, dest0, tmp1); // dest0 is full 6395 6396 __ srli(dest1, tmp2, 12); // 14-bit in dest1 6397 6398 __ ld(tmp1, Address(src, 3 * sizeof(jlong))); 6399 __ slli(tmp1, tmp1, 14); 6400 __ add(dest1, dest1, tmp1); // 40-bit in dest1 6401 6402 __ ld(tmp1, Address(src, 4 * sizeof(jlong))); 6403 __ slli(tmp2, tmp1, 40); 6404 __ add(dest1, dest1, tmp2); // dest1 is full 6405 6406 if (dest2->is_valid()) { 6407 __ srli(tmp1, tmp1, 24); 6408 __ mv(dest2, tmp1); // 2 bits in dest2 6409 } else { 6410 #ifdef ASSERT 6411 Label OK; 6412 __ srli(tmp1, tmp1, 24); 6413 __ beq(zr, tmp1, OK); // 2 bits 6414 __ stop("high bits of Poly1305 integer should be zero"); 6415 __ should_not_reach_here(); 6416 __ bind(OK); 6417 #endif 6418 } 6419 } 6420 6421 // As above, but return only a 128-bit integer, packed into two 6422 // 64-bit registers. 6423 void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) { 6424 poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2); 6425 } 6426 6427 // U_2:U_1:U_0: += (U_2 >> 2) * 5 6428 void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) { 6429 assert_different_registers(U_2, U_1, U_0, tmp1, tmp2); 6430 6431 // First, U_2:U_1:U_0 += (U_2 >> 2) 6432 __ srli(tmp1, U_2, 2); 6433 __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2 6434 __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits 6435 __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2 6436 __ add(U_2, U_2, tmp2); 6437 6438 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 6439 __ slli(tmp1, tmp1, 2); 6440 __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2 6441 __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2 6442 __ add(U_2, U_2, tmp2); 6443 } 6444 6445 // Poly1305, RFC 7539 6446 // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs) 6447 6448 // Arguments: 6449 // c_rarg0: input_start -- where the input is stored 6450 // c_rarg1: length 6451 // c_rarg2: acc_start -- where the output will be stored 6452 // c_rarg3: r_start -- where the randomly generated 128-bit key is stored 6453 6454 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 6455 // description of the tricks used to simplify and accelerate this 6456 // computation. 
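// At a high level, the block loop below computes, for each 16-byte block
// (sketch; U is the 130-bit accumulator, R the clamped key, p = 2^130 - 5):
//
//   U = ((U + block + 2^128) * R) mod p     // the 2^128 term is the pad bit
//
// using 64x64->128-bit multiplies and the identity 2^130 == 5 (mod p), which is
// where the precomputed RR_n = (R_n >> 2) * 5 terms enter the partial products.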
6457 6458 address generate_poly1305_processBlocks() { 6459 __ align(CodeEntryAlignment); 6460 StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id; 6461 StubCodeMark mark(this, stub_id); 6462 address start = __ pc(); 6463 __ enter(); 6464 Label here; 6465 6466 RegSet saved_regs = RegSet::range(x18, x21); 6467 RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin(); 6468 __ push_reg(saved_regs, sp); 6469 6470 // Arguments 6471 const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3; 6472 6473 // R_n is the 128-bit randomly-generated key, packed into two 6474 // registers. The caller passes this key to us as long[5], with 6475 // BITS_PER_LIMB = 26. 6476 const Register R_0 = *regs, R_1 = *++regs; 6477 poly1305_pack_26(R_0, R_1, r_start, t1, t2); 6478 6479 // RR_n is (R_n >> 2) * 5 6480 const Register RR_0 = *++regs, RR_1 = *++regs; 6481 __ srli(t1, R_0, 2); 6482 __ shadd(RR_0, t1, t1, t2, 2); 6483 __ srli(t1, R_1, 2); 6484 __ shadd(RR_1, t1, t1, t2, 2); 6485 6486 // U_n is the current checksum 6487 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 6488 poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2); 6489 6490 static constexpr int BLOCK_LENGTH = 16; 6491 Label DONE, LOOP; 6492 6493 __ mv(t1, BLOCK_LENGTH); 6494 __ blt(length, t1, DONE); { 6495 __ bind(LOOP); 6496 6497 // S_n is to be the sum of U_n and the next block of data 6498 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 6499 __ ld(S_0, Address(input_start, 0)); 6500 __ ld(S_1, Address(input_start, wordSize)); 6501 6502 __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1 6503 __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1 6504 __ add(S_2, U_2, t1); 6505 6506 __ addi(S_2, S_2, 1); 6507 6508 const Register U_0HI = *++regs, U_1HI = *++regs; 6509 6510 // NB: this logic depends on some of the special properties of 6511 // Poly1305 keys. In particular, because we know that the top 6512 // four bits of R_0 and R_1 are zero, we can add together 6513 // partial products without any risk of needing to propagate a 6514 // carry out. 6515 __ wide_mul(U_0, U_0HI, S_0, R_0); 6516 __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2); 6517 __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2); 6518 6519 __ wide_mul(U_1, U_1HI, S_0, R_1); 6520 __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2); 6521 __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2); 6522 6523 __ andi(U_2, R_0, right_2_bits); 6524 __ mul(U_2, S_2, U_2); 6525 6526 // Partial reduction mod 2**130 - 5 6527 __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1 6528 __ adc(U_2, U_2, U_1HI, t1); 6529 // Sum is now in U_2:U_1:U_0. 6530 6531 // U_2:U_1:U_0: += (U_2 >> 2) * 5 6532 poly1305_reduce(U_2, U_1, U_0, t1, t2); 6533 6534 __ subi(length, length, BLOCK_LENGTH); 6535 __ addi(input_start, input_start, BLOCK_LENGTH); 6536 __ mv(t1, BLOCK_LENGTH); 6537 __ bge(length, t1, LOOP); 6538 } 6539 6540 // Further reduce modulo 2^130 - 5 6541 poly1305_reduce(U_2, U_1, U_0, t1, t2); 6542 6543 // Unpack the sum into five 26-bit limbs and write to memory. 
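// (Sketch of the unpacking, the inverse of poly1305_pack_26 above: limb i is bits
//  [26*i, 26*i + 25] of U_2:U_1:U_0, i.e.
//    limbs[0] = U_0 & ((1UL << 26) - 1);
//    limbs[1] = (U_0 >> 26) & ((1UL << 26) - 1);
//    limbs[2] = (U_0 >> 52) | ((U_1 & ((1UL << 14) - 1)) << 12);
//    limbs[3] = (U_1 >> 14) & ((1UL << 26) - 1);
//    limbs[4] = (U_1 >> 40) | ((U_2 & 7) << 24);
//  The shift-left-then-right pairs below extract exactly these bit fields.)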
6544 // First 26 bits is the first limb 6545 __ slli(t1, U_0, 38); // Take lowest 26 bits 6546 __ srli(t1, t1, 38); 6547 __ sd(t1, Address(acc_start)); // First 26-bit limb 6548 6549 // 27-52 bits of U_0 is the second limb 6550 __ slli(t1, U_0, 12); // Take next 27-52 bits 6551 __ srli(t1, t1, 38); 6552 __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb 6553 6554 // Getting 53-64 bits of U_0 and 1-14 bits of U_1 in one register 6555 __ srli(t1, U_0, 52); 6556 __ slli(t2, U_1, 50); 6557 __ srli(t2, t2, 38); 6558 __ add(t1, t1, t2); 6559 __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb 6560 6561 // Storing 15-40 bits of U_1 6562 __ slli(t1, U_1, 24); // Already used up 14 bits 6563 __ srli(t1, t1, 38); // Clear all other bits from t1 6564 __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb 6565 6566 // Storing 41-64 bits of U_1 and first three bits from U_2 in one register 6567 __ srli(t1, U_1, 40); 6568 __ andi(t2, U_2, right_3_bits); 6569 __ slli(t2, t2, 24); 6570 __ add(t1, t1, t2); 6571 __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb 6572 6573 __ bind(DONE); 6574 __ pop_reg(saved_regs, sp); 6575 __ leave(); // Required for proper stackwalking 6576 __ ret(); 6577 6578 return start; 6579 } 6580 6581 #endif // COMPILER2 6582 6583 /** 6584 * Arguments: 6585 * 6586 * Inputs: 6587 * c_rarg0 - int crc 6588 * c_rarg1 - byte* buf 6589 * c_rarg2 - int length 6590 * 6591 * Output: 6592 * c_rarg0 - int crc result 6593 */ 6594 address generate_updateBytesCRC32() { 6595 assert(UseCRC32Intrinsics, "what are we doing here?"); 6596 6597 __ align(CodeEntryAlignment); 6598 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id; 6599 StubCodeMark mark(this, stub_id); 6600 6601 address start = __ pc(); 6602 6603 // input parameters 6604 const Register crc = c_rarg0; // crc 6605 const Register buf = c_rarg1; // source java byte array address 6606 const Register len = c_rarg2; // length 6607 6608 BLOCK_COMMENT("Entry:"); 6609 __ enter(); // required for proper stackwalking of RuntimeStub frame 6610 6611 __ kernel_crc32(crc, buf, len, 6612 c_rarg3, c_rarg4, c_rarg5, c_rarg6, // tmp's for tables 6613 c_rarg7, t2, t3, t4, t5, t6); // misc tmps 6614 6615 __ leave(); // required for proper stackwalking of RuntimeStub frame 6616 __ ret(); 6617 6618 return start; 6619 } 6620 6621 // exception handler for upcall stubs 6622 address generate_upcall_stub_exception_handler() { 6623 StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id; 6624 StubCodeMark mark(this, stub_id); 6625 address start = __ pc(); 6626 6627 // Native caller has no idea how to handle exceptions, 6628 // so we just crash here. Up to callee to catch exceptions. 
6629 __ verify_oop(x10); // return a exception oop in a0 6630 __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception)); 6631 __ should_not_reach_here(); 6632 6633 return start; 6634 } 6635 6636 // load Method* target of MethodHandle 6637 // j_rarg0 = jobject receiver 6638 // xmethod = Method* result 6639 address generate_upcall_stub_load_target() { 6640 6641 StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id; 6642 StubCodeMark mark(this, stub_id); 6643 address start = __ pc(); 6644 6645 __ resolve_global_jobject(j_rarg0, t0, t1); 6646 // Load target method from receiver 6647 __ load_heap_oop(xmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), t0, t1); 6648 __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_LambdaForm::vmentry_offset()), t0, t1); 6649 __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_MemberName::method_offset()), t0, t1); 6650 __ access_load_at(T_ADDRESS, IN_HEAP, xmethod, 6651 Address(xmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 6652 noreg, noreg); 6653 __ sd(xmethod, Address(xthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 6654 6655 __ ret(); 6656 6657 return start; 6658 } 6659 6660 #undef __ 6661 6662 // Initialization 6663 void generate_preuniverse_stubs() { 6664 // preuniverse stubs are not needed for riscv 6665 } 6666 6667 void generate_initial_stubs() { 6668 // Generate initial stubs and initializes the entry points 6669 6670 // entry points that exist in all platforms Note: This is code 6671 // that could be shared among different platforms - however the 6672 // benefit seems to be smaller than the disadvantage of having a 6673 // much more complicated generator structure. See also comment in 6674 // stubRoutines.hpp. 
6675 6676 StubRoutines::_forward_exception_entry = generate_forward_exception(); 6677 6678 if (UnsafeMemoryAccess::_table == nullptr) { 6679 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory 6680 } 6681 6682 StubRoutines::_call_stub_entry = 6683 generate_call_stub(StubRoutines::_call_stub_return_address); 6684 6685 // is referenced by megamorphic call 6686 StubRoutines::_catch_exception_entry = generate_catch_exception(); 6687 6688 if (UseCRC32Intrinsics) { 6689 // set table address before stub generation which use it 6690 StubRoutines::_crc_table_adr = (address)StubRoutines::riscv::_crc_table; 6691 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 6692 } 6693 6694 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) && 6695 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) { 6696 StubRoutines::_hf2f = generate_float16ToFloat(); 6697 StubRoutines::_f2hf = generate_floatToFloat16(); 6698 } 6699 } 6700 6701 void generate_continuation_stubs() { 6702 // Continuation stubs: 6703 StubRoutines::_cont_thaw = generate_cont_thaw(); 6704 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 6705 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 6706 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub(); 6707 } 6708 6709 void generate_final_stubs() { 6710 // support for verify_oop (must happen after universe_init) 6711 if (VerifyOops) { 6712 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 6713 } 6714 6715 // arraycopy stubs used by compilers 6716 generate_arraycopy_stubs(); 6717 6718 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 6719 6720 #ifdef COMPILER2 6721 if (UseSecondarySupersTable) { 6722 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub(); 6723 if (!InlineSecondarySupersTest) { 6724 generate_lookup_secondary_supers_table_stub(); 6725 } 6726 } 6727 #endif // COMPILER2 6728 6729 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 6730 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 6731 6732 StubRoutines::riscv::set_completed(); 6733 } 6734 6735 void generate_compiler_stubs() { 6736 #ifdef COMPILER2 6737 if (UseMulAddIntrinsic) { 6738 StubRoutines::_mulAdd = generate_mulAdd(); 6739 } 6740 6741 if (UseMultiplyToLenIntrinsic) { 6742 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 6743 } 6744 6745 if (UseSquareToLenIntrinsic) { 6746 StubRoutines::_squareToLen = generate_squareToLen(); 6747 } 6748 6749 if (UseMontgomeryMultiplyIntrinsic) { 6750 StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id; 6751 StubCodeMark mark(this, stub_id); 6752 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 6753 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 6754 } 6755 6756 if (UseMontgomerySquareIntrinsic) { 6757 StubGenStubId stub_id = StubGenStubId::montgomerySquare_id; 6758 StubCodeMark mark(this, stub_id); 6759 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 6760 StubRoutines::_montgomerySquare = g.generate_square(); 6761 } 6762 6763 if (UseAESIntrinsics) { 6764 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 6765 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 6766 } 6767 6768 if (UsePoly1305Intrinsics) { 6769 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 6770 } 6771 6772 if (UseRVV) { 
6773 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 6774 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 6775 } 6776 6777 if (UseSHA256Intrinsics) { 6778 Sha2Generator sha2(_masm, this); 6779 StubRoutines::_sha256_implCompress = sha2.generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id); 6780 StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id); 6781 } 6782 6783 if (UseSHA512Intrinsics) { 6784 Sha2Generator sha2(_masm, this); 6785 StubRoutines::_sha512_implCompress = sha2.generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id); 6786 StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id); 6787 } 6788 6789 if (UseMD5Intrinsics) { 6790 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubGenStubId::md5_implCompress_id); 6791 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id); 6792 } 6793 6794 if (UseChaCha20Intrinsics) { 6795 StubRoutines::_chacha20Block = generate_chacha20Block(); 6796 } 6797 6798 if (UseSHA1Intrinsics) { 6799 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id); 6800 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id); 6801 } 6802 6803 if (UseBASE64Intrinsics) { 6804 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 6805 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 6806 } 6807 6808 if (UseAdler32Intrinsics) { 6809 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 6810 } 6811 6812 generate_compare_long_strings(); 6813 6814 generate_string_indexof_stubs(); 6815 6816 #endif // COMPILER2 6817 } 6818 6819 public: 6820 StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) { 6821 switch(blob_id) { 6822 case preuniverse_id: 6823 generate_preuniverse_stubs(); 6824 break; 6825 case initial_id: 6826 generate_initial_stubs(); 6827 break; 6828 case continuation_id: 6829 generate_continuation_stubs(); 6830 break; 6831 case compiler_id: 6832 generate_compiler_stubs(); 6833 break; 6834 case final_id: 6835 generate_final_stubs(); 6836 break; 6837 default: 6838 fatal("unexpected blob id: %d", blob_id); 6839 break; 6840 }; 6841 } 6842 }; // end class declaration 6843 6844 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) { 6845 StubGenerator g(code, blob_id); 6846 }