/*
 * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_riscv.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
57 // For a more detailed description of the stub routine structure 58 // see the comment in stubRoutines.hpp 59 60 #undef __ 61 #define __ _masm-> 62 63 #ifdef PRODUCT 64 #define BLOCK_COMMENT(str) /* nothing */ 65 #else 66 #define BLOCK_COMMENT(str) __ block_comment(str) 67 #endif 68 69 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 70 71 // Stub Code definitions 72 73 class StubGenerator: public StubCodeGenerator { 74 private: 75 76 #ifdef PRODUCT 77 #define inc_counter_np(counter) ((void)0) 78 #else 79 void inc_counter_np_(int& counter) { 80 __ la(t1, ExternalAddress((address)&counter)); 81 __ lwu(t0, Address(t1, 0)); 82 __ addiw(t0, t0, 1); 83 __ sw(t0, Address(t1, 0)); 84 } 85 #define inc_counter_np(counter) \ 86 BLOCK_COMMENT("inc_counter " #counter); \ 87 inc_counter_np_(counter); 88 #endif 89 90 // Call stubs are used to call Java from C 91 // 92 // Arguments: 93 // c_rarg0: call wrapper address address 94 // c_rarg1: result address 95 // c_rarg2: result type BasicType 96 // c_rarg3: method Method* 97 // c_rarg4: (interpreter) entry point address 98 // c_rarg5: parameters intptr_t* 99 // c_rarg6: parameter size (in words) int 100 // c_rarg7: thread Thread* 101 // 102 // There is no return from the stub itself as any Java result 103 // is written to result 104 // 105 // we save x1 (ra) as the return PC at the base of the frame and 106 // link x8 (fp) below it as the frame pointer installing sp (x2) 107 // into fp. 108 // 109 // we save x10-x17, which accounts for all the c arguments. 110 // 111 // TODO: strictly do we need to save them all? they are treated as 112 // volatile by C so could we omit saving the ones we are going to 113 // place in global registers (thread? method?) or those we only use 114 // during setup of the Java call? 115 // 116 // we don't need to save x5 which C uses as an indirect result location 117 // return register. 118 // 119 // we don't need to save x6-x7 and x28-x31 which both C and Java treat as 120 // volatile 121 // 122 // we save x18-x27 which Java uses as temporary registers and C 123 // expects to be callee-save 124 // 125 // so the stub frame looks like this when we enter Java code 126 // 127 // [ return_from_Java ] <--- sp 128 // [ argument word n ] 129 // ... 
130 // -22 [ argument word 1 ] 131 // -21 [ saved x27 ] <--- sp_after_call 132 // -20 [ saved x26 ] 133 // -19 [ saved x25 ] 134 // -18 [ saved x24 ] 135 // -17 [ saved x23 ] 136 // -16 [ saved x22 ] 137 // -15 [ saved x21 ] 138 // -14 [ saved x20 ] 139 // -13 [ saved x19 ] 140 // -12 [ saved x18 ] 141 // -11 [ saved x9 ] 142 // -10 [ call wrapper (x10) ] 143 // -9 [ result (x11) ] 144 // -8 [ result type (x12) ] 145 // -7 [ method (x13) ] 146 // -6 [ entry point (x14) ] 147 // -5 [ parameters (x15) ] 148 // -4 [ parameter size (x16) ] 149 // -3 [ thread (x17) ] 150 // -2 [ saved fp (x8) ] 151 // -1 [ saved ra (x1) ] 152 // 0 [ ] <--- fp == saved sp (x2) 153 154 // Call stub stack layout word offsets from fp 155 enum call_stub_layout { 156 sp_after_call_off = -21, 157 158 x27_off = -21, 159 x26_off = -20, 160 x25_off = -19, 161 x24_off = -18, 162 x23_off = -17, 163 x22_off = -16, 164 x21_off = -15, 165 x20_off = -14, 166 x19_off = -13, 167 x18_off = -12, 168 x9_off = -11, 169 170 call_wrapper_off = -10, 171 result_off = -9, 172 result_type_off = -8, 173 method_off = -7, 174 entry_point_off = -6, 175 parameters_off = -5, 176 parameter_size_off = -4, 177 thread_off = -3, 178 fp_f = -2, 179 retaddr_off = -1, 180 }; 181 182 address generate_call_stub(address& return_address) { 183 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 184 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 185 "adjust this code"); 186 187 StubCodeMark mark(this, "StubRoutines", "call_stub"); 188 address start = __ pc(); 189 190 const Address sp_after_call (fp, sp_after_call_off * wordSize); 191 192 const Address call_wrapper (fp, call_wrapper_off * wordSize); 193 const Address result (fp, result_off * wordSize); 194 const Address result_type (fp, result_type_off * wordSize); 195 const Address method (fp, method_off * wordSize); 196 const Address entry_point (fp, entry_point_off * wordSize); 197 const Address parameters (fp, parameters_off * wordSize); 198 const Address parameter_size(fp, parameter_size_off * wordSize); 199 200 const Address thread (fp, thread_off * wordSize); 201 202 const Address x27_save (fp, x27_off * wordSize); 203 const Address x26_save (fp, x26_off * wordSize); 204 const Address x25_save (fp, x25_off * wordSize); 205 const Address x24_save (fp, x24_off * wordSize); 206 const Address x23_save (fp, x23_off * wordSize); 207 const Address x22_save (fp, x22_off * wordSize); 208 const Address x21_save (fp, x21_off * wordSize); 209 const Address x20_save (fp, x20_off * wordSize); 210 const Address x19_save (fp, x19_off * wordSize); 211 const Address x18_save (fp, x18_off * wordSize); 212 213 const Address x9_save (fp, x9_off * wordSize); 214 215 // stub code 216 217 address riscv_entry = __ pc(); 218 219 // set up frame and move sp to end of save area 220 __ enter(); 221 __ addi(sp, fp, sp_after_call_off * wordSize); 222 223 // save register parameters and Java temporary/global registers 224 // n.b. 
we save thread even though it gets installed in 225 // xthread because we want to sanity check tp later 226 __ sd(c_rarg7, thread); 227 __ sw(c_rarg6, parameter_size); 228 __ sd(c_rarg5, parameters); 229 __ sd(c_rarg4, entry_point); 230 __ sd(c_rarg3, method); 231 __ sd(c_rarg2, result_type); 232 __ sd(c_rarg1, result); 233 __ sd(c_rarg0, call_wrapper); 234 235 __ sd(x9, x9_save); 236 237 __ sd(x18, x18_save); 238 __ sd(x19, x19_save); 239 __ sd(x20, x20_save); 240 __ sd(x21, x21_save); 241 __ sd(x22, x22_save); 242 __ sd(x23, x23_save); 243 __ sd(x24, x24_save); 244 __ sd(x25, x25_save); 245 __ sd(x26, x26_save); 246 __ sd(x27, x27_save); 247 248 // install Java thread in global register now we have saved 249 // whatever value it held 250 __ mv(xthread, c_rarg7); 251 252 // And method 253 __ mv(xmethod, c_rarg3); 254 255 // set up the heapbase register 256 __ reinit_heapbase(); 257 258 #ifdef ASSERT 259 // make sure we have no pending exceptions 260 { 261 Label L; 262 __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset()))); 263 __ beqz(t0, L); 264 __ stop("StubRoutines::call_stub: entered with pending exception"); 265 __ BIND(L); 266 } 267 #endif 268 // pass parameters if any 269 __ mv(esp, sp); 270 __ slli(t0, c_rarg6, LogBytesPerWord); 271 __ sub(t0, sp, t0); // Move SP out of the way 272 __ andi(sp, t0, -2 * wordSize); 273 274 BLOCK_COMMENT("pass parameters if any"); 275 Label parameters_done; 276 // parameter count is still in c_rarg6 277 // and parameter pointer identifying param 1 is in c_rarg5 278 __ beqz(c_rarg6, parameters_done); 279 280 address loop = __ pc(); 281 __ ld(t0, c_rarg5, 0); 282 __ addi(c_rarg5, c_rarg5, wordSize); 283 __ addi(c_rarg6, c_rarg6, -1); 284 __ push_reg(t0); 285 __ bgtz(c_rarg6, loop); 286 287 __ BIND(parameters_done); 288 289 // call Java entry -- passing methdoOop, and current sp 290 // xmethod: Method* 291 // x30: sender sp 292 BLOCK_COMMENT("call Java function"); 293 __ mv(x30, sp); 294 __ jalr(c_rarg4); 295 296 // save current address for use by exception handling code 297 298 return_address = __ pc(); 299 300 // store result depending on type (everything that is not 301 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 302 // n.b. 
this assumes Java returns an integral result in x10 303 // and a floating result in j_farg0 304 __ ld(j_rarg2, result); 305 Label is_long, is_float, is_double, exit; 306 __ ld(j_rarg1, result_type); 307 __ li(t0, (u1)T_OBJECT); 308 __ beq(j_rarg1, t0, is_long); 309 __ li(t0, (u1)T_LONG); 310 __ beq(j_rarg1, t0, is_long); 311 __ li(t0, (u1)T_FLOAT); 312 __ beq(j_rarg1, t0, is_float); 313 __ li(t0, (u1)T_DOUBLE); 314 __ beq(j_rarg1, t0, is_double); 315 316 // handle T_INT case 317 __ sw(x10, Address(j_rarg2)); 318 319 __ BIND(exit); 320 321 // pop parameters 322 __ addi(esp, fp, sp_after_call_off * wordSize); 323 324 #ifdef ASSERT 325 // verify that threads correspond 326 { 327 Label L, S; 328 __ ld(t0, thread); 329 __ bne(xthread, t0, S); 330 __ get_thread(t0); 331 __ beq(xthread, t0, L); 332 __ BIND(S); 333 __ stop("StubRoutines::call_stub: threads must correspond"); 334 __ BIND(L); 335 } 336 #endif 337 338 // restore callee-save registers 339 __ ld(x27, x27_save); 340 __ ld(x26, x26_save); 341 __ ld(x25, x25_save); 342 __ ld(x24, x24_save); 343 __ ld(x23, x23_save); 344 __ ld(x22, x22_save); 345 __ ld(x21, x21_save); 346 __ ld(x20, x20_save); 347 __ ld(x19, x19_save); 348 __ ld(x18, x18_save); 349 350 __ ld(x9, x9_save); 351 352 __ ld(c_rarg0, call_wrapper); 353 __ ld(c_rarg1, result); 354 __ ld(c_rarg2, result_type); 355 __ ld(c_rarg3, method); 356 __ ld(c_rarg4, entry_point); 357 __ ld(c_rarg5, parameters); 358 __ ld(c_rarg6, parameter_size); 359 __ ld(c_rarg7, thread); 360 361 // leave frame and return to caller 362 __ leave(); 363 __ ret(); 364 365 // handle return types different from T_INT 366 367 __ BIND(is_long); 368 __ sd(x10, Address(j_rarg2, 0)); 369 __ j(exit); 370 371 __ BIND(is_float); 372 __ fsw(j_farg0, Address(j_rarg2, 0), t0); 373 __ j(exit); 374 375 __ BIND(is_double); 376 __ fsd(j_farg0, Address(j_rarg2, 0), t0); 377 __ j(exit); 378 379 return start; 380 } 381 382 // Return point for a Java call if there's an exception thrown in 383 // Java code. The exception is caught and transformed into a 384 // pending exception stored in JavaThread that can be tested from 385 // within the VM. 386 // 387 // Note: Usually the parameters are removed by the callee. In case 388 // of an exception crossing an activation frame boundary, that is 389 // not the case if the callee is compiled code => need to setup the 390 // sp. 
391 // 392 // x10: exception oop 393 394 address generate_catch_exception() { 395 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 396 address start = __ pc(); 397 398 // same as in generate_call_stub(): 399 const Address thread(fp, thread_off * wordSize); 400 401 #ifdef ASSERT 402 // verify that threads correspond 403 { 404 Label L, S; 405 __ ld(t0, thread); 406 __ bne(xthread, t0, S); 407 __ get_thread(t0); 408 __ beq(xthread, t0, L); 409 __ bind(S); 410 __ stop("StubRoutines::catch_exception: threads must correspond"); 411 __ bind(L); 412 } 413 #endif 414 415 // set pending exception 416 __ verify_oop(x10); 417 418 __ sd(x10, Address(xthread, Thread::pending_exception_offset())); 419 __ mv(t0, (address)__FILE__); 420 __ sd(t0, Address(xthread, Thread::exception_file_offset())); 421 __ mv(t0, (int)__LINE__); 422 __ sw(t0, Address(xthread, Thread::exception_line_offset())); 423 424 // complete return to VM 425 assert(StubRoutines::_call_stub_return_address != NULL, 426 "_call_stub_return_address must have been generated before"); 427 __ j(StubRoutines::_call_stub_return_address); 428 429 return start; 430 } 431 432 // Continuation point for runtime calls returning with a pending 433 // exception. The pending exception check happened in the runtime 434 // or native call stub. The pending exception in Thread is 435 // converted into a Java-level exception. 436 // 437 // Contract with Java-level exception handlers: 438 // x10: exception 439 // x13: throwing pc 440 // 441 // NOTE: At entry of this stub, exception-pc must be in RA !! 442 443 // NOTE: this is always used as a jump target within generated code 444 // so it just needs to be generated code with no x86 prolog 445 446 address generate_forward_exception() { 447 StubCodeMark mark(this, "StubRoutines", "forward exception"); 448 address start = __ pc(); 449 450 // Upon entry, RA points to the return address returning into 451 // Java (interpreted or compiled) code; i.e., the return address 452 // becomes the throwing pc. 453 // 454 // Arguments pushed before the runtime call are still on the stack 455 // but the exception handler will reset the stack pointer -> 456 // ignore them. A potential result in registers can be ignored as 457 // well. 458 459 #ifdef ASSERT 460 // make sure this code is only executed if there is a pending exception 461 { 462 Label L; 463 __ ld(t0, Address(xthread, Thread::pending_exception_offset())); 464 __ bnez(t0, L); 465 __ stop("StubRoutines::forward exception: no pending exception (1)"); 466 __ bind(L); 467 } 468 #endif 469 470 // compute exception handler into x9 471 472 // call the VM to find the handler address associated with the 473 // caller address. pass thread in x10 and caller pc (ret address) 474 // in x11. n.b. the caller pc is in ra, unlike x86 where it is on 475 // the stack. 476 __ mv(c_rarg1, ra); 477 // ra will be trashed by the VM call so we move it to x9 478 // (callee-saved) because we also need to pass it to the handler 479 // returned by this call. 480 __ mv(x9, ra); 481 BLOCK_COMMENT("call exception_handler_for_return_address"); 482 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 483 SharedRuntime::exception_handler_for_return_address), 484 xthread, c_rarg1); 485 // we should not really care that ra is no longer the callee 486 // address. we saved the value the handler needs in x9 so we can 487 // just copy it to x13. 
however, the C2 handler will push its own 488 // frame and then calls into the VM and the VM code asserts that 489 // the PC for the frame above the handler belongs to a compiled 490 // Java method. So, we restore ra here to satisfy that assert. 491 __ mv(ra, x9); 492 // setup x10 & x13 & clear pending exception 493 __ mv(x13, x9); 494 __ mv(x9, x10); 495 __ ld(x10, Address(xthread, Thread::pending_exception_offset())); 496 __ sd(zr, Address(xthread, Thread::pending_exception_offset())); 497 498 #ifdef ASSERT 499 // make sure exception is set 500 { 501 Label L; 502 __ bnez(x10, L); 503 __ stop("StubRoutines::forward exception: no pending exception (2)"); 504 __ bind(L); 505 } 506 #endif 507 508 // continue at exception handler 509 // x10: exception 510 // x13: throwing pc 511 // x9: exception handler 512 __ verify_oop(x10); 513 __ jr(x9); 514 515 return start; 516 } 517 518 // Non-destructive plausibility checks for oops 519 // 520 // Arguments: 521 // x10: oop to verify 522 // t0: error message 523 // 524 // Stack after saving c_rarg3: 525 // [tos + 0]: saved c_rarg3 526 // [tos + 1]: saved c_rarg2 527 // [tos + 2]: saved ra 528 // [tos + 3]: saved t1 529 // [tos + 4]: saved x10 530 // [tos + 5]: saved t0 531 address generate_verify_oop() { 532 533 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 534 address start = __ pc(); 535 536 Label exit, error; 537 538 __ push_reg(0x3000, sp); // save c_rarg2 and c_rarg3 539 540 __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 541 __ ld(c_rarg3, Address(c_rarg2)); 542 __ add(c_rarg3, c_rarg3, 1); 543 __ sd(c_rarg3, Address(c_rarg2)); 544 545 // object is in x10 546 // make sure object is 'reasonable' 547 __ beqz(x10, exit); // if obj is NULL it is OK 548 549 #if INCLUDE_ZGC 550 if (UseZGC) { 551 // Check if mask is good. 552 // verifies that ZAddressBadMask & x10 == 0 553 __ ld(c_rarg3, Address(xthread, ZThreadLocalData::address_bad_mask_offset())); 554 __ andr(c_rarg2, x10, c_rarg3); 555 __ bnez(c_rarg2, error); 556 } 557 #endif 558 559 // Check if the oop is in the right area of memory 560 __ mv(c_rarg3, (intptr_t) Universe::verify_oop_mask()); 561 __ andr(c_rarg2, x10, c_rarg3); 562 __ mv(c_rarg3, (intptr_t) Universe::verify_oop_bits()); 563 564 // Compare c_rarg2 and c_rarg3. 565 __ bne(c_rarg2, c_rarg3, error); 566 567 // make sure klass is 'reasonable', which is not zero. 568 __ load_klass(x10, x10); // get klass 569 __ beqz(x10, error); // if klass is NULL it is broken 570 571 // return if everything seems ok 572 __ bind(exit); 573 574 __ pop_reg(0x3000, sp); // pop c_rarg2 and c_rarg3 575 __ ret(); 576 577 // handle errors 578 __ bind(error); 579 __ pop_reg(0x3000, sp); // pop c_rarg2 and c_rarg3 580 581 __ pusha(); 582 // debug(char* msg, int64_t pc, int64_t regs[]) 583 __ mv(c_rarg0, t0); // pass address of error message 584 __ mv(c_rarg1, ra); // pass return address 585 __ mv(c_rarg2, sp); // pass address of regs on stack 586 #ifndef PRODUCT 587 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 588 #endif 589 BLOCK_COMMENT("call MacroAssembler::debug"); 590 int32_t offset = 0; 591 __ movptr_with_offset(t0, CAST_FROM_FN_PTR(address, MacroAssembler::debug64), offset); 592 __ jalr(x1, t0, offset); 593 __ ebreak(); 594 595 return start; 596 } 597 598 // The inner part of zero_words(). 599 // 600 // Inputs: 601 // x28: the HeapWord-aligned base address of an array to zero. 602 // x29: the count in HeapWords, x29 > 0. 
603 // 604 // Returns x28 and x29, adjusted for the caller to clear. 605 // x28: the base address of the tail of words left to clear. 606 // x29: the number of words in the tail. 607 // x29 < MacroAssembler::zero_words_block_size. 608 609 address generate_zero_blocks() { 610 Label done; 611 612 const Register base = x28, cnt = x29; 613 614 __ align(CodeEntryAlignment); 615 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 616 address start = __ pc(); 617 618 { 619 // Clear the remaining blocks. 620 Label loop; 621 __ sub(cnt, cnt, MacroAssembler::zero_words_block_size); 622 __ bltz(cnt, done); 623 __ bind(loop); 624 for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) { 625 __ sd(zr, Address(base, 0)); 626 __ add(base, base, 8); 627 } 628 __ sub(cnt, cnt, MacroAssembler::zero_words_block_size); 629 __ bgez(cnt, loop); 630 __ bind(done); 631 __ add(cnt, cnt, MacroAssembler::zero_words_block_size); 632 } 633 634 __ ret(); 635 636 return start; 637 } 638 639 typedef enum { 640 copy_forwards = 1, 641 copy_backwards = -1 642 } copy_direction; 643 644 // Bulk copy of blocks of 8 words. 645 // 646 // count is a count of words. 647 // 648 // Precondition: count >= 8 649 // 650 // Postconditions: 651 // 652 // The least significant bit of count contains the remaining count 653 // of words to copy. The rest of count is trash. 654 // 655 // s and d are adjusted to point to the remaining words to copy 656 // 657 void generate_copy_longs(Label &start, Register s, Register d, Register count, 658 copy_direction direction) { 659 int unit = wordSize * direction; 660 int bias = wordSize; 661 662 const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16, 663 tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29; 664 665 const Register stride = x30; 666 667 assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3, 668 tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7); 669 assert_different_registers(s, d, count, t0); 670 671 Label again, drain; 672 const char* stub_name = NULL; 673 if (direction == copy_forwards) { 674 stub_name = "forward_copy_longs"; 675 } else { 676 stub_name = "backward_copy_longs"; 677 } 678 StubCodeMark mark(this, "StubRoutines", stub_name); 679 __ align(CodeEntryAlignment); 680 __ bind(start); 681 682 if (direction == copy_forwards) { 683 __ sub(s, s, bias); 684 __ sub(d, d, bias); 685 } 686 687 #ifdef ASSERT 688 // Make sure we are never given < 8 words 689 { 690 Label L; 691 692 __ li(t0, 8); 693 __ bge(count, t0, L); 694 __ stop("genrate_copy_longs called with < 8 words"); 695 __ bind(L); 696 } 697 #endif 698 699 __ ld(tmp_reg0, Address(s, 1 * unit)); 700 __ ld(tmp_reg1, Address(s, 2 * unit)); 701 __ ld(tmp_reg2, Address(s, 3 * unit)); 702 __ ld(tmp_reg3, Address(s, 4 * unit)); 703 __ ld(tmp_reg4, Address(s, 5 * unit)); 704 __ ld(tmp_reg5, Address(s, 6 * unit)); 705 __ ld(tmp_reg6, Address(s, 7 * unit)); 706 __ ld(tmp_reg7, Address(s, 8 * unit)); 707 __ addi(s, s, 8 * unit); 708 709 __ sub(count, count, 16); 710 __ bltz(count, drain); 711 712 __ bind(again); 713 714 __ sd(tmp_reg0, Address(d, 1 * unit)); 715 __ sd(tmp_reg1, Address(d, 2 * unit)); 716 __ sd(tmp_reg2, Address(d, 3 * unit)); 717 __ sd(tmp_reg3, Address(d, 4 * unit)); 718 __ sd(tmp_reg4, Address(d, 5 * unit)); 719 __ sd(tmp_reg5, Address(d, 6 * unit)); 720 __ sd(tmp_reg6, Address(d, 7 * unit)); 721 __ sd(tmp_reg7, Address(d, 8 * unit)); 722 723 __ ld(tmp_reg0, Address(s, 1 * unit)); 724 __ ld(tmp_reg1, Address(s, 2 * unit)); 725 __ ld(tmp_reg2, Address(s, 3 * unit)); 726 __ 
ld(tmp_reg3, Address(s, 4 * unit)); 727 __ ld(tmp_reg4, Address(s, 5 * unit)); 728 __ ld(tmp_reg5, Address(s, 6 * unit)); 729 __ ld(tmp_reg6, Address(s, 7 * unit)); 730 __ ld(tmp_reg7, Address(s, 8 * unit)); 731 732 __ addi(s, s, 8 * unit); 733 __ addi(d, d, 8 * unit); 734 735 __ sub(count, count, 8); 736 __ bgez(count, again); 737 738 // Drain 739 __ bind(drain); 740 741 __ sd(tmp_reg0, Address(d, 1 * unit)); 742 __ sd(tmp_reg1, Address(d, 2 * unit)); 743 __ sd(tmp_reg2, Address(d, 3 * unit)); 744 __ sd(tmp_reg3, Address(d, 4 * unit)); 745 __ sd(tmp_reg4, Address(d, 5 * unit)); 746 __ sd(tmp_reg5, Address(d, 6 * unit)); 747 __ sd(tmp_reg6, Address(d, 7 * unit)); 748 __ sd(tmp_reg7, Address(d, 8 * unit)); 749 __ addi(d, d, 8 * unit); 750 751 { 752 Label L1, L2; 753 __ andi(t0, count, 4); 754 __ beqz(t0, L1); 755 756 __ ld(tmp_reg0, Address(s, 1 * unit)); 757 __ ld(tmp_reg1, Address(s, 2 * unit)); 758 __ ld(tmp_reg2, Address(s, 3 * unit)); 759 __ ld(tmp_reg3, Address(s, 4 * unit)); 760 __ addi(s, s, 4 * unit); 761 762 __ sd(tmp_reg0, Address(d, 1 * unit)); 763 __ sd(tmp_reg1, Address(d, 2 * unit)); 764 __ sd(tmp_reg2, Address(d, 3 * unit)); 765 __ sd(tmp_reg3, Address(d, 4 * unit)); 766 __ addi(d, d, 4 * unit); 767 768 __ bind(L1); 769 770 if (direction == copy_forwards) { 771 __ addi(s, s, bias); 772 __ addi(d, d, bias); 773 } 774 775 __ andi(t0, count, 2); 776 __ beqz(t0, L2); 777 if (direction == copy_backwards) { 778 __ addi(s, s, 2 * unit); 779 __ ld(tmp_reg0, Address(s)); 780 __ ld(tmp_reg1, Address(s, wordSize)); 781 __ addi(d, d, 2 * unit); 782 __ sd(tmp_reg0, Address(d)); 783 __ sd(tmp_reg1, Address(d, wordSize)); 784 } else { 785 __ ld(tmp_reg0, Address(s)); 786 __ ld(tmp_reg1, Address(s, wordSize)); 787 __ addi(s, s, 2 * unit); 788 __ sd(tmp_reg0, Address(d)); 789 __ sd(tmp_reg1, Address(d, wordSize)); 790 __ addi(d, d, 2 * unit); 791 } 792 __ bind(L2); 793 } 794 795 __ ret(); 796 } 797 798 Label copy_f, copy_b; 799 800 // All-singing all-dancing memory copy. 801 // 802 // Copy count units of memory from s to d. The size of a unit is 803 // step, which can be positive or negative depending on the direction 804 // of copy. If is_aligned is false, we align the source address. 
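  //
  // For example (illustrative, not generated code): a conjoint jshort copy
  // reaches this routine with step = -2, so granularity = uabs(step) = 2,
  // the element count is scaled to a byte count, and the working src/dst
  // pointers start at the end of their ranges before copying backwards.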
805 // 806 /* 807 * if (is_aligned) { 808 * goto copy_8_bytes; 809 * } 810 * bool is_backwards = step < 0; 811 * int granularity = uabs(step); 812 * count = count * granularity; * count bytes 813 * 814 * if (is_backwards) { 815 * s += count; 816 * d += count; 817 * } 818 * 819 * count limit maybe greater than 16, for better performance 820 * if (count < 16) { 821 * goto copy_small; 822 * } 823 * 824 * if ((dst % 8) == (src % 8)) { 825 * aligned; 826 * goto copy8; 827 * } 828 * 829 * copy_small: 830 * load element one by one; 831 * done; 832 */ 833 834 typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp); 835 836 void copy_memory_v(Register s, Register d, Register count, Register tmp, int step) { 837 bool is_backward = step < 0; 838 int granularity = uabs(step); 839 840 const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17; 841 assert_different_registers(s, d, cnt, vl, tmp, tmp1, tmp2); 842 Assembler::SEW sew = Assembler::elembytes_to_sew(granularity); 843 Label loop_forward, loop_backward, done; 844 845 __ mv(dst, d); 846 __ mv(src, s); 847 __ mv(cnt, count); 848 849 __ bind(loop_forward); 850 __ vsetvli(vl, cnt, sew, Assembler::m8); 851 if (is_backward) { 852 __ bne(vl, cnt, loop_backward); 853 } 854 855 __ vlex_v(v0, src, sew); 856 __ sub(cnt, cnt, vl); 857 __ slli(vl, vl, (int)sew); 858 __ add(src, src, vl); 859 860 __ vsex_v(v0, dst, sew); 861 __ add(dst, dst, vl); 862 __ bnez(cnt, loop_forward); 863 864 if (is_backward) { 865 __ j(done); 866 867 __ bind(loop_backward); 868 __ sub(tmp, cnt, vl); 869 __ slli(tmp, tmp, sew); 870 __ add(tmp1, s, tmp); 871 __ vlex_v(v0, tmp1, sew); 872 __ add(tmp2, d, tmp); 873 __ vsex_v(v0, tmp2, sew); 874 __ sub(cnt, cnt, vl); 875 __ bnez(cnt, loop_forward); 876 __ bind(done); 877 } 878 } 879 880 void copy_memory(bool is_aligned, Register s, Register d, 881 Register count, Register tmp, int step) { 882 if (UseRVV) { 883 return copy_memory_v(s, d, count, tmp, step); 884 } 885 886 bool is_backwards = step < 0; 887 int granularity = uabs(step); 888 889 const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17; 890 891 Label same_aligned; 892 Label copy8, copy_small, done; 893 894 copy_insn ld_arr = NULL, st_arr = NULL; 895 switch (granularity) { 896 case 1 : 897 ld_arr = (copy_insn)&MacroAssembler::lbu; 898 st_arr = (copy_insn)&MacroAssembler::sb; 899 break; 900 case 2 : 901 ld_arr = (copy_insn)&MacroAssembler::lhu; 902 st_arr = (copy_insn)&MacroAssembler::sh; 903 break; 904 case 4 : 905 ld_arr = (copy_insn)&MacroAssembler::lwu; 906 st_arr = (copy_insn)&MacroAssembler::sw; 907 break; 908 case 8 : 909 ld_arr = (copy_insn)&MacroAssembler::ld; 910 st_arr = (copy_insn)&MacroAssembler::sd; 911 break; 912 default : 913 ShouldNotReachHere(); 914 } 915 916 __ beqz(count, done); 917 __ slli(cnt, count, exact_log2(granularity)); 918 if (is_backwards) { 919 __ add(src, s, cnt); 920 __ add(dst, d, cnt); 921 } else { 922 __ mv(src, s); 923 __ mv(dst, d); 924 } 925 926 if (is_aligned) { 927 __ addi(tmp, cnt, -8); 928 __ bgez(tmp, copy8); 929 __ j(copy_small); 930 } 931 932 __ mv(tmp, 16); 933 __ blt(cnt, tmp, copy_small); 934 935 __ xorr(tmp, src, dst); 936 __ andi(tmp, tmp, 0b111); 937 __ bnez(tmp, copy_small); 938 939 __ bind(same_aligned); 940 __ andi(tmp, src, 0b111); 941 __ beqz(tmp, copy8); 942 if (is_backwards) { 943 __ addi(src, src, step); 944 __ addi(dst, dst, step); 945 } 946 (_masm->*ld_arr)(tmp3, Address(src), t0); 947 (_masm->*st_arr)(tmp3, Address(dst), t0); 948 if (!is_backwards) 
{ 949 __ addi(src, src, step); 950 __ addi(dst, dst, step); 951 } 952 __ addi(cnt, cnt, -granularity); 953 __ beqz(cnt, done); 954 __ j(same_aligned); 955 956 __ bind(copy8); 957 if (is_backwards) { 958 __ addi(src, src, -wordSize); 959 __ addi(dst, dst, -wordSize); 960 } 961 __ ld(tmp3, Address(src)); 962 __ sd(tmp3, Address(dst)); 963 if (!is_backwards) { 964 __ addi(src, src, wordSize); 965 __ addi(dst, dst, wordSize); 966 } 967 __ addi(cnt, cnt, -wordSize); 968 __ addi(tmp4, cnt, -8); 969 __ bgez(tmp4, copy8); // cnt >= 8, do next loop 970 971 __ beqz(cnt, done); 972 973 __ bind(copy_small); 974 if (is_backwards) { 975 __ addi(src, src, step); 976 __ addi(dst, dst, step); 977 } 978 (_masm->*ld_arr)(tmp3, Address(src), t0); 979 (_masm->*st_arr)(tmp3, Address(dst), t0); 980 if (!is_backwards) { 981 __ addi(src, src, step); 982 __ addi(dst, dst, step); 983 } 984 __ addi(cnt, cnt, -granularity); 985 __ bgtz(cnt, copy_small); 986 987 __ bind(done); 988 } 989 990 // Scan over array at a for count oops, verifying each one. 991 // Preserves a and count, clobbers t0 and t1. 992 void verify_oop_array(size_t size, Register a, Register count, Register temp) { 993 Label loop, end; 994 __ mv(t1, zr); 995 __ slli(t0, count, exact_log2(size)); 996 __ bind(loop); 997 __ bgeu(t1, t0, end); 998 999 __ add(temp, a, t1); 1000 if (size == (size_t)wordSize) { 1001 __ ld(temp, Address(temp, 0)); 1002 __ verify_oop(temp); 1003 } else { 1004 __ lwu(temp, Address(temp, 0)); 1005 __ decode_heap_oop(temp); // calls verify_oop 1006 } 1007 __ add(t1, t1, size); 1008 __ j(loop); 1009 __ bind(end); 1010 } 1011 1012 // Arguments: 1013 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1014 // ignored 1015 // is_oop - true => oop array, so generate store check code 1016 // name - stub name string 1017 // 1018 // Inputs: 1019 // c_rarg0 - source array address 1020 // c_rarg1 - destination array address 1021 // c_rarg2 - element count, treated as ssize_t, can be zero 1022 // 1023 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1024 // the hardware handle it. The two dwords within qwords that span 1025 // cache line boundaries will still be loaded and stored atomicly. 1026 // 1027 // Side Effects: 1028 // disjoint_int_copy_entry is set to the no-overlap entry point 1029 // used by generate_conjoint_int_oop_copy(). 
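  //
  // Note: on completion the disjoint and conjoint copy stubs below always
  // return 0 in x10; only the checkcast copy stub further down uses x10 to
  // report a partial transfer count instead.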
1030 // 1031 address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address* entry, 1032 const char* name, bool dest_uninitialized = false) { 1033 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1034 RegSet saved_reg = RegSet::of(s, d, count); 1035 __ align(CodeEntryAlignment); 1036 StubCodeMark mark(this, "StubRoutines", name); 1037 address start = __ pc(); 1038 __ enter(); 1039 1040 if (entry != NULL) { 1041 *entry = __ pc(); 1042 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1043 BLOCK_COMMENT("Entry:"); 1044 } 1045 1046 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1047 if (dest_uninitialized) { 1048 decorators |= IS_DEST_UNINITIALIZED; 1049 } 1050 if (aligned) { 1051 decorators |= ARRAYCOPY_ALIGNED; 1052 } 1053 1054 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1055 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1056 1057 if (is_oop) { 1058 // save regs before copy_memory 1059 __ push_reg(RegSet::of(d, count), sp); 1060 } 1061 1062 { 1063 // UnsafeCopyMemory page error: continue after ucm 1064 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1065 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1066 copy_memory(aligned, s, d, count, t0, size); 1067 } 1068 1069 if (is_oop) { 1070 __ pop_reg(RegSet::of(d, count), sp); 1071 if (VerifyOops) { 1072 verify_oop_array(size, d, count, t2); 1073 } 1074 } 1075 1076 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet()); 1077 1078 __ leave(); 1079 __ mv(x10, zr); // return 0 1080 __ ret(); 1081 return start; 1082 } 1083 1084 // Arguments: 1085 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1086 // ignored 1087 // is_oop - true => oop array, so generate store check code 1088 // name - stub name string 1089 // 1090 // Inputs: 1091 // c_rarg0 - source array address 1092 // c_rarg1 - destination array address 1093 // c_rarg2 - element count, treated as ssize_t, can be zero 1094 // 1095 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1096 // the hardware handle it. The two dwords within qwords that span 1097 // cache line boundaries will still be loaded and stored atomicly. 
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address* entry, const char* name,
                                 bool dest_uninitialized = false) {
    const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(t0, d, s);
    __ slli(t1, count, exact_log2(size));
    __ bgeu(t0, t1, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push_reg(RegSet::of(d, count), sp);
    }

    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, t0, -size);
    }

    if (is_oop) {
      __ pop_reg(RegSet::of(d, count), sp);
      if (VerifyOops) {
        verify_oop_array(size, d, count, t2);
      }
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
    __ leave();
    __ mv(x10, zr); // return 0
    __ ret();
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it. The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
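  //
  // The per-type stubs that follow are thin wrappers over the two generic
  // routines above: each passes its element width (sizeof(jbyte), sizeof(jshort),
  // sizeof(jint) or sizeof(jlong)) as 'size', and the conjoint variants hand
  // copy_memory() a negated step so the copy runs backwards when the source
  // and destination ranges may overlap.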
1180 // 1181 address generate_disjoint_byte_copy(bool aligned, address* entry, const char* name) { 1182 const bool not_oop = false; 1183 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1184 } 1185 1186 // Arguments: 1187 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1188 // ignored 1189 // name - stub name string 1190 // 1191 // Inputs: 1192 // c_rarg0 - source array address 1193 // c_rarg1 - destination array address 1194 // c_rarg2 - element count, treated as ssize_t, can be zero 1195 // 1196 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1197 // we let the hardware handle it. The one to eight bytes within words, 1198 // dwords or qwords that span cache line boundaries will still be loaded 1199 // and stored atomically. 1200 // 1201 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1202 address* entry, const char* name) { 1203 const bool not_oop = false; 1204 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1205 } 1206 1207 // Arguments: 1208 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1209 // ignored 1210 // name - stub name string 1211 // 1212 // Inputs: 1213 // c_rarg0 - source array address 1214 // c_rarg1 - destination array address 1215 // c_rarg2 - element count, treated as ssize_t, can be zero 1216 // 1217 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1218 // let the hardware handle it. The two or four words within dwords 1219 // or qwords that span cache line boundaries will still be loaded 1220 // and stored atomically. 1221 // 1222 // Side Effects: 1223 // disjoint_short_copy_entry is set to the no-overlap entry point 1224 // used by generate_conjoint_short_copy(). 1225 // 1226 address generate_disjoint_short_copy(bool aligned, 1227 address* entry, const char* name) { 1228 const bool not_oop = false; 1229 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1230 } 1231 1232 // Arguments: 1233 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1234 // ignored 1235 // name - stub name string 1236 // 1237 // Inputs: 1238 // c_rarg0 - source array address 1239 // c_rarg1 - destination array address 1240 // c_rarg2 - element count, treated as ssize_t, can be zero 1241 // 1242 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1243 // let the hardware handle it. The two or four words within dwords 1244 // or qwords that span cache line boundaries will still be loaded 1245 // and stored atomically. 1246 // 1247 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1248 address* entry, const char* name) { 1249 const bool not_oop = false; 1250 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1251 } 1252 1253 // Arguments: 1254 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1255 // ignored 1256 // name - stub name string 1257 // 1258 // Inputs: 1259 // c_rarg0 - source array address 1260 // c_rarg1 - destination array address 1261 // c_rarg2 - element count, treated as ssize_t, can be zero 1262 // 1263 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1264 // the hardware handle it. The two dwords within qwords that span 1265 // cache line boundaries will still be loaded and stored atomicly. 
1266 // 1267 // Side Effects: 1268 // disjoint_int_copy_entry is set to the no-overlap entry point 1269 // used by generate_conjoint_int_oop_copy(). 1270 // 1271 address generate_disjoint_int_copy(bool aligned, address* entry, 1272 const char* name, bool dest_uninitialized = false) { 1273 const bool not_oop = false; 1274 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1275 } 1276 1277 // Arguments: 1278 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1279 // ignored 1280 // name - stub name string 1281 // 1282 // Inputs: 1283 // c_rarg0 - source array address 1284 // c_rarg1 - destination array address 1285 // c_rarg2 - element count, treated as ssize_t, can be zero 1286 // 1287 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1288 // the hardware handle it. The two dwords within qwords that span 1289 // cache line boundaries will still be loaded and stored atomicly. 1290 // 1291 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1292 address* entry, const char* name, 1293 bool dest_uninitialized = false) { 1294 const bool not_oop = false; 1295 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1296 } 1297 1298 1299 // Arguments: 1300 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1301 // ignored 1302 // name - stub name string 1303 // 1304 // Inputs: 1305 // c_rarg0 - source array address 1306 // c_rarg1 - destination array address 1307 // c_rarg2 - element count, treated as size_t, can be zero 1308 // 1309 // Side Effects: 1310 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1311 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1312 // 1313 address generate_disjoint_long_copy(bool aligned, address* entry, 1314 const char* name, bool dest_uninitialized = false) { 1315 const bool not_oop = false; 1316 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1317 } 1318 1319 // Arguments: 1320 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1321 // ignored 1322 // name - stub name string 1323 // 1324 // Inputs: 1325 // c_rarg0 - source array address 1326 // c_rarg1 - destination array address 1327 // c_rarg2 - element count, treated as size_t, can be zero 1328 // 1329 address generate_conjoint_long_copy(bool aligned, 1330 address nooverlap_target, address* entry, 1331 const char* name, bool dest_uninitialized = false) { 1332 const bool not_oop = false; 1333 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1334 } 1335 1336 // Arguments: 1337 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1338 // ignored 1339 // name - stub name string 1340 // 1341 // Inputs: 1342 // c_rarg0 - source array address 1343 // c_rarg1 - destination array address 1344 // c_rarg2 - element count, treated as size_t, can be zero 1345 // 1346 // Side Effects: 1347 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1348 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1349 // 1350 address generate_disjoint_oop_copy(bool aligned, address* entry, 1351 const char* name, bool dest_uninitialized) { 1352 const bool is_oop = true; 1353 const size_t size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1354 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1355 } 1356 1357 // Arguments: 1358 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1359 // ignored 1360 // name - stub name string 1361 // 1362 // Inputs: 1363 // c_rarg0 - source array address 1364 // c_rarg1 - destination array address 1365 // c_rarg2 - element count, treated as size_t, can be zero 1366 // 1367 address generate_conjoint_oop_copy(bool aligned, 1368 address nooverlap_target, address* entry, 1369 const char* name, bool dest_uninitialized) { 1370 const bool is_oop = true; 1371 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1372 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1373 name, dest_uninitialized); 1374 } 1375 1376 // Helper for generating a dynamic type check. 1377 // Smashes t0, t1. 1378 void generate_type_check(Register sub_klass, 1379 Register super_check_offset, 1380 Register super_klass, 1381 Label& L_success) { 1382 assert_different_registers(sub_klass, super_check_offset, super_klass); 1383 1384 BLOCK_COMMENT("type_check:"); 1385 1386 Label L_miss; 1387 1388 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, super_check_offset); 1389 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1390 1391 // Fall through on failure! 1392 __ BIND(L_miss); 1393 } 1394 1395 // 1396 // Generate checkcasting array copy stub 1397 // 1398 // Input: 1399 // c_rarg0 - source array address 1400 // c_rarg1 - destination array address 1401 // c_rarg2 - element count, treated as ssize_t, can be zero 1402 // c_rarg3 - size_t ckoff (super_check_offset) 1403 // c_rarg4 - oop ckval (super_klass) 1404 // 1405 // Output: 1406 // x10 == 0 - success 1407 // x10 == -1^K - failure, where K is partial transfer count 1408 // 1409 address generate_checkcast_copy(const char* name, address* entry, 1410 bool dest_uninitialized = false) { 1411 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1412 1413 // Input registers (after setup_arg_regs) 1414 const Register from = c_rarg0; // source array address 1415 const Register to = c_rarg1; // destination array address 1416 const Register count = c_rarg2; // elementscount 1417 const Register ckoff = c_rarg3; // super_check_offset 1418 const Register ckval = c_rarg4; // super_klass 1419 1420 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1421 RegSet wb_post_saved_regs = RegSet::of(count); 1422 1423 // Registers used as temps (x7, x9, x18 are save-on-entry) 1424 const Register count_save = x19; // orig elementscount 1425 const Register start_to = x18; // destination array start address 1426 const Register copied_oop = x7; // actual oop copied 1427 const Register r9_klass = x9; // oop._klass 1428 1429 //--------------------------------------------------------------- 1430 // Assembler stub will be used for this call to arraycopy 1431 // if the two arrays are subtypes of Object[] but the 1432 // destination array type is not equal to or a supertype 1433 // of the source type. Each element must be separately 1434 // checked. 
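    // Worked example (illustrative): if the 4th element fails the subtype
    // check, K = 3 elements were copied before the failure, the stub returns
    // x10 == -1 ^ 3 == -4, and the caller recovers K as ~x10.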
1435 1436 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1437 copied_oop, r9_klass, count_save); 1438 1439 __ align(CodeEntryAlignment); 1440 StubCodeMark mark(this, "StubRoutines", name); 1441 address start = __ pc(); 1442 1443 __ enter(); // required for proper stackwalking of RuntimeStub frame 1444 1445 // Caller of this entry point must set up the argument registers. 1446 if (entry != NULL) { 1447 *entry = __ pc(); 1448 BLOCK_COMMENT("Entry:"); 1449 } 1450 1451 // Empty array: Nothing to do 1452 __ beqz(count, L_done); 1453 1454 __ push_reg(RegSet::of(x7, x9, x18, x19), sp); 1455 1456 #ifdef ASSERT 1457 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1458 // The ckoff and ckval must be mutually consistent, 1459 // even though caller generates both. 1460 { Label L; 1461 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1462 __ lwu(start_to, Address(ckval, sco_offset)); 1463 __ beq(ckoff, start_to, L); 1464 __ stop("super_check_offset inconsistent"); 1465 __ bind(L); 1466 } 1467 #endif //ASSERT 1468 1469 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1470 bool is_oop = true; 1471 if (dest_uninitialized) { 1472 decorators |= IS_DEST_UNINITIALIZED; 1473 } 1474 1475 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1476 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1477 1478 // save the original count 1479 __ mv(count_save, count); 1480 1481 // Copy from low to high addresses 1482 __ mv(start_to, to); // Save destination array start address 1483 __ j(L_load_element); 1484 1485 // ======== begin loop ======== 1486 // (Loop is rotated; its entry is L_load_element.) 1487 // Loop control: 1488 // for count to 0 do 1489 // copied_oop = load_heap_oop(from++) 1490 // ... generate_type_check ... 1491 // store_heap_oop(to++, copied_oop) 1492 // end 1493 1494 __ align(OptoLoopAlignment); 1495 1496 __ BIND(L_store_element); 1497 __ store_heap_oop(Address(to, 0), copied_oop, noreg, noreg, AS_RAW); // store the oop 1498 __ add(to, to, UseCompressedOops ? 4 : 8); 1499 __ sub(count, count, 1); 1500 __ beqz(count, L_do_card_marks); 1501 1502 // ======== loop entry is here ======== 1503 __ BIND(L_load_element); 1504 __ load_heap_oop(copied_oop, Address(from, 0), noreg, noreg, AS_RAW); // load the oop 1505 __ add(from, from, UseCompressedOops ? 4 : 8); 1506 __ beqz(copied_oop, L_store_element); 1507 1508 __ load_klass(r9_klass, copied_oop);// query the object klass 1509 generate_type_check(r9_klass, ckoff, ckval, L_store_element); 1510 // ======== end loop ======== 1511 1512 // It was a real error; we must depend on the caller to finish the job. 1513 // Register count = remaining oops, count_orig = total oops. 1514 // Emit GC store barriers for the oops we have copied and report 1515 // their number to the caller. 1516 1517 __ sub(count, count_save, count); // K = partially copied oop count 1518 __ xori(count, count, -1); // report (-1^K) to caller 1519 __ beqz(count, L_done_pop); 1520 1521 __ BIND(L_do_card_marks); 1522 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs); 1523 1524 __ bind(L_done_pop); 1525 __ pop_reg(RegSet::of(x7, x9, x18, x19), sp); 1526 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1527 1528 __ bind(L_done); 1529 __ mv(x10, count); 1530 __ leave(); 1531 __ ret(); 1532 1533 return start; 1534 } 1535 1536 // Perform range checks on the proposed arraycopy. 
1537 // Kills temp, but nothing else. 1538 // Also, clean the sign bits of src_pos and dst_pos. 1539 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1540 Register src_pos, // source position (c_rarg1) 1541 Register dst, // destination array oo (c_rarg2) 1542 Register dst_pos, // destination position (c_rarg3) 1543 Register length, 1544 Register temp, 1545 Label& L_failed) { 1546 BLOCK_COMMENT("arraycopy_range_checks:"); 1547 1548 assert_different_registers(t0, temp); 1549 1550 // if [src_pos + length > arrayOop(src)->length()] then FAIL 1551 __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes())); 1552 __ addw(temp, length, src_pos); 1553 __ bgtu(temp, t0, L_failed); 1554 1555 // if [dst_pos + length > arrayOop(dst)->length()] then FAIL 1556 __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1557 __ addw(temp, length, dst_pos); 1558 __ bgtu(temp, t0, L_failed); 1559 1560 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1561 __ zero_extend(src_pos, src_pos, 32); 1562 __ zero_extend(dst_pos, dst_pos, 32); 1563 1564 BLOCK_COMMENT("arraycopy_range_checks done"); 1565 } 1566 1567 // 1568 // Generate 'unsafe' array copy stub 1569 // Though just as safe as the other stubs, it takes an unscaled 1570 // size_t argument instead of an element count. 1571 // 1572 // Input: 1573 // c_rarg0 - source array address 1574 // c_rarg1 - destination array address 1575 // c_rarg2 - byte count, treated as ssize_t, can be zero 1576 // 1577 // Examines the alignment of the operands and dispatches 1578 // to a long, int, short, or byte copy loop. 1579 // 1580 address generate_unsafe_copy(const char* name, 1581 address byte_copy_entry, 1582 address short_copy_entry, 1583 address int_copy_entry, 1584 address long_copy_entry) { 1585 assert_cond(byte_copy_entry != NULL && short_copy_entry != NULL && 1586 int_copy_entry != NULL && long_copy_entry != NULL); 1587 Label L_long_aligned, L_int_aligned, L_short_aligned; 1588 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1589 1590 __ align(CodeEntryAlignment); 1591 StubCodeMark mark(this, "StubRoutines", name); 1592 address start = __ pc(); 1593 __ enter(); // required for proper stackwalking of RuntimeStub frame 1594 1595 // bump this on entry, not on exit: 1596 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1597 1598 __ orr(t0, s, d); 1599 __ orr(t0, t0, count); 1600 1601 __ andi(t0, t0, BytesPerLong - 1); 1602 __ beqz(t0, L_long_aligned); 1603 __ andi(t0, t0, BytesPerInt - 1); 1604 __ beqz(t0, L_int_aligned); 1605 __ andi(t0, t0, 1); 1606 __ beqz(t0, L_short_aligned); 1607 __ j(RuntimeAddress(byte_copy_entry)); 1608 1609 __ BIND(L_short_aligned); 1610 __ srli(count, count, LogBytesPerShort); // size => short_count 1611 __ j(RuntimeAddress(short_copy_entry)); 1612 __ BIND(L_int_aligned); 1613 __ srli(count, count, LogBytesPerInt); // size => int_count 1614 __ j(RuntimeAddress(int_copy_entry)); 1615 __ BIND(L_long_aligned); 1616 __ srli(count, count, LogBytesPerLong); // size => long_count 1617 __ j(RuntimeAddress(long_copy_entry)); 1618 1619 return start; 1620 } 1621 1622 // 1623 // Generate generic array copy stubs 1624 // 1625 // Input: 1626 // c_rarg0 - src oop 1627 // c_rarg1 - src_pos (32-bits) 1628 // c_rarg2 - dst oop 1629 // c_rarg3 - dst_pos (32-bits) 1630 // c_rarg4 - element count (32-bits) 1631 // 1632 // Output: 1633 // x10 == 0 - success 1634 // x10 == -1^K - failure, where K is partial transfer count 1635 // 1636 address generate_generic_copy(const char* name, 1637 address 
byte_copy_entry, address short_copy_entry, 1638 address int_copy_entry, address oop_copy_entry, 1639 address long_copy_entry, address checkcast_copy_entry) { 1640 assert_cond(byte_copy_entry != NULL && short_copy_entry != NULL && 1641 int_copy_entry != NULL && oop_copy_entry != NULL && 1642 long_copy_entry != NULL && checkcast_copy_entry != NULL); 1643 Label L_failed, L_failed_0, L_objArray; 1644 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1645 1646 // Input registers 1647 const Register src = c_rarg0; // source array oop 1648 const Register src_pos = c_rarg1; // source position 1649 const Register dst = c_rarg2; // destination array oop 1650 const Register dst_pos = c_rarg3; // destination position 1651 const Register length = c_rarg4; 1652 1653 // Registers used as temps 1654 const Register dst_klass = c_rarg5; 1655 1656 __ align(CodeEntryAlignment); 1657 1658 StubCodeMark mark(this, "StubRoutines", name); 1659 1660 address start = __ pc(); 1661 1662 __ enter(); // required for proper stackwalking of RuntimeStub frame 1663 1664 // bump this on entry, not on exit: 1665 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1666 1667 //----------------------------------------------------------------------- 1668 // Assembler stub will be used for this call to arraycopy 1669 // if the following conditions are met: 1670 // 1671 // (1) src and dst must not be null. 1672 // (2) src_pos must not be negative. 1673 // (3) dst_pos must not be negative. 1674 // (4) length must not be negative. 1675 // (5) src klass and dst klass should be the same and not NULL. 1676 // (6) src and dst should be arrays. 1677 // (7) src_pos + length must not exceed length of src. 1678 // (8) dst_pos + length must not exceed length of dst. 1679 // 1680 1681 // if [src == NULL] then return -1 1682 __ beqz(src, L_failed); 1683 1684 // if [src_pos < 0] then return -1 1685 // i.e. sign bit set 1686 __ andi(t0, src_pos, 1UL << 31); 1687 __ bnez(t0, L_failed); 1688 1689 // if [dst == NULL] then return -1 1690 __ beqz(dst, L_failed); 1691 1692 // if [dst_pos < 0] then return -1 1693 // i.e. sign bit set 1694 __ andi(t0, dst_pos, 1UL << 31); 1695 __ bnez(t0, L_failed); 1696 1697 // registers used as temp 1698 const Register scratch_length = x28; // elements count to copy 1699 const Register scratch_src_klass = x29; // array klass 1700 const Register lh = x30; // layout helper 1701 1702 // if [length < 0] then return -1 1703 __ addw(scratch_length, length, zr); // length (elements count, 32-bits value) 1704 // i.e. sign bit set 1705 __ andi(t0, scratch_length, 1UL << 31); 1706 __ bnez(t0, L_failed); 1707 1708 __ load_klass(scratch_src_klass, src); 1709 #ifdef ASSERT 1710 { 1711 BLOCK_COMMENT("assert klasses not null {"); 1712 Label L1, L2; 1713 __ bnez(scratch_src_klass, L2); // it is broken if klass is NULL 1714 __ bind(L1); 1715 __ stop("broken null klass"); 1716 __ bind(L2); 1717 __ load_klass(t0, dst); 1718 __ beqz(t0, L1); // this would be broken also 1719 BLOCK_COMMENT("} assert klasses not null done"); 1720 } 1721 #endif 1722 1723 // Load layout helper (32-bits) 1724 // 1725 // |array_tag| | header_size | element_type | |log2_element_size| 1726 // 32 30 24 16 8 2 0 1727 // 1728 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 1729 // 1730 1731 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 1732 1733 // Handle objArrays completely differently... 
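    // All objArrays share a single layout helper value (array_tag 0x2 with
    // the T_OBJECT element type and heap-oop element size, per the diagram
    // above), so one equality compare against that constant routes Object[]
    // copies to the L_objArray path.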
1734 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 1735 __ lw(lh, Address(scratch_src_klass, lh_offset)); 1736 __ mvw(t0, objArray_lh); 1737 __ beq(lh, t0, L_objArray); 1738 1739 // if [src->klass() != dst->klass()] then return -1 1740 __ load_klass(t1, dst); 1741 __ bne(t1, scratch_src_klass, L_failed); 1742 1743 // if [src->is_Array() != NULL] then return -1 1744 // i.e. (lh >= 0) 1745 __ andi(t0, lh, 1UL << 31); 1746 __ beqz(t0, L_failed); 1747 1748 // At this point, it is known to be a typeArray (array_tag 0x3). 1749 #ifdef ASSERT 1750 { 1751 BLOCK_COMMENT("assert primitive array {"); 1752 Label L; 1753 __ mvw(t1, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 1754 __ bge(lh, t1, L); 1755 __ stop("must be a primitive array"); 1756 __ bind(L); 1757 BLOCK_COMMENT("} assert primitive array done"); 1758 } 1759 #endif 1760 1761 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1762 t1, L_failed); 1763 1764 // TypeArrayKlass 1765 // 1766 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize) 1767 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize) 1768 // 1769 1770 const Register t0_offset = t0; // array offset 1771 const Register x22_elsize = lh; // element size 1772 1773 // Get array_header_in_bytes() 1774 int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1); 1775 int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width; 1776 __ slli(t0_offset, lh, XLEN - lh_header_size_msb); // left shift to remove 24 ~ 32; 1777 __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset 1778 1779 __ add(src, src, t0_offset); // src array offset 1780 __ add(dst, dst, t0_offset); // dst array offset 1781 BLOCK_COMMENT("choose copy loop based on element size"); 1782 1783 // next registers should be set before the jump to corresponding stub 1784 const Register from = c_rarg0; // source array address 1785 const Register to = c_rarg1; // destination array address 1786 const Register count = c_rarg2; // elements count 1787 1788 // 'from', 'to', 'count' registers should be set in such order 1789 // since they are the same as 'src', 'src_pos', 'dst'. 1790 1791 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 1792 1793 // The possible values of elsize are 0-3, i.e. exact_log2(element 1794 // size in bytes). We do a simple bitwise binary search. 
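    // Concretely: bit 1 of elsize selects {int, long} over {byte, short},
    // and bit 0 then picks the wider element of the remaining pair, i.e.
    // elsize 0 -> byte copy, 1 -> short copy, 2 -> int copy, 3 -> long copy.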
1795 __ BIND(L_copy_bytes); 1796 __ andi(t0, x22_elsize, 2); 1797 __ bnez(t0, L_copy_ints); 1798 __ andi(t0, x22_elsize, 1); 1799 __ bnez(t0, L_copy_shorts); 1800 __ add(from, src, src_pos); // src_addr 1801 __ add(to, dst, dst_pos); // dst_addr 1802 __ addw(count, scratch_length, zr); // length 1803 __ j(RuntimeAddress(byte_copy_entry)); 1804 1805 __ BIND(L_copy_shorts); 1806 __ shadd(from, src_pos, src, t0, 1); // src_addr 1807 __ shadd(to, dst_pos, dst, t0, 1); // dst_addr 1808 __ addw(count, scratch_length, zr); // length 1809 __ j(RuntimeAddress(short_copy_entry)); 1810 1811 __ BIND(L_copy_ints); 1812 __ andi(t0, x22_elsize, 1); 1813 __ bnez(t0, L_copy_longs); 1814 __ shadd(from, src_pos, src, t0, 2); // src_addr 1815 __ shadd(to, dst_pos, dst, t0, 2); // dst_addr 1816 __ addw(count, scratch_length, zr); // length 1817 __ j(RuntimeAddress(int_copy_entry)); 1818 1819 __ BIND(L_copy_longs); 1820 #ifdef ASSERT 1821 { 1822 BLOCK_COMMENT("assert long copy {"); 1823 Label L; 1824 __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x22_elsize 1825 __ addw(lh, lh, zr); 1826 __ mvw(t0, LogBytesPerLong); 1827 __ beq(x22_elsize, t0, L); 1828 __ stop("must be long copy, but elsize is wrong"); 1829 __ bind(L); 1830 BLOCK_COMMENT("} assert long copy done"); 1831 } 1832 #endif 1833 __ shadd(from, src_pos, src, t0, 3); // src_addr 1834 __ shadd(to, dst_pos, dst, t0, 3); // dst_addr 1835 __ addw(count, scratch_length, zr); // length 1836 __ j(RuntimeAddress(long_copy_entry)); 1837 1838 // ObjArrayKlass 1839 __ BIND(L_objArray); 1840 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 1841 1842 Label L_plain_copy, L_checkcast_copy; 1843 // test array classes for subtyping 1844 __ load_klass(t2, dst); 1845 __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality 1846 1847 // Identically typed arrays can be copied without element-wise checks. 1848 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1849 t1, L_failed); 1850 1851 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop); 1852 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1853 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop); 1854 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1855 __ addw(count, scratch_length, zr); // length 1856 __ BIND(L_plain_copy); 1857 __ j(RuntimeAddress(oop_copy_entry)); 1858 1859 __ BIND(L_checkcast_copy); 1860 // live at this point: scratch_src_klass, scratch_length, t2 (dst_klass) 1861 { 1862 // Before looking at dst.length, make sure dst is also an objArray. 1863 __ lwu(t0, Address(t2, lh_offset)); 1864 __ mvw(t1, objArray_lh); 1865 __ bne(t0, t1, L_failed); 1866 1867 // It is safe to examine both src.length and dst.length. 1868 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1869 t2, L_failed); 1870 1871 __ load_klass(dst_klass, dst); // reload 1872 1873 // Marshal the base address arguments now, freeing registers. 1874 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop); 1875 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1876 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop); 1877 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1878 __ addw(count, length, zr); // length (reloaded) 1879 const Register sco_temp = c_rarg3; // this register is free now 1880 assert_different_registers(from, to, count, sco_temp, 1881 dst_klass, scratch_src_klass); 1882 1883 // Generate the type check. 
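    // (Roughly, in Java-level terms: is the source array's klass a subtype of
    //  the destination array's klass? If so, every element is already known to
    //  be storable and we branch to L_plain_copy; otherwise we fall through and
    //  let checkcast_copy test each element against dst's element klass.)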
1884 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1885 __ lwu(sco_temp, Address(dst_klass, sco_offset)); 1886 1887 // Smashes t0, t1 1888 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 1889 1890 // Fetch destination element klass from the ObjArrayKlass header. 1891 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 1892 __ ld(dst_klass, Address(dst_klass, ek_offset)); 1893 __ lwu(sco_temp, Address(dst_klass, sco_offset)); 1894 1895 // the checkcast_copy loop needs two extra arguments: 1896 assert(c_rarg3 == sco_temp, "#3 already in place"); 1897 // Set up arguments for checkcast_copy_entry. 1898 __ mv(c_rarg4, dst_klass); // dst.klass.element_klass 1899 __ j(RuntimeAddress(checkcast_copy_entry)); 1900 } 1901 1902 __ BIND(L_failed); 1903 __ li(x10, -1); 1904 __ leave(); // required for proper stackwalking of RuntimeStub frame 1905 __ ret(); 1906 1907 return start; 1908 } 1909 1910 // 1911 // Generate stub for array fill. If "aligned" is true, the 1912 // "to" address is assumed to be heapword aligned. 1913 // 1914 // Arguments for generated stub: 1915 // to: c_rarg0 1916 // value: c_rarg1 1917 // count: c_rarg2 treated as signed 1918 // 1919 address generate_fill(BasicType t, bool aligned, const char* name) { 1920 __ align(CodeEntryAlignment); 1921 StubCodeMark mark(this, "StubRoutines", name); 1922 address start = __ pc(); 1923 1924 BLOCK_COMMENT("Entry:"); 1925 1926 const Register to = c_rarg0; // source array address 1927 const Register value = c_rarg1; // value 1928 const Register count = c_rarg2; // elements count 1929 1930 const Register bz_base = x28; // base for block_zero routine 1931 const Register cnt_words = x29; // temp register 1932 const Register tmp_reg = t1; 1933 1934 __ enter(); 1935 1936 Label L_fill_elements, L_exit1; 1937 1938 int shift = -1; 1939 switch (t) { 1940 case T_BYTE: 1941 shift = 0; 1942 1943 // Zero extend value 1944 // 8 bit -> 16 bit 1945 __ andi(value, value, 0xff); 1946 __ mv(tmp_reg, value); 1947 __ slli(tmp_reg, tmp_reg, 8); 1948 __ orr(value, value, tmp_reg); 1949 1950 // 16 bit -> 32 bit 1951 __ mv(tmp_reg, value); 1952 __ slli(tmp_reg, tmp_reg, 16); 1953 __ orr(value, value, tmp_reg); 1954 1955 __ mv(tmp_reg, 8 >> shift); // Short arrays (< 8 bytes) fill by element 1956 __ bltu(count, tmp_reg, L_fill_elements); 1957 break; 1958 case T_SHORT: 1959 shift = 1; 1960 // Zero extend value 1961 // 16 bit -> 32 bit 1962 __ andi(value, value, 0xffff); 1963 __ mv(tmp_reg, value); 1964 __ slli(tmp_reg, tmp_reg, 16); 1965 __ orr(value, value, tmp_reg); 1966 1967 // Short arrays (< 8 bytes) fill by element 1968 __ mv(tmp_reg, 8 >> shift); 1969 __ bltu(count, tmp_reg, L_fill_elements); 1970 break; 1971 case T_INT: 1972 shift = 2; 1973 1974 // Short arrays (< 8 bytes) fill by element 1975 __ mv(tmp_reg, 8 >> shift); 1976 __ bltu(count, tmp_reg, L_fill_elements); 1977 break; 1978 default: ShouldNotReachHere(); 1979 } 1980 1981 // Align source address at 8 bytes address boundary. 1982 Label L_skip_align1, L_skip_align2, L_skip_align4; 1983 if (!aligned) { 1984 switch (t) { 1985 case T_BYTE: 1986 // One byte misalignment happens only for byte arrays. 1987 __ andi(t0, to, 1); 1988 __ beqz(t0, L_skip_align1); 1989 __ sb(value, Address(to, 0)); 1990 __ addi(to, to, 1); 1991 __ addiw(count, count, -1); 1992 __ bind(L_skip_align1); 1993 // Fallthrough 1994 case T_SHORT: 1995 // Two bytes misalignment happens only for byte and short (char) arrays. 
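        // (After the optional one-byte step above 'to' is at least 2-byte aligned;
        //  this step makes it 4-byte aligned and the T_INT step below completes the
        //  8-byte alignment, so fill_words can then operate on whole doublewords.)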
1996 __ andi(t0, to, 2); 1997 __ beqz(t0, L_skip_align2); 1998 __ sh(value, Address(to, 0)); 1999 __ addi(to, to, 2); 2000 __ addiw(count, count, -(2 >> shift)); 2001 __ bind(L_skip_align2); 2002 // Fallthrough 2003 case T_INT: 2004 // Align to 8 bytes, we know we are 4 byte aligned to start. 2005 __ andi(t0, to, 4); 2006 __ beqz(t0, L_skip_align4); 2007 __ sw(value, Address(to, 0)); 2008 __ addi(to, to, 4); 2009 __ addiw(count, count, -(4 >> shift)); 2010 __ bind(L_skip_align4); 2011 break; 2012 default: ShouldNotReachHere(); 2013 } 2014 } 2015 2016 // 2017 // Fill large chunks 2018 // 2019 __ srliw(cnt_words, count, 3 - shift); // number of words 2020 2021 // 32 bit -> 64 bit 2022 __ andi(value, value, 0xffffffff); 2023 __ mv(tmp_reg, value); 2024 __ slli(tmp_reg, tmp_reg, 32); 2025 __ orr(value, value, tmp_reg); 2026 2027 __ slli(tmp_reg, cnt_words, 3 - shift); 2028 __ subw(count, count, tmp_reg); 2029 { 2030 __ fill_words(to, cnt_words, value); 2031 } 2032 2033 // Remaining count is less than 8 bytes. Fill it by a single store. 2034 // Note that the total length is no less than 8 bytes. 2035 if (t == T_BYTE || t == T_SHORT) { 2036 __ beqz(count, L_exit1); 2037 __ shadd(to, count, to, tmp_reg, shift); // points to the end 2038 __ sd(value, Address(to, -8)); // overwrite some elements 2039 __ bind(L_exit1); 2040 __ leave(); 2041 __ ret(); 2042 } 2043 2044 // Handle copies less than 8 bytes. 2045 Label L_fill_2, L_fill_4, L_exit2; 2046 __ bind(L_fill_elements); 2047 switch (t) { 2048 case T_BYTE: 2049 __ andi(t0, count, 1); 2050 __ beqz(t0, L_fill_2); 2051 __ sb(value, Address(to, 0)); 2052 __ addi(to, to, 1); 2053 __ bind(L_fill_2); 2054 __ andi(t0, count, 2); 2055 __ beqz(t0, L_fill_4); 2056 __ sh(value, Address(to, 0)); 2057 __ addi(to, to, 2); 2058 __ bind(L_fill_4); 2059 __ andi(t0, count, 4); 2060 __ beqz(t0, L_exit2); 2061 __ sw(value, Address(to, 0)); 2062 break; 2063 case T_SHORT: 2064 __ andi(t0, count, 1); 2065 __ beqz(t0, L_fill_4); 2066 __ sh(value, Address(to, 0)); 2067 __ addi(to, to, 2); 2068 __ bind(L_fill_4); 2069 __ andi(t0, count, 2); 2070 __ beqz(t0, L_exit2); 2071 __ sw(value, Address(to, 0)); 2072 break; 2073 case T_INT: 2074 __ beqz(count, L_exit2); 2075 __ sw(value, Address(to, 0)); 2076 break; 2077 default: ShouldNotReachHere(); 2078 } 2079 __ bind(L_exit2); 2080 __ leave(); 2081 __ ret(); 2082 return start; 2083 } 2084 2085 void generate_arraycopy_stubs() { 2086 address entry = NULL; 2087 address entry_jbyte_arraycopy = NULL; 2088 address entry_jshort_arraycopy = NULL; 2089 address entry_jint_arraycopy = NULL; 2090 address entry_oop_arraycopy = NULL; 2091 address entry_jlong_arraycopy = NULL; 2092 address entry_checkcast_arraycopy = NULL; 2093 2094 generate_copy_longs(copy_f, c_rarg0, c_rarg1, t1, copy_forwards); 2095 generate_copy_longs(copy_b, c_rarg0, c_rarg1, t1, copy_backwards); 2096 2097 StubRoutines::riscv::_zero_blocks = generate_zero_blocks(); 2098 2099 //*** jbyte 2100 // Always need aligned and unaligned versions 2101 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2102 "jbyte_disjoint_arraycopy"); 2103 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2104 &entry_jbyte_arraycopy, 2105 "jbyte_arraycopy"); 2106 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2107 "arrayof_jbyte_disjoint_arraycopy"); 2108 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2109 "arrayof_jbyte_arraycopy"); 2110 2111 //*** jshort 
2112 // Always need aligned and unaligned versions 2113 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2114 "jshort_disjoint_arraycopy"); 2115 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2116 &entry_jshort_arraycopy, 2117 "jshort_arraycopy"); 2118 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2119 "arrayof_jshort_disjoint_arraycopy"); 2120 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2121 "arrayof_jshort_arraycopy"); 2122 2123 //*** jint 2124 // Aligned versions 2125 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2126 "arrayof_jint_disjoint_arraycopy"); 2127 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2128 "arrayof_jint_arraycopy"); 2129 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2130 // entry_jint_arraycopy always points to the unaligned version 2131 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2132 "jint_disjoint_arraycopy"); 2133 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2134 &entry_jint_arraycopy, 2135 "jint_arraycopy"); 2136 2137 //*** jlong 2138 // It is always aligned 2139 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2140 "arrayof_jlong_disjoint_arraycopy"); 2141 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2142 "arrayof_jlong_arraycopy"); 2143 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2144 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2145 2146 //*** oops 2147 { 2148 // With compressed oops we need unaligned versions; notice that 2149 // we overwrite entry_oop_arraycopy. 
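      // (With compressed oops an element is a 4-byte narrow oop, so even the
      //  "arrayof" entry points cannot assume 8-byte alignment of the first
      //  element; the unaligned variants are therefore generated in that case.)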
2150 bool aligned = !UseCompressedOops; 2151 2152 StubRoutines::_arrayof_oop_disjoint_arraycopy 2153 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2154 /*dest_uninitialized*/false); 2155 StubRoutines::_arrayof_oop_arraycopy 2156 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2157 /*dest_uninitialized*/false); 2158 // Aligned versions without pre-barriers 2159 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2160 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2161 /*dest_uninitialized*/true); 2162 StubRoutines::_arrayof_oop_arraycopy_uninit 2163 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2164 /*dest_uninitialized*/true); 2165 } 2166 2167 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2168 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2169 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2170 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2171 2172 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2173 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2174 /*dest_uninitialized*/true); 2175 2176 2177 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2178 entry_jbyte_arraycopy, 2179 entry_jshort_arraycopy, 2180 entry_jint_arraycopy, 2181 entry_jlong_arraycopy); 2182 2183 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2184 entry_jbyte_arraycopy, 2185 entry_jshort_arraycopy, 2186 entry_jint_arraycopy, 2187 entry_oop_arraycopy, 2188 entry_jlong_arraycopy, 2189 entry_checkcast_arraycopy); 2190 2191 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2192 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2193 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2194 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2195 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2196 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2197 } 2198 2199 // Safefetch stubs. 2200 void generate_safefetch(const char* name, int size, address* entry, 2201 address* fault_pc, address* continuation_pc) { 2202 // safefetch signatures: 2203 // int SafeFetch32(int* adr, int errValue) 2204 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue) 2205 // 2206 // arguments: 2207 // c_rarg0 = adr 2208 // c_rarg1 = errValue 2209 // 2210 // result: 2211 // PPC_RET = *adr or errValue 2212 assert_cond(entry != NULL && fault_pc != NULL && continuation_pc != NULL); 2213 StubCodeMark mark(this, "StubRoutines", name); 2214 2215 // Entry point, pc or function descriptor. 2216 *entry = __ pc(); 2217 2218 // Load *adr into c_rarg1, may fault. 
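    // (c_rarg1 already holds errValue on entry; if the load below faults, the
    //  signal handler resumes execution at continuation_pc with c_rarg1 left
    //  untouched, so the stub still returns errValue.)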
2219 *fault_pc = __ pc(); 2220 switch (size) { 2221 case 4: 2222 // int32_t 2223 __ lw(c_rarg1, Address(c_rarg0, 0)); 2224 break; 2225 case 8: 2226 // int64_t 2227 __ ld(c_rarg1, Address(c_rarg0, 0)); 2228 break; 2229 default: 2230 ShouldNotReachHere(); 2231 } 2232 2233 // return errValue or *adr 2234 *continuation_pc = __ pc(); 2235 __ mv(x10, c_rarg1); 2236 __ ret(); 2237 } 2238 2239 // code for comparing 16 bytes of strings with same encoding 2240 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 2241 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, tmp1 = x28, tmp2 = x29, tmp4 = x7, tmp5 = x31; 2242 __ ld(tmp5, Address(str1)); 2243 __ addi(str1, str1, 8); 2244 __ xorr(tmp4, tmp1, tmp2); 2245 __ ld(cnt1, Address(str2)); 2246 __ addi(str2, str2, 8); 2247 __ bnez(tmp4, DIFF1); 2248 __ ld(tmp1, Address(str1)); 2249 __ addi(str1, str1, 8); 2250 __ xorr(tmp4, tmp5, cnt1); 2251 __ ld(tmp2, Address(str2)); 2252 __ addi(str2, str2, 8); 2253 __ bnez(tmp4, DIFF2); 2254 } 2255 2256 // code for comparing 8 characters of strings with Latin1 and Utf16 encoding 2257 void compare_string_8_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 2258 Label &DIFF2) { 2259 const Register strU = x12, curU = x7, strL = x29, tmp = x30; 2260 __ ld(tmpL, Address(strL)); 2261 __ addi(strL, strL, 8); 2262 __ ld(tmpU, Address(strU)); 2263 __ addi(strU, strU, 8); 2264 __ inflate_lo32(tmp, tmpL); 2265 __ mv(t0, tmp); 2266 __ xorr(tmp, curU, t0); 2267 __ bnez(tmp, DIFF2); 2268 2269 __ ld(curU, Address(strU)); 2270 __ addi(strU, strU, 8); 2271 __ inflate_hi32(tmp, tmpL); 2272 __ mv(t0, tmp); 2273 __ xorr(tmp, tmpU, t0); 2274 __ bnez(tmp, DIFF1); 2275 } 2276 2277 // x10 = result 2278 // x11 = str1 2279 // x12 = cnt1 2280 // x13 = str2 2281 // x14 = cnt2 2282 // x28 = tmp1 2283 // x29 = tmp2 2284 // x30 = tmp3 2285 address generate_compare_long_string_different_encoding(bool isLU) { 2286 __ align(CodeEntryAlignment); 2287 StubCodeMark mark(this, "StubRoutines", isLU ? "compare_long_string_different_encoding LU" : "compare_long_string_different_encoding UL"); 2288 address entry = __ pc(); 2289 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 2290 DONE, CALCULATE_DIFFERENCE; 2291 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14, 2292 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31; 2293 RegSet spilled_regs = RegSet::of(tmp4, tmp5); 2294 2295 // cnt2 == amount of characters left to compare 2296 // Check already loaded first 4 symbols 2297 __ inflate_lo32(tmp3, isLU ? tmp1 : tmp2); 2298 __ mv(isLU ? tmp1 : tmp2, tmp3); 2299 __ addi(str1, str1, isLU ? wordSize / 2 : wordSize); 2300 __ addi(str2, str2, isLU ? wordSize : wordSize / 2); 2301 __ sub(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 2302 __ push_reg(spilled_regs, sp); 2303 2304 if (isLU) { 2305 __ add(str1, str1, cnt2); 2306 __ shadd(str2, cnt2, str2, t0, 1); 2307 } else { 2308 __ shadd(str1, cnt2, str1, t0, 1); 2309 __ add(str2, str2, cnt2); 2310 } 2311 __ xorr(tmp3, tmp1, tmp2); 2312 __ mv(tmp5, tmp2); 2313 __ bnez(tmp3, CALCULATE_DIFFERENCE); 2314 2315 Register strU = isLU ? str2 : str1, 2316 strL = isLU ? str1 : str2, 2317 tmpU = isLU ? tmp5 : tmp1, // where to keep U for comparison 2318 tmpL = isLU ? 
tmp1 : tmp5; // where to keep L for comparison 2319 2320 __ sub(tmp2, strL, cnt2); // strL pointer to load from 2321 __ slli(t0, cnt2, 1); 2322 __ sub(cnt1, strU, t0); // strU pointer to load from 2323 2324 __ ld(tmp4, Address(cnt1)); 2325 __ addi(cnt1, cnt1, 8); 2326 __ beqz(cnt2, LOAD_LAST); // no characters left except last load 2327 __ sub(cnt2, cnt2, 16); 2328 __ bltz(cnt2, TAIL); 2329 __ bind(SMALL_LOOP); // smaller loop 2330 __ sub(cnt2, cnt2, 16); 2331 compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2); 2332 compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2); 2333 __ bgez(cnt2, SMALL_LOOP); 2334 __ addi(t0, cnt2, 16); 2335 __ beqz(t0, LOAD_LAST); 2336 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 2337 // Address of 8 bytes before last 4 characters in UTF-16 string 2338 __ shadd(cnt1, cnt2, cnt1, t0, 1); 2339 // Address of 16 bytes before last 4 characters in Latin1 string 2340 __ add(tmp2, tmp2, cnt2); 2341 __ ld(tmp4, Address(cnt1, -8)); 2342 // last 16 characters before last load 2343 compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2); 2344 compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2); 2345 __ j(LOAD_LAST); 2346 __ bind(DIFF2); 2347 __ mv(tmpU, tmp4); 2348 __ bind(DIFF1); 2349 __ mv(tmpL, t0); 2350 __ j(CALCULATE_DIFFERENCE); 2351 __ bind(LOAD_LAST); 2352 // Last 4 UTF-16 characters are already pre-loaded into tmp4 by compare_string_8_x_LU. 2353 // No need to load it again 2354 __ mv(tmpU, tmp4); 2355 __ ld(tmpL, Address(strL)); 2356 __ inflate_lo32(tmp3, tmpL); 2357 __ mv(tmpL, tmp3); 2358 __ xorr(tmp3, tmpU, tmpL); 2359 __ beqz(tmp3, DONE); 2360 2361 // Find the first different characters in the longwords and 2362 // compute their difference. 2363 __ bind(CALCULATE_DIFFERENCE); 2364 __ ctzc_bit(tmp4, tmp3); 2365 __ srl(tmp1, tmp1, tmp4); 2366 __ srl(tmp5, tmp5, tmp4); 2367 __ andi(tmp1, tmp1, 0xFFFF); 2368 __ andi(tmp5, tmp5, 0xFFFF); 2369 __ sub(result, tmp1, tmp5); 2370 __ bind(DONE); 2371 __ pop_reg(spilled_regs, sp); 2372 __ ret(); 2373 return entry; 2374 } 2375 2376 address generate_method_entry_barrier() { 2377 __ align(CodeEntryAlignment); 2378 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 2379 2380 Label deoptimize_label; 2381 2382 address start = __ pc(); 2383 2384 __ set_last_Java_frame(sp, fp, ra, t0); 2385 2386 __ enter(); 2387 __ add(t1, sp, wordSize); 2388 2389 __ sub(sp, sp, 4 * wordSize); 2390 2391 __ push_call_clobbered_registers(); 2392 2393 __ mv(c_rarg0, t1); 2394 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 2395 2396 __ reset_last_Java_frame(true); 2397 2398 __ mv(t0, x10); 2399 2400 __ pop_call_clobbered_registers(); 2401 2402 __ bnez(t0, deoptimize_label); 2403 2404 __ leave(); 2405 __ ret(); 2406 2407 __ BIND(deoptimize_label); 2408 2409 __ ld(t0, Address(sp, 0)); 2410 __ ld(fp, Address(sp, wordSize)); 2411 __ ld(ra, Address(sp, wordSize * 2)); 2412 __ ld(t1, Address(sp, wordSize * 3)); 2413 2414 __ mv(sp, t0); 2415 __ jr(t1); 2416 2417 return start; 2418 } 2419 2420 // x10 = result 2421 // x11 = str1 2422 // x12 = cnt1 2423 // x13 = str2 2424 // x14 = cnt2 2425 // x28 = tmp1 2426 // x29 = tmp2 2427 // x30 = tmp3 2428 // x31 = tmp4 2429 address generate_compare_long_string_same_encoding(bool isLL) { 2430 __ align(CodeEntryAlignment); 2431 StubCodeMark mark(this, "StubRoutines", isLL ? 
2432 "compare_long_string_same_encoding LL" : "compare_long_string_same_encoding UU"); 2433 address entry = __ pc(); 2434 Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL, 2435 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF; 2436 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14, 2437 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31; 2438 RegSet spilled_regs = RegSet::of(tmp4, tmp5); 2439 2440 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 2441 // update cnt2 counter with already loaded 8 bytes 2442 __ sub(cnt2, cnt2, wordSize / (isLL ? 1 : 2)); 2443 // update pointers, because of previous read 2444 __ add(str1, str1, wordSize); 2445 __ add(str2, str2, wordSize); 2446 // less than 16 bytes left? 2447 __ sub(cnt2, cnt2, isLL ? 16 : 8); 2448 __ push_reg(spilled_regs, sp); 2449 __ bltz(cnt2, TAIL); 2450 __ bind(SMALL_LOOP); 2451 compare_string_16_bytes_same(DIFF, DIFF2); 2452 __ sub(cnt2, cnt2, isLL ? 16 : 8); 2453 __ bgez(cnt2, SMALL_LOOP); 2454 __ bind(TAIL); 2455 __ addi(cnt2, cnt2, isLL ? 16 : 8); 2456 __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); 2457 __ sub(cnt2, cnt2, isLL ? 8 : 4); 2458 __ blez(cnt2, CHECK_LAST); 2459 __ xorr(tmp4, tmp1, tmp2); 2460 __ bnez(tmp4, DIFF); 2461 __ ld(tmp1, Address(str1)); 2462 __ addi(str1, str1, 8); 2463 __ ld(tmp2, Address(str2)); 2464 __ addi(str2, str2, 8); 2465 __ sub(cnt2, cnt2, isLL ? 8 : 4); 2466 __ bind(CHECK_LAST); 2467 if (!isLL) { 2468 __ add(cnt2, cnt2, cnt2); // now in bytes 2469 } 2470 __ xorr(tmp4, tmp1, tmp2); 2471 __ bnez(tmp4, DIFF); 2472 __ add(str1, str1, cnt2); 2473 __ ld(tmp5, Address(str1)); 2474 __ add(str2, str2, cnt2); 2475 __ ld(cnt1, Address(str2)); 2476 __ xorr(tmp4, tmp5, cnt1); 2477 __ beqz(tmp4, LENGTH_DIFF); 2478 // Find the first different characters in the longwords and 2479 // compute their difference. 2480 __ bind(DIFF2); 2481 __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb 2482 __ srl(tmp5, tmp5, tmp3); 2483 __ srl(cnt1, cnt1, tmp3); 2484 if (isLL) { 2485 __ andi(tmp5, tmp5, 0xFF); 2486 __ andi(cnt1, cnt1, 0xFF); 2487 } else { 2488 __ andi(tmp5, tmp5, 0xFFFF); 2489 __ andi(cnt1, cnt1, 0xFFFF); 2490 } 2491 __ sub(result, tmp5, cnt1); 2492 __ j(LENGTH_DIFF); 2493 __ bind(DIFF); 2494 __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb 2495 __ srl(tmp1, tmp1, tmp3); 2496 __ srl(tmp2, tmp2, tmp3); 2497 if (isLL) { 2498 __ andi(tmp1, tmp1, 0xFF); 2499 __ andi(tmp2, tmp2, 0xFF); 2500 } else { 2501 __ andi(tmp1, tmp1, 0xFFFF); 2502 __ andi(tmp2, tmp2, 0xFFFF); 2503 } 2504 __ sub(result, tmp1, tmp2); 2505 __ j(LENGTH_DIFF); 2506 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 2507 __ xorr(tmp4, tmp1, tmp2); 2508 __ bnez(tmp4, DIFF); 2509 __ bind(LENGTH_DIFF); 2510 __ pop_reg(spilled_regs, sp); 2511 __ ret(); 2512 return entry; 2513 } 2514 2515 void generate_compare_long_strings() { 2516 StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(true); 2517 StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(false); 2518 StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(true); 2519 StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(false); 2520 } 2521 2522 // x10 result 2523 // x11 src 2524 // x12 src count 2525 // x13 pattern 2526 // x14 pattern count 2527 address generate_string_indexof_linear(bool needle_isL, bool haystack_isL) 2528 { 2529 const char* stubName = needle_isL 2530 ? (haystack_isL ? 
"indexof_linear_ll" : "indexof_linear_ul") 2531 : "indexof_linear_uu"; 2532 __ align(CodeEntryAlignment); 2533 StubCodeMark mark(this, "StubRoutines", stubName); 2534 address entry = __ pc(); 2535 2536 int needle_chr_size = needle_isL ? 1 : 2; 2537 int haystack_chr_size = haystack_isL ? 1 : 2; 2538 int needle_chr_shift = needle_isL ? 0 : 1; 2539 int haystack_chr_shift = haystack_isL ? 0 : 1; 2540 bool isL = needle_isL && haystack_isL; 2541 // parameters 2542 Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14; 2543 // temporary registers 2544 Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25; 2545 // redefinitions 2546 Register ch1 = x28, ch2 = x29; 2547 RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29); 2548 2549 __ push_reg(spilled_regs, sp); 2550 2551 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 2552 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 2553 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 2554 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 2555 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 2556 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 2557 2558 __ ld(ch1, Address(needle)); 2559 __ ld(ch2, Address(haystack)); 2560 // src.length - pattern.length 2561 __ sub(haystack_len, haystack_len, needle_len); 2562 2563 // first is needle[0] 2564 __ andi(first, ch1, needle_isL ? 0xFF : 0xFFFF, first); 2565 uint64_t mask0101 = UCONST64(0x0101010101010101); 2566 uint64_t mask0001 = UCONST64(0x0001000100010001); 2567 __ mv(mask1, haystack_isL ? mask0101 : mask0001); 2568 __ mul(first, first, mask1); 2569 uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f); 2570 uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff); 2571 __ mv(mask2, haystack_isL ? 
mask7f7f : mask7fff); 2572 if (needle_isL != haystack_isL) { 2573 __ mv(tmp, ch1); 2574 } 2575 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size - 1); 2576 __ blez(haystack_len, L_SMALL); 2577 2578 if (needle_isL != haystack_isL) { 2579 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros); 2580 } 2581 // xorr, sub, orr, notr, andr 2582 // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i] 2583 // eg: 2584 // first: aa aa aa aa aa aa aa aa 2585 // ch2: aa aa li nx jd ka aa aa 2586 // match_mask: 80 80 00 00 00 00 80 80 2587 __ compute_match_mask(ch2, first, match_mask, mask1, mask2); 2588 2589 // search first char of needle, if success, goto L_HAS_ZERO; 2590 __ bnez(match_mask, L_HAS_ZERO); 2591 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size); 2592 __ add(result, result, wordSize / haystack_chr_size); 2593 __ add(haystack, haystack, wordSize); 2594 __ bltz(haystack_len, L_POST_LOOP); 2595 2596 __ bind(L_LOOP); 2597 __ ld(ch2, Address(haystack)); 2598 __ compute_match_mask(ch2, first, match_mask, mask1, mask2); 2599 __ bnez(match_mask, L_HAS_ZERO); 2600 2601 __ bind(L_LOOP_PROCEED); 2602 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size); 2603 __ add(haystack, haystack, wordSize); 2604 __ add(result, result, wordSize / haystack_chr_size); 2605 __ bgez(haystack_len, L_LOOP); 2606 2607 __ bind(L_POST_LOOP); 2608 __ mv(ch2, -wordSize / haystack_chr_size); 2609 __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check 2610 __ ld(ch2, Address(haystack)); 2611 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift); 2612 __ neg(haystack_len, haystack_len); 2613 __ xorr(ch2, first, ch2); 2614 __ sub(match_mask, ch2, mask1); 2615 __ orr(ch2, ch2, mask2); 2616 __ mv(trailing_zeros, -1); // all bits set 2617 __ j(L_SMALL_PROCEED); 2618 2619 __ align(OptoLoopAlignment); 2620 __ bind(L_SMALL); 2621 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift); 2622 __ neg(haystack_len, haystack_len); 2623 if (needle_isL != haystack_isL) { 2624 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros); 2625 } 2626 __ xorr(ch2, first, ch2); 2627 __ sub(match_mask, ch2, mask1); 2628 __ orr(ch2, ch2, mask2); 2629 __ mv(trailing_zeros, -1); // all bits set 2630 2631 __ bind(L_SMALL_PROCEED); 2632 __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits. 2633 __ notr(ch2, ch2); 2634 __ andr(match_mask, match_mask, ch2); 2635 __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check 2636 __ beqz(match_mask, NOMATCH); 2637 2638 __ bind(L_SMALL_HAS_ZERO_LOOP); 2639 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, ch2, tmp); // count trailing zeros 2640 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2641 __ mv(ch2, wordSize / haystack_chr_size); 2642 __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2); 2643 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2644 __ mv(trailing_zeros, wordSize / haystack_chr_size); 2645 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2646 2647 __ bind(L_SMALL_CMP_LOOP); 2648 __ shadd(first, trailing_zeros, needle, first, needle_chr_shift); 2649 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift); 2650 needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first)); 2651 haystack_isL ? 
__ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2)); 2652 __ add(trailing_zeros, trailing_zeros, 1); 2653 __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP); 2654 __ beq(first, ch2, L_SMALL_CMP_LOOP); 2655 2656 __ bind(L_SMALL_CMP_LOOP_NOMATCH); 2657 __ beqz(match_mask, NOMATCH); 2658 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2); 2659 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2660 __ add(result, result, 1); 2661 __ add(haystack, haystack, haystack_chr_size); 2662 __ j(L_SMALL_HAS_ZERO_LOOP); 2663 2664 __ align(OptoLoopAlignment); 2665 __ bind(L_SMALL_CMP_LOOP_LAST_CMP); 2666 __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2667 __ j(DONE); 2668 2669 __ align(OptoLoopAlignment); 2670 __ bind(L_SMALL_CMP_LOOP_LAST_CMP2); 2671 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2672 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2673 __ j(DONE); 2674 2675 __ align(OptoLoopAlignment); 2676 __ bind(L_HAS_ZERO); 2677 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2); 2678 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2679 __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2); 2680 __ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits) 2681 __ sub(result, result, 1); // array index from 0, so result -= 1 2682 2683 __ bind(L_HAS_ZERO_LOOP); 2684 __ mv(needle_len, wordSize / haystack_chr_size); 2685 __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2); 2686 __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2); 2687 // load next 8 bytes from haystack, and increase result index 2688 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2689 __ add(result, result, 1); 2690 __ mv(trailing_zeros, wordSize / haystack_chr_size); 2691 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH); 2692 2693 // compare one char 2694 __ bind(L_CMP_LOOP); 2695 __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift); 2696 needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len)); 2697 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift); 2698 haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2)); 2699 __ add(trailing_zeros, trailing_zeros, 1); // next char index 2700 __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2); 2701 __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP); 2702 __ beq(needle_len, ch2, L_CMP_LOOP); 2703 2704 __ bind(L_CMP_LOOP_NOMATCH); 2705 __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH); 2706 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, needle_len, ch2); // find next "first" char index 2707 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2708 __ add(haystack, haystack, haystack_chr_size); 2709 __ j(L_HAS_ZERO_LOOP); 2710 2711 __ align(OptoLoopAlignment); 2712 __ bind(L_CMP_LOOP_LAST_CMP); 2713 __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH); 2714 __ j(DONE); 2715 2716 __ align(OptoLoopAlignment); 2717 __ bind(L_CMP_LOOP_LAST_CMP2); 2718 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2719 __ add(result, result, 1); 2720 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH); 2721 __ j(DONE); 2722 2723 __ align(OptoLoopAlignment); 2724 __ bind(L_HAS_ZERO_LOOP_NOMATCH); 2725 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 2726 // L_HAS_ZERO block. 
Byte octet was analyzed in L_HAS_ZERO_LOOP, 2727 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 2728 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 2729 // result by analyzed characters value, so, we can just reset lower bits 2730 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 2731 // 2) restore needle_len and haystack_len values from "compressed" haystack_len 2732 // 3) advance haystack value to represent next haystack octet. result & 7/3 is 2733 // index of last analyzed substring inside current octet. So, haystack in at 2734 // respective start address. We need to advance it to next octet 2735 __ andi(match_mask, result, wordSize / haystack_chr_size - 1); 2736 __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2); 2737 __ andi(result, result, haystack_isL ? -8 : -4); 2738 __ slli(tmp, match_mask, haystack_chr_shift); 2739 __ sub(haystack, haystack, tmp); 2740 __ addw(haystack_len, haystack_len, zr); 2741 __ j(L_LOOP_PROCEED); 2742 2743 __ align(OptoLoopAlignment); 2744 __ bind(NOMATCH); 2745 __ mv(result, -1); 2746 2747 __ bind(DONE); 2748 __ pop_reg(spilled_regs, sp); 2749 __ ret(); 2750 return entry; 2751 } 2752 2753 void generate_string_indexof_stubs() 2754 { 2755 StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 2756 StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 2757 StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 2758 } 2759 2760 #ifdef COMPILER2 2761 address generate_mulAdd() 2762 { 2763 __ align(CodeEntryAlignment); 2764 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 2765 2766 address entry = __ pc(); 2767 2768 const Register out = x10; 2769 const Register in = x11; 2770 const Register offset = x12; 2771 const Register len = x13; 2772 const Register k = x14; 2773 const Register tmp = x28; 2774 2775 BLOCK_COMMENT("Entry:"); 2776 __ enter(); 2777 __ mul_add(out, in, offset, len, k, tmp); 2778 __ leave(); 2779 __ ret(); 2780 2781 return entry; 2782 } 2783 2784 /** 2785 * Arguments: 2786 * 2787 * Input: 2788 * c_rarg0 - x address 2789 * c_rarg1 - x length 2790 * c_rarg2 - y address 2791 * c_rarg3 - y length 2792 * c_rarg4 - z address 2793 * c_rarg5 - z length 2794 */ 2795 address generate_multiplyToLen() 2796 { 2797 __ align(CodeEntryAlignment); 2798 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 2799 address entry = __ pc(); 2800 2801 const Register x = x10; 2802 const Register xlen = x11; 2803 const Register y = x12; 2804 const Register ylen = x13; 2805 const Register z = x14; 2806 const Register zlen = x15; 2807 2808 const Register tmp1 = x16; 2809 const Register tmp2 = x17; 2810 const Register tmp3 = x7; 2811 const Register tmp4 = x28; 2812 const Register tmp5 = x29; 2813 const Register tmp6 = x30; 2814 const Register tmp7 = x31; 2815 2816 BLOCK_COMMENT("Entry:"); 2817 __ enter(); // required for proper stackwalking of RuntimeStub frame 2818 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 2819 __ leave(); // required for proper stackwalking of RuntimeStub frame 2820 __ ret(); 2821 2822 return entry; 2823 } 2824 2825 address generate_squareToLen() 2826 { 2827 __ align(CodeEntryAlignment); 2828 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 2829 address entry = __ pc(); 2830 2831 const Register x = x10; 2832 const Register xlen = x11; 2833 const Register z = x12; 2834 const Register zlen = x13; 2835 const 
Register y = x14; // == x 2836 const Register ylen = x15; // == xlen 2837 2838 const Register tmp1 = x16; 2839 const Register tmp2 = x17; 2840 const Register tmp3 = x7; 2841 const Register tmp4 = x28; 2842 const Register tmp5 = x29; 2843 const Register tmp6 = x30; 2844 const Register tmp7 = x31; 2845 2846 BLOCK_COMMENT("Entry:"); 2847 __ enter(); 2848 __ mv(y, x); 2849 __ mv(ylen, xlen); 2850 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 2851 __ leave(); 2852 __ ret(); 2853 2854 return entry; 2855 } 2856 2857 // Arguments: 2858 // 2859 // Input: 2860 // c_rarg0 - newArr address 2861 // c_rarg1 - oldArr address 2862 // c_rarg2 - newIdx 2863 // c_rarg3 - shiftCount 2864 // c_rarg4 - numIter 2865 // 2866 address generate_bigIntegerLeftShift() { 2867 __ align(CodeEntryAlignment); 2868 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 2869 address entry = __ pc(); 2870 2871 Label loop, exit; 2872 2873 Register newArr = c_rarg0; 2874 Register oldArr = c_rarg1; 2875 Register newIdx = c_rarg2; 2876 Register shiftCount = c_rarg3; 2877 Register numIter = c_rarg4; 2878 2879 Register shiftRevCount = c_rarg5; 2880 Register oldArrNext = t1; 2881 2882 __ beqz(numIter, exit); 2883 __ shadd(newArr, newIdx, newArr, t0, 2); 2884 2885 __ li(shiftRevCount, 32); 2886 __ sub(shiftRevCount, shiftRevCount, shiftCount); 2887 2888 __ bind(loop); 2889 __ addi(oldArrNext, oldArr, 4); 2890 __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4); 2891 __ vle32_v(v0, oldArr); 2892 __ vle32_v(v4, oldArrNext); 2893 __ vsll_vx(v0, v0, shiftCount); 2894 __ vsrl_vx(v4, v4, shiftRevCount); 2895 __ vor_vv(v0, v0, v4); 2896 __ vse32_v(v0, newArr); 2897 __ sub(numIter, numIter, t0); 2898 __ shadd(oldArr, t0, oldArr, t1, 2); 2899 __ shadd(newArr, t0, newArr, t1, 2); 2900 __ bnez(numIter, loop); 2901 2902 __ bind(exit); 2903 __ ret(); 2904 2905 return entry; 2906 } 2907 2908 // Arguments: 2909 // 2910 // Input: 2911 // c_rarg0 - newArr address 2912 // c_rarg1 - oldArr address 2913 // c_rarg2 - newIdx 2914 // c_rarg3 - shiftCount 2915 // c_rarg4 - numIter 2916 // 2917 address generate_bigIntegerRightShift() { 2918 __ align(CodeEntryAlignment); 2919 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 2920 address entry = __ pc(); 2921 2922 Label loop, exit; 2923 2924 Register newArr = c_rarg0; 2925 Register oldArr = c_rarg1; 2926 Register newIdx = c_rarg2; 2927 Register shiftCount = c_rarg3; 2928 Register numIter = c_rarg4; 2929 Register idx = numIter; 2930 2931 Register shiftRevCount = c_rarg5; 2932 Register oldArrNext = c_rarg6; 2933 Register newArrCur = t0; 2934 Register oldArrCur = t1; 2935 2936 __ beqz(idx, exit); 2937 __ shadd(newArr, newIdx, newArr, t0, 2); 2938 2939 __ li(shiftRevCount, 32); 2940 __ sub(shiftRevCount, shiftRevCount, shiftCount); 2941 2942 __ bind(loop); 2943 __ vsetvli(t0, idx, Assembler::e32, Assembler::m4); 2944 __ sub(idx, idx, t0); 2945 __ shadd(oldArrNext, idx, oldArr, t1, 2); 2946 __ shadd(newArrCur, idx, newArr, t1, 2); 2947 __ addi(oldArrCur, oldArrNext, 4); 2948 __ vle32_v(v0, oldArrCur); 2949 __ vle32_v(v4, oldArrNext); 2950 __ vsrl_vx(v0, v0, shiftCount); 2951 __ vsll_vx(v4, v4, shiftRevCount); 2952 __ vor_vv(v0, v0, v4); 2953 __ vse32_v(v0, newArrCur); 2954 __ bnez(idx, loop); 2955 2956 __ bind(exit); 2957 __ ret(); 2958 2959 return entry; 2960 } 2961 #endif 2962 2963 #ifdef COMPILER2 2964 class MontgomeryMultiplyGenerator : public MacroAssembler { 2965 2966 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, 
Rb, Rm, Rn, 2967 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj; 2968 2969 RegSet _toSave; 2970 bool _squaring; 2971 2972 public: 2973 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 2974 : MacroAssembler(as->code()), _squaring(squaring) { 2975 2976 // Register allocation 2977 2978 Register reg = c_rarg0; 2979 Pa_base = reg; // Argument registers 2980 if (squaring) { 2981 Pb_base = Pa_base; 2982 } else { 2983 Pb_base = ++reg; 2984 } 2985 Pn_base = ++reg; 2986 Rlen= ++reg; 2987 inv = ++reg; 2988 Pm_base = ++reg; 2989 2990 // Working registers: 2991 Ra = ++reg; // The current digit of a, b, n, and m. 2992 Rb = ++reg; 2993 Rm = ++reg; 2994 Rn = ++reg; 2995 2996 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 2997 Pb = ++reg; 2998 Pm = ++reg; 2999 Pn = ++reg; 3000 3001 tmp0 = ++reg; // Three registers which form a 3002 tmp1 = ++reg; // triple-precision accumuator. 3003 tmp2 = ++reg; 3004 3005 Ri = x6; // Inner and outer loop indexes. 3006 Rj = x7; 3007 3008 Rhi_ab = x28; // Product registers: low and high parts 3009 Rlo_ab = x29; // of a*b and m*n. 3010 Rhi_mn = x30; 3011 Rlo_mn = x31; 3012 3013 // x18 and up are callee-saved. 3014 _toSave = RegSet::range(x18, reg) + Pm_base; 3015 } 3016 3017 private: 3018 void save_regs() { 3019 push_reg(_toSave, sp); 3020 } 3021 3022 void restore_regs() { 3023 pop_reg(_toSave, sp); 3024 } 3025 3026 template <typename T> 3027 void unroll_2(Register count, T block) { 3028 Label loop, end, odd; 3029 beqz(count, end); 3030 andi(t0, count, 0x1); 3031 bnez(t0, odd); 3032 align(16); 3033 bind(loop); 3034 (this->*block)(); 3035 bind(odd); 3036 (this->*block)(); 3037 addi(count, count, -2); 3038 bgtz(count, loop); 3039 bind(end); 3040 } 3041 3042 template <typename T> 3043 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 3044 Label loop, end, odd; 3045 beqz(count, end); 3046 andi(tmp, count, 0x1); 3047 bnez(tmp, odd); 3048 align(16); 3049 bind(loop); 3050 (this->*block)(d, s, tmp); 3051 bind(odd); 3052 (this->*block)(d, s, tmp); 3053 addi(count, count, -2); 3054 bgtz(count, loop); 3055 bind(end); 3056 } 3057 3058 void pre1(RegisterOrConstant i) { 3059 block_comment("pre1"); 3060 // Pa = Pa_base; 3061 // Pb = Pb_base + i; 3062 // Pm = Pm_base; 3063 // Pn = Pn_base + i; 3064 // Ra = *Pa; 3065 // Rb = *Pb; 3066 // Rm = *Pm; 3067 // Rn = *Pn; 3068 if (i.is_register()) { 3069 slli(t0, i.as_register(), LogBytesPerWord); 3070 } else { 3071 mv(t0, i.as_constant()); 3072 slli(t0, t0, LogBytesPerWord); 3073 } 3074 3075 mv(Pa, Pa_base); 3076 add(Pb, Pb_base, t0); 3077 mv(Pm, Pm_base); 3078 add(Pn, Pn_base, t0); 3079 3080 ld(Ra, Address(Pa)); 3081 ld(Rb, Address(Pb)); 3082 ld(Rm, Address(Pm)); 3083 ld(Rn, Address(Pn)); 3084 3085 // Zero the m*n result. 3086 mv(Rhi_mn, zr); 3087 mv(Rlo_mn, zr); 3088 } 3089 3090 // The core multiply-accumulate step of a Montgomery 3091 // multiplication. The idea is to schedule operations as a 3092 // pipeline so that instructions with long latencies (loads and 3093 // multiplies) have time to complete before their results are 3094 // used. This most benefits in-order implementations of the 3095 // architecture but out-of-order ones also benefit. 
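    // Illustrative only (plain C, not generated code), assuming 128-bit integer
    // support: the triple-word accumulator tmp2:tmp1:tmp0 absorbs the full
    // 128-bit product of two 64-bit digits, i.e. MACC(a, b) is
    //
    //   unsigned __int128 p   = (unsigned __int128)a * b;       // mulhu/mul pair below
    //   unsigned __int128 lo  = (unsigned __int128)tmp0 + (uint64_t)p;
    //   tmp0  = (uint64_t)lo;
    //   unsigned __int128 mid = (unsigned __int128)tmp1 + (uint64_t)(p >> 64) + (uint64_t)(lo >> 64);
    //   tmp1  = (uint64_t)mid;
    //   tmp2 += (uint64_t)(mid >> 64);
    //
    // which acc() below implements with cad/cadc/adc and the t0 carry register.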
3096 void step() { 3097 block_comment("step"); 3098 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3099 // Ra = *++Pa; 3100 // Rb = *--Pb; 3101 mulhu(Rhi_ab, Ra, Rb); 3102 mul(Rlo_ab, Ra, Rb); 3103 addi(Pa, Pa, wordSize); 3104 ld(Ra, Address(Pa)); 3105 addi(Pb, Pb, -wordSize); 3106 ld(Rb, Address(Pb)); 3107 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the 3108 // previous iteration. 3109 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3110 // Rm = *++Pm; 3111 // Rn = *--Pn; 3112 mulhu(Rhi_mn, Rm, Rn); 3113 mul(Rlo_mn, Rm, Rn); 3114 addi(Pm, Pm, wordSize); 3115 ld(Rm, Address(Pm)); 3116 addi(Pn, Pn, -wordSize); 3117 ld(Rn, Address(Pn)); 3118 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3119 } 3120 3121 void post1() { 3122 block_comment("post1"); 3123 3124 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3125 // Ra = *++Pa; 3126 // Rb = *--Pb; 3127 mulhu(Rhi_ab, Ra, Rb); 3128 mul(Rlo_ab, Ra, Rb); 3129 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3130 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3131 3132 // *Pm = Rm = tmp0 * inv; 3133 mul(Rm, tmp0, inv); 3134 sd(Rm, Address(Pm)); 3135 3136 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3137 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3138 mulhu(Rhi_mn, Rm, Rn); 3139 3140 #ifndef PRODUCT 3141 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply"); 3142 { 3143 mul(Rlo_mn, Rm, Rn); 3144 add(Rlo_mn, tmp0, Rlo_mn); 3145 Label ok; 3146 beqz(Rlo_mn, ok); 3147 stop("broken Montgomery multiply"); 3148 bind(ok); 3149 } 3150 #endif 3151 // We have very carefully set things up so that 3152 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate 3153 // the lower half of Rm * Rn because we know the result already: 3154 // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff 3155 // tmp0 != 0. So, rather than do a mul and an cad we just set 3156 // the carry flag iff tmp0 is nonzero. 3157 // 3158 // mul(Rlo_mn, Rm, Rn); 3159 // cad(zr, tmp0, Rlo_mn); 3160 addi(t0, tmp0, -1); 3161 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero 3162 cadc(tmp0, tmp1, Rhi_mn, t0); 3163 adc(tmp1, tmp2, zr, t0); 3164 mv(tmp2, zr); 3165 } 3166 3167 void pre2(Register i, Register len) { 3168 block_comment("pre2"); 3169 // Pa = Pa_base + i-len; 3170 // Pb = Pb_base + len; 3171 // Pm = Pm_base + i-len; 3172 // Pn = Pn_base + len; 3173 3174 sub(Rj, i, len); 3175 // Rj == i-len 3176 3177 // Ra as temp register 3178 slli(Ra, Rj, LogBytesPerWord); 3179 add(Pa, Pa_base, Ra); 3180 add(Pm, Pm_base, Ra); 3181 slli(Ra, len, LogBytesPerWord); 3182 add(Pb, Pb_base, Ra); 3183 add(Pn, Pn_base, Ra); 3184 3185 // Ra = *++Pa; 3186 // Rb = *--Pb; 3187 // Rm = *++Pm; 3188 // Rn = *--Pn; 3189 add(Pa, Pa, wordSize); 3190 ld(Ra, Address(Pa)); 3191 add(Pb, Pb, -wordSize); 3192 ld(Rb, Address(Pb)); 3193 add(Pm, Pm, wordSize); 3194 ld(Rm, Address(Pm)); 3195 add(Pn, Pn, -wordSize); 3196 ld(Rn, Address(Pn)); 3197 3198 mv(Rhi_mn, zr); 3199 mv(Rlo_mn, zr); 3200 } 3201 3202 void post2(Register i, Register len) { 3203 block_comment("post2"); 3204 sub(Rj, i, len); 3205 3206 cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part 3207 3208 // As soon as we know the least significant digit of our result, 3209 // store it. 
3210 // Pm_base[i-len] = tmp0; 3211 // Rj as temp register 3212 slli(Rj, Rj, LogBytesPerWord); 3213 add(Rj, Pm_base, Rj); 3214 sd(tmp0, Address(Rj)); 3215 3216 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3217 cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part 3218 adc(tmp1, tmp2, zr, t0); 3219 mv(tmp2, zr); 3220 } 3221 3222 // A carry in tmp0 after Montgomery multiplication means that we 3223 // should subtract multiples of n from our result in m. We'll 3224 // keep doing that until there is no carry. 3225 void normalize(Register len) { 3226 block_comment("normalize"); 3227 // while (tmp0) 3228 // tmp0 = sub(Pm_base, Pn_base, tmp0, len); 3229 Label loop, post, again; 3230 Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now 3231 beqz(tmp0, post); { 3232 bind(again); { 3233 mv(i, zr); 3234 mv(cnt, len); 3235 slli(Rn, i, LogBytesPerWord); 3236 add(Rm, Pm_base, Rn); 3237 ld(Rm, Address(Rm)); 3238 add(Rn, Pn_base, Rn); 3239 ld(Rn, Address(Rn)); 3240 li(t0, 1); // set carry flag, i.e. no borrow 3241 align(16); 3242 bind(loop); { 3243 notr(Rn, Rn); 3244 add(Rm, Rm, t0); 3245 add(Rm, Rm, Rn); 3246 sltu(t0, Rm, Rn); 3247 slli(Rn, i, LogBytesPerWord); // Rn as temp register 3248 add(Rn, Pm_base, Rn); 3249 sd(Rm, Address(Rn)); 3250 add(i, i, 1); 3251 slli(Rn, i, LogBytesPerWord); 3252 add(Rm, Pm_base, Rn); 3253 ld(Rm, Address(Rm)); 3254 add(Rn, Pn_base, Rn); 3255 ld(Rn, Address(Rn)); 3256 sub(cnt, cnt, 1); 3257 } bnez(cnt, loop); 3258 addi(tmp0, tmp0, -1); 3259 add(tmp0, tmp0, t0); 3260 } bnez(tmp0, again); 3261 } bind(post); 3262 } 3263 3264 // Move memory at s to d, reversing words. 3265 // Increments d to end of copied memory 3266 // Destroys tmp1, tmp2 3267 // Preserves len 3268 // Leaves s pointing to the address which was in d at start 3269 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 3270 assert(tmp1 < x28 && tmp2 < x28, "register corruption"); 3271 3272 slli(tmp1, len, LogBytesPerWord); 3273 add(s, s, tmp1); 3274 mv(tmp1, len); 3275 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 3276 slli(tmp1, len, LogBytesPerWord); 3277 sub(s, d, tmp1); 3278 } 3279 // [63...0] -> [31...0][63...32] 3280 void reverse1(Register d, Register s, Register tmp) { 3281 addi(s, s, -wordSize); 3282 ld(tmp, Address(s)); 3283 ror_imm(tmp, tmp, 32, t0); 3284 sd(tmp, Address(d)); 3285 addi(d, d, wordSize); 3286 } 3287 3288 void step_squaring() { 3289 // An extra ACC 3290 step(); 3291 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3292 } 3293 3294 void last_squaring(Register i) { 3295 Label dont; 3296 // if ((i & 1) == 0) { 3297 andi(t0, i, 0x1); 3298 bnez(t0, dont); { 3299 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3300 // Ra = *++Pa; 3301 // Rb = *--Pb; 3302 mulhu(Rhi_ab, Ra, Rb); 3303 mul(Rlo_ab, Ra, Rb); 3304 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3305 } bind(dont); 3306 } 3307 3308 void extra_step_squaring() { 3309 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3310 3311 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3312 // Rm = *++Pm; 3313 // Rn = *--Pn; 3314 mulhu(Rhi_mn, Rm, Rn); 3315 mul(Rlo_mn, Rm, Rn); 3316 addi(Pm, Pm, wordSize); 3317 ld(Rm, Address(Pm)); 3318 addi(Pn, Pn, -wordSize); 3319 ld(Rn, Address(Pn)); 3320 } 3321 3322 void post1_squaring() { 3323 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3324 3325 // *Pm = Rm = tmp0 * inv; 3326 mul(Rm, tmp0, inv); 3327 sd(Rm, Address(Pm)); 3328 3329 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3330 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3331 mulhu(Rhi_mn, Rm, Rn); 3332 3333 #ifndef PRODUCT 
3334 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply"); 3335 { 3336 mul(Rlo_mn, Rm, Rn); 3337 add(Rlo_mn, tmp0, Rlo_mn); 3338 Label ok; 3339 beqz(Rlo_mn, ok); { 3340 stop("broken Montgomery multiply"); 3341 } bind(ok); 3342 } 3343 #endif 3344 // We have very carefully set things up so that 3345 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate 3346 // the lower half of Rm * Rn because we know the result already: 3347 // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff 3348 // tmp0 != 0. So, rather than do a mul and a cad we just set 3349 // the carry flag iff tmp0 is nonzero. 3350 // 3351 // mul(Rlo_mn, Rm, Rn); 3352 // cad(zr, tmp, Rlo_mn); 3353 addi(t0, tmp0, -1); 3354 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero 3355 cadc(tmp0, tmp1, Rhi_mn, t0); 3356 adc(tmp1, tmp2, zr, t0); 3357 mv(tmp2, zr); 3358 } 3359 3360 // use t0 as carry 3361 void acc(Register Rhi, Register Rlo, 3362 Register tmp0, Register tmp1, Register tmp2) { 3363 cad(tmp0, tmp0, Rlo, t0); 3364 cadc(tmp1, tmp1, Rhi, t0); 3365 adc(tmp2, tmp2, zr, t0); 3366 } 3367 3368 public: 3369 /** 3370 * Fast Montgomery multiplication. The derivation of the 3371 * algorithm is in A Cryptographic Library for the Motorola 3372 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3373 * 3374 * Arguments: 3375 * 3376 * Inputs for multiplication: 3377 * c_rarg0 - int array elements a 3378 * c_rarg1 - int array elements b 3379 * c_rarg2 - int array elements n (the modulus) 3380 * c_rarg3 - int length 3381 * c_rarg4 - int inv 3382 * c_rarg5 - int array elements m (the result) 3383 * 3384 * Inputs for squaring: 3385 * c_rarg0 - int array elements a 3386 * c_rarg1 - int array elements n (the modulus) 3387 * c_rarg2 - int length 3388 * c_rarg3 - int inv 3389 * c_rarg4 - int array elements m (the result) 3390 * 3391 */ 3392 address generate_multiply() { 3393 Label argh, nothing; 3394 bind(argh); 3395 stop("MontgomeryMultiply total_allocation must be <= 8192"); 3396 3397 align(CodeEntryAlignment); 3398 address entry = pc(); 3399 3400 beqz(Rlen, nothing); 3401 3402 enter(); 3403 3404 // Make room. 3405 li(Ra, 512); 3406 bgt(Rlen, Ra, argh); 3407 slli(Ra, Rlen, exact_log2(4 * sizeof(jint))); 3408 sub(Ra, sp, Ra); 3409 andi(sp, Ra, -2 * wordSize); 3410 3411 srliw(Rlen, Rlen, 1); // length in longwords = len/2 3412 3413 { 3414 // Copy input args, reversing as we go. We use Ra as a 3415 // temporary variable. 3416 reverse(Ra, Pa_base, Rlen, Ri, Rj); 3417 if (!_squaring) 3418 reverse(Ra, Pb_base, Rlen, Ri, Rj); 3419 reverse(Ra, Pn_base, Rlen, Ri, Rj); 3420 } 3421 3422 // Push all call-saved registers and also Pm_base which we'll need 3423 // at the end. 
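    // (Pm_base is included in _toSave: the code below repoints it at the scratch
    //  area on the stack, and restore_regs() at the end brings back the caller's
    //  buffer so the reversed result can be copied out into it.)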
3424 save_regs(); 3425 3426 #ifndef PRODUCT 3427 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 3428 { 3429 ld(Rn, Address(Pn_base)); 3430 mul(Rlo_mn, Rn, inv); 3431 li(t0, -1); 3432 Label ok; 3433 beq(Rlo_mn, t0, ok); 3434 stop("broken inverse in Montgomery multiply"); 3435 bind(ok); 3436 } 3437 #endif 3438 3439 mv(Pm_base, Ra); 3440 3441 mv(tmp0, zr); 3442 mv(tmp1, zr); 3443 mv(tmp2, zr); 3444 3445 block_comment("for (int i = 0; i < len; i++) {"); 3446 mv(Ri, zr); { 3447 Label loop, end; 3448 bge(Ri, Rlen, end); 3449 3450 bind(loop); 3451 pre1(Ri); 3452 3453 block_comment(" for (j = i; j; j--) {"); { 3454 mv(Rj, Ri); 3455 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3456 } block_comment(" } // j"); 3457 3458 post1(); 3459 addw(Ri, Ri, 1); 3460 blt(Ri, Rlen, loop); 3461 bind(end); 3462 block_comment("} // i"); 3463 } 3464 3465 block_comment("for (int i = len; i < 2*len; i++) {"); 3466 mv(Ri, Rlen); { 3467 Label loop, end; 3468 slli(t0, Rlen, 1); 3469 bge(Ri, t0, end); 3470 3471 bind(loop); 3472 pre2(Ri, Rlen); 3473 3474 block_comment(" for (j = len*2-i-1; j; j--) {"); { 3475 slliw(Rj, Rlen, 1); 3476 subw(Rj, Rj, Ri); 3477 subw(Rj, Rj, 1); 3478 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3479 } block_comment(" } // j"); 3480 3481 post2(Ri, Rlen); 3482 addw(Ri, Ri, 1); 3483 slli(t0, Rlen, 1); 3484 blt(Ri, t0, loop); 3485 bind(end); 3486 } 3487 block_comment("} // i"); 3488 3489 normalize(Rlen); 3490 3491 mv(Ra, Pm_base); // Save Pm_base in Ra 3492 restore_regs(); // Restore caller's Pm_base 3493 3494 // Copy our result into caller's Pm_base 3495 reverse(Pm_base, Ra, Rlen, Ri, Rj); 3496 3497 leave(); 3498 bind(nothing); 3499 ret(); 3500 3501 return entry; 3502 } 3503 3504 /** 3505 * 3506 * Arguments: 3507 * 3508 * Inputs: 3509 * c_rarg0 - int array elements a 3510 * c_rarg1 - int array elements n (the modulus) 3511 * c_rarg2 - int length 3512 * c_rarg3 - int inv 3513 * c_rarg4 - int array elements m (the result) 3514 * 3515 */ 3516 address generate_square() { 3517 Label argh; 3518 bind(argh); 3519 stop("MontgomeryMultiply total_allocation must be <= 8192"); 3520 3521 align(CodeEntryAlignment); 3522 address entry = pc(); 3523 3524 enter(); 3525 3526 // Make room. 3527 li(Ra, 512); 3528 bgt(Rlen, Ra, argh); 3529 slli(Ra, Rlen, exact_log2(4 * sizeof(jint))); 3530 sub(Ra, sp, Ra); 3531 andi(sp, Ra, -2 * wordSize); 3532 3533 srliw(Rlen, Rlen, 1); // length in longwords = len/2 3534 3535 { 3536 // Copy input args, reversing as we go. We use Ra as a 3537 // temporary variable. 3538 reverse(Ra, Pa_base, Rlen, Ri, Rj); 3539 reverse(Ra, Pn_base, Rlen, Ri, Rj); 3540 } 3541 3542 // Push all call-saved registers and also Pm_base which we'll need 3543 // at the end. 
3544 save_regs(); 3545 3546 mv(Pm_base, Ra); 3547 3548 mv(tmp0, zr); 3549 mv(tmp1, zr); 3550 mv(tmp2, zr); 3551 3552 block_comment("for (int i = 0; i < len; i++) {"); 3553 mv(Ri, zr); { 3554 Label loop, end; 3555 bind(loop); 3556 bge(Ri, Rlen, end); 3557 3558 pre1(Ri); 3559 3560 block_comment("for (j = (i+1)/2; j; j--) {"); { 3561 addi(Rj, Ri, 1); 3562 srliw(Rj, Rj, 1); 3563 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 3564 } block_comment(" } // j"); 3565 3566 last_squaring(Ri); 3567 3568 block_comment(" for (j = i/2; j; j--) {"); { 3569 srliw(Rj, Ri, 1); 3570 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 3571 } block_comment(" } // j"); 3572 3573 post1_squaring(); 3574 addi(Ri, Ri, 1); 3575 blt(Ri, Rlen, loop); 3576 3577 bind(end); 3578 block_comment("} // i"); 3579 } 3580 3581 block_comment("for (int i = len; i < 2*len; i++) {"); 3582 mv(Ri, Rlen); { 3583 Label loop, end; 3584 bind(loop); 3585 slli(t0, Rlen, 1); 3586 bge(Ri, t0, end); 3587 3588 pre2(Ri, Rlen); 3589 3590 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 3591 slli(Rj, Rlen, 1); 3592 sub(Rj, Rj, Ri); 3593 sub(Rj, Rj, 1); 3594 srliw(Rj, Rj, 1); 3595 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 3596 } block_comment(" } // j"); 3597 3598 last_squaring(Ri); 3599 3600 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 3601 slli(Rj, Rlen, 1); 3602 sub(Rj, Rj, Ri); 3603 srliw(Rj, Rj, 1); 3604 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 3605 } block_comment(" } // j"); 3606 3607 post2(Ri, Rlen); 3608 addi(Ri, Ri, 1); 3609 slli(t0, Rlen, 1); 3610 blt(Ri, t0, loop); 3611 3612 bind(end); 3613 block_comment("} // i"); 3614 } 3615 3616 normalize(Rlen); 3617 3618 mv(Ra, Pm_base); // Save Pm_base in Ra 3619 restore_regs(); // Restore caller's Pm_base 3620 3621 // Copy our result into caller's Pm_base 3622 reverse(Pm_base, Ra, Rlen, Ri, Rj); 3623 3624 leave(); 3625 ret(); 3626 3627 return entry; 3628 } 3629 }; 3630 #endif // COMPILER2 3631 3632 // Continuation point for throwing of implicit exceptions that are 3633 // not handled in the current activation. Fabricates an exception 3634 // oop and initiates normal exception dispatching in this 3635 // frame. Since we need to preserve callee-saved values (currently 3636 // only for C2, but done for C1 as well) we need a callee-saved oop 3637 // map and therefore have to make these stubs into RuntimeStubs 3638 // rather than BufferBlobs. If the compiler needs all registers to 3639 // be preserved between the fault point and the exception handler 3640 // then it must assume responsibility for that in 3641 // AbstractCompiler::continuation_for_implicit_null_exception or 3642 // continuation_for_implicit_division_by_zero_exception. All other 3643 // implicit exceptions (e.g., NullPointerException or 3644 // AbstractMethodError on entry) are either at call sites or 3645 // otherwise assume that stack unwinding will be initiated, so 3646 // caller saved registers were assumed volatile in the compiler. 3647 3648 #undef __ 3649 #define __ masm-> 3650 3651 address generate_throw_exception(const char* name, 3652 address runtime_entry, 3653 Register arg1 = noreg, 3654 Register arg2 = noreg) { 3655 // Information about frame layout at time of blocking runtime call. 3656 // Note that we only have to preserve callee-saved registers since 3657 // the compilers are responsible for supplying a continuation point 3658 // if they expect all registers to be preserved. 3659 // n.b. 
riscv asserts that frame::arg_reg_save_area_bytes == 0 3660 assert_cond(runtime_entry != NULL); 3661 enum layout { 3662 fp_off = 0, 3663 fp_off2, 3664 return_off, 3665 return_off2, 3666 framesize // inclusive of return address 3667 }; 3668 3669 const int insts_size = 512; 3670 const int locs_size = 64; 3671 3672 CodeBuffer code(name, insts_size, locs_size); 3673 OopMapSet* oop_maps = new OopMapSet(); 3674 MacroAssembler* masm = new MacroAssembler(&code); 3675 assert_cond(oop_maps != NULL && masm != NULL); 3676 3677 address start = __ pc(); 3678 3679 // This is an inlined and slightly modified version of call_VM 3680 // which has the ability to fetch the return PC out of 3681 // thread-local storage and also sets up last_Java_sp slightly 3682 // differently than the real call_VM 3683 3684 __ enter(); // Save FP and RA before call 3685 3686 assert(is_even(framesize / 2), "sp not 16-byte aligned"); 3687 3688 // ra and fp are already in place 3689 __ addi(sp, fp, 0 - ((unsigned)framesize << LogBytesPerInt)); // prolog 3690 3691 int frame_complete = __ pc() - start; 3692 3693 // Set up last_Java_sp and last_Java_fp 3694 address the_pc = __ pc(); 3695 __ set_last_Java_frame(sp, fp, the_pc, t0); 3696 3697 // Call runtime 3698 if (arg1 != noreg) { 3699 assert(arg2 != c_rarg1, "clobbered"); 3700 __ mv(c_rarg1, arg1); 3701 } 3702 if (arg2 != noreg) { 3703 __ mv(c_rarg2, arg2); 3704 } 3705 __ mv(c_rarg0, xthread); 3706 BLOCK_COMMENT("call runtime_entry"); 3707 int32_t offset = 0; 3708 __ movptr_with_offset(t0, runtime_entry, offset); 3709 __ jalr(x1, t0, offset); 3710 3711 // Generate oop map 3712 OopMap* map = new OopMap(framesize, 0); 3713 assert_cond(map != NULL); 3714 3715 oop_maps->add_gc_map(the_pc - start, map); 3716 3717 __ reset_last_Java_frame(true); 3718 3719 __ leave(); 3720 3721 // check for pending exceptions 3722 #ifdef ASSERT 3723 Label L; 3724 __ ld(t0, Address(xthread, Thread::pending_exception_offset())); 3725 __ bnez(t0, L); 3726 __ should_not_reach_here(); 3727 __ bind(L); 3728 #endif // ASSERT 3729 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3730 3731 3732 // codeBlob framesize is in words (not VMRegImpl::slot_size) 3733 RuntimeStub* stub = 3734 RuntimeStub::new_runtime_stub(name, 3735 &code, 3736 frame_complete, 3737 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3738 oop_maps, false); 3739 assert(stub != NULL, "create runtime stub fail!"); 3740 return stub->entry_point(); 3741 } 3742 3743 // Initialization 3744 void generate_initial() { 3745 // Generate initial stubs and initializes the entry points 3746 3747 // entry points that exist in all platforms Note: This is code 3748 // that could be shared among different platforms - however the 3749 // benefit seems to be smaller than the disadvantage of having a 3750 // much more complicated generator structure. See also comment in 3751 // stubRoutines.hpp. 3752 3753 StubRoutines::_forward_exception_entry = generate_forward_exception(); 3754 3755 StubRoutines::_call_stub_entry = 3756 generate_call_stub(StubRoutines::_call_stub_return_address); 3757 3758 // is referenced by megamorphic call 3759 StubRoutines::_catch_exception_entry = generate_catch_exception(); 3760 3761 // Build this early so it's available for the interpreter. 
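    // Each throw stub produced by generate_throw_exception() is a small
    // RuntimeStub: it builds a minimal frame holding only fp and ra, records
    // an oop map at the runtime call, passes xthread in c_rarg0 (plus the
    // optional arg1/arg2), and finally jumps to
    // StubRoutines::forward_exception_entry() to dispatch the pending
    // exception.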
3762 StubRoutines::_throw_StackOverflowError_entry = 3763 generate_throw_exception("StackOverflowError throw_exception", 3764 CAST_FROM_FN_PTR(address, 3765 SharedRuntime::throw_StackOverflowError)); 3766 StubRoutines::_throw_delayed_StackOverflowError_entry = 3767 generate_throw_exception("delayed StackOverflowError throw_exception", 3768 CAST_FROM_FN_PTR(address, 3769 SharedRuntime::throw_delayed_StackOverflowError)); 3770 // Safefetch stubs. 3771 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 3772 &StubRoutines::_safefetch32_fault_pc, 3773 &StubRoutines::_safefetch32_continuation_pc); 3774 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 3775 &StubRoutines::_safefetchN_fault_pc, 3776 &StubRoutines::_safefetchN_continuation_pc); 3777 } 3778 3779 void generate_all() { 3780 // support for verify_oop (must happen after universe_init) 3781 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 3782 StubRoutines::_throw_AbstractMethodError_entry = 3783 generate_throw_exception("AbstractMethodError throw_exception", 3784 CAST_FROM_FN_PTR(address, 3785 SharedRuntime:: 3786 throw_AbstractMethodError)); 3787 3788 StubRoutines::_throw_IncompatibleClassChangeError_entry = 3789 generate_throw_exception("IncompatibleClassChangeError throw_exception", 3790 CAST_FROM_FN_PTR(address, 3791 SharedRuntime:: 3792 throw_IncompatibleClassChangeError)); 3793 3794 StubRoutines::_throw_NullPointerException_at_call_entry = 3795 generate_throw_exception("NullPointerException at call throw_exception", 3796 CAST_FROM_FN_PTR(address, 3797 SharedRuntime:: 3798 throw_NullPointerException_at_call)); 3799 // arraycopy stubs used by compilers 3800 generate_arraycopy_stubs(); 3801 3802 #ifdef COMPILER2 3803 if (UseMulAddIntrinsic) { 3804 StubRoutines::_mulAdd = generate_mulAdd(); 3805 } 3806 3807 if (UseMultiplyToLenIntrinsic) { 3808 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 3809 } 3810 3811 if (UseSquareToLenIntrinsic) { 3812 StubRoutines::_squareToLen = generate_squareToLen(); 3813 } 3814 3815 if (UseMontgomeryMultiplyIntrinsic) { 3816 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 3817 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 3818 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 3819 } 3820 3821 if (UseMontgomerySquareIntrinsic) { 3822 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 3823 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 3824 StubRoutines::_montgomerySquare = g.generate_square(); 3825 } 3826 3827 if (UseRVVForBigIntegerShiftIntrinsics) { 3828 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 3829 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 3830 } 3831 #endif 3832 3833 generate_compare_long_strings(); 3834 3835 generate_string_indexof_stubs(); 3836 3837 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 3838 if (bs_nm != NULL) { 3839 StubRoutines::riscv::_method_entry_barrier = generate_method_entry_barrier(); 3840 } 3841 3842 StubRoutines::riscv::set_completed(); 3843 } 3844 3845 public: 3846 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 3847 if (all) { 3848 generate_all(); 3849 } else { 3850 generate_initial(); 3851 } 3852 } 3853 3854 ~StubGenerator() {} 3855 }; // end class declaration 3856 3857 #define UCM_TABLE_MAX_ENTRIES 8 3858 void StubGenerator_generate(CodeBuffer* code, bool all) { 3859 if (UnsafeCopyMemory::_table == NULL) { 3860 
UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES); 3861 } 3862 3863 StubGenerator g(code, all); 3864 }
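// Usage sketch (based on the shared StubRoutines code, not on anything in this
// file): StubGenerator_generate() is expected to be driven in two phases,
// roughly
//
//   // early, before the interpreter is generated (StubRoutines::initialize1()):
//   StubGenerator_generate(&buffer1, /*all*/ false);   // -> generate_initial()
//   // later, after universe_init (StubRoutines::initialize2()):
//   StubGenerator_generate(&buffer2, /*all*/ true);    // -> generate_all()
//
// so the first pass emits only the stubs the interpreter needs (call stub,
// StackOverflowError throwers, SafeFetch), and everything else is deferred to
// the second pass. buffer1/buffer2 are placeholder names here.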