/*
 * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
 * Copyright (c) 2020, 2022, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_riscv.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ la(t1, ExternalAddress((address)&counter));
    __ lwu(t0, Address(t1, 0));
    __ addiw(t0, t0, 1);
    __ sw(t0, Address(t1, 0));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save x1 (ra) as the return PC at the base of the frame and
  // link x8 (fp) below it as the frame pointer installing sp (x2)
  // into fp.
  //
  // we save x10-x17, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save x5 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
  // volatile
  //
  // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
  // registers and C expects to be callee-save
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -34 [ argument word 1      ]
  // -33 [ saved f27            ] <--- sp_after_call
  // -32 [ saved f26            ]
  // -31 [ saved f25            ]
  // -30 [ saved f24            ]
  // -29 [ saved f23            ]
  // -28 [ saved f22            ]
  // -27 [ saved f21            ]
  // -26 [ saved f20            ]
  // -25 [ saved f19            ]
  // -24 [ saved f18            ]
  // -23 [ saved f9             ]
  // -22 [ saved f8             ]
  // -21 [ saved x27            ]
  // -20 [ saved x26            ]
  // -19 [ saved x25            ]
  // -18 [ saved x24            ]
  // -17 [ saved x23            ]
  // -16 [ saved x22            ]
  // -15 [ saved x21            ]
  // -14 [ saved x20            ]
  // -13 [ saved x19            ]
  // -12 [ saved x18            ]
  // -11 [ saved x9             ]
  // -10 [ call wrapper   (x10) ]
  //  -9 [ result         (x11) ]
  //  -8 [ result type    (x12) ]
  //  -7 [ method         (x13) ]
  //  -6 [ entry point    (x14) ]
  //  -5 [ parameters     (x15) ]
  //  -4 [ parameter size (x16) ]
  //  -3 [ thread         (x17) ]
  //  -2 [ saved fp       (x8)  ]
  //  -1 [ saved ra       (x1)  ]
  //   0 [                      ] <--- fp == saved sp (x2)

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -33,

    f27_off            = -33,
    f26_off            = -32,
    f25_off            = -31,
    f24_off            = -30,
    f23_off            = -29,
    f22_off            = -28,
    f21_off            = -27,
    f20_off            = -26,
    f19_off            = -25,
    f18_off            = -24,
    f9_off             = -23,
    f8_off             = -22,

    x27_off            = -21,
    x26_off            = -20,
    x25_off            = -19,
    x24_off            = -18,
    x23_off            = -17,
    x22_off            = -16,
    x21_off            = -15,
    x20_off            = -14,
    x19_off            = -13,
    x18_off            = -12,
    x9_off             = -11,

    call_wrapper_off   = -10,
    result_off         = -9,
    result_type_off    = -8,
    method_off         = -7,
    entry_point_off    = -6,
    parameters_off     = -5,
    parameter_size_off = -4,
    thread_off         = -3,
    fp_f               = -2,
    retaddr_off        = -1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call (fp, sp_after_call_off  * wordSize);

    const Address call_wrapper  (fp, call_wrapper_off   * wordSize);
    const Address result        (fp, result_off         * wordSize);
    const Address result_type   (fp, result_type_off    * wordSize);
    const Address method        (fp, method_off         * wordSize);
    const Address entry_point   (fp, entry_point_off    * wordSize);
    const Address parameters    (fp, parameters_off     * wordSize);
    const Address parameter_size(fp, parameter_size_off * wordSize);

    const Address thread        (fp, thread_off         * wordSize);

    const Address f27_save      (fp, f27_off            * wordSize);
    const Address f26_save      (fp, f26_off            * wordSize);
    const Address f25_save      (fp, f25_off            * wordSize);
    const Address f24_save      (fp, f24_off            * wordSize);
    const Address f23_save      (fp, f23_off            * wordSize);
    const Address f22_save      (fp, f22_off            * wordSize);
    const Address f21_save      (fp, f21_off            * wordSize);
    const Address f20_save      (fp, f20_off            * wordSize);
    const Address f19_save      (fp, f19_off            * wordSize);
    const Address f18_save      (fp, f18_off            * wordSize);
    const Address f9_save       (fp, f9_off             * wordSize);
    const Address f8_save       (fp, f8_off             * wordSize);

    const Address x27_save      (fp, x27_off            * wordSize);
    const Address x26_save      (fp, x26_off            * wordSize);
    const Address x25_save      (fp, x25_off            * wordSize);
    const Address x24_save      (fp, x24_off            * wordSize);
    const Address x23_save      (fp, x23_off            * wordSize);
    const Address x22_save      (fp, x22_off            * wordSize);
    const Address x21_save      (fp, x21_off            * wordSize);
    const Address x20_save      (fp, x20_off            * wordSize);
    const Address x19_save      (fp, x19_off            * wordSize);
    const Address x18_save      (fp, x18_off            * wordSize);

    const Address x9_save       (fp, x9_off             * wordSize);

    // stub code

    address riscv_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ addi(sp, fp, sp_after_call_off * wordSize);

    // save register parameters and Java temporary/global registers
    // n.b. we save thread even though it gets installed in
    // xthread because we want to sanity check tp later
    __ sd(c_rarg7, thread);
    __ sw(c_rarg6, parameter_size);
    __ sd(c_rarg5, parameters);
    __ sd(c_rarg4, entry_point);
    __ sd(c_rarg3, method);
    __ sd(c_rarg2, result_type);
    __ sd(c_rarg1, result);
    __ sd(c_rarg0, call_wrapper);

    __ sd(x9, x9_save);

    __ sd(x18, x18_save);
    __ sd(x19, x19_save);
    __ sd(x20, x20_save);
    __ sd(x21, x21_save);
    __ sd(x22, x22_save);
    __ sd(x23, x23_save);
    __ sd(x24, x24_save);
    __ sd(x25, x25_save);
    __ sd(x26, x26_save);
    __ sd(x27, x27_save);

    __ fsd(f8, f8_save);
    __ fsd(f9, f9_save);
    __ fsd(f18, f18_save);
    __ fsd(f19, f19_save);
    __ fsd(f20, f20_save);
    __ fsd(f21, f21_save);
    __ fsd(f22, f22_save);
    __ fsd(f23, f23_save);
    __ fsd(f24, f24_save);
    __ fsd(f25, f25_save);
    __ fsd(f26, f26_save);
    __ fsd(f27, f27_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mv(xthread, c_rarg7);

    // And method
    __ mv(xmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
      __ beqz(t0, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mv(esp, sp);
    __ slli(t0, c_rarg6, LogBytesPerWord);
    __ sub(t0, sp, t0);          // Move SP out of the way
    __ andi(sp, t0, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ beqz(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ld(t0, c_rarg5, 0);
    __ addi(c_rarg5, c_rarg5, wordSize);
    __ addi(c_rarg6, c_rarg6, -1);
    __ push_reg(t0);
    __ bgtz(c_rarg6, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      xmethod: Method*
    //      x30: sender sp
    BLOCK_COMMENT("call Java function");
    __ mv(x30, sp);
    __ jalr(c_rarg4);

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in x10
    // and a floating result in j_farg0
    __ ld(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ld(j_rarg1, result_type);
    __ li(t0, (u1)T_OBJECT);
    __ beq(j_rarg1, t0, is_long);
    __ li(t0, (u1)T_LONG);
    __ beq(j_rarg1, t0, is_long);
    __ li(t0, (u1)T_FLOAT);
    __ beq(j_rarg1, t0, is_float);
    __ li(t0, (u1)T_DOUBLE);
    __ beq(j_rarg1, t0, is_double);

    // handle T_INT case
    __ sw(x10, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ addi(esp, fp, sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ld(t0, thread);
      __ bne(xthread, t0, S);
      __ get_thread(t0);
      __ beq(xthread, t0, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ fld(f27, f27_save);
    __ fld(f26, f26_save);
    __ fld(f25, f25_save);
    __ fld(f24, f24_save);
    __ fld(f23, f23_save);
    __ fld(f22, f22_save);
    __ fld(f21, f21_save);
    __ fld(f20, f20_save);
    __ fld(f19, f19_save);
    __ fld(f18, f18_save);
    __ fld(f9, f9_save);
    __ fld(f8, f8_save);

    __ ld(x27, x27_save);
    __ ld(x26, x26_save);
    __ ld(x25, x25_save);
    __ ld(x24, x24_save);
    __ ld(x23, x23_save);
    __ ld(x22, x22_save);
    __ ld(x21, x21_save);
    __ ld(x20, x20_save);
    __ ld(x19, x19_save);
    __ ld(x18, x18_save);

    __ ld(x9, x9_save);

    __ ld(c_rarg0, call_wrapper);
    __ ld(c_rarg1, result);
    __ ld(c_rarg2, result_type);
    __ ld(c_rarg3, method);
    __ ld(c_rarg4, entry_point);
    __ ld(c_rarg5, parameters);
    __ ld(c_rarg6, parameter_size);
    __ ld(c_rarg7, thread);

    // leave frame and return to caller
    __ leave();
    __ ret();

    // handle return types different from T_INT

    __ BIND(is_long);
    __ sd(x10, Address(j_rarg2, 0));
    __ j(exit);

    __ BIND(is_float);
    __ fsw(j_farg0, Address(j_rarg2, 0), t0);
    __ j(exit);

    __ BIND(is_double);
    __ fsd(j_farg0, Address(j_rarg2, 0), t0);
    __ j(exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // sp.
  //
  // x10: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address thread(fp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ld(t0, thread);
      __ bne(xthread, t0, S);
      __ get_thread(t0);
      __ beq(xthread, t0, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(x10);

    __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
    __ mv(t0, (address)__FILE__);
    __ sd(t0, Address(xthread, Thread::exception_file_offset()));
    __ mv(t0, (int)__LINE__);
    __ sw(t0, Address(xthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ j(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // x10: exception
  // x13: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in RA !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, RA points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
      __ bnez(t0, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into x9

    // call the VM to find the handler address associated with the
    // caller address. pass thread in x10 and caller pc (ret address)
    // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
    // the stack.
    __ mv(c_rarg1, ra);
    // ra will be trashed by the VM call so we move it to x9
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mv(x9, ra);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                    SharedRuntime::exception_handler_for_return_address),
                    xthread, c_rarg1);
    // we should not really care that ra is no longer the callee
    // address. we saved the value the handler needs in x9 so we can
    // just copy it to x13. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore ra here to satisfy that assert.
    __ mv(ra, x9);
    // setup x10 & x13 & clear pending exception
    __ mv(x13, x9);
    __ mv(x9, x10);
    __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
    __ sd(zr, Address(xthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ bnez(x10, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // x10: exception
    // x13: throwing pc
    // x9: exception handler
    __ verify_oop(x10);
    __ jr(x9);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    x10: oop to verify
  //    t0: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved ra
  //    [tos + 3]: saved t1
  //    [tos + 4]: saved x10
  //    [tos + 5]: saved t0
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    __ push_reg(0x3000, sp);   // save c_rarg2 and c_rarg3

    __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ld(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ sd(c_rarg3, Address(c_rarg2));

    // object is in x10
    // make sure object is 'reasonable'
    __ beqz(x10, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if mask is good.
      // verifies that ZAddressBadMask & x10 == 0
      __ ld(c_rarg3, Address(xthread, ZThreadLocalData::address_bad_mask_offset()));
      __ andr(c_rarg2, x10, c_rarg3);
      __ bnez(c_rarg2, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ mv(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, x10, c_rarg3);
    __ mv(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.
    __ bne(c_rarg2, c_rarg3, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(x10, x10);  // get klass
    __ beqz(x10, error);      // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ pop_reg(0x3000, sp);   // pop c_rarg2 and c_rarg3
    __ ret();

    // handle errors
    __ bind(error);
    __ pop_reg(0x3000, sp);   // pop c_rarg2 and c_rarg3

    __ push_reg(RegSet::range(x0, x31), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mv(c_rarg0, t0);             // pass address of error message
    __ mv(c_rarg1, ra);             // pass return address
    __ mv(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    int32_t offset = 0;
    __ movptr_with_offset(t0, CAST_FROM_FN_PTR(address, MacroAssembler::debug64), offset);
    __ jalr(x1, t0, offset);
    __ ebreak();

    return start;
  }

  // The inner part of zero_words().
  //
  // Inputs:
  // x28: the HeapWord-aligned base address of an array to zero.
  // x29: the count in HeapWords, x29 > 0.
  //
  // Returns x28 and x29, adjusted for the caller to clear.
  // x28: the base address of the tail of words left to clear.
  // x29: the number of words in the tail.
  //      x29 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;

    const Register base = x28, cnt = x29;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    {
      // Clear the remaining blocks.
      Label loop;
      __ sub(cnt, cnt, MacroAssembler::zero_words_block_size);
      __ bltz(cnt, done);
      __ bind(loop);
      for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
        __ sd(zr, Address(base, 0));
        __ add(base, base, 8);
      }
      __ sub(cnt, cnt, MacroAssembler::zero_words_block_size);
      __ bgez(cnt, loop);
      __ bind(done);
      __ add(cnt, cnt, MacroAssembler::zero_words_block_size);
    }

    __ ret();

    return start;
  }

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = wordSize;

    const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
      tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;

    const Register stride = x30;

    assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
                               tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
    assert_different_registers(s, d, count, t0);

    Label again, drain;
    const char* stub_name = NULL;
    if (direction == copy_forwards) {
      stub_name = "forward_copy_longs";
    } else {
      stub_name = "backward_copy_longs";
    }
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;

      __ li(t0, 8);
      __ bge(count, t0, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    __ ld(tmp_reg0, Address(s, 1 * unit));
    __ ld(tmp_reg1, Address(s, 2 * unit));
    __ ld(tmp_reg2, Address(s, 3 * unit));
    __ ld(tmp_reg3, Address(s, 4 * unit));
    __ ld(tmp_reg4, Address(s, 5 * unit));
    __ ld(tmp_reg5, Address(s, 6 * unit));
    __ ld(tmp_reg6, Address(s, 7 * unit));
    __ ld(tmp_reg7, Address(s, 8 * unit));
    __ addi(s, s, 8 * unit);

    __ sub(count, count, 16);
    __ bltz(count, drain);

    __ bind(again);

    __ sd(tmp_reg0, Address(d, 1 * unit));
    __ sd(tmp_reg1, Address(d, 2 * unit));
    __ sd(tmp_reg2, Address(d, 3 * unit));
    __ sd(tmp_reg3, Address(d, 4 * unit));
    __ sd(tmp_reg4, Address(d, 5 * unit));
    __ sd(tmp_reg5, Address(d, 6 * unit));
    __ sd(tmp_reg6, Address(d, 7 * unit));
    __ sd(tmp_reg7, Address(d, 8 * unit));

    __ ld(tmp_reg0, Address(s, 1 * unit));
    __ ld(tmp_reg1, Address(s, 2 * unit));
    __ ld(tmp_reg2, Address(s, 3 * unit));
    __ ld(tmp_reg3, Address(s, 4 * unit));
    __ ld(tmp_reg4, Address(s, 5 * unit));
    __ ld(tmp_reg5, Address(s, 6 * unit));
    __ ld(tmp_reg6, Address(s, 7 * unit));
    __ ld(tmp_reg7, Address(s, 8 * unit));

    __ addi(s, s, 8 * unit);
    __ addi(d, d, 8 * unit);

    __ sub(count, count, 8);
    __ bgez(count, again);

    // Drain
    __ bind(drain);

    __ sd(tmp_reg0, Address(d, 1 * unit));
    __ sd(tmp_reg1, Address(d, 2 * unit));
    __ sd(tmp_reg2, Address(d, 3 * unit));
    __ sd(tmp_reg3, Address(d, 4 * unit));
    __ sd(tmp_reg4, Address(d, 5 * unit));
    __ sd(tmp_reg5, Address(d, 6 * unit));
    __ sd(tmp_reg6, Address(d, 7 * unit));
    __ sd(tmp_reg7, Address(d, 8 * unit));
    __ addi(d, d, 8 * unit);

    {
      Label L1, L2;
      __ andi(t0, count, 4);
      __ beqz(t0, L1);

      __ ld(tmp_reg0, Address(s, 1 * unit));
      __ ld(tmp_reg1, Address(s, 2 * unit));
      __ ld(tmp_reg2, Address(s, 3 * unit));
      __ ld(tmp_reg3, Address(s, 4 * unit));
      __ addi(s, s, 4 * unit);

      __ sd(tmp_reg0, Address(d, 1 * unit));
      __ sd(tmp_reg1, Address(d, 2 * unit));
      __ sd(tmp_reg2, Address(d, 3 * unit));
      __ sd(tmp_reg3, Address(d, 4 * unit));
      __ addi(d, d, 4 * unit);

      __ bind(L1);

      if (direction == copy_forwards) {
        __ addi(s, s, bias);
        __ addi(d, d, bias);
      }

      __ andi(t0, count, 2);
      __ beqz(t0, L2);
      if (direction == copy_backwards) {
        __ addi(s, s, 2 * unit);
        __ ld(tmp_reg0, Address(s));
        __ ld(tmp_reg1, Address(s, wordSize));
        __ addi(d, d, 2 * unit);
        __ sd(tmp_reg0, Address(d));
        __ sd(tmp_reg1, Address(d, wordSize));
      } else {
        __ ld(tmp_reg0, Address(s));
        __ ld(tmp_reg1, Address(s, wordSize));
        __ addi(s, s, 2 * unit);
        __ sd(tmp_reg0, Address(d));
        __ sd(tmp_reg1, Address(d, wordSize));
        __ addi(d, d, 2 * unit);
      }
      __ bind(L2);
    }

    __ ret();
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //
  /*
   * if (is_aligned) {
   *   if (count >= 32)
   *     goto copy32_loop;
   *   if (count >= 8)
   *     goto copy8_loop;
   *   goto copy_small;
   * }
   * bool is_backwards = step < 0;
   * int granularity = uabs(step);
   * count = count * granularity;   * count bytes
   *
   * if (is_backwards) {
   *   s += count;
   *   d += count;
   * }
   *
   * count limit may be greater than 16, for better performance
   * if (count < 16) {
   *   goto copy_small;
   * }
   *
   * if ((dst % 8) == (src % 8)) {
   *   aligned;
   *   goto copy_big;
   * }
   *
   * copy_big:
   *   if the amount to copy is more than (or equal to) 32 bytes goto copy32_loop
   *   else goto copy8_loop
   * copy_small:
   *   load element one by one;
   *   done;
   */

  typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);

  void copy_memory_v(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backward = step < 0;
    int granularity = uabs(step);

    const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
    assert_different_registers(s, d, cnt, vl, tmp, tmp1, tmp2);
    Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
    Label loop_forward, loop_backward, done;

    __ mv(dst, d);
    __ mv(src, s);
    __ mv(cnt, count);

    __ bind(loop_forward);
    __ vsetvli(vl, cnt, sew, Assembler::m8);
    if (is_backward) {
      __ bne(vl, cnt, loop_backward);
    }

    __ vlex_v(v0, src, sew);
    __ sub(cnt, cnt, vl);
    __ slli(vl, vl, (int)sew);
    __ add(src, src, vl);

    __ vsex_v(v0, dst, sew);
    __ add(dst, dst, vl);
    __ bnez(cnt, loop_forward);

    if (is_backward) {
      __ j(done);

      __ bind(loop_backward);
      __ sub(tmp, cnt, vl);
      __ slli(tmp, tmp, sew);
      __ add(tmp1, s, tmp);
      __ vlex_v(v0, tmp1, sew);
      __ add(tmp2, d, tmp);
      __ vsex_v(v0, tmp2, sew);
      __ sub(cnt, cnt, vl);
      __ bnez(cnt, loop_forward);
      __ bind(done);
    }
  }

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    if (UseRVV) {
      return copy_memory_v(s, d, count, tmp, step);
    }

    bool is_backwards = step < 0;
    int granularity = uabs(step);

    const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;

    Label same_aligned;
    Label copy_big, copy32_loop, copy8_loop, copy_small, done;

    copy_insn ld_arr = NULL, st_arr = NULL;
    switch (granularity) {
      case 1 :
        ld_arr = (copy_insn)&MacroAssembler::lbu;
        st_arr = (copy_insn)&MacroAssembler::sb;
        break;
      case 2 :
        ld_arr = (copy_insn)&MacroAssembler::lhu;
        st_arr = (copy_insn)&MacroAssembler::sh;
        break;
      case 4 :
        ld_arr = (copy_insn)&MacroAssembler::lwu;
        st_arr = (copy_insn)&MacroAssembler::sw;
        break;
      case 8 :
        ld_arr = (copy_insn)&MacroAssembler::ld;
        st_arr = (copy_insn)&MacroAssembler::sd;
        break;
      default :
        ShouldNotReachHere();
    }

    __ beqz(count, done);
    __ slli(cnt, count, exact_log2(granularity));
    if (is_backwards) {
      __ add(src, s, cnt);
      __ add(dst, d, cnt);
    } else {
      __ mv(src, s);
      __ mv(dst, d);
    }

    if (is_aligned) {
      __ addi(tmp, cnt, -32);
      __ bgez(tmp, copy32_loop);
      __ addi(tmp, cnt, -8);
      __ bgez(tmp, copy8_loop);
      __ j(copy_small);
    } else {
      __ mv(tmp, 16);
      __ blt(cnt, tmp, copy_small);

      __ xorr(tmp, src, dst);
      __ andi(tmp, tmp, 0b111);
      __ bnez(tmp, copy_small);

      __ bind(same_aligned);
      __ andi(tmp, src, 0b111);
      __ beqz(tmp, copy_big);
      if (is_backwards) {
        __ addi(src, src, step);
        __ addi(dst, dst, step);
      }
      (_masm->*ld_arr)(tmp3, Address(src), t0);
      (_masm->*st_arr)(tmp3, Address(dst), t0);
      if (!is_backwards) {
        __ addi(src, src, step);
        __ addi(dst, dst, step);
      }
      __ addi(cnt, cnt, -granularity);
      __ beqz(cnt, done);
      __ j(same_aligned);

      __ bind(copy_big);
      __ mv(tmp, 32);
      __ blt(cnt, tmp, copy8_loop);
    }
    __ bind(copy32_loop);
    if (is_backwards) {
      __ addi(src, src, -wordSize * 4);
      __ addi(dst, dst, -wordSize * 4);
    }
    // we first load 32 bytes, then write it, so the direction here doesn't matter
    __ ld(tmp3, Address(src));
    __ ld(tmp4, Address(src, 8));
    __ ld(tmp5, Address(src, 16));
    __ ld(tmp6, Address(src, 24));
    __ sd(tmp3, Address(dst));
    __ sd(tmp4, Address(dst, 8));
    __ sd(tmp5, Address(dst, 16));
    __ sd(tmp6, Address(dst, 24));

    if (!is_backwards) {
      __ addi(src, src, wordSize * 4);
      __ addi(dst, dst, wordSize * 4);
    }
    __ addi(tmp, cnt, -(32 + wordSize * 4));
    __ addi(cnt, cnt, -wordSize * 4);
    __ bgez(tmp, copy32_loop);  // cnt >= 32, do next loop

    __ beqz(cnt, done);         // if that's all - done

    __ addi(tmp, cnt, -8);      // if not - copy the remainder
    __ bltz(tmp, copy_small);   // cnt < 8, go to copy_small, else fall through to copy8_loop

    __ bind(copy8_loop);
    if (is_backwards) {
      __ addi(src, src, -wordSize);
      __ addi(dst, dst, -wordSize);
    }
    __ ld(tmp3, Address(src));
    __ sd(tmp3, Address(dst));
    if (!is_backwards) {
      __ addi(src, src, wordSize);
      __ addi(dst, dst, wordSize);
    }
    __ addi(tmp, cnt, -(8 + wordSize));
    __ addi(cnt, cnt, -wordSize);
    __ bgez(tmp, copy8_loop);   // cnt >= 8, do next loop

    __ beqz(cnt, done);         // if that's all - done

    __ bind(copy_small);
    if (is_backwards) {
      __ addi(src, src, step);
      __ addi(dst, dst, step);
    }
    (_masm->*ld_arr)(tmp3, Address(src), t0);
    (_masm->*st_arr)(tmp3, Address(dst), t0);
    if (!is_backwards) {
      __ addi(src, src, step);
      __ addi(dst, dst, step);
    }
    __ addi(cnt, cnt, -granularity);
    __ bgtz(cnt, copy_small);

    __ bind(done);
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers t0 and t1.
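  // Each element is either a full machine word (an uncompressed oop) or a
  // 32-bit narrowOop; narrow oops are decoded before verification.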
  void verify_oop_array(size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mv(t1, zr);
    __ slli(t0, count, exact_log2(size));
    __ bind(loop);
    __ bgeu(t1, t0, end);

    __ add(temp, a, t1);
    if (size == (size_t)wordSize) {
      __ ld(temp, Address(temp, 0));
      __ verify_oop(temp);
    } else {
      __ lwu(temp, Address(temp, 0));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(t1, t1, size);
    __ j(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address* entry,
                                 const char* name, bool dest_uninitialized = false) {
    const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push_reg(RegSet::of(d, count), sp);
    }

    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, t0, size);
    }

    if (is_oop) {
      __ pop_reg(RegSet::of(d, count), sp);
      if (VerifyOops) {
        verify_oop_array(size, d, count, t2);
      }
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());

    __ leave();
    __ mv(x10, zr); // return 0
    __ ret();
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address* entry, const char* name,
                                 bool dest_uninitialized = false) {
    const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(t0, d, s);
    __ slli(t1, count, exact_log2(size));
    __ bgeu(t0, t1, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push_reg(RegSet::of(d, count), sp);
    }

    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, t0, -size);
    }

    if (is_oop) {
      __ pop_reg(RegSet::of(d, count), sp);
      if (VerifyOops) {
        verify_oop_array(size, d, count, t2);
      }
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0, RegSet());
    __ leave();
    __ mv(x10, zr); // return 0
    __ ret();
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char* name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char* name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char* name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address* entry, const char* name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address* entry,
                                     const char* name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address* entry, const char* name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address* entry,
                                      const char* name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address* entry,
                                      const char* name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address* entry,
                                     const char* name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
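    // With compressed oops each element is a 32-bit narrowOop, otherwise a
    // full 64-bit oop, so pick the per-element copy width accordingly.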
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address* entry,
                                     const char* name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }

  // Helper for generating a dynamic type check.
  // Smashes t0, t1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    x10 ==  0  -  success
  //    x10 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char* name, address* entry,
                                  bool dest_uninitialized = false) {
    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs   = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs  = RegSet::of(count);

    // Registers used as temps (x7, x9, x18 are save-on-entry)
    const Register count_save  = x19;       // orig elements count
    const Register start_to    = x18;       // destination array start address
    const Register copied_oop  = x7;        // actual oop copied
    const Register r9_klass    = x9;        // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r9_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array:  Nothing to do
    __ beqz(count, L_done);

    __ push_reg(RegSet::of(x7, x9, x18, x19), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ lwu(start_to, Address(ckval, sco_offset));
      __ beq(ckoff, start_to, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
    bool is_oop = true;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);

    // save the original count
    __ mv(count_save, count);

    // Copy from low to high addresses
    __ mv(start_to, to);              // Save destination array start address
    __ j(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for count to 0 do
    //     copied_oop = load_heap_oop(from++)
    //     ... generate_type_check ...
    //     store_heap_oop(to++, copied_oop)
    //   end

    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(Address(to, 0), copied_oop, noreg, noreg, AS_RAW); // store the oop
    __ add(to, to, UseCompressedOops ? 4 : 8);
    __ sub(count, count, 1);
    __ beqz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, Address(from, 0), noreg, noreg, AS_RAW); // load the oop
    __ add(from, from, UseCompressedOops ? 4 : 8);
    __ beqz(copied_oop, L_store_element);

    __ load_klass(r9_klass, copied_oop); // query the object klass
    generate_type_check(r9_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ sub(count, count_save, count);     // K = partially copied oop count
    __ xori(count, count, -1);            // report (-1^K) to caller
    __ beqz(count, L_done_pop);

    __ BIND(L_do_card_marks);
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mv(x10, count);
    __ leave();
    __ ret();

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(t0, temp);

    // if [src_pos + length > arrayOop(src)->length()] then FAIL
    __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ bgtu(temp, t0, L_failed);

    // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
    __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ bgtu(temp, t0, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ zero_extend(src_pos, src_pos, 32);
    __ zero_extend(dst_pos, dst_pos, 32);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
  //
  address generate_unsafe_copy(const char* name,
                               address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {
    assert_cond(byte_copy_entry != NULL && short_copy_entry != NULL &&
                int_copy_entry != NULL && long_copy_entry != NULL);
    Label L_long_aligned, L_int_aligned, L_short_aligned;
    const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    __ orr(t0, s, d);
    __ orr(t0, t0, count);

    __ andi(t0, t0, BytesPerLong - 1);
    __ beqz(t0, L_long_aligned);
    __ andi(t0, t0, BytesPerInt - 1);
    __ beqz(t0, L_int_aligned);
    __ andi(t0, t0, 1);
    __ beqz(t0, L_short_aligned);
    __ j(RuntimeAddress(byte_copy_entry));

    __ BIND(L_short_aligned);
    __ srli(count, count, LogBytesPerShort);  // size => short_count
    __ j(RuntimeAddress(short_copy_entry));
    __ BIND(L_int_aligned);
    __ srli(count, count, LogBytesPerInt);    // size => int_count
    __ j(RuntimeAddress(int_copy_entry));
    __ BIND(L_long_aligned);
    __ srli(count, count, LogBytesPerLong);   // size => long_count
    __ j(RuntimeAddress(long_copy_entry));

    return start;
  }

  //
  //  Generate generic array copy stubs
  //
  //  Input:
  //    c_rarg0    -  src oop
  //    c_rarg1    -  src_pos (32-bits)
  //    c_rarg2    -  dst oop
  //    c_rarg3    -  dst_pos (32-bits)
  //    c_rarg4    -  element count (32-bits)
  //
  //  Output:
  //    x10 ==  0  -  success
  //    x10 == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(const char* name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {
    assert_cond(byte_copy_entry != NULL && short_copy_entry != NULL &&
                int_copy_entry != NULL && oop_copy_entry != NULL &&
                long_copy_entry != NULL && checkcast_copy_entry != NULL);
    Label L_failed, L_failed_0, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src        = c_rarg0;  // source array oop
    const Register src_pos    = c_rarg1;  // source position
    const Register dst        = c_rarg2;  // destination array oop
    const Register dst_pos    = c_rarg3;  // destination position
    const Register length     = c_rarg4;

    // Registers used as temps
    const Register dst_klass = c_rarg5;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", name);

    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not NULL.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //

    // if [src == NULL] then return -1
    __ beqz(src, L_failed);

    // if [src_pos < 0] then return -1
    // i.e. sign bit set
    __ andi(t0, src_pos, 1UL << 31);
    __ bnez(t0, L_failed);

    // if [dst == NULL] then return -1
    __ beqz(dst, L_failed);

    // if [dst_pos < 0] then return -1
    // i.e. sign bit set
    __ andi(t0, dst_pos, 1UL << 31);
    __ bnez(t0, L_failed);

    // registers used as temp
    const Register scratch_length    = x28; // elements count to copy
    const Register scratch_src_klass = x29; // array klass
    const Register lh                = x30; // layout helper

    // if [length < 0] then return -1
    __ addw(scratch_length, length, zr);    // length (elements count, 32-bits value)
    // i.e. sign bit set
    __ andi(t0, scratch_length, 1UL << 31);
    __ bnez(t0, L_failed);

    __ load_klass(scratch_src_klass, src);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert klasses not null {");
      Label L1, L2;
      __ bnez(scratch_src_klass, L2);   // it is broken if klass is NULL
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ load_klass(t0, dst, t1);
      __ beqz(t0, L1);     // this would be broken also
      BLOCK_COMMENT("} assert klasses not null done");
    }
#endif

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    const int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Handle objArrays completely differently...
1838 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1839 __ lw(lh, Address(scratch_src_klass, lh_offset));
1840 __ mvw(t0, objArray_lh);
1841 __ beq(lh, t0, L_objArray);
1842
1843 // if [src->klass() != dst->klass()] then return -1
1844 __ load_klass(t1, dst);
1845 __ bne(t1, scratch_src_klass, L_failed);
1846
1847 // if [!src->is_Array()] then return -1
1848 // i.e. (lh >= 0)
1849 __ andi(t0, lh, 1UL << 31);
1850 __ beqz(t0, L_failed);
1851
1852 // At this point, it is known to be a typeArray (array_tag 0x3).
1853 #ifdef ASSERT
1854 {
1855 BLOCK_COMMENT("assert primitive array {");
1856 Label L;
1857 __ mvw(t1, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
1858 __ bge(lh, t1, L);
1859 __ stop("must be a primitive array");
1860 __ bind(L);
1861 BLOCK_COMMENT("} assert primitive array done");
1862 }
1863 #endif
1864
1865 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1866 t1, L_failed);
1867
1868 // TypeArrayKlass
1869 //
1870 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1871 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1872 //
1873
1874 const Register t0_offset = t0; // array offset
1875 const Register x22_elsize = lh; // element size
1876
1877 // Get array_header_in_bytes()
1878 int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1879 int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
1880 __ slli(t0_offset, lh, XLEN - lh_header_size_msb); // left shift to drop the bits above the header_size field
1881 __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset
1882
1883 __ add(src, src, t0_offset); // src array offset
1884 __ add(dst, dst, t0_offset); // dst array offset
1885 BLOCK_COMMENT("choose copy loop based on element size");
1886
1887 // The following registers must be set before the jump to the corresponding stub
1888 const Register from = c_rarg0; // source array address
1889 const Register to = c_rarg1; // destination array address
1890 const Register count = c_rarg2; // elements count
1891
1892 // 'from', 'to' and 'count' must be set in this order, since they
1893 // alias 'src', 'src_pos' and 'dst'.
1894
1895 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1896
1897 // The possible values of elsize are 0-3, i.e. exact_log2(element
1898 // size in bytes). We do a simple bitwise binary search.
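    // The two-level test below is that binary search on the low bits of
    // x22_elsize (a sketch, not generated code):
    //
    //   (elsize & 2) != 0 ?  ((elsize & 1) ? long  : int)
    //                     :  ((elsize & 1) ? short : byte)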
1899 __ BIND(L_copy_bytes); 1900 __ andi(t0, x22_elsize, 2); 1901 __ bnez(t0, L_copy_ints); 1902 __ andi(t0, x22_elsize, 1); 1903 __ bnez(t0, L_copy_shorts); 1904 __ add(from, src, src_pos); // src_addr 1905 __ add(to, dst, dst_pos); // dst_addr 1906 __ addw(count, scratch_length, zr); // length 1907 __ j(RuntimeAddress(byte_copy_entry)); 1908 1909 __ BIND(L_copy_shorts); 1910 __ shadd(from, src_pos, src, t0, 1); // src_addr 1911 __ shadd(to, dst_pos, dst, t0, 1); // dst_addr 1912 __ addw(count, scratch_length, zr); // length 1913 __ j(RuntimeAddress(short_copy_entry)); 1914 1915 __ BIND(L_copy_ints); 1916 __ andi(t0, x22_elsize, 1); 1917 __ bnez(t0, L_copy_longs); 1918 __ shadd(from, src_pos, src, t0, 2); // src_addr 1919 __ shadd(to, dst_pos, dst, t0, 2); // dst_addr 1920 __ addw(count, scratch_length, zr); // length 1921 __ j(RuntimeAddress(int_copy_entry)); 1922 1923 __ BIND(L_copy_longs); 1924 #ifdef ASSERT 1925 { 1926 BLOCK_COMMENT("assert long copy {"); 1927 Label L; 1928 __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x22_elsize 1929 __ addw(lh, lh, zr); 1930 __ mvw(t0, LogBytesPerLong); 1931 __ beq(x22_elsize, t0, L); 1932 __ stop("must be long copy, but elsize is wrong"); 1933 __ bind(L); 1934 BLOCK_COMMENT("} assert long copy done"); 1935 } 1936 #endif 1937 __ shadd(from, src_pos, src, t0, 3); // src_addr 1938 __ shadd(to, dst_pos, dst, t0, 3); // dst_addr 1939 __ addw(count, scratch_length, zr); // length 1940 __ j(RuntimeAddress(long_copy_entry)); 1941 1942 // ObjArrayKlass 1943 __ BIND(L_objArray); 1944 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 1945 1946 Label L_plain_copy, L_checkcast_copy; 1947 // test array classes for subtyping 1948 __ load_klass(t2, dst); 1949 __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality 1950 1951 // Identically typed arrays can be copied without element-wise checks. 1952 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1953 t1, L_failed); 1954 1955 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop); 1956 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1957 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop); 1958 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1959 __ addw(count, scratch_length, zr); // length 1960 __ BIND(L_plain_copy); 1961 __ j(RuntimeAddress(oop_copy_entry)); 1962 1963 __ BIND(L_checkcast_copy); 1964 // live at this point: scratch_src_klass, scratch_length, t2 (dst_klass) 1965 { 1966 // Before looking at dst.length, make sure dst is also an objArray. 1967 __ lwu(t0, Address(t2, lh_offset)); 1968 __ mvw(t1, objArray_lh); 1969 __ bne(t0, t1, L_failed); 1970 1971 // It is safe to examine both src.length and dst.length. 1972 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1973 t2, L_failed); 1974 1975 __ load_klass(dst_klass, dst); // reload 1976 1977 // Marshal the base address arguments now, freeing registers. 1978 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop); 1979 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1980 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop); 1981 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1982 __ addw(count, length, zr); // length (reloaded) 1983 const Register sco_temp = c_rarg3; // this register is free now 1984 assert_different_registers(from, to, count, sco_temp, 1985 dst_klass, scratch_src_klass); 1986 1987 // Generate the type check. 
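    // Roughly: if the source array klass is a subtype of the destination array
    // klass, no element can fail the store check and we branch to L_plain_copy;
    // otherwise we fall through, fetch the destination *element* klass and its
    // super_check_offset, and let the checkcast stub re-check each element as it
    // is copied.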
1988 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1989 __ lwu(sco_temp, Address(dst_klass, sco_offset)); 1990 1991 // Smashes t0, t1 1992 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 1993 1994 // Fetch destination element klass from the ObjArrayKlass header. 1995 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 1996 __ ld(dst_klass, Address(dst_klass, ek_offset)); 1997 __ lwu(sco_temp, Address(dst_klass, sco_offset)); 1998 1999 // the checkcast_copy loop needs two extra arguments: 2000 assert(c_rarg3 == sco_temp, "#3 already in place"); 2001 // Set up arguments for checkcast_copy_entry. 2002 __ mv(c_rarg4, dst_klass); // dst.klass.element_klass 2003 __ j(RuntimeAddress(checkcast_copy_entry)); 2004 } 2005 2006 __ BIND(L_failed); 2007 __ li(x10, -1); 2008 __ leave(); // required for proper stackwalking of RuntimeStub frame 2009 __ ret(); 2010 2011 return start; 2012 } 2013 2014 // 2015 // Generate stub for array fill. If "aligned" is true, the 2016 // "to" address is assumed to be heapword aligned. 2017 // 2018 // Arguments for generated stub: 2019 // to: c_rarg0 2020 // value: c_rarg1 2021 // count: c_rarg2 treated as signed 2022 // 2023 address generate_fill(BasicType t, bool aligned, const char* name) { 2024 __ align(CodeEntryAlignment); 2025 StubCodeMark mark(this, "StubRoutines", name); 2026 address start = __ pc(); 2027 2028 BLOCK_COMMENT("Entry:"); 2029 2030 const Register to = c_rarg0; // source array address 2031 const Register value = c_rarg1; // value 2032 const Register count = c_rarg2; // elements count 2033 2034 const Register bz_base = x28; // base for block_zero routine 2035 const Register cnt_words = x29; // temp register 2036 const Register tmp_reg = t1; 2037 2038 __ enter(); 2039 2040 Label L_fill_elements, L_exit1; 2041 2042 int shift = -1; 2043 switch (t) { 2044 case T_BYTE: 2045 shift = 0; 2046 2047 // Zero extend value 2048 // 8 bit -> 16 bit 2049 __ andi(value, value, 0xff); 2050 __ mv(tmp_reg, value); 2051 __ slli(tmp_reg, tmp_reg, 8); 2052 __ orr(value, value, tmp_reg); 2053 2054 // 16 bit -> 32 bit 2055 __ mv(tmp_reg, value); 2056 __ slli(tmp_reg, tmp_reg, 16); 2057 __ orr(value, value, tmp_reg); 2058 2059 __ mv(tmp_reg, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2060 __ bltu(count, tmp_reg, L_fill_elements); 2061 break; 2062 case T_SHORT: 2063 shift = 1; 2064 // Zero extend value 2065 // 16 bit -> 32 bit 2066 __ andi(value, value, 0xffff); 2067 __ mv(tmp_reg, value); 2068 __ slli(tmp_reg, tmp_reg, 16); 2069 __ orr(value, value, tmp_reg); 2070 2071 // Short arrays (< 8 bytes) fill by element 2072 __ mv(tmp_reg, 8 >> shift); 2073 __ bltu(count, tmp_reg, L_fill_elements); 2074 break; 2075 case T_INT: 2076 shift = 2; 2077 2078 // Short arrays (< 8 bytes) fill by element 2079 __ mv(tmp_reg, 8 >> shift); 2080 __ bltu(count, tmp_reg, L_fill_elements); 2081 break; 2082 default: ShouldNotReachHere(); 2083 } 2084 2085 // Align source address at 8 bytes address boundary. 2086 Label L_skip_align1, L_skip_align2, L_skip_align4; 2087 if (!aligned) { 2088 switch (t) { 2089 case T_BYTE: 2090 // One byte misalignment happens only for byte arrays. 2091 __ andi(t0, to, 1); 2092 __ beqz(t0, L_skip_align1); 2093 __ sb(value, Address(to, 0)); 2094 __ addi(to, to, 1); 2095 __ addiw(count, count, -1); 2096 __ bind(L_skip_align1); 2097 // Fallthrough 2098 case T_SHORT: 2099 // Two bytes misalignment happens only for byte and short (char) arrays. 
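      // (As with the one-byte step above, this conditionally peels off the
      // misaligned prefix and reduces 'count' by the number of elements written,
      // so that the bulk fill_words loop later sees an 8-byte aligned 'to'.)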
2100 __ andi(t0, to, 2); 2101 __ beqz(t0, L_skip_align2); 2102 __ sh(value, Address(to, 0)); 2103 __ addi(to, to, 2); 2104 __ addiw(count, count, -(2 >> shift)); 2105 __ bind(L_skip_align2); 2106 // Fallthrough 2107 case T_INT: 2108 // Align to 8 bytes, we know we are 4 byte aligned to start. 2109 __ andi(t0, to, 4); 2110 __ beqz(t0, L_skip_align4); 2111 __ sw(value, Address(to, 0)); 2112 __ addi(to, to, 4); 2113 __ addiw(count, count, -(4 >> shift)); 2114 __ bind(L_skip_align4); 2115 break; 2116 default: ShouldNotReachHere(); 2117 } 2118 } 2119 2120 // 2121 // Fill large chunks 2122 // 2123 __ srliw(cnt_words, count, 3 - shift); // number of words 2124 2125 // 32 bit -> 64 bit 2126 __ andi(value, value, 0xffffffff); 2127 __ mv(tmp_reg, value); 2128 __ slli(tmp_reg, tmp_reg, 32); 2129 __ orr(value, value, tmp_reg); 2130 2131 __ slli(tmp_reg, cnt_words, 3 - shift); 2132 __ subw(count, count, tmp_reg); 2133 { 2134 __ fill_words(to, cnt_words, value); 2135 } 2136 2137 // Remaining count is less than 8 bytes. Fill it by a single store. 2138 // Note that the total length is no less than 8 bytes. 2139 if (t == T_BYTE || t == T_SHORT) { 2140 __ beqz(count, L_exit1); 2141 __ shadd(to, count, to, tmp_reg, shift); // points to the end 2142 __ sd(value, Address(to, -8)); // overwrite some elements 2143 __ bind(L_exit1); 2144 __ leave(); 2145 __ ret(); 2146 } 2147 2148 // Handle copies less than 8 bytes. 2149 Label L_fill_2, L_fill_4, L_exit2; 2150 __ bind(L_fill_elements); 2151 switch (t) { 2152 case T_BYTE: 2153 __ andi(t0, count, 1); 2154 __ beqz(t0, L_fill_2); 2155 __ sb(value, Address(to, 0)); 2156 __ addi(to, to, 1); 2157 __ bind(L_fill_2); 2158 __ andi(t0, count, 2); 2159 __ beqz(t0, L_fill_4); 2160 __ sh(value, Address(to, 0)); 2161 __ addi(to, to, 2); 2162 __ bind(L_fill_4); 2163 __ andi(t0, count, 4); 2164 __ beqz(t0, L_exit2); 2165 __ sw(value, Address(to, 0)); 2166 break; 2167 case T_SHORT: 2168 __ andi(t0, count, 1); 2169 __ beqz(t0, L_fill_4); 2170 __ sh(value, Address(to, 0)); 2171 __ addi(to, to, 2); 2172 __ bind(L_fill_4); 2173 __ andi(t0, count, 2); 2174 __ beqz(t0, L_exit2); 2175 __ sw(value, Address(to, 0)); 2176 break; 2177 case T_INT: 2178 __ beqz(count, L_exit2); 2179 __ sw(value, Address(to, 0)); 2180 break; 2181 default: ShouldNotReachHere(); 2182 } 2183 __ bind(L_exit2); 2184 __ leave(); 2185 __ ret(); 2186 return start; 2187 } 2188 2189 void generate_arraycopy_stubs() { 2190 address entry = NULL; 2191 address entry_jbyte_arraycopy = NULL; 2192 address entry_jshort_arraycopy = NULL; 2193 address entry_jint_arraycopy = NULL; 2194 address entry_oop_arraycopy = NULL; 2195 address entry_jlong_arraycopy = NULL; 2196 address entry_checkcast_arraycopy = NULL; 2197 2198 generate_copy_longs(copy_f, c_rarg0, c_rarg1, t1, copy_forwards); 2199 generate_copy_longs(copy_b, c_rarg0, c_rarg1, t1, copy_backwards); 2200 2201 StubRoutines::riscv::_zero_blocks = generate_zero_blocks(); 2202 2203 //*** jbyte 2204 // Always need aligned and unaligned versions 2205 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2206 "jbyte_disjoint_arraycopy"); 2207 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2208 &entry_jbyte_arraycopy, 2209 "jbyte_arraycopy"); 2210 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2211 "arrayof_jbyte_disjoint_arraycopy"); 2212 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2213 "arrayof_jbyte_arraycopy"); 2214 2215 //*** jshort 
2216 // Always need aligned and unaligned versions 2217 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2218 "jshort_disjoint_arraycopy"); 2219 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2220 &entry_jshort_arraycopy, 2221 "jshort_arraycopy"); 2222 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2223 "arrayof_jshort_disjoint_arraycopy"); 2224 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2225 "arrayof_jshort_arraycopy"); 2226 2227 //*** jint 2228 // Aligned versions 2229 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2230 "arrayof_jint_disjoint_arraycopy"); 2231 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2232 "arrayof_jint_arraycopy"); 2233 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2234 // entry_jint_arraycopy always points to the unaligned version 2235 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2236 "jint_disjoint_arraycopy"); 2237 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2238 &entry_jint_arraycopy, 2239 "jint_arraycopy"); 2240 2241 //*** jlong 2242 // It is always aligned 2243 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2244 "arrayof_jlong_disjoint_arraycopy"); 2245 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2246 "arrayof_jlong_arraycopy"); 2247 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2248 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2249 2250 //*** oops 2251 { 2252 // With compressed oops we need unaligned versions; notice that 2253 // we overwrite entry_oop_arraycopy. 
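    // (Compressed oops make each element a 4-byte narrowOop, so even a
    // heapword-aligned array base plus an element offset may be only 4-byte
    // aligned; the unaligned-capable variants generated here are therefore the
    // ones published as the arrayof_ entries when UseCompressedOops is on.)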
2254 bool aligned = !UseCompressedOops; 2255 2256 StubRoutines::_arrayof_oop_disjoint_arraycopy 2257 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2258 /*dest_uninitialized*/false); 2259 StubRoutines::_arrayof_oop_arraycopy 2260 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2261 /*dest_uninitialized*/false); 2262 // Aligned versions without pre-barriers 2263 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2264 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2265 /*dest_uninitialized*/true); 2266 StubRoutines::_arrayof_oop_arraycopy_uninit 2267 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2268 /*dest_uninitialized*/true); 2269 } 2270 2271 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2272 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2273 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2274 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2275 2276 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2277 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2278 /*dest_uninitialized*/true); 2279 2280 2281 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2282 entry_jbyte_arraycopy, 2283 entry_jshort_arraycopy, 2284 entry_jint_arraycopy, 2285 entry_jlong_arraycopy); 2286 2287 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2288 entry_jbyte_arraycopy, 2289 entry_jshort_arraycopy, 2290 entry_jint_arraycopy, 2291 entry_oop_arraycopy, 2292 entry_jlong_arraycopy, 2293 entry_checkcast_arraycopy); 2294 2295 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2296 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2297 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2298 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2299 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2300 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2301 } 2302 2303 // Safefetch stubs. 2304 void generate_safefetch(const char* name, int size, address* entry, 2305 address* fault_pc, address* continuation_pc) { 2306 // safefetch signatures: 2307 // int SafeFetch32(int* adr, int errValue) 2308 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue) 2309 // 2310 // arguments: 2311 // c_rarg0 = adr 2312 // c_rarg1 = errValue 2313 // 2314 // result: 2315 // PPC_RET = *adr or errValue 2316 assert_cond(entry != NULL && fault_pc != NULL && continuation_pc != NULL); 2317 StubCodeMark mark(this, "StubRoutines", name); 2318 2319 // Entry point, pc or function descriptor. 2320 *entry = __ pc(); 2321 2322 // Load *adr into c_rarg1, may fault. 
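    // If this load does fault, the VM's signal handler is expected to resume
    // execution at *continuation_pc rather than crash; c_rarg1 then still holds
    // errValue. Either way the value in c_rarg1 is copied into x10, the RISC-V
    // result register (what the "PPC_RET" note above refers to), before returning.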
2323 *fault_pc = __ pc(); 2324 switch (size) { 2325 case 4: 2326 // int32_t 2327 __ lw(c_rarg1, Address(c_rarg0, 0)); 2328 break; 2329 case 8: 2330 // int64_t 2331 __ ld(c_rarg1, Address(c_rarg0, 0)); 2332 break; 2333 default: 2334 ShouldNotReachHere(); 2335 } 2336 2337 // return errValue or *adr 2338 *continuation_pc = __ pc(); 2339 __ mv(x10, c_rarg1); 2340 __ ret(); 2341 } 2342 2343 // code for comparing 16 bytes of strings with same encoding 2344 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 2345 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, tmp1 = x28, tmp2 = x29, tmp4 = x7, tmp5 = x31; 2346 __ ld(tmp5, Address(str1)); 2347 __ addi(str1, str1, 8); 2348 __ xorr(tmp4, tmp1, tmp2); 2349 __ ld(cnt1, Address(str2)); 2350 __ addi(str2, str2, 8); 2351 __ bnez(tmp4, DIFF1); 2352 __ ld(tmp1, Address(str1)); 2353 __ addi(str1, str1, 8); 2354 __ xorr(tmp4, tmp5, cnt1); 2355 __ ld(tmp2, Address(str2)); 2356 __ addi(str2, str2, 8); 2357 __ bnez(tmp4, DIFF2); 2358 } 2359 2360 // code for comparing 8 characters of strings with Latin1 and Utf16 encoding 2361 void compare_string_8_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 2362 Label &DIFF2) { 2363 const Register strU = x12, curU = x7, strL = x29, tmp = x30; 2364 __ ld(tmpL, Address(strL)); 2365 __ addi(strL, strL, 8); 2366 __ ld(tmpU, Address(strU)); 2367 __ addi(strU, strU, 8); 2368 __ inflate_lo32(tmp, tmpL); 2369 __ mv(t0, tmp); 2370 __ xorr(tmp, curU, t0); 2371 __ bnez(tmp, DIFF2); 2372 2373 __ ld(curU, Address(strU)); 2374 __ addi(strU, strU, 8); 2375 __ inflate_hi32(tmp, tmpL); 2376 __ mv(t0, tmp); 2377 __ xorr(tmp, tmpU, t0); 2378 __ bnez(tmp, DIFF1); 2379 } 2380 2381 // x10 = result 2382 // x11 = str1 2383 // x12 = cnt1 2384 // x13 = str2 2385 // x14 = cnt2 2386 // x28 = tmp1 2387 // x29 = tmp2 2388 // x30 = tmp3 2389 address generate_compare_long_string_different_encoding(bool isLU) { 2390 __ align(CodeEntryAlignment); 2391 StubCodeMark mark(this, "StubRoutines", isLU ? "compare_long_string_different_encoding LU" : "compare_long_string_different_encoding UL"); 2392 address entry = __ pc(); 2393 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 2394 DONE, CALCULATE_DIFFERENCE; 2395 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14, 2396 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31; 2397 RegSet spilled_regs = RegSet::of(tmp4, tmp5); 2398 2399 // cnt2 == amount of characters left to compare 2400 // Check already loaded first 4 symbols 2401 __ inflate_lo32(tmp3, isLU ? tmp1 : tmp2); 2402 __ mv(isLU ? tmp1 : tmp2, tmp3); 2403 __ addi(str1, str1, isLU ? wordSize / 2 : wordSize); 2404 __ addi(str2, str2, isLU ? wordSize : wordSize / 2); 2405 __ sub(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 2406 __ push_reg(spilled_regs, sp); 2407 2408 if (isLU) { 2409 __ add(str1, str1, cnt2); 2410 __ shadd(str2, cnt2, str2, t0, 1); 2411 } else { 2412 __ shadd(str1, cnt2, str1, t0, 1); 2413 __ add(str2, str2, cnt2); 2414 } 2415 __ xorr(tmp3, tmp1, tmp2); 2416 __ mv(tmp5, tmp2); 2417 __ bnez(tmp3, CALCULATE_DIFFERENCE); 2418 2419 Register strU = isLU ? str2 : str1, 2420 strL = isLU ? str1 : str2, 2421 tmpU = isLU ? tmp5 : tmp1, // where to keep U for comparison 2422 tmpL = isLU ? 
tmp1 : tmp5; // where to keep L for comparison 2423 2424 __ sub(tmp2, strL, cnt2); // strL pointer to load from 2425 __ slli(t0, cnt2, 1); 2426 __ sub(cnt1, strU, t0); // strU pointer to load from 2427 2428 __ ld(tmp4, Address(cnt1)); 2429 __ addi(cnt1, cnt1, 8); 2430 __ beqz(cnt2, LOAD_LAST); // no characters left except last load 2431 __ sub(cnt2, cnt2, 16); 2432 __ bltz(cnt2, TAIL); 2433 __ bind(SMALL_LOOP); // smaller loop 2434 __ sub(cnt2, cnt2, 16); 2435 compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2); 2436 compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2); 2437 __ bgez(cnt2, SMALL_LOOP); 2438 __ addi(t0, cnt2, 16); 2439 __ beqz(t0, LOAD_LAST); 2440 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 2441 // Address of 8 bytes before last 4 characters in UTF-16 string 2442 __ shadd(cnt1, cnt2, cnt1, t0, 1); 2443 // Address of 16 bytes before last 4 characters in Latin1 string 2444 __ add(tmp2, tmp2, cnt2); 2445 __ ld(tmp4, Address(cnt1, -8)); 2446 // last 16 characters before last load 2447 compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2); 2448 compare_string_8_x_LU(tmpL, tmpU, DIFF1, DIFF2); 2449 __ j(LOAD_LAST); 2450 __ bind(DIFF2); 2451 __ mv(tmpU, tmp4); 2452 __ bind(DIFF1); 2453 __ mv(tmpL, t0); 2454 __ j(CALCULATE_DIFFERENCE); 2455 __ bind(LOAD_LAST); 2456 // Last 4 UTF-16 characters are already pre-loaded into tmp4 by compare_string_8_x_LU. 2457 // No need to load it again 2458 __ mv(tmpU, tmp4); 2459 __ ld(tmpL, Address(strL)); 2460 __ inflate_lo32(tmp3, tmpL); 2461 __ mv(tmpL, tmp3); 2462 __ xorr(tmp3, tmpU, tmpL); 2463 __ beqz(tmp3, DONE); 2464 2465 // Find the first different characters in the longwords and 2466 // compute their difference. 2467 __ bind(CALCULATE_DIFFERENCE); 2468 __ ctzc_bit(tmp4, tmp3); 2469 __ srl(tmp1, tmp1, tmp4); 2470 __ srl(tmp5, tmp5, tmp4); 2471 __ andi(tmp1, tmp1, 0xFFFF); 2472 __ andi(tmp5, tmp5, 0xFFFF); 2473 __ sub(result, tmp1, tmp5); 2474 __ bind(DONE); 2475 __ pop_reg(spilled_regs, sp); 2476 __ ret(); 2477 return entry; 2478 } 2479 2480 address generate_method_entry_barrier() { 2481 __ align(CodeEntryAlignment); 2482 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 2483 2484 Label deoptimize_label; 2485 2486 address start = __ pc(); 2487 2488 __ set_last_Java_frame(sp, fp, ra, t0); 2489 2490 __ enter(); 2491 __ add(t1, sp, wordSize); 2492 2493 __ sub(sp, sp, 4 * wordSize); 2494 2495 __ push_call_clobbered_registers(); 2496 2497 __ mv(c_rarg0, t1); 2498 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 2499 2500 __ reset_last_Java_frame(true); 2501 2502 __ mv(t0, x10); 2503 2504 __ pop_call_clobbered_registers(); 2505 2506 __ bnez(t0, deoptimize_label); 2507 2508 __ leave(); 2509 __ ret(); 2510 2511 __ BIND(deoptimize_label); 2512 2513 __ ld(t0, Address(sp, 0)); 2514 __ ld(fp, Address(sp, wordSize)); 2515 __ ld(ra, Address(sp, wordSize * 2)); 2516 __ ld(t1, Address(sp, wordSize * 3)); 2517 2518 __ mv(sp, t0); 2519 __ jr(t1); 2520 2521 return start; 2522 } 2523 2524 // x10 = result 2525 // x11 = str1 2526 // x12 = cnt1 2527 // x13 = str2 2528 // x14 = cnt2 2529 // x28 = tmp1 2530 // x29 = tmp2 2531 // x30 = tmp3 2532 // x31 = tmp4 2533 address generate_compare_long_string_same_encoding(bool isLL) { 2534 __ align(CodeEntryAlignment); 2535 StubCodeMark mark(this, "StubRoutines", isLL ? 
2536 "compare_long_string_same_encoding LL" : "compare_long_string_same_encoding UU"); 2537 address entry = __ pc(); 2538 Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL, 2539 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF; 2540 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14, 2541 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31; 2542 RegSet spilled_regs = RegSet::of(tmp4, tmp5); 2543 2544 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 2545 // update cnt2 counter with already loaded 8 bytes 2546 __ sub(cnt2, cnt2, wordSize / (isLL ? 1 : 2)); 2547 // update pointers, because of previous read 2548 __ add(str1, str1, wordSize); 2549 __ add(str2, str2, wordSize); 2550 // less than 16 bytes left? 2551 __ sub(cnt2, cnt2, isLL ? 16 : 8); 2552 __ push_reg(spilled_regs, sp); 2553 __ bltz(cnt2, TAIL); 2554 __ bind(SMALL_LOOP); 2555 compare_string_16_bytes_same(DIFF, DIFF2); 2556 __ sub(cnt2, cnt2, isLL ? 16 : 8); 2557 __ bgez(cnt2, SMALL_LOOP); 2558 __ bind(TAIL); 2559 __ addi(cnt2, cnt2, isLL ? 16 : 8); 2560 __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); 2561 __ sub(cnt2, cnt2, isLL ? 8 : 4); 2562 __ blez(cnt2, CHECK_LAST); 2563 __ xorr(tmp4, tmp1, tmp2); 2564 __ bnez(tmp4, DIFF); 2565 __ ld(tmp1, Address(str1)); 2566 __ addi(str1, str1, 8); 2567 __ ld(tmp2, Address(str2)); 2568 __ addi(str2, str2, 8); 2569 __ sub(cnt2, cnt2, isLL ? 8 : 4); 2570 __ bind(CHECK_LAST); 2571 if (!isLL) { 2572 __ add(cnt2, cnt2, cnt2); // now in bytes 2573 } 2574 __ xorr(tmp4, tmp1, tmp2); 2575 __ bnez(tmp4, DIFF); 2576 __ add(str1, str1, cnt2); 2577 __ ld(tmp5, Address(str1)); 2578 __ add(str2, str2, cnt2); 2579 __ ld(cnt1, Address(str2)); 2580 __ xorr(tmp4, tmp5, cnt1); 2581 __ beqz(tmp4, LENGTH_DIFF); 2582 // Find the first different characters in the longwords and 2583 // compute their difference. 2584 __ bind(DIFF2); 2585 __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb 2586 __ srl(tmp5, tmp5, tmp3); 2587 __ srl(cnt1, cnt1, tmp3); 2588 if (isLL) { 2589 __ andi(tmp5, tmp5, 0xFF); 2590 __ andi(cnt1, cnt1, 0xFF); 2591 } else { 2592 __ andi(tmp5, tmp5, 0xFFFF); 2593 __ andi(cnt1, cnt1, 0xFFFF); 2594 } 2595 __ sub(result, tmp5, cnt1); 2596 __ j(LENGTH_DIFF); 2597 __ bind(DIFF); 2598 __ ctzc_bit(tmp3, tmp4, isLL); // count zero from lsb to msb 2599 __ srl(tmp1, tmp1, tmp3); 2600 __ srl(tmp2, tmp2, tmp3); 2601 if (isLL) { 2602 __ andi(tmp1, tmp1, 0xFF); 2603 __ andi(tmp2, tmp2, 0xFF); 2604 } else { 2605 __ andi(tmp1, tmp1, 0xFFFF); 2606 __ andi(tmp2, tmp2, 0xFFFF); 2607 } 2608 __ sub(result, tmp1, tmp2); 2609 __ j(LENGTH_DIFF); 2610 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 2611 __ xorr(tmp4, tmp1, tmp2); 2612 __ bnez(tmp4, DIFF); 2613 __ bind(LENGTH_DIFF); 2614 __ pop_reg(spilled_regs, sp); 2615 __ ret(); 2616 return entry; 2617 } 2618 2619 void generate_compare_long_strings() { 2620 StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(true); 2621 StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(false); 2622 StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(true); 2623 StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(false); 2624 } 2625 2626 // x10 result 2627 // x11 src 2628 // x12 src count 2629 // x13 pattern 2630 // x14 pattern count 2631 address generate_string_indexof_linear(bool needle_isL, bool haystack_isL) 2632 { 2633 const char* stubName = needle_isL 2634 ? (haystack_isL ? 
"indexof_linear_ll" : "indexof_linear_ul") 2635 : "indexof_linear_uu"; 2636 __ align(CodeEntryAlignment); 2637 StubCodeMark mark(this, "StubRoutines", stubName); 2638 address entry = __ pc(); 2639 2640 int needle_chr_size = needle_isL ? 1 : 2; 2641 int haystack_chr_size = haystack_isL ? 1 : 2; 2642 int needle_chr_shift = needle_isL ? 0 : 1; 2643 int haystack_chr_shift = haystack_isL ? 0 : 1; 2644 bool isL = needle_isL && haystack_isL; 2645 // parameters 2646 Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14; 2647 // temporary registers 2648 Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25; 2649 // redefinitions 2650 Register ch1 = x28, ch2 = x29; 2651 RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29); 2652 2653 __ push_reg(spilled_regs, sp); 2654 2655 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 2656 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 2657 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 2658 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 2659 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 2660 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 2661 2662 __ ld(ch1, Address(needle)); 2663 __ ld(ch2, Address(haystack)); 2664 // src.length - pattern.length 2665 __ sub(haystack_len, haystack_len, needle_len); 2666 2667 // first is needle[0] 2668 __ andi(first, ch1, needle_isL ? 0xFF : 0xFFFF, first); 2669 uint64_t mask0101 = UCONST64(0x0101010101010101); 2670 uint64_t mask0001 = UCONST64(0x0001000100010001); 2671 __ mv(mask1, haystack_isL ? mask0101 : mask0001); 2672 __ mul(first, first, mask1); 2673 uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f); 2674 uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff); 2675 __ mv(mask2, haystack_isL ? 
mask7f7f : mask7fff); 2676 if (needle_isL != haystack_isL) { 2677 __ mv(tmp, ch1); 2678 } 2679 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size - 1); 2680 __ blez(haystack_len, L_SMALL); 2681 2682 if (needle_isL != haystack_isL) { 2683 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros); 2684 } 2685 // xorr, sub, orr, notr, andr 2686 // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i] 2687 // eg: 2688 // first: aa aa aa aa aa aa aa aa 2689 // ch2: aa aa li nx jd ka aa aa 2690 // match_mask: 80 80 00 00 00 00 80 80 2691 __ compute_match_mask(ch2, first, match_mask, mask1, mask2); 2692 2693 // search first char of needle, if success, goto L_HAS_ZERO; 2694 __ bnez(match_mask, L_HAS_ZERO); 2695 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size); 2696 __ add(result, result, wordSize / haystack_chr_size); 2697 __ add(haystack, haystack, wordSize); 2698 __ bltz(haystack_len, L_POST_LOOP); 2699 2700 __ bind(L_LOOP); 2701 __ ld(ch2, Address(haystack)); 2702 __ compute_match_mask(ch2, first, match_mask, mask1, mask2); 2703 __ bnez(match_mask, L_HAS_ZERO); 2704 2705 __ bind(L_LOOP_PROCEED); 2706 __ sub(haystack_len, haystack_len, wordSize / haystack_chr_size); 2707 __ add(haystack, haystack, wordSize); 2708 __ add(result, result, wordSize / haystack_chr_size); 2709 __ bgez(haystack_len, L_LOOP); 2710 2711 __ bind(L_POST_LOOP); 2712 __ mv(ch2, -wordSize / haystack_chr_size); 2713 __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check 2714 __ ld(ch2, Address(haystack)); 2715 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift); 2716 __ neg(haystack_len, haystack_len); 2717 __ xorr(ch2, first, ch2); 2718 __ sub(match_mask, ch2, mask1); 2719 __ orr(ch2, ch2, mask2); 2720 __ mv(trailing_zeros, -1); // all bits set 2721 __ j(L_SMALL_PROCEED); 2722 2723 __ align(OptoLoopAlignment); 2724 __ bind(L_SMALL); 2725 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift); 2726 __ neg(haystack_len, haystack_len); 2727 if (needle_isL != haystack_isL) { 2728 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros); 2729 } 2730 __ xorr(ch2, first, ch2); 2731 __ sub(match_mask, ch2, mask1); 2732 __ orr(ch2, ch2, mask2); 2733 __ mv(trailing_zeros, -1); // all bits set 2734 2735 __ bind(L_SMALL_PROCEED); 2736 __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits. 2737 __ notr(ch2, ch2); 2738 __ andr(match_mask, match_mask, ch2); 2739 __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check 2740 __ beqz(match_mask, NOMATCH); 2741 2742 __ bind(L_SMALL_HAS_ZERO_LOOP); 2743 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, ch2, tmp); // count trailing zeros 2744 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2745 __ mv(ch2, wordSize / haystack_chr_size); 2746 __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2); 2747 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2748 __ mv(trailing_zeros, wordSize / haystack_chr_size); 2749 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2750 2751 __ bind(L_SMALL_CMP_LOOP); 2752 __ shadd(first, trailing_zeros, needle, first, needle_chr_shift); 2753 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift); 2754 needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first)); 2755 haystack_isL ? 
__ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2)); 2756 __ add(trailing_zeros, trailing_zeros, 1); 2757 __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP); 2758 __ beq(first, ch2, L_SMALL_CMP_LOOP); 2759 2760 __ bind(L_SMALL_CMP_LOOP_NOMATCH); 2761 __ beqz(match_mask, NOMATCH); 2762 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2); 2763 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2764 __ add(result, result, 1); 2765 __ add(haystack, haystack, haystack_chr_size); 2766 __ j(L_SMALL_HAS_ZERO_LOOP); 2767 2768 __ align(OptoLoopAlignment); 2769 __ bind(L_SMALL_CMP_LOOP_LAST_CMP); 2770 __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2771 __ j(DONE); 2772 2773 __ align(OptoLoopAlignment); 2774 __ bind(L_SMALL_CMP_LOOP_LAST_CMP2); 2775 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2776 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH); 2777 __ j(DONE); 2778 2779 __ align(OptoLoopAlignment); 2780 __ bind(L_HAS_ZERO); 2781 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, tmp, ch2); 2782 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2783 __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2); 2784 __ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits) 2785 __ sub(result, result, 1); // array index from 0, so result -= 1 2786 2787 __ bind(L_HAS_ZERO_LOOP); 2788 __ mv(needle_len, wordSize / haystack_chr_size); 2789 __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2); 2790 __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2); 2791 // load next 8 bytes from haystack, and increase result index 2792 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2793 __ add(result, result, 1); 2794 __ mv(trailing_zeros, wordSize / haystack_chr_size); 2795 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH); 2796 2797 // compare one char 2798 __ bind(L_CMP_LOOP); 2799 __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift); 2800 needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len)); 2801 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift); 2802 haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2)); 2803 __ add(trailing_zeros, trailing_zeros, 1); // next char index 2804 __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2); 2805 __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP); 2806 __ beq(needle_len, ch2, L_CMP_LOOP); 2807 2808 __ bind(L_CMP_LOOP_NOMATCH); 2809 __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH); 2810 __ ctzc_bit(trailing_zeros, match_mask, haystack_isL, needle_len, ch2); // find next "first" char index 2811 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15); 2812 __ add(haystack, haystack, haystack_chr_size); 2813 __ j(L_HAS_ZERO_LOOP); 2814 2815 __ align(OptoLoopAlignment); 2816 __ bind(L_CMP_LOOP_LAST_CMP); 2817 __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH); 2818 __ j(DONE); 2819 2820 __ align(OptoLoopAlignment); 2821 __ bind(L_CMP_LOOP_LAST_CMP2); 2822 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL); 2823 __ add(result, result, 1); 2824 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH); 2825 __ j(DONE); 2826 2827 __ align(OptoLoopAlignment); 2828 __ bind(L_HAS_ZERO_LOOP_NOMATCH); 2829 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 2830 // L_HAS_ZERO block. 
Byte octet was analyzed in L_HAS_ZERO_LOOP, 2831 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 2832 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 2833 // result by analyzed characters value, so, we can just reset lower bits 2834 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 2835 // 2) restore needle_len and haystack_len values from "compressed" haystack_len 2836 // 3) advance haystack value to represent next haystack octet. result & 7/3 is 2837 // index of last analyzed substring inside current octet. So, haystack in at 2838 // respective start address. We need to advance it to next octet 2839 __ andi(match_mask, result, wordSize / haystack_chr_size - 1); 2840 __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2); 2841 __ andi(result, result, haystack_isL ? -8 : -4); 2842 __ slli(tmp, match_mask, haystack_chr_shift); 2843 __ sub(haystack, haystack, tmp); 2844 __ addw(haystack_len, haystack_len, zr); 2845 __ j(L_LOOP_PROCEED); 2846 2847 __ align(OptoLoopAlignment); 2848 __ bind(NOMATCH); 2849 __ mv(result, -1); 2850 2851 __ bind(DONE); 2852 __ pop_reg(spilled_regs, sp); 2853 __ ret(); 2854 return entry; 2855 } 2856 2857 void generate_string_indexof_stubs() 2858 { 2859 StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 2860 StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 2861 StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 2862 } 2863 2864 #ifdef COMPILER2 2865 address generate_mulAdd() 2866 { 2867 __ align(CodeEntryAlignment); 2868 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 2869 2870 address entry = __ pc(); 2871 2872 const Register out = x10; 2873 const Register in = x11; 2874 const Register offset = x12; 2875 const Register len = x13; 2876 const Register k = x14; 2877 const Register tmp = x28; 2878 2879 BLOCK_COMMENT("Entry:"); 2880 __ enter(); 2881 __ mul_add(out, in, offset, len, k, tmp); 2882 __ leave(); 2883 __ ret(); 2884 2885 return entry; 2886 } 2887 2888 /** 2889 * Arguments: 2890 * 2891 * Input: 2892 * c_rarg0 - x address 2893 * c_rarg1 - x length 2894 * c_rarg2 - y address 2895 * c_rarg3 - y length 2896 * c_rarg4 - z address 2897 * c_rarg5 - z length 2898 */ 2899 address generate_multiplyToLen() 2900 { 2901 __ align(CodeEntryAlignment); 2902 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 2903 address entry = __ pc(); 2904 2905 const Register x = x10; 2906 const Register xlen = x11; 2907 const Register y = x12; 2908 const Register ylen = x13; 2909 const Register z = x14; 2910 const Register zlen = x15; 2911 2912 const Register tmp1 = x16; 2913 const Register tmp2 = x17; 2914 const Register tmp3 = x7; 2915 const Register tmp4 = x28; 2916 const Register tmp5 = x29; 2917 const Register tmp6 = x30; 2918 const Register tmp7 = x31; 2919 2920 BLOCK_COMMENT("Entry:"); 2921 __ enter(); // required for proper stackwalking of RuntimeStub frame 2922 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 2923 __ leave(); // required for proper stackwalking of RuntimeStub frame 2924 __ ret(); 2925 2926 return entry; 2927 } 2928 2929 address generate_squareToLen() 2930 { 2931 __ align(CodeEntryAlignment); 2932 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 2933 address entry = __ pc(); 2934 2935 const Register x = x10; 2936 const Register xlen = x11; 2937 const Register z = x12; 2938 const Register zlen = x13; 2939 const 
Register y = x14; // == x 2940 const Register ylen = x15; // == xlen 2941 2942 const Register tmp1 = x16; 2943 const Register tmp2 = x17; 2944 const Register tmp3 = x7; 2945 const Register tmp4 = x28; 2946 const Register tmp5 = x29; 2947 const Register tmp6 = x30; 2948 const Register tmp7 = x31; 2949 2950 BLOCK_COMMENT("Entry:"); 2951 __ enter(); 2952 __ mv(y, x); 2953 __ mv(ylen, xlen); 2954 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 2955 __ leave(); 2956 __ ret(); 2957 2958 return entry; 2959 } 2960 2961 // Arguments: 2962 // 2963 // Input: 2964 // c_rarg0 - newArr address 2965 // c_rarg1 - oldArr address 2966 // c_rarg2 - newIdx 2967 // c_rarg3 - shiftCount 2968 // c_rarg4 - numIter 2969 // 2970 address generate_bigIntegerLeftShift() { 2971 __ align(CodeEntryAlignment); 2972 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 2973 address entry = __ pc(); 2974 2975 Label loop, exit; 2976 2977 Register newArr = c_rarg0; 2978 Register oldArr = c_rarg1; 2979 Register newIdx = c_rarg2; 2980 Register shiftCount = c_rarg3; 2981 Register numIter = c_rarg4; 2982 2983 Register shiftRevCount = c_rarg5; 2984 Register oldArrNext = t1; 2985 2986 __ beqz(numIter, exit); 2987 __ shadd(newArr, newIdx, newArr, t0, 2); 2988 2989 __ li(shiftRevCount, 32); 2990 __ sub(shiftRevCount, shiftRevCount, shiftCount); 2991 2992 __ bind(loop); 2993 __ addi(oldArrNext, oldArr, 4); 2994 __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4); 2995 __ vle32_v(v0, oldArr); 2996 __ vle32_v(v4, oldArrNext); 2997 __ vsll_vx(v0, v0, shiftCount); 2998 __ vsrl_vx(v4, v4, shiftRevCount); 2999 __ vor_vv(v0, v0, v4); 3000 __ vse32_v(v0, newArr); 3001 __ sub(numIter, numIter, t0); 3002 __ shadd(oldArr, t0, oldArr, t1, 2); 3003 __ shadd(newArr, t0, newArr, t1, 2); 3004 __ bnez(numIter, loop); 3005 3006 __ bind(exit); 3007 __ ret(); 3008 3009 return entry; 3010 } 3011 3012 // Arguments: 3013 // 3014 // Input: 3015 // c_rarg0 - newArr address 3016 // c_rarg1 - oldArr address 3017 // c_rarg2 - newIdx 3018 // c_rarg3 - shiftCount 3019 // c_rarg4 - numIter 3020 // 3021 address generate_bigIntegerRightShift() { 3022 __ align(CodeEntryAlignment); 3023 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 3024 address entry = __ pc(); 3025 3026 Label loop, exit; 3027 3028 Register newArr = c_rarg0; 3029 Register oldArr = c_rarg1; 3030 Register newIdx = c_rarg2; 3031 Register shiftCount = c_rarg3; 3032 Register numIter = c_rarg4; 3033 Register idx = numIter; 3034 3035 Register shiftRevCount = c_rarg5; 3036 Register oldArrNext = c_rarg6; 3037 Register newArrCur = t0; 3038 Register oldArrCur = t1; 3039 3040 __ beqz(idx, exit); 3041 __ shadd(newArr, newIdx, newArr, t0, 2); 3042 3043 __ li(shiftRevCount, 32); 3044 __ sub(shiftRevCount, shiftRevCount, shiftCount); 3045 3046 __ bind(loop); 3047 __ vsetvli(t0, idx, Assembler::e32, Assembler::m4); 3048 __ sub(idx, idx, t0); 3049 __ shadd(oldArrNext, idx, oldArr, t1, 2); 3050 __ shadd(newArrCur, idx, newArr, t1, 2); 3051 __ addi(oldArrCur, oldArrNext, 4); 3052 __ vle32_v(v0, oldArrCur); 3053 __ vle32_v(v4, oldArrNext); 3054 __ vsrl_vx(v0, v0, shiftCount); 3055 __ vsll_vx(v4, v4, shiftRevCount); 3056 __ vor_vv(v0, v0, v4); 3057 __ vse32_v(v0, newArrCur); 3058 __ bnez(idx, loop); 3059 3060 __ bind(exit); 3061 __ ret(); 3062 3063 return entry; 3064 } 3065 #endif 3066 3067 #ifdef COMPILER2 3068 class MontgomeryMultiplyGenerator : public MacroAssembler { 3069 3070 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, 
Rb, Rm, Rn, 3071 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj; 3072 3073 RegSet _toSave; 3074 bool _squaring; 3075 3076 public: 3077 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 3078 : MacroAssembler(as->code()), _squaring(squaring) { 3079 3080 // Register allocation 3081 3082 Register reg = c_rarg0; 3083 Pa_base = reg; // Argument registers 3084 if (squaring) { 3085 Pb_base = Pa_base; 3086 } else { 3087 Pb_base = ++reg; 3088 } 3089 Pn_base = ++reg; 3090 Rlen= ++reg; 3091 inv = ++reg; 3092 Pm_base = ++reg; 3093 3094 // Working registers: 3095 Ra = ++reg; // The current digit of a, b, n, and m. 3096 Rb = ++reg; 3097 Rm = ++reg; 3098 Rn = ++reg; 3099 3100 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 3101 Pb = ++reg; 3102 Pm = ++reg; 3103 Pn = ++reg; 3104 3105 tmp0 = ++reg; // Three registers which form a 3106 tmp1 = ++reg; // triple-precision accumuator. 3107 tmp2 = ++reg; 3108 3109 Ri = x6; // Inner and outer loop indexes. 3110 Rj = x7; 3111 3112 Rhi_ab = x28; // Product registers: low and high parts 3113 Rlo_ab = x29; // of a*b and m*n. 3114 Rhi_mn = x30; 3115 Rlo_mn = x31; 3116 3117 // x18 and up are callee-saved. 3118 _toSave = RegSet::range(x18, reg) + Pm_base; 3119 } 3120 3121 private: 3122 void save_regs() { 3123 push_reg(_toSave, sp); 3124 } 3125 3126 void restore_regs() { 3127 pop_reg(_toSave, sp); 3128 } 3129 3130 template <typename T> 3131 void unroll_2(Register count, T block) { 3132 Label loop, end, odd; 3133 beqz(count, end); 3134 andi(t0, count, 0x1); 3135 bnez(t0, odd); 3136 align(16); 3137 bind(loop); 3138 (this->*block)(); 3139 bind(odd); 3140 (this->*block)(); 3141 addi(count, count, -2); 3142 bgtz(count, loop); 3143 bind(end); 3144 } 3145 3146 template <typename T> 3147 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 3148 Label loop, end, odd; 3149 beqz(count, end); 3150 andi(tmp, count, 0x1); 3151 bnez(tmp, odd); 3152 align(16); 3153 bind(loop); 3154 (this->*block)(d, s, tmp); 3155 bind(odd); 3156 (this->*block)(d, s, tmp); 3157 addi(count, count, -2); 3158 bgtz(count, loop); 3159 bind(end); 3160 } 3161 3162 void pre1(RegisterOrConstant i) { 3163 block_comment("pre1"); 3164 // Pa = Pa_base; 3165 // Pb = Pb_base + i; 3166 // Pm = Pm_base; 3167 // Pn = Pn_base + i; 3168 // Ra = *Pa; 3169 // Rb = *Pb; 3170 // Rm = *Pm; 3171 // Rn = *Pn; 3172 if (i.is_register()) { 3173 slli(t0, i.as_register(), LogBytesPerWord); 3174 } else { 3175 mv(t0, i.as_constant()); 3176 slli(t0, t0, LogBytesPerWord); 3177 } 3178 3179 mv(Pa, Pa_base); 3180 add(Pb, Pb_base, t0); 3181 mv(Pm, Pm_base); 3182 add(Pn, Pn_base, t0); 3183 3184 ld(Ra, Address(Pa)); 3185 ld(Rb, Address(Pb)); 3186 ld(Rm, Address(Pm)); 3187 ld(Rn, Address(Pn)); 3188 3189 // Zero the m*n result. 3190 mv(Rhi_mn, zr); 3191 mv(Rlo_mn, zr); 3192 } 3193 3194 // The core multiply-accumulate step of a Montgomery 3195 // multiplication. The idea is to schedule operations as a 3196 // pipeline so that instructions with long latencies (loads and 3197 // multiplies) have time to complete before their results are 3198 // used. This most benefits in-order implementations of the 3199 // architecture but out-of-order ones also benefit. 
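    // Informally, one step() performs the following pair of multiply-accumulates
    // (the generated code staggers the two products across iterations to hide
    // multiply and load latency; tmp0..tmp2 hold a 192-bit accumulator and t0 is
    // used as the carry):
    //
    //   MACC(Ra, Rb, tmp0, tmp1, tmp2);   Ra = *++Pa;   Rb = *--Pb;
    //   MACC(Rm, Rn, tmp0, tmp1, tmp2);   Rm = *++Pm;   Rn = *--Pn;
    //
    // where MACC(x, y, ...) adds the full 128-bit product x*y into the
    // accumulator, which is exactly what the mulhu/mul pair plus acc() do.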
3200 void step() { 3201 block_comment("step"); 3202 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3203 // Ra = *++Pa; 3204 // Rb = *--Pb; 3205 mulhu(Rhi_ab, Ra, Rb); 3206 mul(Rlo_ab, Ra, Rb); 3207 addi(Pa, Pa, wordSize); 3208 ld(Ra, Address(Pa)); 3209 addi(Pb, Pb, -wordSize); 3210 ld(Rb, Address(Pb)); 3211 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the 3212 // previous iteration. 3213 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3214 // Rm = *++Pm; 3215 // Rn = *--Pn; 3216 mulhu(Rhi_mn, Rm, Rn); 3217 mul(Rlo_mn, Rm, Rn); 3218 addi(Pm, Pm, wordSize); 3219 ld(Rm, Address(Pm)); 3220 addi(Pn, Pn, -wordSize); 3221 ld(Rn, Address(Pn)); 3222 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3223 } 3224 3225 void post1() { 3226 block_comment("post1"); 3227 3228 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3229 // Ra = *++Pa; 3230 // Rb = *--Pb; 3231 mulhu(Rhi_ab, Ra, Rb); 3232 mul(Rlo_ab, Ra, Rb); 3233 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3234 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3235 3236 // *Pm = Rm = tmp0 * inv; 3237 mul(Rm, tmp0, inv); 3238 sd(Rm, Address(Pm)); 3239 3240 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3241 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3242 mulhu(Rhi_mn, Rm, Rn); 3243 3244 #ifndef PRODUCT 3245 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply"); 3246 { 3247 mul(Rlo_mn, Rm, Rn); 3248 add(Rlo_mn, tmp0, Rlo_mn); 3249 Label ok; 3250 beqz(Rlo_mn, ok); 3251 stop("broken Montgomery multiply"); 3252 bind(ok); 3253 } 3254 #endif 3255 // We have very carefully set things up so that 3256 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate 3257 // the lower half of Rm * Rn because we know the result already: 3258 // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff 3259 // tmp0 != 0. So, rather than do a mul and an cad we just set 3260 // the carry flag iff tmp0 is nonzero. 3261 // 3262 // mul(Rlo_mn, Rm, Rn); 3263 // cad(zr, tmp0, Rlo_mn); 3264 addi(t0, tmp0, -1); 3265 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero 3266 cadc(tmp0, tmp1, Rhi_mn, t0); 3267 adc(tmp1, tmp2, zr, t0); 3268 mv(tmp2, zr); 3269 } 3270 3271 void pre2(Register i, Register len) { 3272 block_comment("pre2"); 3273 // Pa = Pa_base + i-len; 3274 // Pb = Pb_base + len; 3275 // Pm = Pm_base + i-len; 3276 // Pn = Pn_base + len; 3277 3278 sub(Rj, i, len); 3279 // Rj == i-len 3280 3281 // Ra as temp register 3282 slli(Ra, Rj, LogBytesPerWord); 3283 add(Pa, Pa_base, Ra); 3284 add(Pm, Pm_base, Ra); 3285 slli(Ra, len, LogBytesPerWord); 3286 add(Pb, Pb_base, Ra); 3287 add(Pn, Pn_base, Ra); 3288 3289 // Ra = *++Pa; 3290 // Rb = *--Pb; 3291 // Rm = *++Pm; 3292 // Rn = *--Pn; 3293 add(Pa, Pa, wordSize); 3294 ld(Ra, Address(Pa)); 3295 add(Pb, Pb, -wordSize); 3296 ld(Rb, Address(Pb)); 3297 add(Pm, Pm, wordSize); 3298 ld(Rm, Address(Pm)); 3299 add(Pn, Pn, -wordSize); 3300 ld(Rn, Address(Pn)); 3301 3302 mv(Rhi_mn, zr); 3303 mv(Rlo_mn, zr); 3304 } 3305 3306 void post2(Register i, Register len) { 3307 block_comment("post2"); 3308 sub(Rj, i, len); 3309 3310 cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part 3311 3312 // As soon as we know the least significant digit of our result, 3313 // store it. 
3314 // Pm_base[i-len] = tmp0; 3315 // Rj as temp register 3316 slli(Rj, Rj, LogBytesPerWord); 3317 add(Rj, Pm_base, Rj); 3318 sd(tmp0, Address(Rj)); 3319 3320 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3321 cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part 3322 adc(tmp1, tmp2, zr, t0); 3323 mv(tmp2, zr); 3324 } 3325 3326 // A carry in tmp0 after Montgomery multiplication means that we 3327 // should subtract multiples of n from our result in m. We'll 3328 // keep doing that until there is no carry. 3329 void normalize(Register len) { 3330 block_comment("normalize"); 3331 // while (tmp0) 3332 // tmp0 = sub(Pm_base, Pn_base, tmp0, len); 3333 Label loop, post, again; 3334 Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now 3335 beqz(tmp0, post); { 3336 bind(again); { 3337 mv(i, zr); 3338 mv(cnt, len); 3339 slli(Rn, i, LogBytesPerWord); 3340 add(Rm, Pm_base, Rn); 3341 ld(Rm, Address(Rm)); 3342 add(Rn, Pn_base, Rn); 3343 ld(Rn, Address(Rn)); 3344 li(t0, 1); // set carry flag, i.e. no borrow 3345 align(16); 3346 bind(loop); { 3347 notr(Rn, Rn); 3348 add(Rm, Rm, t0); 3349 add(Rm, Rm, Rn); 3350 sltu(t0, Rm, Rn); 3351 slli(Rn, i, LogBytesPerWord); // Rn as temp register 3352 add(Rn, Pm_base, Rn); 3353 sd(Rm, Address(Rn)); 3354 add(i, i, 1); 3355 slli(Rn, i, LogBytesPerWord); 3356 add(Rm, Pm_base, Rn); 3357 ld(Rm, Address(Rm)); 3358 add(Rn, Pn_base, Rn); 3359 ld(Rn, Address(Rn)); 3360 sub(cnt, cnt, 1); 3361 } bnez(cnt, loop); 3362 addi(tmp0, tmp0, -1); 3363 add(tmp0, tmp0, t0); 3364 } bnez(tmp0, again); 3365 } bind(post); 3366 } 3367 3368 // Move memory at s to d, reversing words. 3369 // Increments d to end of copied memory 3370 // Destroys tmp1, tmp2 3371 // Preserves len 3372 // Leaves s pointing to the address which was in d at start 3373 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 3374 assert(tmp1 < x28 && tmp2 < x28, "register corruption"); 3375 3376 slli(tmp1, len, LogBytesPerWord); 3377 add(s, s, tmp1); 3378 mv(tmp1, len); 3379 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 3380 slli(tmp1, len, LogBytesPerWord); 3381 sub(s, d, tmp1); 3382 } 3383 // [63...0] -> [31...0][63...32] 3384 void reverse1(Register d, Register s, Register tmp) { 3385 addi(s, s, -wordSize); 3386 ld(tmp, Address(s)); 3387 ror_imm(tmp, tmp, 32, t0); 3388 sd(tmp, Address(d)); 3389 addi(d, d, wordSize); 3390 } 3391 3392 void step_squaring() { 3393 // An extra ACC 3394 step(); 3395 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3396 } 3397 3398 void last_squaring(Register i) { 3399 Label dont; 3400 // if ((i & 1) == 0) { 3401 andi(t0, i, 0x1); 3402 bnez(t0, dont); { 3403 // MACC(Ra, Rb, tmp0, tmp1, tmp2); 3404 // Ra = *++Pa; 3405 // Rb = *--Pb; 3406 mulhu(Rhi_ab, Ra, Rb); 3407 mul(Rlo_ab, Ra, Rb); 3408 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2); 3409 } bind(dont); 3410 } 3411 3412 void extra_step_squaring() { 3413 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3414 3415 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3416 // Rm = *++Pm; 3417 // Rn = *--Pn; 3418 mulhu(Rhi_mn, Rm, Rn); 3419 mul(Rlo_mn, Rm, Rn); 3420 addi(Pm, Pm, wordSize); 3421 ld(Rm, Address(Pm)); 3422 addi(Pn, Pn, -wordSize); 3423 ld(Rn, Address(Pn)); 3424 } 3425 3426 void post1_squaring() { 3427 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n 3428 3429 // *Pm = Rm = tmp0 * inv; 3430 mul(Rm, tmp0, inv); 3431 sd(Rm, Address(Pm)); 3432 3433 // MACC(Rm, Rn, tmp0, tmp1, tmp2); 3434 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; 3435 mulhu(Rhi_mn, Rm, Rn); 3436 3437 #ifndef PRODUCT 
3438 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply"); 3439 { 3440 mul(Rlo_mn, Rm, Rn); 3441 add(Rlo_mn, tmp0, Rlo_mn); 3442 Label ok; 3443 beqz(Rlo_mn, ok); { 3444 stop("broken Montgomery multiply"); 3445 } bind(ok); 3446 } 3447 #endif 3448 // We have very carefully set things up so that 3449 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate 3450 // the lower half of Rm * Rn because we know the result already: 3451 // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff 3452 // tmp0 != 0. So, rather than do a mul and a cad we just set 3453 // the carry flag iff tmp0 is nonzero. 3454 // 3455 // mul(Rlo_mn, Rm, Rn); 3456 // cad(zr, tmp, Rlo_mn); 3457 addi(t0, tmp0, -1); 3458 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero 3459 cadc(tmp0, tmp1, Rhi_mn, t0); 3460 adc(tmp1, tmp2, zr, t0); 3461 mv(tmp2, zr); 3462 } 3463 3464 // use t0 as carry 3465 void acc(Register Rhi, Register Rlo, 3466 Register tmp0, Register tmp1, Register tmp2) { 3467 cad(tmp0, tmp0, Rlo, t0); 3468 cadc(tmp1, tmp1, Rhi, t0); 3469 adc(tmp2, tmp2, zr, t0); 3470 } 3471 3472 public: 3473 /** 3474 * Fast Montgomery multiplication. The derivation of the 3475 * algorithm is in A Cryptographic Library for the Motorola 3476 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3477 * 3478 * Arguments: 3479 * 3480 * Inputs for multiplication: 3481 * c_rarg0 - int array elements a 3482 * c_rarg1 - int array elements b 3483 * c_rarg2 - int array elements n (the modulus) 3484 * c_rarg3 - int length 3485 * c_rarg4 - int inv 3486 * c_rarg5 - int array elements m (the result) 3487 * 3488 * Inputs for squaring: 3489 * c_rarg0 - int array elements a 3490 * c_rarg1 - int array elements n (the modulus) 3491 * c_rarg2 - int length 3492 * c_rarg3 - int inv 3493 * c_rarg4 - int array elements m (the result) 3494 * 3495 */ 3496 address generate_multiply() { 3497 Label argh, nothing; 3498 bind(argh); 3499 stop("MontgomeryMultiply total_allocation must be <= 8192"); 3500 3501 align(CodeEntryAlignment); 3502 address entry = pc(); 3503 3504 beqz(Rlen, nothing); 3505 3506 enter(); 3507 3508 // Make room. 3509 li(Ra, 512); 3510 bgt(Rlen, Ra, argh); 3511 slli(Ra, Rlen, exact_log2(4 * sizeof(jint))); 3512 sub(Ra, sp, Ra); 3513 andi(sp, Ra, -2 * wordSize); 3514 3515 srliw(Rlen, Rlen, 1); // length in longwords = len/2 3516 3517 { 3518 // Copy input args, reversing as we go. We use Ra as a 3519 // temporary variable. 3520 reverse(Ra, Pa_base, Rlen, Ri, Rj); 3521 if (!_squaring) 3522 reverse(Ra, Pb_base, Rlen, Ri, Rj); 3523 reverse(Ra, Pn_base, Rlen, Ri, Rj); 3524 } 3525 3526 // Push all call-saved registers and also Pm_base which we'll need 3527 // at the end. 
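    // A rough C-level sketch of what the two loops below compute (the arrays were
    // word-reversed above, so logical index j here means the j-th least
    // significant word; "acc" is the tmp0..tmp2 accumulator and MACC is as
    // described at step()):
    //
    //   for (i = 0; i < len; i++) {                       // low half of the result
    //     for all j in the column: acc += a[j]*b[i-j] + m[j]*n[i-j];
    //     m[i] = acc.low * inv;                           // makes the low word cancel
    //     acc += m[i]*n[0];  acc >>= 64;                  // post1()
    //   }
    //   for (i = len; i < 2*len; i++) {                   // high half of the result
    //     for all j in the column: acc += a[j]*b[i-j] + m[j]*n[i-j];
    //     m[i-len] = acc.low;  acc >>= 64;                // post2()
    //   }
    //   while (a carry remains) m -= n;                   // normalize()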
3528 save_regs(); 3529 3530 #ifndef PRODUCT 3531 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 3532 { 3533 ld(Rn, Address(Pn_base)); 3534 mul(Rlo_mn, Rn, inv); 3535 li(t0, -1); 3536 Label ok; 3537 beq(Rlo_mn, t0, ok); 3538 stop("broken inverse in Montgomery multiply"); 3539 bind(ok); 3540 } 3541 #endif 3542 3543 mv(Pm_base, Ra); 3544 3545 mv(tmp0, zr); 3546 mv(tmp1, zr); 3547 mv(tmp2, zr); 3548 3549 block_comment("for (int i = 0; i < len; i++) {"); 3550 mv(Ri, zr); { 3551 Label loop, end; 3552 bge(Ri, Rlen, end); 3553 3554 bind(loop); 3555 pre1(Ri); 3556 3557 block_comment(" for (j = i; j; j--) {"); { 3558 mv(Rj, Ri); 3559 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3560 } block_comment(" } // j"); 3561 3562 post1(); 3563 addw(Ri, Ri, 1); 3564 blt(Ri, Rlen, loop); 3565 bind(end); 3566 block_comment("} // i"); 3567 } 3568 3569 block_comment("for (int i = len; i < 2*len; i++) {"); 3570 mv(Ri, Rlen); { 3571 Label loop, end; 3572 slli(t0, Rlen, 1); 3573 bge(Ri, t0, end); 3574 3575 bind(loop); 3576 pre2(Ri, Rlen); 3577 3578 block_comment(" for (j = len*2-i-1; j; j--) {"); { 3579 slliw(Rj, Rlen, 1); 3580 subw(Rj, Rj, Ri); 3581 subw(Rj, Rj, 1); 3582 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3583 } block_comment(" } // j"); 3584 3585 post2(Ri, Rlen); 3586 addw(Ri, Ri, 1); 3587 slli(t0, Rlen, 1); 3588 blt(Ri, t0, loop); 3589 bind(end); 3590 } 3591 block_comment("} // i"); 3592 3593 normalize(Rlen); 3594 3595 mv(Ra, Pm_base); // Save Pm_base in Ra 3596 restore_regs(); // Restore caller's Pm_base 3597 3598 // Copy our result into caller's Pm_base 3599 reverse(Pm_base, Ra, Rlen, Ri, Rj); 3600 3601 leave(); 3602 bind(nothing); 3603 ret(); 3604 3605 return entry; 3606 } 3607 3608 /** 3609 * 3610 * Arguments: 3611 * 3612 * Inputs: 3613 * c_rarg0 - int array elements a 3614 * c_rarg1 - int array elements n (the modulus) 3615 * c_rarg2 - int length 3616 * c_rarg3 - int inv 3617 * c_rarg4 - int array elements m (the result) 3618 * 3619 */ 3620 address generate_square() { 3621 Label argh; 3622 bind(argh); 3623 stop("MontgomeryMultiply total_allocation must be <= 8192"); 3624 3625 align(CodeEntryAlignment); 3626 address entry = pc(); 3627 3628 enter(); 3629 3630 // Make room. 3631 li(Ra, 512); 3632 bgt(Rlen, Ra, argh); 3633 slli(Ra, Rlen, exact_log2(4 * sizeof(jint))); 3634 sub(Ra, sp, Ra); 3635 andi(sp, Ra, -2 * wordSize); 3636 3637 srliw(Rlen, Rlen, 1); // length in longwords = len/2 3638 3639 { 3640 // Copy input args, reversing as we go. We use Ra as a 3641 // temporary variable. 3642 reverse(Ra, Pa_base, Rlen, Ri, Rj); 3643 reverse(Ra, Pn_base, Rlen, Ri, Rj); 3644 } 3645 3646 // Push all call-saved registers and also Pm_base which we'll need 3647 // at the end. 
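    // Squaring variant, informally: since a == b, every cross product
    // a[j]*a[i-j] with j != i-j occurs twice in column i, so step_squaring()
    // accumulates each product twice and last_squaring() adds the single middle
    // term only when i is even; roughly half the multiplies of the general
    // multiply are needed.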
3648 save_regs(); 3649 3650 mv(Pm_base, Ra); 3651 3652 mv(tmp0, zr); 3653 mv(tmp1, zr); 3654 mv(tmp2, zr); 3655 3656 block_comment("for (int i = 0; i < len; i++) {"); 3657 mv(Ri, zr); { 3658 Label loop, end; 3659 bind(loop); 3660 bge(Ri, Rlen, end); 3661 3662 pre1(Ri); 3663 3664 block_comment("for (j = (i+1)/2; j; j--) {"); { 3665 addi(Rj, Ri, 1); 3666 srliw(Rj, Rj, 1); 3667 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 3668 } block_comment(" } // j"); 3669 3670 last_squaring(Ri); 3671 3672 block_comment(" for (j = i/2; j; j--) {"); { 3673 srliw(Rj, Ri, 1); 3674 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 3675 } block_comment(" } // j"); 3676 3677 post1_squaring(); 3678 addi(Ri, Ri, 1); 3679 blt(Ri, Rlen, loop); 3680 3681 bind(end); 3682 block_comment("} // i"); 3683 } 3684 3685 block_comment("for (int i = len; i < 2*len; i++) {"); 3686 mv(Ri, Rlen); { 3687 Label loop, end; 3688 bind(loop); 3689 slli(t0, Rlen, 1); 3690 bge(Ri, t0, end); 3691 3692 pre2(Ri, Rlen); 3693 3694 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 3695 slli(Rj, Rlen, 1); 3696 sub(Rj, Rj, Ri); 3697 sub(Rj, Rj, 1); 3698 srliw(Rj, Rj, 1); 3699 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 3700 } block_comment(" } // j"); 3701 3702 last_squaring(Ri); 3703 3704 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 3705 slli(Rj, Rlen, 1); 3706 sub(Rj, Rj, Ri); 3707 srliw(Rj, Rj, 1); 3708 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 3709 } block_comment(" } // j"); 3710 3711 post2(Ri, Rlen); 3712 addi(Ri, Ri, 1); 3713 slli(t0, Rlen, 1); 3714 blt(Ri, t0, loop); 3715 3716 bind(end); 3717 block_comment("} // i"); 3718 } 3719 3720 normalize(Rlen); 3721 3722 mv(Ra, Pm_base); // Save Pm_base in Ra 3723 restore_regs(); // Restore caller's Pm_base 3724 3725 // Copy our result into caller's Pm_base 3726 reverse(Pm_base, Ra, Rlen, Ri, Rj); 3727 3728 leave(); 3729 ret(); 3730 3731 return entry; 3732 } 3733 }; 3734 #endif // COMPILER2 3735 3736 // Continuation point for throwing of implicit exceptions that are 3737 // not handled in the current activation. Fabricates an exception 3738 // oop and initiates normal exception dispatching in this 3739 // frame. Since we need to preserve callee-saved values (currently 3740 // only for C2, but done for C1 as well) we need a callee-saved oop 3741 // map and therefore have to make these stubs into RuntimeStubs 3742 // rather than BufferBlobs. If the compiler needs all registers to 3743 // be preserved between the fault point and the exception handler 3744 // then it must assume responsibility for that in 3745 // AbstractCompiler::continuation_for_implicit_null_exception or 3746 // continuation_for_implicit_division_by_zero_exception. All other 3747 // implicit exceptions (e.g., NullPointerException or 3748 // AbstractMethodError on entry) are either at call sites or 3749 // otherwise assume that stack unwinding will be initiated, so 3750 // caller saved registers were assumed volatile in the compiler. 3751 3752 #undef __ 3753 #define __ masm-> 3754 3755 address generate_throw_exception(const char* name, 3756 address runtime_entry, 3757 Register arg1 = noreg, 3758 Register arg2 = noreg) { 3759 // Information about frame layout at time of blocking runtime call. 3760 // Note that we only have to preserve callee-saved registers since 3761 // the compilers are responsible for supplying a continuation point 3762 // if they expect all registers to be preserved. 3763 // n.b. 
riscv asserts that frame::arg_reg_save_area_bytes == 0
3764     assert_cond(runtime_entry != NULL);
3765     enum layout {
3766       fp_off = 0,
3767       fp_off2,
3768       return_off,
3769       return_off2,
3770       framesize // inclusive of return address
3771     };
3772
3773     const int insts_size = 512;
3774     const int locs_size = 64;
3775
3776     CodeBuffer code(name, insts_size, locs_size);
3777     OopMapSet* oop_maps = new OopMapSet();
3778     MacroAssembler* masm = new MacroAssembler(&code);
3779     assert_cond(oop_maps != NULL && masm != NULL);
3780
3781     address start = __ pc();
3782
3783     // This is an inlined and slightly modified version of call_VM
3784     // which has the ability to fetch the return PC out of
3785     // thread-local storage and also sets up last_Java_sp slightly
3786     // differently from the real call_VM.
3787
3788     __ enter(); // Save FP and RA before call
3789
3790     assert(is_even(framesize / 2), "sp not 16-byte aligned");
3791
3792     // ra and fp are already in place
3793     __ addi(sp, fp, 0 - ((unsigned)framesize << LogBytesPerInt)); // prolog
3794
3795     int frame_complete = __ pc() - start;
3796
3797     // Set up last_Java_sp and last_Java_fp
3798     address the_pc = __ pc();
3799     __ set_last_Java_frame(sp, fp, the_pc, t0);
3800
3801     // Call runtime
3802     if (arg1 != noreg) {
3803       assert(arg2 != c_rarg1, "clobbered");
3804       __ mv(c_rarg1, arg1);
3805     }
3806     if (arg2 != noreg) {
3807       __ mv(c_rarg2, arg2);
3808     }
3809     __ mv(c_rarg0, xthread);
3810     BLOCK_COMMENT("call runtime_entry");
3811     int32_t offset = 0;
3812     __ movptr_with_offset(t0, runtime_entry, offset);
3813     __ jalr(x1, t0, offset);
3814
3815     // Generate oop map
3816     OopMap* map = new OopMap(framesize, 0);
3817     assert_cond(map != NULL);
3818
3819     oop_maps->add_gc_map(the_pc - start, map);
3820
3821     __ reset_last_Java_frame(true);
3822
3823     __ leave();
3824
3825     // check for pending exceptions
3826 #ifdef ASSERT
3827     Label L;
3828     __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
3829     __ bnez(t0, L);
3830     __ should_not_reach_here();
3831     __ bind(L);
3832 #endif // ASSERT
3833     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3834
3835
3836     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3837     RuntimeStub* stub =
3838       RuntimeStub::new_runtime_stub(name,
3839                                     &code,
3840                                     frame_complete,
3841                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3842                                     oop_maps, false);
3843     assert(stub != NULL, "create runtime stub fail!");
3844     return stub->entry_point();
3845   }
3846
3847   // Initialization
3848   void generate_initial() {
3849     // Generate initial stubs and initialize the entry points
3850
3851     // Entry points that exist on all platforms. Note: This is code
3852     // that could be shared among different platforms; however, the
3853     // benefit seems to be smaller than the disadvantage of having a
3854     // much more complicated generator structure. See also comment in
3855     // stubRoutines.hpp.
3856
3857     StubRoutines::_forward_exception_entry = generate_forward_exception();
3858
3859     StubRoutines::_call_stub_entry =
3860       generate_call_stub(StubRoutines::_call_stub_return_address);
3861
3862     // is referenced by megamorphic call
3863     StubRoutines::_catch_exception_entry = generate_catch_exception();
3864
3865     // Build this early so it's available for the interpreter.
3866 StubRoutines::_throw_StackOverflowError_entry = 3867 generate_throw_exception("StackOverflowError throw_exception", 3868 CAST_FROM_FN_PTR(address, 3869 SharedRuntime::throw_StackOverflowError)); 3870 StubRoutines::_throw_delayed_StackOverflowError_entry = 3871 generate_throw_exception("delayed StackOverflowError throw_exception", 3872 CAST_FROM_FN_PTR(address, 3873 SharedRuntime::throw_delayed_StackOverflowError)); 3874 // Safefetch stubs. 3875 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 3876 &StubRoutines::_safefetch32_fault_pc, 3877 &StubRoutines::_safefetch32_continuation_pc); 3878 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 3879 &StubRoutines::_safefetchN_fault_pc, 3880 &StubRoutines::_safefetchN_continuation_pc); 3881 } 3882 3883 void generate_all() { 3884 // support for verify_oop (must happen after universe_init) 3885 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 3886 StubRoutines::_throw_AbstractMethodError_entry = 3887 generate_throw_exception("AbstractMethodError throw_exception", 3888 CAST_FROM_FN_PTR(address, 3889 SharedRuntime:: 3890 throw_AbstractMethodError)); 3891 3892 StubRoutines::_throw_IncompatibleClassChangeError_entry = 3893 generate_throw_exception("IncompatibleClassChangeError throw_exception", 3894 CAST_FROM_FN_PTR(address, 3895 SharedRuntime:: 3896 throw_IncompatibleClassChangeError)); 3897 3898 StubRoutines::_throw_NullPointerException_at_call_entry = 3899 generate_throw_exception("NullPointerException at call throw_exception", 3900 CAST_FROM_FN_PTR(address, 3901 SharedRuntime:: 3902 throw_NullPointerException_at_call)); 3903 // arraycopy stubs used by compilers 3904 generate_arraycopy_stubs(); 3905 3906 #ifdef COMPILER2 3907 if (UseMulAddIntrinsic) { 3908 StubRoutines::_mulAdd = generate_mulAdd(); 3909 } 3910 3911 if (UseMultiplyToLenIntrinsic) { 3912 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 3913 } 3914 3915 if (UseSquareToLenIntrinsic) { 3916 StubRoutines::_squareToLen = generate_squareToLen(); 3917 } 3918 3919 if (UseMontgomeryMultiplyIntrinsic) { 3920 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 3921 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 3922 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 3923 } 3924 3925 if (UseMontgomerySquareIntrinsic) { 3926 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 3927 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 3928 StubRoutines::_montgomerySquare = g.generate_square(); 3929 } 3930 3931 if (UseRVVForBigIntegerShiftIntrinsics) { 3932 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 3933 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 3934 } 3935 #endif 3936 3937 generate_compare_long_strings(); 3938 3939 generate_string_indexof_stubs(); 3940 3941 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 3942 if (bs_nm != NULL) { 3943 StubRoutines::riscv::_method_entry_barrier = generate_method_entry_barrier(); 3944 } 3945 3946 StubRoutines::riscv::set_completed(); 3947 } 3948 3949 public: 3950 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 3951 if (all) { 3952 generate_all(); 3953 } else { 3954 generate_initial(); 3955 } 3956 } 3957 3958 ~StubGenerator() {} 3959 }; // end class declaration 3960 3961 #define UCM_TABLE_MAX_ENTRIES 8 3962 void StubGenerator_generate(CodeBuffer* code, bool all) { 3963 if (UnsafeCopyMemory::_table == NULL) { 3964 
UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES); 3965 } 3966 3967 StubGenerator g(code, all); 3968 }
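
// ---------------------------------------------------------------------------
// Reference sketch for the Montgomery stubs above (illustrative only).
//
// generate_multiply() produces, in effect, m = a * b * 2^(-64*len) mod n on
// arrays of 64-bit limbs of length len (the longword length, i.e. half the
// Java int length), least-significant limb first, as produced by the
// reverse() copies above; inv is -n[0]^-1 mod 2^64, which is what the
// "broken inverse" assert checks. The commented C sketch below shows the
// same arithmetic in the simpler operand-scanning (CIOS-style) order rather
// than the product-scanning order the generated code uses. The names
// montgomery_multiply_ref and MAX_LONGWORDS are placeholders introduced for
// this sketch only; if uncommented it is plain C that relies on the
// GCC/Clang unsigned __int128 extension.
//
//   typedef unsigned long long julong;              // one 64-bit limb
//   #define MAX_LONGWORDS 256                        // 512 ints max, see the 8192-byte bound above
//
//   static void montgomery_multiply_ref(const julong a[], const julong b[],
//                                       const julong n[], julong m[],
//                                       julong inv, int len) {
//     julong t[MAX_LONGWORDS + 2] = {0};             // running accumulator
//     for (int i = 0; i < len; i++) {
//       unsigned __int128 acc = 0;
//       for (int j = 0; j < len; j++) {              // t += a * b[i]
//         acc += (unsigned __int128)a[j] * b[i] + t[j];
//         t[j] = (julong)acc;
//         acc >>= 64;
//       }
//       acc += t[len];
//       t[len] = (julong)acc;
//       t[len + 1] += (julong)(acc >> 64);
//
//       julong u = t[0] * inv;                       // chosen so t + u*n is divisible by 2^64
//       acc = 0;
//       for (int j = 0; j < len; j++) {              // t += u * n
//         acc += (unsigned __int128)u * n[j] + t[j];
//         t[j] = (julong)acc;
//         acc >>= 64;
//       }
//       acc += t[len];
//       t[len] = (julong)acc;
//       t[len + 1] += (julong)(acc >> 64);
//
//       for (int j = 0; j <= len; j++) {             // t[0] is now zero: divide by 2^64
//         t[j] = t[j + 1];
//       }
//       t[len + 1] = 0;
//     }
//     // The result is in [0, 2n); one final conditional subtraction of n
//     // (omitted here) completes the reduction.
//     for (int i = 0; i < len; i++) {
//       m[i] = t[i];
//     }
//   }
//
// generate_square() computes the same thing with b == a; its split j loops
// (step_squaring / extra_step_squaring) avoid computing each symmetric
// cross product twice.
// ---------------------------------------------------------------------------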