1 /*
   2  * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2025, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "compiler/oopMap.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/universe.hpp"
  34 #include "nativeInst_riscv.hpp"
  35 #include "oops/instanceOop.hpp"
  36 #include "oops/method.hpp"
  37 #include "oops/objArrayKlass.hpp"
  38 #include "oops/oop.inline.hpp"
  39 #include "prims/methodHandles.hpp"
  40 #include "prims/upcallLinker.hpp"
  41 #include "runtime/continuation.hpp"
  42 #include "runtime/continuationEntry.inline.hpp"
  43 #include "runtime/frame.inline.hpp"
  44 #include "runtime/handles.inline.hpp"
  45 #include "runtime/javaThread.hpp"
  46 #include "runtime/sharedRuntime.hpp"
  47 #include "runtime/stubCodeGenerator.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "utilities/align.hpp"
  50 #include "utilities/powerOfTwo.hpp"
  51 #ifdef COMPILER2
  52 #include "opto/runtime.hpp"
  53 #endif
  54 
  55 // Declaration and definition of StubGenerator (no .hpp file).
  56 // For a more detailed description of the stub routine structure
  57 // see the comment in stubRoutines.hpp
  58 
  59 #undef __
  60 #define __ _masm->
  61 
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67 
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75 #ifdef PRODUCT
  76 #define inc_counter_np(counter) ((void)0)
  77 #else
  78   void inc_counter_np_(uint& counter) {
  79     __ incrementw(ExternalAddress((address)&counter));
  80   }
  81 #define inc_counter_np(counter) \
  82   BLOCK_COMMENT("inc_counter " #counter); \
  83   inc_counter_np_(counter);
  84 #endif
  85 
  86   // Call stubs are used to call Java from C
  87   //
  88   // Arguments:
  89   //    c_rarg0:   call wrapper address                   address
  90   //    c_rarg1:   result                                 address
  91   //    c_rarg2:   result type                            BasicType
  92   //    c_rarg3:   method                                 Method*
  93   //    c_rarg4:   (interpreter) entry point              address
  94   //    c_rarg5:   parameters                             intptr_t*
  95   //    c_rarg6:   parameter size (in words)              int
  96   //    c_rarg7:   thread                                 Thread*
  97   //
  98   // There is no return from the stub itself as any Java result
  99   // is written to result
 100   //
 101   // we save x1 (ra) as the return PC at the base of the frame and
 102   // link x8 (fp) below it as the frame pointer installing sp (x2)
 103   // into fp.
 104   //
 105   // we save x10-x17, which accounts for all the c arguments.
 106   //
 107   // TODO: strictly do we need to save them all? they are treated as
 108   // volatile by C so could we omit saving the ones we are going to
 109   // place in global registers (thread? method?) or those we only use
 110   // during setup of the Java call?
 111   //
 112   // we don't need to save x5 which C uses as an indirect result location
 113   // return register.
 114   //
 115   // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
 116   // volatile
 117   //
 118   // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
 119   // registers and C expects to be callee-save
 120   //
 121   // so the stub frame looks like this when we enter Java code
 122   //
 123   //     [ return_from_Java     ] <--- sp
 124   //     [ argument word n      ]
 125   //      ...
 126   // -35 [ argument word 1      ]
 127   // -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call
 128   // -33 [ saved f27            ]
 129   // -32 [ saved f26            ]
 130   // -31 [ saved f25            ]
 131   // -30 [ saved f24            ]
 132   // -29 [ saved f23            ]
 133   // -28 [ saved f22            ]
 134   // -27 [ saved f21            ]
 135   // -26 [ saved f20            ]
 136   // -25 [ saved f19            ]
 137   // -24 [ saved f18            ]
 138   // -23 [ saved f9             ]
 139   // -22 [ saved f8             ]
 140   // -21 [ saved x27            ]
 141   // -20 [ saved x26            ]
 142   // -19 [ saved x25            ]
 143   // -18 [ saved x24            ]
 144   // -17 [ saved x23            ]
 145   // -16 [ saved x22            ]
 146   // -15 [ saved x21            ]
 147   // -14 [ saved x20            ]
 148   // -13 [ saved x19            ]
 149   // -12 [ saved x18            ]
 150   // -11 [ saved x9             ]
 151   // -10 [ call wrapper   (x10) ]
 152   //  -9 [ result         (x11) ]
 153   //  -8 [ result type    (x12) ]
 154   //  -7 [ method         (x13) ]
 155   //  -6 [ entry point    (x14) ]
 156   //  -5 [ parameters     (x15) ]
 157   //  -4 [ parameter size (x16) ]
 158   //  -3 [ thread         (x17) ]
 159   //  -2 [ saved fp       (x8)  ]
 160   //  -1 [ saved ra       (x1)  ]
 161   //   0 [                      ] <--- fp == saved sp (x2)
 162 
 163   // Call stub stack layout word offsets from fp
 164   enum call_stub_layout {
 165     sp_after_call_off  = -34,
 166 
 167     frm_off            = sp_after_call_off,
 168     f27_off            = -33,
 169     f26_off            = -32,
 170     f25_off            = -31,
 171     f24_off            = -30,
 172     f23_off            = -29,
 173     f22_off            = -28,
 174     f21_off            = -27,
 175     f20_off            = -26,
 176     f19_off            = -25,
 177     f18_off            = -24,
 178     f9_off             = -23,
 179     f8_off             = -22,
 180 
 181     x27_off            = -21,
 182     x26_off            = -20,
 183     x25_off            = -19,
 184     x24_off            = -18,
 185     x23_off            = -17,
 186     x22_off            = -16,
 187     x21_off            = -15,
 188     x20_off            = -14,
 189     x19_off            = -13,
 190     x18_off            = -12,
 191     x9_off             = -11,
 192 
 193     call_wrapper_off   = -10,
 194     result_off         = -9,
 195     result_type_off    = -8,
 196     method_off         = -7,
 197     entry_point_off    = -6,
 198     parameters_off     = -5,
 199     parameter_size_off = -4,
 200     thread_off         = -3,
 201     fp_f               = -2,
 202     retaddr_off        = -1,
 203   };
 204 
 205   address generate_call_stub(address& return_address) {
 206     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 207            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 208            "adjust this code");
 209 
 210     StubId stub_id = StubId::stubgen_call_stub_id;
 211     StubCodeMark mark(this, stub_id);
 212     address start = __ pc();
 213 
 214     const Address sp_after_call (fp, sp_after_call_off  * wordSize);
 215 
 216     const Address frm_save      (fp, frm_off           * wordSize);
 217     const Address call_wrapper  (fp, call_wrapper_off   * wordSize);
 218     const Address result        (fp, result_off         * wordSize);
 219     const Address result_type   (fp, result_type_off    * wordSize);
 220     const Address method        (fp, method_off         * wordSize);
 221     const Address entry_point   (fp, entry_point_off    * wordSize);
 222     const Address parameters    (fp, parameters_off     * wordSize);
 223     const Address parameter_size(fp, parameter_size_off * wordSize);
 224 
 225     const Address thread        (fp, thread_off         * wordSize);
 226 
 227     const Address f27_save      (fp, f27_off            * wordSize);
 228     const Address f26_save      (fp, f26_off            * wordSize);
 229     const Address f25_save      (fp, f25_off            * wordSize);
 230     const Address f24_save      (fp, f24_off            * wordSize);
 231     const Address f23_save      (fp, f23_off            * wordSize);
 232     const Address f22_save      (fp, f22_off            * wordSize);
 233     const Address f21_save      (fp, f21_off            * wordSize);
 234     const Address f20_save      (fp, f20_off            * wordSize);
 235     const Address f19_save      (fp, f19_off            * wordSize);
 236     const Address f18_save      (fp, f18_off            * wordSize);
 237     const Address f9_save       (fp, f9_off             * wordSize);
 238     const Address f8_save       (fp, f8_off             * wordSize);
 239 
 240     const Address x27_save      (fp, x27_off            * wordSize);
 241     const Address x26_save      (fp, x26_off            * wordSize);
 242     const Address x25_save      (fp, x25_off            * wordSize);
 243     const Address x24_save      (fp, x24_off            * wordSize);
 244     const Address x23_save      (fp, x23_off            * wordSize);
 245     const Address x22_save      (fp, x22_off            * wordSize);
 246     const Address x21_save      (fp, x21_off            * wordSize);
 247     const Address x20_save      (fp, x20_off            * wordSize);
 248     const Address x19_save      (fp, x19_off            * wordSize);
 249     const Address x18_save      (fp, x18_off            * wordSize);
 250 
 251     const Address x9_save       (fp, x9_off             * wordSize);
 252 
 253     // stub code
 254 
 255     address riscv_entry = __ pc();
 256 
 257     // set up frame and move sp to end of save area
 258     __ enter();
 259     __ addi(sp, fp, sp_after_call_off * wordSize);
 260 
 261     // save register parameters and Java temporary/global registers
 262     // n.b. we save thread even though it gets installed in
 263     // xthread because we want to sanity check tp later
 264     __ sd(c_rarg7, thread);
 265     __ sw(c_rarg6, parameter_size);
 266     __ sd(c_rarg5, parameters);
 267     __ sd(c_rarg4, entry_point);
 268     __ sd(c_rarg3, method);
 269     __ sd(c_rarg2, result_type);
 270     __ sd(c_rarg1, result);
 271     __ sd(c_rarg0, call_wrapper);
 272 
 273     __ sd(x9, x9_save);
 274 
 275     __ sd(x18, x18_save);
 276     __ sd(x19, x19_save);
 277     __ sd(x20, x20_save);
 278     __ sd(x21, x21_save);
 279     __ sd(x22, x22_save);
 280     __ sd(x23, x23_save);
 281     __ sd(x24, x24_save);
 282     __ sd(x25, x25_save);
 283     __ sd(x26, x26_save);
 284     __ sd(x27, x27_save);
 285 
 286     __ fsd(f8,  f8_save);
 287     __ fsd(f9,  f9_save);
 288     __ fsd(f18, f18_save);
 289     __ fsd(f19, f19_save);
 290     __ fsd(f20, f20_save);
 291     __ fsd(f21, f21_save);
 292     __ fsd(f22, f22_save);
 293     __ fsd(f23, f23_save);
 294     __ fsd(f24, f24_save);
 295     __ fsd(f25, f25_save);
 296     __ fsd(f26, f26_save);
 297     __ fsd(f27, f27_save);
 298 
 299     __ frrm(t0);
 300     __ sd(t0, frm_save);
 301     // Set frm to the state we need. We do want Round to Nearest. We
 302     // don't want non-IEEE rounding modes.
 303     Label skip_fsrmi;
 304     guarantee(__ RoundingMode::rne == 0, "must be");
 305     __ beqz(t0, skip_fsrmi);
 306     __ fsrmi(__ RoundingMode::rne);
 307     __ bind(skip_fsrmi);
 308 
 309     // install Java thread in global register now we have saved
 310     // whatever value it held
 311     __ mv(xthread, c_rarg7);
 312 
 313     // And method
 314     __ mv(xmethod, c_rarg3);
 315 
 316     // set up the heapbase register
 317     __ reinit_heapbase();
 318 
 319 #ifdef ASSERT
 320     // make sure we have no pending exceptions
 321     {
 322       Label L;
 323       __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
 324       __ beqz(t0, L);
 325       __ stop("StubRoutines::call_stub: entered with pending exception");
 326       __ BIND(L);
 327     }
 328 #endif
 329     // pass parameters if any
 330     __ mv(esp, sp);
 331     __ slli(t0, c_rarg6, LogBytesPerWord);
 332     __ sub(t0, sp, t0); // Move SP out of the way
 333     __ andi(sp, t0, -2 * wordSize);
 334 
 335     BLOCK_COMMENT("pass parameters if any");
 336     Label parameters_done;
 337     // parameter count is still in c_rarg6
 338     // and parameter pointer identifying param 1 is in c_rarg5
 339     __ beqz(c_rarg6, parameters_done);
 340 
 341     address loop = __ pc();
 342     __ ld(t0, Address(c_rarg5, 0));
 343     __ addi(c_rarg5, c_rarg5, wordSize);
 344     __ subi(c_rarg6, c_rarg6, 1);
 345     __ push_reg(t0);
 346     __ bgtz(c_rarg6, loop);
 347 
 348     __ BIND(parameters_done);
 349 
 350     // call Java entry -- passing methdoOop, and current sp
 351     //      xmethod: Method*
 352     //      x19_sender_sp: sender sp
 353     BLOCK_COMMENT("call Java function");
 354     __ mv(x19_sender_sp, sp);
 355     __ jalr(c_rarg4);
 356 
 357     // save current address for use by exception handling code
 358 
 359     return_address = __ pc();
 360 
 361     // store result depending on type (everything that is not
 362     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 363     // n.b. this assumes Java returns an integral result in x10
 364     // and a floating result in j_farg0
 365     __ ld(j_rarg2, result);
 366     Label is_long, is_float, is_double, exit;
 367     __ ld(j_rarg1, result_type);
 368     __ mv(t0, (u1)T_OBJECT);
 369     __ beq(j_rarg1, t0, is_long);
 370     __ mv(t0, (u1)T_LONG);
 371     __ beq(j_rarg1, t0, is_long);
 372     __ mv(t0, (u1)T_FLOAT);
 373     __ beq(j_rarg1, t0, is_float);
 374     __ mv(t0, (u1)T_DOUBLE);
 375     __ beq(j_rarg1, t0, is_double);
 376 
 377     // handle T_INT case
 378     __ sw(x10, Address(j_rarg2));
 379 
 380     __ BIND(exit);
 381 
 382     // pop parameters
 383     __ addi(esp, fp, sp_after_call_off * wordSize);
 384 
 385 #ifdef ASSERT
 386     // verify that threads correspond
 387     {
 388       Label L, S;
 389       __ ld(t0, thread);
 390       __ bne(xthread, t0, S);
 391       __ get_thread(t0);
 392       __ beq(xthread, t0, L);
 393       __ BIND(S);
 394       __ stop("StubRoutines::call_stub: threads must correspond");
 395       __ BIND(L);
 396     }
 397 #endif
 398 
 399     __ pop_cont_fastpath(xthread);
 400 
 401     // restore callee-save registers
 402     __ fld(f27, f27_save);
 403     __ fld(f26, f26_save);
 404     __ fld(f25, f25_save);
 405     __ fld(f24, f24_save);
 406     __ fld(f23, f23_save);
 407     __ fld(f22, f22_save);
 408     __ fld(f21, f21_save);
 409     __ fld(f20, f20_save);
 410     __ fld(f19, f19_save);
 411     __ fld(f18, f18_save);
 412     __ fld(f9,  f9_save);
 413     __ fld(f8,  f8_save);
 414 
 415     __ ld(x27, x27_save);
 416     __ ld(x26, x26_save);
 417     __ ld(x25, x25_save);
 418     __ ld(x24, x24_save);
 419     __ ld(x23, x23_save);
 420     __ ld(x22, x22_save);
 421     __ ld(x21, x21_save);
 422     __ ld(x20, x20_save);
 423     __ ld(x19, x19_save);
 424     __ ld(x18, x18_save);
 425 
 426     __ ld(x9, x9_save);
 427 
 428     // restore frm
 429     Label skip_fsrm;
 430     __ ld(t0, frm_save);
 431     __ frrm(t1);
 432     __ beq(t0, t1, skip_fsrm);
 433     __ fsrm(t0);
 434     __ bind(skip_fsrm);
 435 
 436     __ ld(c_rarg0, call_wrapper);
 437     __ ld(c_rarg1, result);
 438     __ ld(c_rarg2, result_type);
 439     __ ld(c_rarg3, method);
 440     __ ld(c_rarg4, entry_point);
 441     __ ld(c_rarg5, parameters);
 442     __ ld(c_rarg6, parameter_size);
 443     __ ld(c_rarg7, thread);
 444 
 445     // leave frame and return to caller
 446     __ leave();
 447     __ ret();
 448 
 449     // handle return types different from T_INT
 450 
 451     __ BIND(is_long);
 452     __ sd(x10, Address(j_rarg2, 0));
 453     __ j(exit);
 454 
 455     __ BIND(is_float);
 456     __ fsw(j_farg0, Address(j_rarg2, 0), t0);
 457     __ j(exit);
 458 
 459     __ BIND(is_double);
 460     __ fsd(j_farg0, Address(j_rarg2, 0), t0);
 461     __ j(exit);
 462 
 463     return start;
 464   }
 465 
 466   // Return point for a Java call if there's an exception thrown in
 467   // Java code.  The exception is caught and transformed into a
 468   // pending exception stored in JavaThread that can be tested from
 469   // within the VM.
 470   //
 471   // Note: Usually the parameters are removed by the callee. In case
 472   // of an exception crossing an activation frame boundary, that is
 473   // not the case if the callee is compiled code => need to setup the
 474   // sp.
 475   //
 476   // x10: exception oop
 477 
 478   address generate_catch_exception() {
 479     StubId stub_id = StubId::stubgen_catch_exception_id;
 480     StubCodeMark mark(this, stub_id);
 481     address start = __ pc();
 482 
 483     // same as in generate_call_stub():
 484     const Address thread(fp, thread_off * wordSize);
 485 
 486 #ifdef ASSERT
 487     // verify that threads correspond
 488     {
 489       Label L, S;
 490       __ ld(t0, thread);
 491       __ bne(xthread, t0, S);
 492       __ get_thread(t0);
 493       __ beq(xthread, t0, L);
 494       __ bind(S);
 495       __ stop("StubRoutines::catch_exception: threads must correspond");
 496       __ bind(L);
 497     }
 498 #endif
 499 
 500     // set pending exception
 501     __ verify_oop(x10);
 502 
 503     __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
 504     __ mv(t0, (address)__FILE__);
 505     __ sd(t0, Address(xthread, Thread::exception_file_offset()));
 506     __ mv(t0, (int)__LINE__);
 507     __ sw(t0, Address(xthread, Thread::exception_line_offset()));
 508 
 509     // complete return to VM
 510     assert(StubRoutines::_call_stub_return_address != nullptr,
 511            "_call_stub_return_address must have been generated before");
 512     __ j(RuntimeAddress(StubRoutines::_call_stub_return_address));
 513 
 514     return start;
 515   }
 516 
 517   // Continuation point for runtime calls returning with a pending
 518   // exception.  The pending exception check happened in the runtime
 519   // or native call stub.  The pending exception in Thread is
 520   // converted into a Java-level exception.
 521   //
 522   // Contract with Java-level exception handlers:
 523   // x10: exception
 524   // x13: throwing pc
 525   //
 526   // NOTE: At entry of this stub, exception-pc must be in RA !!
 527 
  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
 530 
 531   address generate_forward_exception() {
 532     StubId stub_id = StubId::stubgen_forward_exception_id;
 533     StubCodeMark mark(this, stub_id);
 534     address start = __ pc();
 535 
 536     // Upon entry, RA points to the return address returning into
 537     // Java (interpreted or compiled) code; i.e., the return address
 538     // becomes the throwing pc.
 539     //
 540     // Arguments pushed before the runtime call are still on the stack
 541     // but the exception handler will reset the stack pointer ->
 542     // ignore them.  A potential result in registers can be ignored as
 543     // well.
 544 
 545 #ifdef ASSERT
 546     // make sure this code is only executed if there is a pending exception
 547     {
 548       Label L;
 549       __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
 550       __ bnez(t0, L);
 551       __ stop("StubRoutines::forward exception: no pending exception (1)");
 552       __ bind(L);
 553     }
 554 #endif
 555 
 556     // compute exception handler into x9
 557 
 558     // call the VM to find the handler address associated with the
 559     // caller address. pass thread in x10 and caller pc (ret address)
 560     // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
 561     // the stack.
 562     __ mv(c_rarg1, ra);
 563     // ra will be trashed by the VM call so we move it to x9
 564     // (callee-saved) because we also need to pass it to the handler
 565     // returned by this call.
 566     __ mv(x9, ra);
 567     BLOCK_COMMENT("call exception_handler_for_return_address");
 568     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 569                          SharedRuntime::exception_handler_for_return_address),
 570                     xthread, c_rarg1);
 571     // we should not really care that ra is no longer the callee
 572     // address. we saved the value the handler needs in x9 so we can
 573     // just copy it to x13. however, the C2 handler will push its own
 574     // frame and then calls into the VM and the VM code asserts that
 575     // the PC for the frame above the handler belongs to a compiled
 576     // Java method. So, we restore ra here to satisfy that assert.
 577     __ mv(ra, x9);
 578     // setup x10 & x13 & clear pending exception
 579     __ mv(x13, x9);
 580     __ mv(x9, x10);
 581     __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
 582     __ sd(zr, Address(xthread, Thread::pending_exception_offset()));
 583 
 584 #ifdef ASSERT
 585     // make sure exception is set
 586     {
 587       Label L;
 588       __ bnez(x10, L);
 589       __ stop("StubRoutines::forward exception: no pending exception (2)");
 590       __ bind(L);
 591     }
 592 #endif
 593 
 594     // continue at exception handler
 595     // x10: exception
 596     // x13: throwing pc
 597     // x9: exception handler
 598     __ verify_oop(x10);
 599     __ jr(x9);
 600 
 601     return start;
 602   }
 603 
 604   // Non-destructive plausibility checks for oops
 605   //
 606   // Arguments:
 607   //    x10: oop to verify
 608   //    t0: error message
 609   //
 610   // Stack after saving c_rarg3:
 611   //    [tos + 0]: saved c_rarg3
 612   //    [tos + 1]: saved c_rarg2
 613   //    [tos + 2]: saved ra
 614   //    [tos + 3]: saved t1
 615   //    [tos + 4]: saved x10
 616   //    [tos + 5]: saved t0
 617   address generate_verify_oop() {
 618 
 619     StubId stub_id = StubId::stubgen_verify_oop_id;
 620     StubCodeMark mark(this, stub_id);
 621     address start = __ pc();
 622 
 623     Label exit, error;
 624 
 625     __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3
 626 
 627     __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 628     __ ld(c_rarg3, Address(c_rarg2));
 629     __ addi(c_rarg3, c_rarg3, 1);
 630     __ sd(c_rarg3, Address(c_rarg2));
 631 
 632     // object is in x10
 633     // make sure object is 'reasonable'
 634     __ beqz(x10, exit); // if obj is null it is OK
 635 
 636     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 637     bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error);
 638 
 639     // return if everything seems ok
 640     __ bind(exit);
 641 
 642     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp);  // pop c_rarg2 and c_rarg3
 643     __ ret();
 644 
 645     // handle errors
 646     __ bind(error);
 647     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3
 648 
 649     __ push_reg(RegSet::range(x0, x31), sp);
 650     // debug(char* msg, int64_t pc, int64_t regs[])
 651     __ mv(c_rarg0, t0);             // pass address of error message
 652     __ mv(c_rarg1, ra);             // pass return address
 653     __ mv(c_rarg2, sp);             // pass address of regs on stack
 654 #ifndef PRODUCT
 655     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 656 #endif
 657     BLOCK_COMMENT("call MacroAssembler::debug");
 658     __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 659     __ ebreak();
 660 
 661     return start;
 662   }
 663 
 664   // The inner part of zero_words().
 665   //
 666   // Inputs:
 667   // x28: the HeapWord-aligned base address of an array to zero.
 668   // x29: the count in HeapWords, x29 > 0.
 669   //
 670   // Returns x28 and x29, adjusted for the caller to clear.
 671   // x28: the base address of the tail of words left to clear.
 672   // x29: the number of words in the tail.
 673   //      x29 < MacroAssembler::zero_words_block_size.
 674 
 675   address generate_zero_blocks() {
 676     Label done;
 677 
 678     const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;
 679 
 680     __ align(CodeEntryAlignment);
 681     StubId stub_id = StubId::stubgen_zero_blocks_id;
 682     StubCodeMark mark(this, stub_id);
 683     address start = __ pc();
 684 
 685     if (UseBlockZeroing) {
 686       int zicboz_block_size = VM_Version::zicboz_block_size.value();
 687       // Ensure count >= 2 * zicboz_block_size so that it still deserves
 688       // a cbo.zero after alignment.
 689       Label small;
 690       int low_limit = MAX2(2 * zicboz_block_size, (int)BlockZeroingLowLimit) / wordSize;
 691       __ mv(tmp1, low_limit);
 692       __ blt(cnt, tmp1, small);
 693       __ zero_dcache_blocks(base, cnt, tmp1, tmp2);
 694       __ bind(small);
 695     }
 696 
 697     {
 698       // Clear the remaining blocks.
 699       Label loop;
 700       __ mv(tmp1, MacroAssembler::zero_words_block_size);
 701       __ blt(cnt, tmp1, done);
 702       __ bind(loop);
 703       for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
 704         __ sd(zr, Address(base, i * wordSize));
 705       }
 706       __ addi(base, base, MacroAssembler::zero_words_block_size * wordSize);
 707       __ subi(cnt, cnt, MacroAssembler::zero_words_block_size);
 708       __ bge(cnt, tmp1, loop);
 709       __ bind(done);
 710     }
 711 
 712     __ ret();
 713 
 714     return start;
 715   }
 716 
 717   typedef enum {
 718     copy_forwards = 1,
 719     copy_backwards = -1
 720   } copy_direction;
 721 
 722   // Bulk copy of blocks of 8 words.
 723   //
 724   // count is a count of words.
 725   //
 726   // Precondition: count >= 8
 727   //
 728   // Postconditions:
 729   //
 730   // The least significant bit of count contains the remaining count
 731   // of words to copy.  The rest of count is trash.
 732   //
 733   // s and d are adjusted to point to the remaining words to copy
 734   //
 735   address generate_copy_longs(StubId stub_id, Register s, Register d, Register count) {
 736     BasicType type;
 737     copy_direction direction;
 738     switch (stub_id) {
 739     case StubId::stubgen_copy_byte_f_id:
 740       direction = copy_forwards;
 741       type = T_BYTE;
 742       break;
 743     case StubId::stubgen_copy_byte_b_id:
 744       direction = copy_backwards;
 745       type = T_BYTE;
 746       break;
 747     default:
 748       ShouldNotReachHere();
 749     }
 750     int unit = wordSize * direction;
 751     int bias = wordSize;
 752 
 753     const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
 754       tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;
 755 
 756     const Register stride = x30;
 757 
 758     assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
 759       tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
 760     assert_different_registers(s, d, count, t0);
 761 
 762     Label again, drain;
 763     StubCodeMark mark(this, stub_id);
 764     __ align(CodeEntryAlignment);
 765     address start = __ pc();
 766 
 767     if (direction == copy_forwards) {
 768       __ sub(s, s, bias);
 769       __ sub(d, d, bias);
 770     }
 771 
 772 #ifdef ASSERT
 773     // Make sure we are never given < 8 words
 774     {
 775       Label L;
 776 
 777       __ mv(t0, 8);
 778       __ bge(count, t0, L);
 779       __ stop("genrate_copy_longs called with < 8 words");
 780       __ bind(L);
 781     }
 782 #endif
 783 
 784     __ ld(tmp_reg0, Address(s, 1 * unit));
 785     __ ld(tmp_reg1, Address(s, 2 * unit));
 786     __ ld(tmp_reg2, Address(s, 3 * unit));
 787     __ ld(tmp_reg3, Address(s, 4 * unit));
 788     __ ld(tmp_reg4, Address(s, 5 * unit));
 789     __ ld(tmp_reg5, Address(s, 6 * unit));
 790     __ ld(tmp_reg6, Address(s, 7 * unit));
 791     __ ld(tmp_reg7, Address(s, 8 * unit));
 792     __ addi(s, s, 8 * unit);
 793 
 794     __ subi(count, count, 16);
 795     __ bltz(count, drain);
 796 
 797     __ bind(again);
 798 
 799     __ sd(tmp_reg0, Address(d, 1 * unit));
 800     __ sd(tmp_reg1, Address(d, 2 * unit));
 801     __ sd(tmp_reg2, Address(d, 3 * unit));
 802     __ sd(tmp_reg3, Address(d, 4 * unit));
 803     __ sd(tmp_reg4, Address(d, 5 * unit));
 804     __ sd(tmp_reg5, Address(d, 6 * unit));
 805     __ sd(tmp_reg6, Address(d, 7 * unit));
 806     __ sd(tmp_reg7, Address(d, 8 * unit));
 807 
 808     __ ld(tmp_reg0, Address(s, 1 * unit));
 809     __ ld(tmp_reg1, Address(s, 2 * unit));
 810     __ ld(tmp_reg2, Address(s, 3 * unit));
 811     __ ld(tmp_reg3, Address(s, 4 * unit));
 812     __ ld(tmp_reg4, Address(s, 5 * unit));
 813     __ ld(tmp_reg5, Address(s, 6 * unit));
 814     __ ld(tmp_reg6, Address(s, 7 * unit));
 815     __ ld(tmp_reg7, Address(s, 8 * unit));
 816 
 817     __ addi(s, s, 8 * unit);
 818     __ addi(d, d, 8 * unit);
 819 
 820     __ subi(count, count, 8);
 821     __ bgez(count, again);
 822 
 823     // Drain
 824     __ bind(drain);
 825 
 826     __ sd(tmp_reg0, Address(d, 1 * unit));
 827     __ sd(tmp_reg1, Address(d, 2 * unit));
 828     __ sd(tmp_reg2, Address(d, 3 * unit));
 829     __ sd(tmp_reg3, Address(d, 4 * unit));
 830     __ sd(tmp_reg4, Address(d, 5 * unit));
 831     __ sd(tmp_reg5, Address(d, 6 * unit));
 832     __ sd(tmp_reg6, Address(d, 7 * unit));
 833     __ sd(tmp_reg7, Address(d, 8 * unit));
 834     __ addi(d, d, 8 * unit);
 835 
 836     {
 837       Label L1, L2;
 838       __ test_bit(t0, count, 2);
 839       __ beqz(t0, L1);
 840 
 841       __ ld(tmp_reg0, Address(s, 1 * unit));
 842       __ ld(tmp_reg1, Address(s, 2 * unit));
 843       __ ld(tmp_reg2, Address(s, 3 * unit));
 844       __ ld(tmp_reg3, Address(s, 4 * unit));
 845       __ addi(s, s, 4 * unit);
 846 
 847       __ sd(tmp_reg0, Address(d, 1 * unit));
 848       __ sd(tmp_reg1, Address(d, 2 * unit));
 849       __ sd(tmp_reg2, Address(d, 3 * unit));
 850       __ sd(tmp_reg3, Address(d, 4 * unit));
 851       __ addi(d, d, 4 * unit);
 852 
 853       __ bind(L1);
 854 
 855       if (direction == copy_forwards) {
 856         __ addi(s, s, bias);
 857         __ addi(d, d, bias);
 858       }
 859 
 860       __ test_bit(t0, count, 1);
 861       __ beqz(t0, L2);
 862       if (direction == copy_backwards) {
 863         __ addi(s, s, 2 * unit);
 864         __ ld(tmp_reg0, Address(s));
 865         __ ld(tmp_reg1, Address(s, wordSize));
 866         __ addi(d, d, 2 * unit);
 867         __ sd(tmp_reg0, Address(d));
 868         __ sd(tmp_reg1, Address(d, wordSize));
 869       } else {
 870         __ ld(tmp_reg0, Address(s));
 871         __ ld(tmp_reg1, Address(s, wordSize));
 872         __ addi(s, s, 2 * unit);
 873         __ sd(tmp_reg0, Address(d));
 874         __ sd(tmp_reg1, Address(d, wordSize));
 875         __ addi(d, d, 2 * unit);
 876       }
 877       __ bind(L2);
 878     }
 879 
 880     __ ret();
 881 
 882     return start;
 883   }
 884 
 885   typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);
 886 
 887   void copy_memory_v(Register s, Register d, Register count, int step) {
 888     bool is_backward = step < 0;
 889     int granularity = g_uabs(step);
 890 
 891     const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
 892     assert_different_registers(s, d, cnt, vl, tmp1, tmp2);
 893     Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
 894     Label loop_forward, loop_backward, done;
 895 
 896     __ mv(dst, d);
 897     __ mv(src, s);
 898     __ mv(cnt, count);
 899 
 900     __ bind(loop_forward);
 901     __ vsetvli(vl, cnt, sew, Assembler::m8);
 902     if (is_backward) {
 903       __ bne(vl, cnt, loop_backward);
 904     }
 905 
 906     __ vlex_v(v0, src, sew);
 907     __ sub(cnt, cnt, vl);
 908     if (sew != Assembler::e8) {
 909       // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
 910       __ slli(vl, vl, sew);
 911     }
 912     __ add(src, src, vl);
 913 
 914     __ vsex_v(v0, dst, sew);
 915     __ add(dst, dst, vl);
 916     __ bnez(cnt, loop_forward);
 917 
 918     if (is_backward) {
 919       __ j(done);
 920 
 921       __ bind(loop_backward);
 922       __ sub(t0, cnt, vl);
 923       if (sew != Assembler::e8) {
 924         // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
 925         __ slli(t0, t0, sew);
 926       }
 927       __ add(tmp1, s, t0);
 928       __ vlex_v(v0, tmp1, sew);
 929       __ add(tmp2, d, t0);
 930       __ vsex_v(v0, tmp2, sew);
 931       __ sub(cnt, cnt, vl);
 932       __ bnez(cnt, loop_forward);
 933       __ bind(done);
 934     }
 935   }
 936 
 937   // All-singing all-dancing memory copy.
 938   //
 939   // Copy count units of memory from s to d.  The size of a unit is
 940   // step, which can be positive or negative depending on the direction
 941   // of copy.
 942   //
 943   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 944                    Register s, Register d, Register count, int step) {
 945     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 946     if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
 947       return copy_memory_v(s, d, count, step);
 948     }
 949 
 950     bool is_backwards = step < 0;
 951     int granularity = g_uabs(step);
 952 
 953     const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
 954     const Register gct1 = x28, gct2 = x29, gct3 = t2;
 955 
 956     Label same_aligned;
 957     Label copy_big, copy32_loop, copy8_loop, copy_small, done;
 958 
 959     // The size of copy32_loop body increases significantly with ZGC GC barriers.
 960     // Need conditional far branches to reach a point beyond the loop in this case.
 961     bool is_far = UseZGC;
 962 
 963     __ beqz(count, done, is_far);
 964     __ slli(cnt, count, exact_log2(granularity));
 965     if (is_backwards) {
 966       __ add(src, s, cnt);
 967       __ add(dst, d, cnt);
 968     } else {
 969       __ mv(src, s);
 970       __ mv(dst, d);
 971     }
 972 
 973     if (is_aligned) {
 974       __ subi(t0, cnt, 32);
 975       __ bgez(t0, copy32_loop);
 976       __ subi(t0, cnt, 8);
 977       __ bgez(t0, copy8_loop, is_far);
 978       __ j(copy_small);
 979     } else {
 980       __ mv(t0, 16);
 981       __ blt(cnt, t0, copy_small, is_far);
 982 
 983       __ xorr(t0, src, dst);
 984       __ andi(t0, t0, 0b111);
 985       __ bnez(t0, copy_small, is_far);
 986 
 987       __ bind(same_aligned);
 988       __ andi(t0, src, 0b111);
 989       __ beqz(t0, copy_big);
 990       if (is_backwards) {
 991         __ addi(src, src, step);
 992         __ addi(dst, dst, step);
 993       }
 994       bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
 995       bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
 996       if (!is_backwards) {
 997         __ addi(src, src, step);
 998         __ addi(dst, dst, step);
 999       }
1000       __ subi(cnt, cnt, granularity);
1001       __ beqz(cnt, done, is_far);
1002       __ j(same_aligned);
1003 
1004       __ bind(copy_big);
1005       __ mv(t0, 32);
1006       __ blt(cnt, t0, copy8_loop, is_far);
1007     }
1008 
1009     __ bind(copy32_loop);
1010     if (is_backwards) {
1011       __ subi(src, src, wordSize * 4);
1012       __ subi(dst, dst, wordSize * 4);
1013     }
1014     // we first load 32 bytes, then write it, so the direction here doesn't matter
1015     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src),     gct1);
1016     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8),  gct1);
1017     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
1018     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);
1019 
1020     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst),     tmp3, gct1, gct2, gct3);
1021     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8),  tmp4, gct1, gct2, gct3);
1022     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
1023     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);
1024 
1025     if (!is_backwards) {
1026       __ addi(src, src, wordSize * 4);
1027       __ addi(dst, dst, wordSize * 4);
1028     }
1029     __ subi(t0, cnt, 32 + wordSize * 4);
1030     __ subi(cnt, cnt, wordSize * 4);
1031     __ bgez(t0, copy32_loop); // cnt >= 32, do next loop
1032 
1033     __ beqz(cnt, done); // if that's all - done
1034 
1035     __ subi(t0, cnt, 8); // if not - copy the reminder
1036     __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop
1037 
1038     __ bind(copy8_loop);
1039     if (is_backwards) {
1040       __ subi(src, src, wordSize);
1041       __ subi(dst, dst, wordSize);
1042     }
1043     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
1044     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
1045 
1046     if (!is_backwards) {
1047       __ addi(src, src, wordSize);
1048       __ addi(dst, dst, wordSize);
1049     }
1050     __ subi(t0, cnt, 8 + wordSize);
1051     __ subi(cnt, cnt, wordSize);
1052     __ bgez(t0, copy8_loop); // cnt >= 8, do next loop
1053 
1054     __ beqz(cnt, done); // if that's all - done
1055 
1056     __ bind(copy_small);
1057     if (is_backwards) {
1058       __ addi(src, src, step);
1059       __ addi(dst, dst, step);
1060     }
1061 
1062     bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
1063     bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
1064 
1065     if (!is_backwards) {
1066       __ addi(src, src, step);
1067       __ addi(dst, dst, step);
1068     }
1069     __ subi(cnt, cnt, granularity);
1070     __ bgtz(cnt, copy_small);
1071 
1072     __ bind(done);
1073   }
1074 
1075   // Scan over array at a for count oops, verifying each one.
1076   // Preserves a and count, clobbers t0 and t1.
1077   void verify_oop_array(size_t size, Register a, Register count, Register temp) {
1078     Label loop, end;
1079     __ mv(t1, zr);
1080     __ slli(t0, count, exact_log2(size));
1081     __ bind(loop);
1082     __ bgeu(t1, t0, end);
1083 
1084     __ add(temp, a, t1);
1085     if (size == (size_t)wordSize) {
1086       __ ld(temp, Address(temp, 0));
1087       __ verify_oop(temp);
1088     } else {
1089       __ lwu(temp, Address(temp, 0));
1090       __ decode_heap_oop(temp); // calls verify_oop
1091     }
1092     __ add(t1, t1, size);
1093     __ j(loop);
1094     __ bind(end);
1095   }
1096 
1097   // Arguments:
1098   //   stub_id - is used to name the stub and identify all details of
1099   //             how to perform the copy.
1100   //
1101   //   nopush_entry - is assigned to the stub's post push entry point
1102   //                  unless it is null
1103   //
1104   // Inputs:
1105   //   c_rarg0   - source array address
1106   //   c_rarg1   - destination array address
1107   //   c_rarg2   - element count, treated as ssize_t, can be zero
1108   //
1109   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1110   // the hardware handle it.  The two dwords within qwords that span
1111   // cache line boundaries will still be loaded and stored atomically.
1112   //
1113   // Side Effects: nopush_entry is set to the (post push) entry point
1114   //               so it can be used by the corresponding conjoint
1115   //               copy method
1116   //
1117   address generate_disjoint_copy(StubId stub_id, address* nopush_entry) {
1118     size_t size;
1119     bool aligned;
1120     bool is_oop;
1121     bool dest_uninitialized;
1122     switch (stub_id) {
1123     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1124       size = sizeof(jbyte);
1125       aligned = false;
1126       is_oop = false;
1127       dest_uninitialized = false;
1128       break;
1129     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1130       size = sizeof(jbyte);
1131       aligned = true;
1132       is_oop = false;
1133       dest_uninitialized = false;
1134       break;
1135     case StubId::stubgen_jshort_disjoint_arraycopy_id:
1136       size = sizeof(jshort);
1137       aligned = false;
1138       is_oop = false;
1139       dest_uninitialized = false;
1140       break;
1141     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1142       size = sizeof(jshort);
1143       aligned = true;
1144       is_oop = false;
1145       dest_uninitialized = false;
1146       break;
1147     case StubId::stubgen_jint_disjoint_arraycopy_id:
1148       size = sizeof(jint);
1149       aligned = false;
1150       is_oop = false;
1151       dest_uninitialized = false;
1152       break;
1153     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1154       size = sizeof(jint);
1155       aligned = true;
1156       is_oop = false;
1157       dest_uninitialized = false;
1158       break;
1159     case StubId::stubgen_jlong_disjoint_arraycopy_id:
1160       // since this is always aligned we can (should!) use the same
1161       // stub as for case arrayof_jlong_disjoint_arraycopy
1162       ShouldNotReachHere();
1163       break;
1164     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1165       size = sizeof(jlong);
1166       aligned = true;
1167       is_oop = false;
1168       dest_uninitialized = false;
1169       break;
1170     case StubId::stubgen_oop_disjoint_arraycopy_id:
1171       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1172       aligned = !UseCompressedOops;
1173       is_oop = true;
1174       dest_uninitialized = false;
1175       break;
1176     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1177       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1178       aligned = !UseCompressedOops;
1179       is_oop = true;
1180       dest_uninitialized = false;
1181       break;
1182     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1183       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1184       aligned = !UseCompressedOops;
1185       is_oop = true;
1186       dest_uninitialized = true;
1187       break;
1188     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1189       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1190       aligned = !UseCompressedOops;
1191       is_oop = true;
1192       dest_uninitialized = true;
1193       break;
1194     default:
1195       ShouldNotReachHere();
1196       break;
1197     }
1198 
1199     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1200     RegSet saved_reg = RegSet::of(s, d, count);
1201     __ align(CodeEntryAlignment);
1202     StubCodeMark mark(this, stub_id);
1203     address start = __ pc();
1204     __ enter();
1205 
1206     if (nopush_entry != nullptr) {
1207      *nopush_entry = __ pc();
1208       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1209       BLOCK_COMMENT("Entry:");
1210     }
1211 
1212     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1213     if (dest_uninitialized) {
1214       decorators |= IS_DEST_UNINITIALIZED;
1215     }
1216     if (aligned) {
1217       decorators |= ARRAYCOPY_ALIGNED;
1218     }
1219 
1220     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1221     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1222 
1223     if (is_oop) {
1224       // save regs before copy_memory
1225       __ push_reg(RegSet::of(d, count), sp);
1226     }
1227 
1228     {
1229       // UnsafeMemoryAccess page error: continue after unsafe access
1230       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1231       UnsafeMemoryAccessMark umam(this, add_entry, true);
1232       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1233     }
1234 
1235     if (is_oop) {
1236       __ pop_reg(RegSet::of(d, count), sp);
1237       if (VerifyOops) {
1238         verify_oop_array(size, d, count, t2);
1239       }
1240     }
1241 
1242     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0);
1243 
1244     __ leave();
1245     __ mv(x10, zr); // return 0
1246     __ ret();
1247     return start;
1248   }
1249 
1250   // Arguments:
1251   //   stub_id - is used to name the stub and identify all details of
1252   //             how to perform the copy.
1253   //
1254   //   nooverlap_target - identifes the (post push) entry for the
1255   //             corresponding disjoint copy routine which can be
1256   //             jumped to if the ranges do not actually overlap
1257   //
1258   //   nopush_entry - is assigned to the stub's post push entry point
1259   //                 unless it is null
1260   //
1261   // Inputs:
1262   //   c_rarg0   - source array address
1263   //   c_rarg1   - destination array address
1264   //   c_rarg2   - element count, treated as ssize_t, can be zero
1265   //
1266   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1267   // the hardware handle it.  The two dwords within qwords that span
1268   // cache line boundaries will still be loaded and stored atomically.
1269   //
1270   // Side Effects:
1271   //   nopush_entry is set to the no-overlap entry point so it can be
1272   //   used by some other conjoint copy method
1273   //
1274   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1275     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1276     RegSet saved_regs = RegSet::of(s, d, count);
1277     int size;
1278     bool aligned;
1279     bool is_oop;
1280     bool dest_uninitialized;
1281     switch (stub_id) {
1282     case StubId::stubgen_jbyte_arraycopy_id:
1283       size = sizeof(jbyte);
1284       aligned = false;
1285       is_oop = false;
1286       dest_uninitialized = false;
1287       break;
1288     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1289       size = sizeof(jbyte);
1290       aligned = true;
1291       is_oop = false;
1292       dest_uninitialized = false;
1293       break;
1294     case StubId::stubgen_jshort_arraycopy_id:
1295       size = sizeof(jshort);
1296       aligned = false;
1297       is_oop = false;
1298       dest_uninitialized = false;
1299       break;
1300     case StubId::stubgen_arrayof_jshort_arraycopy_id:
1301       size = sizeof(jshort);
1302       aligned = true;
1303       is_oop = false;
1304       dest_uninitialized = false;
1305       break;
1306     case StubId::stubgen_jint_arraycopy_id:
1307       size = sizeof(jint);
1308       aligned = false;
1309       is_oop = false;
1310       dest_uninitialized = false;
1311       break;
1312     case StubId::stubgen_arrayof_jint_arraycopy_id:
1313       size = sizeof(jint);
1314       aligned = true;
1315       is_oop = false;
1316       dest_uninitialized = false;
1317       break;
1318     case StubId::stubgen_jlong_arraycopy_id:
1319       // since this is always aligned we can (should!) use the same
1320       // stub as for case arrayof_jlong_disjoint_arraycopy
1321       ShouldNotReachHere();
1322       break;
1323     case StubId::stubgen_arrayof_jlong_arraycopy_id:
1324       size = sizeof(jlong);
1325       aligned = true;
1326       is_oop = false;
1327       dest_uninitialized = false;
1328       break;
1329     case StubId::stubgen_oop_arraycopy_id:
1330       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1331       aligned = !UseCompressedOops;
1332       is_oop = true;
1333       dest_uninitialized = false;
1334       break;
1335     case StubId::stubgen_arrayof_oop_arraycopy_id:
1336       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1337       aligned = !UseCompressedOops;
1338       is_oop = true;
1339       dest_uninitialized = false;
1340       break;
1341     case StubId::stubgen_oop_arraycopy_uninit_id:
1342       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1343       aligned = !UseCompressedOops;
1344       is_oop = true;
1345       dest_uninitialized = true;
1346       break;
1347     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
1348       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1349       aligned = !UseCompressedOops;
1350       is_oop = true;
1351       dest_uninitialized = true;
1352       break;
1353     default:
1354       ShouldNotReachHere();
1355     }
1356 
1357     StubCodeMark mark(this, stub_id);
1358     address start = __ pc();
1359     __ enter();
1360 
1361     if (nopush_entry != nullptr) {
1362       *nopush_entry = __ pc();
1363       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1364       BLOCK_COMMENT("Entry:");
1365     }
1366 
1367     // use fwd copy when (d-s) above_equal (count*size)
1368     __ sub(t0, d, s);
1369     __ slli(t1, count, exact_log2(size));
1370     Label L_continue;
1371     __ bltu(t0, t1, L_continue);
1372     __ j(RuntimeAddress(nooverlap_target));
1373     __ bind(L_continue);
1374 
1375     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1376     if (dest_uninitialized) {
1377       decorators |= IS_DEST_UNINITIALIZED;
1378     }
1379     if (aligned) {
1380       decorators |= ARRAYCOPY_ALIGNED;
1381     }
1382 
1383     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1384     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1385 
1386     if (is_oop) {
1387       // save regs before copy_memory
1388       __ push_reg(RegSet::of(d, count), sp);
1389     }
1390 
1391     {
1392       // UnsafeMemoryAccess page error: continue after unsafe access
1393       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1394       UnsafeMemoryAccessMark umam(this, add_entry, true);
1395       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1396     }
1397 
1398     if (is_oop) {
1399       __ pop_reg(RegSet::of(d, count), sp);
1400       if (VerifyOops) {
1401         verify_oop_array(size, d, count, t2);
1402       }
1403     }
1404     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0);
1405     __ leave();
1406     __ mv(x10, zr); // return 0
1407     __ ret();
1408     return start;
1409   }
1410 
1411   // Helper for generating a dynamic type check.
1412   // Smashes t0, t1.
1413   void generate_type_check(Register sub_klass,
1414                            Register super_check_offset,
1415                            Register super_klass,
1416                            Register result,
1417                            Register tmp1,
1418                            Register tmp2,
1419                            Label& L_success) {
1420     assert_different_registers(sub_klass, super_check_offset, super_klass);
1421 
1422     BLOCK_COMMENT("type_check:");
1423 
1424     Label L_miss;
1425 
1426     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset);
1427     __ check_klass_subtype_slow_path(sub_klass, super_klass, tmp1, tmp2, &L_success, nullptr);
1428 
1429     // Fall through on failure!
1430     __ BIND(L_miss);
1431   }
1432 
1433   //
1434   //  Generate checkcasting array copy stub
1435   //
1436   //  Input:
1437   //    c_rarg0   - source array address
1438   //    c_rarg1   - destination array address
1439   //    c_rarg2   - element count, treated as ssize_t, can be zero
1440   //    c_rarg3   - size_t ckoff (super_check_offset)
1441   //    c_rarg4   - oop ckval (super_klass)
1442   //
1443   //  Output:
1444   //    x10 ==  0  -  success
1445   //    x10 == -1^K - failure, where K is partial transfer count
1446   //
1447   address generate_checkcast_copy(StubId stub_id, address* nopush_entry) {
1448     bool dest_uninitialized;
1449     switch (stub_id) {
1450     case StubId::stubgen_checkcast_arraycopy_id:
1451       dest_uninitialized = false;
1452       break;
1453     case StubId::stubgen_checkcast_arraycopy_uninit_id:
1454       dest_uninitialized = true;
1455       break;
1456     default:
1457       ShouldNotReachHere();
1458     }
1459 
1460     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1461 
1462     // Input registers (after setup_arg_regs)
1463     const Register from        = c_rarg0;   // source array address
1464     const Register to          = c_rarg1;   // destination array address
1465     const Register count       = c_rarg2;   // elementscount
1466     const Register ckoff       = c_rarg3;   // super_check_offset
1467     const Register ckval       = c_rarg4;   // super_klass
1468 
1469     RegSet wb_pre_saved_regs   = RegSet::range(c_rarg0, c_rarg4);
1470 
1471     // Registers used as temps (x7, x9, x18 are save-on-entry)
1472     const Register count_save  = x19;       // orig elementscount
1473     const Register start_to    = x18;       // destination array start address
1474     const Register copied_oop  = x7;        // actual oop copied
1475     const Register r9_klass    = x9;        // oop._klass
1476 
1477     // Registers used as gc temps (x15, x16, x17 are save-on-call)
1478     const Register gct1 = x15, gct2 = x16, gct3 = x17;
1479 
1480     //---------------------------------------------------------------
1481     // Assembler stub will be used for this call to arraycopy
1482     // if the two arrays are subtypes of Object[] but the
1483     // destination array type is not equal to or a supertype
1484     // of the source type.  Each element must be separately
1485     // checked.
1486 
1487     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1488                                copied_oop, r9_klass, count_save);
1489 
1490     __ align(CodeEntryAlignment);
1491     StubCodeMark mark(this, stub_id);
1492     address start = __ pc();
1493 
1494     __ enter(); // required for proper stackwalking of RuntimeStub frame
1495 
1496     // Caller of this entry point must set up the argument registers.
1497     if (nopush_entry != nullptr) {
1498       *nopush_entry = __ pc();
1499       BLOCK_COMMENT("Entry:");
1500     }
1501 
1502     // Empty array:  Nothing to do
1503     __ beqz(count, L_done);
1504 
1505     __ push_reg(RegSet::of(x7, x9, x18, x19), sp);
1506 
1507 #ifdef ASSERT
1508     BLOCK_COMMENT("assert consistent ckoff/ckval");
1509     // The ckoff and ckval must be mutually consistent,
1510     // even though caller generates both.
1511     { Label L;
1512       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1513       __ lwu(start_to, Address(ckval, sco_offset));
1514       __ beq(ckoff, start_to, L);
1515       __ stop("super_check_offset inconsistent");
1516       __ bind(L);
1517     }
1518 #endif //ASSERT
1519 
1520     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1521     if (dest_uninitialized) {
1522       decorators |= IS_DEST_UNINITIALIZED;
1523     }
1524 
1525     bool is_oop = true;
1526     int element_size = UseCompressedOops ? 4 : 8;
1527 
1528     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1529     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1530 
1531     // save the original count
1532     __ mv(count_save, count);
1533 
1534     // Copy from low to high addresses
1535     __ mv(start_to, to);              // Save destination array start address
1536     __ j(L_load_element);
1537 
1538     // ======== begin loop ========
1539     // (Loop is rotated; its entry is L_load_element.)
1540     // Loop control:
1541     //   for count to 0 do
1542     //     copied_oop = load_heap_oop(from++)
1543     //     ... generate_type_check ...
1544     //     store_heap_oop(to++, copied_oop)
1545     //   end
1546 
1547     __ align(OptoLoopAlignment);
1548 
1549     __ BIND(L_store_element);
1550     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1551                       Address(to, 0), copied_oop,
1552                       gct1, gct2, gct3);
1553     __ addi(to, to, UseCompressedOops ? 4 : 8);
1554     __ subi(count, count, 1);
1555     __ beqz(count, L_do_card_marks);
1556 
1557     // ======== loop entry is here ========
1558     __ BIND(L_load_element);
1559     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1560                      copied_oop, Address(from, 0),
1561                      gct1);
1562     __ addi(from, from, UseCompressedOops ? 4 : 8);
1563     __ beqz(copied_oop, L_store_element);
1564 
1565     __ load_klass(r9_klass, copied_oop);// query the object klass
1566 
1567     BLOCK_COMMENT("type_check:");
1568     generate_type_check(r9_klass, /*sub_klass*/
1569                         ckoff,    /*super_check_offset*/
1570                         ckval,    /*super_klass*/
1571                         x10,      /*result*/
1572                         gct1,     /*tmp1*/
1573                         gct2,     /*tmp2*/
1574                         L_store_element);
1575 
1576     // Fall through on failure!
1577 
1578     // ======== end loop ========
1579 
1580     // It was a real error; we must depend on the caller to finish the job.
1581     // Register count = remaining oops, count_orig = total oops.
1582     // Emit GC store barriers for the oops we have copied and report
1583     // their number to the caller.
1584 
1585     __ sub(count, count_save, count);     // K = partially copied oop count
1586     __ xori(count, count, -1);            // report (-1^K) to caller
1587     __ beqz(count, L_done_pop);
1588 
1589     __ BIND(L_do_card_marks);
1590     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0);
1591 
1592     __ bind(L_done_pop);
1593     __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
1594     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1595 
1596     __ bind(L_done);
1597     __ mv(x10, count);
1598     __ leave();
1599     __ ret();
1600 
1601     return start;
1602   }
1603 
1604   // Perform range checks on the proposed arraycopy.
1605   // Kills temp, but nothing else.
1606   // Also, clean the sign bits of src_pos and dst_pos.
1607   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1608                               Register src_pos, // source position (c_rarg1)
1609                               Register dst,     // destination array oo (c_rarg2)
1610                               Register dst_pos, // destination position (c_rarg3)
1611                               Register length,
1612                               Register temp,
1613                               Label& L_failed) {
1614     BLOCK_COMMENT("arraycopy_range_checks:");
1615 
1616     assert_different_registers(t0, temp);
1617 
1618     // if [src_pos + length > arrayOop(src)->length()] then FAIL
1619     __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
1620     __ addw(temp, length, src_pos);
1621     __ bgtu(temp, t0, L_failed);
1622 
1623     // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
1624     __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1625     __ addw(temp, length, dst_pos);
1626     __ bgtu(temp, t0, L_failed);
1627 
1628     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1629     __ zext(src_pos, src_pos, 32);
1630     __ zext(dst_pos, dst_pos, 32);
1631 
1632     BLOCK_COMMENT("arraycopy_range_checks done");
1633   }
1634 
1635   address generate_unsafecopy_common_error_exit() {
1636     address start = __ pc();
1637     __ mv(x10, 0);
1638     __ leave();
1639     __ ret();
1640     return start;
1641   }
1642 
1643   //
1644   //  Generate 'unsafe' set memory stub
1645   //  Though just as safe as the other stubs, it takes an unscaled
1646   //  size_t (# bytes) argument instead of an element count.
1647   //
1648   //  Input:
1649   //    c_rarg0   - destination array address
1650   //    c_rarg1   - byte count (size_t)
1651   //    c_rarg2   - byte value
1652   //
1653   address generate_unsafe_setmemory() {
1654     __ align(CodeEntryAlignment);
1655     StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
1656     StubCodeMark mark(this, stub_id);
1657     address start = __ pc();
1658 
1659     // bump this on entry, not on exit:
1660     // inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr);
1661 
1662     Label L_fill_elements;
1663 
1664     const Register dest = c_rarg0;
1665     const Register count = c_rarg1;
1666     const Register value = c_rarg2;
1667     const Register cnt_words = x28; // temp register
1668     const Register tmp_reg   = x29; // temp register
1669 
1670     // Mark remaining code as such which performs Unsafe accesses.
1671     UnsafeMemoryAccessMark umam(this, true, false);
1672 
1673     __ enter(); // required for proper stackwalking of RuntimeStub frame
1674 
1675     // if count < 8, jump to L_fill_elements
1676     __ mv(tmp_reg, 8); // 8 bytes fill by element
1677     __ bltu(count, tmp_reg, L_fill_elements);
1678 
1679     // Propagate byte to 64-bit width
1680     // 8 bit -> 16 bit
1681     __ zext(value, value, 8);
1682     __ slli(tmp_reg, value, 8);
1683     __ orr(value, value, tmp_reg);
1684     // 16 bit -> 32 bit
1685     __ slli(tmp_reg, value, 16);
1686     __ orr(value, value, tmp_reg);
1687     // 32 bit -> 64 bit
1688     __ slli(tmp_reg, value, 32);
1689     __ orr(value, value, tmp_reg);
1690 
1691     // Align source address at 8 bytes address boundary.
1692     Label L_skip_align1, L_skip_align2, L_skip_align4;
1693     // One byte misalignment happens.
1694     __ test_bit(tmp_reg, dest, 0);
1695     __ beqz(tmp_reg, L_skip_align1);
1696     __ sb(value, Address(dest, 0));
1697     __ addi(dest, dest, 1);
1698     __ subi(count, count, 1);
1699 
1700     __ bind(L_skip_align1);
1701     // Two bytes misalignment happens.
1702     __ test_bit(tmp_reg, dest, 1);
1703     __ beqz(tmp_reg, L_skip_align2);
1704     __ sh(value, Address(dest, 0));
1705     __ addi(dest, dest, 2);
1706     __ subi(count, count, 2);
1707 
1708     __ bind(L_skip_align2);
1709     // Four bytes misalignment happens.
1710     __ test_bit(tmp_reg, dest, 2);
1711     __ beqz(tmp_reg, L_skip_align4);
1712     __ sw(value, Address(dest, 0));
1713     __ addi(dest, dest, 4);
1714     __ subi(count, count, 4);
1715     __ bind(L_skip_align4);
1716 
1717     //  Fill large chunks
1718     __ srli(cnt_words, count, 3); // number of words
1719     __ slli(tmp_reg, cnt_words, 3);
1720     __ sub(count, count, tmp_reg);
1721     {
1722       __ fill_words(dest, cnt_words, value);
1723     }
1724 
1725     // Handle copies less than 8 bytes
1726     __ bind(L_fill_elements);
1727     Label L_fill_2, L_fill_1, L_exit;
1728     __ test_bit(tmp_reg, count, 2);
1729     __ beqz(tmp_reg, L_fill_2);
1730     __ sb(value, Address(dest, 0));
1731     __ sb(value, Address(dest, 1));
1732     __ sb(value, Address(dest, 2));
1733     __ sb(value, Address(dest, 3));
1734     __ addi(dest, dest, 4);
1735 
1736     __ bind(L_fill_2);
1737     __ test_bit(tmp_reg, count, 1);
1738     __ beqz(tmp_reg, L_fill_1);
1739     __ sb(value, Address(dest, 0));
1740     __ sb(value, Address(dest, 1));
1741     __ addi(dest, dest, 2);
1742 
1743     __ bind(L_fill_1);
1744     __ test_bit(tmp_reg, count, 0);
1745     __ beqz(tmp_reg, L_exit);
1746     __ sb(value, Address(dest, 0));
1747 
1748     __ bind(L_exit);
1749     __ leave();
1750     __ ret();
1751 
1752     return start;
1753   }
1754 
1755   //
1756   //  Generate 'unsafe' array copy stub
1757   //  Though just as safe as the other stubs, it takes an unscaled
1758   //  size_t argument instead of an element count.
1759   //
1760   //  Input:
1761   //    c_rarg0   - source array address
1762   //    c_rarg1   - destination array address
1763   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1764   //
1765   // Examines the alignment of the operands and dispatches
1766   // to a long, int, short, or byte copy loop.
1767   //
1768   address generate_unsafe_copy(address byte_copy_entry,
1769                                address short_copy_entry,
1770                                address int_copy_entry,
1771                                address long_copy_entry) {
1772     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1773                 int_copy_entry != nullptr && long_copy_entry != nullptr);
1774     Label L_short_aligned, L_int_aligned, L_long_aligned;
1775     const Register src_addr = c_rarg0, dst_addr = c_rarg1, byte_count = c_rarg2, align_bits = t0;
1776 
1777     __ align(CodeEntryAlignment);
1778     StubCodeMark mark(this, StubId::stubgen_unsafe_arraycopy_id);
1779     // The stub begins at the (aligned) pc that follows the mark.
1780     const address stub_entry = __ pc();
1781     __ enter(); // build a frame so the RuntimeStub can be walked
1782 
1783     // Count the call up front rather than at one of the exits.
1784     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1785     // Fold the low bits of both addresses and of the size into one word.
1786     __ orr(align_bits, src_addr, dst_addr);
1787     __ orr(align_bits, align_bits, byte_count);
1788     // Pick the widest element size that all three values are a multiple of.
1789     __ andi(align_bits, align_bits, BytesPerLong - 1);
1790     __ beqz(align_bits, L_long_aligned);
1791     __ andi(align_bits, align_bits, BytesPerInt - 1);
1792     __ beqz(align_bits, L_int_aligned);
1793     __ test_bit(align_bits, align_bits, 0);
1794     __ beqz(align_bits, L_short_aligned);
1795     __ j(RuntimeAddress(byte_copy_entry));              // bit 0 set: byte count is passed through as is
1796 
1797     __ BIND(L_short_aligned);
1798     __ srli(byte_count, byte_count, LogBytesPerShort);  // bytes => number of jshorts
1799     __ j(RuntimeAddress(short_copy_entry));
1800     __ BIND(L_int_aligned);
1801     __ srli(byte_count, byte_count, LogBytesPerInt);    // bytes => number of jints
1802     __ j(RuntimeAddress(int_copy_entry));
1803     __ BIND(L_long_aligned);
1804     __ srli(byte_count, byte_count, LogBytesPerLong);   // bytes => number of jlongs
1805     __ j(RuntimeAddress(long_copy_entry));
1806 
1807     return stub_entry;
1808   }
1809 
1810   //
1811   //  Generate generic array copy stubs
1812   //
1813   //  Input:
1814   //    c_rarg0    -  src oop
1815   //    c_rarg1    -  src_pos (32-bits)
1816   //    c_rarg2    -  dst oop
1817   //    c_rarg3    -  dst_pos (32-bits)
1818   //    c_rarg4    -  element count (32-bits)
1819   //
1820   //  Output:
1821   //    x10 ==  0  -  success
1822   //    x10 == -1^K - failure, where K is partial transfer count
1823   //
1824   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
1825                                 address int_copy_entry, address oop_copy_entry,
1826                                 address long_copy_entry, address checkcast_copy_entry) {
1827     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1828                 int_copy_entry != nullptr && oop_copy_entry != nullptr &&
1829                 long_copy_entry != nullptr && checkcast_copy_entry != nullptr);
1830     Label L_failed, L_objArray;
1831     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1832 
1833     // Input registers
1834     const Register src        = c_rarg0;  // source array oop
1835     const Register src_pos    = c_rarg1;  // source position
1836     const Register dst        = c_rarg2;  // destination array oop
1837     const Register dst_pos    = c_rarg3;  // destination position
1838     const Register length     = c_rarg4;  // element count
1839 
1840     // Registers used as temps
1841     const Register dst_klass = c_rarg5;   // only used on the checkcast path
1842 
1843     __ align(CodeEntryAlignment);
1844 
1845     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
1846     StubCodeMark mark(this, stub_id);
1847 
1848     address start = __ pc();
1849 
1850     __ enter(); // required for proper stackwalking of RuntimeStub frame
1851 
1852     // bump this on entry, not on exit:
1853     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1854 
1855     //-----------------------------------------------------------------------
1856     // Assembler stub will be used for this call to arraycopy
1857     // if the following conditions are met:
1858     //
1859     // (1) src and dst must not be null.
1860     // (2) src_pos must not be negative.
1861     // (3) dst_pos must not be negative.
1862     // (4) length  must not be negative.
1863     // (5) src klass and dst klass should be the same and not null.
1864     // (6) src and dst should be arrays.
1865     // (7) src_pos + length must not exceed length of src.
1866     // (8) dst_pos + length must not exceed length of dst.
1867     //
1868 
1869     // if src is null then return -1
1870     __ beqz(src, L_failed);
1871 
1872     // if [src_pos < 0] then return -1
1873     __ sext(t0, src_pos, 32);
1874     __ bltz(t0, L_failed);
1875 
1876     // if dst is null then return -1
1877     __ beqz(dst, L_failed);
1878 
1879     // if [dst_pos < 0] then return -1
1880     __ sext(t0, dst_pos, 32);
1881     __ bltz(t0, L_failed);
1882 
1883     // registers used as temp
1884     const Register scratch_length    = x28; // elements count to copy
1885     const Register scratch_src_klass = x29; // array klass
1886     const Register lh                = x30; // layout helper
1887 
1888     // if [length < 0] then return -1
1889     __ sext(scratch_length, length, 32); // length (elements count, 32-bits value)
1890     __ bltz(scratch_length, L_failed);
1891 
1892     __ load_klass(scratch_src_klass, src);
1893 #ifdef ASSERT
1894     {
1895       BLOCK_COMMENT("assert klasses not null {");
1896       Label L1, L2;
1897       __ bnez(scratch_src_klass, L2);   // it is broken if klass is null
1898       __ bind(L1);
1899       __ stop("broken null klass");
1900       __ bind(L2);
1901       __ load_klass(t0, dst, t1);
1902       __ beqz(t0, L1);     // this would be broken also
1903       BLOCK_COMMENT("} assert klasses not null done");
1904     }
1905 #endif
1906 
1907     // Load layout helper (32-bits)
1908     //
1909     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1910     // 32        30    24            16              8     2                 0
1911     //
1912     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1913     //
1914 
1915     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1916 
1917     // Handle objArrays completely differently...
1918     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1919     __ lw(lh, Address(scratch_src_klass, lh_offset));
1920     __ mv(t0, objArray_lh);
1921     __ beq(lh, t0, L_objArray);
1922 
1923     // if [src->klass() != dst->klass()] then return -1
1924     __ load_klass(t1, dst);
1925     __ bne(t1, scratch_src_klass, L_failed);
1926 
1927     // if src is not an array then return -1
1928     // i.e. (lh >= 0)
1929     __ bgez(lh, L_failed);
1930 
1931     // At this point, it is known to be a typeArray (array_tag 0x3).
1932 #ifdef ASSERT
1933     {
1934       BLOCK_COMMENT("assert primitive array {");
1935       Label L;
1936       __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1937       __ bge(lh, t1, L);
1938       __ stop("must be a primitive array");
1939       __ bind(L);
1940       BLOCK_COMMENT("} assert primitive array done");
1941     }
1942 #endif
1943 
1944     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1945                            t1, L_failed);
1946 
1947     // TypeArrayKlass
1948     //
1949     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1950     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1951     //
1952 
1953     const Register t0_offset = t0;    // array offset
1954     const Register x30_elsize = lh;   // element size
1955 
1956     // Get array_header_in_bytes()
1957     int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1958     int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
1959     __ slli(t0_offset, lh, XLEN - lh_header_size_msb);          // left shift to remove 24 ~ 32;
1960     __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset
1961 
1962     __ add(src, src, t0_offset);           // src array offset
1963     __ add(dst, dst, t0_offset);           // dst array offset
1964     BLOCK_COMMENT("choose copy loop based on element size");
1965 
1966     // next registers should be set before the jump to corresponding stub
1967     const Register from     = c_rarg0;  // source array address
1968     const Register to       = c_rarg1;  // destination array address
1969     const Register count    = c_rarg2;  // elements count
1970 
1971     // 'from', 'to', 'count' registers should be set in such order
1972     // since they are the same as 'src', 'src_pos', 'dst'.
1973 
1974     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1975 
1976     // The possible values of elsize are 0-3, i.e. exact_log2(element
1977     // size in bytes).  We do a simple bitwise binary search.
1978   __ BIND(L_copy_bytes);
1979     __ test_bit(t0, x30_elsize, 1);
1980     __ bnez(t0, L_copy_ints);
1981     __ test_bit(t0, x30_elsize, 0);
1982     __ bnez(t0, L_copy_shorts);
1983     __ add(from, src, src_pos); // src_addr
1984     __ add(to, dst, dst_pos); // dst_addr
1985     __ sext(count, scratch_length, 32); // length
1986     __ j(RuntimeAddress(byte_copy_entry));
1987 
1988   __ BIND(L_copy_shorts);
1989     __ shadd(from, src_pos, src, t0, 1); // src_addr
1990     __ shadd(to, dst_pos, dst, t0, 1); // dst_addr
1991     __ sext(count, scratch_length, 32); // length
1992     __ j(RuntimeAddress(short_copy_entry));
1993 
1994   __ BIND(L_copy_ints);
1995     __ test_bit(t0, x30_elsize, 0);
1996     __ bnez(t0, L_copy_longs);
1997     __ shadd(from, src_pos, src, t0, 2); // src_addr
1998     __ shadd(to, dst_pos, dst, t0, 2); // dst_addr
1999     __ sext(count, scratch_length, 32); // length
2000     __ j(RuntimeAddress(int_copy_entry));
2001 
2002   __ BIND(L_copy_longs);
2003 #ifdef ASSERT
2004     {
2005       BLOCK_COMMENT("assert long copy {");
2006       Label L;
2007       __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize
2008       __ sext(lh, lh, 32);
2009       __ mv(t0, LogBytesPerLong);
2010       __ beq(x30_elsize, t0, L);
2011       __ stop("must be long copy, but elsize is wrong");
2012       __ bind(L);
2013       BLOCK_COMMENT("} assert long copy done");
2014     }
2015 #endif
2016     __ shadd(from, src_pos, src, t0, 3); // src_addr
2017     __ shadd(to, dst_pos, dst, t0, 3); // dst_addr
2018     __ sext(count, scratch_length, 32); // length
2019     __ j(RuntimeAddress(long_copy_entry));
2020 
2021     // ObjArrayKlass
2022   __ BIND(L_objArray);
2023     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2024 
2025     Label L_plain_copy, L_checkcast_copy;
2026     // test array classes for subtyping
2027     __ load_klass(t2, dst);
2028     __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality
2029 
2030     // Identically typed arrays can be copied without element-wise checks.
2031     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2032                            t1, L_failed);
2033 
2034     __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
2035     __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2036     __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
2037     __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2038     __ sext(count, scratch_length, 32); // length
2039   __ BIND(L_plain_copy);
2040     __ j(RuntimeAddress(oop_copy_entry));
2041 
2042   __ BIND(L_checkcast_copy);
2043     // live at this point:  scratch_src_klass, scratch_length, t2 (dst_klass)
2044     {
2045       // Before looking at dst.length, make sure dst is also an objArray.
2046       __ lw(t0, Address(t2, lh_offset)); // sign-extending load, same as for 'lh' above:
2047       __ mv(t1, objArray_lh);            // array layout helpers are negative, so a zero-extended
2048       __ bne(t0, t1, L_failed);          // (lwu) value could never match the constant in t1
2049 
2050       // It is safe to examine both src.length and dst.length.
2051       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2052                              t2, L_failed);
2053 
2054       __ load_klass(dst_klass, dst); // reload
2055 
2056       // Marshal the base address arguments now, freeing registers.
2057       __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
2058       __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2059       __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
2060       __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2061       __ sext(count, length, 32); // length (reloaded)
2062       const Register sco_temp = c_rarg3; // this register is free now
2063       assert_different_registers(from, to, count, sco_temp,
2064                                  dst_klass, scratch_src_klass);
2065 
2066       // Generate the type check.
2067       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2068       __ lwu(sco_temp, Address(dst_klass, sco_offset));
2069 
2070       // Smashes t0, t1
2071       generate_type_check(scratch_src_klass, sco_temp, dst_klass, noreg, noreg, noreg, L_plain_copy);
2072 
2073       // Fetch destination element klass from the ObjArrayKlass header.
2074       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2075       __ ld(dst_klass, Address(dst_klass, ek_offset));
2076       __ lwu(sco_temp, Address(dst_klass, sco_offset));
2077 
2078       // the checkcast_copy loop needs two extra arguments:
2079       assert(c_rarg3 == sco_temp, "#3 already in place");
2080       // Set up arguments for checkcast_copy_entry.
2081       __ mv(c_rarg4, dst_klass);  // dst.klass.element_klass
2082       __ j(RuntimeAddress(checkcast_copy_entry));
2083     }
2084 
2085   __ BIND(L_failed);
2086     __ mv(x10, -1);
2087     __ leave();   // required for proper stackwalking of RuntimeStub frame
2088     __ ret();
2089 
2090     return start;
2091   }
2092 
2093   //
2094   // Generate stub for array fill. If "aligned" is true, the
2095   // "to" address is assumed to be heapword aligned.
2096   //
2097   // Arguments for generated stub:
2098   //   to:    c_rarg0
2099   //   value: c_rarg1
2100   //   count: c_rarg2 treated as signed
2101   //
2102   address generate_fill(StubId stub_id) {
2103     BasicType t;   // element type written by this stub
2104     bool aligned;  // true for the arrayof_* variants, which skip the alignment prologue below
2105 
2106     switch (stub_id) {
2107     case StubId::stubgen_jbyte_fill_id:
2108       t = T_BYTE;
2109       aligned = false;
2110       break;
2111     case StubId::stubgen_jshort_fill_id:
2112       t = T_SHORT;
2113       aligned = false;
2114       break;
2115     case StubId::stubgen_jint_fill_id:
2116       t = T_INT;
2117       aligned = false;
2118       break;
2119     case StubId::stubgen_arrayof_jbyte_fill_id:
2120       t = T_BYTE;
2121       aligned = true;
2122       break;
2123     case StubId::stubgen_arrayof_jshort_fill_id:
2124       t = T_SHORT;
2125       aligned = true;
2126       break;
2127     case StubId::stubgen_arrayof_jint_fill_id:
2128       t = T_INT;
2129       aligned = true;
2130       break;
2131     default:
2132       ShouldNotReachHere();
2133     };
2134 
2135     __ align(CodeEntryAlignment);
2136     StubCodeMark mark(this, stub_id);
2137     address start = __ pc();
2138 
2139     BLOCK_COMMENT("Entry:");
2140 
2141     const Register to        = c_rarg0;  // destination array address
2142     const Register value     = c_rarg1;  // value
2143     const Register count     = c_rarg2;  // elements count
2144 
2145     const Register bz_base   = x28;      // base for block_zero routine (not referenced in this stub)
2146     const Register cnt_words = x29;      // temp register
2147     const Register tmp_reg   = t1;
2148 
2149     __ enter();
2150 
2151     Label L_fill_elements;
2152 
2153     int shift = -1; // log2 of the element size in bytes, set per type below
2154     switch (t) {
2155       case T_BYTE:
2156         shift = 0;
2157         // Short arrays (< 8 bytes) fill by element
2158         __ mv(tmp_reg, 8 >> shift);
2159         __ bltu(count, tmp_reg, L_fill_elements);
2160 
2161         // Zero extend value
2162         // 8 bit -> 16 bit
2163         __ zext(value, value, 8);
2164         __ slli(tmp_reg, value, 8);
2165         __ orr(value, value, tmp_reg);
2166 
2167         // 16 bit -> 32 bit
2168         __ slli(tmp_reg, value, 16);
2169         __ orr(value, value, tmp_reg);
2170         break;
2171       case T_SHORT:
2172         shift = 1;
2173         // Short arrays (< 8 bytes) fill by element
2174         __ mv(tmp_reg, 8 >> shift);
2175         __ bltu(count, tmp_reg, L_fill_elements);
2176 
2177         // Zero extend value
2178         // 16 bit -> 32 bit
2179         __ zext(value, value, 16);
2180         __ slli(tmp_reg, value, 16);
2181         __ orr(value, value, tmp_reg);
2182         break;
2183       case T_INT:
2184         shift = 2;
2185         // Short arrays (< 8 bytes) fill by element
2186         __ mv(tmp_reg, 8 >> shift);
2187         __ bltu(count, tmp_reg, L_fill_elements);
2188         break;
2189       default: ShouldNotReachHere();
2190     }
2191 
2192     // Align destination address at 8 bytes address boundary.
2193     Label L_skip_align1, L_skip_align2, L_skip_align4;
2194     if (!aligned) {
2195       switch (t) {
2196         case T_BYTE:
2197           // One byte misalignment happens only for byte arrays.
2198           __ test_bit(tmp_reg, to, 0);
2199           __ beqz(tmp_reg, L_skip_align1);
2200           __ sb(value, Address(to, 0));
2201           __ addi(to, to, 1);
2202           __ subiw(count, count, 1);
2203           __ bind(L_skip_align1);
2204           // Fallthrough
2205         case T_SHORT:
2206           // Two bytes misalignment happens only for byte and short (char) arrays.
2207           __ test_bit(tmp_reg, to, 1);
2208           __ beqz(tmp_reg, L_skip_align2);
2209           __ sh(value, Address(to, 0));
2210           __ addi(to, to, 2);
2211           __ subiw(count, count, 2 >> shift); // count is in elements: 2 bytes == (2 >> shift) elements
2212           __ bind(L_skip_align2);
2213           // Fallthrough
2214         case T_INT:
2215           // Align to 8 bytes, we know we are 4 byte aligned to start.
2216           __ test_bit(tmp_reg, to, 2);
2217           __ beqz(tmp_reg, L_skip_align4);
2218           __ sw(value, Address(to, 0));
2219           __ addi(to, to, 4);
2220           __ subiw(count, count, 4 >> shift); // 4 bytes == (4 >> shift) elements
2221           __ bind(L_skip_align4);
2222           break;
2223         default: ShouldNotReachHere();
2224       }
2225     }
2226 
2227     //
2228     //  Fill large chunks
2229     //
2230     __ srliw(cnt_words, count, 3 - shift); // number of 8-byte words
2231 
2232     // 32 bit -> 64 bit
2233     __ zext(value, value, 32);
2234     __ slli(tmp_reg, value, 32);
2235     __ orr(value, value, tmp_reg);
2236 
2237     __ slli(tmp_reg, cnt_words, 3 - shift); // elements covered by the word fill
2238     __ subw(count, count, tmp_reg);         // elements left over for the tail below
2239     {
2240       __ fill_words(to, cnt_words, value);
2241     }
2242 
2243     // Handle copies less than 8 bytes.
2244     // Address may not be heapword aligned.
2245     Label L_fill_1, L_fill_2, L_exit;
2246     __ bind(L_fill_elements);
2247     switch (t) {
2248       case T_BYTE:
2249         // Only byte stores are used: on the short path 'value' has not been replicated.
2250         __ test_bit(tmp_reg, count, 2);
2251         __ beqz(tmp_reg, L_fill_2);
2252         __ sb(value, Address(to, 0));
2253         __ sb(value, Address(to, 1));
2254         __ sb(value, Address(to, 2));
2255         __ sb(value, Address(to, 3));
2256         __ addi(to, to, 4);
2257 
2258         __ bind(L_fill_2);
2259         __ test_bit(tmp_reg, count, 1);
2260         __ beqz(tmp_reg, L_fill_1);
2261         __ sb(value, Address(to, 0));
2262         __ sb(value, Address(to, 1));
2263         __ addi(to, to, 2);
2264 
2265         __ bind(L_fill_1);
2266         __ test_bit(tmp_reg, count, 0);
2267         __ beqz(tmp_reg, L_exit);
2268         __ sb(value, Address(to, 0));
2269         break;
2270       case T_SHORT:
2271         __ test_bit(tmp_reg, count, 1);
2272         __ beqz(tmp_reg, L_fill_2);
2273         __ sh(value, Address(to, 0));
2274         __ sh(value, Address(to, 2));
2275         __ addi(to, to, 4);
2276 
2277         __ bind(L_fill_2);
2278         __ test_bit(tmp_reg, count, 0);
2279         __ beqz(tmp_reg, L_exit);
2280         __ sh(value, Address(to, 0));
2281         break;
2282       case T_INT:
2283         __ beqz(count, L_exit); // a single store suffices: fewer than 2 ints remain on either path
2284         __ sw(value, Address(to, 0));
2285         break;
2286       default: ShouldNotReachHere();
2287     }
2288     __ bind(L_exit);
2289     __ leave();
2290     __ ret();
2291 
2292     return start;
2293   }
2293 
2294   void generate_arraycopy_stubs() {
2295     // Generates the arraycopy, fill and setmemory stubs below and publishes
2296     // each entry point by assigning it to a field of class StubRoutines.
2297     //
2298     // Some copy stubs publish a normal entry and then a 2nd 'fallback'
2299     // entry immediately following their stack push. This can be used
2300     // as a post-push branch target for compatible stubs when they
2301     // identify a special case that can be handled by the fallback
2302     // stub, e.g. a disjoint copy stub may be used as a special case
2303     // fallback for its compatible conjoint copy stub.
2304     //
2305     // A no push entry is always returned in the following local, then
2306     // published in class StubRoutines and passed to the generator for the
2307     // compatible stub. The entry must be listed when saving to/restoring from
2308     // the AOT cache so inter-stub jumps are noted at save and relocated at load.
2309     address nopush_entry = nullptr;
2310 
2311     // generate the common exit first so later stubs can rely on it if
2312     // they want an UnsafeMemoryAccess exit non-local to the stub
2313     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
2314     // register the stub as the default exit with class UnsafeMemoryAccess
2315     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
2316 
2317     // generate and publish riscv-specific bulk copy routines first
2318     // so we can call them from other copy stubs
2319     StubRoutines::riscv::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, c_rarg0, c_rarg1, t1);
2320     StubRoutines::riscv::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, c_rarg0, c_rarg1, t1);
2321 
2322     StubRoutines::riscv::_zero_blocks = generate_zero_blocks();
2323 
2324     //*** jbyte
2325     // Always need aligned and unaligned versions
2326     StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
2327     // disjoint nopush entry is needed by conjoint copy
2328     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
2329     StubRoutines::_jbyte_arraycopy                   = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
2330     // conjoint nopush entry is needed by generic/unsafe copy
2331     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
2332     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
2333     // disjoint arrayof nopush entry is needed by conjoint copy
2334     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
2335     StubRoutines::_arrayof_jbyte_arraycopy           = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
2336 
2337     //*** jshort
2338     // Always need aligned and unaligned versions
2339     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
2340     // disjoint nopush entry is needed by conjoint copy
2341     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
2342     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
2343     // conjoint nopush entry is used by generic/unsafe copy
2344     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
2345     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
2346     // disjoint arrayof nopush entry is needed by conjoint copy
2347     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
2348     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
2349 
2350     //*** jint
2351     // Aligned versions
2352     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
2353     // disjoint arrayof nopush entry is needed by conjoint copy
2354     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
2355     StubRoutines::_arrayof_jint_arraycopy            = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
2356     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2357     // The published jint nopush entry always comes from the unaligned version
2358     StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
2359     // disjoint nopush entry is needed by conjoint copy
2360     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
2361     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
2362     // conjoint nopush entry is needed by generic/unsafe copy
2363     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
2364 
2365     //*** jlong
2366     // It is always aligned
2367     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
2368     // disjoint arrayof nopush entry is needed by conjoint copy
2369     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
2370     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
2371     // conjoint nopush entry is needed by generic/unsafe copy
2372     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
2373     // disjoint normal/nopush and conjoint normal entries are not
2374     // generated since the arrayof versions are the same
2375     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2376     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
2377     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2378 
2379     //*** oops
2380     StubRoutines::_arrayof_oop_disjoint_arraycopy
2381       = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
2382     // disjoint arrayof nopush entry is needed by conjoint copy
2383     StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
2384     StubRoutines::_arrayof_oop_arraycopy
2385       = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
2386     // conjoint arrayof nopush entry is needed by generic copy
2387     StubRoutines::_oop_arraycopy_nopush = nopush_entry;
2388     // Aligned versions without pre-barriers
2389     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2390       = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
2391     // disjoint arrayof+uninit nopush entry is needed by conjoint copy
2392     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
2393 
2394     // note that we don't need a returned nopush entry because the
2395     // generic/unsafe copy does not cater for uninit arrays.
2396     StubRoutines::_arrayof_oop_arraycopy_uninit
2397       = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
2398 
2399     // for oop copies reuse arrayof entries for non-arrayof cases
2400     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2401     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
2402     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2403     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2404     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
2405     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2406 
2407     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
2408     // checkcast nopush entry is needed by generic copy
2409     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
2410     // note that we don't need a returned nopush entry because the
2411     // generic copy does not cater for uninit arrays.
2412     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
2413 
2414 
2415     // unsafe arraycopy may fallback on conjoint stubs
2416     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
2417                                                               StubRoutines::_jshort_arraycopy_nopush,
2418                                                               StubRoutines::_jint_arraycopy_nopush,
2419                                                               StubRoutines::_jlong_arraycopy_nopush);
2420 
2421     // generic arraycopy may fallback on conjoint stubs
2422     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
2423                                                                StubRoutines::_jshort_arraycopy_nopush,
2424                                                                StubRoutines::_jint_arraycopy_nopush,
2425                                                                StubRoutines::_oop_arraycopy_nopush,
2426                                                                StubRoutines::_jlong_arraycopy_nopush,
2427                                                                StubRoutines::_checkcast_arraycopy_nopush);
2428 
2429     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
2430     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
2431     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
2432     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
2433     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
2434     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
2435 
2436     StubRoutines::_unsafe_setmemory    = generate_unsafe_setmemory();
2437   }
2438 
2439   void aes_load_keys(const Register &key, VectorRegister *working_vregs, int rounds) {
2440     const int step = 16;
2441     for (int i = 0; i < rounds; i++) {
2442       __ vle32_v(working_vregs[i], key);
2443       // The keys are stored in little-endian array, while we need
2444       // to operate in big-endian.
2445       // So performing an endian-swap here with vrev8.v instruction
2446       __ vrev8_v(working_vregs[i], working_vregs[i]);
2447       __ addi(key, key, step);
2448     }
2449   }
2450 
2451   void aes_encrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2452     assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2453 
2454     __ vxor_vv(res, res, working_vregs[0]);
2455     for (int i = 1; i < rounds - 1; i++) {
2456       __ vaesem_vv(res, working_vregs[i]);
2457     }
2458     __ vaesef_vv(res, working_vregs[rounds - 1]);
2459   }
2460 
2461   // Arguments:
2462   //
2463   // Inputs:
2464   //   c_rarg0   - source byte array address
2465   //   c_rarg1   - destination byte array address
2466   //   c_rarg2   - sessionKe (key) in little endian int array
2467   //
2468   address generate_aescrypt_encryptBlock() {
2469     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2470 
2471     __ align(CodeEntryAlignment);
2472     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2473     StubCodeMark mark(this, stub_id);
2474 
2475     Label L_aes128, L_aes192;
2476 
2477     const Register from        = c_rarg0;  // source array address
2478     const Register to          = c_rarg1;  // destination array address
2479     const Register key         = c_rarg2;  // key array address
2480     const Register keylen      = c_rarg3;
2481 
2482     VectorRegister working_vregs[] = {
2483       v4, v5, v6, v7, v8, v9, v10, v11,
2484       v12, v13, v14, v15, v16, v17, v18
2485     };
2486     const VectorRegister res   = v19;
2487 
2488     address start = __ pc();
2489     __ enter();
2490 
2491     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2492 
2493     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2494     __ vle32_v(res, from);
2495 
2496     __ mv(t2, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2497     __ bltu(keylen, t2, L_aes128);
2498     __ beq(keylen, t2, L_aes192);
2499     // Else we fallthrough to the biggest case (256-bit key size)
2500 
2501     // Note: the following function performs key += 15*16
2502     aes_load_keys(key, working_vregs, 15);
2503     aes_encrypt(res, working_vregs, 15);
2504     __ vse32_v(res, to);
2505     __ mv(c_rarg0, 0);
2506     __ leave();
2507     __ ret();
2508 
2509   __ bind(L_aes192);
2510     // Note: the following function performs key += 13*16
2511     aes_load_keys(key, working_vregs, 13);
2512     aes_encrypt(res, working_vregs, 13);
2513     __ vse32_v(res, to);
2514     __ mv(c_rarg0, 0);
2515     __ leave();
2516     __ ret();
2517 
2518   __ bind(L_aes128);
2519     // Note: the following function performs key += 11*16
2520     aes_load_keys(key, working_vregs, 11);
2521     aes_encrypt(res, working_vregs, 11);
2522     __ vse32_v(res, to);
2523     __ mv(c_rarg0, 0);
2524     __ leave();
2525     __ ret();
2526 
2527     return start;
2528   }
2529 
2530   void aes_decrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2531     assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2532 
2533     __ vxor_vv(res, res, working_vregs[rounds - 1]);
2534     for (int i = rounds - 2; i > 0; i--) {
2535       __ vaesdm_vv(res, working_vregs[i]);
2536     }
2537     __ vaesdf_vv(res, working_vregs[0]);
2538   }
2539 
2540   // Arguments:
2541   //
2542   // Inputs:
2543   //   c_rarg0   - source byte array address
2544   //   c_rarg1   - destination byte array address
2545   //   c_rarg2   - sessionKe (key) in little endian int array
2546   //
2547   address generate_aescrypt_decryptBlock() {
2548     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2549 
2550     __ align(CodeEntryAlignment);
2551     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2552     StubCodeMark mark(this, stub_id);
2553 
2554     Label L_aes128, L_aes192;
2555 
2556     const Register from        = c_rarg0;  // source array address
2557     const Register to          = c_rarg1;  // destination array address
2558     const Register key         = c_rarg2;  // key array address
2559     const Register keylen      = c_rarg3;
2560 
2561     VectorRegister working_vregs[] = {
2562       v4, v5, v6, v7, v8, v9, v10, v11,
2563       v12, v13, v14, v15, v16, v17, v18
2564     };
2565     const VectorRegister res   = v19;
2566 
2567     address start = __ pc();
2568     __ enter(); // required for proper stackwalking of RuntimeStub frame
2569 
2570     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2571 
2572     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2573     __ vle32_v(res, from);
2574 
2575     __ mv(t2, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2576     __ bltu(keylen, t2, L_aes128);
2577     __ beq(keylen, t2, L_aes192);
2578     // Else we fallthrough to the biggest case (256-bit key size)
2579 
2580     // Note: the following function performs key += 15*16
2581     aes_load_keys(key, working_vregs, 15);
2582     aes_decrypt(res, working_vregs, 15);
2583     __ vse32_v(res, to);
2584     __ mv(c_rarg0, 0);
2585     __ leave();
2586     __ ret();
2587 
2588   __ bind(L_aes192);
2589     // Note: the following function performs key += 13*16
2590     aes_load_keys(key, working_vregs, 13);
2591     aes_decrypt(res, working_vregs, 13);
2592     __ vse32_v(res, to);
2593     __ mv(c_rarg0, 0);
2594     __ leave();
2595     __ ret();
2596 
2597   __ bind(L_aes128);
2598     // Note: the following function performs key += 11*16
2599     aes_load_keys(key, working_vregs, 11);
2600     aes_decrypt(res, working_vregs, 11);
2601     __ vse32_v(res, to);
2602     __ mv(c_rarg0, 0);
2603     __ leave();
2604     __ ret();
2605 
2606     return start;
2607   }
2608 
2609   void cipherBlockChaining_encryptAESCrypt(int round, Register from, Register to, Register key,
2610                                            Register rvec, Register input_len) {
2611     const Register len = x29;
2612 
2613     VectorRegister working_vregs[] = {
2614       v1, v2, v3, v4, v5, v6, v7, v8,
2615       v9, v10, v11, v12, v13, v14, v15
2616     };
2617 
2618     const unsigned int BLOCK_SIZE = 16;
2619 
2620     __ mv(len, input_len);
2621     // load init rvec
2622     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2623     __ vle32_v(v16, rvec);
2624 
2625     aes_load_keys(key, working_vregs, round);
2626     Label L_enc_loop;
2627     __ bind(L_enc_loop);
2628     // Encrypt from source by block size
2629       __ vle32_v(v17, from);
2630       __ addi(from, from, BLOCK_SIZE);
2631       __ vxor_vv(v16, v16, v17);
2632       aes_encrypt(v16, working_vregs, round);
2633       __ vse32_v(v16, to);
2634       __ addi(to, to, BLOCK_SIZE);
2635       __ subi(len, len, BLOCK_SIZE);
2636       __ bnez(len, L_enc_loop);
2637 
2638     // save current rvec and return
2639     __ vse32_v(v16, rvec);
2640     __ mv(x10, input_len);
2641     __ leave();
2642     __ ret();
2643   }
2644 
2645   // Arguments:
2646   //
2647   // Inputs:
2648   //   c_rarg0   - source byte array address
2649   //   c_rarg1   - destination byte array address
2650   //   c_rarg2   - K (key) in little endian int array
2651   //   c_rarg3   - r vector byte array address
2652   //   c_rarg4   - input length
2653   //
2654   // Output:
2655   //   x10       - input length
2656   //
2657   address generate_cipherBlockChaining_encryptAESCrypt() {
2658     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2659     __ align(CodeEntryAlignment);
2660     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
2661     StubCodeMark mark(this, stub_id);
2662 
2663     const Register from       = c_rarg0;
2664     const Register to         = c_rarg1;
2665     const Register key        = c_rarg2;
2666     const Register rvec       = c_rarg3;
2667     const Register input_len  = c_rarg4;
2668 
2669     const Register keylen     = x28;
2670 
2671     address start = __ pc();
2672     __ enter();
2673 
2674     Label L_aes128, L_aes192;
2675     // Compute #rounds for AES based on the length of the key array
2676     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2677     __ mv(t0, 52);
2678     __ bltu(keylen, t0, L_aes128);
2679     __ beq(keylen, t0, L_aes192);
2680     // Else we fallthrough to the biggest case (256-bit key size)
2681 
2682     // Note: the following function performs key += 15*16
2683     cipherBlockChaining_encryptAESCrypt(15, from, to, key, rvec, input_len);
2684 
2685     // Note: the following function performs key += 11*16
2686     __ bind(L_aes128);
2687     cipherBlockChaining_encryptAESCrypt(11, from, to, key, rvec, input_len);
2688 
2689     // Note: the following function performs key += 13*16
2690     __ bind(L_aes192);
2691     cipherBlockChaining_encryptAESCrypt(13, from, to, key, rvec, input_len);
2692 
2693     return start;
2694   }
2695 
2696   void cipherBlockChaining_decryptAESCrypt(int round, Register from, Register to, Register key,
2697                                            Register rvec, Register input_len) {
2698     const Register len = x29;
2699 
2700     VectorRegister working_vregs[] = {
2701       v1, v2, v3, v4, v5, v6, v7, v8,
2702       v9, v10, v11, v12, v13, v14, v15
2703     };
2704 
2705     const unsigned int BLOCK_SIZE = 16;
2706 
2707     __ mv(len, input_len);
2708     // load init rvec
2709     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2710     __ vle32_v(v16, rvec);
2711 
2712     aes_load_keys(key, working_vregs, round);
2713     Label L_dec_loop;
2714     // Decrypt from source by block size
2715     __ bind(L_dec_loop);
2716       __ vle32_v(v17, from);
2717       __ addi(from, from, BLOCK_SIZE);
2718       __ vmv_v_v(v18, v17);
2719       aes_decrypt(v17, working_vregs, round);
2720       __ vxor_vv(v17, v17, v16);
2721       __ vse32_v(v17, to);
2722       __ vmv_v_v(v16, v18);
2723       __ addi(to, to, BLOCK_SIZE);
2724       __ subi(len, len, BLOCK_SIZE);
2725       __ bnez(len, L_dec_loop);
2726 
2727     // save current rvec and return
2728     __ vse32_v(v16, rvec);
2729     __ mv(x10, input_len);
2730     __ leave();
2731     __ ret();
2732   }
2733 
2734   // Arguments:
2735   //
2736   // Inputs:
2737   //   c_rarg0   - source byte array address
2738   //   c_rarg1   - destination byte array address
2739   //   c_rarg2   - K (key) in little endian int array
2740   //   c_rarg3   - r vector byte array address
2741   //   c_rarg4   - input length
2742   //
2743   // Output:
2744   //   x10       - input length
2745   //
2746   address generate_cipherBlockChaining_decryptAESCrypt() {
2747     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2748     __ align(CodeEntryAlignment);
2749     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
2750     StubCodeMark mark(this, stub_id);
2751 
2752     const Register from        = c_rarg0;
2753     const Register to          = c_rarg1;
2754     const Register key         = c_rarg2;
2755     const Register rvec        = c_rarg3;
2756     const Register input_len   = c_rarg4;
2757 
2758     const Register keylen      = x28;
2759 
2760     address start = __ pc();
2761     __ enter();
2762 
2763     Label L_aes128, L_aes192, L_aes128_loop, L_aes192_loop, L_aes256_loop;
2764     // Compute #rounds for AES based on the length of the key array
2765     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2766     __ mv(t0, 52);
2767     __ bltu(keylen, t0, L_aes128);
2768     __ beq(keylen, t0, L_aes192);
2769     // Else we fallthrough to the biggest case (256-bit key size)
2770 
2771     // Note: the following function performs key += 15*16
2772     cipherBlockChaining_decryptAESCrypt(15, from, to, key, rvec, input_len);
2773 
2774     // Note: the following function performs key += 11*16
2775     __ bind(L_aes128);
2776     cipherBlockChaining_decryptAESCrypt(11, from, to, key, rvec, input_len);
2777 
2778     // Note: the following function performs key += 13*16
2779     __ bind(L_aes192);
2780     cipherBlockChaining_decryptAESCrypt(13, from, to, key, rvec, input_len);
2781 
2782     return start;
2783   }
2784 
2785   // Load big-endian 128-bit from memory.
2786   void be_load_counter_128(Register counter_hi, Register counter_lo, Register counter) {
2787     __ ld(counter_lo, Address(counter, 8)); // Load 128-bits from counter
2788     __ ld(counter_hi, Address(counter));
2789     __ rev8(counter_lo, counter_lo);        // Convert big-endian to little-endian
2790     __ rev8(counter_hi, counter_hi);
2791   }
2792 
  // Increment by one a 128-bit counter held byte-swapped (little-endian) in two 64-bit registers.
2794   void add_counter_128(Register counter_hi, Register counter_lo) {
2795     assert_different_registers(counter_hi, counter_lo, t0);
2796     __ addi(counter_lo, counter_lo, 1);
2797     __ seqz(t0, counter_lo);                // Check for result overflow
2798     __ add(counter_hi, counter_hi, t0);     // Add 1 if overflow otherwise 0
2799   }
2800 
2801   // Store big-endian 128-bit to memory.
2802   void be_store_counter_128(Register counter_hi, Register counter_lo, Register counter) {
2803     assert_different_registers(counter_hi, counter_lo, t0, t1);
2804     __ rev8(t0, counter_lo);                // Convert little-endian to big-endian
2805     __ rev8(t1, counter_hi);
2806     __ sd(t0, Address(counter, 8));         // Store 128-bits to counter
2807     __ sd(t1, Address(counter));
2808   }
2809 
  void counterMode_AESCrypt(int round, Register in, Register out, Register key, Register counter,
                            Register input_len,  Register saved_encrypted_ctr, Register used_ptr) {
    // Emits the AES-CTR processing code for a key schedule of `round` round
    // keys, including the stub epilogue (leave + ret), so nothing emitted
    // after this helper is reached by falling through.
    //
    //   in, out             - source / destination addresses (advanced)
    //   key                 - round key address (advanced by aes_load_keys)
    //   counter             - address of the 16-byte big-endian counter,
    //                         updated in memory for every generated block
    //   input_len           - number of bytes to process, returned in x10
    //   saved_encrypted_ctr - address of the 16-byte encrypted counter buffer
    //   used_ptr            - address of the 32-bit count of bytes of that
    //                         buffer that were already consumed; read on
    //                         entry and written back on exit
    //
    // Clobbers x28 .. x31, t0, t1, t2 and v1 .. v17.
    //
    // Algorithm:
    //
    //   aes_load_keys();
    //   be_load_counter_128(counter_hi, counter_lo, counter);
    //
    //   L_next:
    //     if (used >= BLOCK_SIZE) goto L_main_loop;
    //
    //   L_encrypt_next:
    //       *out = *in ^ saved_encrypted_ctr[used];
    //       out++; in++; used++; len--;
    //       if (len == 0) goto L_exit;
    //       goto L_next;
    //
    //   L_main_loop:
    //     if (len == 0) goto L_exit;
    //     v_ctr = aes_encrypt(load_16Byte(counter));
    //     store_16Byte(saved_encrypted_ctr, v_ctr);
    //
    //     add_counter_128(counter_hi, counter_lo);
    //     be_store_counter_128(counter_hi, counter_lo, counter);
    //     used = 0;
    //
    //     if (len < BLOCK_SIZE) goto L_encrypt_next;
    //
    //     v_in = load_16Byte(in);
    //     store_16Byte(out, v_in ^ v_ctr);
    //     out += BLOCK_SIZE;
    //     in += BLOCK_SIZE;
    //     len -= BLOCK_SIZE;
    //     used = BLOCK_SIZE;
    //     goto L_main_loop;
    //
    //
    //   L_exit:
    //     store(used);
    //     result = input_len
    //     return result;

    const Register used          = x28;
    const Register len           = x29;
    const Register counter_hi    = x30;
    const Register counter_lo    = x31;
    const Register block_size    = t2;

    const unsigned int BLOCK_SIZE = 16;

    VectorRegister working_vregs[] = {
      v1, v2, v3, v4, v5, v6, v7, v8,
      v9, v10, v11, v12, v13, v14, v15
    };

    // All vector operations below work on one 16-byte block (4 x e32).
    __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);

    __ lwu(used, Address(used_ptr));
    __ mv(len, input_len);
    __ mv(block_size, BLOCK_SIZE);

    // load keys to working_vregs according to round
    aes_load_keys(key, working_vregs, round);

    // 128-bit big-endian load
    be_load_counter_128(counter_hi, counter_lo, counter);

    Label L_next, L_encrypt_next, L_main_loop, L_exit;
    // Check the last saved_encrypted_ctr used value, we fall through
    // to L_encrypt_next when the used value lower than block_size
    __ bind(L_next);
    __ bgeu(used, block_size, L_main_loop);

    // There is still data left fewer than block_size after L_main_loop
    // or last used, we encrypt them one by one.
    __ bind(L_encrypt_next);
    __ add(t0, saved_encrypted_ctr, used);
    __ lbu(t1, Address(t0));
    __ lbu(t0, Address(in));
    __ xorr(t1, t1, t0);
    __ sb(t1, Address(out));
    __ addi(in, in, 1);
    __ addi(out, out, 1);
    __ addi(used, used, 1);
    __ subi(len, len, 1);
    __ beqz(len, L_exit);
    __ j(L_next);

    // We will calculate the next saved_encrypted_ctr and encrypt the blocks of data
    // one by one until there is less than a full block remaining if len not zero
    __ bind(L_main_loop);
    __ beqz(len, L_exit);
    // The counter is read straight from memory here; counter_hi/counter_lo
    // only serve to compute the incremented value stored back below.
    __ vle32_v(v16, counter);

    // encrypt counter according to round
    aes_encrypt(v16, working_vregs, round);

    // Keep the encrypted counter in memory: the byte-wise path above, and a
    // later invocation through used_ptr, consume it from there.
    __ vse32_v(v16, saved_encrypted_ctr);

    // 128-bit little-endian increment
    add_counter_128(counter_hi, counter_lo);
    // 128-bit big-endian store
    be_store_counter_128(counter_hi, counter_lo, counter);

    __ mv(used, 0);
    // Check if we have a full block_size
    __ bltu(len, block_size, L_encrypt_next);

    // We have one full block to encrypt at least
    __ vle32_v(v17, in);
    __ vxor_vv(v16, v16, v17);
    __ vse32_v(v16, out);
    __ add(out, out, block_size);
    __ add(in, in, block_size);
    __ sub(len, len, block_size);
    // The whole encrypted counter block has been consumed.
    __ mv(used, block_size);
    __ j(L_main_loop);

    __ bind(L_exit);
    __ sw(used, Address(used_ptr));
    __ mv(x10, input_len);
    __ leave();
    __ ret();
  };
2934 
2935   // CTR AES crypt.
2936   // Arguments:
2937   //
2938   // Inputs:
2939   //   c_rarg0   - source byte array address
2940   //   c_rarg1   - destination byte array address
2941   //   c_rarg2   - K (key) in little endian int array
2942   //   c_rarg3   - counter vector byte array address
2943   //   c_rarg4   - input length
2944   //   c_rarg5   - saved encryptedCounter start
2945   //   c_rarg6   - saved used length
2946   //
2947   // Output:
2948   //   x10       - input length
2949   //
2950   address generate_counterMode_AESCrypt() {
2951     assert(UseAESCTRIntrinsics, "need AES instructions (Zvkned extension) and Zbb extension support");
2952 
2953     __ align(CodeEntryAlignment);
2954     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
2955     StubCodeMark mark(this, stub_id);
2956 
2957     const Register in                  = c_rarg0;
2958     const Register out                 = c_rarg1;
2959     const Register key                 = c_rarg2;
2960     const Register counter             = c_rarg3;
2961     const Register input_len           = c_rarg4;
2962     const Register saved_encrypted_ctr = c_rarg5;
2963     const Register used_len_ptr        = c_rarg6;
2964 
2965     const Register keylen              = c_rarg7; // temporary register
2966 
2967     const address start = __ pc();
2968     __ enter();
2969 
2970     Label L_exit;
2971     __ beqz(input_len, L_exit);
2972 
2973     Label L_aes128, L_aes192;
2974     // Compute #rounds for AES based on the length of the key array
2975     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2976     __ mv(t0, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2977     __ bltu(keylen, t0, L_aes128);
2978     __ beq(keylen, t0, L_aes192);
2979     // Else we fallthrough to the biggest case (256-bit key size)
2980 
2981     // Note: the following function performs crypt with key += 15*16
2982     counterMode_AESCrypt(15, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2983 
2984     // Note: the following function performs crypt with key += 13*16
2985     __ bind(L_aes192);
2986     counterMode_AESCrypt(13, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2987 
2988     // Note: the following function performs crypt with key += 11*16
2989     __ bind(L_aes128);
2990     counterMode_AESCrypt(11, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2991 
2992     __ bind(L_exit);
2993     __ mv(x10, input_len);
2994     __ leave();
2995     __ ret();
2996 
2997     return start;
2998   }
2999 
3000   void ghash_loop(Register state, Register subkeyH, Register data, Register blocks,
3001                   VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3) {
3002     VectorRegister partial_hash = vtmp1;
3003     VectorRegister hash_subkey  = vtmp2;
3004     VectorRegister cipher_text  = vtmp3;
3005 
3006     const unsigned int BLOCK_SIZE = 16;
3007 
3008     __ vsetivli(x0, 2, Assembler::e64, Assembler::m1);
3009     __ vle64_v(hash_subkey, subkeyH);
3010     __ vrev8_v(hash_subkey, hash_subkey);
3011     __ vle64_v(partial_hash, state);
3012     __ vrev8_v(partial_hash, partial_hash);
3013 
3014     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
3015     Label L_ghash_loop;
3016     __ bind(L_ghash_loop);
3017       __ vle32_v(cipher_text, data);
3018       __ addi(data, data, BLOCK_SIZE);
3019       __ vghsh_vv(partial_hash, hash_subkey, cipher_text);
3020       __ subi(blocks, blocks, 1);
3021       __ bnez(blocks, L_ghash_loop);
3022 
3023     __ vsetivli(x0, 2, Assembler::e64, Assembler::m1);
3024     __ vrev8_v(partial_hash, partial_hash);
3025     __ vse64_v(partial_hash, state);
3026   }
3027 
3028   /**
3029    *  Arguments:
3030    *
3031    *  Input:
3032    *  c_rarg0   - current state address
3033    *  c_rarg1   - H key address
3034    *  c_rarg2   - data address
3035    *  c_rarg3   - number of blocks
3036    *
3037    *  Output:
3038    *  Updated state at c_rarg0
3039    */
3040   address generate_ghash_processBlocks() {
3041     assert(UseGHASHIntrinsics, "need GHASH instructions (Zvkg extension) and Zvbb support");
3042 
3043     __ align(CodeEntryAlignment);
3044     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
3045     StubCodeMark mark(this, stub_id);
3046 
3047     address start = __ pc();
3048     __ enter();
3049 
3050     Register state   = c_rarg0;
3051     Register subkeyH = c_rarg1;
3052     Register data    = c_rarg2;
3053     Register blocks  = c_rarg3;
3054 
3055     VectorRegister vtmp1 = v1;
3056     VectorRegister vtmp2 = v2;
3057     VectorRegister vtmp3 = v3;
3058 
3059     ghash_loop(state, subkeyH, data, blocks, vtmp1, vtmp2, vtmp3);
3060 
3061     __ leave();
3062     __ ret();
3063 
3064     return start;
3065   }
3066 
3067   // code for comparing 8 characters of strings with Latin1 and Utf16 encoding
  void compare_string_8_x_LU(Register tmpL, Register tmpU,
                             Register strL, Register strU, Label& DIFF) {
    // Emits code that compares the next 8 characters of a Latin1 string
    // (strL, 8 bytes) with the next 8 characters of a UTF-16 string
    // (strU, 16 bytes), in two steps of 4 characters each.
    //
    //   tmpL - receives the inflated Latin1 characters of the current step
    //   tmpU - receives the UTF-16 characters of the current step
    //   strL - Latin1 address, advanced by 8 bytes
    //   strU - UTF-16 address, advanced by 16 bytes
    //   DIFF - branch target taken on the first mismatch; at that point tmpL
    //          and tmpU hold the two compared words and x30 holds their XOR
    //
    // Uses the fixed registers x30 and x12 as temporaries (plus t0 for the
    // debug-only alignment check), so the caller must not keep live values
    // there.
    const Register tmp = x30, tmpLval = x12;

    int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
    assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");

#ifdef ASSERT
    // The Latin1 side is read with a plain 8-byte load below, so when
    // unaligned accesses are to be avoided strL must be 8-byte aligned.
    if (AvoidUnalignedAccesses) {
      Label align_ok;
      __ andi(t0, strL, 0x7);
      __ beqz(t0, align_ok);
      __ stop("bad alignment");
      __ bind(align_ok);
    }
#endif
    // Fetch all 8 Latin1 characters at once; both steps below consume them.
    __ ld(tmpLval, Address(strL));
    __ addi(strL, strL, wordSize);

    // compare first 4 characters
    // The UTF-16 side is loaded with a granularity of 4 bytes when the array
    // base offset is not a multiple of 8, and of 8 bytes otherwise.
    __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
    __ addi(strU, strU, wordSize);
    // NOTE(review): inflate_lo32 presumably widens the low four Latin1 bytes
    // to four 16-bit characters - confirm against MacroAssembler.
    __ inflate_lo32(tmpL, tmpLval);
    __ xorr(tmp, tmpU, tmpL);
    __ bnez(tmp, DIFF);

    // compare second 4 characters
    __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
    __ addi(strU, strU, wordSize);
    // Same as above for the upper four Latin1 bytes (see the note above).
    __ inflate_hi32(tmpL, tmpLval);
    __ xorr(tmp, tmpU, tmpL);
    __ bnez(tmp, DIFF);
  }
3101 
3102   // x10  = result
3103   // x11  = str1
3104   // x12  = cnt1
3105   // x13  = str2
3106   // x14  = cnt2
3107   // x28  = tmp1
3108   // x29  = tmp2
3109   // x30  = tmp3
  address generate_compare_long_string_different_encoding(StubId stub_id) {
    // Generates the stub comparing a Latin1 string with a UTF-16 string.
    // stub_id selects which of the two incoming strings is the Latin1 one:
    // str1 for the LU stub, str2 for the UL stub.
    //
    // On a mismatch the result register receives the difference between the
    // first differing character of str1 and that of str2. When no mismatch
    // is found the stub returns without writing the result register.
    // NOTE(review): presumably the caller has preloaded the result for the
    // "equal" case - confirm at the call site.
    bool isLU;
    switch (stub_id) {
    case StubId::stubgen_compare_long_string_LU_id:
      isLU = true;
      break;
    case StubId::stubgen_compare_long_string_UL_id:
      isLU = false;
      break;
    default:
      ShouldNotReachHere();
    };
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE;
    // tmp3 (x30) and tmp4 (x12) are the same registers compare_string_8_x_LU
    // uses internally; CALCULATE_DIFFERENCE relies on x30 holding the XOR of
    // the mismatching words.
    const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14,
                   tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12;

    int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
    assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");

    // Whatever the encoding order, tmp1 always ends up holding the characters
    // of str1 and tmp2 those of str2, which is what CALCULATE_DIFFERENCE
    // expects.
    Register strU = isLU ? str2 : str1,
             strL = isLU ? str1 : str2,
             tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison
             tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison

    if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
      // Load 4 bytes from strL to make sure main loop is 8-byte aligned
      // cnt2 is >= 68 here, no need to check it for >= 0
      __ lwu(tmpL, Address(strL));
      __ addi(strL, strL, wordSize / 2);
      __ load_long_misaligned(tmpU, Address(strU), tmp4, (base_offset % 8) != 0 ? 4 : 8);
      __ addi(strU, strU, wordSize);
      __ inflate_lo32(tmp3, tmpL);
      __ mv(tmpL, tmp3);
      __ xorr(tmp3, tmpU, tmpL);
      __ bnez(tmp3, CALCULATE_DIFFERENCE);
      __ subi(cnt2, cnt2, wordSize / 2);
    }

    // we are now 8-bytes aligned on strL when AvoidUnalignedAccesses is true
    // cnt2 is kept biased by -16 from here on, so that its sign tells whether
    // another 16 characters are available.
    __ subi(cnt2, cnt2, wordSize * 2);
    __ bltz(cnt2, TAIL);
    __ bind(SMALL_LOOP); // smaller loop
      // 16 characters per iteration, as two groups of 8.
      __ subi(cnt2, cnt2, wordSize * 2);
      compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
      compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
      __ bgez(cnt2, SMALL_LOOP);
      // Undo the bias in t0: zero means no character is left.
      __ addi(t0, cnt2, wordSize * 2);
      __ beqz(t0, DONE);
    __ bind(TAIL);  // 1..15 characters left
      // Aligned access. Load bytes in portions - 4, 2, 1.

      // t0 < 0 means fewer than 8 characters are left.
      __ addi(t0, cnt2, wordSize);
      __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
      __ bltz(t0, LOAD_LAST);
      // remaining characters are greater than or equals to 8, we can do one compare_string_8_x_LU
      compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
      __ subi(cnt2, cnt2, wordSize);
      __ beqz(cnt2, DONE);  // no character left
      __ bind(LOAD_LAST);   // cnt2 = 1..7 characters left

      // Step both pointers back so that the last 8 characters of each string
      // are compared as two groups of 4; this re-reads characters that were
      // already compared above.
      __ subi(cnt2, cnt2, wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
      __ slli(t0, cnt2, 1);     // t0 is now an offset in strU which points to last 16 bytes
      __ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string
      __ add(strU, strU, t0);   // Address of last 16 bytes in UTF-16 string
      __ load_int_misaligned(tmpL, Address(strL), t0, false);
      __ load_long_misaligned(tmpU, Address(strU), t0, 2);
      __ inflate_lo32(tmp3, tmpL);
      __ mv(tmpL, tmp3);
      __ xorr(tmp3, tmpU, tmpL);
      __ bnez(tmp3, CALCULATE_DIFFERENCE);

      __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
      __ addi(strU, strU, wordSize);   // Address of last 8 bytes in UTF-16 string
      __ load_int_misaligned(tmpL, Address(strL), t0, false);
      __ load_long_misaligned(tmpU, Address(strU), t0, 2);
      __ inflate_lo32(tmp3, tmpL);
      __ mv(tmpL, tmp3);
      __ xorr(tmp3, tmpU, tmpL);
      __ bnez(tmp3, CALCULATE_DIFFERENCE);
      __ j(DONE); // no character left

      // Find the first different characters in the longwords and
      // compute their difference.
      // On entry tmp3 holds the XOR of the two mismatching words, which are
      // themselves still in tmp1 (str1 side) and tmp2 (str2 side).
    __ bind(CALCULATE_DIFFERENCE);
      // count bits of trailing zero chars
      __ ctzc_bits(tmp4, tmp3);
      // Shift the first differing character down to bit 0 in both words and
      // isolate it as a 16-bit value.
      __ srl(tmp1, tmp1, tmp4);
      __ srl(tmp2, tmp2, tmp4);
      __ zext(tmp1, tmp1, 16);
      __ zext(tmp2, tmp2, 16);
      __ sub(result, tmp1, tmp2);
    __ bind(DONE);
      __ ret();
    return entry;
  }
3208 
  address generate_method_entry_barrier() {
    // Generates the stub that calls BarrierSetNMethod::nmethod_stub_entry_barrier
    // and then either returns to its caller (result zero) or leaves through
    // the deoptimization path below (result non-zero).
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_method_entry_barrier_id;
    StubCodeMark mark(this, stub_id);

    Label deoptimize_label;

    address start = __ pc();

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();

    if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
      BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
      // Copy the 32-bit global patching epoch into the thread-local word
      // located 4 bytes past the thread's disarmed guard value.
      Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
      __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
      __ lwu(t1, t1);
      __ sw(t1, thread_epoch_addr);
      // There are two ways this can work:
      // - The writer did system icache shootdown after the instruction stream update.
      //   Hence do nothing.
      // - The writer trust us to make sure our icache is in sync before entering.
      //   Hence use cmodx fence (fence.i, may change).
      if (UseCtxFencei) {
        __ cmodx_fence();
      }
      __ membar(__ LoadLoad);
    }

    __ set_last_Java_frame(sp, fp, ra);

    __ enter();
    // t1 = sp + wordSize is the argument handed to the runtime call below.
    // NOTE(review): presumably this is the address of the return address
    // slot just saved by enter() - confirm against MacroAssembler::enter().
    __ addi(t1, sp, wordSize);

    // Reserve four words; they are read back on the deoptimization path.
    __ subi(sp, sp, 4 * wordSize);

    __ push_call_clobbered_registers();

    __ mv(c_rarg0, t1);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);

    __ reset_last_Java_frame(true);

    // Save the call result in t0 before the registers are restored.
    __ mv(t0, x10);

    __ pop_call_clobbered_registers();

    __ bnez(t0, deoptimize_label);

    __ leave();
    __ ret();

    __ BIND(deoptimize_label);

    // Reload, from the four reserved words, the new sp, fp, ra and the
    // address to continue at.
    // NOTE(review): these slots are presumably filled in by the runtime
    // during the call above - confirm in BarrierSetNMethod.
    __ ld(t0, Address(sp, 0));
    __ ld(fp, Address(sp, wordSize));
    __ ld(ra, Address(sp, wordSize * 2));
    __ ld(t1, Address(sp, wordSize * 3));

    __ mv(sp, t0);
    __ jr(t1);

    return start;
  }
3272 
3273   // x10  = result
3274   // x11  = str1
3275   // x12  = cnt1
3276   // x13  = str2
3277   // x14  = cnt2
       // x7   = tmp4 (spilled and restored by this stub)
3278   // x28  = tmp1
3279   // x29  = tmp2
3280   // x30  = tmp3
3281   // x31  = tmp5 (spilled and restored by this stub)
       //
       // Emits the compare stub for two long strings that share an encoding
       // (LL: one byte per character, UU: two bytes per character) and returns
       // its entry address. tmp1/tmp2 are expected to already hold the first
       // 8 bytes of str1/str2 (see the "already loaded" comments below). On a
       // mismatch, result is set to the difference of the first differing
       // characters (zero-extended). When no mismatch is found result is not
       // written by this stub.
       // NOTE(review): presumably the caller preloads result with the length
       // difference - confirm against the call site.
3282   address generate_compare_long_string_same_encoding(StubId stub_id) {
3283     bool isLL;
3284     switch (stub_id) {
3285     case StubId::stubgen_compare_long_string_LL_id:
3286       isLL = true;
3287       break;
3288     case StubId::stubgen_compare_long_string_UU_id:
3289       isLL = false;
3290       break;
3291     default:
3292       ShouldNotReachHere();
3293     };
3294     __ align(CodeEntryAlignment);
3295     StubCodeMark mark(this, stub_id);
3296     address entry = __ pc();
3297     Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
3298           LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
3299     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
3300                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
3301     RegSet spilled_regs = RegSet::of(tmp4, tmp5);
3302
3303     // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
3304     // update cnt2 counter with already loaded 8 bytes
3305     __ subi(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
3306     // update pointers, because of previous read
3307     __ addi(str1, str1, wordSize);
3308     __ addi(str2, str2, wordSize);
3309     // less than 16 bytes left?
3310     __ subi(cnt2, cnt2, isLL ? 16 : 8);
3311     __ push_reg(spilled_regs, sp);
3312     __ bltz(cnt2, TAIL);
3313     __ bind(SMALL_LOOP);
3314       // compare 16 bytes of strings with same encoding
           // The loads of the next words are interleaved with the compare of the
           // previously loaded pair (tmp1/tmp2, then tmp5/cnt1).
3315       __ ld(tmp5, Address(str1));
3316       __ addi(str1, str1, 8);
3317       __ xorr(tmp4, tmp1, tmp2);
3318       __ ld(cnt1, Address(str2));
3319       __ addi(str2, str2, 8);
3320       __ bnez(tmp4, DIFF);
3321       __ ld(tmp1, Address(str1));
3322       __ addi(str1, str1, 8);
3323       __ xorr(tmp4, tmp5, cnt1);
3324       __ ld(tmp2, Address(str2));
3325       __ addi(str2, str2, 8);
3326       __ bnez(tmp4, DIFF2);
3327
3328       __ subi(cnt2, cnt2, isLL ? 16 : 8);
3329       __ bgez(cnt2, SMALL_LOOP);
3330     __ bind(TAIL);
           // Undo the 16-byte bias. cnt2 == 0 means only the pair already held in
           // tmp1/tmp2 is left to compare.
3331       __ addi(cnt2, cnt2, isLL ? 16 : 8);
3332       __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
           // If no more than 8 bytes follow tmp1/tmp2, go straight to the final
           // (possibly overlapping) load; otherwise consume one more aligned word.
3333       __ subi(cnt2, cnt2, isLL ? 8 : 4);
3334       __ blez(cnt2, CHECK_LAST);
3335       __ xorr(tmp4, tmp1, tmp2);
3336       __ bnez(tmp4, DIFF);
3337       __ ld(tmp1, Address(str1));
3338       __ addi(str1, str1, 8);
3339       __ ld(tmp2, Address(str2));
3340       __ addi(str2, str2, 8);
3341       __ subi(cnt2, cnt2, isLL ? 8 : 4);
3342     __ bind(CHECK_LAST);
           // cnt2 is zero or negative here; adding it (in bytes) to the pointers
           // backs them up so that the last 8-byte load ends at the string end.
3343       if (!isLL) {
3344         __ add(cnt2, cnt2, cnt2); // now in bytes
3345       }
3346       __ xorr(tmp4, tmp1, tmp2);
3347       __ bnez(tmp4, DIFF);
3348       __ add(str1, str1, cnt2);
3349       __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2);
3350       __ add(str2, str2, cnt2);
3351       __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2);
3352       __ xorr(tmp4, tmp5, cnt1);
3353       __ beqz(tmp4, LENGTH_DIFF);
3354       // Find the first different characters in the longwords and
3355       // compute their difference.
3356     __ bind(DIFF2);
           // Mismatch is in the tmp5/cnt1 pair; tmp4 holds their XOR.
3357       // count bits of trailing zero chars
3358       __ ctzc_bits(tmp3, tmp4, isLL);
3359       __ srl(tmp5, tmp5, tmp3);
3360       __ srl(cnt1, cnt1, tmp3);
3361       if (isLL) {
3362         __ zext(tmp5, tmp5, 8);
3363         __ zext(cnt1, cnt1, 8);
3364       } else {
3365         __ zext(tmp5, tmp5, 16);
3366         __ zext(cnt1, cnt1, 16);
3367       }
3368       __ sub(result, tmp5, cnt1);
3369       __ j(LENGTH_DIFF);
3370     __ bind(DIFF);
           // Mismatch is in the tmp1/tmp2 pair; tmp4 holds their XOR.
3371       // count bits of trailing zero chars
3372       __ ctzc_bits(tmp3, tmp4, isLL);
3373       __ srl(tmp1, tmp1, tmp3);
3374       __ srl(tmp2, tmp2, tmp3);
3375       if (isLL) {
3376         __ zext(tmp1, tmp1, 8);
3377         __ zext(tmp2, tmp2, 8);
3378       } else {
3379         __ zext(tmp1, tmp1, 16);
3380         __ zext(tmp2, tmp2, 16);
3381       }
3382       __ sub(result, tmp1, tmp2);
3383       __ j(LENGTH_DIFF);
3384     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
3385       __ xorr(tmp4, tmp1, tmp2);
3386       __ bnez(tmp4, DIFF);
3387     __ bind(LENGTH_DIFF);
           // Common exit: restore the spilled registers; result is left as is.
3388       __ pop_reg(spilled_regs, sp);
3389       __ ret();
3390     return entry;
3391   }
3392 
3393   void generate_compare_long_strings() {
         // Generates the four long-string compare stubs (LL, UU via the
         // same-encoding generator; LU, UL via the different-encoding generator)
         // and publishes their entry points in StubRoutines::riscv.
3394     StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(StubId::stubgen_compare_long_string_LL_id);
3395     StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(StubId::stubgen_compare_long_string_UU_id);
3396     StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(StubId::stubgen_compare_long_string_LU_id);
3397     StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(StubId::stubgen_compare_long_string_UL_id);
3398   }
3399 
3400   // x10 result
3401   // x11 src
3402   // x12 src count
3403   // x13 pattern
3404   // x14 pattern count
       //
       // Emits a linear-scan indexOf stub and returns its entry address. The
       // stub id selects the encodings: ll (Latin1 needle and haystack),
       // ul (Latin1 needle, UTF-16 haystack) or uu (both UTF-16). The haystack
       // is scanned 8 bytes at a time for the first needle character; each
       // candidate position is then verified character by character. On failure
       // result is set to -1. x20-x25 and x28-x29 are spilled on entry and
       // restored on exit.
3405   address generate_string_indexof_linear(StubId stub_id)
3406   {
3407     bool needle_isL;
3408     bool haystack_isL;
3409     switch (stub_id) {
3410     case StubId::stubgen_string_indexof_linear_ll_id:
3411       needle_isL = true;
3412       haystack_isL = true;
3413       break;
3414     case StubId::stubgen_string_indexof_linear_ul_id:
3415       needle_isL = true;
3416       haystack_isL = false;
3417       break;
3418     case StubId::stubgen_string_indexof_linear_uu_id:
3419       needle_isL = false;
3420       haystack_isL = false;
3421       break;
3422     default:
3423       ShouldNotReachHere();
3424     };
3425
3426     __ align(CodeEntryAlignment);
3427     StubCodeMark mark(this, stub_id);
3428     address entry = __ pc();
3429
         // NOTE(review): needle_chr_size and isL are not referenced anywhere in
         // this function.
3430     int needle_chr_size = needle_isL ? 1 : 2;
3431     int haystack_chr_size = haystack_isL ? 1 : 2;
3432     int needle_chr_shift = needle_isL ? 0 : 1;
3433     int haystack_chr_shift = haystack_isL ? 0 : 1;
3434     bool isL = needle_isL && haystack_isL;
3435     // parameters
3436     Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
3437     // temporary registers
3438     Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
3439     // redefinitions
3440     Register ch1 = x28, ch2 = x29;
3441     RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);
3442
3443     __ push_reg(spilled_regs, sp);
3444
3445     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
3446           L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
3447           L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
3448           L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
3449           L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
3450           L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
3451
         // Preload the first 8 bytes of the needle and of the haystack.
3452     __ ld(ch1, Address(needle));
3453     __ ld(ch2, Address(haystack));
3454     // src.length - pattern.length
3455     __ sub(haystack_len, haystack_len, needle_len);
3456
3457     // first is needle[0]
3458     __ zext(first, ch1, needle_isL ? 8 : 16);
3459
         // Replicate needle[0] into every haystack-character lane of 'first' by
         // multiplying with the 0x01../0x0001.. pattern.
3460     uint64_t mask0101 = UCONST64(0x0101010101010101);
3461     uint64_t mask0001 = UCONST64(0x0001000100010001);
3462     __ mv(mask1, haystack_isL ? mask0101 : mask0001);
3463     __ mul(first, first, mask1);
3464     uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
3465     uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
3466     __ mv(mask2, haystack_isL ? mask7f7f : mask7fff);
         // Mixed encodings (ul): keep the raw needle word in tmp; it is inflated
         // into ch1 later on whichever path is taken.
3467     if (needle_isL != haystack_isL) {
3468       __ mv(tmp, ch1);
3469     }
3470     __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
3471     __ blez(haystack_len, L_SMALL);
3472
3473     if (needle_isL != haystack_isL) {
3474       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
3475     }
3476     // xorr, sub, orr, notr, andr
3477     // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
3478     // eg:
3479     // first:        aa aa aa aa aa aa aa aa
3480     // ch2:          aa aa li nx jd ka aa aa
3481     // match_mask:   80 80 00 00 00 00 80 80
3482     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
3483
3484     // search first char of needle, if success, goto L_HAS_ZERO;
3485     __ bnez(match_mask, L_HAS_ZERO);
3486     __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
3487     __ addi(result, result, wordSize / haystack_chr_size);
3488     __ addi(haystack, haystack, wordSize);
3489     __ bltz(haystack_len, L_POST_LOOP);
3490
         // Main scan loop: one haystack word (8 bytes) per iteration.
3491     __ bind(L_LOOP);
3492     __ ld(ch2, Address(haystack));
3493     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
3494     __ bnez(match_mask, L_HAS_ZERO);
3495
3496     __ bind(L_LOOP_PROCEED);
3497     __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
3498     __ addi(haystack, haystack, wordSize);
3499     __ addi(result, result, wordSize / haystack_chr_size);
3500     __ bgez(haystack_len, L_LOOP);
3501
         // Fewer than a full word of start positions is left: reuse the
         // small-case code with the leftover count turned into a bit mask.
3502     __ bind(L_POST_LOOP);
3503     __ mv(ch2, -wordSize / haystack_chr_size);
3504     __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
3505     __ ld(ch2, Address(haystack));
3506     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
3507     __ neg(haystack_len, haystack_len);
3508     __ xorr(ch2, first, ch2);
3509     __ sub(match_mask, ch2, mask1);
3510     __ orr(ch2, ch2, mask2);
3511     __ mv(trailing_zeros, -1); // all bits set
3512     __ j(L_SMALL_PROCEED);
3513
3514     __ align(OptoLoopAlignment);
3515     __ bind(L_SMALL);
         // haystack_len <= 0 here: convert it to a positive bit count used below
         // to mask off the lanes that are not valid start positions.
3516     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
3517     __ neg(haystack_len, haystack_len);
3518     if (needle_isL != haystack_isL) {
3519       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
3520     }
3521     __ xorr(ch2, first, ch2);
3522     __ sub(match_mask, ch2, mask1);
3523     __ orr(ch2, ch2, mask2);
3524     __ mv(trailing_zeros, -1); // all bits set
3525
3526     __ bind(L_SMALL_PROCEED);
3527     __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
3528     __ notr(ch2, ch2);
3529     __ andr(match_mask, match_mask, ch2);
3530     __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
3531     __ beqz(match_mask, NOMATCH);
3532
3533     __ bind(L_SMALL_HAS_ZERO_LOOP);
3534     // count bits of trailing zero chars
3535     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, ch2, tmp);
3536     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3537     __ mv(ch2, wordSize / haystack_chr_size);
         // A needle that fits in one word is decided by a single word compare.
3538     __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
3539     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3540     __ mv(trailing_zeros, wordSize / haystack_chr_size);
3541     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3542
         // Verify the rest of the needle one character at a time;
         // trailing_zeros is reused as the character index.
3543     __ bind(L_SMALL_CMP_LOOP);
3544     __ shadd(first, trailing_zeros, needle, first, needle_chr_shift);
3545     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
3546     needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
3547     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
3548     __ addi(trailing_zeros, trailing_zeros, 1);
3549     __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
3550     __ beq(first, ch2, L_SMALL_CMP_LOOP);
3551
3552     __ bind(L_SMALL_CMP_LOOP_NOMATCH);
3553     __ beqz(match_mask, NOMATCH);
3554     // count bits of trailing zero chars
3555     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
3556     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3557     __ addi(result, result, 1);
3558     __ addi(haystack, haystack, haystack_chr_size);
3559     __ j(L_SMALL_HAS_ZERO_LOOP);
3560
3561     __ align(OptoLoopAlignment);
3562     __ bind(L_SMALL_CMP_LOOP_LAST_CMP);
3563     __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3564     __ j(DONE);
3565
3566     __ align(OptoLoopAlignment);
3567     __ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
3568     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3569     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3570     __ j(DONE);
3571
3572     __ align(OptoLoopAlignment);
3573     __ bind(L_HAS_ZERO);
3574     // count bits of trailing zero chars
3575     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
3576     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3577     __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
3578     __ orr(haystack_len, haystack_len, needle_len); // stash needle_len in the upper half (32 bits) of haystack_len
3579     __ subi(result, result, 1); // array index from 0, so result -= 1
3580
3581     __ bind(L_HAS_ZERO_LOOP);
         // needle_len is free as a scratch register from here on; the real
         // needle length is recovered from the upper half of haystack_len.
3582     __ mv(needle_len, wordSize / haystack_chr_size);
3583     __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2);
3584     __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
3585     // load next 8 bytes from haystack, and increase result index
3586     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3587     __ addi(result, result, 1);
3588     __ mv(trailing_zeros, wordSize / haystack_chr_size);
3589     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
3590
3591     // compare one char
3592     __ bind(L_CMP_LOOP);
3593     __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift);
3594     needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
3595     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
3596     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
3597     __ addi(trailing_zeros, trailing_zeros, 1); // next char index
3598     __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2);
3599     __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
3600     __ beq(needle_len, ch2, L_CMP_LOOP);
3601
3602     __ bind(L_CMP_LOOP_NOMATCH);
3603     __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
3604     // count bits of trailing zero chars
3605     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, needle_len, ch2);
3606     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3607     __ addi(haystack, haystack, haystack_chr_size);
3608     __ j(L_HAS_ZERO_LOOP);
3609
3610     __ align(OptoLoopAlignment);
3611     __ bind(L_CMP_LOOP_LAST_CMP);
3612     __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
3613     __ j(DONE);
3614
3615     __ align(OptoLoopAlignment);
3616     __ bind(L_CMP_LOOP_LAST_CMP2);
3617     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3618     __ addi(result, result, 1);
3619     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
3620     __ j(DONE);
3621
3622     __ align(OptoLoopAlignment);
3623     __ bind(L_HAS_ZERO_LOOP_NOMATCH);
3624     // 1) Restore "result" index. Index was wordSize/haystack_chr_size * N until
3625     // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
3626     // so, result was increased at max by wordSize/haystack_chr_size - 1, so,
3627     // respective high bit wasn't changed. L_LOOP_PROCEED will increase
3628     // result by analyzed characters value, so, we can just reset lower bits
3629     // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
3630     // 2) restore needle_len and haystack_len values from "compressed" haystack_len
3631     // 3) advance haystack value to represent next haystack octet. result & 7/3 is
3632     // index of last analyzed substring inside current octet. So, haystack is at
3633     // respective start address. We need to advance it to next octet
3634     __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
3635     __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
3636     __ andi(result, result, haystack_isL ? -8 : -4);
3637     __ slli(tmp, match_mask, haystack_chr_shift);
3638     __ sub(haystack, haystack, tmp);
3639     __ sext(haystack_len, haystack_len, 32);
3640     __ j(L_LOOP_PROCEED);
3641
3642     __ align(OptoLoopAlignment);
3643     __ bind(NOMATCH);
3644     __ mv(result, -1);
3645
3646     __ bind(DONE);
3647     __ pop_reg(spilled_regs, sp);
3648     __ ret();
3649     return entry;
3650   }
3651 
3652   void generate_string_indexof_stubs()
3653   {
         // Generates the ll, uu and ul linear indexOf stubs (in that order) and
         // publishes their entry points in StubRoutines::riscv.
3654     StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(StubId::stubgen_string_indexof_linear_ll_id);
3655     StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(StubId::stubgen_string_indexof_linear_uu_id);
3656     StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(StubId::stubgen_string_indexof_linear_ul_id);
3657   }
3658 
3659 #ifdef COMPILER2
3660   void generate_lookup_secondary_supers_table_stub() {
         // Emits one lookup stub per secondary-supers table slot
         // (Klass::SECONDARY_SUPERS_TABLE_SIZE in total) and records each entry
         // point in StubRoutines::_lookup_secondary_supers_table_stubs[slot].
3661     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
3662     StubCodeMark mark(this, stub_id);
3663
3664     const Register
3665       r_super_klass  = x10,
3666       r_array_base   = x11,
3667       r_array_length = x12,
3668       r_array_index  = x13,
3669       r_sub_klass    = x14,
3670       result         = x15,
3671       r_bitmap       = x16;
3672
3673     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
3674       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
           // NOTE(review): L_success is declared but not referenced in this loop.
3675       Label L_success;
3676       __ enter();
3677       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, result,
3678                                              r_array_base, r_array_length, r_array_index,
3679                                              r_bitmap, slot, /*stub_is_near*/true);
3680       __ leave();
3681       __ ret();
3682     }
3683   }
3684 
3685   // Slow path implementation for UseSecondarySupersTable.
       // Emits a stub that consists of the code produced by
       // lookup_secondary_supers_table_slow_path() followed by a return, and
       // returns the stub's entry address. No frame is set up by this stub.
3686   address generate_lookup_secondary_supers_table_slow_path_stub() {
3687     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
3688     StubCodeMark mark(this, stub_id);
3689
3690     address start = __ pc();
3691     const Register
3692       r_super_klass  = x10,        // argument
3693       r_array_base   = x11,        // argument
3694       temp1          = x12,        // tmp
3695       r_array_index  = x13,        // argument
3696       result         = x15,        // argument
3697       r_bitmap       = x16;        // argument
3698
3699
3700     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
3701     __ ret();
3702
3703     return start;
3704   }
3705 
3706   address generate_mulAdd()
3707   {
         // Emits the stubgen_mulAdd_id stub: a frame-wrapped call of the
         // mul_add() macro on the registers bound below. Returns the entry
         // address of the stub.
3708     __ align(CodeEntryAlignment);
3709     StubId stub_id = StubId::stubgen_mulAdd_id;
3710     StubCodeMark mark(this, stub_id);
3711
3712     address entry = __ pc();
3713
         // x10-x14 are passed to mul_add() as out, in, offset, len and k;
         // x28 serves as its scratch register.
3714     const Register out     = x10;
3715     const Register in      = x11;
3716     const Register offset  = x12;
3717     const Register len     = x13;
3718     const Register k       = x14;
3719     const Register tmp     = x28;
3720
3721     BLOCK_COMMENT("Entry:");
3722     __ enter();
3723     __ mul_add(out, in, offset, len, k, tmp);
3724     __ leave();
3725     __ ret();
3726
3727     return entry;
3728   }
3729 
3730   /**
        *  Emits the stubgen_multiplyToLen_id stub: a frame-wrapped call of the
        *  multiply_to_len() macro. Returns the entry address of the stub.
        *
3731    *  Arguments:
3732    *
3733    *  Input:
3734    *    c_rarg0   - x address
3735    *    c_rarg1   - x length
3736    *    c_rarg2   - y address
3737    *    c_rarg3   - y length
3738    *    c_rarg4   - z address
        *
        *  x15-x17, x7 and x28-x31 are handed to multiply_to_len() as temporaries.
3739    */
3740   address generate_multiplyToLen()
3741   {
3742     __ align(CodeEntryAlignment);
3743     StubId stub_id = StubId::stubgen_multiplyToLen_id;
3744     StubCodeMark mark(this, stub_id);
3745     address entry = __ pc();
3746
3747     const Register x     = x10;
3748     const Register xlen  = x11;
3749     const Register y     = x12;
3750     const Register ylen  = x13;
3751     const Register z     = x14;
3752
3753     const Register tmp0  = x15;
3754     const Register tmp1  = x16;
3755     const Register tmp2  = x17;
3756     const Register tmp3  = x7;
3757     const Register tmp4  = x28;
3758     const Register tmp5  = x29;
3759     const Register tmp6  = x30;
3760     const Register tmp7  = x31;
3761
3762     BLOCK_COMMENT("Entry:");
3763     __ enter(); // required for proper stackwalking of RuntimeStub frame
3764     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3765     __ leave(); // required for proper stackwalking of RuntimeStub frame
3766     __ ret();
3767
3768     return entry;
3769   }
3770 
3771   address generate_squareToLen()
3772   {
         // Emits the stubgen_squareToLen_id stub. Squaring is implemented by
         // reusing multiply_to_len() with both operands set to x: the stub copies
         // x/xlen into the y/ylen registers before invoking the macro.
         // Returns the entry address of the stub.
3773     __ align(CodeEntryAlignment);
3774     StubId stub_id = StubId::stubgen_squareToLen_id;
3775     StubCodeMark mark(this, stub_id);
3776     address entry = __ pc();
3777
3778     const Register x     = x10;
3779     const Register xlen  = x11;
3780     const Register z     = x12;
3781     const Register y     = x14; // == x
3782     const Register ylen  = x15; // == xlen
3783
3784     const Register tmp0  = x13; // zlen, unused
3785     const Register tmp1  = x16;
3786     const Register tmp2  = x17;
3787     const Register tmp3  = x7;
3788     const Register tmp4  = x28;
3789     const Register tmp5  = x29;
3790     const Register tmp6  = x30;
3791     const Register tmp7  = x31;
3792
3793     BLOCK_COMMENT("Entry:");
3794     __ enter();
         // Second operand := first operand.
3795     __ mv(y, x);
3796     __ mv(ylen, xlen);
3797     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3798     __ leave();
3799     __ ret();
3800
3801     return entry;
3802   }
3803 
3804   // Arguments:
3805   //
3806   // Input:
3807   //   c_rarg0   - newArr address
3808   //   c_rarg1   - oldArr address
3809   //   c_rarg2   - newIdx
3810   //   c_rarg3   - shiftCount
3811   //   c_rarg4   - numIter
3812   //
       // Emits the stubgen_bigIntegerLeftShiftWorker_id stub and returns its entry
       // address. Using 32-bit vector elements, each output element is
       //   (oldArr[i] << shiftCount) | (oldArr[i + 1] >> (32 - shiftCount))
       // stored starting at newArr[newIdx], for numIter elements. The stub
       // returns immediately when numIter is zero. Clobbers t0, t1, c_rarg5 and
       // vector registers v0 and v4 as used below (with m4 grouping).
3813   address generate_bigIntegerLeftShift() {
3814     __ align(CodeEntryAlignment);
3815     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
3816     StubCodeMark mark(this, stub_id);
3817     address entry = __ pc();
3818
3819     Label loop, exit;
3820
3821     Register newArr        = c_rarg0;
3822     Register oldArr        = c_rarg1;
3823     Register newIdx        = c_rarg2;
3824     Register shiftCount    = c_rarg3;
3825     Register numIter       = c_rarg4;
3826
3827     Register shiftRevCount = c_rarg5;
3828     Register oldArrNext    = t1;
3829
3830     __ beqz(numIter, exit);
         // Point newArr at element newIdx (4-byte elements, hence shift by 2).
3831     __ shadd(newArr, newIdx, newArr, t0, 2);
3832
         // shiftRevCount = 32 - shiftCount
3833     __ mv(shiftRevCount, 32);
3834     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3835
3836     __ bind(loop);
3837     __ addi(oldArrNext, oldArr, 4);
         // t0 = number of elements handled by this iteration.
3838     __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
3839     __ vle32_v(v0, oldArr);
3840     __ vle32_v(v4, oldArrNext);
3841     __ vsll_vx(v0, v0, shiftCount);
3842     __ vsrl_vx(v4, v4, shiftRevCount);
3843     __ vor_vv(v0, v0, v4);
3844     __ vse32_v(v0, newArr);
         // Advance the remaining count and both pointers by t0 elements.
3845     __ sub(numIter, numIter, t0);
3846     __ shadd(oldArr, t0, oldArr, t1, 2);
3847     __ shadd(newArr, t0, newArr, t1, 2);
3848     __ bnez(numIter, loop);
3849
3850     __ bind(exit);
3851     __ ret();
3852
3853     return entry;
3854   }
3855 
3856   // Arguments:
3857   //
3858   // Input:
3859   //   c_rarg0   - newArr address
3860   //   c_rarg1   - oldArr address
3861   //   c_rarg2   - newIdx
3862   //   c_rarg3   - shiftCount
3863   //   c_rarg4   - numIter
3864   //
       // Emits the stubgen_bigIntegerRightShiftWorker_id stub and returns its
       // entry address. Using 32-bit vector elements, each output element is
       //   (oldArr[i + 1] >> shiftCount) | (oldArr[i] << (32 - shiftCount))
       // stored at newArr[newIdx + i]. Chunks are processed from the highest
       // index down to index 0. The stub returns immediately when numIter is
       // zero. Clobbers t0, t1, c_rarg5, c_rarg6 and vector registers v0 and v4
       // as used below (with m4 grouping).
3865   address generate_bigIntegerRightShift() {
3866     __ align(CodeEntryAlignment);
3867     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
3868     StubCodeMark mark(this, stub_id);
3869     address entry = __ pc();
3870
3871     Label loop, exit;
3872
3873     Register newArr        = c_rarg0;
3874     Register oldArr        = c_rarg1;
3875     Register newIdx        = c_rarg2;
3876     Register shiftCount    = c_rarg3;
3877     Register numIter       = c_rarg4;
         // idx aliases numIter: the remaining count doubles as the start index
         // of the chunk being processed.
3878     Register idx           = numIter;
3879
3880     Register shiftRevCount = c_rarg5;
3881     Register oldArrNext    = c_rarg6;
3882     Register newArrCur     = t0;
3883     Register oldArrCur     = t1;
3884
3885     __ beqz(idx, exit);
         // Point newArr at element newIdx (4-byte elements, hence shift by 2).
3886     __ shadd(newArr, newIdx, newArr, t0, 2);
3887
         // shiftRevCount = 32 - shiftCount
3888     __ mv(shiftRevCount, 32);
3889     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3890
3891     __ bind(loop);
         // t0 = number of elements handled by this iteration. It is consumed by
         // the subtraction right away because t0 is reused as newArrCur below.
3892     __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
3893     __ sub(idx, idx, t0);
3894     __ shadd(oldArrNext, idx, oldArr, t1, 2);
3895     __ shadd(newArrCur, idx, newArr, t1, 2);
3896     __ addi(oldArrCur, oldArrNext, 4);
3897     __ vle32_v(v0, oldArrCur);
3898     __ vle32_v(v4, oldArrNext);
3899     __ vsrl_vx(v0, v0, shiftCount);
3900     __ vsll_vx(v4, v4, shiftRevCount);
3901     __ vor_vv(v0, v0, v4);
3902     __ vse32_v(v0, newArrCur);
3903     __ bnez(idx, loop);
3904
3905     __ bind(exit);
3906     __ ret();
3907
3908     return entry;
3909   }
3910 #endif
3911 
3912 #ifdef COMPILER2
3913   class MontgomeryMultiplyGenerator : public MacroAssembler {
3914 
3915     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3916       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;
3917 
3918     RegSet _toSave;
3919     bool _squaring;
3920 
3921   public:
3922     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3923       : MacroAssembler(as->code()), _squaring(squaring) {
           // Binds every symbolic register of the generator to a machine register
           // and computes the set of registers that must be saved (_toSave).
           // Emission goes into the code buffer of 'as'.
3924
3925       // Register allocation
3926
           // Registers are handed out consecutively starting at x10.
3927       RegSetIterator<Register> regs = RegSet::range(x10, x26).begin();
3928       Pa_base = *regs;       // Argument registers
           // When squaring, b aliases a, so no separate argument register is taken.
3929       if (squaring) {
3930         Pb_base = Pa_base;
3931       } else {
3932         Pb_base = *++regs;
3933       }
3934       Pn_base = *++regs;
3935       Rlen= *++regs;
3936       inv = *++regs;
3937       Pm_base = *++regs;
3938
3939                         // Working registers:
3940       Ra =  *++regs;    // The current digit of a, b, n, and m.
3941       Rb =  *++regs;
3942       Rm =  *++regs;
3943       Rn =  *++regs;
3944
3945       Pa =  *++regs;      // Pointers to the current/next digit of a, b, n, and m.
3946       Pb =  *++regs;
3947       Pm =  *++regs;
3948       Pn =  *++regs;
3949
3950       tmp0 =  *++regs;    // Three registers which form a
3951       tmp1 =  *++regs;    // triple-precision accumulator.
3952       tmp2 =  *++regs;
3953
3954       Ri =  x6;         // Inner and outer loop indexes.
3955       Rj =  x7;
3956
3957       Rhi_ab = x28;     // Product registers: low and high parts
3958       Rlo_ab = x29;     // of a*b and m*n.
3959       Rhi_mn = x30;
3960       Rlo_mn = x31;
3961
3962       // x18 and up are callee-saved.
           // Save x18 through the last register allocated above, plus Pm_base.
3963       _toSave = RegSet::range(x18, *regs) + Pm_base;
3964     }
3965 
3966   private:
3967     void save_regs() {
           // Push the register set computed by the constructor onto the stack.
3968       push_reg(_toSave, sp);
3969     }
3970 
3971     void restore_regs() {
           // Pop the register set pushed by save_regs().
3972       pop_reg(_toSave, sp);
3973     }
3974 
3975     template <typename T>
3976     void unroll_2(Register count, T block) {
           // Emits a loop, unrolled by two, that runs the code generated by the
           // member function 'block' 'count' times (count is a run-time register
           // value and is clobbered). A zero count skips the loop; an odd count
           // enters at the second copy of the body. Clobbers t0.
3977       Label loop, end, odd;
3978       beqz(count, end);
3979       test_bit(t0, count, 0);
3980       bnez(t0, odd);
3981       align(16);
3982       bind(loop);
3983       (this->*block)();
3984       bind(odd);
3985       (this->*block)();
3986       subi(count, count, 2);
3987       bgtz(count, loop);
3988       bind(end);
3989     }
3990 
3991     template <typename T>
3992     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
           // Same two-way unrolled loop as the overload above, but 'block' takes
           // (d, s, tmp), and 'tmp' rather than t0 is used for the parity test of
           // 'count'. 'count' is clobbered.
3993       Label loop, end, odd;
3994       beqz(count, end);
3995       test_bit(tmp, count, 0);
3996       bnez(tmp, odd);
3997       align(16);
3998       bind(loop);
3999       (this->*block)(d, s, tmp);
4000       bind(odd);
4001       (this->*block)(d, s, tmp);
4002       subi(count, count, 2);
4003       bgtz(count, loop);
4004       bind(end);
4005     }
4006 
4007     void pre1(RegisterOrConstant i) {
           // Emits the set-up for one pass: positions Pa/Pm at the start of a/m
           // and Pb/Pn at word index i of b/n, loads the four current digits and
           // clears the pending m*n product. 'i' may be a register or a constant.
           // Clobbers t0.
4008       block_comment("pre1");
4009       // Pa = Pa_base;
4010       // Pb = Pb_base + i;
4011       // Pm = Pm_base;
4012       // Pn = Pn_base + i;
4013       // Ra = *Pa;
4014       // Rb = *Pb;
4015       // Rm = *Pm;
4016       // Rn = *Pn;
           // t0 = i scaled to a byte offset (i << LogBytesPerWord).
4017       if (i.is_register()) {
4018         slli(t0, i.as_register(), LogBytesPerWord);
4019       } else {
4020         mv(t0, i.as_constant());
4021         slli(t0, t0, LogBytesPerWord);
4022       }
4023
4024       mv(Pa, Pa_base);
4025       add(Pb, Pb_base, t0);
4026       mv(Pm, Pm_base);
4027       add(Pn, Pn_base, t0);
4028
4029       ld(Ra, Address(Pa));
4030       ld(Rb, Address(Pb));
4031       ld(Rm, Address(Pm));
4032       ld(Rn, Address(Pn));
4033
4034       // Zero the m*n result.
4035       mv(Rhi_mn, zr);
4036       mv(Rlo_mn, zr);
4037     }
4038 
4039     // The core multiply-accumulate step of a Montgomery
4040     // multiplication.  The idea is to schedule operations as a
4041     // pipeline so that instructions with long latencies (loads and
4042     // multiplies) have time to complete before their results are
4043     // used.  This most benefits in-order implementations of the
4044     // architecture but out-of-order ones also benefit.
         //
         // Each product is accumulated one half-step after it is computed: the
         // m*n product left pending by the previous step (or zeroed by pre1) is
         // added first, and the m*n product computed here is left pending in
         // Rhi_mn/Rlo_mn for whatever code follows.
4045     void step() {
4046       block_comment("step");
4047       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
4048       // Ra = *++Pa;
4049       // Rb = *--Pb;
4050       mulhu(Rhi_ab, Ra, Rb);
4051       mul(Rlo_ab, Ra, Rb);
4052       addi(Pa, Pa, wordSize);
4053       ld(Ra, Address(Pa));
4054       subi(Pb, Pb, wordSize);
4055       ld(Rb, Address(Pb));
4056       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
4057                                             // previous iteration.
4058       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4059       // Rm = *++Pm;
4060       // Rn = *--Pn;
4061       mulhu(Rhi_mn, Rm, Rn);
4062       mul(Rlo_mn, Rm, Rn);
4063       addi(Pm, Pm, wordSize);
4064       ld(Rm, Address(Pm));
4065       subi(Pn, Pn, wordSize);
4066       ld(Rn, Address(Pn));
4067       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4068     }
4069 
4070     void post1() {
           // Emits the tail of a pass: accumulates the last a*b product and the
           // pending m*n product, computes and stores the new digit
           // Rm = tmp0 * inv at *Pm, folds Rm * Rn into the accumulator and then
           // shifts the accumulator down by one word (tmp0 <- tmp1, tmp1 <- tmp2,
           // tmp2 <- 0). Clobbers t0.
4071       block_comment("post1");
4072
4073       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
4074       // Unlike step(), Ra and Rb are not reloaded afterwards:
4075       // no loads are emitted in this function.
4076       mulhu(Rhi_ab, Ra, Rb);
4077       mul(Rlo_ab, Ra, Rb);
4078       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
4079       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4080
4081       // *Pm = Rm = tmp0 * inv;
4082       mul(Rm, tmp0, inv);
4083       sd(Rm, Address(Pm));
4084
4085       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4086       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
4087       mulhu(Rhi_mn, Rm, Rn);
4088
4089 #ifndef PRODUCT
4090       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
4091       {
4092         mul(Rlo_mn, Rm, Rn);
4093         add(Rlo_mn, tmp0, Rlo_mn);
4094         Label ok;
4095         beqz(Rlo_mn, ok);
4096         stop("broken Montgomery multiply");
4097         bind(ok);
4098       }
4099 #endif
4100       // We have very carefully set things up so that
4101       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
4102       // the lower half of Rm * Rn because we know the result already:
4103       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
4104       // tmp0 != 0.  So, rather than do a mul and a cad we just set
4105       // the carry flag iff tmp0 is nonzero.
4106       //
4107       // mul(Rlo_mn, Rm, Rn);
4108       // cad(zr, tmp0, Rlo_mn);
           // t0 = (tmp0 - 1 <u tmp0), i.e. 1 when tmp0 != 0 and 0 when tmp0 == 0.
4109       subi(t0, tmp0, 1);
4110       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
4111       cadc(tmp0, tmp1, Rhi_mn, t0);
4112       adc(tmp1, tmp2, zr, t0);
4113       mv(tmp2, zr);
4114     }
4115 
    // Emits the setup for outer-loop iteration i of the second pass
    // (the "i >= len" loop): positions Pa/Pb/Pm/Pn relative to their
    // base registers, preloads the first digits into Ra/Rb/Rm/Rn and
    // clears the pending m*n product.  Leaves i-len in Rj and uses Ra
    // as an address temporary before Ra is loaded with a digit.
    void pre2(Register i, Register len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      sub(Rj, i, len);
      // Rj == i-len

      // Ra as temp register
      slli(Ra, Rj, LogBytesPerWord);
      add(Pa, Pa_base, Ra);
      add(Pm, Pm_base, Ra);
      slli(Ra, len, LogBytesPerWord);
      add(Pb, Pb_base, Ra);
      add(Pn, Pn_base, Ra);

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      addi(Pa, Pa, wordSize);
      ld(Ra, Address(Pa));
      subi(Pb, Pb, wordSize);
      ld(Rb, Address(Pb));
      addi(Pm, Pm, wordSize);
      ld(Rm, Address(Pm));
      subi(Pn, Pn, wordSize);
      ld(Rn, Address(Pn));

      // No m*n product is pending at the start of the iteration.
      mv(Rhi_mn, zr);
      mv(Rlo_mn, zr);
    }
4150 
    // Emits the tail of outer-loop iteration i of the second pass:
    // folds the pending m*n product into the accumulator, stores the
    // finished low digit to Pm_base[i-len] and shifts the accumulator
    // down by one word.  Clobbers Rj (used as an address temporary) and
    // uses t0 as the carry.
    void post2(Register i, Register len) {
      block_comment("post2");
      sub(Rj, i, len);

      cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part

      // As soon as we know the least significant digit of our result,
      // store it.
      // Pm_base[i-len] = tmp0;
      // Rj as temp register
      slli(Rj, Rj, LogBytesPerWord);
      add(Rj, Pm_base, Rj);
      sd(tmp0, Address(Rj));

      // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
      cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
      adc(tmp1, tmp2, zr, t0);
      mv(tmp2, zr);
    }
4170 
4171     // A carry in tmp0 after Montgomery multiplication means that we
4172     // should subtract multiples of n from our result in m.  We'll
4173     // keep doing that until there is no carry.
4174     void normalize(Register len) {
4175       block_comment("normalize");
4176       // while (tmp0)
4177       //   tmp0 = sub(Pm_base, Pn_base, tmp0, len);
4178       Label loop, post, again;
4179       Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
4180       beqz(tmp0, post); {
4181         bind(again); {
4182           mv(i, zr);
4183           mv(cnt, len);
4184           slli(Rn, i, LogBytesPerWord);
4185           add(Rm, Pm_base, Rn);
4186           ld(Rm, Address(Rm));
4187           add(Rn, Pn_base, Rn);
4188           ld(Rn, Address(Rn));
4189           mv(t0, 1); // set carry flag, i.e. no borrow
4190           align(16);
4191           bind(loop); {
4192             notr(Rn, Rn);
4193             add(Rm, Rm, t0);
4194             add(Rm, Rm, Rn);
4195             sltu(t0, Rm, Rn);
4196             slli(Rn, i, LogBytesPerWord); // Rn as temp register
4197             add(Rn, Pm_base, Rn);
4198             sd(Rm, Address(Rn));
4199             addi(i, i, 1);
4200             slli(Rn, i, LogBytesPerWord);
4201             add(Rm, Pm_base, Rn);
4202             ld(Rm, Address(Rm));
4203             add(Rn, Pn_base, Rn);
4204             ld(Rn, Address(Rn));
4205             subi(cnt, cnt, 1);
4206           } bnez(cnt, loop);
4207           subi(tmp0, tmp0, 1);
4208           add(tmp0, tmp0, t0);
4209         } bnez(tmp0, again);
4210       } bind(post);
4211     }
4212 
    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    //
    // len is a count of 64-bit words.  s is first advanced to the end
    // of the source; reverse1 then walks it backwards while d moves
    // forwards.  unroll_2 is defined elsewhere; presumably it emits a
    // loop that applies the given member function tmp1 times.
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      // NOTE(review): x28 and above are assumed to be reserved for the
      // product registers, hence the checks on the temporaries -- confirm
      // against the register allocation in the constructor.
      assert(tmp1->encoding() < x28->encoding(), "register corruption");
      assert(tmp2->encoding() < x28->encoding(), "register corruption");

      shadd(s, len, s, tmp1, LogBytesPerWord);
      mv(tmp1, len);
      unroll_2(tmp1,  &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      // s = d - len words, i.e. the value d had on entry.
      slli(tmp1, len, LogBytesPerWord);
      sub(s, d, tmp1);
    }
    // [63...0] -> [31...0][63...32]
    //
    // Copies one 64-bit word for reverse(): steps s back by one word,
    // loads the word there, rotates it by 32 bits (swapping its two
    // 32-bit halves), stores it at d and advances d by one word.
    // Clobbers tmp; t0 is passed to ror, presumably as its scratch
    // register.
    void reverse1(Register d, Register s, Register tmp) {
      subi(s, s, wordSize);
      ld(tmp, Address(s));
      ror(tmp, tmp, 32, t0);
      sd(tmp, Address(d));
      addi(d, d, wordSize);
    }
4236 
4237     void step_squaring() {
4238       // An extra ACC
4239       step();
4240       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4241     }
4242 
    // Emits a conditional extra multiply-accumulate for the squaring
    // loops: when bit 0 of i is clear (i is even) the current Ra*Rb
    // product is accumulated once; otherwise the emitted code skips it.
    // Clobbers t0, Rhi_ab and Rlo_ab.
    void last_squaring(Register i) {
      Label dont;
      // if ((i & 1) == 0) {
      test_bit(t0, i, 0);
      bnez(t0, dont); {
        // MACC(Ra, Rb, tmp0, tmp1, tmp2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        // (Only the multiply-accumulate is emitted here; no further
        // digits are loaded.)
        mulhu(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
      } bind(dont);
    }
4256 
    // The m*n half of step(), used by the squaring loops for the
    // iterations that have no matching a*b product: accumulates the
    // pending m*n product, computes the next one into Rhi_mn:Rlo_mn and
    // loads the following Rm/Rn digits.
    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n

      // MACC(Rm, Rn, tmp0, tmp1, tmp2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      mulhu(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      addi(Pm, Pm, wordSize);
      ld(Rm, Address(Pm));
      subi(Pn, Pn, wordSize);
      ld(Rn, Address(Pn));
    }
4270 
    // Squaring counterpart of post1(): the a*b products have already
    // been accumulated by the squaring loops, so only the pending m*n
    // product is folded in before the new m digit is computed as
    // tmp0 * inv, stored through Pm, and the accumulator is shifted down
    // by one word.  Uses t0 as the carry.
    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n

      // *Pm = Rm = tmp0 * inv;
      mul(Rm, tmp0, inv);
      sd(Rm, Address(Pm));

      // MACC(Rm, Rn, tmp0, tmp1, tmp2);
      // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
      mulhu(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, tmp0, Rlo_mn);
        Label ok;
        beqz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
      // tmp0 != 0.  So, rather than do a mul and a cad we just set
      // the carry flag iff tmp0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // cad(zr, tmp0, Rlo_mn);
      subi(t0, tmp0, 1);
      sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
      cadc(tmp0, tmp1, Rhi_mn, t0);
      adc(tmp1, tmp2, zr, t0);
      mv(tmp2, zr);
    }
4308 
    // Accumulate the two-word value Rhi:Rlo into the three-word
    // accumulator tmp2:tmp1:tmp0.
    // use t0 as carry
    //
    // The parameters tmp0/tmp1/tmp2 shadow the members of the same
    // names; every call visible here passes those members.  The exact
    // carry behaviour comes from the cad/cadc/adc helpers, which are
    // defined elsewhere.
    void acc(Register Rhi, Register Rlo,
             Register tmp0, Register tmp1, Register tmp2) {
      cad(tmp0, tmp0, Rlo, t0);   // low word, carry out in t0
      cadc(tmp1, tmp1, Rhi, t0);  // middle word plus carry
      adc(tmp2, tmp2, zr, t0);    // top word absorbs the last carry
    }
4316 
4317   public:
    /**
     * Fast Montgomery multiplication.  The derivation of the
     * algorithm is in A Cryptographic Library for the Motorola
     * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements b
     *   c_rarg2   - int array elements n (the modulus)
     *   c_rarg3   - int length
     *   c_rarg4   - int inv
     *   c_rarg5   - int array elements m (the result)
     *
     * Inputs for squaring:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     * Returns the entry address of the emitted stub.  The out-of-line
     * failure path ("argh") is emitted before the aligned entry point.
     * The emitted code returns immediately when the length is zero.
     */
    address generate_multiply() {
      Label argh, nothing;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      beqz(Rlen, nothing);

      enter();

      // Make room.
      // At most 512 ints are accepted: 512 * 4 * sizeof(jint) == 8192
      // bytes of stack, matching the message above.
      mv(Ra, 512);
      bgt(Rlen, Ra, argh);
      slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
      sub(Ra, sp, Ra);
      andi(sp, Ra, -2 * wordSize); // round the new sp down to a 2-word boundary

      srliw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        // Each reverse() advances Ra past the words it wrote and
        // repoints the source base register at the stack copy.
        reverse(Ra, Pa_base, Rlen, Ri, Rj);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, Ri, Rj);
        reverse(Ra, Pn_base, Rlen, Ri, Rj);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ld(Rn, Address(Pn_base));
        mul(Rlo_mn, Rn, inv);
        mv(t0, -1);
        Label ok;
        beq(Rlo_mn, t0, ok);
        stop("broken inverse in Montgomery multiply");
        bind(ok);
      }
#endif

      // The working copy of m lives on the stack right after the
      // reversed inputs.
      mv(Pm_base, Ra);

      // Clear the triple-precision accumulator.
      mv(tmp0, zr);
      mv(tmp1, zr);
      mv(tmp2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mv(Ri, zr); {
        Label loop, end;
        bge(Ri, Rlen, end);

        bind(loop);
        pre1(Ri);

        block_comment("  for (j = i; j; j--) {"); {
          mv(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post1();
        addiw(Ri, Ri, 1);
        blt(Ri, Rlen, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mv(Ri, Rlen); {
        Label loop, end;
        slli(t0, Rlen, 1);
        bge(Ri, t0, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment("  for (j = len*2-i-1; j; j--) {"); {
          slliw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subiw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addiw(Ri, Ri, 1);
        slli(t0, Rlen, 1); // recompute 2*len; t0 is used as the carry inside the loop body
        blt(Ri, t0, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mv(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, Ri, Rj);

      leave();
      bind(nothing);
      ret();

      return entry;
    }
4452 
    /**
     * Fast Montgomery squaring.  Follows the same structure as
     * generate_multiply(), but uses the squaring step helpers, which
     * accumulate each a*b product twice (step_squaring), add one more
     * when the index is even (last_squaring), and then run the remaining
     * m*n-only steps (extra_step_squaring).
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     * Returns the entry address of the emitted stub.  The out-of-line
     * failure path ("argh") is emitted before the aligned entry point.
     */
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      // At most 512 ints are accepted: 512 * 4 * sizeof(jint) == 8192
      // bytes of stack, matching the message above.
      mv(Ra, 512);
      bgt(Rlen, Ra, argh);
      slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
      sub(Ra, sp, Ra);
      andi(sp, Ra, -2 * wordSize); // round the new sp down to a 2-word boundary

      srliw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, Ri, Rj);
        reverse(Ra, Pn_base, Rlen, Ri, Rj);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      // The working copy of m lives on the stack right after the
      // reversed inputs.
      mv(Pm_base, Ra);

      // Clear the triple-precision accumulator.
      mv(tmp0, zr);
      mv(tmp1, zr);
      mv(tmp2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mv(Ri, zr); {
        Label loop, end;
        bind(loop);
        bge(Ri, Rlen, end);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          addi(Rj, Ri, 1);
          srliw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          srliw(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        addi(Ri, Ri, 1);
        blt(Ri, Rlen, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mv(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        slli(t0, Rlen, 1);
        bge(Ri, t0, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          slli(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          subi(Rj, Rj, 1);
          srliw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          slli(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          srliw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addi(Ri, Ri, 1);
        slli(t0, Rlen, 1); // recompute 2*len; t0 is used as the carry inside the loop body
        blt(Ri, t0, loop);

        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mv(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, Ri, Rj);

      leave();
      ret();

      return entry;
    }
4578   };
4579 
4580 #endif // COMPILER2
4581 
  // Emits the code shared by the continuation thaw stubs and returns
  // the address of its first instruction.
  //
  // kind selects the variant: for the return-barrier kinds sp is first
  // reloaded from the thread's cont_entry field and the return value in
  // x10/f10 is preserved around the runtime calls; for the
  // return-barrier-exception kind the stub ends by jumping to the
  // exception handler found for the return address instead of
  // returning.
  address generate_cont_thaw(Continuation::thaw_kind kind) {
    bool return_barrier = Continuation::is_thaw_return_barrier(kind);
    bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);

    address start = __ pc();

    if (return_barrier) {
      __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
    }

#ifndef PRODUCT
    // Debug check: sp must equal the thread's cont_entry value here.
    {
      Label OK;
      __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
      __ beq(sp, t0, OK);
      __ stop("incorrect sp");
      __ bind(OK);
    }
#endif

    if (return_barrier) {
      // preserve possible return value from a method returning to the return barrier
      __ subi(sp, sp, 2 * wordSize);
      __ fsd(f10, Address(sp, 0 * wordSize));
      __ sd(x10, Address(sp, 1 * wordSize));
    }

    __ mv(c_rarg1, (return_barrier ? 1 : 0));
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
    __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ld(x10, Address(sp, 1 * wordSize));
      __ fld(f10, Address(sp, 0 * wordSize));
      __ addi(sp, sp, 2 * wordSize);
    }

#ifndef PRODUCT
    // Debug check: sp must be back at the thread's cont_entry value.
    {
      Label OK;
      __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
      __ beq(sp, t0, OK);
      __ stop("incorrect sp");
      __ bind(OK);
    }
#endif

    Label thaw_success;
    // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
    __ bnez(t1, thaw_success);
    __ j(RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
    __ bind(thaw_success);

    // make room for the thawed frames
    __ sub(t0, sp, t1);
    __ andi(sp, t0, -16); // align

    if (return_barrier) {
      // save original return value -- again
      __ subi(sp, sp, 2 * wordSize);
      __ fsd(f10, Address(sp, 0 * wordSize));
      __ sd(x10, Address(sp, 1 * wordSize));
    }

    // If we want, we can templatize thaw by kind, and have three different entries
    __ mv(c_rarg1, kind);

    __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
    __ mv(t1, x10); // x10 is the sp of the yielding frame

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      __ ld(x10, Address(sp, 1 * wordSize));
      __ fld(f10, Address(sp, 0 * wordSize));
      __ addi(sp, sp, 2 * wordSize);
    } else {
      __ mv(x10, zr); // return 0 (success) from doYield
    }

    // we're now on the yield frame (which is in an address above us b/c sp has been pushed down)
    __ mv(fp, t1);
    __ subi(sp, t1, 2 * wordSize); // now pointing to fp spill

    if (return_barrier_exception) {
      __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
      __ verify_oop(x10);
      __ mv(x9, x10); // save return value containing the exception oop in callee-saved x9

      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);

      // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc

      __ mv(x11, x10); // the exception handler
      __ mv(x10, x9); // restore return value containing the exception oop
      __ verify_oop(x10);

      __ leave();
      __ mv(x13, ra);
      __ jr(x11); // the exception handler
    } else {
      // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
      __ leave();
      __ ret();
    }

    return start;
  }
4690 
4691   address generate_cont_thaw() {
4692     if (!Continuations::enabled()) return nullptr;
4693 
4694     StubId stub_id = StubId::stubgen_cont_thaw_id;
4695     StubCodeMark mark(this, stub_id);
4696     address start = __ pc();
4697     generate_cont_thaw(Continuation::thaw_top);
4698     return start;
4699   }
4700 
4701   address generate_cont_returnBarrier() {
4702     if (!Continuations::enabled()) return nullptr;
4703 
4704     // TODO: will probably need multiple return barriers depending on return type
4705     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
4706     StubCodeMark mark(this, stub_id);
4707     address start = __ pc();
4708 
4709     generate_cont_thaw(Continuation::thaw_return_barrier);
4710 
4711     return start;
4712   }
4713 
4714   address generate_cont_returnBarrier_exception() {
4715     if (!Continuations::enabled()) return nullptr;
4716 
4717     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
4718     StubCodeMark mark(this, stub_id);
4719     address start = __ pc();
4720 
4721     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
4722 
4723     return start;
4724   }
4725 
  // Generates the continuation preemption stub.  Returns nullptr when
  // continuations are disabled, otherwise the stub's start address.
  //
  // The emitted code resets the last Java frame, drops sp back to the
  // enterSpecial frame and then either unwinds that frame and returns
  // (normal preemption) or, when the thread's preemption_cancelled flag
  // is set, clears the flag and jumps to the address loaded from
  // ContinuationEntry::thaw_call_pc_address().
  address generate_cont_preempt_stub() {
    if (!Continuations::enabled()) return nullptr;
    StubId stub_id = StubId::stubgen_cont_preempt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    __ reset_last_Java_frame(true);

    // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
    __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));

    Label preemption_cancelled;
    __ lbu(t0, Address(xthread, JavaThread::preemption_cancelled_offset()));
    __ bnez(t0, preemption_cancelled);

    // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
    SharedRuntime::continuation_enter_cleanup(_masm);
    __ leave();
    __ ret();

    // We acquired the monitor after freezing the frames so call thaw to continue execution.
    __ bind(preemption_cancelled);
    __ sb(zr, Address(xthread, JavaThread::preemption_cancelled_offset())); // clear the flag
    // Point fp just above the ContinuationEntry (entry size plus two words).
    __ la(fp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size() + 2 * wordSize)));
    __ la(t1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
    __ ld(t1, Address(t1));
    __ jr(t1);

    return start;
  }
4756 
4757 #ifdef COMPILER2
4758 
4759 #undef __
4760 #define __ this->
4761 
4762   class Sha2Generator : public MacroAssembler {
4763     StubCodeGenerator* _cgen;
4764    public:
      // Builds an assembler over masm's code buffer.  cgen is kept so
      // that the generated stub can be registered through StubCodeMark.
      Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
4766       address generate_sha256_implCompress(StubId stub_id) {
4767         return generate_sha2_implCompress(Assembler::e32, stub_id);
4768       }
4769       address generate_sha512_implCompress(StubId stub_id) {
4770         return generate_sha2_implCompress(Assembler::e64, stub_id);
4771       }
4772    private:
4773 
4774     void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
4775       if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
4776       else                            __ vle64_v(vr, sr);
4777     }
4778 
4779     void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
4780       if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
4781       else                            __ vse64_v(vr, sr);
4782     }
4783 
4784     // Overview of the logic in each "quad round".
4785     //
    // The code below repeats 16/20 times the logic implementing four rounds
    // of the SHA-256/512 core loop as documented by NIST, i.e. 16/20 "quad
    // rounds" implement the 64/80 single rounds.
4789     //
4790     //    // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
4791     //    // Output:
4792     //    //   vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4793     //    vl1reXX.v vTmp1, ofs
4794     //
4795     //    // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
4796     //    addi ofs, ofs, 16/32
4797     //
4798     //    // Add constants to message schedule words:
4799     //    //  Input
4800     //    //    vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4801     //    //    vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
4802     //    //  Output
4803     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4804     //    vadd.vv vTmp0, vTmp1, vW0
4805     //
4806     //    //  2 rounds of working variables updates.
4807     //    //     vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
4808     //    //  Input:
4809     //    //    vState1 = {c[t],d[t],g[t],h[t]}   " = vState1[t] "
4810     //    //    vState0 = {a[t],b[t],e[t],f[t]}
4811     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4812     //    //  Output:
4813     //    //    vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}  " = vState0[t+2] "
4814     //    //        = {h[t+4],g[t+4],d[t+4],c[t+4]}  " = vState1[t+4] "
4815     //    vsha2cl.vv vState1, vState0, vTmp0
4816     //
4817     //    //  2 rounds of working variables updates.
4818     //    //     vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
4819     //    //  Input
4820     //    //   vState0 = {a[t],b[t],e[t],f[t]}       " = vState0[t] "
4821     //    //       = {h[t+2],g[t+2],d[t+2],c[t+2]}   " = vState1[t+2] "
4822     //    //   vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}   " = vState0[t+2] "
4823     //    //   vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4824     //    //  Output:
4825     //    //   vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]}   " = vState0[t+4] "
4826     //    vsha2ch.vv vState0, vState1, vTmp0
4827     //
4828     //    // Combine 2QW into 1QW
4829     //    //
4830     //    // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
4831     //    //     vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
4832     //    // and it can only take 3 vectors as inputs. Hence we need to combine
4833     //    // vW1[0] and vW2[1..3] in a single vector.
4834     //    //
4835     //    // vmerge Vt4, Vt1, Vt2, V0
4836     //    // Input
4837     //    //  V0 = mask // first word from vW2, 1..3 words from vW1
4838     //    //  vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
4839     //    //  vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
4840     //    // Output
4841     //    //  Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5}
4842     //    vmerge.vvm vTmp0, vW2, vW1, v0
4843     //
4844     //    // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
4845     //    // Input
4846     //    //  vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]}     W[ 3: 0]
4847     //    //  vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]}     W[15:12]
4848     //    //  vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]}     W[11: 9,4]
4849     //    // Output (next four message schedule words)
4850     //    //  vW0 = {W[t+19],  W[t+18],  W[t+17],  W[t+16]}  W[19:16]
4851     //    vsha2ms.vv vW0, vTmp0, vW3
4852     //
4853     // BEFORE
4854     //  vW0 - vW3 hold the message schedule words (initially the block words)
4855     //    vW0 = W[ 3: 0]   "oldest"
4856     //    vW1 = W[ 7: 4]
4857     //    vW2 = W[11: 8]
4858     //    vW3 = W[15:12]   "newest"
4859     //
    //  vState0 and vState1 hold the working state variables
4861     //    vState0 = {a[t],b[t],e[t],f[t]}   // initially {H5,H4,H1,H0}
4862     //    vState1 = {c[t],d[t],g[t],h[t]}   // initially {H7,H6,H3,H2}
4863     //
4864     // AFTER
4865     //  vW0 - vW3 hold the message schedule words (initially the block words)
4866     //    vW1 = W[ 7: 4]   "oldest"
4867     //    vW2 = W[11: 8]
4868     //    vW3 = W[15:12]
4869     //    vW0 = W[19:16]   "newest"
4870     //
4871     //  vState0 and vState1 hold the working state variables
4872     //    vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
4873     //    vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
4874     //
4875     //  The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
4876     //  hence the uses of those vectors rotate in each round, and we get back to the
4877     //  initial configuration every 4 quad-rounds. We could avoid those changes at
4878     //  the cost of moving those vectors at the end of each quad-rounds.
4879     void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
4880                          Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
4881                          bool gen_words = true, bool step_const = true) {
4882       __ vleXX_v(vset_sew, vtemp, scalarconst);
4883       if (step_const) {
4884         __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
4885       }
4886       __ vadd_vv(vtemp2, vtemp, rot1);
4887       __ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
4888       __ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
4889       if (gen_words) {
4890         __ vmerge_vvm(vtemp2, rot3, rot2);
4891         __ vsha2ms_vv(rot1, vtemp2, rot4);
4892       }
4893     }
4894 
4895     // Arguments:
4896     //
4897     // Inputs:
4898     //   c_rarg0   - byte[]  source+offset
4899     //   c_rarg1   - int[]   SHA.state
4900     //   c_rarg2   - int     offset
4901     //   c_rarg3   - int     limit
4902     //
4903     address generate_sha2_implCompress(Assembler::SEW vset_sew, StubId stub_id) {
4904       alignas(64) static const uint32_t round_consts_256[64] = {
4905         0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
4906         0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
4907         0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
4908         0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
4909         0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
4910         0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
4911         0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
4912         0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
4913         0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
4914         0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
4915         0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
4916         0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
4917         0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
4918         0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
4919         0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
4920         0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
4921       };
4922       alignas(64) static const uint64_t round_consts_512[80] = {
4923         0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
4924         0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
4925         0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
4926         0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
4927         0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
4928         0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
4929         0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
4930         0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
4931         0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
4932         0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
4933         0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
4934         0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
4935         0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
4936         0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
4937         0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
4938         0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
4939         0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
4940         0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
4941         0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
4942         0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
4943         0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
4944         0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
4945         0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
4946         0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
4947         0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
4948         0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
4949         0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
4950       };
4951       const int const_add = vset_sew == Assembler::e32 ? 16 : 32;
4952 
4953       bool multi_block;
4954       switch (stub_id) {
4955       case StubId::stubgen_sha256_implCompress_id:
4956         assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
4957         multi_block = false;
4958         break;
4959       case StubId::stubgen_sha256_implCompressMB_id:
4960         assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
4961         multi_block = true;
4962         break;
4963       case StubId::stubgen_sha512_implCompress_id:
4964         assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
4965         multi_block = false;
4966         break;
4967       case StubId::stubgen_sha512_implCompressMB_id:
4968         assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
4969         multi_block = true;
4970         break;
4971       default:
4972         ShouldNotReachHere();
4973       };
4974       __ align(CodeEntryAlignment);
4975       StubCodeMark mark(_cgen, stub_id);
4976       address start = __ pc();
4977 
4978       Register buf   = c_rarg0;
4979       Register state = c_rarg1;
4980       Register ofs   = c_rarg2;
4981       Register limit = c_rarg3;
4982       Register consts =  t2; // caller saved
4983       Register state_c = x28; // caller saved
4984       VectorRegister vindex = v2;
4985       VectorRegister vW0 = v4;
4986       VectorRegister vW1 = v6;
4987       VectorRegister vW2 = v8;
4988       VectorRegister vW3 = v10;
4989       VectorRegister vState0 = v12;
4990       VectorRegister vState1 = v14;
4991       VectorRegister vHash0  = v16;
4992       VectorRegister vHash1  = v18;
4993       VectorRegister vTmp0   = v20;
4994       VectorRegister vTmp1   = v22;
4995 
4996       Label multi_block_loop;
4997 
4998       __ enter();
4999 
5000       address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
5001       la(consts, ExternalAddress(constant_table));
5002 
5003       // Register use in this function:
5004       //
5005       // VECTORS
      //  vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/64 bits), hold the message
5007       //             schedule words (Wt). They start with the message block
5008       //             content (W0 to W15), then further words in the message
5009       //             schedule generated via vsha2ms from previous Wt.
5010       //   Initially:
5011       //     vW0 = W[  3:0] = { W3,  W2,  W1,  W0}
5012       //     vW1 = W[  7:4] = { W7,  W6,  W5,  W4}
5013       //     vW2 = W[ 11:8] = {W11, W10,  W9,  W8}
5014       //     vW3 = W[15:12] = {W15, W14, W13, W12}
5015       //
5016       //  vState0 - vState1 hold the working state variables (a, b, ..., h)
5017       //    vState0 = {f[t],e[t],b[t],a[t]}
5018       //    vState1 = {h[t],g[t],d[t],c[t]}
5019       //   Initially:
5020       //    vState0 = {H5i-1, H4i-1, H1i-1 , H0i-1}
      //    vState1 = {H7i-1, H6i-1, H3i-1 , H2i-1}
5022       //
5023       //  v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
5024       //
5025       //  vTmp0 = temporary, Wt+Kt
5026       //  vTmp1 = temporary, Kt
5027       //
5028       //  vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
5029       //
5030       // During most of the function the vector state is configured so that each
5031       // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).
5032 
5033       // vsha2ch/vsha2cl uses EGW of 4*SEW.
5034       // SHA256 SEW = e32, EGW = 128-bits
5035       // SHA512 SEW = e64, EGW = 256-bits
5036       //
5037       // VLEN is required to be at least 128.
5038       // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
5039       //
5040       // m1: LMUL=1/2
5041       // ta: tail agnostic (don't care about those lanes)
5042       // ma: mask agnostic (don't care about those lanes)
      // x0 is not written, we know the number of vector elements.
5044 
5045       if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
5046         __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
5047       } else {
5048         __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
5049       }
5050 
5051       int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
5052       __ li(t0, indexes);
5053       __ vmv_v_x(vindex, t0);
5054 
5055       // Step-over a,b, so we are pointing to c.
5056       // const_add is equal to 4x state variable, div by 2 is thus 2, a,b
5057       __ addi(state_c, state, const_add/2);
5058 
5059       // Use index-load to get {f,e,b,a},{h,g,d,c}
5060       __ vluxei8_v(vState0, state, vindex);
5061       __ vluxei8_v(vState1, state_c, vindex);
5062 
5063       __ bind(multi_block_loop);
5064 
5065       // Capture the initial H values in vHash0 and vHash1 to allow for computing
5066       // the resulting H', since H' = H+{a',b',c',...,h'}.
5067       __ vmv_v_v(vHash0, vState0);
5068       __ vmv_v_v(vHash1, vState1);
5069 
5070       // Load the 512/1024-bits of the message block in vW0-vW3 and perform
5071       // an endian swap on each 4/8 bytes element.
5072       //
5073       // If Zvkb is not implemented one can use vrgather
5074       // with an index sequence to byte-swap.
5075       //  sequence = [3 2 1 0   7 6 5 4  11 10 9 8   15 14 13 12]
5076       //   <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
5077       //  this sequence. 'vid' gives us the N.
5078       __ vleXX_v(vset_sew, vW0, buf);
5079       __ vrev8_v(vW0, vW0);
5080       __ addi(buf, buf, const_add);
5081       __ vleXX_v(vset_sew, vW1, buf);
5082       __ vrev8_v(vW1, vW1);
5083       __ addi(buf, buf, const_add);
5084       __ vleXX_v(vset_sew, vW2, buf);
5085       __ vrev8_v(vW2, vW2);
5086       __ addi(buf, buf, const_add);
5087       __ vleXX_v(vset_sew, vW3, buf);
5088       __ vrev8_v(vW3, vW3);
5089       __ addi(buf, buf, const_add);
5090 
5091       // Set v0 up for the vmerge that replaces the first word (idx==0)
5092       __ vid_v(v0);
5093       __ vmseq_vi(v0, v0, 0x0);  // v0.mask[i] = (i == 0 ? 1 : 0)
5094 
5095       VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
5096       int rot_pos = 0;
5097       // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2)
5098       const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
5099       for (int i = 0; i < qr_end; i++) {
5100         sha2_quad_round(vset_sew,
5101                    rotation_regs[(rot_pos + 0) & 0x3],
5102                    rotation_regs[(rot_pos + 1) & 0x3],
5103                    rotation_regs[(rot_pos + 2) & 0x3],
5104                    rotation_regs[(rot_pos + 3) & 0x3],
5105                    consts,
5106                    vTmp1, vTmp0, vState0, vState1);
5107         ++rot_pos;
5108       }
5109       // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2)
5110       // Note that we stop generating new message schedule words (Wt, vW0-13)
5111       // as we already generated all the words we end up consuming (i.e., W[63:60]).
5112       const int qr_c_end = qr_end + 4;
5113       for (int i = qr_end; i < qr_c_end; i++) {
5114         sha2_quad_round(vset_sew,
5115                    rotation_regs[(rot_pos + 0) & 0x3],
5116                    rotation_regs[(rot_pos + 1) & 0x3],
5117                    rotation_regs[(rot_pos + 2) & 0x3],
5118                    rotation_regs[(rot_pos + 3) & 0x3],
5119                    consts,
5120                    vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
5121         ++rot_pos;
5122       }
5123 
5124       //--------------------------------------------------------------------------------
5125       // Compute the updated hash value H'
5126       //   H' = H + {h',g',...,b',a'}
5127       //      = {h,g,...,b,a} + {h',g',...,b',a'}
5128       //      = {h+h',g+g',...,b+b',a+a'}
5129 
5130       // H' = H+{a',b',c',...,h'}
5131       __ vadd_vv(vState0, vHash0, vState0);
5132       __ vadd_vv(vState1, vHash1, vState1);
5133 
5134       if (multi_block) {
5135         int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
5136         __ subi(consts, consts, total_adds);
5137         __ addi(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
5138         __ ble(ofs, limit, multi_block_loop);
5139         __ mv(c_rarg0, ofs); // return ofs
5140       }
5141 
      // Store H[0..7] = {a,b,c,d,e,f,g,h} from
5143       //  vState0 = {f,e,b,a}
5144       //  vState1 = {h,g,d,c}
5145       __ vsuxei8_v(vState0, state,   vindex);
5146       __ vsuxei8_v(vState1, state_c, vindex);
5147 
5148       __ leave();
5149       __ ret();
5150 
5151       return start;
5152     }
5153   };
5154 
5155 #undef __
5156 #define __ _masm->
5157 
5158   // Set of L registers that correspond to a contiguous memory area.
5159   // Each 64-bit register typically corresponds to 2 32-bit integers.
5160   template <uint L>
5161   class RegCache {
5162   private:
5163     MacroAssembler *_masm;
5164     Register _regs[L];
5165 
5166   public:
5167     RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) {
5168       assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L);
5169       auto it = rs.begin();
5170       for (auto &r: _regs) {
5171         r = *it;
5172         ++it;
5173       }
5174     }
5175 
5176     // generate load for the i'th register
5177     void gen_load(uint i, Register base) {
5178       assert(i < L, "invalid i: %u", i);
5179       __ ld(_regs[i], Address(base, 8 * i));
5180     }
5181 
5182     // add i'th 32-bit integer to dest
5183     void add_u32(const Register dest, uint i, const Register rtmp = t0) {
5184       assert(i < 2 * L, "invalid i: %u", i);
5185 
5186       if (is_even(i)) {
5187         // Use the bottom 32 bits. No need to mask off the top 32 bits
5188         // as addw will do the right thing.
5189         __ addw(dest, dest, _regs[i / 2]);
5190       } else {
5191         // Use the top 32 bits by right-shifting them.
5192         __ srli(rtmp, _regs[i / 2], 32);
5193         __ addw(dest, dest, rtmp);
5194       }
5195     }
5196   };
5197 
5198   typedef RegCache<8> BufRegCache;
5199 
  // Common tail shared by the four MD5 step emitters (md5_FF, md5_GG,
  // md5_HH and md5_II). Emits code for:
  //
  //   a += value + x + ac;
  //   a = Integer.rotateLeft(a, s) + b;
  //
  // Parameters:
  //   reg_cache - register cache holding the message words; x is its k'th
  //               32-bit entry
  //   a         - accumulator, updated in place
  //   b         - added to the rotated accumulator; only read
  //   c, d      - accepted for signature symmetry; not used by this helper
  //   k         - index of the 32-bit message word x
  //   s         - left-rotation amount
  //   t         - the additive constant ac
  //   value     - register holding the round-function result computed by
  //               the caller
  //
  // t1 is passed to the immediate form of addw (presumably as its scratch
  // register), and add_u32 is called with its default temporary, t0.
  void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache,
                               Register a, Register b, Register c, Register d,
                               int k, int s, int t,
                               Register value) {
    // a += ac
    __ addw(a, a, t, t1);

    // a += x;
    reg_cache.add_u32(a, k);
    // a += value;
    __ addw(a, a, value);

    // a = Integer.rotateLeft(a, s) + b;
    __ rolw(a, a, s);
    __ addw(a, a, b);
  }
5218 
  // Emits one MD5 round-1 step:
  //
  //   a += ((b & c) | ((~b) & d)) + x + ac;
  //   a = Integer.rotateLeft(a, s) + b;
  //
  // x is the k'th 32-bit word held in reg_cache, s the rotation amount and
  // t the additive constant ac. rtmp1 and rtmp2 are overwritten; the
  // round-function result is left in rtmp1 and handed to the shared epilogue.
  void md5_FF(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = b & c
    __ andr(rtmp1, b, c);

    // rtmp2 = (~b) & d
    __ andn(rtmp2, d, b);

    // rtmp1 = (b & c) | ((~b) & d)
    __ orr(rtmp1, rtmp1, rtmp2);

    // a = rotl(a + rtmp1 + x + ac, s) + b
    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }
5236 
  // Emits one MD5 round-2 step:
  //
  //   a += ((b & d) | (c & (~d))) + x + ac;
  //   a = Integer.rotateLeft(a, s) + b;
  //
  // x is the k'th 32-bit word held in reg_cache, s the rotation amount and
  // t the additive constant ac. rtmp1 and rtmp2 are overwritten; the
  // round-function result is left in rtmp1 and handed to the shared epilogue.
  void md5_GG(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = b & d
    __ andr(rtmp1, b, d);

    // rtmp2 = c & (~d)
    __ andn(rtmp2, c, d);

    // rtmp1 = (b & d) | (c & (~d))
    __ orr(rtmp1, rtmp1, rtmp2);

    // a = rotl(a + rtmp1 + x + ac, s) + b
    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }
5254 
  // Emits one MD5 round-3 step:
  //
  //   a += ((b ^ c) ^ d) + x + ac;
  //   a = Integer.rotateLeft(a, s) + b;
  //
  // x is the k'th 32-bit word held in reg_cache, s the rotation amount and
  // t the additive constant ac. rtmp1 and rtmp2 are overwritten; the
  // round-function result is left in rtmp1 and handed to the shared epilogue.
  void md5_HH(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = (b ^ c) ^ d
    // (rtmp2 holds the intermediate b ^ c)
    __ xorr(rtmp2, b, c);
    __ xorr(rtmp1, rtmp2, d);

    // a = rotl(a + rtmp1 + x + ac, s) + b
    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }
5267 
  // Emits one MD5 round-4 step:
  //
  //   a += (c ^ (b | (~d))) + x + ac;
  //   a = Integer.rotateLeft(a, s) + b;
  //
  // x is the k'th 32-bit word held in reg_cache, s the rotation amount and
  // t the additive constant ac. rtmp1 and rtmp2 are overwritten; the
  // round-function result is left in rtmp1 and handed to the shared epilogue.
  void md5_II(BufRegCache& reg_cache,
              Register a, Register b, Register c, Register d,
              int k, int s, int t,
              Register rtmp1, Register rtmp2) {
    // rtmp1 = c ^ (b | (~d))
    // (rtmp2 holds the intermediate b | (~d))
    __ orn(rtmp2, b, d);
    __ xorr(rtmp1, c, rtmp2);

    // a = rotl(a + rtmp1 + x + ac, s) + b
    m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
  }
5280 
5281   // Arguments:
5282   //
5283   // Inputs:
5284   //   c_rarg0   - byte[]  source+offset
5285   //   c_rarg1   - int[]   SHA.state
5286   //   c_rarg2   - int     offset  (multi_block == True)
5287   //   c_rarg3   - int     limit   (multi_block == True)
5288   //
5289   // Registers:
5290   //    x0   zero  (zero)
5291   //    x1     ra  (return address)
5292   //    x2     sp  (stack pointer)
5293   //    x3     gp  (global pointer)
5294   //    x4     tp  (thread pointer)
5295   //    x5     t0  (tmp register)
5296   //    x6     t1  (tmp register)
5297   //    x7     t2  state0
5298   //    x8  f0/s0  (frame pointer)
5299   //    x9     s1
5300   //   x10     a0  rtmp1 / c_rarg0
5301   //   x11     a1  rtmp2 / c_rarg1
5302   //   x12     a2  a     / c_rarg2
5303   //   x13     a3  b     / c_rarg3
5304   //   x14     a4  c
5305   //   x15     a5  d
5306   //   x16     a6  buf
5307   //   x17     a7  state
5308   //   x18     s2  ofs     [saved-reg]  (multi_block == True)
5309   //   x19     s3  limit   [saved-reg]  (multi_block == True)
5310   //   x20     s4  state1  [saved-reg]
5311   //   x21     s5  state2  [saved-reg]
5312   //   x22     s6  state3  [saved-reg]
5313   //   x23     s7
5314   //   x24     s8  buf0    [saved-reg]
5315   //   x25     s9  buf1    [saved-reg]
5316   //   x26    s10  buf2    [saved-reg]
5317   //   x27    s11  buf3    [saved-reg]
5318   //   x28     t3  buf4
5319   //   x29     t4  buf5
5320   //   x30     t5  buf6
5321   //   x31     t6  buf7
5322   address generate_md5_implCompress(StubId stub_id) {
5323     __ align(CodeEntryAlignment);
5324     bool multi_block;
5325     switch (stub_id) {
5326     case StubId::stubgen_md5_implCompress_id:
5327       multi_block = false;
5328       break;
5329     case StubId::stubgen_md5_implCompressMB_id:
5330       multi_block = true;
5331       break;
5332     default:
5333       ShouldNotReachHere();
5334     };
5335     StubCodeMark mark(this, stub_id);
5336     address start = __ pc();
5337 
5338     // rotation constants
5339     const int S11 = 7;
5340     const int S12 = 12;
5341     const int S13 = 17;
5342     const int S14 = 22;
5343     const int S21 = 5;
5344     const int S22 = 9;
5345     const int S23 = 14;
5346     const int S24 = 20;
5347     const int S31 = 4;
5348     const int S32 = 11;
5349     const int S33 = 16;
5350     const int S34 = 23;
5351     const int S41 = 6;
5352     const int S42 = 10;
5353     const int S43 = 15;
5354     const int S44 = 21;
5355 
5356     const int64_t mask32 = 0xffffffff;
5357 
5358     Register buf_arg   = c_rarg0; // a0
5359     Register state_arg = c_rarg1; // a1
5360     Register ofs_arg   = c_rarg2; // a2
5361     Register limit_arg = c_rarg3; // a3
5362 
5363     // we'll copy the args to these registers to free up a0-a3
5364     // to use for other values manipulated by instructions
5365     // that can be compressed
5366     Register buf       = x16; // a6
5367     Register state     = x17; // a7
5368     Register ofs       = x18; // s2
5369     Register limit     = x19; // s3
5370 
5371     // using x12->15 to allow compressed instructions
5372     Register a         = x12; // a2
5373     Register b         = x13; // a3
5374     Register c         = x14; // a4
5375     Register d         = x15; // a5
5376 
5377     Register state0    =  x7; // t2
5378     Register state1    = x20; // s4
5379     Register state2    = x21; // s5
5380     Register state3    = x22; // s6
5381 
5382     // using x10->x11 to allow compressed instructions
5383     Register rtmp1     = x10; // a0
5384     Register rtmp2     = x11; // a1
5385 
5386     RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11
5387     RegSet reg_cache_regs;
5388     reg_cache_regs += reg_cache_saved_regs;
5389     reg_cache_regs += RegSet::of(t3, t4, t5, t6);
5390     BufRegCache reg_cache(_masm, reg_cache_regs);
5391 
5392     RegSet saved_regs;
5393     if (multi_block) {
5394       saved_regs += RegSet::of(ofs, limit);
5395     }
5396     saved_regs += RegSet::of(state1, state2, state3);
5397     saved_regs += reg_cache_saved_regs;
5398 
5399     __ push_reg(saved_regs, sp);
5400 
5401     __ mv(buf, buf_arg);
5402     __ mv(state, state_arg);
5403     if (multi_block) {
5404       __ mv(ofs, ofs_arg);
5405       __ mv(limit, limit_arg);
5406     }
5407 
5408     // to minimize the number of memory operations:
5409     // read the 4 state 4-byte values in pairs, with a single ld,
5410     // and split them into 2 registers.
5411     //
5412     // And, as the core algorithm of md5 works on 32-bits words, so
5413     // in the following code, it does not care about the content of
5414     // higher 32-bits in state[x]. Based on this observation,
5415     // we can apply further optimization, which is to just ignore the
5416     // higher 32-bits in state0/state2, rather than set the higher
5417     // 32-bits of state0/state2 to zero explicitly with extra instructions.
5418     __ ld(state0, Address(state));
5419     __ srli(state1, state0, 32);
5420     __ ld(state2, Address(state, 8));
5421     __ srli(state3, state2, 32);
5422 
5423     Label md5_loop;
5424     __ BIND(md5_loop);
5425 
5426     __ mv(a, state0);
5427     __ mv(b, state1);
5428     __ mv(c, state2);
5429     __ mv(d, state3);
5430 
5431     // Round 1
5432     reg_cache.gen_load(0, buf);
5433     md5_FF(reg_cache, a, b, c, d,  0, S11, 0xd76aa478, rtmp1, rtmp2);
5434     md5_FF(reg_cache, d, a, b, c,  1, S12, 0xe8c7b756, rtmp1, rtmp2);
5435     reg_cache.gen_load(1, buf);
5436     md5_FF(reg_cache, c, d, a, b,  2, S13, 0x242070db, rtmp1, rtmp2);
5437     md5_FF(reg_cache, b, c, d, a,  3, S14, 0xc1bdceee, rtmp1, rtmp2);
5438     reg_cache.gen_load(2, buf);
5439     md5_FF(reg_cache, a, b, c, d,  4, S11, 0xf57c0faf, rtmp1, rtmp2);
5440     md5_FF(reg_cache, d, a, b, c,  5, S12, 0x4787c62a, rtmp1, rtmp2);
5441     reg_cache.gen_load(3, buf);
5442     md5_FF(reg_cache, c, d, a, b,  6, S13, 0xa8304613, rtmp1, rtmp2);
5443     md5_FF(reg_cache, b, c, d, a,  7, S14, 0xfd469501, rtmp1, rtmp2);
5444     reg_cache.gen_load(4, buf);
5445     md5_FF(reg_cache, a, b, c, d,  8, S11, 0x698098d8, rtmp1, rtmp2);
5446     md5_FF(reg_cache, d, a, b, c,  9, S12, 0x8b44f7af, rtmp1, rtmp2);
5447     reg_cache.gen_load(5, buf);
5448     md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2);
5449     md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2);
5450     reg_cache.gen_load(6, buf);
5451     md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2);
5452     md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2);
5453     reg_cache.gen_load(7, buf);
5454     md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2);
5455     md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2);
5456 
5457     // Round 2
5458     md5_GG(reg_cache, a, b, c, d,  1, S21, 0xf61e2562, rtmp1, rtmp2);
5459     md5_GG(reg_cache, d, a, b, c,  6, S22, 0xc040b340, rtmp1, rtmp2);
5460     md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2);
5461     md5_GG(reg_cache, b, c, d, a,  0, S24, 0xe9b6c7aa, rtmp1, rtmp2);
5462     md5_GG(reg_cache, a, b, c, d,  5, S21, 0xd62f105d, rtmp1, rtmp2);
5463     md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2);
5464     md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2);
5465     md5_GG(reg_cache, b, c, d, a,  4, S24, 0xe7d3fbc8, rtmp1, rtmp2);
5466     md5_GG(reg_cache, a, b, c, d,  9, S21, 0x21e1cde6, rtmp1, rtmp2);
5467     md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2);
5468     md5_GG(reg_cache, c, d, a, b,  3, S23, 0xf4d50d87, rtmp1, rtmp2);
5469     md5_GG(reg_cache, b, c, d, a,  8, S24, 0x455a14ed, rtmp1, rtmp2);
5470     md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2);
5471     md5_GG(reg_cache, d, a, b, c,  2, S22, 0xfcefa3f8, rtmp1, rtmp2);
5472     md5_GG(reg_cache, c, d, a, b,  7, S23, 0x676f02d9, rtmp1, rtmp2);
5473     md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2);
5474 
5475     // Round 3
5476     md5_HH(reg_cache, a, b, c, d,  5, S31, 0xfffa3942, rtmp1, rtmp2);
5477     md5_HH(reg_cache, d, a, b, c,  8, S32, 0x8771f681, rtmp1, rtmp2);
5478     md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2);
5479     md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2);
5480     md5_HH(reg_cache, a, b, c, d,  1, S31, 0xa4beea44, rtmp1, rtmp2);
5481     md5_HH(reg_cache, d, a, b, c,  4, S32, 0x4bdecfa9, rtmp1, rtmp2);
5482     md5_HH(reg_cache, c, d, a, b,  7, S33, 0xf6bb4b60, rtmp1, rtmp2);
5483     md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2);
5484     md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2);
5485     md5_HH(reg_cache, d, a, b, c,  0, S32, 0xeaa127fa, rtmp1, rtmp2);
5486     md5_HH(reg_cache, c, d, a, b,  3, S33, 0xd4ef3085, rtmp1, rtmp2);
5487     md5_HH(reg_cache, b, c, d, a,  6, S34, 0x04881d05, rtmp1, rtmp2);
5488     md5_HH(reg_cache, a, b, c, d,  9, S31, 0xd9d4d039, rtmp1, rtmp2);
5489     md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
5490     md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
5491     md5_HH(reg_cache, b, c, d, a,  2, S34, 0xc4ac5665, rtmp1, rtmp2);
5492 
5493     // Round 4
5494     md5_II(reg_cache, a, b, c, d,  0, S41, 0xf4292244, rtmp1, rtmp2);
5495     md5_II(reg_cache, d, a, b, c,  7, S42, 0x432aff97, rtmp1, rtmp2);
5496     md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
5497     md5_II(reg_cache, b, c, d, a,  5, S44, 0xfc93a039, rtmp1, rtmp2);
5498     md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
5499     md5_II(reg_cache, d, a, b, c,  3, S42, 0x8f0ccc92, rtmp1, rtmp2);
5500     md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
5501     md5_II(reg_cache, b, c, d, a,  1, S44, 0x85845dd1, rtmp1, rtmp2);
5502     md5_II(reg_cache, a, b, c, d,  8, S41, 0x6fa87e4f, rtmp1, rtmp2);
5503     md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
5504     md5_II(reg_cache, c, d, a, b,  6, S43, 0xa3014314, rtmp1, rtmp2);
5505     md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
5506     md5_II(reg_cache, a, b, c, d,  4, S41, 0xf7537e82, rtmp1, rtmp2);
5507     md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
5508     md5_II(reg_cache, c, d, a, b,  2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
5509     md5_II(reg_cache, b, c, d, a,  9, S44, 0xeb86d391, rtmp1, rtmp2);
5510 
5511     __ addw(state0, state0, a);
5512     __ addw(state1, state1, b);
5513     __ addw(state2, state2, c);
5514     __ addw(state3, state3, d);
5515 
5516     if (multi_block) {
5517       __ addi(buf, buf, 64);
5518       __ addi(ofs, ofs, 64);
5519       // if (ofs <= limit) goto m5_loop
5520       __ bge(limit, ofs, md5_loop);
5521       __ mv(c_rarg0, ofs); // return ofs
5522     }
5523 
5524     // to minimize the number of memory operations:
5525     // write back the 4 state 4-byte values in pairs, with a single sd
5526     __ mv(t0, mask32);
5527     __ andr(state0, state0, t0);
5528     __ slli(state1, state1, 32);
5529     __ orr(state0, state0, state1);
5530     __ sd(state0, Address(state));
5531     __ andr(state2, state2, t0);
5532     __ slli(state3, state3, 32);
5533     __ orr(state2, state2, state3);
5534     __ sd(state2, Address(state, 8));
5535 
5536     __ pop_reg(saved_regs, sp);
5537     __ ret();
5538 
5539     return (address) start;
5540   }
5541 
5542   /**
5543    * Perform the quarter round calculations on values contained within four vector registers.
5544    *
5545    * @param aVec the SIMD register containing only the "a" values
5546    * @param bVec the SIMD register containing only the "b" values
5547    * @param cVec the SIMD register containing only the "c" values
5548    * @param dVec the SIMD register containing only the "d" values
5549    * @param tmp_vr temporary vector register holds intermedia values.
5550    */
5551   void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
5552                           VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
5553     // a += b, d ^= a, d <<<= 16
5554     __ vadd_vv(aVec, aVec, bVec);
5555     __ vxor_vv(dVec, dVec, aVec);
5556     __ vrole32_vi(dVec, 16, tmp_vr);
5557 
5558     // c += d, b ^= c, b <<<= 12
5559     __ vadd_vv(cVec, cVec, dVec);
5560     __ vxor_vv(bVec, bVec, cVec);
5561     __ vrole32_vi(bVec, 12, tmp_vr);
5562 
5563     // a += b, d ^= a, d <<<= 8
5564     __ vadd_vv(aVec, aVec, bVec);
5565     __ vxor_vv(dVec, dVec, aVec);
5566     __ vrole32_vi(dVec, 8, tmp_vr);
5567 
5568     // c += d, b ^= c, b <<<= 7
5569     __ vadd_vv(cVec, cVec, dVec);
5570     __ vxor_vv(bVec, bVec, cVec);
5571     __ vrole32_vi(bVec, 7, tmp_vr);
5572   }
5573 
5574   /**
5575    * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
5576    *
5577    *  Input arguments:
5578    *  c_rarg0   - state, the starting state
5579    *  c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function
5580    *
5581    *  Implementation Note:
5582    *   Parallelization is achieved by loading individual state elements into vectors for N blocks.
5583    *   N depends on single vector register length.
5584    */
5585   address generate_chacha20Block() {
5586     Label L_Rounds;
5587 
5588     __ align(CodeEntryAlignment);
5589     StubId stub_id = StubId::stubgen_chacha20Block_id;
5590     StubCodeMark mark(this, stub_id);
5591     address start = __ pc();
5592     __ enter();
5593 
5594     const int states_len = 16;
5595     const int step = 4;
5596     const Register state = c_rarg0;
5597     const Register key_stream = c_rarg1;
5598     const Register tmp_addr = t0;
5599     const Register length = t1;
5600 
5601     // Organize vector registers in an array that facilitates
5602     // putting repetitive opcodes into loop structures below.
5603     const VectorRegister work_vrs[16] = {
5604       v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
5605       v8, v9, v10, v11, v12, v13, v14, v15
5606     };
5607     const VectorRegister tmp_vr = v16;
5608     const VectorRegister counter_vr = v17;
5609 
5610     {
5611       // Put 16 here, as com.sun.crypto.providerChaCha20Cipher.KS_MAX_LEN is 1024
5612       // in java level.
5613       __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
5614     }
5615 
5616     // Load from source state.
5617     // Every element in source state is duplicated to all elements in the corresponding vector.
5618     __ mv(tmp_addr, state);
5619     for (int i = 0; i < states_len; i += 1) {
5620       __ vlse32_v(work_vrs[i], tmp_addr, zr);
5621       __ addi(tmp_addr, tmp_addr, step);
5622     }
5623     // Adjust counter for every individual block.
5624     __ vid_v(counter_vr);
5625     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
5626 
5627     // Perform 10 iterations of the 8 quarter round set
5628     {
5629       const Register loop = t2; // share t2 with other non-overlapping usages.
5630       __ mv(loop, 10);
5631       __ BIND(L_Rounds);
5632 
5633       chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8],  work_vrs[12], tmp_vr);
5634       chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9],  work_vrs[13], tmp_vr);
5635       chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
5636       chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
5637 
5638       chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
5639       chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
5640       chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8],  work_vrs[13], tmp_vr);
5641       chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9],  work_vrs[14], tmp_vr);
5642 
5643       __ subi(loop, loop, 1);
5644       __ bnez(loop, L_Rounds);
5645     }
5646 
5647     // Add the original state into the end working state.
5648     // We do this by first duplicating every element in source state array to the corresponding
5649     // vector, then adding it to the post-loop working state.
5650     __ mv(tmp_addr, state);
5651     for (int i = 0; i < states_len; i += 1) {
5652       __ vlse32_v(tmp_vr, tmp_addr, zr);
5653       __ addi(tmp_addr, tmp_addr, step);
5654       __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
5655     }
5656     // Add the counter overlay onto work_vrs[12] at the end.
5657     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
5658 
5659     // Store result to key stream.
5660     {
5661       const Register stride = t2; // share t2 with other non-overlapping usages.
5662       // Every block occupies 64 bytes, so we use 64 as stride of the vector store.
5663       __ mv(stride, 64);
5664       for (int i = 0; i < states_len; i += 1) {
5665         __ vsse32_v(work_vrs[i], key_stream, stride);
5666         __ addi(key_stream, key_stream, step);
5667       }
5668     }
5669 
5670     // Return length of output key_stream
5671     __ slli(c_rarg0, length, 6);
5672 
5673     __ leave();
5674     __ ret();
5675 
5676     return (address) start;
5677   }
5678 
5679 
5680   // ------------------------ SHA-1 intrinsic ------------------------
5681 
5682   // K't =
5683   //    5a827999, 0  <= t <= 19
5684   //    6ed9eba1, 20 <= t <= 39
5685   //    8f1bbcdc, 40 <= t <= 59
5686   //    ca62c1d6, 60 <= t <= 79
5687   void sha1_prepare_k(Register cur_k, int round) {
5688     assert(round >= 0 && round < 80, "must be");
5689 
5690     static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
5691     if ((round % 20) == 0) {
5692       __ mv(cur_k, ks[round/20]);
5693     }
5694   }
5695 
5696   // W't =
5697   //    M't,                                      0 <=  t <= 15
5698   //    ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
5699   void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) {
5700     assert(round >= 0 && round < 80, "must be");
5701 
5702     if (round < 16) {
5703       // in the first 16 rounds, in ws[], every register contains 2 W't, e.g.
5704       //   in ws[0], high part contains W't-0, low part contains W't-1,
5705       //   in ws[1], high part contains W't-2, low part contains W't-3,
5706       //   ...
5707       //   in ws[7], high part contains W't-14, low part contains W't-15.
5708 
5709       if ((round % 2) == 0) {
5710         __ ld(ws[round/2], Address(buf, (round/2) * 8));
5711         // reverse bytes, as SHA-1 is defined in big-endian.
5712         __ revb(ws[round/2], ws[round/2]);
5713         __ srli(cur_w, ws[round/2], 32);
5714       } else {
5715         __ mv(cur_w, ws[round/2]);
5716       }
5717 
5718       return;
5719     }
5720 
5721     if ((round % 2) == 0) {
5722       int idx = 16;
5723       // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
5724       __ srli(t1, ws[(idx-8)/2], 32);
5725       __ xorr(t0, ws[(idx-3)/2], t1);
5726 
5727       __ srli(t1, ws[(idx-14)/2], 32);
5728       __ srli(cur_w, ws[(idx-16)/2], 32);
5729       __ xorr(cur_w, cur_w, t1);
5730 
5731       __ xorr(cur_w, cur_w, t0);
5732       __ rolw(cur_w, cur_w, 1, t0);
5733 
5734       // copy the cur_w value to ws[8].
5735       // now, valid w't values are at:
5736       //  w0:       ws[0]'s lower 32 bits
5737       //  w1 ~ w14: ws[1] ~ ws[7]
5738       //  w15:      ws[8]'s higher 32 bits
5739       __ slli(ws[idx/2], cur_w, 32);
5740 
5741       return;
5742     }
5743 
5744     int idx = 17;
5745     // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
5746     __ srli(t1, ws[(idx-3)/2], 32);
5747     __ xorr(t0, t1, ws[(idx-8)/2]);
5748 
5749     __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]);
5750 
5751     __ xorr(cur_w, cur_w, t0);
5752     __ rolw(cur_w, cur_w, 1, t0);
5753 
5754     // copy the cur_w value to ws[8]
5755     __ zext(cur_w, cur_w, 32);
5756     __ orr(ws[idx/2], ws[idx/2], cur_w);
5757 
5758     // shift the w't registers, so they start from ws[0] again.
5759     // now, valid w't values are at:
5760     //  w0 ~ w15: ws[0] ~ ws[7]
5761     Register ws_0 = ws[0];
5762     for (int i = 0; i < 16/2; i++) {
5763       ws[i] = ws[i+1];
5764     }
5765     ws[8] = ws_0;
5766   }
5767 
5768   // f't(x, y, z) =
5769   //    Ch(x, y, z)     = (x & y) ^ (~x & z)            , 0  <= t <= 19
5770   //    Parity(x, y, z) = x ^ y ^ z                     , 20 <= t <= 39
5771   //    Maj(x, y, z)    = (x & y) ^ (x & z) ^ (y & z)   , 40 <= t <= 59
5772   //    Parity(x, y, z) = x ^ y ^ z                     , 60 <= t <= 79
5773   void sha1_f(Register dst, Register x, Register y, Register z, int round) {
5774     assert(round >= 0 && round < 80, "must be");
5775     assert_different_registers(dst, x, y, z, t0, t1);
5776 
5777     if (round < 20) {
5778       // (x & y) ^ (~x & z)
5779       __ andr(t0, x, y);
5780       __ andn(dst, z, x);
5781       __ xorr(dst, dst, t0);
5782     } else if (round >= 40 && round < 60) {
5783       // (x & y) ^ (x & z) ^ (y & z)
5784       __ andr(t0, x, y);
5785       __ andr(t1, x, z);
5786       __ andr(dst, y, z);
5787       __ xorr(dst, dst, t0);
5788       __ xorr(dst, dst, t1);
5789     } else {
5790       // x ^ y ^ z
5791       __ xorr(dst, x, y);
5792       __ xorr(dst, dst, z);
5793     }
5794   }
5795 
5796   // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5797   // e = d
5798   // d = c
5799   // c = ROTL'30(b)
5800   // b = a
5801   // a = T
5802   void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
5803                           Register cur_k, Register cur_w, Register tmp, int round) {
5804     assert(round >= 0 && round < 80, "must be");
5805     assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0);
5806 
5807     // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5808 
5809     // cur_w will be recalculated at the beginning of each round,
5810     // so, we can reuse it as a temp register here.
5811     Register tmp2 = cur_w;
5812 
5813     // reuse e as a temporary register, as we will mv new value into it later
5814     Register tmp3 = e;
5815     __ add(tmp2, cur_k, tmp2);
5816     __ add(tmp3, tmp3, tmp2);
5817     __ rolw(tmp2, a, 5, t0);
5818 
5819     sha1_f(tmp, b, c, d, round);
5820 
5821     __ add(tmp2, tmp2, tmp);
5822     __ add(tmp2, tmp2, tmp3);
5823 
5824     // e = d
5825     // d = c
5826     // c = ROTL'30(b)
5827     // b = a
5828     // a = T
5829     __ mv(e, d);
5830     __ mv(d, c);
5831 
5832     __ rolw(c, b, 30);
5833     __ mv(b, a);
5834     __ mv(a, tmp2);
5835   }
5836 
5837   // H(i)0 = a + H(i-1)0
5838   // H(i)1 = b + H(i-1)1
5839   // H(i)2 = c + H(i-1)2
5840   // H(i)3 = d + H(i-1)3
5841   // H(i)4 = e + H(i-1)4
5842   void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
5843                               Register prev_ab, Register prev_cd, Register prev_e) {
5844     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5845 
5846     __ add(a, a, prev_ab);
5847     __ srli(prev_ab, prev_ab, 32);
5848     __ add(b, b, prev_ab);
5849 
5850     __ add(c, c, prev_cd);
5851     __ srli(prev_cd, prev_cd, 32);
5852     __ add(d, d, prev_cd);
5853 
5854     __ add(e, e, prev_e);
5855   }
5856 
5857   void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
5858                                 Register prev_ab, Register prev_cd, Register prev_e) {
5859     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);
5860 
5861     __ slli(t0, b, 32);
5862     __ zext(prev_ab, a, 32);
5863     __ orr(prev_ab, prev_ab, t0);
5864 
5865     __ slli(t0, d, 32);
5866     __ zext(prev_cd, c, 32);
5867     __ orr(prev_cd, prev_cd, t0);
5868 
5869     __ mv(prev_e, e);
5870   }
5871 
5872   // Intrinsic for:
5873   //   void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
5874   //   void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
5875   //
5876   // Arguments:
5877   //
5878   // Inputs:
5879   //   c_rarg0: byte[]  src array + offset
5880   //   c_rarg1: int[]   SHA.state
5881   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5882   //   c_rarg2: int     offset
5883   //   c_rarg3: int     limit
5884   //
5885   // Outputs:
5886   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5887   //   c_rarg0: int offset, when (multi_block == true)
5888   //
5889   address generate_sha1_implCompress(StubId stub_id) {
5890       bool multi_block;
5891       switch (stub_id) {
5892       case StubId::stubgen_sha1_implCompress_id:
5893         multi_block = false;
5894         break;
5895       case StubId::stubgen_sha1_implCompressMB_id:
5896         multi_block = true;
5897         break;
5898       default:
5899         ShouldNotReachHere();
5900       };
5901     __ align(CodeEntryAlignment);
5902     StubCodeMark mark(this, stub_id);
5903 
5904     address start = __ pc();
5905     __ enter();
5906 
5907     RegSet saved_regs = RegSet::range(x18, x27);
5908     if (multi_block) {
5909       // use x9 as src below.
5910       saved_regs += RegSet::of(x9);
5911     }
5912     __ push_reg(saved_regs, sp);
5913 
5914     // c_rarg0 - c_rarg3: x10 - x13
5915     Register buf    = c_rarg0;
5916     Register state  = c_rarg1;
5917     Register offset = c_rarg2;
5918     Register limit  = c_rarg3;
5919     // use src to contain the original start point of the array.
5920     Register src    = x9;
5921 
5922     if (multi_block) {
5923       __ sub(limit, limit, offset);
5924       __ add(limit, limit, buf);
5925       __ sub(src, buf, offset);
5926     }
5927 
5928     // [args-reg]:  x14 - x17
5929     // [temp-reg]:  x28 - x31
5930     // [saved-reg]: x18 - x27
5931 
5932     // h0/1/2/3/4
5933     const Register a = x14, b = x15, c = x16, d = x17, e = x28;
5934     // w0, w1, ... w15
5935     // put two adjecent w's in one register:
5936     //    one at high word part, another at low word part
5937     // at different round (even or odd), w't value reside in different items in ws[].
5938     // w0 ~ w15, either reside in
5939     //    ws[0] ~ ws[7], where
5940     //      w0 at higher 32 bits of ws[0],
5941     //      w1 at lower 32 bits of ws[0],
5942     //      ...
5943     //      w14 at higher 32 bits of ws[7],
5944     //      w15 at lower 32 bits of ws[7].
5945     // or, reside in
5946     //    w0:       ws[0]'s lower 32 bits
5947     //    w1 ~ w14: ws[1] ~ ws[7]
5948     //    w15:      ws[8]'s higher 32 bits
5949     Register ws[9] = {x29, x30, x31, x18,
5950                       x19, x20, x21, x22,
5951                       x23}; // auxiliary register for calculating w's value
5952     // current k't's value
5953     const Register cur_k = x24;
5954     // current w't's value
5955     const Register cur_w = x25;
5956     // values of a, b, c, d, e in the previous round
5957     const Register prev_ab = x26, prev_cd = x27;
5958     const Register prev_e = offset; // reuse offset/c_rarg2
5959 
5960     // load 5 words state into a, b, c, d, e.
5961     //
5962     // To minimize the number of memory operations, we apply following
5963     // optimization: read the states (a/b/c/d) of 4-byte values in pairs,
5964     // with a single ld, and split them into 2 registers.
5965     //
5966     // And, as the core algorithm of SHA-1 works on 32-bits words, so
5967     // in the following code, it does not care about the content of
5968     // higher 32-bits in a/b/c/d/e. Based on this observation,
5969     // we can apply further optimization, which is to just ignore the
5970     // higher 32-bits in a/c/e, rather than set the higher
5971     // 32-bits of a/c/e to zero explicitly with extra instructions.
5972     __ ld(a, Address(state, 0));
5973     __ srli(b, a, 32);
5974     __ ld(c, Address(state, 8));
5975     __ srli(d, c, 32);
5976     __ lw(e, Address(state, 16));
5977 
5978     Label L_sha1_loop;
5979     if (multi_block) {
5980       __ BIND(L_sha1_loop);
5981     }
5982 
5983     sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5984 
5985     for (int round = 0; round < 80; round++) {
5986       // prepare K't value
5987       sha1_prepare_k(cur_k, round);
5988 
5989       // prepare W't value
5990       sha1_prepare_w(cur_w, ws, buf, round);
5991 
5992       // one round process
5993       sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
5994     }
5995 
5996     // compute the intermediate hash value
5997     sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5998 
5999     if (multi_block) {
6000       int64_t block_bytes = 16 * 4;
6001       __ addi(buf, buf, block_bytes);
6002 
6003       __ bge(limit, buf, L_sha1_loop, true);
6004     }
6005 
6006     // store back the state.
6007     __ zext(a, a, 32);
6008     __ slli(b, b, 32);
6009     __ orr(a, a, b);
6010     __ sd(a, Address(state, 0));
6011     __ zext(c, c, 32);
6012     __ slli(d, d, 32);
6013     __ orr(c, c, d);
6014     __ sd(c, Address(state, 8));
6015     __ sw(e, Address(state, 16));
6016 
6017     // return offset
6018     if (multi_block) {
6019       __ sub(c_rarg0, buf, src);
6020     }
6021 
6022     __ pop_reg(saved_regs, sp);
6023 
6024     __ leave();
6025     __ ret();
6026 
6027     return (address) start;
6028   }
6029 
6030   /**
6031    * vector registers:
6032    *   input VectorRegister's:  intputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
6033    *   index VectorRegister's:  idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
6034    *   output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
6035    *
6036    * NOTE: each field will occupy a vector register group
6037    */
6038   void base64_vector_encode_round(Register src, Register dst, Register codec,
6039                     Register size, Register stepSrc, Register stepDst,
6040                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
6041                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
6042                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
6043                     Assembler::LMUL lmul) {
6044     // set vector register type/len
6045     __ vsetvli(x0, size, Assembler::e8, lmul);
6046 
6047     // segmented load src into v registers: mem(src) => vr(3)
6048     __ vlseg3e8_v(inputV1, src);
6049 
6050     // src = src + register_group_len_bytes * 3
6051     __ add(src, src, stepSrc);
6052 
6053     // encoding
6054     //   1. compute index into lookup table: vr(3) => vr(4)
6055     __ vsrl_vi(idxV1, inputV1, 2);
6056 
6057     __ vsrl_vi(idxV2, inputV2, 2);
6058     __ vsll_vi(inputV1, inputV1, 6);
6059     __ vor_vv(idxV2, idxV2, inputV1);
6060     __ vsrl_vi(idxV2, idxV2, 2);
6061 
6062     __ vsrl_vi(idxV3, inputV3, 4);
6063     __ vsll_vi(inputV2, inputV2, 4);
6064     __ vor_vv(idxV3, inputV2, idxV3);
6065     __ vsrl_vi(idxV3, idxV3, 2);
6066 
6067     __ vsll_vi(idxV4, inputV3, 2);
6068     __ vsrl_vi(idxV4, idxV4, 2);
6069 
6070     //   2. indexed load: vr(4) => vr(4)
6071     __ vluxei8_v(outputV1, codec, idxV1);
6072     __ vluxei8_v(outputV2, codec, idxV2);
6073     __ vluxei8_v(outputV3, codec, idxV3);
6074     __ vluxei8_v(outputV4, codec, idxV4);
6075 
6076     // segmented store encoded data in v registers back to dst: vr(4) => mem(dst)
6077     __ vsseg4e8_v(outputV1, dst);
6078 
6079     // dst = dst + register_group_len_bytes * 4
6080     __ add(dst, dst, stepDst);
6081   }
6082 
6083   /**
6084    *  void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
6085    *
6086    *  Input arguments:
6087    *  c_rarg0   - src, source array
6088    *  c_rarg1   - sp, src start offset
6089    *  c_rarg2   - sl, src end offset
6090    *  c_rarg3   - dst, dest array
6091    *  c_rarg4   - dp, dst start offset
6092    *  c_rarg5   - isURL, Base64 or URL character set
6093    */
6094   address generate_base64_encodeBlock() {
6095     alignas(64) static const char toBase64[64] = {
6096       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6097       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6098       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6099       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6100       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6101     };
6102 
6103     alignas(64) static const char toBase64URL[64] = {
6104       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6105       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6106       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6107       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6108       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6109     };
6110 
6111     __ align(CodeEntryAlignment);
6112     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
6113     StubCodeMark mark(this, stub_id);
6114     address start = __ pc();
6115     __ enter();
6116 
6117     Register src    = c_rarg0;
6118     Register soff   = c_rarg1;
6119     Register send   = c_rarg2;
6120     Register dst    = c_rarg3;
6121     Register doff   = c_rarg4;
6122     Register isURL  = c_rarg5;
6123 
6124     Register codec  = c_rarg6;
6125     Register length = c_rarg7; // total length of src data in bytes
6126 
6127     Label ProcessData, Exit;
6128 
6129     // length should be multiple of 3
6130     __ sub(length, send, soff);
6131     // real src/dst to process data
6132     __ add(src, src, soff);
6133     __ add(dst, dst, doff);
6134 
6135     // load the codec base address
6136     __ la(codec, ExternalAddress((address) toBase64));
6137     __ beqz(isURL, ProcessData);
6138     __ la(codec, ExternalAddress((address) toBase64URL));
6139     __ BIND(ProcessData);
6140 
6141     // vector version
6142     if (UseRVV) {
6143       Label ProcessM2, ProcessM1, ProcessScalar;
6144 
6145       Register size      = soff;
6146       Register stepSrcM1 = send;
6147       Register stepSrcM2 = doff;
6148       Register stepDst   = isURL;
6149 
6150       __ mv(size, MaxVectorSize * 2);
6151       __ mv(stepSrcM1, MaxVectorSize * 3);
6152       __ slli(stepSrcM2, stepSrcM1, 1);
6153       __ mv(stepDst, MaxVectorSize * 2 * 4);
6154 
6155       __ blt(length, stepSrcM2, ProcessM1);
6156 
6157       __ BIND(ProcessM2);
6158       base64_vector_encode_round(src, dst, codec,
6159                     size, stepSrcM2, stepDst,
6160                     v2, v4, v6,         // inputs
6161                     v8, v10, v12, v14,  // indexes
6162                     v16, v18, v20, v22, // outputs
6163                     Assembler::m2);
6164 
6165       __ sub(length, length, stepSrcM2);
6166       __ bge(length, stepSrcM2, ProcessM2);
6167 
6168       __ BIND(ProcessM1);
6169       __ blt(length, stepSrcM1, ProcessScalar);
6170 
6171       __ srli(size, size, 1);
6172       __ srli(stepDst, stepDst, 1);
6173       base64_vector_encode_round(src, dst, codec,
6174                     size, stepSrcM1, stepDst,
6175                     v1, v2, v3,         // inputs
6176                     v4, v5, v6, v7,     // indexes
6177                     v8, v9, v10, v11,   // outputs
6178                     Assembler::m1);
6179       __ sub(length, length, stepSrcM1);
6180 
6181       __ BIND(ProcessScalar);
6182     }
6183 
6184     // scalar version
6185     {
6186       Register byte1 = soff, byte0 = send, byte2 = doff;
6187       Register combined24Bits = isURL;
6188 
6189       __ beqz(length, Exit);
6190 
6191       Label ScalarLoop;
6192       __ BIND(ScalarLoop);
6193       {
6194         // plain:   [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
6195         // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]
6196 
6197         // load 3 bytes src data
6198         __ lbu(byte0, Address(src, 0));
6199         __ lbu(byte1, Address(src, 1));
6200         __ lbu(byte2, Address(src, 2));
6201         __ addi(src, src, 3);
6202 
6203         // construct 24 bits from 3 bytes
6204         __ slliw(byte0, byte0, 16);
6205         __ slliw(byte1, byte1, 8);
6206         __ orr(combined24Bits, byte0, byte1);
6207         __ orr(combined24Bits, combined24Bits, byte2);
6208 
6209         // get codec index and encode(ie. load from codec by index)
6210         __ slliw(byte0, combined24Bits, 8);
6211         __ srliw(byte0, byte0, 26);
6212         __ add(byte0, codec, byte0);
6213         __ lbu(byte0, byte0);
6214 
6215         __ slliw(byte1, combined24Bits, 14);
6216         __ srliw(byte1, byte1, 26);
6217         __ add(byte1, codec, byte1);
6218         __ lbu(byte1, byte1);
6219 
6220         __ slliw(byte2, combined24Bits, 20);
6221         __ srliw(byte2, byte2, 26);
6222         __ add(byte2, codec, byte2);
6223         __ lbu(byte2, byte2);
6224 
6225         __ andi(combined24Bits, combined24Bits, 0x3f);
6226         __ add(combined24Bits, codec, combined24Bits);
6227         __ lbu(combined24Bits, combined24Bits);
6228 
6229         // store 4 bytes encoded data
6230         __ sb(byte0, Address(dst, 0));
6231         __ sb(byte1, Address(dst, 1));
6232         __ sb(byte2, Address(dst, 2));
6233         __ sb(combined24Bits, Address(dst, 3));
6234 
6235         __ subi(length, length, 3);
6236         __ addi(dst, dst, 4);
6237         // loop back
6238         __ bnez(length, ScalarLoop);
6239       }
6240     }
6241 
6242     __ BIND(Exit);
6243 
6244     __ leave();
6245     __ ret();
6246 
6247     return (address) start;
6248   }
6249 
6250   /**
6251    * vector registers:
6252    * input VectorRegister's:  intputV1-V4, for m2 they could be v2, v4, v6, for m1 they could be v2, v4, v6, v8
6253    * index VectorRegister's:  idxV1-V3, for m2 they could be v8, v10, v12, v14, for m1 they could be v10, v12, v14, v16
6254    * output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v18, v20, v22
6255    *
6256    * NOTE: each field will occupy a single vector register group
6257    */
6258   void base64_vector_decode_round(Register src, Register dst, Register codec,
6259                     Register size, Register stepSrc, Register stepDst, Register failedIdx,
6260                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
6261                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
6262                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
6263                     Assembler::LMUL lmul) {
6264     // set vector register type/len
6265     __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);
6266 
6267     // segmented load src into v registers: mem(src) => vr(4)
6268     __ vlseg4e8_v(inputV1, src);
6269 
6270     // src = src + register_group_len_bytes * 4
6271     __ add(src, src, stepSrc);
6272 
6273     // decoding
6274     //   1. indexed load: vr(4) => vr(4)
6275     __ vluxei8_v(idxV1, codec, inputV1);
6276     __ vluxei8_v(idxV2, codec, inputV2);
6277     __ vluxei8_v(idxV3, codec, inputV3);
6278     __ vluxei8_v(idxV4, codec, inputV4);
6279 
6280     //   2. check wrong data
6281     __ vor_vv(outputV1, idxV1, idxV2);
6282     __ vor_vv(outputV2, idxV3, idxV4);
6283     __ vor_vv(outputV1, outputV1, outputV2);
6284     __ vmseq_vi(v0, outputV1, -1);
6285     __ vfirst_m(failedIdx, v0);
6286     Label NoFailure, FailureAtIdx0;
6287     // valid value can only be -1 when < 0
6288     __ bltz(failedIdx, NoFailure);
6289     // when the first data (at index 0) fails, no need to process data anymore
6290     __ beqz(failedIdx, FailureAtIdx0);
6291     __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
6292     __ slli(stepDst, failedIdx, 1);
6293     __ add(stepDst, failedIdx, stepDst);
6294     __ BIND(NoFailure);
6295 
6296     //   3. compute the decoded data: vr(4) => vr(3)
6297     __ vsll_vi(idxV1, idxV1, 2);
6298     __ vsrl_vi(outputV1, idxV2, 4);
6299     __ vor_vv(outputV1, outputV1, idxV1);
6300 
6301     __ vsll_vi(idxV2, idxV2, 4);
6302     __ vsrl_vi(outputV2, idxV3, 2);
6303     __ vor_vv(outputV2, outputV2, idxV2);
6304 
6305     __ vsll_vi(idxV3, idxV3, 6);
6306     __ vor_vv(outputV3, idxV4, idxV3);
6307 
6308     // segmented store encoded data in v registers back to dst: vr(3) => mem(dst)
6309     __ vsseg3e8_v(outputV1, dst);
6310 
6311     // dst = dst + register_group_len_bytes * 3
6312     __ add(dst, dst, stepDst);
6313     __ BIND(FailureAtIdx0);
6314   }
6315 
6316   /**
6317    * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
6318    *
6319    *  Input arguments:
6320    *  c_rarg0   - src, source array
6321    *  c_rarg1   - sp, src start offset
6322    *  c_rarg2   - sl, src end offset
6323    *  c_rarg3   - dst, dest array
6324    *  c_rarg4   - dp, dst start offset
6325    *  c_rarg5   - isURL, Base64 or URL character set
6326    *  c_rarg6   - isMIME, Decoding MIME block
6327    */
6328   address generate_base64_decodeBlock() {
6329 
6330     static const uint8_t fromBase64[256] = {
6331         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6332         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6333         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6334         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6335         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6336         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6337         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6338         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6339         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6340         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6341         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6342         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6343         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6344         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6345         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6346         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6347     };
6348 
6349     static const uint8_t fromBase64URL[256] = {
6350         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6351         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6352         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6353         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6354         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6355         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6356         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6357         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6358         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6359         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6360         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6361         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6362         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6363         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6364         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6365         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6366     };
6367 
6368     __ align(CodeEntryAlignment);
6369     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
6370     StubCodeMark mark(this, stub_id);
6371     address start = __ pc();
6372     __ enter();
6373 
6374     Register src    = c_rarg0;
6375     Register soff   = c_rarg1;
6376     Register send   = c_rarg2;
6377     Register dst    = c_rarg3;
6378     Register doff   = c_rarg4;
6379     Register isURL  = c_rarg5;
6380     Register isMIME = c_rarg6;
6381 
6382     Register codec     = c_rarg7;
6383     Register dstBackup = t6;
6384     Register length    = t3;     // total length of src data in bytes
6385 
6386     Label ProcessData, Exit;
6387     Label ProcessScalar, ScalarLoop;
6388 
6389     // passed in length (send - soff) is guaranteed to be > 4,
6390     // and in this intrinsic we only process data of length in multiple of 4,
6391     // it's not guaranteed to be multiple of 4 by java level, so do it explicitly
6392     __ sub(length, send, soff);
6393     __ andi(length, length, -4);
6394     // real src/dst to process data
6395     __ add(src, src, soff);
6396     __ add(dst, dst, doff);
6397     // backup of dst, used to calculate the return value at exit
6398     __ mv(dstBackup, dst);
6399 
6400     // load the codec base address
6401     __ la(codec, ExternalAddress((address) fromBase64));
6402     __ beqz(isURL, ProcessData);
6403     __ la(codec, ExternalAddress((address) fromBase64URL));
6404     __ BIND(ProcessData);
6405 
6406     // vector version
6407     if (UseRVV) {
6408       // for MIME case, it has a default length limit of 76 which could be
6409       // different(smaller) from (send - soff), so in MIME case, we go through
6410       // the scalar code path directly.
6411       __ bnez(isMIME, ScalarLoop);
6412 
6413       Label ProcessM1, ProcessM2;
6414 
6415       Register failedIdx = soff;
6416       Register stepSrcM1 = send;
6417       Register stepSrcM2 = doff;
6418       Register stepDst   = isURL;
6419       Register size      = t4;
6420 
6421       __ mv(size, MaxVectorSize * 2);
6422       __ mv(stepSrcM1, MaxVectorSize * 4);
6423       __ slli(stepSrcM2, stepSrcM1, 1);
6424       __ mv(stepDst, MaxVectorSize * 2 * 3);
6425 
6426       __ blt(length, stepSrcM2, ProcessM1);
6427 
6428 
6429       // Assembler::m2
6430       __ BIND(ProcessM2);
6431       base64_vector_decode_round(src, dst, codec,
6432                     size, stepSrcM2, stepDst, failedIdx,
6433                     v2, v4, v6, v8,      // inputs
6434                     v10, v12, v14, v16,  // indexes
6435                     v18, v20, v22,       // outputs
6436                     Assembler::m2);
6437       __ sub(length, length, stepSrcM2);
6438 
6439       // error check
6440       // valid value of failedIdx can only be -1 when < 0
6441       __ bgez(failedIdx, Exit);
6442 
6443       __ bge(length, stepSrcM2, ProcessM2);
6444 
6445 
6446       // Assembler::m1
6447       __ BIND(ProcessM1);
6448       __ blt(length, stepSrcM1, ProcessScalar);
6449 
6450       __ srli(size, size, 1);
6451       __ srli(stepDst, stepDst, 1);
6452       base64_vector_decode_round(src, dst, codec,
6453                     size, stepSrcM1, stepDst, failedIdx,
6454                     v1, v2, v3, v4,      // inputs
6455                     v5, v6, v7, v8,      // indexes
6456                     v9, v10, v11,        // outputs
6457                     Assembler::m1);
6458       __ sub(length, length, stepSrcM1);
6459 
6460       // error check
6461       // valid value of failedIdx can only be -1 when < 0
6462       __ bgez(failedIdx, Exit);
6463 
6464       __ BIND(ProcessScalar);
6465       __ beqz(length, Exit);
6466     }
6467 
6468     // scalar version
6469     {
6470       Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
6471       Register combined32Bits = t4;
6472 
6473       // encoded:   [byte0[5:0] : byte1[5:0] : byte2[5:0]] : byte3[5:0]] =>
6474       // plain:     [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
6475       __ BIND(ScalarLoop);
6476 
6477       // load 4 bytes encoded src data
6478       __ lbu(byte0, Address(src, 0));
6479       __ lbu(byte1, Address(src, 1));
6480       __ lbu(byte2, Address(src, 2));
6481       __ lbu(byte3, Address(src, 3));
6482       __ addi(src, src, 4);
6483 
6484       // get codec index and decode (ie. load from codec by index)
6485       __ add(byte0, codec, byte0);
6486       __ add(byte1, codec, byte1);
6487       __ lb(byte0, Address(byte0, 0));
6488       __ lb(byte1, Address(byte1, 0));
6489       __ add(byte2, codec, byte2);
6490       __ add(byte3, codec, byte3);
6491       __ lb(byte2, Address(byte2, 0));
6492       __ lb(byte3, Address(byte3, 0));
6493       __ slliw(byte0, byte0, 18);
6494       __ slliw(byte1, byte1, 12);
6495       __ orr(byte0, byte0, byte1);
6496       __ orr(byte0, byte0, byte3);
6497       __ slliw(byte2, byte2, 6);
6498       // For performance consideration, `combined32Bits` is constructed for 2 purposes at the same time,
6499       //  1. error check below
6500       //  2. decode below
6501       __ orr(combined32Bits, byte0, byte2);
6502 
6503       // error check
6504       __ bltz(combined32Bits, Exit);
6505 
6506       // store 3 bytes decoded data
6507       __ sraiw(byte0, combined32Bits, 16);
6508       __ sraiw(byte1, combined32Bits, 8);
6509       __ sb(byte0, Address(dst, 0));
6510       __ sb(byte1, Address(dst, 1));
6511       __ sb(combined32Bits, Address(dst, 2));
6512 
6513       __ subi(length, length, 4);
6514       __ addi(dst, dst, 3);
6515       // loop back
6516       __ bnez(length, ScalarLoop);
6517     }
6518 
6519     __ BIND(Exit);
6520     __ sub(c_rarg0, dst, dstBackup);
6521 
6522     __ leave();
6523     __ ret();
6524 
6525     return (address) start;
6526   }
6527 
  // Emits code that folds the next `step` bytes at `buff` into the running
  // Adler32 sums s1 and s2, and advances `buff` by `step`.
  //
  //   buff     - in/out: address of the next input byte, incremented by `step`
  //   s1, s2   - in/out: running sums; no modulo reduction is performed here
  //   vtable   - weights multiplied element-wise with the loaded bytes; per the
  //              derivation below it is expected to hold { step, step - 1, ..., 1 }
  //   vzero    - passed as the third operand of both widening reduction sums
  //              (the visible caller fills it with zeroes)
  //   vbytes, vs1acc, vs2acc, vtemp1 - vector scratch, overwritten
  //   temp0, temp1, temp3            - scalar scratch, overwritten (temp3 = step)
  //   temp2, vtemp2                  - not referenced by this helper
  //   step, lmul - number of bytes per call and the matching register group size
  void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
    VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
    Register temp0, Register temp1, Register temp2,  Register temp3,
    VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {

    assert((lmul == Assembler::m4 && step == 64) ||
           (lmul == Assembler::m2 && step == 32) ||
           (lmul == Assembler::m1 && step == 16),
           "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
    // Below is function for calculating Adler32 checksum with 64-, 32- or 16-byte step. LMUL=m4, m2 or m1 is used.
    // The results are in v12, v13, ..., v22, v23. Example below is for 64-byte step case.
    // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
    // In non-vectorized code, we update s1 and s2 as:
    //   s1 <- s1 + b1
    //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + s1
    //   ...
    //   s1 <- s1 + b64
    //   s2 <- s2 + s1
    // Putting above assignments together, we have:
    //   s1_new = s1 + b1 + b2 + ... + b64
    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
    //          = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
    //          = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)

    // temp3 keeps the requested vector length (step) for every vsetvli below
    __ mv(temp3, step);
    // Load data
    __ vsetvli(temp0, temp3, Assembler::e8, lmul);
    __ vle8_v(vbytes, buff);
    __ addi(buff, buff, step);

    // Upper bound reduction sum for s1_new:
    // 0xFF * 64 = 0x3FC0, so:
    // 1. Need to do vector-widening reduction sum
    // 2. It is safe to perform sign-extension during vmv.x.s with 16-bits elements
    __ vwredsumu_vs(vs1acc, vbytes, vzero);
    // Multiplication for s2_new
    __ vwmulu_vv(vs2acc, vtable, vbytes);

    // s2 = s2 + s1 * step (step is a power of two, so shift by log2(step)).
    // This must use the old s1, i.e. happen before s1 is updated below.
    __ slli(temp1, s1, exact_log2(step));
    __ add(s2, s2, temp1);

    // Summing up calculated results for s2_new
    if (MaxVectorSize > 16) {
      __ vsetvli(temp0, temp3, Assembler::e16, lmul);
    } else {
      // Half of vector-widening multiplication result is in successor of vs2acc
      // group for vlen == 16, in which case we need to double vector register
      // group width in order to reduction sum all of them
      Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
                               (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
      __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
    }
    // Upper bound for reduction sum:
    // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for whole register group, so:
    // 1. Need to do vector-widening reduction sum
    // 2. It is safe to perform sign-extension during vmv.x.s with 32-bits elements
    __ vwredsumu_vs(vtemp1, vs2acc, vzero);

    // Extracting results for:
    // s1_new (read while the element width is still 16 bits)
    __ vmv_x_s(temp0, vs1acc);
    __ add(s1, s1, temp0);
    // s2_new (switch to 32-bit elements first, the second reduction widened to 32 bits)
    __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
    __ vmv_x_s(temp1, vtemp1);
    __ add(s2, s2, temp1);
  }
6598 
  /***
   *  int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
   *
   *  Generates the vectorized Adler32 update stub. The input is consumed in
   *  chunks of NMAX bytes (each followed by a modulo BASE reduction), then in
   *  64-/16-byte vector steps, and finally byte by byte.
   *
   *  Arguments:
   *
   *  Inputs:
   *   c_rarg0   - int   adler
   *   c_rarg1   - byte* buff (b + off)
   *   c_rarg2   - int   len
   *
   *  Output:
   *   c_rarg0   - int adler result
   *
   *  Returns the entry address of the generated stub.
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
      L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    // Note: adler and s1 share c_rarg0, so s1 is extracted last (after s2).
    Register adler  = c_rarg0;
    Register s1     = c_rarg0;
    Register s2     = c_rarg3;
    Register buff   = c_rarg1;
    Register len    = c_rarg2;
    Register nmax  = c_rarg4;
    Register base  = c_rarg5;
    Register count = c_rarg6;
    Register temp0 = t3;
    Register temp1 = t4;
    Register temp2 = t5;
    Register temp3 = t6;

    VectorRegister vzero = v31;
    VectorRegister vbytes = v8; // group: v8, v9, v10, v11
    VectorRegister vs1acc = v12; // group: v12, v13, v14, v15
    VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
    VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
    VectorRegister vtable_32 = v4; // group: v4, v5
    VectorRegister vtable_16 = v30;
    VectorRegister vtemp1 = v28;
    VectorRegister vtemp2 = v29;

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    const uint64_t BASE = 0xfff1;
    const uint64_t NMAX = 0x15B0;

    // Loops steps
    int step_64 = 64;
    int step_32 = 32;
    int step_16 = 16;
    int step_1  = 1;

    __ enter(); // Required for proper stackwalking of RuntimeStub frame
    __ mv(temp1, 64);
    __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);

    // Generating accumulation coefficients for further calculations
    // vtable_64: element i = 64 - i (vid gives the index, vrsub computes temp1 - index)
    __ vid_v(vtemp1);
    __ vrsub_vx(vtable_64, vtemp1, temp1);
    // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }

    // vtable_32:
    __ mv(temp1, 32);
    __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
    __ vid_v(vtemp1);
    __ vrsub_vx(vtable_32, vtemp1, temp1);
    // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }

    __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
    // vtable_16:
    __ mv(temp1, 16);
    __ vid_v(vtemp1);
    __ vrsub_vx(vtable_16, vtemp1, temp1);
    // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }

    __ vmv_v_i(vzero, 0);

    __ mv(base, BASE);
    __ mv(nmax, NMAX);

    // s1 is initialized to the lower 16 bits of adler
    // s2 is initialized to the upper 16 bits of adler
    __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff)
    __ zext(s1, adler, 16); // s1 = (adler & 0xffff)

    // The pipelined loop needs at least 16 elements for 1 iteration
    // It does check this, but it is more effective to skip to the cleanup loop
    __ mv(temp0, step_16);
    __ bgeu(len, temp0, L_nmax);
    __ beqz(len, L_combine);

    // Jumping to L_by1_loop
    // The byte loop expects len biased by -1 (it runs while len >= 0).
    __ subi(len, len, step_1);
    __ j(L_by1_loop);

  __ bind(L_nmax);
    // From here on len holds (remaining bytes - NMAX); negative means that
    // less than a full NMAX chunk is left.
    __ sub(len, len, nmax);
    __ subi(count, nmax, 16);
    __ bltz(len, L_by16);

  // Align L_nmax loop by 64
  // count = NMAX - 16 - 32 = 5504 = 86 * 64, so the 64-byte loop below runs
  // a whole number of iterations and leaves 32 + 16 bytes of the chunk.
  __ bind(L_nmax_loop_entry);
    __ subi(count, count, 32);

  __ bind(L_nmax_loop);
    adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
      vtemp1, vtemp2, step_64, Assembler::m4);
    __ subi(count, count, step_64);
    __ bgtz(count, L_nmax_loop);

    // There are three iterations left to do
    // (i.e. 48 bytes: handled as one 32-byte step plus one 16-byte step)
    adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
      vtemp1, vtemp2, step_32, Assembler::m2);
    adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
      vtemp1, vtemp2, step_16, Assembler::m1);

    // s1 = s1 % BASE
    __ remuw(s1, s1, base);
    // s2 = s2 % BASE
    __ remuw(s2, s2, base);

    // Try the next full NMAX chunk; count is reset to NMAX - 16 either way
    // because L_by16 relies on that value.
    __ sub(len, len, nmax);
    __ subi(count, nmax, 16);
    __ bgez(len, L_nmax_loop_entry);

  __ bind(L_by16);
    // len = (remaining - NMAX) + (NMAX - 16) = remaining - 16
    __ add(len, len, count);
    __ bltz(len, L_by1);
    // Trying to unroll
    __ mv(temp3, step_64);
    __ blt(len, temp3, L_by16_loop);

  __ bind(L_by16_loop_unroll);
    adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
      vtemp1, vtemp2, step_64, Assembler::m4);
    __ subi(len, len, step_64);
    // By now the temp3 should still be 64
    // (adler32_process_bytes sets temp3 to its step, which was 64 here)
    __ bge(len, temp3, L_by16_loop_unroll);

  __ bind(L_by16_loop);
    adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
      vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
      vtemp1, vtemp2, step_16, Assembler::m1);
    __ subi(len, len, step_16);
    __ bgez(len, L_by16_loop);

  __ bind(L_by1);
    // len is (remaining - 16) here; re-bias it to (remaining - 1)
    __ addi(len, len, 15);
    __ bltz(len, L_do_mod);

  __ bind(L_by1_loop);
    // Scalar tail: s1 += byte; s2 += s1
    __ lbu(temp0, Address(buff, 0));
    __ addi(buff, buff, step_1);
    __ add(s1, temp0, s1);
    __ add(s2, s2, s1);
    __ subi(len, len, step_1);
    __ bgez(len, L_by1_loop);

  __ bind(L_do_mod);
    // s1 = s1 % BASE
    __ remuw(s1, s1, base);
    // s2 = s2 % BASE
    __ remuw(s2, s2, base);

    // Combine lower bits and higher bits
    // adler = s1 | (s2 << 16)
  __ bind(L_combine);
    __ slli(s2, s2, 16);
    __ orr(s1, s1, s2);

    __ leave(); // Required for proper stackwalking of RuntimeStub frame
    __ ret();

    return start;
  }
6784 
6785 #endif // COMPILER2
6786 
  // Generates the float16 -> float conversion stub and returns its entry address.
  //
  // x10 = input (float16)
  // f10 = result (float)
  // t0, t1 = temporary registers
  address generate_float16ToFloat() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_hf2f_id;
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    BLOCK_COMMENT("float16ToFloat:");

    FloatRegister dst = f10;
    Register src = x10;
    Label NaN_SLOW;

    assert(VM_Version::supports_float16_float_conversion(), "must");

    // On riscv, NaN needs a special process as fcvt does not work in that case.
    // On riscv, Inf does not need a special process as fcvt can handle it correctly.
    // but we consider to get the slow path to process NaN and Inf at the same time,
    // as both of them are rare cases, and if we try to get the slow path to handle
    // only NaN case it would sacrifice the performance for normal cases,
    // i.e. non-NaN and non-Inf cases.

    // check whether it's a NaN or +/- Inf.
    // 0x7c00 is the float16 exponent mask; all exponent bits set means NaN or Inf.
    __ mv(t0, 0x7c00);
    __ andr(t1, src, t0);
    // jump to stub processing NaN and Inf cases.
    __ beq(t0, t1, NaN_SLOW);

    // non-NaN or non-Inf cases, just use built-in instructions.
    __ fmv_h_x(dst, src);
    __ fcvt_s_h(dst, dst);
    __ ret();

    __ bind(NaN_SLOW);
    // following instructions mainly focus on NaN, as riscv does not handle
    // NaN well with fcvt, but the code also works for Inf at the same time.

    // construct a NaN in 32 bits from the NaN in 16 bits,
    // we need the payloads of non-canonical NaNs to be preserved.
    // 0x7f800000 is the float exponent mask (all exponent bits set).
    __ mv(t1, 0x7f800000);
    // sign-bit was already set via sign-extension if necessary.
    // Shifting left by 13 moves the 10-bit float16 fraction to the top of the
    // 23-bit float fraction.
    __ slli(t0, src, 13);
    __ orr(t1, t0, t1);
    __ fmv_w_x(dst, t1);

    __ ret();
    return entry;
  }
6836 
  // Generates the float -> float16 conversion stub and returns its entry address.
  //
  // f10 = input (float)
  // x10 = result (float16)
  // f11 = temporary float register
  // t0, t1 = temporary registers
  address generate_floatToFloat16() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_f2hf_id;
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    BLOCK_COMMENT("floatToFloat16:");

    Register dst = x10;
    FloatRegister src = f10, ftmp = f11;
    Label NaN_SLOW;

    assert(VM_Version::supports_float16_float_conversion(), "must");

    // On riscv, NaN needs a special process as fcvt does not work in that case.

    // check whether it's a NaN.
    // replace fclass with feq as performance optimization.
    // (src == src yields 0 only when src is a NaN)
    __ feq_s(t0, src, src);
    // jump to stub processing NaN cases.
    __ beqz(t0, NaN_SLOW);

    // non-NaN cases, just use built-in instructions.
    __ fcvt_h_s(ftmp, src);
    __ fmv_x_h(dst, ftmp);
    __ ret();

    __ bind(NaN_SLOW);

    // NaN case is delegated to the macro assembler helper, with t0/t1 as scratch.
    __ float_to_float16_NaN(dst, src, t0, t1);

    __ ret();
    return entry;
  }
6874 
6875 #ifdef COMPILER2
6876 
// Masks used by the Poly1305 stubs below: right_2_bits is applied where only
// the two lowest bits of a register are kept, right_3_bits where the three
// lowest bits are kept.
static const int64_t right_2_bits = right_n_bits(2);
static const int64_t right_3_bits = right_n_bits(3);
6879 
  // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
  // are represented as long[5], with BITS_PER_LIMB = 26.
  // Pack five 26-bit limbs into three 64-bit registers.
  //
  //   dest0, dest1 - receive bits 0..63 and 64..127 of the packed value
  //   dest2        - receives the remaining top bits; may be noreg, in which
  //                  case debug builds emit a check that those bits are zero
  //   src          - address of the long[5] limb array
  //   tmp1, tmp2   - scratch registers, overwritten
  void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) {
    assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2);

    // The goal is to have the 130-bit (5 x 26) value in dest2:dest1:dest0
    __ ld(dest0, Address(src, 0));    // 26 bits in dest0

    __ ld(tmp1, Address(src, sizeof(jlong)));
    __ slli(tmp1, tmp1, 26);
    __ add(dest0, dest0, tmp1);       // 52 bits in dest0

    // Limb 2 straddles dest0/dest1: its low 12 bits complete dest0 ...
    __ ld(tmp2, Address(src, 2 * sizeof(jlong)));
    __ slli(tmp1, tmp2, 52);
    __ add(dest0, dest0, tmp1);       // dest0 is full

    // ... and its high 14 bits start dest1.
    __ srli(dest1, tmp2, 12);         // 14-bit in dest1

    __ ld(tmp1, Address(src, 3 * sizeof(jlong)));
    __ slli(tmp1, tmp1, 14);
    __ add(dest1, dest1, tmp1);       // 40-bit in dest1

    // Limb 4 straddles dest1/dest2: its low 24 bits complete dest1.
    __ ld(tmp1, Address(src, 4 * sizeof(jlong)));
    __ slli(tmp2, tmp1, 40);
    __ add(dest1, dest1, tmp2);       // dest1 is full

    if (dest2->is_valid()) {
      __ srli(tmp1, tmp1, 24);
      __ mv(dest2, tmp1);               // 2 bits in dest2
    } else {
#ifdef ASSERT
      // Caller asked for a 128-bit result only: verify nothing is dropped.
      Label OK;
      __ srli(tmp1, tmp1, 24);
      __ beq(zr, tmp1, OK);           // 2 bits
      __ stop("high bits of Poly1305 integer should be zero");
      __ should_not_reach_here();
      __ bind(OK);
#endif
    }
  }
6921 
  // As above, but return only a 128-bit integer, packed into two
  // 64-bit registers.
  // Delegates to the three-destination variant with noreg as the top
  // register, so debug builds check that the bits above 128 are zero.
  void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) {
    poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2);
  }
6927 
  // U_2:U_1:U_0: += (U_2 >> 2) * 5
  //
  // Emits a partial reduction step: the bits of U_2 above its lowest two are
  // removed from U_2 and added back, multiplied by 5, into the low end of
  // U_2:U_1:U_0. The multiplication by 5 is done as x + (x << 2).
  // tmp1 and tmp2 are scratch registers and are overwritten.
  void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) {
    assert_different_registers(U_2, U_1, U_0, tmp1, tmp2);

    // First, U_2:U_1:U_0 += (U_2 >> 2)
    __ srli(tmp1, U_2, 2);
    __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
    __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits
    __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
    __ add(U_2, U_2, tmp2);

    // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
    // (tmp1 still holds the original U_2 >> 2 from the first step)
    __ slli(tmp1, tmp1, 2);
    __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
    __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
    __ add(U_2, U_2, tmp2);
  }
6945 
6946   // Poly1305, RFC 7539
6947   // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)
6948 
6949   // Arguments:
6950   //    c_rarg0:   input_start -- where the input is stored
6951   //    c_rarg1:   length
6952   //    c_rarg2:   acc_start -- where the output will be stored
6953   //    c_rarg3:   r_start -- where the randomly generated 128-bit key is stored
6954 
6955   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
6956   // description of the tricks used to simplify and accelerate this
6957   // computation.
6958 
6959   address generate_poly1305_processBlocks() {
6960     __ align(CodeEntryAlignment);
6961     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
6962     StubCodeMark mark(this, stub_id);
6963     address start = __ pc();
6964     __ enter();
6965     Label here;
6966 
6967     RegSet saved_regs = RegSet::range(x18, x21);
6968     RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin();
6969     __ push_reg(saved_regs, sp);
6970 
6971     // Arguments
6972     const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;
6973 
6974     // R_n is the 128-bit randomly-generated key, packed into two
6975     // registers. The caller passes this key to us as long[5], with
6976     // BITS_PER_LIMB = 26.
6977     const Register R_0 = *regs, R_1 = *++regs;
6978     poly1305_pack_26(R_0, R_1, r_start, t1, t2);
6979 
6980     // RR_n is (R_n >> 2) * 5
6981     const Register RR_0 = *++regs, RR_1 = *++regs;
6982     __ srli(t1, R_0, 2);
6983     __ shadd(RR_0, t1, t1, t2, 2);
6984     __ srli(t1, R_1, 2);
6985     __ shadd(RR_1, t1, t1, t2, 2);
6986 
6987     // U_n is the current checksum
6988     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
6989     poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);
6990 
6991     static constexpr int BLOCK_LENGTH = 16;
6992     Label DONE, LOOP;
6993 
6994     __ mv(t1, BLOCK_LENGTH);
6995     __ blt(length, t1, DONE); {
6996       __ bind(LOOP);
6997 
6998       // S_n is to be the sum of U_n and the next block of data
6999       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7000       __ ld(S_0, Address(input_start, 0));
7001       __ ld(S_1, Address(input_start, wordSize));
7002 
7003       __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1
7004       __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1
7005       __ add(S_2, U_2, t1);
7006 
7007       __ addi(S_2, S_2, 1);
7008 
7009       const Register U_0HI = *++regs, U_1HI = *++regs;
7010 
7011       // NB: this logic depends on some of the special properties of
7012       // Poly1305 keys. In particular, because we know that the top
7013       // four bits of R_0 and R_1 are zero, we can add together
7014       // partial products without any risk of needing to propagate a
7015       // carry out.
7016       __ wide_mul(U_0, U_0HI, S_0, R_0);
7017       __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
7018       __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);
7019 
7020       __ wide_mul(U_1, U_1HI, S_0, R_1);
7021       __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
7022       __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);
7023 
7024       __ andi(U_2, R_0, right_2_bits);
7025       __ mul(U_2, S_2, U_2);
7026 
7027       // Partial reduction mod 2**130 - 5
7028       __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
7029       __ adc(U_2, U_2, U_1HI, t1);
7030       // Sum is now in U_2:U_1:U_0.
7031 
7032       // U_2:U_1:U_0: += (U_2 >> 2) * 5
7033       poly1305_reduce(U_2, U_1, U_0, t1, t2);
7034 
7035       __ subi(length, length, BLOCK_LENGTH);
7036       __ addi(input_start, input_start, BLOCK_LENGTH);
7037       __ mv(t1, BLOCK_LENGTH);
7038       __ bge(length, t1, LOOP);
7039     }
7040 
7041     // Further reduce modulo 2^130 - 5
7042     poly1305_reduce(U_2, U_1, U_0, t1, t2);
7043 
7044     // Unpack the sum into five 26-bit limbs and write to memory.
7045     // First 26 bits is the first limb
7046     __ slli(t1, U_0, 38); // Take lowest 26 bits
7047     __ srli(t1, t1, 38);
7048     __ sd(t1, Address(acc_start)); // First 26-bit limb
7049 
7050     // 27-52 bits of U_0 is the second limb
7051     __ slli(t1, U_0, 12); // Take next 27-52 bits
7052     __ srli(t1, t1, 38);
7053     __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb
7054 
7055     // Getting 53-64 bits of U_0 and 1-14 bits of U_1 in one register
7056     __ srli(t1, U_0, 52);
7057     __ slli(t2, U_1, 50);
7058     __ srli(t2, t2, 38);
7059     __ add(t1, t1, t2);
7060     __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb
7061 
7062     // Storing 15-40 bits of U_1
7063     __ slli(t1, U_1, 24); // Already used up 14 bits
7064     __ srli(t1, t1, 38); // Clear all other bits from t1
7065     __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb
7066 
7067     // Storing 41-64 bits of U_1 and first three bits from U_2 in one register
7068     __ srli(t1, U_1, 40);
7069     __ andi(t2, U_2, right_3_bits);
7070     __ slli(t2, t2, 24);
7071     __ add(t1, t1, t2);
7072     __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb
7073 
7074     __ bind(DONE);
7075     __ pop_reg(saved_regs, sp);
7076     __ leave(); // Required for proper stackwalking
7077     __ ret();
7078 
7079     return start;
7080   }
7081 
7082   address generate_arrays_hashcode_powers_of_31() {
7083     assert(UseRVV, "sanity");
7084     const int lmul = 2;
7085     const int stride = MaxVectorSize / sizeof(jint) * lmul;
7086     __ align(CodeEntryAlignment);
7087     StubCodeMark mark(this, "StubRoutines", "arrays_hashcode_powers_of_31");
7088     address start = __ pc();
7089     for (int i = stride; i >= 0; i--) {
7090         jint power_of_31 = 1;
7091         for (int j = i; j > 0; j--) {
7092           power_of_31 = java_multiply(power_of_31, 31);
7093         }
7094         __ emit_int32(power_of_31);
7095     }
7096 
7097     return start;
7098   }
7099 
7100 #endif // COMPILER2
7101 
7102   /**
7103    *  Arguments:
7104    *
7105    * Inputs:
7106    *   c_rarg0   - int crc
7107    *   c_rarg1   - byte* buf
7108    *   c_rarg2   - int length
7109    *
7110    * Output:
7111    *   c_rarg0   - int crc result
7112    */
7113   address generate_updateBytesCRC32() {
7114     assert(UseCRC32Intrinsics, "what are we doing here?");
7115 
7116     __ align(CodeEntryAlignment);
7117     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
7118     StubCodeMark mark(this, stub_id);
7119 
7120     address start = __ pc();
7121 
7122     // input parameters
7123     const Register crc    = c_rarg0;  // crc
7124     const Register buf    = c_rarg1;  // source java byte array address
7125     const Register len    = c_rarg2;  // length
7126 
7127     BLOCK_COMMENT("Entry:");
7128     __ enter(); // required for proper stackwalking of RuntimeStub frame
7129 
7130     __ kernel_crc32(crc, buf, len,
7131                     c_rarg3, c_rarg4, c_rarg5, c_rarg6, // tmp's for tables
7132                     c_rarg7, t2, t3, t4, t5, t6);       // misc tmps
7133 
7134     __ leave(); // required for proper stackwalking of RuntimeStub frame
7135     __ ret();
7136 
7137     return start;
7138   }
7139 
7140   // exception handler for upcall stubs
7141   address generate_upcall_stub_exception_handler() {
7142     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
7143     StubCodeMark mark(this, stub_id);
7144     address start = __ pc();
7145 
7146     // Native caller has no idea how to handle exceptions,
7147     // so we just crash here. Up to callee to catch exceptions.
7148     __ verify_oop(x10); // return a exception oop in a0
7149     __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception));
7150     __ should_not_reach_here();
7151 
7152     return start;
7153   }
7154 
7155   // load Method* target of MethodHandle
7156   // j_rarg0 = jobject receiver
7157   // xmethod = Method* result
7158   address generate_upcall_stub_load_target() {
7159 
7160     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
7161     StubCodeMark mark(this, stub_id);
7162     address start = __ pc();
7163 
7164     __ resolve_global_jobject(j_rarg0, t0, t1);
7165       // Load target method from receiver
7166     __ load_heap_oop(xmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), t0, t1);
7167     __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_LambdaForm::vmentry_offset()), t0, t1);
7168     __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_MemberName::method_offset()), t0, t1);
7169     __ access_load_at(T_ADDRESS, IN_HEAP, xmethod,
7170                       Address(xmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
7171                       noreg, noreg);
7172     __ sd(xmethod, Address(xthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
7173 
7174     __ ret();
7175 
7176     return start;
7177   }
7178 
7179 #undef __
7180 
7181   // Initialization
  // Entry point for the preuniverse stub blob. Intentionally empty on this
  // platform; the constructor still calls it for stubgen_preuniverse_id.
  void generate_preuniverse_stubs() {
    // preuniverse stubs are not needed for riscv
  }
7185 
7186   void generate_initial_stubs() {
7187     // Generate initial stubs and initializes the entry points
7188 
7189     // entry points that exist in all platforms Note: This is code
7190     // that could be shared among different platforms - however the
7191     // benefit seems to be smaller than the disadvantage of having a
7192     // much more complicated generator structure. See also comment in
7193     // stubRoutines.hpp.
7194 
7195     StubRoutines::_forward_exception_entry = generate_forward_exception();
7196 
7197     if (UnsafeMemoryAccess::_table == nullptr) {
7198       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
7199     }
7200 
7201     StubRoutines::_call_stub_entry =
7202       generate_call_stub(StubRoutines::_call_stub_return_address);
7203 
7204     // is referenced by megamorphic call
7205     StubRoutines::_catch_exception_entry = generate_catch_exception();
7206 
7207     if (UseCRC32Intrinsics) {
7208       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7209     }
7210 
7211     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
7212         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
7213       StubRoutines::_hf2f = generate_float16ToFloat();
7214       StubRoutines::_f2hf = generate_floatToFloat16();
7215     }
7216   }
7217 
  // Generates the continuation stubs (thaw, the two return barriers and the
  // preempt stub) and records their entry points in StubRoutines.
  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw             = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub     = generate_cont_preempt_stub();
  }
7225 
  // Generates the final stub group: optional verify_oop support, the
  // arraycopy stubs, the method entry barrier, the secondary-supers lookup
  // stubs (COMPILER2 only) and the upcall stubs. Marks the riscv stub
  // routines as completed at the end.
  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      // The per-slot lookup stubs are only generated when the test is not inlined.
      if (!InlineSecondarySupersTest) {
        generate_lookup_secondary_supers_table_stub();
      }
    }
#endif // COMPILER2

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

    StubRoutines::riscv::set_completed();
  }
7251 
  // Generates the compiler intrinsic stubs. The whole body is compiled only
  // with COMPILER2; each stub is generated only when its Use*Intrinsic(s)
  // flag (or UseRVV) is set, and its entry point is recorded in StubRoutines.
  void generate_compiler_stubs() {
#ifdef COMPILER2
    // BigInteger arithmetic
    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    // The Montgomery generators do not create their own StubCodeMark, so one
    // is set up here around each of them.
    if (UseMontgomeryMultiplyIntrinsic) {
      StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubId stub_id = StubId::stubgen_montgomerySquare_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      StubRoutines::_montgomerySquare = g.generate_square();
    }

    // Ciphers, MACs and message digests
    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseAESCTRIntrinsics) {
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }

    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    // The BigInteger shift workers are generated whenever RVV is in use.
    if (UseRVV) {
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
    }

    // Table of powers of 31 for the vectorized hash code intrinsic (needs RVV).
    if (UseVectorizedHashCodeIntrinsic && UseRVV) {
      StubRoutines::riscv::_arrays_hashcode_powers_of_31 = generate_arrays_hashcode_powers_of_31();
    }

    // For the digests below, the same generator is called twice with
    // different stub ids to produce the implCompress and implCompressMB stubs.
    if (UseSHA256Intrinsics) {
      Sha2Generator sha2(_masm, this);
      StubRoutines::_sha256_implCompress   = sha2.generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
    }

    if (UseSHA512Intrinsics) {
      Sha2Generator sha2(_masm, this);
      StubRoutines::_sha512_implCompress   = sha2.generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress   = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
    }

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
    }

    // Encoding and checksums
    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // String intrinsics: called without a flag check at this level.
    generate_compare_long_strings();

    generate_string_indexof_stubs();

#endif // COMPILER2
  }
7349 
7350  public:
7351   StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
7352     switch(blob_id) {
7353     case BlobId::stubgen_preuniverse_id:
7354       generate_preuniverse_stubs();
7355       break;
7356     case BlobId::stubgen_initial_id:
7357       generate_initial_stubs();
7358       break;
7359     case BlobId::stubgen_continuation_id:
7360       generate_continuation_stubs();
7361       break;
7362     case BlobId::stubgen_compiler_id:
7363       generate_compiler_stubs();
7364       break;
7365     case BlobId::stubgen_final_id:
7366       generate_final_stubs();
7367       break;
7368     default:
7369       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
7370       break;
7371     };
7372   }
7373 }; // end class declaration
7374 
7375 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
7376   StubGenerator g(code, blob_id, stub_data);
7377 }